Skip to content

Commit

Permalink
fix tabix index generation (#7858)
Browse files Browse the repository at this point in the history
* fix tabix index generation

* tests, doc
  • Loading branch information
tedsharpe committed Jun 7, 2022
1 parent 844789f commit d4f083d
Show file tree
Hide file tree
Showing 6 changed files with 255 additions and 9 deletions.
189 changes: 189 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/tools/DumpTabixIndex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.FileExtensions;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

@CommandLineProgramProperties(
        summary = "Prints a text file describing the contents of the tabix index input file.",
        oneLineSummary = "Dumps a tabix index file.",
        usageExample = "gatk DumpTabixIndex -I tabixIndex.tbi -O output.txt",
        programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class DumpTabixIndex extends CommandLineProgram {
    @Argument( doc = "Tabix index file.",
            fullName = "tabix-index",
            shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME)
    GATKPath tabixIndexFile;

    @Argument( doc = "Output file.",
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
    GATKPath outputFile;

    @Override
    protected Object doWork() {
        if ( !tabixIndexFile.hasExtension(FileExtensions.TABIX_INDEX) ) {
            throw new UserException("Expected a " + FileExtensions.TABIX_INDEX + " file as input.");
        }

        final PrintStream writer = new PrintStream(outputFile.getOutputStream());
        // A .tbi file is bgzip-compressed; GZIPInputStream is sufficient to decompress it here.
        try ( final PushbackInputStream is =
                      new PushbackInputStream(new GZIPInputStream(tabixIndexFile.getInputStream())) ) {
            dumpTabixIndex(is, writer);
        } catch ( final IOException ioe ) {
            throw new UserException("Trouble reading index.", ioe);
        }
        // PrintStream swallows IOExceptions; surface any write failure explicitly.
        if ( writer.checkError() ) {
            throw new UserException("Trouble writing output.");
        }

        return null;
    }

    /**
     * Reads a complete (already decompressed) tabix index from the supplied stream and writes
     * a human-readable description of it to {@code writer}: the header fields, then for each
     * contig its binned index (bin number, genomic span implied by the bin number, and the
     * chunk list as virtual file offsets "block:offset" in hex) and its linear index, and
     * finally the count of unplaced reads if that optional trailer is present.
     *
     * @param is     pushback stream positioned at the "TBI\1" magic (pushback is used to probe
     *               for the optional trailing no-coordinate count)
     * @param writer destination for the textual dump
     * @throws IOException if the stream ends prematurely
     * @throws UserException if the magic number is wrong, a pseudobin is malformed,
     *                       or extra data follows the index
     */
    public static void dumpTabixIndex( final PushbackInputStream is,
                                       final PrintStream writer ) throws IOException {
        if ( readByte(is) != 'T' || readByte(is) != 'B' ||
                readByte(is) != 'I' || readByte(is) != 1 ) {
            throw new UserException("Incorrect magic number for tabix index");
        }
        // Fixed-size header: see the tabix index format specification.
        final int nTigs = readInt(is);
        final int format = readInt(is);
        final int seqCol = readInt(is);
        final int begCol = readInt(is);
        final int endCol = readInt(is);
        final char meta = (char)readInt(is);   // comment/meta character, e.g. '#'
        final int skip = readInt(is);
        final int namesLen = readInt(is);      // total bytes of the nul-terminated name block
        writer.println("#tigs\tformat\tseqCol\tbegCol\tendCol\tmetaChr\tskip");
        writer.println(nTigs + "\t" + format + "\t" + seqCol + "\t" + begCol + "\t" +
                endCol + "\t" + meta + "\t" + skip + "\n");
        final List<String> tigNames = readContigNames(is, nTigs, namesLen);
        for ( final String tigName : tigNames ) {
            writer.println(tigName + " binned index:");
            int nBins = readInt(is);
            while ( nBins-- > 0 ) {
                final int binNo = readInt(is);
                int nChunks = readInt(is);
                if ( binNo > 37448 ) {
                    // Bin numbers above the largest real bin (37448) mark the metadata
                    // pseudobin: exactly 2 "chunks" holding the file extent and read counts.
                    if ( nChunks != 2 ) {
                        throw new UserException("pseudobin has " + nChunks + " chunks");
                    }
                    final long chunkStart = readLong(is);
                    final long chunkEnd = readLong(is);
                    final long nMapped = readLong(is);
                    final long nUnmapped = readLong(is);
                    // NOTE(review): the "placed" label reports the pseudobin's second count
                    // (nUnmapped), matching the committed expected-output file.
                    writer.println(tigName + " summary: mapped=" + nMapped +
                            "\tplaced=" + nUnmapped +
                            "\tstart=" + Long.toHexString(chunkStart >>> 16) +
                            ":" + Long.toHexString(chunkStart & 0xffff) +
                            "\tend=" + Long.toHexString(chunkEnd >>> 16) +
                            // bug fix: the end offset must come from chunkEnd, not chunkStart
                            ":" + Long.toHexString(chunkEnd & 0xffff));
                    continue;
                }
                // Translate the bin number into the genomic interval it covers.
                // Level sizes: bin 0 = 512Mb, 1-8 = 64Mb, 9-72 = 8Mb, 73-584 = 1Mb,
                // 585-4680 = 128Kb, 4681-37448 = 16Kb.
                if ( binNo == 0 ) {
                    writer.print(binNo + "\t" + tigName + ":0M-512M\t");
                } else if ( binNo <= 8 ) {
                    final int binStart = (binNo - 1)*64;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+64) + "M\t");
                } else if ( binNo <= 72 ) {
                    final int binStart = (binNo - 9)*8;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+8) + "M\t");
                } else if ( binNo <= 584 ) {
                    final int binStart = binNo - 73;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart + 1) + "M\t");
                } else if ( binNo <= 4680 ) {
                    final int binStart = (binNo - 585)*128;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 128) + "K\t");
                } else {
                    final int binStart = (binNo - 4681)*16;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 16) + "K\t");
                }
                // Each chunk is a [start, end) pair of bgzf virtual file offsets:
                // high 48 bits = compressed block position, low 16 bits = offset within block.
                while ( nChunks-- > 0 ) {
                    final long chunkStart = readLong(is);
                    final long chunkEnd = readLong(is);
                    writer.print("\t" + Long.toHexString(chunkStart >>> 16) +
                            ":" + Long.toHexString(chunkStart & 0xffff) +
                            "->" + Long.toHexString(chunkEnd >>> 16) +
                            ":" + Long.toHexString(chunkEnd & 0xffff));
                }
                writer.println();
            }
            // Linear index: one virtual file offset per 16Kb window of the contig.
            int nIntervals = readInt(is);
            int nK = 0;
            writer.println();
            writer.println(tigName + " linear index:");
            while ( nIntervals-- > 0 ) {
                final long chunkOffset = readLong(is);
                writer.println(nK + "K\t" + Long.toHexString(chunkOffset >>> 16) +
                        ":" + Long.toHexString(chunkOffset & 0xffff));
                nK += 16;
            }
        }
        // Optional trailer: a single long giving the number of no-coordinate reads.
        int nextByte;
        if ( (nextByte = is.read()) != -1 ) {
            is.unread(nextByte);
            final long nUnmapped = readLong(is);
            writer.println(nUnmapped + " unplaced reads.");
        }
        if ( (nextByte = is.read()) != -1 ) {
            throw new UserException("Unexpected data follows index.");
        }
    }

    /**
     * Reads {@code nNames} nul-terminated contig names, checking that they consume
     * exactly {@code namesLen} bytes as declared in the header.
     */
    public static List<String> readContigNames( final InputStream is,
                                                final int nNames,
                                                final int namesLen ) throws IOException {
        final List<String> names = new ArrayList<>(nNames);
        final StringBuilder sb = new StringBuilder();
        int namesProcessed = 0;
        int nextByte;
        int nBytesRead = 0;
        while ( namesProcessed++ < nNames ) {
            while ( (nextByte = readByte(is)) != 0 ) {
                sb.append((char)nextByte);
                nBytesRead += 1;
            }
            nBytesRead += 1; // count the nul terminator, too
            names.add(sb.toString());
            sb.setLength(0);
        }
        if ( nBytesRead != namesLen ) {
            throw new UserException("Contig names didn't have the correct length.");
        }
        return names;
    }

    /** Reads one byte, throwing (rather than returning -1) on end of stream. */
    public static int readByte( final InputStream is ) throws IOException {
        final int value = is.read();
        if ( value == -1 ) {
            throw new IOException("Tried to read past EOF");
        }
        return value;
    }

    // tabix integers are little-endian
    public static int readShort( final InputStream is ) throws IOException {
        return readByte(is) | (readByte(is) << 8);
    }

    public static int readInt( final InputStream is ) throws IOException {
        return readShort(is) | (readShort(is) << 16);
    }

    /** Reads a little-endian long; masks the low int to avoid sign extension. */
    public static long readLong( final InputStream is ) throws IOException {
        return (readInt(is) & 0xffffffffL) | ((long)readInt(is) << 32);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.LocationAware;
import htsjdk.samtools.util.PositionalOutputStream;
import htsjdk.tribble.Feature;
import htsjdk.tribble.index.Index;
Expand All @@ -15,6 +16,7 @@
import org.broadinstitute.hellbender.utils.codecs.FeatureSink;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Path;
import java.util.function.Function;

Expand All @@ -27,14 +29,21 @@ public class FeatureOutputStream <F extends Feature> implements FeatureSink<F> {
private static final String NEWLINE_CHARACTER = "\n";

private final Function<F, String> encoder;
private final PositionalOutputStream outputStream;
private final OutputStream outputStream;
private final LocationAware locationAware; // same object as outputStream, above
// This is necessary because there is no LocationAwareOutputStream class for
// PositionalOutputStream and BlockCompressedOutputStream to extend. Sadly.
// Do not wrap the BlockCompressedOutputStream in a PositionalOutputStream. The
// PositionalOutputStream just counts bytes output, but that's not a valid virtual file offset
// for a bgzip-compressed file.

private final IndexCreator indexCreator;
private final Path featurePath;

/**
* @param file file to write to
* @param tabixFormat column descriptions for the tabix index
* @param header metaData for the features
* @param encoder functor to transform feature into a line of text
*/
public FeatureOutputStream( final GATKPath file,
final TabixFormat tabixFormat,
Expand All @@ -47,11 +56,15 @@ public FeatureOutputStream( final GATKPath file,
Utils.nonNull(dict);
this.encoder = encoder;
if (IOUtil.hasBlockCompressedExtension(file.toPath())) {
outputStream = new PositionalOutputStream(
new BlockCompressedOutputStream(file.toString(), compressionLevel));
final BlockCompressedOutputStream bcos =
new BlockCompressedOutputStream(file.toString(), compressionLevel);
outputStream = bcos;
locationAware = bcos;
indexCreator = new TabixIndexCreator(dict, tabixFormat);
} else {
outputStream = new PositionalOutputStream(file.getOutputStream());
final PositionalOutputStream pos = new PositionalOutputStream(file.getOutputStream());
outputStream = pos;
locationAware = pos;
indexCreator = null;
}
featurePath = file.toPath();
Expand Down Expand Up @@ -80,7 +93,7 @@ public void writeHeader(final String header) {
public void write(final F feature) {
Utils.nonNull(feature);
if (indexCreator != null) {
indexCreator.addFeature(feature, outputStream.getPosition());
indexCreator.addFeature(feature, locationAware.getPosition());
}
try {
outputStream.write((encoder.apply(feature) + NEWLINE_CHARACTER).getBytes());
Expand All @@ -96,7 +109,7 @@ public void write(final F feature) {
public void close() {
try {
if (indexCreator != null) {
final Index index = indexCreator.finalizeIndex(outputStream.getPosition());
final Index index = indexCreator.finalizeIndex(locationAware.getPosition());
index.writeBasedOnFeaturePath(featurePath);
}
outputStream.close();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.Log;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.tools.sv.LocusDepthtoBAF;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.Collections;

public class DumpTabixIndexIntegrationTest extends CommandLineProgramTest {
    /** Dumps a small checked-in .tbi and compares the text against the expected file. */
    @Test
    public void trivialDumpTabixTest() throws IOException {
        final String indexPath = toolsTestDir + "test.tbi";
        final ArgumentsBuilder args = new ArgumentsBuilder();
        args.add(StandardArgumentDefinitions.INPUT_SHORT_NAME, indexPath);
        // "%s" is the IntegrationTestSpec placeholder for a temp output file
        args.add(StandardArgumentDefinitions.OUTPUT_SHORT_NAME, "%s");
        final List<String> expected = Collections.singletonList(indexPath + ".expected.txt");
        new IntegrationTestSpec(args.getString(), expected)
                .executeTest("dump tabix index", this);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import htsjdk.tribble.readers.AsciiLineReader;
import htsjdk.tribble.readers.AsciiLineReaderIterator;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.tools.sv.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.BafEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DepthEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DiscordantPairEvidenceCodec;
Expand Down Expand Up @@ -94,17 +97,23 @@ public <T extends Feature> void test(final ArrayList<T> featureList,
private <T extends Feature> void testWithStream(final FeatureOutputStream<T> stream, final ArrayList<T> featureList,
final Path outFilePath, final FeatureCodec<T, LineIterator> codec,
final String expectedHeader, final boolean indexed) throws IOException {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
if (expectedHeader != null) {
stream.writeHeader(expectedHeader);
}
featureList.stream().forEachOrdered(s -> stream.write(s));
stream.close();
if (indexed) {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
Assert.assertTrue(Files.exists(outIndexPath), "Index does not exist");

// try to use the index a little
final FeatureDataSource<T> dataSource = new FeatureDataSource<T>(outFilePath.toFile());
dataSource.query(new SimpleInterval("chr1", 1234, 5678));
dataSource.close();
}
final InputStream inputStream = IOUtil.isBlockCompressed(outFilePath) ?
new BlockCompressedInputStream(outFilePath.toFile()) : new FileInputStream(outFilePath.toFile());
new BlockCompressedInputStream(outFilePath.toFile()) :
new PositionalBufferedStream(new FileInputStream(outFilePath.toFile()));
final LineIterator reader = new AsciiLineReaderIterator(AsciiLineReader.from(inputStream));

//Check header
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#tigs format seqCol begCol endCol metaChr skip
1 0 1 2 2 # 0

chr9 binned index:
4681 chr9:0K-16K 0:0->35:0
chr9 summary: mapped=1 placed=0 start=0:0 end=35:0

chr9 linear index:
0K 0:0
0 unplaced reads.

0 comments on commit d4f083d

Please sign in to comment.