Skip to content

Commit

Permalink
fix tabix index generation (#7858)
Browse files Browse the repository at this point in the history
* fix tabix index generation

* tests, doc
  • Loading branch information
tedsharpe committed Jun 7, 2022
1 parent 844789f commit d4f083d
Show file tree
Hide file tree
Showing 6 changed files with 255 additions and 9 deletions.
189 changes: 189 additions & 0 deletions src/main/java/org/broadinstitute/hellbender/tools/DumpTabixIndex.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,189 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.FileExtensions;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

@CommandLineProgramProperties(
        summary = "Prints a text file describing the contents of the tabix index input file.",
        oneLineSummary = "Dumps a tabix index file.",
        usageExample = "gatk DumpTabixIndex -I tabixIndex.tbi -O output.txt",
        programGroup = DiagnosticsAndQCProgramGroup.class
)
@DocumentedFeature
public class DumpTabixIndex extends CommandLineProgram {
    @Argument( doc = "Tabix index file.",
            fullName = "tabix-index",
            shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME)
    GATKPath tabixIndexFile;

    @Argument( doc = "Output file.",
            fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
    GATKPath outputFile;

    @Override
    protected Object doWork() {
        if ( !tabixIndexFile.hasExtension(FileExtensions.TABIX_INDEX) ) {
            throw new UserException("Expected a " + FileExtensions.TABIX_INDEX + " file as input.");
        }

        final PrintStream writer = new PrintStream(outputFile.getOutputStream());
        // A .tbi file is bgzip-compressed; GZIPInputStream is sufficient to decompress it here.
        try ( final PushbackInputStream is =
                      new PushbackInputStream(new GZIPInputStream(tabixIndexFile.getInputStream())) ) {
            dumpTabixIndex(is, writer);
        } catch ( final IOException ioe ) {
            throw new UserException("Trouble reading index.", ioe);
        }
        // PrintStream swallows IOExceptions; surface any write failure explicitly.
        if ( writer.checkError() ) {
            throw new UserException("Trouble writing output.");
        }

        return null;
    }

    /**
     * Reads a complete (already decompressed) tabix index from the supplied stream and writes
     * a human-readable description of it to {@code writer}: the header fields, then for each
     * contig its binned index (bin number, genomic span implied by the bin number, and the
     * chunk list as virtual file offsets "block:offset" in hex) and its linear index, and
     * finally the count of unplaced reads if that optional trailer is present.
     *
     * @param is     pushback stream positioned at the "TBI\1" magic (pushback is used to probe
     *               for the optional trailing no-coordinate count)
     * @param writer destination for the textual dump
     * @throws IOException if the stream ends prematurely
     * @throws UserException if the magic number is wrong, a pseudobin is malformed,
     *                       or extra data follows the index
     */
    public static void dumpTabixIndex( final PushbackInputStream is,
                                       final PrintStream writer ) throws IOException {
        if ( readByte(is) != 'T' || readByte(is) != 'B' ||
                readByte(is) != 'I' || readByte(is) != 1 ) {
            throw new UserException("Incorrect magic number for tabix index");
        }
        // Fixed-size header: see the tabix index format specification.
        final int nTigs = readInt(is);
        final int format = readInt(is);
        final int seqCol = readInt(is);
        final int begCol = readInt(is);
        final int endCol = readInt(is);
        final char meta = (char)readInt(is);   // comment/meta character, e.g. '#'
        final int skip = readInt(is);
        final int namesLen = readInt(is);      // total bytes of the nul-terminated name block
        writer.println("#tigs\tformat\tseqCol\tbegCol\tendCol\tmetaChr\tskip");
        writer.println(nTigs + "\t" + format + "\t" + seqCol + "\t" + begCol + "\t" +
                endCol + "\t" + meta + "\t" + skip + "\n");
        final List<String> tigNames = readContigNames(is, nTigs, namesLen);
        for ( final String tigName : tigNames ) {
            writer.println(tigName + " binned index:");
            int nBins = readInt(is);
            while ( nBins-- > 0 ) {
                final int binNo = readInt(is);
                int nChunks = readInt(is);
                if ( binNo > 37448 ) {
                    // Bin numbers above the largest real bin (37448) mark the metadata
                    // pseudobin: exactly 2 "chunks" holding the file extent and read counts.
                    if ( nChunks != 2 ) {
                        throw new UserException("pseudobin has " + nChunks + " chunks");
                    }
                    final long chunkStart = readLong(is);
                    final long chunkEnd = readLong(is);
                    final long nMapped = readLong(is);
                    final long nUnmapped = readLong(is);
                    // NOTE(review): the "placed" label reports the pseudobin's second count
                    // (nUnmapped), matching the committed expected-output file.
                    writer.println(tigName + " summary: mapped=" + nMapped +
                            "\tplaced=" + nUnmapped +
                            "\tstart=" + Long.toHexString(chunkStart >>> 16) +
                            ":" + Long.toHexString(chunkStart & 0xffff) +
                            "\tend=" + Long.toHexString(chunkEnd >>> 16) +
                            // bug fix: the end offset must come from chunkEnd, not chunkStart
                            ":" + Long.toHexString(chunkEnd & 0xffff));
                    continue;
                }
                // Translate the bin number into the genomic interval it covers.
                // Level sizes: bin 0 = 512Mb, 1-8 = 64Mb, 9-72 = 8Mb, 73-584 = 1Mb,
                // 585-4680 = 128Kb, 4681-37448 = 16Kb.
                if ( binNo == 0 ) {
                    writer.print(binNo + "\t" + tigName + ":0M-512M\t");
                } else if ( binNo <= 8 ) {
                    final int binStart = (binNo - 1)*64;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+64) + "M\t");
                } else if ( binNo <= 72 ) {
                    final int binStart = (binNo - 9)*8;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+8) + "M\t");
                } else if ( binNo <= 584 ) {
                    final int binStart = binNo - 73;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart + 1) + "M\t");
                } else if ( binNo <= 4680 ) {
                    final int binStart = (binNo - 585)*128;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 128) + "K\t");
                } else {
                    final int binStart = (binNo - 4681)*16;
                    writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 16) + "K\t");
                }
                // Each chunk is a [start, end) pair of bgzf virtual file offsets:
                // high 48 bits = compressed block position, low 16 bits = offset within block.
                while ( nChunks-- > 0 ) {
                    final long chunkStart = readLong(is);
                    final long chunkEnd = readLong(is);
                    writer.print("\t" + Long.toHexString(chunkStart >>> 16) +
                            ":" + Long.toHexString(chunkStart & 0xffff) +
                            "->" + Long.toHexString(chunkEnd >>> 16) +
                            ":" + Long.toHexString(chunkEnd & 0xffff));
                }
                writer.println();
            }
            // Linear index: one virtual file offset per 16Kb window of the contig.
            int nIntervals = readInt(is);
            int nK = 0;
            writer.println();
            writer.println(tigName + " linear index:");
            while ( nIntervals-- > 0 ) {
                final long chunkOffset = readLong(is);
                writer.println(nK + "K\t" + Long.toHexString(chunkOffset >>> 16) +
                        ":" + Long.toHexString(chunkOffset & 0xffff));
                nK += 16;
            }
        }
        // Optional trailer: a single long giving the number of no-coordinate reads.
        int nextByte;
        if ( (nextByte = is.read()) != -1 ) {
            is.unread(nextByte);
            final long nUnmapped = readLong(is);
            writer.println(nUnmapped + " unplaced reads.");
        }
        if ( (nextByte = is.read()) != -1 ) {
            throw new UserException("Unexpected data follows index.");
        }
    }

    /**
     * Reads {@code nNames} nul-terminated contig names, checking that they consume
     * exactly {@code namesLen} bytes as declared in the header.
     */
    public static List<String> readContigNames( final InputStream is,
                                                final int nNames,
                                                final int namesLen ) throws IOException {
        final List<String> names = new ArrayList<>(nNames);
        final StringBuilder sb = new StringBuilder();
        int namesProcessed = 0;
        int nextByte;
        int nBytesRead = 0;
        while ( namesProcessed++ < nNames ) {
            while ( (nextByte = readByte(is)) != 0 ) {
                sb.append((char)nextByte);
                nBytesRead += 1;
            }
            nBytesRead += 1; // count the nul terminator, too
            names.add(sb.toString());
            sb.setLength(0);
        }
        if ( nBytesRead != namesLen ) {
            throw new UserException("Contig names didn't have the correct length.");
        }
        return names;
    }

    /** Reads one byte, throwing (rather than returning -1) on end of stream. */
    public static int readByte( final InputStream is ) throws IOException {
        final int value = is.read();
        if ( value == -1 ) {
            throw new IOException("Tried to read past EOF");
        }
        return value;
    }

    // tabix integers are little-endian
    public static int readShort( final InputStream is ) throws IOException {
        return readByte(is) | (readByte(is) << 8);
    }

    public static int readInt( final InputStream is ) throws IOException {
        return readShort(is) | (readShort(is) << 16);
    }

    /** Reads a little-endian long; masks the low int to avoid sign extension. */
    public static long readLong( final InputStream is ) throws IOException {
        return (readInt(is) & 0xffffffffL) | ((long)readInt(is) << 32);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.BlockCompressedOutputStream;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.LocationAware;
import htsjdk.samtools.util.PositionalOutputStream;
import htsjdk.tribble.Feature;
import htsjdk.tribble.index.Index;
Expand All @@ -15,6 +16,7 @@
import org.broadinstitute.hellbender.utils.codecs.FeatureSink;

import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Path;
import java.util.function.Function;

Expand All @@ -27,14 +29,21 @@ public class FeatureOutputStream <F extends Feature> implements FeatureSink<F> {
private static final String NEWLINE_CHARACTER = "\n";

private final Function<F, String> encoder;
private final PositionalOutputStream outputStream;
private final OutputStream outputStream;
private final LocationAware locationAware; // same object as outputStream, above
// This is necessary because there is no LocationAwareOutputStream class for
// PositionalOutputStream and BlockCompressedOutputStream to extend. Sadly.
// Do not wrap the BlockCompressedOutputStream in a PositionalOutputStream. The
// PositionalOutputStream just counts bytes output, but that's not a valid virtual file offset
// for a bgzip-compressed file.

private final IndexCreator indexCreator;
private final Path featurePath;

/**
* @param file file to write to
* @param tabixFormat column descriptions for the tabix index
* @param header metaData for the features
* @param encoder functor to transform feature into a line of text
*/
public FeatureOutputStream( final GATKPath file,
final TabixFormat tabixFormat,
Expand All @@ -47,11 +56,15 @@ public FeatureOutputStream( final GATKPath file,
Utils.nonNull(dict);
this.encoder = encoder;
if (IOUtil.hasBlockCompressedExtension(file.toPath())) {
outputStream = new PositionalOutputStream(
new BlockCompressedOutputStream(file.toString(), compressionLevel));
final BlockCompressedOutputStream bcos =
new BlockCompressedOutputStream(file.toString(), compressionLevel);
outputStream = bcos;
locationAware = bcos;
indexCreator = new TabixIndexCreator(dict, tabixFormat);
} else {
outputStream = new PositionalOutputStream(file.getOutputStream());
final PositionalOutputStream pos = new PositionalOutputStream(file.getOutputStream());
outputStream = pos;
locationAware = pos;
indexCreator = null;
}
featurePath = file.toPath();
Expand Down Expand Up @@ -80,7 +93,7 @@ public void writeHeader(final String header) {
public void write(final F feature) {
Utils.nonNull(feature);
if (indexCreator != null) {
indexCreator.addFeature(feature, outputStream.getPosition());
indexCreator.addFeature(feature, locationAware.getPosition());
}
try {
outputStream.write((encoder.apply(feature) + NEWLINE_CHARACTER).getBytes());
Expand All @@ -96,7 +109,7 @@ public void write(final F feature) {
public void close() {
try {
if (indexCreator != null) {
final Index index = indexCreator.finalizeIndex(outputStream.getPosition());
final Index index = indexCreator.finalizeIndex(locationAware.getPosition());
index.writeBasedOnFeaturePath(featurePath);
}
outputStream.close();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.Log;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.tools.sv.LocusDepthtoBAF;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.Collections;

public class DumpTabixIndexIntegrationTest extends CommandLineProgramTest {
    /** Dumps a small checked-in .tbi and compares the text against the expected file. */
    @Test
    public void trivialDumpTabixTest() throws IOException {
        final String indexPath = toolsTestDir + "test.tbi";
        final ArgumentsBuilder args = new ArgumentsBuilder();
        args.add(StandardArgumentDefinitions.INPUT_SHORT_NAME, indexPath);
        // "%s" is the IntegrationTestSpec placeholder for a temp output file
        args.add(StandardArgumentDefinitions.OUTPUT_SHORT_NAME, "%s");
        final List<String> expected = Collections.singletonList(indexPath + ".expected.txt");
        new IntegrationTestSpec(args.getString(), expected)
                .executeTest("dump tabix index", this);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import htsjdk.tribble.readers.AsciiLineReader;
import htsjdk.tribble.readers.AsciiLineReaderIterator;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.tools.sv.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.BafEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DepthEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DiscordantPairEvidenceCodec;
Expand Down Expand Up @@ -94,17 +97,23 @@ public <T extends Feature> void test(final ArrayList<T> featureList,
private <T extends Feature> void testWithStream(final FeatureOutputStream<T> stream, final ArrayList<T> featureList,
final Path outFilePath, final FeatureCodec<T, LineIterator> codec,
final String expectedHeader, final boolean indexed) throws IOException {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
if (expectedHeader != null) {
stream.writeHeader(expectedHeader);
}
featureList.stream().forEachOrdered(s -> stream.write(s));
stream.close();
if (indexed) {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
Assert.assertTrue(Files.exists(outIndexPath), "Index does not exist");

// try to use the index a little
final FeatureDataSource<T> dataSource = new FeatureDataSource<T>(outFilePath.toFile());
dataSource.query(new SimpleInterval("chr1", 1234, 5678));
dataSource.close();
}
final InputStream inputStream = IOUtil.isBlockCompressed(outFilePath) ?
new BlockCompressedInputStream(outFilePath.toFile()) : new FileInputStream(outFilePath.toFile());
new BlockCompressedInputStream(outFilePath.toFile()) :
new PositionalBufferedStream(new FileInputStream(outFilePath.toFile()));
final LineIterator reader = new AsciiLineReaderIterator(AsciiLineReader.from(inputStream));

//Check header
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#tigs format seqCol begCol endCol metaChr skip
1 0 1 2 2 # 0

chr9 binned index:
4681 chr9:0K-16K 0:0->35:0
chr9 summary: mapped=1 placed=0 start=0:0 end=35:0

chr9 linear index:
0K 0:0
0 unplaced reads.

0 comments on commit d4f083d

Please sign in to comment.