Skip to content

Commit

Permalink
tests, doc
Browse files Browse the repository at this point in the history
  • Loading branch information
tedsharpe committed Jun 7, 2022
1 parent e9d3a1f commit 8d1db54
Show file tree
Hide file tree
Showing 6 changed files with 166 additions and 93 deletions.
203 changes: 113 additions & 90 deletions src/main/java/org/broadinstitute/hellbender/tools/DumpTabixIndex.java
Original file line number Diff line number Diff line change
@@ -1,123 +1,146 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.FileExtensions;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.exceptions.UserException;
import picard.cmdline.programgroups.DiagnosticsAndQCProgramGroup;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;

@CommandLineProgramProperties(
summary = "Prints a tabix index file as text.",
summary = "Prints a text file describing the contents of the tabix index input file.",
oneLineSummary = "Dumps a tabix index file.",
usageExample = "gatk DumpTabixIndex -I tabixIndex.tbi -O output.txt",
programGroup = DiagnosticsAndQCProgramGroup.class
)
@ExperimentalFeature
@DocumentedFeature
public class DumpTabixIndex extends CommandLineProgram {
@Argument( doc = "Tabix index file.",
fullName = "tabix-index",
shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME)
String tabixIndexFile;
GATKPath tabixIndexFile;

@Argument( doc = "Output file.",
fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME)
GATKPath outputFile;

@Override
protected Object doWork() {
try ( final InputStream is = new GZIPInputStream(new FileInputStream(tabixIndexFile)) ) {
if ( readByte(is) != 'T' || readByte(is) != 'B' ||
readByte(is) != 'I' || readByte(is) != 1 ) {
throw new UserException("Incorrect magic number for tabix index");
}
final int nTigs = readInt(is);
final int format = readInt(is);
final int seqCol = readInt(is);
final int begCol = readInt(is);
final int endCol = readInt(is);
final char meta = (char)readInt(is);
final int skip = readInt(is);
final int namesLen = readInt(is);
System.out.println("#tigs\tformat\tseqCol\tbegCol\tendCol\tmetaChr\tskip");
System.out.println(nTigs + "\t" + format + "\t" + seqCol + "\t" + begCol + "\t" +
endCol + "\t" + meta + "\t" + skip + "\n");
final List<String> tigNames = readContigNames(is, nTigs, namesLen);
for ( final String tigName : tigNames ) {
System.out.println(tigName + " binned index:");
int nBins = readInt(is);
while ( nBins-- > 0 ) {
final int binNo = readInt(is);
int nChunks = readInt(is);
if ( binNo > 37448 ) {
if ( nChunks != 2 ) {
throw new UserException("pseudobin has " + nChunks + " chunks");
}
final long chunkStart = readLong(is);
final long chunkEnd = readLong(is);
final long nMapped = readLong(is);
final long nUnmapped = readLong(is);
System.out.println(tigName + " summary: mapped=" + nMapped +
"\tplaced=" + nUnmapped +
"\tstart=" + Long.toHexString(chunkStart >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff) +
"\tend=" + Long.toHexString(chunkEnd >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff));
continue;
}
if ( binNo == 0 ) {
System.out.print(binNo + "\t" + tigName + ":0M-512M\t");
} else if ( binNo <= 8 ) {
final int binStart = (binNo - 1)*64;
System.out.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+64) + "M\t");
} else if ( binNo <= 72 ) {
final int binStart = (binNo - 9)*8;
System.out.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+8) + "M\t");
} else if ( binNo <= 584 ) {
final int binStart = binNo - 73;
System.out.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart + 1) + "M\t");
} else if ( binNo <= 4680 ) {
final int binStart = (binNo - 585)*128;
System.out.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 128) + "K\t");
} else {
final int binStart = (binNo - 4681)*16;
System.out.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 16) + "K\t");
}
while ( nChunks-- > 0 ) {
final long chunkStart = readLong(is);
final long chunkEnd = readLong(is);
System.out.print("\t" + Long.toHexString(chunkStart >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff) +
"->" + Long.toHexString(chunkEnd >>> 16) +
":" + Long.toHexString(chunkEnd & 0xffff));
if ( !tabixIndexFile.hasExtension(FileExtensions.TABIX_INDEX) ) {
throw new UserException("Expected a " + FileExtensions.TABIX_INDEX + " file as input.");
}

final PrintStream writer = new PrintStream(outputFile.getOutputStream());
try ( final PushbackInputStream is =
new PushbackInputStream(new GZIPInputStream(tabixIndexFile.getInputStream())) ) {
dumpTabixIndex(is, writer);
} catch ( final IOException ioe ) {
throw new UserException("Trouble reading index.", ioe);
}
if ( writer.checkError() ) {
throw new UserException("Trouble writing output.");
}

return null;
}

public static void dumpTabixIndex( final PushbackInputStream is,
final PrintStream writer ) throws IOException {
if ( readByte(is) != 'T' || readByte(is) != 'B' ||
readByte(is) != 'I' || readByte(is) != 1 ) {
throw new UserException("Incorrect magic number for tabix index");
}
final int nTigs = readInt(is);
final int format = readInt(is);
final int seqCol = readInt(is);
final int begCol = readInt(is);
final int endCol = readInt(is);
final char meta = (char)readInt(is);
final int skip = readInt(is);
final int namesLen = readInt(is);
writer.println("#tigs\tformat\tseqCol\tbegCol\tendCol\tmetaChr\tskip");
writer.println(nTigs + "\t" + format + "\t" + seqCol + "\t" + begCol + "\t" +
endCol + "\t" + meta + "\t" + skip + "\n");
final List<String> tigNames = readContigNames(is, nTigs, namesLen);
for ( final String tigName : tigNames ) {
writer.println(tigName + " binned index:");
int nBins = readInt(is);
while ( nBins-- > 0 ) {
final int binNo = readInt(is);
int nChunks = readInt(is);
if ( binNo > 37448 ) {
if ( nChunks != 2 ) {
throw new UserException("pseudobin has " + nChunks + " chunks");
}
System.out.println();
final long chunkStart = readLong(is);
final long chunkEnd = readLong(is);
final long nMapped = readLong(is);
final long nUnmapped = readLong(is);
writer.println(tigName + " summary: mapped=" + nMapped +
"\tplaced=" + nUnmapped +
"\tstart=" + Long.toHexString(chunkStart >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff) +
"\tend=" + Long.toHexString(chunkEnd >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff));
continue;
}
int nIntervals = readInt(is);
int nK = 0;
System.out.println();
System.out.println(tigName + " linear index:");
while ( nIntervals-- > 0 ) {
final long chunkOffset = readLong(is);
System.out.println(nK + "K\t" + Long.toHexString(chunkOffset >>> 16) +
":" + Long.toHexString(chunkOffset & 0xffff));
nK += 16;
if ( binNo == 0 ) {
writer.print(binNo + "\t" + tigName + ":0M-512M\t");
} else if ( binNo <= 8 ) {
final int binStart = (binNo - 1)*64;
writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+64) + "M\t");
} else if ( binNo <= 72 ) {
final int binStart = (binNo - 9)*8;
writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart+8) + "M\t");
} else if ( binNo <= 584 ) {
final int binStart = binNo - 73;
writer.print(binNo + "\t" + tigName + ":" + binStart + "M-" + (binStart + 1) + "M\t");
} else if ( binNo <= 4680 ) {
final int binStart = (binNo - 585)*128;
writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 128) + "K\t");
} else {
final int binStart = (binNo - 4681)*16;
writer.print(binNo + "\t" + tigName + ":" + binStart + "K-" + (binStart + 16) + "K\t");
}
while ( nChunks-- > 0 ) {
final long chunkStart = readLong(is);
final long chunkEnd = readLong(is);
writer.print("\t" + Long.toHexString(chunkStart >>> 16) +
":" + Long.toHexString(chunkStart & 0xffff) +
"->" + Long.toHexString(chunkEnd >>> 16) +
":" + Long.toHexString(chunkEnd & 0xffff));
}
writer.println();
}
if ( is.available() > 0 ) {
final long nUnmapped = readLong(is);
System.out.println(nUnmapped + " unplaced reads.");
}
if ( is.available() != 0 ) {
throw new UserException("Unexpected data follows index.");
int nIntervals = readInt(is);
int nK = 0;
writer.println();
writer.println(tigName + " linear index:");
while ( nIntervals-- > 0 ) {
final long chunkOffset = readLong(is);
writer.println(nK + "K\t" + Long.toHexString(chunkOffset >>> 16) +
":" + Long.toHexString(chunkOffset & 0xffff));
nK += 16;
}
} catch ( final IOException ioe ) {
throw new UserException("Can't read " + tabixIndexFile + " as gzip input stream", ioe);
}
return null;
int nextByte;
if ( (nextByte = is.read()) != -1 ) {
is.unread(nextByte);
final long nUnmapped = readLong(is);
writer.println(nUnmapped + " unplaced reads.");
}
if ( (nextByte = is.read()) != -1 ) {
throw new UserException("Unexpected data follows index.");
}
}

public static List<String> readContigNames( final InputStream is,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,13 @@ public class FeatureOutputStream <F extends Feature> implements FeatureSink<F> {

private final Function<F, String> encoder;
private final OutputStream outputStream;
private final LocationAware locationAware;
private final LocationAware locationAware; // same object as outputStream, above
// This is necessary because there is no LocationAwareOutputStream class for
// PositionalOutputStream and BlockCompressedOutputStream to extend. Sadly.
// Do not wrap the BlockCompressedOutputStream in a PositionalOutputStream. The
// PositionalOutputStream just counts bytes output, but that's not a valid virtual file offset
// for a bgzip-compressed file.

private final IndexCreator indexCreator;
private final Path featurePath;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package org.broadinstitute.hellbender.tools;

import htsjdk.samtools.util.Log;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.tools.sv.LocusDepthtoBAF;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.Collections;

public class DumpTabixIndexIntegrationTest extends CommandLineProgramTest {
@Test
public void trivialDumpTabixTest() throws IOException {
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder();
argsBuilder.add(StandardArgumentDefinitions.INPUT_SHORT_NAME, toolsTestDir + "test.tbi");
argsBuilder.add(StandardArgumentDefinitions.OUTPUT_SHORT_NAME, "%s");
final IntegrationTestSpec testSpec =
new IntegrationTestSpec(argsBuilder.getString(),
Collections.singletonList(toolsTestDir + "test.tbi.expected.txt"));
testSpec.executeTest("dump tabix index", this);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@
import htsjdk.tribble.readers.AsciiLineReader;
import htsjdk.tribble.readers.AsciiLineReaderIterator;
import htsjdk.tribble.readers.LineIterator;
import htsjdk.tribble.readers.PositionalBufferedStream;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.tools.sv.*;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.codecs.BafEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DepthEvidenceCodec;
import org.broadinstitute.hellbender.utils.codecs.DiscordantPairEvidenceCodec;
Expand Down Expand Up @@ -94,17 +97,23 @@ public <T extends Feature> void test(final ArrayList<T> featureList,
private <T extends Feature> void testWithStream(final FeatureOutputStream<T> stream, final ArrayList<T> featureList,
final Path outFilePath, final FeatureCodec<T, LineIterator> codec,
final String expectedHeader, final boolean indexed) throws IOException {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
if (expectedHeader != null) {
stream.writeHeader(expectedHeader);
}
featureList.stream().forEachOrdered(s -> stream.write(s));
stream.close();
if (indexed) {
final Path outIndexPath = Paths.get(outFilePath + FileExtensions.TABIX_INDEX);
Assert.assertTrue(Files.exists(outIndexPath), "Index does not exist");

// try to use the index a little
final FeatureDataSource<T> dataSource = new FeatureDataSource<T>(outFilePath.toFile());
dataSource.query(new SimpleInterval("chr1", 1234, 5678));
dataSource.close();
}
final InputStream inputStream = IOUtil.isBlockCompressed(outFilePath) ?
new BlockCompressedInputStream(outFilePath.toFile()) : new FileInputStream(outFilePath.toFile());
new BlockCompressedInputStream(outFilePath.toFile()) :
new PositionalBufferedStream(new FileInputStream(outFilePath.toFile()));
final LineIterator reader = new AsciiLineReaderIterator(AsciiLineReader.from(inputStream));

//Check header
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#tigs format seqCol begCol endCol metaChr skip
1 0 1 2 2 # 0

chr9 binned index:
4681 chr9:0K-16K 0:0->35:0
chr9 summary: mapped=1 placed=0 start=0:0 end=35:0

chr9 linear index:
0K 0:0
0 unplaced reads.

0 comments on commit 8d1db54

Please sign in to comment.