From 29d5649c2b1a2ee87b05c1a72877b2f2e2bbacff Mon Sep 17 00:00:00 2001 From: jamesemery Date: Fri, 29 Oct 2021 14:19:51 -0400 Subject: [PATCH] Added a debug option to output a side-vcf with variants detected by assembly for HaplotypeCaller and Mutect (#7384) --- .../HaplotypeCallerEngine.java | 15 +++++++++++++++ ...dThreadingAssemblerArgumentCollection.java | 5 +++-- .../readthreading/ReadThreadingAssembler.java | 16 ++++++++++++++++ .../tools/walkers/mutect/Mutect2Engine.java | 19 ++++++++++++++++++- 4 files changed, 52 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java index 5235d0d934c..c8448511483 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/HaplotypeCallerEngine.java @@ -93,6 +93,9 @@ public final class HaplotypeCallerEngine implements AssemblyRegionEvaluator { // writes Haplotypes to a bam file when the -bamout option is specified private Optional haplotypeBAMWriter; + // writes Variants from assembly graph + private Optional assembledEventMapVcfOutputWriter; + private Optional> assembledEventMapVariants; private Set sampleSet; private SampleList samplesList; @@ -239,6 +242,16 @@ private void initialize(boolean createBamOutIndex, final boolean createBamOutMD5 haplotypeBAMWriter = AssemblyBasedCallerUtils.createBamWriter(hcArgs, createBamOutIndex, createBamOutMD5, readsHeader); assemblyEngine = hcArgs.createReadThreadingAssembler(); + assembledEventMapVcfOutputWriter = Optional.ofNullable(hcArgs.assemblerArgs.debugAssemblyVariantsOut != null ? + GATKVariantContextUtils.createVCFWriter( + new GATKPath(hcArgs.assemblerArgs.debugAssemblyVariantsOut).toPath(), + readsHeader.getSequenceDictionary(), + false, + Options.DO_NOT_WRITE_GENOTYPES, Options.INDEX_ON_THE_FLY) + : null); + assembledEventMapVariants = Optional.ofNullable(hcArgs.assemblerArgs.debugAssemblyVariantsOut != null ? + new PriorityQueue<>(200, new VariantContextComparator(readsHeader.getSequenceDictionary())) : null); + assembledEventMapVcfOutputWriter.ifPresent(writer -> writeHeader(writer, readsHeader.getSequenceDictionary(), new HashSet<>())); likelihoodCalculationEngine = AssemblyBasedCallerUtils.createLikelihoodCalculationEngine(hcArgs.likelihoodArgs, !hcArgs.softClipLowQualityEnds); } @@ -571,6 +584,7 @@ public List callRegion(final AssemblyRegion region, final Featur // run the local assembler, getting back a collection of information on how we should proceed final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(region, givenAlleles, hcArgs, readsHeader, samplesList, logger, referenceReader, assemblyEngine, aligner, !hcArgs.doNotCorrectOverlappingBaseQualities); + ReadThreadingAssembler.addAssembledVariantsToEventMapOutput(untrimmedAssemblyResult, assembledEventMapVariants, hcArgs.maxMnpDistance, assembledEventMapVcfOutputWriter); if (assemblyDebugOutStream != null) { try { @@ -776,6 +790,7 @@ public void shutdown() { if ( haplotypeBAMWriter.isPresent() ) { haplotypeBAMWriter.get().close(); } + assembledEventMapVcfOutputWriter.ifPresent(writer -> {assembledEventMapVariants.get().forEach(writer::add); writer.close();}); if ( referenceReader != null){ try { referenceReader.close(); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java index 4d17c663354..e4acd7aaec3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/ReadThreadingAssemblerArgumentCollection.java @@ -4,7 +4,6 @@ import org.broadinstitute.barclay.argparser.Advanced; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.Hidden; -import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingAssembler; import org.broadinstitute.hellbender.utils.MathUtils; @@ -168,7 +167,9 @@ public abstract class ReadThreadingAssemblerArgumentCollection implements Serial @Hidden @Argument(fullName="debug-graph-transformations", doc="Write DOT formatted graph files out of the assembler for only this graph size", optional = true) public boolean debugGraphTransformations = false; - + @Hidden + @Argument(fullName = "debug-assembly-variants-out", doc="Write a VCF file containing all of the variants from the event map (pre-filter) for each active region", optional = true) + public String debugAssemblyVariantsOut = null; /** * This argument is meant for debugging and is not immediately useful for normal analysis use. */ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java index 97606285419..0d1f42f9353 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/haplotypecaller/readthreading/ReadThreadingAssembler.java @@ -5,6 +5,8 @@ import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileHeader; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.writer.VariantContextWriter; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.gatk.nativebindings.smithwaterman.SWOverhangStrategy; @@ -806,6 +808,10 @@ public void setGraphWriter(File graphOutputPath) { this.graphOutputPath = graphOutputPath; } + public void setEventMapOut(File graphOutputPath) { + this.graphOutputPath = graphOutputPath; + } + public byte getMinBaseQualityToUseInAssembly() { return minBaseQualityToUseInAssembly; } @@ -875,4 +881,14 @@ public void setArtificialHaplotypeRecoveryMode(boolean disableUncoveredJunctionT recoverHaplotypesFromEdgesNotCoveredInJunctionTrees = false; } } + + // Debug output writer for the event map that ensures events are correctly sorted (even if there are cross region overlaps) + public static void addAssembledVariantsToEventMapOutput(final AssemblyResultSet untrimmedAssemblyResult, final Optional> assembledEventMapVariants, final int maxMnpDistance, final Optional assembledEventMapVcfOutputWriter) { + assembledEventMapVariants.ifPresent(queue -> + untrimmedAssemblyResult.getVariationEvents(maxMnpDistance).forEach(event -> { + if (queue.size() >= 300) { + queue.stream().limit(200).forEachOrdered(vc -> assembledEventMapVcfOutputWriter.get().add(vc)); + } + queue.add(event);})); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java index 9995f57fe73..2f4937bb569 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/Mutect2Engine.java @@ -4,6 +4,8 @@ import htsjdk.samtools.util.Locatable; import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextComparator; +import htsjdk.variant.variantcontext.writer.Options; import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.VCFConstants; import htsjdk.variant.vcf.VCFHeader; @@ -50,6 +52,7 @@ import org.broadinstitute.hellbender.utils.smithwaterman.SmithWatermanAligner; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.broadinstitute.hellbender.utils.variant.GATKVCFHeaderLines; +import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils; import java.io.File; import java.nio.file.Path; @@ -101,6 +104,8 @@ public final class Mutect2Engine implements AssemblyRegionEvaluator { private ReadLikelihoodCalculationEngine likelihoodCalculationEngine; private SomaticGenotypingEngine genotypingEngine; private Optional haplotypeBAMWriter; + private Optional assembledEventMapVcfOutputWriter; + private Optional> assembledEventMapVariants; private VariantAnnotatorEngine annotationEngine; private final SmithWatermanAligner aligner; private final AssemblyRegionTrimmer trimmer; @@ -153,6 +158,16 @@ public Mutect2Engine(final M2ArgumentCollection MTAC, AssemblyRegionArgumentColl referenceConfidenceModel = new SomaticReferenceConfidenceModel(samplesList, header, 0, MTAC.minAF); //TODO: do something classier with the indel size arg final List tumorSamples = ReadUtils.getSamplesFromHeader(header).stream().filter(this::isTumorSample).collect(Collectors.toList()); f1R2CountsCollector = MTAC.f1r2TarGz == null ? Optional.empty() : Optional.of(new F1R2CountsCollector(MTAC.f1r2Args, header, MTAC.f1r2TarGz, tumorSamples)); + assembledEventMapVcfOutputWriter = Optional.ofNullable(MTAC.assemblerArgs.debugAssemblyVariantsOut != null ? + GATKVariantContextUtils.createVCFWriter( + new GATKPath(MTAC.assemblerArgs.debugAssemblyVariantsOut).toPath(), + header.getSequenceDictionary(), + false, + Options.DO_NOT_WRITE_GENOTYPES, Options.INDEX_ON_THE_FLY) + : null); + assembledEventMapVariants = Optional.ofNullable(MTAC.assemblerArgs.debugAssemblyVariantsOut != null ? + new PriorityQueue<>(200, new VariantContextComparator(header.getSequenceDictionary())) : null); + assembledEventMapVcfOutputWriter.ifPresent(writer -> {VCFHeader head = new VCFHeader(); head.getSequenceDictionary(); writer.writeHeader(head);}); } //default M2 read filters. Cheap ones come first in order to fail fast. @@ -231,6 +246,7 @@ public List callRegion(final AssemblyRegion originalAssemblyRegi .filter(vc -> MTAC.forceCallFiltered || vc.isNotFiltered()).collect(Collectors.toList()); final AssemblyResultSet untrimmedAssemblyResult = AssemblyBasedCallerUtils.assembleReads(originalAssemblyRegion, givenAlleles, MTAC, header, samplesList, logger, referenceReader, assemblyEngine, aligner, false); + ReadThreadingAssembler.addAssembledVariantsToEventMapOutput(untrimmedAssemblyResult, assembledEventMapVariants, MTAC.maxMnpDistance, assembledEventMapVcfOutputWriter); final SortedSet allVariationEvents = untrimmedAssemblyResult.getVariationEvents(MTAC.maxMnpDistance); final AssemblyRegionTrimmer.Result trimmingResult = trimmer.trim(originalAssemblyRegion, allVariationEvents, referenceContext); @@ -355,7 +371,8 @@ public void writeExtraOutputs(final File statsTable) { public void shutdown() { likelihoodCalculationEngine.close(); aligner.close(); - haplotypeBAMWriter.ifPresent(writer -> writer.close()); + haplotypeBAMWriter.ifPresent(HaplotypeBAMWriter::close); + assembledEventMapVcfOutputWriter.ifPresent(writer -> {assembledEventMapVariants.get().forEach(writer::add); writer.close();}); referenceReader.close(); }