From 48a52e1a1d95213924a7b06586ffc89a912f5815 Mon Sep 17 00:00:00 2001 From: nalinigans Date: Tue, 11 Aug 2020 14:04:39 -0700 Subject: [PATCH] Add GenomicsDBArgumentCollection to CreateSomaticPanelOfNormals tool --- .../mutect/CreateSomaticPanelOfNormals.java | 14 +++ ...eSomaticPanelOfNormalsIntegrationTest.java | 96 +++++++++++++------ 2 files changed, 81 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormals.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormals.java index 6ba8c7fbabd..58f6334ee80 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormals.java @@ -7,11 +7,14 @@ import htsjdk.variant.variantcontext.writer.VariantContextWriter; import htsjdk.variant.vcf.*; import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.ArgumentCollection; import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection; +import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions; import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape; import org.broadinstitute.hellbender.tools.walkers.validation.basicshortmutpileup.BetaBinomialDistribution; import org.broadinstitute.hellbender.utils.MathUtils; @@ -115,10 +118,21 @@ public class CreateSomaticPanelOfNormals extends VariantWalker { @Argument(fullName= MAX_GERMLINE_PROBABILITY_LONG_NAME, doc="Skip genotypes with germline probability greater than this value", optional = true) public double maxGermlineProbability = DEFAULT_MAX_GERMLINE_PROBABILITY; + @ArgumentCollection + private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection(); + private VariantContextWriter vcfWriter; private int numSamples; + @Override + protected GenomicsDBOptions getGenomicsDBOptions() { + if (genomicsDBOptions == null) { + genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath(), genomicsdbArgs); + } + return genomicsDBOptions; + } + @Override public void onTraversalStart() { final Set headerInfo = new HashSet<>(getDefaultToolVCFHeaderLines()); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormalsIntegrationTest.java index 0352df3faa4..6e89c3afe1d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/mutect/CreateSomaticPanelOfNormalsIntegrationTest.java @@ -5,6 +5,7 @@ import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; import org.broadinstitute.hellbender.testutils.GenomicsDBTestUtils; import org.broadinstitute.hellbender.testutils.VariantContextTestUtils; +import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape; import org.broadinstitute.hellbender.utils.MathUtils; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -29,6 +30,28 @@ public class CreateSomaticPanelOfNormalsIntegrationTest extends CommandLineProgr // positions 10,000,000 - 11,000,000 of chr 20 and with most annotations removed private static final File GNOMAD = new File(largeFileTestDir, "very-small-gnomad.vcf"); + void checkPonVariants(final List ponVariants, final List inputVariants) { + for (int n = 0; n < ponVariants.size(); n++) { + final VariantContext ponVC = ponVariants.get(n); + + final VariantContext inputVC = inputVariants.get(n); + + Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles()); + + // by construction, every pon sample has each variant + final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0); + Assert.assertEquals(ponFraction, 1.0, 1.0e-6); + + List ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0); + final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1)); + + final int[] AD = inputVC.getGenotype(0).getAD(); + final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD); + + Assert.assertEquals(ponAF, empiricalAF, 0.2); + } + } + @Test public void testTwoIdenticalVcfs() throws IOException { final File vcf1 = new File(PON_VCFS_DIR, "sample1.vcf"); @@ -55,23 +78,15 @@ public void testTwoIdenticalVcfs() throws IOException { // this checks that even filtered variants make it into the pon Assert.assertEquals(ponVariants.size(), inputVariants.size()); - for (int n = 0; n < ponVariants.size(); n++) { - final VariantContext ponVC = ponVariants.get(n); - final VariantContext inputVC = inputVariants.get(n); - Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles()); - - // by construction, every pon sample has each variant - final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0); - Assert.assertEquals(ponFraction, 1.0, 1.0e-6); - - final List ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0); - final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1)); - - final int[] AD = inputVC.getGenotype(0).getAD(); - final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD); + checkPonVariants(ponVariants, inputVariants); - Assert.assertEquals(ponAF, empiricalAF, 0.2); - } + // Test again by requesting BCF codec stream from GenomicsDBFeatureReader + args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true); + Utils.resetRandomGenerator(); + runCommandLine(args); + final List ponVariantsWithBCFCodecStreaming = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()); + Assert.assertEquals(ponVariantsWithBCFCodecStreaming.size(), inputVariants.size()); + checkPonVariants(ponVariantsWithBCFCodecStreaming, inputVariants); } @Test @@ -95,6 +110,12 @@ public void testJustOneVcf() throws IOException { final List ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()); Assert.assertTrue(ponVariants.isEmpty()); + + // Test again by requesting BCF codec stream from GenomicsDBFeatureReader + args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true); + Utils.resetRandomGenerator(); + runCommandLine(args); + Assert.assertTrue(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()).isEmpty()); } @@ -124,6 +145,30 @@ public void testGnomad() throws IOException { // here is a variant that should definitely be skipped as germline Assert.assertTrue(ponVariants.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117)); + + // Test again by requesting BCF codec stream from GenomicsDBFeatureReader + args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true); + Utils.resetRandomGenerator(); + runCommandLine(args); + final List ponVariantsFromBCFCodecStream = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()); + Assert.assertTrue(!ponVariantsFromBCFCodecStream.isEmpty()); + Assert.assertTrue(ponVariantsFromBCFCodecStream.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117)); + } + + void checkPonVariantsFromThreeVcfs(final List ponVariants) { + Assert.assertEquals(ponVariants.size(), 3); + + final double[] ponFractions = ponVariants.stream() + .mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0)) + .toArray(); + Assert.assertEquals(ponFractions[0], 2.0/3, 0.01); + Assert.assertEquals(ponFractions[1], 1, 0.01); + Assert.assertEquals(ponFractions[2], 1, 0.01); + + final List betas = ponVariants.stream().map(vc -> { + final List ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0); + return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1)); + }).collect(Collectors.toList()); } /** @@ -154,19 +199,12 @@ public void testThreeVcfs() throws IOException { runCommandLine(args); final List ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()); - Assert.assertEquals(ponVariants.size(), 3); - - final double[] ponFractions = ponVariants.stream() - .mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0)) - .toArray(); - Assert.assertEquals(ponFractions[0], 2.0/3, 0.01); - Assert.assertEquals(ponFractions[1], 1, 0.01); - Assert.assertEquals(ponFractions[2], 1, 0.01); - - final List betas = ponVariants.stream().map(vc -> { - final List ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0); - return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1)); - }).collect(Collectors.toList()); + checkPonVariantsFromThreeVcfs(ponVariants); + // Test again by requesting BCF codec stream from GenomicsDBFeatureReader + args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true); + Utils.resetRandomGenerator(); + runCommandLine(args); + checkPonVariantsFromThreeVcfs(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList())); } } \ No newline at end of file