Skip to content

Commit

Permalink
Add GenomicsDBArgumentCollection to CreateSomaticPanelOfNormals tool (#…
Browse files Browse the repository at this point in the history
…6746)

* allow configuring GenomicsDB options when running CreateSomaticPanelOfNormals
  • Loading branch information
nalinigans committed Dec 23, 2022
1 parent aa05918 commit 9ffd649
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 29 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape;
import org.broadinstitute.hellbender.tools.walkers.validation.basicshortmutpileup.BetaBinomialDistribution;
import org.broadinstitute.hellbender.utils.MathUtils;
Expand Down Expand Up @@ -115,10 +118,21 @@ public class CreateSomaticPanelOfNormals extends VariantWalker {
@Argument(fullName= MAX_GERMLINE_PROBABILITY_LONG_NAME, doc="Skip genotypes with germline probability greater than this value", optional = true)
public double maxGermlineProbability = DEFAULT_MAX_GERMLINE_PROBABILITY;

@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();

private VariantContextWriter vcfWriter;

private int numSamples;

@Override
protected GenomicsDBOptions getGenomicsDBOptions() {
if (genomicsDBOptions == null) {
genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath(), genomicsdbArgs);
}
return genomicsDBOptions;
}

@Override
public void onTraversalStart() {
final Set<VCFHeaderLine> headerInfo = new HashSet<>(getDefaultToolVCFHeaderLines());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.GenomicsDBTestUtils;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand All @@ -29,6 +30,28 @@ public class CreateSomaticPanelOfNormalsIntegrationTest extends CommandLineProgr
// positions 10,000,000 - 11,000,000 of chr 20 and with most annotations removed
private static final File GNOMAD = new File(largeFileTestDir, "very-small-gnomad.vcf");

void checkPonVariants(final List<VariantContext> ponVariants, final List<VariantContext> inputVariants) {
for (int n = 0; n < ponVariants.size(); n++) {
final VariantContext ponVC = ponVariants.get(n);

final VariantContext inputVC = inputVariants.get(n);

Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles());

// by construction, every pon sample has each variant
final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0);
Assert.assertEquals(ponFraction, 1.0, 1.0e-6);

List<Double> ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1));

final int[] AD = inputVC.getGenotype(0).getAD();
final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD);

Assert.assertEquals(ponAF, empiricalAF, 0.2);
}
}

@Test
public void testTwoIdenticalVcfs() throws IOException {
final File vcf1 = new File(PON_VCFS_DIR, "sample1.vcf");
Expand All @@ -55,23 +78,15 @@ public void testTwoIdenticalVcfs() throws IOException {
// this checks that even filtered variants make it into the pon
Assert.assertEquals(ponVariants.size(), inputVariants.size());

for (int n = 0; n < ponVariants.size(); n++) {
final VariantContext ponVC = ponVariants.get(n);
final VariantContext inputVC = inputVariants.get(n);
Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles());

// by construction, every pon sample has each variant
final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0);
Assert.assertEquals(ponFraction, 1.0, 1.0e-6);

final List<Double> ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1));

final int[] AD = inputVC.getGenotype(0).getAD();
final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD);
checkPonVariants(ponVariants, inputVariants);

Assert.assertEquals(ponAF, empiricalAF, 0.2);
}
// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
final List<VariantContext> ponVariantsWithBCFCodecStreaming = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertEquals(ponVariantsWithBCFCodecStreaming.size(), inputVariants.size());
checkPonVariants(ponVariantsWithBCFCodecStreaming, inputVariants);
}

@Test
Expand All @@ -95,6 +110,12 @@ public void testJustOneVcf() throws IOException {

final List<VariantContext> ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertTrue(ponVariants.isEmpty());

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
Assert.assertTrue(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()).isEmpty());
}


Expand Down Expand Up @@ -124,6 +145,30 @@ public void testGnomad() throws IOException {

// here is a variant that should definitely be skipped as germline
Assert.assertTrue(ponVariants.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117));

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
final List<VariantContext> ponVariantsFromBCFCodecStream = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertTrue(!ponVariantsFromBCFCodecStream.isEmpty());
Assert.assertTrue(ponVariantsFromBCFCodecStream.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117));
}

void checkPonVariantsFromThreeVcfs(final List<VariantContext> ponVariants) {
Assert.assertEquals(ponVariants.size(), 3);

final double[] ponFractions = ponVariants.stream()
.mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0))
.toArray();
Assert.assertEquals(ponFractions[0], 2.0/3, 0.01);
Assert.assertEquals(ponFractions[1], 1, 0.01);
Assert.assertEquals(ponFractions[2], 1, 0.01);

final List<BetaDistributionShape> betas = ponVariants.stream().map(vc -> {
final List<Double> ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1));
}).collect(Collectors.toList());
}

/**
Expand Down Expand Up @@ -154,19 +199,12 @@ public void testThreeVcfs() throws IOException {
runCommandLine(args);

final List<VariantContext> ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertEquals(ponVariants.size(), 3);

final double[] ponFractions = ponVariants.stream()
.mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0))
.toArray();
Assert.assertEquals(ponFractions[0], 2.0/3, 0.01);
Assert.assertEquals(ponFractions[1], 1, 0.01);
Assert.assertEquals(ponFractions[2], 1, 0.01);

final List<BetaDistributionShape> betas = ponVariants.stream().map(vc -> {
final List<Double> ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1));
}).collect(Collectors.toList());
checkPonVariantsFromThreeVcfs(ponVariants);

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
checkPonVariantsFromThreeVcfs(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()));
}
}

0 comments on commit 9ffd649

Please sign in to comment.