Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose GenomicsDBArgumentCollection with CreateSomaticPanelOfNormals #6746

Merged
merged 1 commit into from
Dec 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.BetaFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBOptions;
import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape;
import org.broadinstitute.hellbender.tools.walkers.validation.basicshortmutpileup.BetaBinomialDistribution;
import org.broadinstitute.hellbender.utils.MathUtils;
Expand Down Expand Up @@ -115,10 +118,21 @@ public class CreateSomaticPanelOfNormals extends VariantWalker {
@Argument(fullName= MAX_GERMLINE_PROBABILITY_LONG_NAME, doc="Skip genotypes with germline probability greater than this value", optional = true)
public double maxGermlineProbability = DEFAULT_MAX_GERMLINE_PROBABILITY;

@ArgumentCollection
private GenomicsDBArgumentCollection genomicsdbArgs = new GenomicsDBArgumentCollection();

private VariantContextWriter vcfWriter;

private int numSamples;

@Override
protected GenomicsDBOptions getGenomicsDBOptions() {
if (genomicsDBOptions == null) {
genomicsDBOptions = new GenomicsDBOptions(referenceArguments.getReferencePath(), genomicsdbArgs);
}
return genomicsDBOptions;
}

@Override
public void onTraversalStart() {
final Set<VCFHeaderLine> headerInfo = new HashSet<>(getDefaultToolVCFHeaderLines());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.testutils.GenomicsDBTestUtils;
import org.broadinstitute.hellbender.testutils.VariantContextTestUtils;
import org.broadinstitute.hellbender.tools.genomicsdb.GenomicsDBArgumentCollection;
import org.broadinstitute.hellbender.tools.walkers.readorientation.BetaDistributionShape;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
Expand All @@ -29,6 +30,28 @@ public class CreateSomaticPanelOfNormalsIntegrationTest extends CommandLineProgr
// positions 10,000,000 - 11,000,000 of chr 20 and with most annotations removed
private static final File GNOMAD = new File(largeFileTestDir, "very-small-gnomad.vcf");

void checkPonVariants(final List<VariantContext> ponVariants, final List<VariantContext> inputVariants) {
for (int n = 0; n < ponVariants.size(); n++) {
final VariantContext ponVC = ponVariants.get(n);

final VariantContext inputVC = inputVariants.get(n);

Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles());

// by construction, every pon sample has each variant
final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0);
Assert.assertEquals(ponFraction, 1.0, 1.0e-6);

List<Double> ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1));

final int[] AD = inputVC.getGenotype(0).getAD();
final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD);

Assert.assertEquals(ponAF, empiricalAF, 0.2);
}
}

@Test
public void testTwoIdenticalVcfs() throws IOException {
final File vcf1 = new File(PON_VCFS_DIR, "sample1.vcf");
Expand All @@ -55,23 +78,15 @@ public void testTwoIdenticalVcfs() throws IOException {
// this checks that even filtered variants make it into the pon
Assert.assertEquals(ponVariants.size(), inputVariants.size());

for (int n = 0; n < ponVariants.size(); n++) {
final VariantContext ponVC = ponVariants.get(n);
final VariantContext inputVC = inputVariants.get(n);
Assert.assertEquals(ponVC.getAlleles(), inputVC.getAlleles());

// by construction, every pon sample has each variant
final double ponFraction = ponVC.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0);
Assert.assertEquals(ponFraction, 1.0, 1.0e-6);

final List<Double> ponBeta = ponVC.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
final double ponAF = ponBeta.get(0) / (ponBeta.get(0) + ponBeta.get(1));

final int[] AD = inputVC.getGenotype(0).getAD();
final double empiricalAF = (double) (MathUtils.sum(AD) - AD[0]) / MathUtils.sum(AD);
checkPonVariants(ponVariants, inputVariants);

Assert.assertEquals(ponAF, empiricalAF, 0.2);
}
// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
final List<VariantContext> ponVariantsWithBCFCodecStreaming = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertEquals(ponVariantsWithBCFCodecStreaming.size(), inputVariants.size());
checkPonVariants(ponVariantsWithBCFCodecStreaming, inputVariants);
}

@Test
Expand All @@ -95,6 +110,12 @@ public void testJustOneVcf() throws IOException {

final List<VariantContext> ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertTrue(ponVariants.isEmpty());

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
Assert.assertTrue(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()).isEmpty());
}


Expand Down Expand Up @@ -124,6 +145,30 @@ public void testGnomad() throws IOException {

// here is a variant that should definitely be skipped as germline
Assert.assertTrue(ponVariants.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117));

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
final List<VariantContext> ponVariantsFromBCFCodecStream = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertTrue(!ponVariantsFromBCFCodecStream.isEmpty());
Assert.assertTrue(ponVariantsFromBCFCodecStream.stream().mapToInt(VariantContext::getStart).noneMatch(start -> start == 10_000_117));
}

void checkPonVariantsFromThreeVcfs(final List<VariantContext> ponVariants) {
Assert.assertEquals(ponVariants.size(), 3);

final double[] ponFractions = ponVariants.stream()
.mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0))
.toArray();
Assert.assertEquals(ponFractions[0], 2.0/3, 0.01);
Assert.assertEquals(ponFractions[1], 1, 0.01);
Assert.assertEquals(ponFractions[2], 1, 0.01);

final List<BetaDistributionShape> betas = ponVariants.stream().map(vc -> {
final List<Double> ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1));
}).collect(Collectors.toList());
}

/**
Expand Down Expand Up @@ -154,19 +199,12 @@ public void testThreeVcfs() throws IOException {
runCommandLine(args);

final List<VariantContext> ponVariants = VariantContextTestUtils.streamVcf(output).collect(Collectors.toList());
Assert.assertEquals(ponVariants.size(), 3);

final double[] ponFractions = ponVariants.stream()
.mapToDouble(vc -> vc.getAttributeAsDouble(CreateSomaticPanelOfNormals.FRACTION_INFO_FIELD, 0.0))
.toArray();
Assert.assertEquals(ponFractions[0], 2.0/3, 0.01);
Assert.assertEquals(ponFractions[1], 1, 0.01);
Assert.assertEquals(ponFractions[2], 1, 0.01);

final List<BetaDistributionShape> betas = ponVariants.stream().map(vc -> {
final List<Double> ponBeta = vc.getAttributeAsDoubleList(CreateSomaticPanelOfNormals.BETA_SHAPE_INFO_FIELD, 1.0);
return new BetaDistributionShape(ponBeta.get(0), ponBeta.get(1));
}).collect(Collectors.toList());
checkPonVariantsFromThreeVcfs(ponVariants);

// Test again by requesting BCF codec stream from GenomicsDBFeatureReader
args.add(GenomicsDBArgumentCollection.USE_BCF_CODEC_LONG_NAME, true);
Utils.resetRandomGenerator();
runCommandLine(args);
checkPonVariantsFromThreeVcfs(VariantContextTestUtils.streamVcf(output).collect(Collectors.toList()));
}
}