Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow custom ploidy regions in HaplotypeCaller #8464

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.NamedFeature;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
Expand All @@ -19,7 +21,17 @@
public SimpleCount(final SimpleInterval interval,
final int count) {
this.interval = Utils.nonNull(interval);
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count.");
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd() + ".");
}

public SimpleCount(final NamedFeature namedFeature) {

Check warning on line 27 in src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java#L27

Added line #L27 was not covered by tests
try {
int count = Integer.parseInt(namedFeature.getName());
this.interval = new SimpleInterval(namedFeature);
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + ".");
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Error parsing name into integer for feature at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + ".");
}

Check warning on line 34 in src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/copynumber/formats/records/SimpleCount.java#L29-L34

Added lines #L29 - L34 were not covered by tests
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public class GenotypeGVCFsEngine
private VariantAnnotatorEngine annotationEngine = null;

//the genotyping engine
private GenotypingEngine<?> forceOutputGenotypingEngine = null;
private GenotypingEngine forceOutputGenotypingEngine = null;
private MinimalGenotypingEngine genotypingEngine = null;

// the INFO field annotation key names to remove
Expand Down Expand Up @@ -303,7 +303,7 @@ private VariantContext callSomaticGenotypes(final VariantContext vc, double tlod
final VariantContextBuilder builder = new VariantContextBuilder(vc);
final VariantContext regenotypedVC = builder.genotypes(newGenotypes).make();

final int maxAltAlleles = genotypingEngine.getConfiguration().genotypeArgs.maxAlternateAlleles;
final int maxAltAlleles = genotypingEngine.getGenotypeArgs().maxAlternateAlleles;
List<Allele> allelesToKeep;

//we need to make sure all alleles pass the tlodThreshold
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ public GenotypeCalculationArgumentCollection clone() {
* Using this argument instructs the genotyper to annotate (in the INFO field) the number of alternate alleles that were originally discovered at the site.
*/
@Argument(fullName = "annotate-with-num-discovered-alleles", doc = "If provided, we will annotate records with the number of alternate alleles that were discovered (but not necessarily genotyped) at a given site", optional = true)
public boolean ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED = false;
public boolean annotateNumberOfAllelesDiscovered = false;

/**
* The expected heterozygosity value used to compute prior probability that a locus is non-reference.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,13 @@
/**
* Base class for genotyper engines.
*/
public abstract class GenotypingEngine<Config extends StandardCallerArgumentCollection> {
public abstract class GenotypingEngine {

private final static int TOO_LONG_PL = 100000;

protected final AlleleFrequencyCalculator alleleFrequencyCalculator;

protected final Config configuration;
private final StandardCallerArgumentCollection configuration;

protected VariantAnnotatorEngine annotationEngine;

Expand Down Expand Up @@ -63,7 +63,7 @@ public abstract class GenotypingEngine<Config extends StandardCallerArgumentColl
*
* @throws IllegalArgumentException if any of {@code samples}, {@code configuration} is {@code null} or if {@code samples} is empty.
*/
protected GenotypingEngine(final Config configuration,
protected GenotypingEngine(final StandardCallerArgumentCollection configuration,
final SampleList samples,
final boolean doAlleleSpecificCalcs) {
this.configuration = Utils.nonNull(configuration, "the configuration cannot be null");
Expand All @@ -89,12 +89,16 @@ public void setLogger(final Logger logger) {

public Set<VCFInfoHeaderLine> getAppropriateVCFInfoHeaders() {
final Set<VCFInfoHeaderLine> headerInfo = new LinkedHashSet<>();
if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) {
if ( getGenotypeArgs().annotateNumberOfAllelesDiscovered) {
headerInfo.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY));
}
return headerInfo;
}

public GenotypeCalculationArgumentCollection getGenotypeArgs() {
return getConfiguration().genotypeArgs;
}

/**
* Changes the annotation engine for this genotyping-engine.
*
Expand All @@ -109,7 +113,7 @@ public void setAnnotationEngine(final VariantAnnotatorEngine annotationEngine) {
*
* @return never {@code null}.
*/
public Config getConfiguration() {
protected StandardCallerArgumentCollection getConfiguration() {
return configuration;
}

Expand All @@ -131,8 +135,8 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype
return null;
}

final int defaultPloidy = configuration.genotypeArgs.samplePloidy;
final int maxAltAlleles = configuration.genotypeArgs.maxAlternateAlleles;
final int defaultPloidy = getGenotypeArgs().samplePloidy;
final int maxAltAlleles = getGenotypeArgs().maxAlternateAlleles;

VariantContext reducedVC = vc;
if (maxAltAlleles < vc.getAlternateAlleles().size()) {
Expand All @@ -156,7 +160,7 @@ public VariantContext calculateGenotypes(final VariantContext vc, final Genotype

// note the math.abs is necessary because -10 * 0.0 => -0.0 which isn't nice
final double log10Confidence =
!outputAlternativeAlleles.siteIsMonomorphic || configuration.annotateAllSitesWithPLs
!outputAlternativeAlleles.siteIsMonomorphic || getConfiguration().annotateAllSitesWithPLs
? AFresult.log10ProbOnlyRefAlleleExists() + 0.0 : AFresult.log10ProbVariantPresent() + 0.0;

// Add 0.0 removes -0.0 occurrences.
Expand Down Expand Up @@ -188,11 +192,11 @@ && noAllelesOrFirstAlleleIsNotNonRef(outputAlternativeAlleles.alleles) && forced
// create the genotypes
//TODO: omit subsetting if output alleles is not a proper subset of vc.getAlleles
final GenotypesContext genotypes = outputAlleles.size() == 1 ? GATKVariantContextUtils.subsetToRefOnly(vc, defaultPloidy) :
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, configuration.genotypeArgs.genotypeAssignmentMethod);
AlleleSubsettingUtils.subsetAlleles(vc.getGenotypes(), defaultPloidy, vc.getAlleles(), outputAlleles, gpc, getGenotypeArgs().genotypeAssignmentMethod);

if (configuration.genotypeArgs.usePosteriorProbabilitiesToCalculateQual && hasPosteriors(genotypes)) {
if (getGenotypeArgs().usePosteriorProbabilitiesToCalculateQual && hasPosteriors(genotypes)) {
final double log10NoVariantPosterior = phredNoVariantPosteriorProbability(outputAlleles, genotypes) * -.1;
final double qualUpdate = !outputAlternativeAlleles.siteIsMonomorphic || configuration.annotateAllSitesWithPLs
final double qualUpdate = !outputAlternativeAlleles.siteIsMonomorphic || getConfiguration().annotateAllSitesWithPLs
? log10NoVariantPosterior + 0.0 : MathUtils.log10OneMinusPow10(log10NoVariantPosterior) + 0.0;
if (!Double.isNaN(qualUpdate)) {
builder.log10PError(qualUpdate);
Expand Down Expand Up @@ -260,30 +264,23 @@ static boolean noAllelesOrFirstAlleleIsNotNonRef(List<Allele> altAlleles) {
protected abstract String callSourceString();

/**
* Holds information about the alternative allele subsetting based on supporting evidence, genotyping and
* output modes.
*/
private static class OutputAlleleSubset {
private final List<Allele> alleles;
private final boolean siteIsMonomorphic;
private final List<Integer> mleCounts;

private OutputAlleleSubset(final List<Allele> alleles, final List<Integer> mleCounts, final boolean siteIsMonomorphic) {
* Holds information about the alternative allele subsetting based on supporting evidence, genotyping and
* output modes.
*/
private record OutputAlleleSubset(List<Allele> alleles, List<Integer> mleCounts, boolean siteIsMonomorphic) {
private OutputAlleleSubset {
Utils.nonNull(alleles, "alleles");
Utils.nonNull(mleCounts, "mleCounts");
this.siteIsMonomorphic = siteIsMonomorphic;
this.alleles = alleles;
this.mleCounts = mleCounts;
}

private List<Allele> outputAlleles(final Allele referenceAllele) {
return Stream.concat(Stream.of(referenceAllele), alleles.stream()).collect(Collectors.toList());
}
private List<Allele> outputAlleles(final Allele referenceAllele) {
return Stream.concat(Stream.of(referenceAllele), alleles.stream()).collect(Collectors.toList());
}

public List<Integer> alternativeAlleleMLECounts() {
return mleCounts;
public List<Integer> alternativeAlleleMLECounts() {
return mleCounts;
}
}
}


/**
Expand All @@ -308,7 +305,7 @@ private OutputAlleleSubset calculateOutputAlleleSubset(final AFCalculationResult
// we want to keep the NON_REF symbolic allele but only in the absence of a non-symbolic allele, e.g.
// if we combined a ref / NON_REF gVCF with a ref / alt gVCF
final boolean isNonRefWhichIsLoneAltAllele = alternativeAlleleCount == 1 && allele.equals(Allele.NON_REF_ALLELE);
final boolean isPlausible = afCalculationResult.passesThreshold(allele, configuration.genotypeArgs.standardConfidenceForCalling);
final boolean isPlausible = afCalculationResult.passesThreshold(allele, getGenotypeArgs().standardConfidenceForCalling);

//it's possible that the upstream deletion that spanned this site was not emitted, mooting the symbolic spanning deletion allele
final boolean isSpuriousSpanningDeletion = GATKVCFConstants.isSpanningDeletion(allele) && !isVcCoveredByDeletion(vc);
Expand Down Expand Up @@ -415,16 +412,16 @@ protected final boolean cannotBeGenotyped(final VariantContext vc) {
* This does not necessarily emit calls for all sites in a region (see {@link OutputMode}).
*/
protected boolean emitAllActiveSites() {
return configuration.outputMode == OutputMode.EMIT_ALL_ACTIVE_SITES;
return getConfiguration().outputMode == OutputMode.EMIT_ALL_ACTIVE_SITES;
}

protected final boolean passesEmitThreshold(final double conf, final boolean bestGuessIsRef) {
return (configuration.outputMode == OutputMode.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) &&
return (getConfiguration().outputMode == OutputMode.EMIT_ALL_CONFIDENT_SITES || !bestGuessIsRef) &&
passesCallThreshold(conf);
}

protected final boolean passesCallThreshold(final double conf) {
return conf >= configuration.genotypeArgs.standardConfidenceForCalling;
return conf >= getGenotypeArgs().standardConfidenceForCalling;
}

protected Map<String,Object> composeCallAttributes(final VariantContext vc, final List<Integer> alleleCountsofMLE,
Expand Down Expand Up @@ -457,7 +454,7 @@ protected Map<String,Object> composeCallAttributes(final VariantContext vc, fina
attributes.put(GATKVCFConstants.AS_QUAL_KEY, perAlleleQuals.stream().mapToInt(q -> Math.round(q)).boxed().collect(Collectors.toList()));
}

if ( configuration.genotypeArgs.ANNOTATE_NUMBER_OF_ALLELES_DISCOVERED ) {
if ( getGenotypeArgs().annotateNumberOfAllelesDiscovered) {
attributes.put(GATKVCFConstants.NUMBER_OF_DISCOVERED_ALLELES_KEY, vc.getAlternateAlleles().size());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
* used only by the HaplotypeCaller for its isActive() determination. Should not be used for
* any other purpose!
*/
public final class MinimalGenotypingEngine extends GenotypingEngine<StandardCallerArgumentCollection> {
public final class MinimalGenotypingEngine extends GenotypingEngine {

private final DragstrParams dragstrParams;

Expand Down Expand Up @@ -61,7 +61,7 @@

@Override
protected boolean forceKeepAllele(final Allele allele) {
return configuration.annotateAllSitesWithPLs;
return getConfiguration().annotateAllSitesWithPLs;
}

@Override
Expand All @@ -71,8 +71,8 @@

@Override
public VariantContext calculateGenotypes(final VariantContext vc) {
if (dragstrParams == null || getConfiguration().genotypeArgs.dontUseDragstrPriors || !GATKVariantContextUtils.containsInlineIndel(vc) || referenceContext == null) {
final GenotypePriorCalculator gpc = GenotypePriorCalculator.assumingHW(configuration.genotypeArgs);
if (dragstrParams == null || getGenotypeArgs().dontUseDragstrPriors || !GATKVariantContextUtils.containsInlineIndel(vc) || referenceContext == null) {
final GenotypePriorCalculator gpc = GenotypePriorCalculator.assumingHW(getGenotypeArgs());
return calculateGenotypes(vc, gpc, Collections.emptyList());
} else {

Expand All @@ -90,7 +90,7 @@
final int period = analyzer.period(startOffset);
final int repeats = analyzer.repeatLength(startOffset);

final GenotypePriorCalculator gpc = GenotypePriorCalculator.givenDragstrParams(dragstrParams, period, repeats, Math.log10(getConfiguration().genotypeArgs.snpHeterozygosity), 2.0);
final GenotypePriorCalculator gpc = GenotypePriorCalculator.givenDragstrParams(dragstrParams, period, repeats, Math.log10(getGenotypeArgs().snpHeterozygosity), 2.0);

Check warning on line 93 in src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/MinimalGenotypingEngine.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/org/broadinstitute/hellbender/tools/walkers/genotyper/MinimalGenotypingEngine.java#L93

Added line #L93 was not covered by tests
return calculateGenotypes(vc, gpc, Collections.emptyList());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
/**
* This is pulled out so that every caller isn't exposed to the arguments from every other caller.
*/
public class StandardCallerArgumentCollection implements Serializable {
public final class StandardCallerArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public class AlleleFilteringHC extends AlleleFiltering {
public AlleleFilteringHC(HaplotypeCallerArgumentCollection _hcargs, OutputStreamWriter assemblyDebugStream, HaplotypeCallerGenotypingEngine _genotypingEngine){
super(_hcargs, assemblyDebugStream);
genotypingEngine = _genotypingEngine;
GenotypeCalculationArgumentCollection config = genotypingEngine.getConfiguration().genotypeArgs;
GenotypeCalculationArgumentCollection config = genotypingEngine.getGenotypeArgs();
afCalc = AlleleFrequencyCalculator.makeCalculator(config);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@
* argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause
* performance challenges including excessive slowness. We are working on resolving these limitations.</p>
*
* <p>For having variable ploidy in different regions, like making haploid calls outside the PAR on chrX or chrY,
* see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
* should be a .bed or .interval_list with "name" column containing the desired ploidy to use in that region
* when genotyping. Note that variants near the boundary may not have the matching ploidy since the ploidy used will
* be determined using the following precedence: </p>
* <ol>
* <li>ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant
* (with ties broken by using largest ploidy); note ploidy interval may only overlap the active region and determine
* the ploidy of your variant even if the end coordinate written for your variant lies outside the given region;</li>
* <li>ploidy given via global -ploidy flag;</li>
* <li>ploidy determined by the default global built-in constant for humans (2).</li>
rickymagner marked this conversation as resolved.
Show resolved Hide resolved
* </ol>
*
* <p>Coordinates for the PAR for CRCh38 can be found <a href='http://useast.ensembl.org/info/genome/genebuild/human_PARS.html'>here</a>.</p>
*
* <h3>Additional Notes</h3>
* <ul>
* <li>When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).</li>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.arrow.util.VisibleForTesting;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
Expand All @@ -23,9 +27,11 @@
/**
* Set of arguments for the {@link HaplotypeCallerEngine}
*/
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final String PLOIDY_REGIONS_NAME = "ploidy-regions";

public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality";
Expand Down Expand Up @@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
}

@Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true)
public FeatureInput<NamedFeature> ploidyRegions = null;

/**
* You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This
* is especially useful if your samples are all in the same file but you need to run them individually through HC
Expand Down Expand Up @@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() {
return flowMode != FlowMode.NONE;
}

// Copy method used to create new hcArgs with same fields except input ploidy model
public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) {
jamesemery marked this conversation as resolved.
Show resolved Hide resolved
HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this);
newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy;
return newArgsWithNewPloidy;
}

/**
* the different flow modes, in terms of their parameters and their values
Expand Down
Loading
Loading