Skip to content

Commit

Permalink
Support for custom ploidy regions in HaplotypeCaller (#8609)
Browse files Browse the repository at this point in the history
For having variable ploidy in different regions, like making haploid calls outside the PAR on chrX or chrY, 
there is now a --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
should be a .bed or .interval_list with "name" column containing the desired ploidy to use in that region
when genotyping. Note that variants near the boundary may not have the matching ploidy since the ploidy used will be determined using the following precedence:

* ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant
  with ties broken by using largest ploidy); note ploidy interval may only overlap the active region and determine 
  the ploidy of your variant even if the end coordinate written for your variant lies outside the given region
* ploidy given via global -ploidy flag
* ploidy determined by the default global built-in constant for humans (2).

---------

Co-authored-by: Ty Kay <[email protected]>
Co-authored-by: rickymagner <[email protected]>
  • Loading branch information
3 people committed Dec 11, 2023
1 parent e29cbc3 commit 0b18579
Show file tree
Hide file tree
Showing 11 changed files with 598 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.tribble.NamedFeature;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.param.ParamUtils;
Expand All @@ -19,7 +21,17 @@ public class SimpleCount implements Locatable, Feature {
public SimpleCount(final SimpleInterval interval,
final int count) {
this.interval = Utils.nonNull(interval);
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count.");
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd() + ".");
}

public SimpleCount(final NamedFeature namedFeature) {
try {
int count = Integer.parseInt(namedFeature.getName());
this.interval = new SimpleInterval(namedFeature);
this.count = ParamUtils.isPositiveOrZero(count, "Can't construct SimpleCount with negative count at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + ".");
} catch (NumberFormatException e) {
throw new IllegalArgumentException("Error parsing name into integer for feature at " + namedFeature.getContig() + ":" + namedFeature.getStart() + "-" + namedFeature.getEnd() + ".");
}
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@
* argument. Note however that very high ploidies (such as are encountered in large pooled experiments) may cause
* performance challenges including excessive slowness. We are working on resolving these limitations.</p>
*
* <p>For having variable ploidy in different regions, like making haploid calls outside the PAR on chrX or chrY,
* see the --ploidy-regions flag. The -ploidy flag sets the default ploidy to use everywhere, and --ploidy-regions
* should be a .bed or .interval_list with "name" column containing the desired ploidy to use in that region
* when genotyping. Note that variants near the boundary may not have the matching ploidy since the ploidy used will
* be determined using the following precedence: </p>
* <ol>
* <li>ploidy given in --ploidy-regions for all intervals overlapping the active region when calling your variant
* (with ties broken by using largest ploidy); note ploidy interval may only overlap the active region and determine
* the ploidy of your variant even if the end coordinate written for your variant lies outside the given region;</li>
* <li>ploidy given via global -ploidy flag;</li>
* <li>ploidy determined by the default global built-in constant for humans (2).</li>
* </ol>
*
* <p>Coordinates for the PAR for CRCh38 can be found <a href='http://useast.ensembl.org/info/genome/genebuild/human_PARS.html'>here</a>.</p>
*
* <h3>Additional Notes</h3>
* <ul>
* <li>When working with PCR-free data, be sure to set `-pcr_indel_model NONE` (see argument below).</li>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,18 @@
package org.broadinstitute.hellbender.tools.walkers.haplotypecaller;

import htsjdk.tribble.NamedFeature;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.arrow.util.VisibleForTesting;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.SerializationUtils;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.Hidden;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.DbsnpArgumentCollection;
import org.broadinstitute.hellbender.engine.FeatureDataSource;
import org.broadinstitute.hellbender.engine.FeatureInput;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.spark.AssemblyRegionArgumentCollection;
Expand All @@ -23,9 +27,11 @@
/**
* Set of arguments for the {@link HaplotypeCallerEngine}
*/
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable{
public class HaplotypeCallerArgumentCollection extends AssemblyBasedCallerArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final String PLOIDY_REGIONS_NAME = "ploidy-regions";

public static final String GQ_BAND_LONG_NAME = "gvcf-gq-bands";
public static final String GQ_BAND_SHORT_NAME = "GQB";
public static final String DO_NOT_CORRECT_OVERLAPPING_BASE_QUALITIES_LONG_NAME = "do-not-correct-overlapping-quality";
Expand Down Expand Up @@ -61,6 +67,9 @@ protected ReadThreadingAssemblerArgumentCollection getReadThreadingAssemblerArgu
return new HaplotypeCallerReadThreadingAssemblerArgumentCollection();
}

@Argument(fullName = PLOIDY_REGIONS_NAME, shortName = PLOIDY_REGIONS_NAME, doc = "Interval file with column specifying desired ploidy for genotyping models. Overrides default ploidy and user-provided --ploidy argument in specific regions.", optional = true)
public FeatureInput<NamedFeature> ploidyRegions = null;

/**
* You can use this argument to specify that HC should process a single sample out of a multisample BAM file. This
* is especially useful if your samples are all in the same file but you need to run them individually through HC
Expand Down Expand Up @@ -312,6 +321,12 @@ boolean isFlowBasedCallingMode() {
return flowMode != FlowMode.NONE;
}

// Copy method used to create new hcArgs with same fields except input ploidy model
public HaplotypeCallerArgumentCollection copyWithNewPloidy(int ploidy) {
HaplotypeCallerArgumentCollection newArgsWithNewPloidy = SerializationUtils.clone(this);
newArgsWithNewPloidy.standardArgs.genotypeArgs.samplePloidy = ploidy;
return newArgsWithNewPloidy;
}

/**
* the different flow modes, in terms of their parameters and their values
Expand Down
Loading

0 comments on commit 0b18579

Please sign in to comment.