LocusDepthtoBAF tool #7776

Merged Apr 28, 2022 · 4 commits
Changes from 3 commits
@@ -13,17 +13,29 @@ public final class BafEvidence implements SVFeature {
private final double value;

public final static String BCI_VERSION = "1.0";
public final static double MISSING_VALUE = -1.;

public BafEvidence( final String sample, final String contig,
final int position, final double value ) {
Utils.nonNull(sample);
Utils.nonNull(contig);
Utils.validateArg(position > 0, "starting position must be positive");
Utils.validateArg(value >= 0. || value == MISSING_VALUE,
"value must be non-negative or missing");
this.sample = sample;
this.contig = contig;
this.position = position;
this.value = value;
}

// value-altering constructor
public BafEvidence( final BafEvidence that, final double value ) {
Contributor:
Can you please add a quick unit test for this?
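A minimal sketch of what such a test could look like (class and method names are hypothetical; getContig()/getStart() are assumed from the Feature interface, getValue() from its usage elsewhere in this PR; GATK's test suite uses TestNG):

import org.testng.Assert;
import org.testng.annotations.Test;

public class BafEvidenceUnitTest {
    @Test
    public void testValueAlteringConstructor() {
        // copy everything but the value from an existing BafEvidence
        final BafEvidence original = new BafEvidence("sample", "chr1", 1000, .25);
        final BafEvidence copy = new BafEvidence(original, .75);
        Assert.assertEquals(copy.getSample(), "sample");
        Assert.assertEquals(copy.getContig(), "chr1");
        Assert.assertEquals(copy.getStart(), 1000);
        Assert.assertEquals(copy.getValue(), .75, 0.);
    }
}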

this.sample = that.sample;
this.contig = that.contig;
this.position = that.position;
this.value = value;
}

public String getSample() {
return sample;
}
@@ -2,6 +2,7 @@

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.util.Locatable;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.*;

@@ -11,25 +12,26 @@ public final class LocusDepth implements SVFeature {
private final String contig;
private final int position;
private final String sample;
private final byte refCall; // index into nucleotideValues
private final int[] depths;
private final int[] depths; // four slots for each call, ACGT

public final static String BCI_VERSION = "1.0";

public LocusDepth( final Locatable loc, final String sample, final byte refCall ) {
this.contig = loc.getContig();
this.position = loc.getStart();
this.sample = sample;
this.refCall = refCall;
this.depths = new int[4];
public LocusDepth( final Locatable loc, final String sample ) {
this(loc.getContig(), loc.getStart(), sample, 0, 0, 0, 0);
}

public LocusDepth( final String contig, final int position,
final String sample, final byte refCall,
public LocusDepth( final String contig, final int position, final String sample,
final int aDepth, final int cDepth, final int gDepth, final int tDepth ) {
Utils.nonNull(contig, "contig may not be null");
Utils.validateArg(position > 0, "starting position must be positive");
Utils.nonNull(sample, "sample must not be null");
Utils.validateArg(aDepth >= 0, "depth must be non-negative");
Utils.validateArg(cDepth >= 0, "depth must be non-negative");
Utils.validateArg(gDepth >= 0, "depth must be non-negative");
Utils.validateArg(tDepth >= 0, "depth must be non-negative");
this.contig = contig;
this.position = position;
this.sample = sample;
this.refCall = refCall;
this.depths = new int[4];
depths[0] = aDepth;
depths[1] = cDepth;
@@ -57,13 +59,11 @@ public int getStart() {
}

public String getSample() { return sample; }
public char getRefCall() {
return (char)refCall;
}
public int getADepth() { return depths[0]; }
public int getCDepth() { return depths[1]; }
public int getGDepth() { return depths[2]; }
public int getTDepth() { return depths[3]; }

public int getTotalDepth() { return depths[0] + depths[1] + depths[2] + depths[3]; }

// idx is index into "ACGT"
public int getDepth( final int idx ) { return depths[idx]; }

@Override
public LocusDepth extractSamples( final Set<String> sampleNames, final Object header ) {
@@ -77,7 +77,7 @@ public boolean equals( final Object obj ) {
}

public boolean equals( final LocusDepth that ) {
return this.position == that.position && this.refCall == that.refCall &&
return this.position == that.position &&
this.depths[0] == that.depths[0] && this.depths[1] == that.depths[1] &&
this.depths[2] == that.depths[2] && this.depths[3] == that.depths[3] &&
Objects.equals(this.contig, that.contig) &&
@@ -86,12 +86,12 @@ public boolean equals( final LocusDepth that ) {

@Override
public int hashCode() {
return Objects.hash(contig, position, sample, refCall, Arrays.hashCode(depths));
return Objects.hash(contig, position, sample, Arrays.hashCode(depths));
}

@Override
public String toString() {
return contig + "\t" + position + "\t" + sample + "\t" + (char)refCall + "\t" +
depths[0] + "\t" + depths[1] + "\t" + depths[2] + "\t" + depths[3];
return sample + "@" + contig + ":" + position +
"[" + depths[0] + "," + depths[1] + "," + depths[2] + "," + depths[3] + "]";
}
}
@@ -0,0 +1,270 @@
package org.broadinstitute.hellbender.tools.sv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils.RunningAverage;
import org.broadinstitute.hellbender.utils.Nucleotide;
import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodec;
import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodecFinder;
import org.broadinstitute.hellbender.utils.codecs.FeatureSink;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
* <p>Merges locus-sorted LocusDepth evidence files, calculates the bi-allelic frequency (baf)
* for each sample and site, and writes these values as a BafEvidence output file.</p>
* <p>Samples at sites that have too few depth counts (controlled by --min-total-depth) and samples
* failing a Pearson's chi square test for goodness of fit to a bi-allelic model are excluded (i.e.,
* only samples that are judged to be heterozygous at a site are included).</p>
* <p>Across all evaluable samples, sites where the samples have bafs with too much variance
* (controlled by --max-std) are excluded. The median baf value at each site is adjusted to be
* exactly 0.5.</p>
*
* <h3>Inputs</h3>
*
* <ul>
* <li>
* One or more LocusDepth evidence files, or a file containing a list of evidence files, one per line.
* </li>
* </ul>
*
* <h3>Output</h3>
*
* <ul>
* <li>
* An output file containing merged BafEvidence from the inputs.
* </li>
* </ul>
*
* <h3>Usage example</h3>
*
* <pre>
* gatk LocusDepthtoBAF \
* -F file1.ld.txt.gz [-F file2.ld.txt.gz ...] \
* -O merged.baf.bci
* </pre>
*
* @author Ted Sharpe &lt;[email protected]&gt;
*/
@CommandLineProgramProperties(
summary = "Convert LocusDepth to BafEvidence",
oneLineSummary = "Convert LocusDepth to BafEvidence",
programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@ExperimentalFeature
public class LocusDepthtoBAF extends MultiFeatureWalker<LocusDepth> {
public static final String LOCUS_DEPTH_FILE_NAME = "locus-depth";
public static final String BAF_SITES_VCF_LONG_NAME = "baf-sites-vcf";
public static final String SAMPLE_NAMES_NAME = "sample-names";
public static final String BAF_EVIDENCE_FILE_NAME = "baf-evidence-output";
public static final String BIALLELIC_ONLY_NAME = "biallelic-only";
public static final String MIN_TOTAL_DEPTH_NAME = "min-total-depth";
public static final String MAX_STD_NAME = "max-std";
public static final String MIN_HET_PROBABILITY = "min-het-probability";
public static final String COMPRESSION_LEVEL_NAME = "compression-level";
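// chi-squared distribution with 1 degree of freedom; the null RandomGenerator is
// acceptable here because only cumulativeProbability() is ever called, never sample()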
public static ChiSquaredDistribution chiSqDist = new ChiSquaredDistribution(null, 1.);

@Argument(
doc = "Locus depth files to process",
fullName = LOCUS_DEPTH_FILE_NAME,
shortName = StandardArgumentDefinitions.FEATURE_SHORT_NAME )
private List<FeatureInput<LocusDepth>> locusDepthFiles;

@Argument(fullName = BAF_SITES_VCF_LONG_NAME,
doc = "Input VCF of SNPs marking loci for allele counts")
public GATKPath bafSitesFile;

@Argument(fullName = SAMPLE_NAMES_NAME,
doc = "Sample names. Necessary only when writing a *.baf.bci output file.", optional = true)
private List<String> sampleNames;

@Argument(
doc = "BAF output file",
fullName = BAF_EVIDENCE_FILE_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME )
private GATKPath bafOutputFile;

@Argument(
doc = "Process only bi-allelic SNP sites",
fullName = BIALLELIC_ONLY_NAME, optional = true )
private boolean biAllelicOnly = true;

@Argument(
doc = "Maximum standard deviation across bafs at a locus (otherwise locus will be ignored)",
fullName = MAX_STD_NAME, optional = true )
private double maxStdDev = .2;

@Argument(
doc = "Minimum total depth at a locus (otherwise locus will be ignored)",
fullName = MIN_TOTAL_DEPTH_NAME, optional = true )
private int minTotalDepth = 10;

@Argument(
doc = "Minimum probability of site being biallelic and heterozygous",
fullName = MIN_HET_PROBABILITY, optional = true )
private double minHetProbability = .5;

@Argument(
doc = "Output compression level",
fullName = COMPRESSION_LEVEL_NAME,
minValue = 0, maxValue = 9, optional = true )
private int compressionLevel = 4;

private FeatureDataSource<VariantContext> bafSitesSource;
private Iterator<VariantContext> bafSitesIterator;
private FeatureSink<BafEvidence> writer;
private final List<LocusDepth> sameLocusBuffer = new ArrayList<>();

@Override
@SuppressWarnings("unchecked")
public void onTraversalStart() {
super.onTraversalStart();
bafSitesSource = new FeatureDataSource<>(bafSitesFile.toPath().toString());
bafSitesIterator = bafSitesSource.iterator();
final SAMSequenceDictionary dict = getDictionary();
dict.assertSameDictionary(bafSitesSource.getSequenceDictionary());
final FeatureOutputCodec<? extends Feature, ? extends FeatureSink<? extends Feature>> codec =
FeatureOutputCodecFinder.find(bafOutputFile);
if ( !codec.getFeatureType().isAssignableFrom(BafEvidence.class) ) {
throw new UserException("We're intending to write BafEvidence, but the feature type " +
"associated with the output file expects features of type " +
codec.getFeatureType().getSimpleName());
}
writer = (FeatureSink<BafEvidence>)codec.makeSink(bafOutputFile, dict, sampleNames,
compressionLevel);
}

@Override
public void apply( LocusDepth locusDepth, Object header,
ReadsContext reads, ReferenceContext ref ) {
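// reached a new locus: flush BAF evidence for the buffered records first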
if ( !sameLocus(locusDepth) ) {
processBuffer();
}
sameLocusBuffer.add(locusDepth);
}

@Override
public Object onTraversalSuccess() {
super.onTraversalSuccess();
if ( !sameLocusBuffer.isEmpty() ) {
processBuffer();
}
bafSitesSource.close();
writer.close();
return null;
}

/** Performs a Pearson's chi square test for goodness of fit to a biallelic model to determine
* heterozygosity. If this fails, BafEvidence.MISSING_VALUE is returned. If the test succeeds,
* the depth of the alt call as a fraction of the total depth is returned.
*/
@VisibleForTesting double calcBAF( final LocusDepth locusDepth,
final int refIndex, final int altIndex ) {
final int totalDepth = locusDepth.getTotalDepth();
if ( totalDepth < minTotalDepth ) {
return BafEvidence.MISSING_VALUE;
}
double expectRefAlt = totalDepth / 2.;
Contributor:
Suggested change:
- double expectRefAlt = totalDepth / 2.;
+ final double expectRefAlt = totalDepth / 2.;
final double altDepth = locusDepth.getDepth(altIndex);
final double refDiff = locusDepth.getDepth(refIndex) - expectRefAlt;
final double altDiff = altDepth - expectRefAlt;
double chiSq = (refDiff * refDiff + altDiff * altDiff) / expectRefAlt;
Contributor:
Suggested change:
- double chiSq = (refDiff * refDiff + altDiff * altDiff) / expectRefAlt;
+ final double chiSq = (refDiff * refDiff + altDiff * altDiff) / expectRefAlt;
double fitProb = 1. - chiSqDist.cumulativeProbability(chiSq);
Contributor:
Add comment that this is a Pearson chi-squared test

Contributor:
Suggested change:
- double fitProb = 1. - chiSqDist.cumulativeProbability(chiSq);
+ final double fitProb = 1. - chiSqDist.cumulativeProbability(chiSq);

if ( fitProb < minHetProbability ) {
return BafEvidence.MISSING_VALUE;
}
return altDepth / totalDepth;
}
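// Worked example (illustrative numbers, not from the PR): at an A/G site with depths
// A=11, C=0, G=9, T=0, totalDepth is 20 and expectRefAlt is 10, so chiSq = (1+1)/10 = 0.2
// and fitProb = 1 - P(chiSq_1 <= 0.2) ~= 0.65, which passes the default minHetProbability
// of 0.5; the returned baf is 9/20 = 0.45. A more skewed site such as A=14, G=6 gives
// chiSq = 3.2 and fitProb ~= 0.07, and is reported as missing.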

private boolean sameLocus( final Locatable locus ) {
if ( sameLocusBuffer.isEmpty() ) {
return true;
}
final LocusDepth curLocus = sameLocusBuffer.get(0);
return curLocus.getContig().equals(locus.getContig()) &&
curLocus.getStart() == locus.getStart();
}

private void processBuffer() {
final VariantContext vc = nextSite();
if ( !sameLocus(vc) ) {
final Locatable loc = sameLocusBuffer.get(0);
throw new UserException("expecting locus " + loc.getContig() + ":" + loc.getStart() +
", but found locus " + vc.getContig() + ":" + vc.getStart() + " in baf sites vcf");
}
final List<Allele> alleles = vc.getAlleles();
final int refIndex = Nucleotide.decode(alleles.get(0).getBases()[0]).ordinal();
if ( refIndex > 3 ) {
throw new UserException("ref call is not [ACGT] in vcf at " + vc.getContig() + ":" + vc.getStart());
}
final int nAlleles = alleles.size();
final int[] altIndices = new int[nAlleles - 1];
for ( int alleleIndex = 1; alleleIndex < nAlleles; ++alleleIndex ) {
final int altIndex = Nucleotide.decode(alleles.get(alleleIndex).getBases()[0]).ordinal();
if ( altIndex > 3 ) {
throw new UserException("alt call is not [ACGT] in vcf at " + vc.getContig() + ":" + vc.getStart());
}
altIndices[alleleIndex - 1] = altIndex;
}
final List<BafEvidence> beList = new ArrayList<>(sameLocusBuffer.size());
for ( final LocusDepth ld : sameLocusBuffer ) {
for ( final int altIndex : altIndices ) {
double baf = calcBAF(ld, refIndex, altIndex);
if ( baf != BafEvidence.MISSING_VALUE ) {
beList.add(new BafEvidence(ld.getSample(), ld.getContig(), ld.getStart(), baf));
break;
}
}
Contributor:
In some cases this will lead to bias. I didn't think through some of the problems multi-allelics are going to cause us with compatibility here. Apologies for flip-flopping on this, but let's stick to bi-allelic for both LD collection and BAF generation. Regardless, it seems like this implementation would be problematic in some edge cases. I think we will need to either provide the sites list to this tool, or go back to storing the ref/alt alleles in LD (which would be fine, probably simpler and faster).

Contributor Author:
I'll remove the multi-allelic option, which kind of weirded me out anyway.
However, I'm confused by the last part of your comment: we do provide the sites list to this tool. That's where I'm getting the ref/alt call info, since it's no longer in the LD records. (See lines 210-223.)

Contributor:
Sorry just realized I missed that, all good to just rip out the multi-allelics.

}
int nBafs = beList.size();
if ( nBafs == 1 ) {
writer.write(new BafEvidence(beList.get(0), .5));
} else {
final RunningAverage ra = new RunningAverage();
for ( final BafEvidence bafEvidence : beList ) {
ra.add(bafEvidence.getValue());
}
final double stddev = ra.stddev();
if ( stddev <= maxStdDev ) {
final double[] vals = new double[nBafs];
for ( int idx = 0; idx != nBafs; ++idx ) {
vals[idx] = beList.get(idx).getValue();
}
Arrays.sort(vals);
int midIdx = nBafs / 2;
double median =
(nBafs & 1) != 0 ? vals[midIdx] : (vals[midIdx] + vals[midIdx - 1])/2.;
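// e.g., bafs {0.42, 0.45, 0.48} have median 0.45, so adj = 0.05 below
// shifts them to {0.47, 0.50, 0.53}, centering the locus at 0.5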
final double adj = .5 - median;
for ( final BafEvidence bafEvidence : beList ) {
writer.write(new BafEvidence(bafEvidence, bafEvidence.getValue()+adj));
}
}
}
sameLocusBuffer.clear();
}

private VariantContext nextSite() {
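// skip sites that are not SNPs (and, when biAllelicOnly is set, sites that are not bi-allelic)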
while ( bafSitesIterator.hasNext() ) {
final VariantContext vc = bafSitesIterator.next();
if ( vc.isSNP() && (!biAllelicOnly || vc.isBiallelic()) ) {
return vc;
}
}
throw new UserException("baf sites vcf exhausted before locus depth data");
}
}