Skip to content

Commit

Permalink
Bigger Permutect tensors and Permutect test datasets can be annotated…
Browse files Browse the repository at this point in the history
… with truth VCF (#8836)

* added 20 more Permutect read features

* Permutect test data can, like training data, be annotated with a truth VCF
  • Loading branch information
davidbenjamin committed May 17, 2024
1 parent a3bbfc4 commit 4ed93fe
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand All @@ -20,6 +22,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* For each sample and for each allele a list feature vectors of supporting reads
Expand All @@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
private static final List<Integer> RANGES = List.of(5, 10, 25, 50);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;

private FeaturizedReadSets() { }

Expand Down Expand Up @@ -92,9 +100,9 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
result.add(read.isReverseStrand() ? 1 : 0);

// distances from ends of read
final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPosition);
result.add(read.getLength() - readPosition);
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
result.add(read.getLength() - readPositionOfVariantStart);


result.add(Math.abs(read.getFragmentLength()));
Expand Down Expand Up @@ -123,15 +131,64 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);

for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);

final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
asm.stepForwardOnGenome();
final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();

while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);

// pick which array's features we are accounting. If the ranges are 5, 10, 25, 50 and the distance is, say 8, then the '<= 10' range is relevant
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isAfterInsertion()) {
featuresToAddTo[0]++;
}

if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];

if (base != haplotypeBase) {
featuresToAddTo[2]++;
}

if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}

for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ public double getDefaultAlleleFrequency() {
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
ULTIMA(11);
ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES),
ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES);

final private int numReadFeatures;

Expand All @@ -229,6 +229,10 @@ public int getNumReadFeatures() {
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
 * If this VCF is not given, the dataset is generated with a weak-labelling strategy based on allele fractions.
*
* Although the normal use of this input is in generating training data, it can also be used when generating test data
* for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as
* usual, and Permutect uses the labels to analyze the quality of its results.
*/
@Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
public FeatureInput<VariantContext> mutect3TrainingTruth;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int diff = altAlleleString.length() - refAllele.length();
final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);

if (trainingMode) {
if (trainingMode) { // training mode -- collecting tensors to train the Permutect artifact model
final ArrayBlockingQueue<Integer> unmatchedQueue = unmatchedArtifactAltCounts.get(type);
final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;

Expand Down Expand Up @@ -182,8 +182,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
} else {
labels.add(Label.IGNORE);
}
} else {
labels.add(Label.UNLABELED);
} else { // not training mode -- we are generating tensors in order to apply the Permutect artifact model to a callset
if (truthVCs.isPresent()) {
// here, for the purposes of test data, both sequencing errors and technical artifacts get the "ARTIFACT" label
labels.add(truthAlleles.contains(remappedAltAlelle) ? Label.VARIANT : Label.ARTIFACT);
} else {
labels.add(Label.UNLABELED);
}
}
}

Expand Down

0 comments on commit 4ed93fe

Please sign in to comment.