Skip to content

Commit

Permalink
Bigger Permutect tensors and Permutect test datasets can be annotated…
Browse files Browse the repository at this point in the history
… with truth VCF (#8836)

* added 20 more Permutect read features

* Permutect test data can, like training data, be annotated with a truth VCF
  • Loading branch information
davidbenjamin committed May 17, 2024
1 parent a3bbfc4 commit 4ed93fe
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.genotyper.AlleleLikelihoods;
import org.broadinstitute.hellbender.utils.haplotype.Haplotype;
import org.broadinstitute.hellbender.utils.locusiterator.AlignmentStateMachine;
import org.broadinstitute.hellbender.utils.pileup.PileupElement;
import org.broadinstitute.hellbender.utils.read.AlignmentUtils;
import org.broadinstitute.hellbender.utils.read.Fragment;
import org.broadinstitute.hellbender.utils.read.GATKRead;
Expand All @@ -20,6 +22,7 @@

import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

/**
* For each sample and for each allele a list feature vectors of supporting reads
Expand All @@ -33,6 +36,11 @@ public class FeaturizedReadSets {
public static final int DEFAULT_BASE_QUALITY = 25;

private static final SmithWatermanAligner aligner = SmithWatermanAligner.getAligner(SmithWatermanAligner.Implementation.JAVA);
private static final int FEATURES_PER_RANGE = 5;
private static final List<Integer> RANGES = List.of(5, 10, 25, 50);
public static final int NUM_RANGED_FEATURES = FEATURES_PER_RANGE * RANGES.size();
private static final int VERY_BAD_QUAL_THRESHOLD = 10;
private static final int BAD_QUAL_THRESHOLD = 20;

private FeaturizedReadSets() { }

Expand Down Expand Up @@ -92,9 +100,9 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
result.add(read.isReverseStrand() ? 1 : 0);

// distances from ends of read
final int readPosition = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPosition);
result.add(read.getLength() - readPosition);
final int readPositionOfVariantStart = ReadPosition.getPosition(read, vc).orElse(0);
result.add(readPositionOfVariantStart);
result.add(read.getLength() - readPositionOfVariantStart);


result.add(Math.abs(read.getFragmentLength()));
Expand Down Expand Up @@ -123,15 +131,64 @@ private static List<Integer> featurize(final GATKRead read, final VariantContext
vc.getContig(), vc.getStart()));
result.add(3);
result.add(2);

for (int n = 0; n < NUM_RANGED_FEATURES; n++) {
result.add(0);
}
} else {
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotype.getBases(), read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
byte[] haplotypeBases = haplotype.getBases();
final SmithWatermanAlignment readToHaplotypeAlignment = aligner.align(haplotypeBases, read.getBases(), SmithWatermanAlignmentConstants.ALIGNMENT_TO_BEST_HAPLOTYPE_SW_PARAMETERS, SWOverhangStrategy.SOFTCLIP);
final GATKRead copy = read.copy();
copy.setCigar(readToHaplotypeAlignment.getCigar());
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotype.getBases(), readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
final int mismatchCount = AlignmentUtils.getMismatchCount(copy, haplotypeBases, readToHaplotypeAlignment.getAlignmentOffset()).numMismatches;
result.add(mismatchCount);

final long indelsVsBestHaplotype = readToHaplotypeAlignment.getCigar().getCigarElements().stream().filter(el -> el.getOperator().isIndel()).count();
result.add((int) indelsVsBestHaplotype);

final int readStartInHaplotype = readToHaplotypeAlignment.getAlignmentOffset();
final AlignmentStateMachine asm = new AlignmentStateMachine(copy);
asm.stepForwardOnGenome();
final List<int[]> rangedFeatures = RANGES.stream().map(range -> new int[FEATURES_PER_RANGE]).toList();

while (!asm.isRightEdge()) {
final PileupElement pe = asm.makePileupElement();
final int distanceFromVariant = Math.abs(asm.getReadOffset() - readPositionOfVariantStart);

// pick which array's features we are accounting. If the ranges are 5, 10, 25, 50 and the distance is, say 8, then the '<= 10' range is relevant
final OptionalInt relevantRange = IntStream.range(0, RANGES.size()).filter(n -> distanceFromVariant <= RANGES.get(n)).findFirst();
if (relevantRange.isPresent()) {
final int[] featuresToAddTo = rangedFeatures.get(relevantRange.getAsInt());
if (pe.isAfterInsertion()) {
featuresToAddTo[0]++;
}

if (pe.isDeletion()) {
featuresToAddTo[1]++;
} else {
final byte base = pe.getBase();
final byte qual = pe.getQual();
final byte haplotypeBase = haplotypeBases[asm.getGenomeOffset() + readStartInHaplotype];

if (base != haplotypeBase) {
featuresToAddTo[2]++;
}

if (qual < VERY_BAD_QUAL_THRESHOLD) {
featuresToAddTo[3]++;
} else if (qual < BAD_QUAL_THRESHOLD) {
featuresToAddTo[4]++;
}
}
}
asm.stepForwardOnGenome();
}

for (final int[] featuresToAdd : rangedFeatures) {
for (final int val : featuresToAdd) {
result.add(val);
}
}
}
Utils.validate(result.size() == mutect3DatasetMode.getNumReadFeatures(), "Wrong number of features");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ public double getDefaultAlleleFrequency() {
public Mutect3DatasetMode mutect3DatasetMode = Mutect3DatasetMode.ILLUMINA;

public enum Mutect3DatasetMode {
ILLUMINA(11),
ULTIMA(11);
ILLUMINA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES),
ULTIMA(11 + FeaturizedReadSets.NUM_RANGED_FEATURES);

final private int numReadFeatures;

Expand All @@ -229,6 +229,10 @@ public int getNumReadFeatures() {
* VCF of known calls for a sample used for generating a Mutect3 training dataset. Unfiltered variants (PASS or empty FILTER field)
* contained in this VCF are considered good; other variants (i.e. filtered in this VCF or absent from it) are considered errors.
 * If this VCF is not given, the dataset is generated with a weak-labelling strategy based on allele fractions.
*
* Although the normal use of this input is in generating training data, it can also be used when generating test data
* for making Permutect calls. In this case, the test data is labeled with truth from the VCF, Permutect makes filtered calls as
* usual, and Permutect uses the labels to analyze the quality of its results.
*/
@Argument(fullName= MUTECT3_TRAINING_TRUTH_LONG_NAME, doc="VCF file of known variants for labeling Mutect3 training data", optional = true)
public FeatureInput<VariantContext> mutect3TrainingTruth;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
final int diff = altAlleleString.length() - refAllele.length();
final VariantType type = diff == 0 ? VariantType.SNV : ( diff > 0 ? VariantType.INSERTION : VariantType.DELETION);

if (trainingMode) {
if (trainingMode) { // training mode -- collecting tensors to train the Permutect artifact model
final ArrayBlockingQueue<Integer> unmatchedQueue = unmatchedArtifactAltCounts.get(type);
final boolean likelySeqError = tumorLods[n] < TLOD_THRESHOLD;

Expand Down Expand Up @@ -182,8 +182,13 @@ public void addData(final ReferenceContext ref, final VariantContext vc, Optiona
} else {
labels.add(Label.IGNORE);
}
} else {
labels.add(Label.UNLABELED);
} else { // not training mode -- we are generating tensors in order to apply the Permutect artifact model to a callset
if (truthVCs.isPresent()) {
// here, for the purposes of test data, both sequencing errors and technical artifacts get the "ARTIFACT" label
labels.add(truthAlleles.contains(remappedAltAlelle) ? Label.VARIANT : Label.ARTIFACT);
} else {
labels.add(Label.UNLABELED);
}
}
}

Expand Down

0 comments on commit 4ed93fe

Please sign in to comment.