Skip to content

Commit

Permalink
SNVQ recalibration tool added for flow based reads (#8697)
Browse files Browse the repository at this point in the history
Co-authored-by: Dror Kessler <[email protected]>
  • Loading branch information
ilyasoifer and Dror Kessler committed Apr 4, 2024
1 parent 724b5bc commit 6739e6d
Show file tree
Hide file tree
Showing 73 changed files with 273,250 additions and 272,313 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ public class FlowBasedArgumentCollection implements Serializable {
private static final long serialVersionUID = 0;

public static final String FLOW_USE_T0_TAG = "flow-use-t0-tag";
public static final String PROBABILITY_RATIO_THRESHOLD_LONG_NAME = "flow-probability-threshold";
public static final String REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME = "flow-remove-non-single-base-pair-indels";
public static final String REMOVE_ONE_TO_ZERO_PROBS_LONG_NAME = "flow-remove-one-zero-probs";
public static final String NUMBER_OF_POSSIBLE_PROBS_LONG_NAME = "flow-quantization-bins";
Expand All @@ -27,8 +26,7 @@ public class FlowBasedArgumentCollection implements Serializable {



private static final double DEFAULT_RATIO_THRESHOLD = 0.003;
private static final double DEFAULT_FILLING_VALUE = 0.001;
public static final double DEFAULT_FILLING_VALUE = 0.001;
private static final boolean DEFAULT_REMOVE_LONGER_INDELS = false;
private static final boolean DEFAULT_REMOVE_ONE_TO_ZERO = false;
private static final boolean DEFAULT_SYMMETRIC_INDELS = false;
Expand All @@ -45,10 +43,6 @@ public class FlowBasedArgumentCollection implements Serializable {
@Argument(fullName = FLOW_USE_T0_TAG, doc = "Use t0 tag if exists in the read to create flow matrix", optional = true)
public boolean useT0Tag = DEFAULT_FLOW_USE_T0_TAG;

@Advanced
@Argument(fullName = PROBABILITY_RATIO_THRESHOLD_LONG_NAME, doc = "Lowest probability ratio to be used as an option", optional = true)
public double probabilityRatioThreshold = DEFAULT_RATIO_THRESHOLD;

@Advanced
@Argument(fullName = REMOVE_LONGER_THAN_ONE_INDELS_LONG_NAME, doc = "Should the probabilities of more then 1 indel be used", optional = true)
public boolean removeLongerThanOneIndels = DEFAULT_REMOVE_LONGER_INDELS;
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package org.broadinstitute.hellbender.tools.walkers.featuremapping;

import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.Hidden;

import java.io.Serializable;
import java.util.List;

/**
 * Set of arguments for the {@link AddFlowSNVQuality} tool, which recalibrates
 * SNV qualities on flow-based reads.
 */
public class AddFlowSNVQualityArgumentCollection implements Serializable {
    private static final long serialVersionUID = 1L;

    // Long names of the command-line arguments exposed by this collection.
    public static final String MAX_PHRED_SCORE_FULL_NAME = "max-phred-score";
    public static final String KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME = "keep-supplementary-alignments";
    public static final String INCLUDE_QC_FAILED_READ_FULL_NAME = "include-qc-failed-read";
    public static final String SNVQ_MODE_FULL_NAME = "snvq-mode";
    public static final String OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME = "output-quality-attribute";
    public static final String DEBUG_READ_NAME_FULL_NAME = "debug-read-name";
    public static final String DEBUG_COLLECT_STATS_INTO_FULL_NAME = "debug-collect-stats-into";

    /**
     * Strategy used to combine per-base error probabilities into an SNV quality.
     */
    public enum SnvqModeEnum {
        Legacy,
        Optimistic,
        Pessimistic,
        Geometric
    }

    /**
     * Maximum value for delta in score. NaN (the default) means no limit is applied.
     **/
    @Argument(fullName = MAX_PHRED_SCORE_FULL_NAME, doc = "Limit value for phred scores", optional = true)
    public double maxPhredScore = Double.NaN;

    /**
     * Keep supplementary alignments?
     **/
    @Argument(fullName = KEEP_SUPPLEMENTARY_ALIGNMENTS_FULL_NAME, doc = "keep supplementary alignments ?", optional = true)
    public boolean keepSupplementaryAlignments = true;

    /**
     * Include reads flagged as QC-failed? (on by default)
     */
    @Advanced
    @Argument(fullName= INCLUDE_QC_FAILED_READ_FULL_NAME, doc = "include reads with QC failed flag", optional = true)
    public boolean includeQcFailedReads = true;

    /**
     * snvq computation mode
     */
    @Argument(fullName = SNVQ_MODE_FULL_NAME, doc = "snvq calculation mode.", optional = true)
    public SnvqModeEnum snvMode = SnvqModeEnum.Geometric;

    /**
     * By default this tool overwrites the QUAL field with the new qualities. Setting this argument saves the original qualities in the specified SAM tag.
     */
    @Argument(fullName = OUTPUT_QUALITY_ATTRIBUTE_FULL_NAME, doc = "alternate SAM tag to put original quality scores instead of overwriting the QUAL field. If not used, QUAL will be overwritten.", optional = true)
    public String outputQualityAttribute = null;

    /**
     * Read names for which detailed debugging output should be emitted.
     **/
    @Hidden
    @Argument(fullName = DEBUG_READ_NAME_FULL_NAME, doc = "Read names of reads to output details of as part of debugging. ", optional = true)
    public List<String> debugReadName = null;

    /**
     * Optional path of a file into which read statistics are written (debugging aid).
     */
    @Advanced
    @Hidden
    @Argument(fullName= DEBUG_COLLECT_STATS_INTO_FULL_NAME, doc = "Statistics about the reads will be output to given filename.", optional = true)
    public String debugCollectStatsInto = null;
}
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,25 @@ private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, fi
return result;
}

/**
* The following functions estimate the error probability for an hmer. specifically two error
* probability values are generated: one for the first base of the hmer and another for the
* rest of its bases.
*
* The computation itself is performed in a subsequent function: generateSidedHmerBaseErrorProbability
* It iterates over the possible valid combinations of errors and sums them up.
*
* @param key - key (hmer length) in flow space
* @param errorProbBands - for each flow (position in the key) three error probabilities are provided:
* [0] - for the hmer being one base shorter
* [1] - for the hmer to be at its length
* [2] - for the hmer to be one base longer
* @param flow - the flow (index) for which to generate the probabilities (0 <= flow < key.length)
* @param flowOrderLength - the cycle length of the flow order (usually 4)
* @return an array of two probabilities:
* [0] - probability for the first base of the hmer
* [1] - probability for the rest of the bases of the hmer
*/
@VisibleForTesting
protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) {

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -858,9 +858,9 @@ else if ( hasQ || hasZ ) {
cols.put("ReadName", read.getName());

// haplotypes and reference scores
cols.put("PaternalHaplotypeScore", paternal.score);
cols.put("MaternalHaplotypeScore", maternal.score);
cols.put("RefHaplotypeScore", refScore);
cols.put("PaternalHaplotypeScore", String.format("%.6f", paternal.score));
cols.put("MaternalHaplotypeScore", String.format("%.6f", maternal.score));
cols.put("RefHaplotypeScore", String.format("%.6f", refScore));

// build haplotype keys
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ public class GroundTruthScorer extends ReadWalker {
private static final int BASE_VALUE_MAX = FlowBasedRead.DEFAULT_FLOW_ORDER.length() - 1;

private static final double NORMALIZED_SCORE_THRESHOLD_DEFAULT = -0.1;
private static final double DEFAULT_RATIO_THRESHOLD = 0.003;

/*
Private accumulator class for counting false/true observations (hence Boolean).
Expand Down Expand Up @@ -502,7 +503,7 @@ public void closeTool() {
// write reports
if ( reportFilePath != null ) {
final GATKReport report = new GATKReport(
BooleanAccumulator.newReportTable(qualReport, "qual", fbargs.probabilityRatioThreshold, omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", DEFAULT_RATIO_THRESHOLD, omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", omitZerosFromReport),
BooleanAccumulator.newReportTable(qualReport, "qual", "hmer", "deviation", "base", omitZerosFromReport),
PercentileReport.newReportTable(percentileReports, qualityPercentiles)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,51 @@
package org.broadinstitute.hellbender.tools.walkers.groundtruth;

import org.apache.commons.collections.map.LazySortedMap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;

public class SeriesStats {

private static final Logger logger = LogManager.getLogger(SeriesStats.class);

// local state
private double last = Double.NaN;
private int count = 0;
private double sum = 0;
private double min = Double.NaN;
private double max = Double.NaN;
private SortedMap<Double, AtomicInteger> bins = new TreeMap<>();
private int intCount = 0;
private Map<Double, SeriesStats> auxBins = new LinkedHashMap<>();

public void csvWrite(final String path) throws IOException {
logger.info("Writing SeriesStats " + toDigest() + " into " + path);
PrintWriter pw = new PrintWriter(path);
pw.println("value,count");
boolean intKeys = isIntKeys();
for (Map.Entry<Double, AtomicInteger> entry : bins.entrySet() ) {
if ( intKeys ) {
pw.println(String.format("%d,%d", entry.getKey().intValue(), entry.getValue().get()));
} else {
pw.println(String.format("%f,%d", entry.getKey(), entry.getValue().get()));
}
}
pw.close();
}

void add(double v) {
public void add(int v) {
add((double)v);
intCount++;
}

public void add(double v) {

// save in simple values
last = v;
Expand All @@ -31,10 +59,11 @@ void add(double v) {
count++;

// save in bins
if ( bins.containsKey(v) ) {
bins.get(v).incrementAndGet();
final Double key = v;
if ( bins.containsKey(key) ) {
bins.get(key).incrementAndGet();
} else {
bins.put(v, new AtomicInteger(1));
bins.put(key, new AtomicInteger(1));
}
}

Expand Down Expand Up @@ -109,4 +138,23 @@ public double getStd() {
return Math.sqrt(variance);
}

public Map<Double, AtomicInteger> getBins() {
return this.bins;
}

public Map<Double, SeriesStats> getAuxBins() {
return this.auxBins;
}

public String toDigest() {
if ( isIntKeys() ) {
return String.format("count=%d, min=%d, max=%d, median=%d, bin.count=%d", getCount(), (int)getMin(), (int)getMax(), (int)getMedian(), getBins().size());
} else {
return String.format("count=%d, min=%f, max=%f, median=%f, bin.count=%d", getCount(), getMin(), getMax(), getMedian(), getBins().size());
}
}

private boolean isIntKeys() {
return (count == intCount);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -133,12 +133,12 @@ public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<
@Override
public ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
final double catastrophicErrorRate = fbargs.fillingValue;
final double log10catastrophicErrorRate = Math.log10(fbargs.fillingValue);
final double largeEventErrorRate = Math.max(fbargs.fillingValue, 0.000001); // error rate for non-hmer/snv errors that are not seq. errors.
final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);
return read -> {
final double maxErrorsForRead = capLikelihoods ? Math.max(MAX_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * expectedErrorRate)) : Math.ceil(read.getLength() * expectedErrorRate);
final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * fbargs.fillingValue)) :
Math.ceil(read.getLength() * fbargs.fillingValue);
final double maxCatastrophicErrorsForRead = capLikelihoods ? Math.max(MAX_CATASTROPHIC_ERRORS_FOR_READ_CAP, Math.ceil(read.getLength() * largeEventErrorRate)) :
Math.ceil(read.getLength() * largeEventErrorRate);
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead * log10catastrophicErrorRate;
};
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,12 +132,13 @@ public AlleleLikelihoods<GATKRead, Haplotype> computeReadLikelihoods(final List<
@Override
public ToDoubleFunction<GATKRead> log10MinTrueLikelihood(final double expectedErrorRate, final boolean capLikelihoods) {
final double log10ErrorRate = Math.log10(expectedErrorRate);
final double catastrophicErrorRate = Math.log10(fbargs.fillingValue);
final double largeEventErrorRate = 0.001; // error rate for non-hmer/snv errors that are not seq. errors.
final double log10catastrophicErrorRate = Math.log10(largeEventErrorRate);

return read -> {
final double maxErrorsForRead = Math.max(3.0, Math.ceil(read.getLength() * expectedErrorRate));
final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * fbargs.fillingValue));
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*catastrophicErrorRate;
final double maxCatastrophicErrorsForRead = Math.max(2.0, Math.ceil(read.getLength() * largeEventErrorRate));
return maxErrorsForRead * log10ErrorRate + maxCatastrophicErrorsForRead*log10catastrophicErrorRate;
};
}

Expand Down
Loading

0 comments on commit 6739e6d

Please sign in to comment.