From d2783f77a5a7c92e35644089e8982af77f2d819b Mon Sep 17 00:00:00 2001 From: Dror Kessler Date: Tue, 2 May 2023 10:09:40 +0200 Subject: [PATCH] AddFlowBaseQuality tool (#8235) * AddFlowBaseQuality tool --------- Co-authored-by: Ilya Soifer --- .../groundtruth/AddFlowBaseQuality.java | 337 ++++++++++++++++++ .../AddFlowBaseQualityIntegrationTest.java | 94 +++++ .../add_flow_base_quality_output.sam | 3 + .../large/groundTruth/gt_scorer_input.bam | 3 + .../large/groundTruth/gt_scorer_input.bam.bai | 3 + 5 files changed, 440 insertions(+) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQualityIntegrationTest.java create mode 100644 src/test/resources/large/groundTruth/add_flow_base_quality_output.sam create mode 100644 src/test/resources/large/groundTruth/gt_scorer_input.bam create mode 100644 src/test/resources/large/groundTruth/gt_scorer_input.bam.bai diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java new file mode 100644 index 00000000000..a0f804ffc11 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQuality.java @@ -0,0 +1,337 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import com.google.common.annotations.VisibleForTesting; +import org.broadinstitute.barclay.argparser.*; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup; +import org.broadinstitute.hellbender.engine.FeatureContext; +import org.broadinstitute.hellbender.engine.GATKPath; +import org.broadinstitute.hellbender.engine.ReadWalker; +import org.broadinstitute.hellbender.engine.ReferenceContext; +import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection; +import org.broadinstitute.hellbender.utils.read.FlowBasedRead; +import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * Convert quality string that reports probability of indel errors (in flow chemistry) to the quality string that approximates probability of mismatch error. + * + *

+ *

+ * + *

+ * The reference is strictly required when handling CRAM files. + *

+ * + *

Input

+ * + * + *

Output

+ * + * + * {@GATK.walkertype ReadWalker} + */ +@CommandLineProgramProperties( + summary = "Prints reads from the input SAM/BAM/CRAM file to the SAM/BAM/CRAM file while adding a base quality attribute.", + oneLineSummary = "Add base quality attribute to reads in in the SAM/BAM/CRAM file", + programGroup = FlowBasedProgramGroup.class +) +@ExperimentalFeature +@WorkflowProperties +public final class AddFlowBaseQuality extends ReadWalker { + + public static final String MINIMAL_ERROR_RATE_LONG_NAME = "minimal-error-rate"; + public static final String MAXIMAL_QUALITY_SCORE_LONG_NAME = "maximal-quality-score"; + public static final String REPLACE_QUALITY_MODE_LONG_NAME = "replace-quality-mode"; + public static final String BASE_QUALITY_ATTRIBUTE_NAME = "XQ"; + public static final String OLD_QUALITY_ATTRIBUTE_NAME = "OQ"; + public static final char PHRED_ASCII_BASE = '!'; + + public static final int ERROR_PROB_BAND_1LESS = 0; + public static final int ERROR_PROB_BAND_KEY = 1; + public static final int ERROR_PROB_BAND_1MORE = 2; + public static final int ERROR_PROB_BANDS = 3; + + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + doc="Write output to this file") + @WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION}) + public GATKPath output; + private SAMFileGATKReadWriter outputWriter; + + @Argument(fullName = MINIMAL_ERROR_RATE_LONG_NAME, doc = "lower floor for flow error rate values") + public double minErrorRate = 1e-3; + + @Argument(fullName = MAXIMAL_QUALITY_SCORE_LONG_NAME, doc = "clip quality score to the given value)") + public int maxQualityScore = 93; + + @Argument(fullName = REPLACE_QUALITY_MODE_LONG_NAME, doc = "replace existing base qualities while saving previous qualities to OQ (when true) or simply write to BQ (when false) ") + public boolean replaceQualityMode = false; + + @ArgumentCollection + public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection(); + + @Override + public void onTraversalStart() { + outputWriter = createSAMWriter(output, true); + + // do not trim the boundary homopolymers as the default + fbargs.keepBoundaryFlows = true; + } + + @Override + public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) { + outputWriter.addRead(addBaseQuality(read)); + } + + @Override + public void closeTool() { + if ( outputWriter != null ) { + outputWriter.close(); + } + } + + /** + * this is the actual 'worker' of the tool + */ + private GATKRead addBaseQuality(final GATKRead read) { + + // convert to a flow base read + final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read); + final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs); + final int flowOrderLength = calcFlowOrderLength(rgInfo.flowOrder); + + // generate base quality + final double[] baseErrorProb = generateBaseErrorProbability(fbRead, flowOrderLength); + final byte[] phred = convertErrorProbToPhred(baseErrorProb); + + // install in read + if ( !replaceQualityMode ) { + read.setAttribute(BASE_QUALITY_ATTRIBUTE_NAME, convertPhredToString(phred)); + } else { + read.setAttribute(OLD_QUALITY_ATTRIBUTE_NAME, convertPhredToString(read.getBaseQualitiesNoCopy())); + read.setBaseQualities(phred); + } + + // return reused read + return read; + } + + private byte[] convertErrorProbToPhred(double[] errorProb) { + + final byte[] phred = new byte[errorProb.length]; + for ( int i = 0 ; i < errorProb.length ; i++ ) { + if ( errorProb[i] == 0 ) { + phred[i] = (byte)(maxQualityScore); + } else { + phred[i] = (byte)Math.min((maxQualityScore), (int) (-10 * Math.log10(errorProb[i]))); + } + } + return phred; + } + + private String convertPhredToString(final byte[] phred) { + + final byte[] s = new byte[phred.length]; + for ( int i = 0 ; i < phred.length ; i++ ) { + s[i] = (byte)(phred[i] + PHRED_ASCII_BASE); + } + return new String(s); + } + + private int calcFlowOrderLength(String flowOrder) { + + final int i = flowOrder.indexOf(flowOrder.charAt(0), 1); + + return (i < 0) ? flowOrder.length() : i; + } + + private double[] generateBaseErrorProbability(final FlowBasedRead fbRead, final int flowOrderLength) { + + /** + * access key and error probabilities + * for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix} + */ + final int[] key = fbRead.getKey(); + final double[][] errorProbBands = extractErrorProbBands(fbRead, minErrorRate); + final double[] result = new double[fbRead.getBasesNoCopy().length]; + + // loop over hmers via flow key + int base = 0; + for ( int flow = 0 ; flow < key.length ; flow++ ) { + if ( key[flow] != 0 ) { + + // establish hmer quality + final int hmerLength = key[flow]; + final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength); + + // fill in + + // first base of the read gets the original error probability of the hmer, otherwise use computed error probability + if ( base == 0 ) { + result[base++] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; + } else { + result[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer + } + + // for hmers longer than 1 + if ( hmerLength > 1 ) { + + // skip all but last (leave with zero error probability) + base += (hmerLength - 2); + + // fill last base from computed error probability + result[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1 + } + + // override result for the last base with the orignal hmer error probability + if ( base == result.length ) { + result[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow]; + } + } + } + + return result; + } + + // extract error probability bands. middle (1) band is the key prob. + // lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths + private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) { + + // access key + final int[] key = flowRead.getKey(); + + // allocate result + double[][] result = new double[ERROR_PROB_BANDS][]; + for ( int i = 0 ; i < result.length ; i++ ) { + result[i] = new double[key.length]; + } + + for ( int i = 0 ; i < key.length ; i++ ) { + + // extract key probability + result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue); + + // -1 + if ( key[i] > 0 ) { + result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue); + } else { + result[ERROR_PROB_BAND_1LESS][i] = minValue; + } + + // +1 + if ( key[i] < flowRead.getMaxHmer() ) { + result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue); + } else { + result[ERROR_PROB_BAND_1MORE][i] = minValue; + } + } + + return result; + } + + @VisibleForTesting + protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) { + + // result is left/right error probabilities + final double[] errorProbs = new double[2]; + final int hmerLength = key[flow]; + + errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength); + if ( hmerLength != 1 ) { + errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength); + } + + return errorProbs; + } + + private static double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, final int flowOrderLength) { + + // create a key slice of the area around the flow/hmer. + final int minIndex = Math.max(flow - flowOrderLength + 1, 0); + final int maxIndex = Math.min(flow + flowOrderLength - 1, key.length - 1); + final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1); + final int hmerLength = key[flow]; + + // walk the flows towards the side until (and including) the first non-zero key + // on hmers of length 1 we walk both sides + final List slices = new LinkedList<>(); + final int[] incrs = (hmerLength != 1) + ? new int[] { sideIncr } + : new int[] { sideIncr, -sideIncr}; + for (int incr : incrs) { + for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) { + + // create a alternative key slice by incrementing sideFlow and decrementing flow + final int[] altSlice = Arrays.copyOf(slice, slice.length); + altSlice[sideFlow - minIndex] += 1; + altSlice[flow - minIndex] -= 1; + if (sliceIsValid(altSlice, flowOrderLength)) { + slices.add(altSlice); + } + + // is the sideFlow non-zero? if so, break + if (key[sideFlow] != 0) { + break; + } + } + } + + // at this point, we have a list of valid slices. figure out the error probability for each of them + // and compute the base quality + final double keyP = sliceProb(slice, minIndex, key, errorProbBands); + double sumP = keyP; + for ( int[] s : slices ) { + sumP += sliceProb(s, minIndex, key, errorProbBands); + } + final double ep = 1 - (keyP / sumP); + + return ep; + } + + // compute probability for a slice + private static double sliceProb(int[] slice, int minIndex, int[] key, double[][] errorProbBands) { + + double p = 1.0; + for ( int i = 0 ; i < slice.length ; i++ ) { + final int band; + if ( slice[i] < key[i + minIndex] ) { + band = ERROR_PROB_BAND_1LESS; + } else if ( slice[i] > key[i + minIndex] ) { + band = ERROR_PROB_BAND_1MORE; + } else { + band = ERROR_PROB_BAND_KEY; + } + p *= errorProbBands[band][i + minIndex]; + } + return p; + } + + private static boolean sliceIsValid(final int[] slice, final int flowOrderLength) { + + // look for strings of consecutive zeros in length of flowOrderLength - 1 + int consecutiveZeros = 0; + for ( int key : slice ) { + if ( key != 0 ) { + consecutiveZeros = 0; + } else { + consecutiveZeros++; + if ( consecutiveZeros >= (flowOrderLength - 1) ) { + return false; + } + } + } + + // if here, not found -> valid + return true; + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQualityIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQualityIntegrationTest.java new file mode 100644 index 00000000000..e2c706e7cd4 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/groundtruth/AddFlowBaseQualityIntegrationTest.java @@ -0,0 +1,94 @@ +package org.broadinstitute.hellbender.tools.walkers.groundtruth; + +import org.apache.commons.math3.util.Precision; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; +import org.broadinstitute.hellbender.tools.walkers.variantrecalling.FlowTestConstants; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; + +public class AddFlowBaseQualityIntegrationTest extends CommandLineProgramTest { + + public static final boolean UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS = false; + public static final String OUTPUT_FILENAME = "add_flow_base_quality_output.sam"; + + private static String testDir = publicTestDir + FlowTestConstants.GROUND_TRUTH_DATA_DIR; + + @Test + public void assertThatExpectedOutputUpdateToggleIsDisabled() { + Assert.assertFalse(UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS, "The toggle to update expected outputs should not be left enabled"); + } + + @Test + public void testBasic() throws IOException { + + final File outputDir = createTempDir("testGroundTruthTest"); + final File expectedFile = new File(testDir + "/" + OUTPUT_FILENAME); + final File outputFile = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expectedFile : new File(outputDir + "/" + OUTPUT_FILENAME); + + final String[] args = buildCommonArgs(outputFile); + + runCommandLine(args); // no assert, just make sure we don't throw + + // make sure we've generated the otuput file + Assert.assertTrue(outputFile.exists()); + + // walk the output and expected files, compare non-comment lines + if ( !UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) { + IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile, "@"); + } + } + + private String[] buildCommonArgs(final File outputFile) { + + return new String[] { + "-R", largeFileTestDir + "/Homo_sapiens_assembly38.fasta.gz", + "-I", testDir + "/gt_scorer_input.bam", + "--output", outputFile.getAbsolutePath(), + "--intervals", "chr9:71000-74000", + }; + } + + @DataProvider(name = "generateHmerBaseErrorProbabilities") + public Object[][] getGenerateHmerBaseErrorProbabilities() { + return new Object[][] { + { + new int[] { 1, 0, 0, 1, 0, 1, 1 }, // key + new double[][] { + { 0.1, 0.1, 0.01, 0.1, 0.001, 0.05, 0.1}, // 1 less + { 0.9, 0.9, 0.99, 0.9, 0.999, 0.95, 0.9}, // key + { 0.1, 0.1, 0.01, 0.1, 0.001, 0.05, 0.1} // 1 more + }, // error prob bands + 3, // flow + 4, // flowOrderLength + new double[] {0.00112, 0}, // result + 5 // result precision + }, + { + new int[] { 1, 0, 0, 2, 0, 1, 1 }, // key + new double[][] { + { 0.1, 0.1, 0.01, 0.1, 0.001, 0.05, 0.1}, // 1 less + { 0.9, 0.9, 0.99, 0.9, 0.999, 0.95, 0.9}, // key + { 0.1, 0.1, 0.01, 0.1, 0.001, 0.05, 0.1} // 1 more + }, // error prob bands + 3, // flow + 4, // flowOrderLength + new double[] {0.02516, 0.00592}, // result (errorProbs) + 5 // result precision + } + }; + } + + @Test(dataProvider = "generateHmerBaseErrorProbabilities") + public void testGenerateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength, final double[] result, final int resultPrecision) { + + final double[] errorProbs = AddFlowBaseQuality.generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength); + + Assert.assertEquals(Arrays.stream(errorProbs).map(v -> Precision.round(v, resultPrecision)).toArray(), result); + } +} diff --git a/src/test/resources/large/groundTruth/add_flow_base_quality_output.sam b/src/test/resources/large/groundTruth/add_flow_base_quality_output.sam new file mode 100644 index 00000000000..5f6781cdd68 --- /dev/null +++ b/src/test/resources/large/groundTruth/add_flow_base_quality_output.sam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1db2b9e54e01301829d21e65914d3382d5f56ce0009d39ccf74a38d7e94aa688 +size 722388 diff --git a/src/test/resources/large/groundTruth/gt_scorer_input.bam b/src/test/resources/large/groundTruth/gt_scorer_input.bam new file mode 100644 index 00000000000..8d430706033 --- /dev/null +++ b/src/test/resources/large/groundTruth/gt_scorer_input.bam @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abcd9c5112f07e70f4e11ea577c384f51185049100a4c7561441365ad9ccb33f +size 414812 diff --git a/src/test/resources/large/groundTruth/gt_scorer_input.bam.bai b/src/test/resources/large/groundTruth/gt_scorer_input.bam.bai new file mode 100644 index 00000000000..ea764df1d32 --- /dev/null +++ b/src/test/resources/large/groundTruth/gt_scorer_input.bam.bai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9fd2b8436bb7a787f99e768a4df68bbeca74f4526998b54ae0c0c9f7f5d5e76 +size 28112