Skip to content

Commit

Permalink
AddFlowBaseQuality tool (#8235)
Browse files Browse the repository at this point in the history
* AddFlowBaseQuality tool

---------

Co-authored-by: Ilya Soifer <[email protected]>
  • Loading branch information
dror27 and ilyasoifer committed May 2, 2023
1 parent dd35fa9 commit d2783f7
Show file tree
Hide file tree
Showing 5 changed files with 440 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,337 @@
package org.broadinstitute.hellbender.tools.walkers.groundtruth;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.FlowBasedProgramGroup;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;

import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

/**
* Convert quality string that reports probability of indel errors (in flow chemistry) to the quality string that approximates probability of mismatch error.
*
* <p>
* </p>
*
* <p>
* The reference is strictly required when handling CRAM files.
* </p>
*
* <h3> Input </h3>
* <ul>
* <li> Coordinate-sorted and indexed SAM/BAM/CRAM </li>
* </ul>
*
* <h3> Output </h3>
* <ul>
* <li> Coordinate-sorted and indexed SAM/BAM/CRAM </li>
* </ul>
*
* {@GATK.walkertype ReadWalker}
*/
@CommandLineProgramProperties(
summary = "Prints reads from the input SAM/BAM/CRAM file to the SAM/BAM/CRAM file while adding a base quality attribute.",
oneLineSummary = "Add base quality attribute to reads in in the SAM/BAM/CRAM file",
programGroup = FlowBasedProgramGroup.class
)
@ExperimentalFeature
@WorkflowProperties
public final class AddFlowBaseQuality extends ReadWalker {

public static final String MINIMAL_ERROR_RATE_LONG_NAME = "minimal-error-rate";
public static final String MAXIMAL_QUALITY_SCORE_LONG_NAME = "maximal-quality-score";
public static final String REPLACE_QUALITY_MODE_LONG_NAME = "replace-quality-mode";
public static final String BASE_QUALITY_ATTRIBUTE_NAME = "XQ";
public static final String OLD_QUALITY_ATTRIBUTE_NAME = "OQ";
public static final char PHRED_ASCII_BASE = '!';

public static final int ERROR_PROB_BAND_1LESS = 0;
public static final int ERROR_PROB_BAND_KEY = 1;
public static final int ERROR_PROB_BAND_1MORE = 2;
public static final int ERROR_PROB_BANDS = 3;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="Write output to this file")
@WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
public GATKPath output;
private SAMFileGATKReadWriter outputWriter;

@Argument(fullName = MINIMAL_ERROR_RATE_LONG_NAME, doc = "lower floor for flow error rate values")
public double minErrorRate = 1e-3;

@Argument(fullName = MAXIMAL_QUALITY_SCORE_LONG_NAME, doc = "clip quality score to the given value)")
public int maxQualityScore = 93;

@Argument(fullName = REPLACE_QUALITY_MODE_LONG_NAME, doc = "replace existing base qualities while saving previous qualities to OQ (when true) or simply write to BQ (when false) ")
public boolean replaceQualityMode = false;

@ArgumentCollection
public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();

@Override
public void onTraversalStart() {
outputWriter = createSAMWriter(output, true);

// do not trim the boundary homopolymers as the default
fbargs.keepBoundaryFlows = true;
}

@Override
public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
outputWriter.addRead(addBaseQuality(read));
}

@Override
public void closeTool() {
if ( outputWriter != null ) {
outputWriter.close();
}
}

/**
* this is the actual 'worker' of the tool
*/
private GATKRead addBaseQuality(final GATKRead read) {

// convert to a flow base read
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
final int flowOrderLength = calcFlowOrderLength(rgInfo.flowOrder);

// generate base quality
final double[] baseErrorProb = generateBaseErrorProbability(fbRead, flowOrderLength);
final byte[] phred = convertErrorProbToPhred(baseErrorProb);

// install in read
if ( !replaceQualityMode ) {
read.setAttribute(BASE_QUALITY_ATTRIBUTE_NAME, convertPhredToString(phred));
} else {
read.setAttribute(OLD_QUALITY_ATTRIBUTE_NAME, convertPhredToString(read.getBaseQualitiesNoCopy()));
read.setBaseQualities(phred);
}

// return reused read
return read;
}

private byte[] convertErrorProbToPhred(double[] errorProb) {

final byte[] phred = new byte[errorProb.length];
for ( int i = 0 ; i < errorProb.length ; i++ ) {
if ( errorProb[i] == 0 ) {
phred[i] = (byte)(maxQualityScore);
} else {
phred[i] = (byte)Math.min((maxQualityScore), (int) (-10 * Math.log10(errorProb[i])));
}
}
return phred;
}

private String convertPhredToString(final byte[] phred) {

final byte[] s = new byte[phred.length];
for ( int i = 0 ; i < phred.length ; i++ ) {
s[i] = (byte)(phred[i] + PHRED_ASCII_BASE);
}
return new String(s);
}

private int calcFlowOrderLength(String flowOrder) {

final int i = flowOrder.indexOf(flowOrder.charAt(0), 1);

return (i < 0) ? flowOrder.length() : i;
}

private double[] generateBaseErrorProbability(final FlowBasedRead fbRead, final int flowOrderLength) {

/**
* access key and error probabilities
* for a description of the flow probabilities see {@link FlowBasedRead#flowMatrix}
*/
final int[] key = fbRead.getKey();
final double[][] errorProbBands = extractErrorProbBands(fbRead, minErrorRate);
final double[] result = new double[fbRead.getBasesNoCopy().length];

// loop over hmers via flow key
int base = 0;
for ( int flow = 0 ; flow < key.length ; flow++ ) {
if ( key[flow] != 0 ) {

// establish hmer quality
final int hmerLength = key[flow];
final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength);

// fill in

// first base of the read gets the original error probability of the hmer, otherwise use computed error probability
if ( base == 0 ) {
result[base++] = errorProbBands[ERROR_PROB_BAND_KEY][flow];
} else {
result[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer
}

// for hmers longer than 1
if ( hmerLength > 1 ) {

// skip all but last (leave with zero error probability)
base += (hmerLength - 2);

// fill last base from computed error probability
result[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1
}

// override result for the last base with the orignal hmer error probability
if ( base == result.length ) {
result[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow];
}
}
}

return result;
}

// extract error probability bands. middle (1) band is the key prob.
// lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths
private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) {

// access key
final int[] key = flowRead.getKey();

// allocate result
double[][] result = new double[ERROR_PROB_BANDS][];
for ( int i = 0 ; i < result.length ; i++ ) {
result[i] = new double[key.length];
}

for ( int i = 0 ; i < key.length ; i++ ) {

// extract key probability
result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue);

// -1
if ( key[i] > 0 ) {
result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue);
} else {
result[ERROR_PROB_BAND_1LESS][i] = minValue;
}

// +1
if ( key[i] < flowRead.getMaxHmer() ) {
result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue);
} else {
result[ERROR_PROB_BAND_1MORE][i] = minValue;
}
}

return result;
}

@VisibleForTesting
protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) {

// result is left/right error probabilities
final double[] errorProbs = new double[2];
final int hmerLength = key[flow];

errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength);
if ( hmerLength != 1 ) {
errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength);
}

return errorProbs;
}

private static double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, final int flowOrderLength) {

// create a key slice of the area around the flow/hmer.
final int minIndex = Math.max(flow - flowOrderLength + 1, 0);
final int maxIndex = Math.min(flow + flowOrderLength - 1, key.length - 1);
final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1);
final int hmerLength = key[flow];

// walk the flows towards the side until (and including) the first non-zero key
// on hmers of length 1 we walk both sides
final List<int[]> slices = new LinkedList<>();
final int[] incrs = (hmerLength != 1)
? new int[] { sideIncr }
: new int[] { sideIncr, -sideIncr};
for (int incr : incrs) {
for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) {

// create a alternative key slice by incrementing sideFlow and decrementing flow
final int[] altSlice = Arrays.copyOf(slice, slice.length);
altSlice[sideFlow - minIndex] += 1;
altSlice[flow - minIndex] -= 1;
if (sliceIsValid(altSlice, flowOrderLength)) {
slices.add(altSlice);
}

// is the sideFlow non-zero? if so, break
if (key[sideFlow] != 0) {
break;
}
}
}

// at this point, we have a list of valid slices. figure out the error probability for each of them
// and compute the base quality
final double keyP = sliceProb(slice, minIndex, key, errorProbBands);
double sumP = keyP;
for ( int[] s : slices ) {
sumP += sliceProb(s, minIndex, key, errorProbBands);
}
final double ep = 1 - (keyP / sumP);

return ep;
}

// compute probability for a slice
private static double sliceProb(int[] slice, int minIndex, int[] key, double[][] errorProbBands) {

double p = 1.0;
for ( int i = 0 ; i < slice.length ; i++ ) {
final int band;
if ( slice[i] < key[i + minIndex] ) {
band = ERROR_PROB_BAND_1LESS;
} else if ( slice[i] > key[i + minIndex] ) {
band = ERROR_PROB_BAND_1MORE;
} else {
band = ERROR_PROB_BAND_KEY;
}
p *= errorProbBands[band][i + minIndex];
}
return p;
}

private static boolean sliceIsValid(final int[] slice, final int flowOrderLength) {

// look for strings of consecutive zeros in length of flowOrderLength - 1
int consecutiveZeros = 0;
for ( int key : slice ) {
if ( key != 0 ) {
consecutiveZeros = 0;
} else {
consecutiveZeros++;
if ( consecutiveZeros >= (flowOrderLength - 1) ) {
return false;
}
}
}

// if here, not found -> valid
return true;
}
}
Loading

0 comments on commit d2783f7

Please sign in to comment.