Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AddFlowBaseQuality tool #8235

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,334 @@
package org.broadinstitute.hellbender.tools.walkers.groundtruth;

import com.google.common.annotations.VisibleForTesting;
import org.broadinstitute.barclay.argparser.*;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.engine.FeatureContext;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.engine.ReadWalker;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;
import org.broadinstitute.hellbender.utils.read.FlowBasedReadUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter;
import picard.cmdline.programgroups.ReadDataManipulationProgramGroup;

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;

/**
* Write reads from SAM format file (SAM/BAM/CRAM) that pass criteria to a new file while adding a base-quality attribute
dror27 marked this conversation as resolved.
Show resolved Hide resolved
*
dror27 marked this conversation as resolved.
Show resolved Hide resolved
* <p>
* </p>
*
* <p>
* The reference is strictly required when handling CRAM files.
* </p>
*
* <h3> Input </h3>
* <ul>
* <li> Coordinate-sorted and indexed SAM/BAM/CRAM </li>
* </ul>
*
* <h3> Output </h3>
* <ul>
* <li> Coordinate-sorted and indexed SAM/BAM/CRAM </li>
* </ul>
*
* {@GATK.walkertype ReadWalker}
*/
@CommandLineProgramProperties(
summary = "Prints reads from the input SAM/BAM/CRAM file to the SAM/BAM/CRAM file while adding a base quality attribute.",
oneLineSummary = "Add base quality attribute to reads in in the SAM/BAM/CRAM file",
programGroup = ReadDataManipulationProgramGroup.class
)
@ExperimentalFeature
@WorkflowProperties
public final class AddFlowBaseQuality extends ReadWalker {

public static final String MINIMAL_ERROR_RATE_LONG_NAME = "minimal-error-rate";
public static final String MAXIMAL_QUALITY_SCORE_LONG_NAME = "maximal-quality-score";
public static final String REPLACE_QUALITY_MODE_LONG_NAME = "replace-quality-mode";
public static final String BASE_QUALITY_ATTRIBUTE_NAME = "BQ";
public static final String OLD_QUALITY_ATTRIBUTE_NAME = "OQ";
public static final char PHRED_ASCII_BASE = '!';

public static final int ERROR_PROB_BAND_1LESS = 0;
public static final int ERROR_PROB_BAND_KEY = 1;
public static final int ERROR_PROB_BAND_1MORE = 2;
public static final int ERROR_PROB_BANDS = 3;

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME,
shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME,
doc="Write output to this file")
@WorkflowOutput(optionalCompanions={StandardArgumentDefinitions.OUTPUT_INDEX_COMPANION})
public GATKPath output;
private SAMFileGATKReadWriter outputWriter;

@Argument(fullName = MINIMAL_ERROR_RATE_LONG_NAME)
dror27 marked this conversation as resolved.
Show resolved Hide resolved
public double minErrorRate = 1e-3;

@Argument(fullName = MAXIMAL_QUALITY_SCORE_LONG_NAME)
public int maxQualityScore = 126;

@Argument(fullName = REPLACE_QUALITY_MODE_LONG_NAME)
public boolean replaceQualityMode = false;

@ArgumentCollection
public FlowBasedArgumentCollection fbargs = new FlowBasedArgumentCollection();

@Override
public void onTraversalStart() {
outputWriter = createSAMWriter(output, true);

// work with original probabilities
dror27 marked this conversation as resolved.
Show resolved Hide resolved
fbargs.keepBoundaryFlows = true;
}

@Override
public void apply(final GATKRead read, final ReferenceContext referenceContext, final FeatureContext featureContext) {
outputWriter.addRead(addBaseQuality(read));
}

@Override
public void closeTool() {
if ( outputWriter != null ) {
outputWriter.close();
}
}

/**
* this is the actual 'worker' of the tool
*/
private GATKRead addBaseQuality(final GATKRead read) {

// convert to a flow base read
final FlowBasedReadUtils.ReadGroupInfo rgInfo = FlowBasedReadUtils.getReadGroupInfo(getHeaderForReads(), read);
final FlowBasedRead fbRead = new FlowBasedRead(read, rgInfo.flowOrder, rgInfo.maxClass, fbargs);
final int flowOrderLength = calcFlowOrderLength(rgInfo.flowOrder);

// generate base quality
final double[] baseErrorProb = generateBaseErrorProbability(fbRead, flowOrderLength);
final byte[] phred = convertErrorProbToPhred(baseErrorProb);

// install in read
if ( !replaceQualityMode ) {
read.setAttribute(BASE_QUALITY_ATTRIBUTE_NAME, convertPhredToString(phred));
} else {
read.setAttribute(OLD_QUALITY_ATTRIBUTE_NAME, convertPhredToString(read.getBaseQualitiesNoCopy()));
read.setBaseQualities(phred);
}

// return reused read
return read;
}

private byte[] convertErrorProbToPhred(double[] errorProb) {

final byte[] phred = new byte[errorProb.length];
for ( int i = 0 ; i < errorProb.length ; i++ ) {
if ( errorProb[i] == 0 ) {
phred[i] = (byte)(maxQualityScore - PHRED_ASCII_BASE);
} else {
phred[i] = (byte)Math.min((maxQualityScore - PHRED_ASCII_BASE), (int) (-10 * Math.log10(errorProb[i])));
}
}
return phred;
}

private String convertPhredToString(final byte[] phred) {

final byte[] s = new byte[phred.length];
for ( int i = 0 ; i < phred.length ; i++ ) {
s[i] = (byte)(phred[i] + PHRED_ASCII_BASE);
}
return new String(s);
}

private int calcFlowOrderLength(String flowOrder) {

final int i = flowOrder.indexOf(flowOrder.charAt(0), 1);

return (i < 0) ? flowOrder.length() : i;
}

private double[] generateBaseErrorProbability(final FlowBasedRead fbRead, final int flowOrderLength) {

// access key and error probabilities
final int[] key = fbRead.getKey();
final double[][] errorProbBands = extractErrorProbBands(fbRead, minErrorRate);
final double[] result = new double[fbRead.getBasesNoCopy().length];

// loop over hmers via flow key
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This section isn't clear to me. Can you add some documentation about the flow key? I think I'm just missing the structure of the flow error probabilities. You can either add some more comments here, or just point the reader to something if it already exists elsewhere.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added a javadoc link to FlowBasedRead#flowMatrix where the flow probabilities are described.

int base = 0;
for ( int flow = 0 ; flow < key.length ; flow++ ) {
if ( key[flow] != 0 ) {

// establish hmer quality
final int hmerLength = key[flow];
final double[] hmerBaseErrorProbs = generateHmerBaseErrorProbabilities(key, errorProbBands, flow, flowOrderLength);

// fill in

// first base of the read gets the original error probability of the hmer, otherwise use computed error probability
if ( base == 0 ) {
result[base++] = errorProbBands[ERROR_PROB_BAND_KEY][flow];
} else {
result[base++] = hmerBaseErrorProbs[0]; // first base, or only base in case of a single base hmer
}

// for hmers longer than 1
if ( hmerLength > 1 ) {

// skip all but last (leave with zero error probability)
base += (hmerLength - 2);

// fill last base from computed error probability
result[base++] = hmerBaseErrorProbs[1]; // last base, if hmer is longer than 1
}

// override result for the last base with the orignal hmer error probability
if ( base == result.length ) {
result[base - 1] = errorProbBands[ERROR_PROB_BAND_KEY][flow];
}
}
}

return result;
}

// extract error probability bands. middle (1) band is the key prob.
// lower (0) and high (2) are corresponding to -1 and +1 in hmer lengths
private static double[][] extractErrorProbBands(final FlowBasedRead flowRead, final double minValue) {

// access key
final int[] key = flowRead.getKey();

// allocate result
double[][] result = new double[ERROR_PROB_BANDS][];
for ( int i = 0 ; i < result.length ; i++ ) {
result[i] = new double[key.length];
}

for ( int i = 0 ; i < key.length ; i++ ) {

// extract key probability
result[ERROR_PROB_BAND_KEY][i] = Math.max(flowRead.getProb(i, key[i]), minValue);

// -1
if ( key[i] > 0 ) {
result[ERROR_PROB_BAND_1LESS][i] = Math.max(flowRead.getProb(i, key[i] - 1), minValue);
} else {
result[ERROR_PROB_BAND_1LESS][i] = minValue;
}

// +1
if ( key[i] < flowRead.getMaxHmer() ) {
result[ERROR_PROB_BAND_1MORE][i] = Math.max(flowRead.getProb(i, key[i] + 1), minValue);
} else {
result[ERROR_PROB_BAND_1MORE][i] = minValue;
}
}

return result;
}

@VisibleForTesting
protected static double[] generateHmerBaseErrorProbabilities(final int[] key, final double[][] errorProbBands, final int flow, final int flowOrderLength) {

// result is left/right error probabilities
final double[] errorProbs = new double[2];
final int hmerLength = key[flow];

errorProbs[0] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, -1, flowOrderLength);
if ( hmerLength != 1 ) {
errorProbs[1] = generateSidedHmerBaseErrorProbability(key, errorProbBands, flow, 1, flowOrderLength);
}

return errorProbs;
}

private static double generateSidedHmerBaseErrorProbability(final int[] key, final double[][] errorProbBands, final int flow, final int sideIncr, final int flowOrderLength) {

// create a key slice of the area around the flow/hmer.
final int minIndex = Math.max(flow - flowOrderLength + 1, 0);
final int maxIndex = Math.min(flow + flowOrderLength - 1, key.length - 1);
final int[] slice = Arrays.copyOfRange(key, minIndex, maxIndex + 1);
final int hmerLength = key[flow];

// walk the flows towards the side until (and including) the first non-zero key
// on hmers of length 1 we walk both sides
final List<int[]> slices = new LinkedList<>();
final int[] incrs = (hmerLength != 1)
? new int[] { sideIncr }
: new int[] { sideIncr, -sideIncr};
for (int incr : incrs) {
for (int sideFlow = flow + incr; sideFlow >= 0 && sideFlow < key.length; sideFlow += incr) {

// create a alternative key slice by incrementing sideFlow and decrementing flow
final int[] altSlice = Arrays.copyOf(slice, slice.length);
altSlice[sideFlow - minIndex] += 1;
altSlice[flow - minIndex] -= 1;
if (sliceIsValid(altSlice, flowOrderLength)) {
slices.add(altSlice);
}

// is the sideFlow non-zero? if so, break
if (key[sideFlow] != 0) {
break;
}
}
}

// at this point, we have a list of valid slices. figure out the error probability for each of them
// and compute the base quality
final double keyP = sliceProb(slice, minIndex, key, errorProbBands);
double sumP = keyP;
for ( int[] s : slices ) {
sumP += sliceProb(s, minIndex, key, errorProbBands);
}
final double ep = 1 - (keyP / sumP);

return ep;
}

// compute probability for a slice
private static double sliceProb(int[] slice, int minIndex, int[] key, double[][] errorProbBands) {

double p = 1.0;
for ( int i = 0 ; i < slice.length ; i++ ) {
final int band;
if ( slice[i] < key[i + minIndex] ) {
band = ERROR_PROB_BAND_1LESS;
} else if ( slice[i] > key[i + minIndex] ) {
band = ERROR_PROB_BAND_1MORE;
} else {
band = ERROR_PROB_BAND_KEY;
}
p *= errorProbBands[band][i + minIndex];
}
return p;
}

private static boolean sliceIsValid(final int[] slice, final int flowOrderLength) {

// look for strings of consecutive zeros in length of flowOrderLength - 1
int consecutiveZeros = 0;
for ( int key : slice ) {
if ( key != 0 ) {
consecutiveZeros = 0;
} else {
consecutiveZeros++;
if ( consecutiveZeros >= (flowOrderLength - 1) ) {
return false;
}
}
}

// if here, not found -> valid
return true;
}
}
Loading