Skip to content

Commit

Permalink
FlowFeatureMapper: added surrounding-median-quality-size feature (#8222)
Browse files Browse the repository at this point in the history
  • Loading branch information
dror27 committed Mar 7, 2023
1 parent 330c59a commit 4ba4ab5
Show file tree
Hide file tree
Showing 10 changed files with 198 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,10 @@ public final class FlowFeatureMapper extends ReadWalker {
private static final String VCF_LENGTH = "X_LENGTH";
private static final String VCF_EDIST = "X_EDIST";
private static final String VCF_INDEX = "X_INDEX";
private static final String VCF_SMQ_LEFT = "X_SMQ_LEFT";
private static final String VCF_SMQ_RIGHT = "X_SMQ_RIGHT";
private static final String VCF_SMQ_LEFT_MEAN = "X_SMQ_LEFT_MEAN";
private static final String VCF_SMQ_RIGHT_MEAN = "X_SMQ_RIGHT_MEAN";

private static final Double LOWEST_PROB = 0.0001;

Expand Down Expand Up @@ -166,6 +170,10 @@ protected static class MappedFeature implements Comparable<MappedFeature> {
int featuresOnRead;
int refEditDistance;
int index;
int smqLeft;
int smqRight;
int smqLeftMean;
int smqRightMean;

public MappedFeature(GATKRead read, FlowFeatureMapperArgumentCollection.MappingFeatureEnum type, byte[] readBases,
byte[] refBases, int readBasesOffset, int start, int offsetDelta) {
Expand Down Expand Up @@ -303,6 +311,10 @@ public VCFHeader makeVCFHeader(final SAMSequenceDictionary sequenceDictionary, f
headerInfo.add(new VCFInfoHeaderLine(VCF_LENGTH, 1, VCFHeaderLineType.Integer, "Read length"));
headerInfo.add(new VCFInfoHeaderLine(VCF_EDIST, 1, VCFHeaderLineType.Integer, "Read Levenshtein edit distance from reference"));
headerInfo.add(new VCFInfoHeaderLine(VCF_INDEX, 1, VCFHeaderLineType.Integer, "Ordinal index, from start of the read, where the feature was found"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the left of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT, 1, VCFHeaderLineType.Integer, "Ordinal Median quality of N bases to the right of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_LEFT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the left of the feature"));
headerInfo.add(new VCFInfoHeaderLine(VCF_SMQ_RIGHT_MEAN, 1, VCFHeaderLineType.Integer, "Ordinal Mean quality of N bases to the right of the feature"));
for ( String name : fmArgs.copyAttr ) {
headerInfo.add(new VCFInfoHeaderLine(fmArgs.copyAttrPrefix + name, 1, VCFHeaderLineType.String, "copy-attr: " + name));
}
Expand Down Expand Up @@ -602,6 +614,17 @@ private void emitFeature(final MappedFeature fr) {
vcb.attribute(VCF_LENGTH, fr.read.getLength());
vcb.attribute(VCF_EDIST, fr.refEditDistance);
vcb.attribute(VCF_INDEX, fr.index);

// median/mean quality on?
if ( fmArgs.surroundingMediaQualitySize != null ) {
vcb.attribute(VCF_SMQ_LEFT, fr.smqLeft);
vcb.attribute(VCF_SMQ_RIGHT, fr.smqRight);
}
if ( fmArgs.surroundingMeanQualitySize != null ) {
vcb.attribute(VCF_SMQ_LEFT_MEAN, fr.smqLeftMean);
vcb.attribute(VCF_SMQ_RIGHT_MEAN, fr.smqRightMean);
}

for ( String name : fmArgs.copyAttr ) {
if ( fr.read.hasAttribute(name) ) {
vcb.attribute(fmArgs.copyAttrPrefix + name, fr.read.getAttributeAsString(name));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,4 +102,18 @@ enum MappingFeatureEnum {
@Hidden
@Argument(fullName = "debug-read-name", doc = "debug specific reads?", optional = true)
public List<String> debugReadName = null;

/**
* surrounding media quality (size) - if not specified, this feature is off
**/
@Hidden
@Argument(fullName = "surrounding-median-quality-size", doc = "number of bases around the feature to calculate surrounding median quality", optional = true)
public Integer surroundingMediaQualitySize = null;

/**
* surrounding mean quality (size) - if not specified, this feature is off
**/
@Hidden
@Argument(fullName = "surrounding-mean-quality-size", doc = "number of bases around the feature to calculate surrounding mean quality", optional = true)
public Integer surroundingMeanQualitySize = null;
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package org.broadinstitute.hellbender.tools.walkers.featuremapping;

import htsjdk.samtools.CigarElement;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.broadinstitute.hellbender.engine.ReferenceContext;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.read.FlowBasedRead;

Expand All @@ -23,11 +25,15 @@ public class SNVMapper implements FeatureMapper {
final int identAfter;
final int minCigarElementLength;
final LevenshteinDistance levDistance = new LevenshteinDistance();
final Integer smqSize;
final Integer smqSizeMean;

public SNVMapper(FlowFeatureMapperArgumentCollection fmArgs) {
identBefore = fmArgs.snvIdenticalBases;
identAfter = (fmArgs.snvIdenticalBasesAfter != 0) ? fmArgs.snvIdenticalBasesAfter : identBefore;
minCigarElementLength = identBefore + 1 + identAfter;
smqSize = fmArgs.surroundingMediaQualitySize;
smqSizeMean = fmArgs.surroundingMeanQualitySize;

// adjust minimal read length
FlowBasedRead.setMinimalReadLength(1 + 1 + identAfter);
Expand Down Expand Up @@ -117,6 +123,46 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
feature.index = readOfs;
else
feature.index = hardLength - readOfs;

// reverse quals?
if ( smqSize != null || smqSizeMean != null ) {

// prepare qualities
final byte[] quals;
if (!read.isReverseStrand()) {
quals = read.getBaseQualitiesNoCopy();
} else {
quals = read.getBaseQualities();
ArrayUtils.reverse(quals);
}

// surrounding median quality?
if ( smqSize != null ) {
feature.smqLeft = calcSmq(quals, feature.index - 1 - smqSize, feature.index - 1, true);
feature.smqRight = calcSmq(quals, feature.index + 1, feature.index + 1 + smqSize, true);
if ( read.isReverseStrand() ) {

// left and right are reversed
int tmp = feature.smqLeft;
feature.smqLeft = feature.smqRight;
feature.smqRight = tmp;
}
}
if ( smqSizeMean != null ) {
feature.smqLeftMean = calcSmq(quals, feature.index - 1 - smqSizeMean, feature.index - 1, false);
feature.smqRightMean = calcSmq(quals, feature.index + 1, feature.index + 1 + smqSizeMean, false);
if ( read.isReverseStrand() ) {

// left and right are reversed
int tmp = feature.smqLeftMean;
feature.smqLeftMean = feature.smqRightMean;
feature.smqRightMean = tmp;
}
}

}


features.add(feature);
}
}
Expand All @@ -142,6 +188,40 @@ public void forEachOnRead(GATKRead read, ReferenceContext referenceContext, Cons
}
}

private int calcSmq(final byte[] quals, int from, int to, boolean median) {

// limit from/to
from = Math.max(0, Math.min(quals.length, from));
to = Math.max(0, Math.min(quals.length, to - 1));
if ( from > to ) {
throw new GATKException("invalid qualities range: from > to");
}

// calc median
byte[] range = Arrays.copyOfRange(quals, from, to + 1);
if ( range.length == 0 ) {
throw new GATKException("invalid qualities range: can't be empty");
}

if ( median ) {
Arrays.sort(range);
int midIndex = range.length / 2;
if ((range.length % 2) == 1) {
// odd
return range[midIndex];
} else {
// even
return (range[midIndex - 1] + range[midIndex]) / 2;
}
} else {
int sum = 0;
for ( int i = 0 ; i < range.length ; i++ ) {
sum += range[i];
}
return sum / range.length;
}
}

public boolean noFeatureButFilterAt(GATKRead read, ReferenceContext referenceContext, int start) {

// access bases
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,69 @@ public void testBasic() throws IOException {
}
}

@Test
public void testSurroundingMedianQuality() throws IOException {

final File outputDir = createTempDir("testFlowFeatureMapperTest");
final File expectedFile = new File(testDir + "/snv_feature_mapper_smq_output.vcf");
final File outputFile = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expectedFile : new File(outputDir + "/snv_feature_mapper_smq_output.vcf");

final String[] args = new String[] {
"-R", largeFileTestDir + "/Homo_sapiens_assembly38.fasta.gz",
"-O", outputFile.getAbsolutePath(),
"-I", testDir + "/snv_feature_mapper_input.bam",
"--copy-attr", "tr",
"--limit-score", "100",
"--min-score", "0",
"--snv-identical-bases", "10",
"--debug-negatives", "false",
"--debug-read-name", "150451-BC94-0645901755",
"--surrounding-median-quality-size", "1000" // use a high value to stress the code
};

// run the tool
runCommandLine(args); // no assert, just make sure we don't throw

// make sure we've generated the otuput file
Assert.assertTrue(outputFile.exists());

// walk the output and expected files, compare non-comment lines
if ( !UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) {
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile, "#");
}
}

@Test
public void testSurroundingAllQuality() throws IOException {

final File outputDir = createTempDir("testFlowFeatureMapperTest");
final File expectedFile = new File(testDir + "/snv_feature_mapper_smq_all_output.vcf");
final File outputFile = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expectedFile : new File(outputDir + "/snv_feature_mapper_smq_all_output.vcf");

final String[] args = new String[] {
"-R", largeFileTestDir + "/Homo_sapiens_assembly38.fasta.gz",
"-O", outputFile.getAbsolutePath(),
"-I", testDir + "/snv_feature_mapper_input.bam",
"--copy-attr", "tr",
"--limit-score", "100",
"--min-score", "0",
"--snv-identical-bases", "10",
"--debug-negatives", "false",
"--debug-read-name", "150451-BC94-0645901755",
"--surrounding-median-quality-size", "1000",
"--surrounding-mean-quality-size", "1000"
};

// run the tool
runCommandLine(args); // no assert, just make sure we don't throw

// make sure we've generated the otuput file
Assert.assertTrue(outputFile.exists());

// walk the output and expected files, compare non-comment lines
if ( !UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) {
IntegrationTestSpec.assertEqualTextFiles(outputFile, expectedFile, "#");
}
}

}
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown

0 comments on commit 4ba4ab5

Please sign in to comment.