Skip to content

Commit

Permalink
[BIOIN-1570] Fixed edge case in variant annotation (#8810)
Browse files Browse the repository at this point in the history
* [BIOIN-1570] Fixed edge case in variant annotation when the variant is close to the edge of the reference
  • Loading branch information
ilyasoifer committed Jun 3, 2024
1 parent 4ed93fe commit d4744f7
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
* are accumulated as well.
*/
public abstract class FlowAnnotatorBase implements InfoFieldAnnotation {
private final static Logger logger = LogManager.getLogger(FlowAnnotatorBase.class);
protected final OneShotLogger flowMissingOneShotLogger = new OneShotLogger(FlowAnnotatorBase.class);


Expand Down Expand Up @@ -202,7 +201,7 @@ protected void variantType(final VariantContext vc, final LocalContext localCont
if (isSpecial(alleles.get(i))){
continue;
}
if ((localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
if ((localContext.hmerIndelLength==null) || (localContext.hmerIndelLength.get(i)==null) || (localContext.hmerIndelLength.get(i)==0)){
isHmer=false;
}
}
Expand Down Expand Up @@ -250,7 +249,12 @@ protected void isHmerIndel(final VariantContext vc, final LocalContext localCont
// get byte before and after
final byte before = getReferenceNucleotide(localContext, vc.getStart() - 1);
final byte[] after = getReferenceHmerPlus(localContext, vc.getEnd() + 1, MOTIF_SIZE);

if (after.length==0){
flowMissingOneShotLogger.warn("Failed to get hmer from reference, probably because the " +
"variant is very close to the end of the chromosome, isHmerIndel and RightMotif annotations will not be calculated. " +
"Variant position: " + vc.getContig() + ":" + vc.getEnd() + 1);
return;
}
// build two haplotypes. add byte before and after
final byte[] refHap = buildHaplotype(before, ref.getBases(), after);
final byte[] altHap = buildHaplotype(before, alt.getBases(), after);
Expand Down Expand Up @@ -338,6 +342,12 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
}

String motif = getRefMotif(localContext, vc.getStart() - MOTIF_SIZE, MOTIF_SIZE);
if (motif.length() != MOTIF_SIZE){
flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because the variant is very close to the " +
"start of the chromosome. LeftMotif annotation will not be calculated. " +
"Variant position: " + vc.getContig() + ":" + vc.getStart());
return;
}
if ( a.length() != refLength ) {
motif = motif.substring(1) + vc.getReference().getBaseString().substring(0, 1);
}
Expand All @@ -350,8 +360,13 @@ protected void getLeftMotif(final VariantContext vc, final LocalContext localCon
protected void getRightMotif(final VariantContext vc, final LocalContext localContext) {

final int refLength = vc.getReference().length();
final String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);

String motif = getRefMotif(localContext, vc.getStart() + refLength, MOTIF_SIZE);
if (motif.length() != MOTIF_SIZE){
flowMissingOneShotLogger.warn("Failed to get motif from reference, probably because " +
"the variant is close to the end of the chromosome. RightMotif annotation will not be calculated. " +
"Variant position: " + vc.getContig() + ":" + vc.getStart());
return;
}
// fill empty entries (non indel alelles)
for ( int i = 0 ; i < localContext.rightMotif.size() ; i++ ) {
if ( localContext.rightMotif.get(i) == null ) {
Expand All @@ -366,6 +381,11 @@ protected void gcContent(final VariantContext vc, final LocalContext localContex

final int begin = vc.getStart() - (GC_CONTENT_SIZE / 2);
final String seq = getRefMotif(localContext, begin + 1, GC_CONTENT_SIZE);
if ( seq.length() != GC_CONTENT_SIZE ) {
flowMissingOneShotLogger.warn("gcContent will not be calculated at position " + vc.getContig() + ":" + vc.getStart() +
" because the variant is too close to the edge of the chromosome");
return;
}
int gcCount = 0;
for ( byte b : seq.getBytes() ) {
if ( b == 'G' || b == 'C' ) {
Expand Down Expand Up @@ -424,19 +444,25 @@ protected void cycleSkip(final VariantContext vc, final LocalContext localContex
localContext.attributes.put(GATKVCFConstants.FLOW_CYCLESKIP_STATUS, css);
}

// get a single nucleoid from reference
// get a single nucleotid from reference
private byte getReferenceNucleotide(final LocalContext localContext, final int start) {
final int index = start - localContext.ref.getWindow().getStart();
final byte[] bases = localContext.ref.getBases();
Utils.validIndex(index, bases.length);
Utils.validIndex(index, bases.length); // do not catch, if here the location is outside of the reference, there is a problem!
return bases[index];
}

// get an hmer from reference plus a number of additional bases
private byte[] getReferenceHmerPlus(final LocalContext localContext, final int start, final int additional) {
int index = start - localContext.ref.getWindow().getStart();
final byte[] bases = localContext.ref.getBases();
Utils.validIndex(index, bases.length);
try {
Utils.validIndex(index, bases.length);
} catch (IllegalArgumentException e) {
flowMissingOneShotLogger.warn("Failed to get hmer from reference, too close to the edge. " +
"Position: " + localContext.ref.getContig() + ":" + index);
return new byte[0];
}

// get hmer
final StringBuilder sb = new StringBuilder();
Expand All @@ -458,8 +484,12 @@ private String getRefMotif(final LocalContext localContext, final int start, fin
final byte[] bases = localContext.ref.getBases();
final int startIndex = start - localContext.ref.getWindow().getStart();
final int endIndex = startIndex + length;
Utils.validIndex(startIndex, bases.length);
Utils.validIndex(endIndex-1, bases.length);
try {
Utils.validIndex(startIndex, bases.length);
Utils.validIndex(endIndex-1, bases.length);
} catch (IllegalArgumentException e) {
return "";
}
return new String(Arrays.copyOfRange(bases, startIndex, endIndex));
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,28 @@ public Object[][] getTestData() {
// ins hmer indel
"TATCT C ATTGACCAA", "CA",
"ins", "1", "2", "A", "ATCTC", "TTGAC", "0.3", "NA", "h-indel"
},
//The following tests test the edge case of variant so close to the edge of the chromosome
//that some of the annotations can't be calculated
{
// Close to 5' edge of the chromosome, expect not to crash on finding the left motif
"ATCT C ATTGACCAA", "CA",
"ins", "1", "2", "A", null, "TTGAC", "0.3", "NA", "h-indel"
},
{
// close to 3' edge of the chromosome, should not find left motif or GC content
"TATCT C ATT", "CA",
"ins", "1", "2", "A", "ATCTC", null, null, "NA", "h-indel"
},
{
// very close to 3' edge of the chromosome, should not find left motif, GC content or if it is
// an hmer indel
"TATCT C ", "CA",
"ins", "1", null, null, "ATCTC", null, null, "NA", "non-h-indel"
}



};

return testData;
Expand Down

0 comments on commit d4744f7

Please sign in to comment.