Skip to content

Commit

Permalink
Fixed an issue of data that contain tp tag sometimes recognized as fl…
Browse files Browse the repository at this point in the history
…ow (#8337)
  • Loading branch information
ilyasoifer committed May 26, 2023
1 parent e3f30ff commit 4456792
Show file tree
Hide file tree
Showing 14 changed files with 259 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ public static void finalizeRegion(final AssemblyRegion region,
if ( region.isFinalized() ) {
return;
}

final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;

final List<GATKRead> readsToUse = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ public enum NGSPlatform {
ION_TORRENT(SequencerFlowClass.FLOW, "IONTORRENT"),
CAPILLARY(SequencerFlowClass.OTHER, "CAPILLARY"),
HELICOS(SequencerFlowClass.OTHER, "HELICOS"),
ULTIMA(SequencerFlowClass.FLOW, "ULTIMA"),
UNKNOWN(SequencerFlowClass.OTHER, "UNKNOWN");


/**
* Array of the prefix names in a BAM file for each of the platforms.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ public FlowBasedRead(final GATKRead read, final String flowOrder, final int maxH
public FlowBasedRead(final SAMRecord samRecord, final String flowOrder, final int maxHmer, final FlowBasedArgumentCollection fbargs) {
super(samRecord);
Utils.nonNull(fbargs);
Utils.validate(FlowBasedReadUtils.isFlow(samRecord), "FlowBasedRead can only be used on flow reads. failing read: " + samRecord);
Utils.validate(FlowBasedReadUtils.hasFlowTags(samRecord), "FlowBasedRead can only be used on flow reads. failing read: " + samRecord);
this.fbargs = fbargs;
this.maxHmer = maxHmer;
this.samRecord = samRecord;
Expand Down Expand Up @@ -352,7 +352,7 @@ private void parseSingleHmer(final double[] probs, final byte[] tp, final int fl
if (flowMatrix[loc][flowIdx] == fbargs.fillingValue) {
flowMatrix[loc][flowIdx] = probs[i];
} else {
flowMatrix[loc][flowIdx] += probs[i];
flowMatrix[loc][flowIdx] += probs[i];
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,29 @@
package org.broadinstitute.hellbender.utils.read;

import gov.nih.nlm.ncbi.ngs.NGS;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.NGSPlatform;
import org.broadinstitute.hellbender.utils.SequencerFlowClass;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.LinkedHashMap;
import java.util.Map;


/**
* utility class for flow based read
* Utility class for working with flow-based reads
*
* The main member static class is {@code ReadGroupInfo} that contains methods that allow
* working with headers of flow based reads and extracting the flow order and the maximal hmer class called.
* It also contains methods to check how the read was clipped ({@code readEndMarkedUnclipped}, {@code readEndMarkedUncertain})
* Lastly, {@code FlowBasedReadUtils.isFlowPlatform} is **the** function to determine if the data are flow-based
*
*/
public class FlowBasedReadUtils {

Expand All @@ -26,17 +36,35 @@ public class FlowBasedReadUtils {
static public class ReadGroupInfo {
final public String flowOrder;
final public int maxClass;

final public boolean isFlowPlatform;
private String reversedFlowOrder = null;

public ReadGroupInfo(final SAMReadGroupRecord readGroup) {
if (readGroup.getPlatform()==null){
isFlowPlatform = false;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform())==NGSPlatform.UNKNOWN){
isFlowPlatform = false;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.LS454) {
//old Ultima data can have PLATFORM==LS454 and not have FO tag
isFlowPlatform = true;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.ULTIMA){
if (readGroup.getFlowOrder()!=null) {
isFlowPlatform = true;
} else {
throw new RuntimeException("Malformed Ultima read group identified, aborting: " + readGroup);
}
} else {
isFlowPlatform = false;
}

Utils.nonNull(readGroup);
this.flowOrder = readGroup.getFlowOrder();
Utils.nonNull(this.flowOrder);

String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG);
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc);
if (isFlowPlatform) {
this.flowOrder = readGroup.getFlowOrder();
String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG);
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc);
} else { // not a flow platform
this.flowOrder = null;
this.maxClass = 0;
}
}

public synchronized String getReversedFlowOrder() {
Expand All @@ -45,6 +73,7 @@ public synchronized String getReversedFlowOrder() {
}
return reversedFlowOrder;
}

}

public static boolean readEndMarkedUncertain(final GATKRead rec) {
Expand Down Expand Up @@ -118,22 +147,36 @@ public static int flowSumOfBaseQualities(final GATKRead read) {
}
}

public static boolean isFlow(final GATKRead rec) {
public static boolean hasFlowTags(final GATKRead rec) {
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI);
}

public static boolean isFlow(final SAMRecord rec) {
public static boolean hasFlowTags(final SAMRecord rec) {
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI);

}

/**
*
* This is the function to run if you want to ask if the data are flow-based
*
* @param hdr - file header
* @param read - the read
* @return true if the read is flow-based
*/
public static boolean isFlowPlatform(final SAMFileHeader hdr, final GATKRead read) {
if (!hasFlowTags(read)){
return false;
}
return getReadGroupInfo(hdr, read).isFlowPlatform;
}
public static synchronized ReadGroupInfo getReadGroupInfo(final SAMFileHeader hdr, final GATKRead read) {

if ( !isFlow(read) ) {
if ( !hasFlowTags(read) ) {
throw new IllegalArgumentException("read must be flow based: " + read);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public FlowModeFragment(final GATKRead first, final SAMFileHeader header, int pa
headerLibraryMap.get(MarkDuplicatesSparkUtils.getLibraryForRead(first, header, LibraryIdGenerator.UNKNOWN_LIBRARY)));

this.score = (this.end != FlowBasedReadUtils.FLOW_BASED_INSIGNIFICANT_END)
? ((mdArgs.FLOW_QUALITY_SUM_STRATEGY && FlowBasedReadUtils.isFlow(first)) ? computeFlowDuplicateScore(first, start, end) : scoringStrategy.score(first))
? ((mdArgs.FLOW_QUALITY_SUM_STRATEGY && FlowBasedReadUtils.hasFlowTags(first)) ? computeFlowDuplicateScore(first, start, end) : scoringStrategy.score(first))
: -1;
}

Expand Down
2 changes: 2 additions & 0 deletions src/test/java/org/broadinstitute/hellbender/GATKBaseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ public abstract class GATKBaseTest extends BaseTest {

// ~600,000 reads from chromosomes 20 and 21 of an NA12878 WGS bam aligned to b37, plus ~50,000 unmapped reads
public static final String NA12878_20_21_WGS_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam";
public static final String NA12878_20_21_WGS_mmp2_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.mmp2.bam";
public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram";

public static final String NA12878_20_21_covered_regions = publicTestDir + "wgs_calling_regions.v1.chr20_chr21.interval_list";

// ~10,000 reads from chromosome 20 of NA12878 RNA-seq, aligned to b37 with STAR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,38 @@ public void testVCFModeIsConsistentWithPastResults(final String inputFileName, f
IntegrationTestSpec.assertEqualTextFiles(output, expected);
}
}


/*
* Test that minimap2 data are supported and consistent with past results
*/
@Test
public void testVCFModeMinimap2IsConsistentWithPastResults() throws Exception {
final String inputFileName = NA12878_20_21_WGS_mmp2_bam;
final String referenceFileName = b37_reference_20_21;
Utils.resetRandomGenerator();

final File output = createTempFile("testVCFModeIsConsistentWithPastResults", ".vcf");
final File expected = new File(TEST_FILES_DIR, "expected.testVCFMode.mmp2.gatk4.vcf");

final String outputPath = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected.getAbsolutePath() : output.getAbsolutePath();

final String[] args = {
"-I", inputFileName,
"-R", referenceFileName,
"-L", "20:10000000-10100000",
"-O", outputPath,
"-pairHMM", "AVX_LOGLESS_CACHING",
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"
};

runCommandLine(args);

// Test for an exact match against past results
if ( ! UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) {
IntegrationTestSpec.assertEqualTextFiles(output, expected);
}
}

/*
* Test that in JunctionTree mode we're consistent with past JunctionTree results (over non-complicated data)
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
package org.broadinstitute.hellbender.utils.read;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.*;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;


public class FlowBasedReadUtilsUnitTest extends GATKBaseTest{
@Test
void testReadGroupParsing() throws Exception{
void testReadGroupParsing(){
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/";
final String inputDir = testResourceDir + "/input/";

Expand All @@ -24,9 +26,116 @@ void testReadGroupParsing() throws Exception{
FlowBasedReadUtils.ReadGroupInfo frg1 = new FlowBasedReadUtils.ReadGroupInfo(rg1);
assert(frg1.maxClass==12);
assert(frg1.flowOrder.startsWith("TGCA"));
assert(frg1.flowOrder.startsWith("TGCA"));
assert(frg1.isFlowPlatform);

SAMReadGroupRecord rg2 = header.getReadGroup("UGAv3-73");
FlowBasedReadUtils.ReadGroupInfo frg2 = new FlowBasedReadUtils.ReadGroupInfo(rg2);
assert(frg2.maxClass==20);
assert(frg2.flowOrder.startsWith("TGCA"));
assert(frg2.isFlowPlatform);
SAMReadGroupRecord rg3 = header.getReadGroup("UGAv3-74");
FlowBasedReadUtils.ReadGroupInfo frg3 = new FlowBasedReadUtils.ReadGroupInfo(rg3);
assert(!frg3.isFlowPlatform);

}

//testing that an occasional read with tp (but not ULTIMA) does not get identified as flow-based.
// Here the read group is:
// @RG ID:test SM:HG001 PL:ILLUMINA
// but the reads have tp because they were aligned with minimap2:
// 20FUKAAXX100202:5:43:9081:11499/2 272 chr14_GL000009v2_random 156269 ... tp:A:S cm:i:12 s1:i:87 de:f:0.0099 rl:i:0
@Test
void testNonFlowReadGroupParsing(){
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/";
final String inputDir = testResourceDir + "/input/";

final Path inputFile = FileSystems.getDefault().getPath(inputDir, "non_flow_reads_with_tp.bam");
final SamReader reader = SamReaderFactory.makeDefault().open(new File(inputFile.toString()));
SAMFileHeader header = reader.getFileHeader();
SAMRecord read = reader.iterator().next();
GATKRead gread = new SAMRecordToGATKReadAdapter(read);
assert(FlowBasedReadUtils.hasFlowTags(gread));
FlowBasedReadUtils.ReadGroupInfo frg1 = FlowBasedReadUtils.getReadGroupInfo(header, gread);
assert(!frg1.isFlowPlatform);
}

private GATKRead makeRead(final byte[] bases, final boolean isReverse) {

byte[] quals = new byte[bases.length];

final String cigar = String.format("%dM", bases.length);
GATKRead read = ArtificialReadUtils.createArtificialRead(bases, quals, cigar);
read.setPosition("chr1",100);
read.setIsReverseStrand(isReverse);
return read;
}

@DataProvider(name="isFlowPlatform")
Object[][] getIsFlowPlatformInputs() {
List<Object[]> tests = new ArrayList<>();
SAMFileHeader hdr = new SAMFileHeader();

GATKRead read1 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1");
rg1.setPlatform("ILLUMINA");
read1.setReadGroup("rg1");
hdr.addReadGroup(rg1);
tests.add(new Object[]{hdr, read1, false});

GATKRead read2 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read2.setAttribute("tp","blabla");
SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2");
rg2.setPlatform("ILLUMINA");
read2.setReadGroup("rg2");
hdr.addReadGroup(rg2);
tests.add(new Object[]{hdr, read2, false});

GATKRead read3 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read3.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3");
rg3.setPlatform("LS454");
read3.setReadGroup("rg3");
hdr.addReadGroup(rg3);
tests.add(new Object[]{hdr, read3, true});

GATKRead read4 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read4.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg4 = new SAMReadGroupRecord("rg4");
rg4.setPlatform("ULTIMA");
rg4.setAttribute("FO","TGCATGCA");
read4.setReadGroup("rg4");
hdr.addReadGroup(rg4);
tests.add(new Object[]{hdr, read4, true});

GATKRead read5 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read5.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg5 = new SAMReadGroupRecord("rg5");
rg5.setPlatform("LS454");
rg5.setAttribute("FO","TGCATGCA");
read5.setReadGroup("rg5");
hdr.addReadGroup(rg5);
tests.add(new Object[]{hdr, read5, true});


return tests.toArray(new Object[][]{});
}

@Test (dataProvider = "isFlowPlatform")
void testIsFlowPlatform(final SAMFileHeader hdr, final GATKRead read, final boolean answer){
assert(FlowBasedReadUtils.isFlowPlatform(hdr, read) == answer);
}

@Test(expectedExceptions = {RuntimeException.class})
void testMalformedUltimaRGThrowsException(){
GATKRead read = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg = new SAMReadGroupRecord("rg");
rg.setPlatform("ULTIMA");
read.setReadGroup("rg");
SAMFileHeader hdr = new SAMFileHeader();
hdr.addReadGroup(rg);
FlowBasedReadUtils.isFlowPlatform(hdr, read);
}

}
Git LFS file not shown
Git LFS file not shown
Loading

0 comments on commit 4456792

Please sign in to comment.