-
Notifications
You must be signed in to change notification settings - Fork 586
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fixed an issue of data that contain tp tag sometimes recognized as flow #8337
Changes from all commits
9fb19f3
096470a
06edf28
f22faa3
a01aa7a
9b700d6
4115227
5ce230a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,29 @@ | ||
package org.broadinstitute.hellbender.utils.read; | ||
|
||
import gov.nih.nlm.ncbi.ngs.NGS; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.SAMReadGroupRecord; | ||
import htsjdk.samtools.SAMRecord; | ||
import htsjdk.samtools.util.SequenceUtil; | ||
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection; | ||
import org.broadinstitute.hellbender.exceptions.GATKException; | ||
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection; | ||
import org.broadinstitute.hellbender.utils.NGSPlatform; | ||
import org.broadinstitute.hellbender.utils.SequencerFlowClass; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
|
||
import java.util.LinkedHashMap; | ||
import java.util.Map; | ||
|
||
|
||
/** | ||
* utility class for flow based read | ||
* Utility class for working with flow-based reads | ||
* | ||
* The main member static class is {@code ReadGroupInfo} that contains methods that allow | ||
* working with headers of flow based reads and extracting the flow order and the maximal hmer class called. | ||
* It also contains methods to check how the read was clipped ({@code readEndMarkedUnclipped}, {@code readEndMarkedUncertain}) | ||
* Lastly, {@code FlowBasedReadUtils.isFlowPlatform} is **the** function to determine if the data are flow-based | ||
* | ||
*/ | ||
public class FlowBasedReadUtils { | ||
|
||
|
@@ -26,17 +36,35 @@ public class FlowBasedReadUtils { | |
static public class ReadGroupInfo { | ||
final public String flowOrder; | ||
final public int maxClass; | ||
|
||
final public boolean isFlowPlatform; | ||
private String reversedFlowOrder = null; | ||
|
||
public ReadGroupInfo(final SAMReadGroupRecord readGroup) { | ||
if (readGroup.getPlatform()==null){ | ||
isFlowPlatform = false; | ||
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform())==NGSPlatform.UNKNOWN){ | ||
isFlowPlatform = false; | ||
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.LS454) { | ||
//old Ultima data can have PLATFORM==LS454 and not have FO tag | ||
isFlowPlatform = true; | ||
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.ULTIMA){ | ||
if (readGroup.getFlowOrder()!=null) { | ||
isFlowPlatform = true; | ||
} else { | ||
throw new RuntimeException("Malformed Ultima read group identified, aborting: " + readGroup); | ||
} | ||
} else { | ||
isFlowPlatform = false; | ||
} | ||
|
||
Utils.nonNull(readGroup); | ||
this.flowOrder = readGroup.getFlowOrder(); | ||
Utils.nonNull(this.flowOrder); | ||
|
||
String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG); | ||
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc); | ||
if (isFlowPlatform) { | ||
this.flowOrder = readGroup.getFlowOrder(); | ||
String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG); | ||
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc); | ||
} else { // not a flow platform | ||
this.flowOrder = null; | ||
this.maxClass = 0; | ||
} | ||
} | ||
|
||
public synchronized String getReversedFlowOrder() { | ||
|
@@ -45,6 +73,7 @@ public synchronized String getReversedFlowOrder() { | |
} | ||
return reversedFlowOrder; | ||
} | ||
|
||
} | ||
|
||
public static boolean readEndMarkedUncertain(final GATKRead rec) { | ||
|
@@ -118,22 +147,36 @@ public static int flowSumOfBaseQualities(final GATKRead read) { | |
} | ||
} | ||
|
||
public static boolean isFlow(final GATKRead rec) { | ||
public static boolean hasFlowTags(final GATKRead rec) { | ||
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME) | ||
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR) | ||
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI); | ||
} | ||
|
||
public static boolean isFlow(final SAMRecord rec) { | ||
public static boolean hasFlowTags(final SAMRecord rec) { | ||
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME) | ||
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR) | ||
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI); | ||
|
||
} | ||
|
||
/** | ||
* | ||
* This is the function to run if you want to ask if the data are flow-based | ||
* | ||
* @param hdr - file header | ||
* @param read - the read | ||
* @return true if the read is flow-based | ||
*/ | ||
public static boolean isFlowPlatform(final SAMFileHeader hdr, final GATKRead read) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you just add a javadoc to this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And also possibly update the class javadoc to point people to this method as the definitive "is this flow based and should i trigger that code" method There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fixed |
||
if (!hasFlowTags(read)){ | ||
return false; | ||
} | ||
return getReadGroupInfo(hdr, read).isFlowPlatform; | ||
} | ||
public static synchronized ReadGroupInfo getReadGroupInfo(final SAMFileHeader hdr, final GATKRead read) { | ||
|
||
if ( !isFlow(read) ) { | ||
if ( !hasFlowTags(read) ) { | ||
throw new IllegalArgumentException("read must be flow based: " + read); | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -101,7 +101,38 @@ public void testVCFModeIsConsistentWithPastResults(final String inputFileName, f | |
IntegrationTestSpec.assertEqualTextFiles(output, expected); | ||
} | ||
} | ||
|
||
|
||
/* | ||
* Test that minimap2 data are supported and consistent with past results | ||
*/ | ||
@Test | ||
public void testVCFModeMinimap2IsConsistentWithPastResults() throws Exception { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this doesn't get run without adding the @test tag There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
final String inputFileName = NA12878_20_21_WGS_mmp2_bam; | ||
final String referenceFileName = b37_reference_20_21; | ||
Utils.resetRandomGenerator(); | ||
|
||
final File output = createTempFile("testVCFModeIsConsistentWithPastResults", ".vcf"); | ||
final File expected = new File(TEST_FILES_DIR, "expected.testVCFMode.mmp2.gatk4.vcf"); | ||
|
||
final String outputPath = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected.getAbsolutePath() : output.getAbsolutePath(); | ||
|
||
final String[] args = { | ||
"-I", inputFileName, | ||
"-R", referenceFileName, | ||
"-L", "20:10000000-10100000", | ||
"-O", outputPath, | ||
"-pairHMM", "AVX_LOGLESS_CACHING", | ||
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false" | ||
}; | ||
|
||
runCommandLine(args); | ||
|
||
// Test for an exact match against past results | ||
if ( ! UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) { | ||
IntegrationTestSpec.assertEqualTextFiles(output, expected); | ||
} | ||
} | ||
|
||
/* | ||
* Test that in JunctionTree mode we're consistent with past JunctionTree results (over non-complicated data) | ||
*/ | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,19 +1,21 @@ | ||
package org.broadinstitute.hellbender.utils.read; | ||
import htsjdk.samtools.SAMFileHeader; | ||
import htsjdk.samtools.SAMReadGroupRecord; | ||
import htsjdk.samtools.SamReader; | ||
import htsjdk.samtools.SamReaderFactory; | ||
import htsjdk.samtools.*; | ||
import org.broadinstitute.hellbender.GATKBaseTest; | ||
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection; | ||
import org.testng.Assert; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.nio.file.FileSystems; | ||
import java.nio.file.Path; | ||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
|
||
public class FlowBasedReadUtilsUnitTest extends GATKBaseTest{ | ||
@Test | ||
void testReadGroupParsing() throws Exception{ | ||
void testReadGroupParsing(){ | ||
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/"; | ||
final String inputDir = testResourceDir + "/input/"; | ||
|
||
|
@@ -24,9 +26,116 @@ void testReadGroupParsing() throws Exception{ | |
FlowBasedReadUtils.ReadGroupInfo frg1 = new FlowBasedReadUtils.ReadGroupInfo(rg1); | ||
assert(frg1.maxClass==12); | ||
assert(frg1.flowOrder.startsWith("TGCA")); | ||
assert(frg1.flowOrder.startsWith("TGCA")); | ||
assert(frg1.isFlowPlatform); | ||
|
||
SAMReadGroupRecord rg2 = header.getReadGroup("UGAv3-73"); | ||
FlowBasedReadUtils.ReadGroupInfo frg2 = new FlowBasedReadUtils.ReadGroupInfo(rg2); | ||
assert(frg2.maxClass==20); | ||
assert(frg2.flowOrder.startsWith("TGCA")); | ||
assert(frg2.isFlowPlatform); | ||
SAMReadGroupRecord rg3 = header.getReadGroup("UGAv3-74"); | ||
FlowBasedReadUtils.ReadGroupInfo frg3 = new FlowBasedReadUtils.ReadGroupInfo(rg3); | ||
assert(!frg3.isFlowPlatform); | ||
|
||
} | ||
|
||
//testing that an occasional read with tp (but not ULTIMA) does not get identified as flow-based. | ||
// Here the read group is: | ||
// @RG ID:test SM:HG001 PL:ILLUMINA | ||
// but the reads have tp because they were aligned with minimap2: | ||
// 20FUKAAXX100202:5:43:9081:11499/2 272 chr14_GL000009v2_random 156269 ... tp:A:S cm:i:12 s1:i:87 de:f:0.0099 rl:i:0 | ||
@Test | ||
void testNonFlowReadGroupParsing(){ | ||
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/"; | ||
final String inputDir = testResourceDir + "/input/"; | ||
|
||
final Path inputFile = FileSystems.getDefault().getPath(inputDir, "non_flow_reads_with_tp.bam"); | ||
final SamReader reader = SamReaderFactory.makeDefault().open(new File(inputFile.toString())); | ||
SAMFileHeader header = reader.getFileHeader(); | ||
SAMRecord read = reader.iterator().next(); | ||
GATKRead gread = new SAMRecordToGATKReadAdapter(read); | ||
assert(FlowBasedReadUtils.hasFlowTags(gread)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you record the actual readgroups from that file in a comment here and add a bit of explanation to this test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed |
||
FlowBasedReadUtils.ReadGroupInfo frg1 = FlowBasedReadUtils.getReadGroupInfo(header, gread); | ||
assert(!frg1.isFlowPlatform); | ||
} | ||
|
||
private GATKRead makeRead(final byte[] bases, final boolean isReverse) { | ||
|
||
byte[] quals = new byte[bases.length]; | ||
|
||
final String cigar = String.format("%dM", bases.length); | ||
GATKRead read = ArtificialReadUtils.createArtificialRead(bases, quals, cigar); | ||
read.setPosition("chr1",100); | ||
read.setIsReverseStrand(isReverse); | ||
return read; | ||
} | ||
|
||
@DataProvider(name="isFlowPlatform") | ||
Object[][] getIsFlowPlatformInputs() { | ||
List<Object[]> tests = new ArrayList<>(); | ||
SAMFileHeader hdr = new SAMFileHeader(); | ||
|
||
GATKRead read1 = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); | ||
rg1.setPlatform("ILLUMINA"); | ||
read1.setReadGroup("rg1"); | ||
hdr.addReadGroup(rg1); | ||
tests.add(new Object[]{hdr, read1, false}); | ||
|
||
GATKRead read2 = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
read2.setAttribute("tp","blabla"); | ||
SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); | ||
rg2.setPlatform("ILLUMINA"); | ||
read2.setReadGroup("rg2"); | ||
hdr.addReadGroup(rg2); | ||
tests.add(new Object[]{hdr, read2, false}); | ||
|
||
GATKRead read3 = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
read3.setAttribute("tp",new byte[6]); | ||
SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3"); | ||
rg3.setPlatform("LS454"); | ||
read3.setReadGroup("rg3"); | ||
hdr.addReadGroup(rg3); | ||
tests.add(new Object[]{hdr, read3, true}); | ||
|
||
GATKRead read4 = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
read4.setAttribute("tp",new byte[6]); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This test looks good, can you add in a check for the special cases where Platform=454 +FO tag present and Platform=454 + no FO tag present? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
SAMReadGroupRecord rg4 = new SAMReadGroupRecord("rg4"); | ||
rg4.setPlatform("ULTIMA"); | ||
rg4.setAttribute("FO","TGCATGCA"); | ||
read4.setReadGroup("rg4"); | ||
hdr.addReadGroup(rg4); | ||
tests.add(new Object[]{hdr, read4, true}); | ||
|
||
GATKRead read5 = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
read5.setAttribute("tp",new byte[6]); | ||
SAMReadGroupRecord rg5 = new SAMReadGroupRecord("rg5"); | ||
rg5.setPlatform("LS454"); | ||
rg5.setAttribute("FO","TGCATGCA"); | ||
read5.setReadGroup("rg5"); | ||
hdr.addReadGroup(rg5); | ||
tests.add(new Object[]{hdr, read5, true}); | ||
|
||
|
||
return tests.toArray(new Object[][]{}); | ||
} | ||
|
||
@Test (dataProvider = "isFlowPlatform") | ||
void testIsFlowPlatform(final SAMFileHeader hdr, final GATKRead read, final boolean answer){ | ||
assert(FlowBasedReadUtils.isFlowPlatform(hdr, read) == answer); | ||
} | ||
|
||
@Test(expectedExceptions = {RuntimeException.class}) | ||
void testMalformedUltimaRGThrowsException(){ | ||
GATKRead read = makeRead(new byte[]{'T','A','G','C','G','A'}, false); | ||
read.setAttribute("tp",new byte[6]); | ||
SAMReadGroupRecord rg = new SAMReadGroupRecord("rg"); | ||
rg.setPlatform("ULTIMA"); | ||
read.setReadGroup("rg"); | ||
SAMFileHeader hdr = new SAMFileHeader(); | ||
hdr.addReadGroup(rg); | ||
FlowBasedReadUtils.isFlowPlatform(hdr, read); | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know 454 data is ancient at this point but i would still prefer this LS454 case to explicitly check for
I'm not sure i trust the Ultima flow ordering code is going to handle 454 data well without testing it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is not this check is supposed to do. Old Ultima data were labeled LS454 and initially did not have the FO tag. Afterwards the tag was added .
The full check is done by isFlowPlatform function that also verifies that the reads have tp tag.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ahh i see... horrifying... well i'm alright to let this through in this case then