Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed an issue of data that contain tp tag sometimes recognized as flow #8337

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,6 @@ public static void finalizeRegion(final AssemblyRegion region,
if ( region.isFinalized() ) {
return;
}

final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;

final List<GATKRead> readsToUse = new ArrayList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ public enum NGSPlatform {
ION_TORRENT(SequencerFlowClass.FLOW, "IONTORRENT"),
CAPILLARY(SequencerFlowClass.OTHER, "CAPILLARY"),
HELICOS(SequencerFlowClass.OTHER, "HELICOS"),
ULTIMA(SequencerFlowClass.FLOW, "ULTIMA"),
UNKNOWN(SequencerFlowClass.OTHER, "UNKNOWN");


/**
* Array of the prefix names in a BAM file for each of the platforms.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ public FlowBasedRead(final GATKRead read, final String flowOrder, final int maxH
public FlowBasedRead(final SAMRecord samRecord, final String flowOrder, final int maxHmer, final FlowBasedArgumentCollection fbargs) {
super(samRecord);
Utils.nonNull(fbargs);
Utils.validate(FlowBasedReadUtils.isFlow(samRecord), "FlowBasedRead can only be used on flow reads. failing read: " + samRecord);
Utils.validate(FlowBasedReadUtils.hasFlowTags(samRecord), "FlowBasedRead can only be used on flow reads. failing read: " + samRecord);
this.fbargs = fbargs;
this.maxHmer = maxHmer;
this.samRecord = samRecord;
Expand Down Expand Up @@ -352,7 +352,7 @@ private void parseSingleHmer(final double[] probs, final byte[] tp, final int fl
if (flowMatrix[loc][flowIdx] == fbargs.fillingValue) {
flowMatrix[loc][flowIdx] = probs[i];
} else {
flowMatrix[loc][flowIdx] += probs[i];
flowMatrix[loc][flowIdx] += probs[i];
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,29 @@
package org.broadinstitute.hellbender.utils.read;

import gov.nih.nlm.ncbi.ngs.NGS;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.util.SequenceUtil;
import org.broadinstitute.hellbender.cmdline.argumentcollections.MarkDuplicatesSparkArgumentCollection;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.broadinstitute.hellbender.utils.NGSPlatform;
import org.broadinstitute.hellbender.utils.SequencerFlowClass;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.LinkedHashMap;
import java.util.Map;


/**
* utility class for flow based read
* Utility class for working with flow-based reads
*
* The main member static class is {@code ReadGroupInfo} that contains methods that allow
* working with headers of flow based reads and extracting the flow order and the maximal hmer class called.
* It also contains methods to check how the read was clipped ({@code readEndMarkedUnclipped}, {@code readEndMarkedUncertain})
* Lastly, {@code FlowBasedReadUtils.isFlowPlatform} is **the** function to determine if the data are flow-based
*
*/
public class FlowBasedReadUtils {

Expand All @@ -26,17 +36,35 @@ public class FlowBasedReadUtils {
static public class ReadGroupInfo {
final public String flowOrder;
final public int maxClass;

final public boolean isFlowPlatform;
private String reversedFlowOrder = null;

public ReadGroupInfo(final SAMReadGroupRecord readGroup) {
if (readGroup.getPlatform()==null){
isFlowPlatform = false;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform())==NGSPlatform.UNKNOWN){
isFlowPlatform = false;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.LS454) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I know 454 data is ancient at this point but i would still prefer this LS454 case to explicitly check for

if (readGroup.getFlowOrder()!=null) {
                    isFlowPlatform = true; else false;

I'm not sure i trust the Ultima flow ordering code is going to handle 454 data well without testing it

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not this check is supposed to do. Old Ultima data were labeled LS454 and initially did not have the FO tag. Afterwards the tag was added .
The full check is done by isFlowPlatform function that also verifies that the reads have tp tag.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh i see... horrifying... well i'm alright to let this through in this case then

//old Ultima data can have PLATFORM==LS454 and not have FO tag
isFlowPlatform = true;
} else if (NGSPlatform.fromReadGroupPL(readGroup.getPlatform()) == NGSPlatform.ULTIMA){
if (readGroup.getFlowOrder()!=null) {
isFlowPlatform = true;
} else {
throw new RuntimeException("Malformed Ultima read group identified, aborting: " + readGroup);
}
} else {
isFlowPlatform = false;
}

Utils.nonNull(readGroup);
this.flowOrder = readGroup.getFlowOrder();
Utils.nonNull(this.flowOrder);

String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG);
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc);
if (isFlowPlatform) {
this.flowOrder = readGroup.getFlowOrder();
String mc = readGroup.getAttribute(FlowBasedRead.MAX_CLASS_READ_GROUP_TAG);
this.maxClass = (mc == null) ? FlowBasedRead.MAX_CLASS : Integer.parseInt(mc);
} else { // not a flow platform
this.flowOrder = null;
this.maxClass = 0;
}
}

public synchronized String getReversedFlowOrder() {
Expand All @@ -45,6 +73,7 @@ public synchronized String getReversedFlowOrder() {
}
return reversedFlowOrder;
}

}

public static boolean readEndMarkedUncertain(final GATKRead rec) {
Expand Down Expand Up @@ -118,22 +147,36 @@ public static int flowSumOfBaseQualities(final GATKRead read) {
}
}

public static boolean isFlow(final GATKRead rec) {
public static boolean hasFlowTags(final GATKRead rec) {
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI);
}

public static boolean isFlow(final SAMRecord rec) {
public static boolean hasFlowTags(final SAMRecord rec) {
return rec.hasAttribute(FlowBasedRead.FLOW_MATRIX_TAG_NAME)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_KR)
|| rec.hasAttribute(FlowBasedRead.FLOW_MATRiX_OLD_TAG_TI);

}

/**
*
* This is the function to run if you want to ask if the data are flow-based
*
* @param hdr - file header
* @param read - the read
* @return true if the read is flow-based
*/
public static boolean isFlowPlatform(final SAMFileHeader hdr, final GATKRead read) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you just add a javadoc to this?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And also possibly update the class javadoc to point people to this method as the definitive "is this flow based and should i trigger that code" method

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

if (!hasFlowTags(read)){
return false;
}
return getReadGroupInfo(hdr, read).isFlowPlatform;
}
public static synchronized ReadGroupInfo getReadGroupInfo(final SAMFileHeader hdr, final GATKRead read) {

if ( !isFlow(read) ) {
if ( !hasFlowTags(read) ) {
throw new IllegalArgumentException("read must be flow based: " + read);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public FlowModeFragment(final GATKRead first, final SAMFileHeader header, int pa
headerLibraryMap.get(MarkDuplicatesSparkUtils.getLibraryForRead(first, header, LibraryIdGenerator.UNKNOWN_LIBRARY)));

this.score = (this.end != FlowBasedReadUtils.FLOW_BASED_INSIGNIFICANT_END)
? ((mdArgs.FLOW_QUALITY_SUM_STRATEGY && FlowBasedReadUtils.isFlow(first)) ? computeFlowDuplicateScore(first, start, end) : scoringStrategy.score(first))
? ((mdArgs.FLOW_QUALITY_SUM_STRATEGY && FlowBasedReadUtils.hasFlowTags(first)) ? computeFlowDuplicateScore(first, start, end) : scoringStrategy.score(first))
: -1;
}

Expand Down
2 changes: 2 additions & 0 deletions src/test/java/org/broadinstitute/hellbender/GATKBaseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ public abstract class GATKBaseTest extends BaseTest {

// ~600,000 reads from chromosomes 20 and 21 of an NA12878 WGS bam aligned to b37, plus ~50,000 unmapped reads
public static final String NA12878_20_21_WGS_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam";
public static final String NA12878_20_21_WGS_mmp2_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.mmp2.bam";
public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram";

public static final String NA12878_20_21_covered_regions = publicTestDir + "wgs_calling_regions.v1.chr20_chr21.interval_list";

// ~10,000 reads from chromosome 20 of NA12878 RNA-seq, aligned to b37 with STAR
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,38 @@ public void testVCFModeIsConsistentWithPastResults(final String inputFileName, f
IntegrationTestSpec.assertEqualTextFiles(output, expected);
}
}


/*
* Test that minimap2 data are supported and consistent with past results
*/
@Test
public void testVCFModeMinimap2IsConsistentWithPastResults() throws Exception {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't get run without adding the @test tag

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

final String inputFileName = NA12878_20_21_WGS_mmp2_bam;
final String referenceFileName = b37_reference_20_21;
Utils.resetRandomGenerator();

final File output = createTempFile("testVCFModeIsConsistentWithPastResults", ".vcf");
final File expected = new File(TEST_FILES_DIR, "expected.testVCFMode.mmp2.gatk4.vcf");

final String outputPath = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected.getAbsolutePath() : output.getAbsolutePath();

final String[] args = {
"-I", inputFileName,
"-R", referenceFileName,
"-L", "20:10000000-10100000",
"-O", outputPath,
"-pairHMM", "AVX_LOGLESS_CACHING",
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"
};

runCommandLine(args);

// Test for an exact match against past results
if ( ! UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) {
IntegrationTestSpec.assertEqualTextFiles(output, expected);
}
}

/*
* Test that in JunctionTree mode we're consistent with past JunctionTree results (over non-complicated data)
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,19 +1,21 @@
package org.broadinstitute.hellbender.utils.read;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.*;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.tools.FlowBasedArgumentCollection;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;


public class FlowBasedReadUtilsUnitTest extends GATKBaseTest{
@Test
void testReadGroupParsing() throws Exception{
void testReadGroupParsing(){
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/";
final String inputDir = testResourceDir + "/input/";

Expand All @@ -24,9 +26,116 @@ void testReadGroupParsing() throws Exception{
FlowBasedReadUtils.ReadGroupInfo frg1 = new FlowBasedReadUtils.ReadGroupInfo(rg1);
assert(frg1.maxClass==12);
assert(frg1.flowOrder.startsWith("TGCA"));
assert(frg1.flowOrder.startsWith("TGCA"));
assert(frg1.isFlowPlatform);

SAMReadGroupRecord rg2 = header.getReadGroup("UGAv3-73");
FlowBasedReadUtils.ReadGroupInfo frg2 = new FlowBasedReadUtils.ReadGroupInfo(rg2);
assert(frg2.maxClass==20);
assert(frg2.flowOrder.startsWith("TGCA"));
assert(frg2.isFlowPlatform);
SAMReadGroupRecord rg3 = header.getReadGroup("UGAv3-74");
FlowBasedReadUtils.ReadGroupInfo frg3 = new FlowBasedReadUtils.ReadGroupInfo(rg3);
assert(!frg3.isFlowPlatform);

}

//testing that an occasional read with tp (but not ULTIMA) does not get identified as flow-based.
// Here the read group is:
// @RG ID:test SM:HG001 PL:ILLUMINA
// but the reads have tp because they were aligned with minimap2:
// 20FUKAAXX100202:5:43:9081:11499/2 272 chr14_GL000009v2_random 156269 ... tp:A:S cm:i:12 s1:i:87 de:f:0.0099 rl:i:0
@Test
void testNonFlowReadGroupParsing(){
final String testResourceDir = publicTestDir + "org/broadinstitute/hellbender/utils/read/flow/reads/";
final String inputDir = testResourceDir + "/input/";

final Path inputFile = FileSystems.getDefault().getPath(inputDir, "non_flow_reads_with_tp.bam");
final SamReader reader = SamReaderFactory.makeDefault().open(new File(inputFile.toString()));
SAMFileHeader header = reader.getFileHeader();
SAMRecord read = reader.iterator().next();
GATKRead gread = new SAMRecordToGATKReadAdapter(read);
assert(FlowBasedReadUtils.hasFlowTags(gread));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you record the actual readgroups from that file in a comment here and add a bit of explanation to this test?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed

FlowBasedReadUtils.ReadGroupInfo frg1 = FlowBasedReadUtils.getReadGroupInfo(header, gread);
assert(!frg1.isFlowPlatform);
}

private GATKRead makeRead(final byte[] bases, final boolean isReverse) {

byte[] quals = new byte[bases.length];

final String cigar = String.format("%dM", bases.length);
GATKRead read = ArtificialReadUtils.createArtificialRead(bases, quals, cigar);
read.setPosition("chr1",100);
read.setIsReverseStrand(isReverse);
return read;
}

@DataProvider(name="isFlowPlatform")
Object[][] getIsFlowPlatformInputs() {
List<Object[]> tests = new ArrayList<>();
SAMFileHeader hdr = new SAMFileHeader();

GATKRead read1 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1");
rg1.setPlatform("ILLUMINA");
read1.setReadGroup("rg1");
hdr.addReadGroup(rg1);
tests.add(new Object[]{hdr, read1, false});

GATKRead read2 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read2.setAttribute("tp","blabla");
SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2");
rg2.setPlatform("ILLUMINA");
read2.setReadGroup("rg2");
hdr.addReadGroup(rg2);
tests.add(new Object[]{hdr, read2, false});

GATKRead read3 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read3.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3");
rg3.setPlatform("LS454");
read3.setReadGroup("rg3");
hdr.addReadGroup(rg3);
tests.add(new Object[]{hdr, read3, true});

GATKRead read4 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read4.setAttribute("tp",new byte[6]);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test looks good, can you add in a check for the special cases where Platform=454 +FO tag present and Platform=454 + no FO tag present?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

SAMReadGroupRecord rg4 = new SAMReadGroupRecord("rg4");
rg4.setPlatform("ULTIMA");
rg4.setAttribute("FO","TGCATGCA");
read4.setReadGroup("rg4");
hdr.addReadGroup(rg4);
tests.add(new Object[]{hdr, read4, true});

GATKRead read5 = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read5.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg5 = new SAMReadGroupRecord("rg5");
rg5.setPlatform("LS454");
rg5.setAttribute("FO","TGCATGCA");
read5.setReadGroup("rg5");
hdr.addReadGroup(rg5);
tests.add(new Object[]{hdr, read5, true});


return tests.toArray(new Object[][]{});
}

@Test (dataProvider = "isFlowPlatform")
void testIsFlowPlatform(final SAMFileHeader hdr, final GATKRead read, final boolean answer){
assert(FlowBasedReadUtils.isFlowPlatform(hdr, read) == answer);
}

@Test(expectedExceptions = {RuntimeException.class})
void testMalformedUltimaRGThrowsException(){
GATKRead read = makeRead(new byte[]{'T','A','G','C','G','A'}, false);
read.setAttribute("tp",new byte[6]);
SAMReadGroupRecord rg = new SAMReadGroupRecord("rg");
rg.setPlatform("ULTIMA");
read.setReadGroup("rg");
SAMFileHeader hdr = new SAMFileHeader();
hdr.addReadGroup(rg);
FlowBasedReadUtils.isFlowPlatform(hdr, read);
}

}
Git LFS file not shown
Git LFS file not shown
Loading