Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Created a debug output mode that dumps the the exact inputs/outputs of the PairHMM to a file #7660

Merged
merged 6 commits into from
Mar 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ public static ReadLikelihoodCalculationEngine createLikelihoodCalculationEngine(
: QualityUtils.qualToErrorProbLog10(likelihoodArgs.phredScaledGlobalReadMismappingRate);

return new PairHMMLikelihoodCalculationEngine((byte) likelihoodArgs.gcpHMM, likelihoodArgs.dontUseDragstrPairHMMScores ? null : DragstrParamUtils.parse(likelihoodArgs.dragstrParams),
likelihoodArgs.pairHMMNativeArgs.getPairHMMArgs(), likelihoodArgs.pairHMM, log10GlobalReadMismappingRate, likelihoodArgs.pcrErrorModel,
likelihoodArgs.pairHMMNativeArgs.getPairHMMArgs(), likelihoodArgs.pairHMM, likelihoodArgs.pairHmmResultsFile, log10GlobalReadMismappingRate, likelihoodArgs.pcrErrorModel,
likelihoodArgs.BASE_QUALITY_SCORE_THRESHOLD, likelihoodArgs.enableDynamicReadDisqualification, likelihoodArgs.readDisqualificationThresholdConstant,
likelihoodArgs.expectedErrorRatePerBase, !likelihoodArgs.disableSymmetricallyNormalizeAllelesToReference, likelihoodArgs.disableCapReadQualitiesToMapQ, handleSoftclips);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ public final class LikelihoodEngineArgumentCollection implements Serializable {
@Argument(fullName="dynamic-read-disqualification-threshold", doc="Constant used to scale the dynamic read disqualificaiton")
public double readDisqualificationThresholdConstant = PairHMMLikelihoodCalculationEngine.DEFAULT_DYNAMIC_DISQUALIFICATION_SCALE_FACTOR;

/**
* Argument for generating a file of all of the inputs and outputs for the pair hmm
*/
@Advanced
@Argument(fullName="pair-hmm-results-file", doc="File to write exact pairHMM inputs/outputs to for debugging purposes", optional = true)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do all of the available HMMs respect this argument, or only some of them? If only some, list the ones that respect it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all of them as far as i've tested. I have a test for all of them but the AVX was producing some different output on travis...

public GATKPath pairHmmResultsFile = null;

@ArgumentCollection
public PairHMMNativeArgumentCollection pairHMMNativeArgs = new PairHMMNativeArgumentCollection();
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.pairhmm.PairHMMNativeArguments;
import org.broadinstitute.hellbender.engine.GATKPath;
import org.broadinstitute.hellbender.utils.dragstr.DragstrParams;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.QualityUtils;
Expand All @@ -18,6 +19,7 @@
import org.broadinstitute.hellbender.utils.read.ReadUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVariantContextUtils;

import java.io.OutputStreamWriter;
import java.util.*;
import java.util.function.ToDoubleFunction;

Expand Down Expand Up @@ -104,14 +106,15 @@ public PairHMMLikelihoodCalculationEngine(final byte constantGCP,
final PairHMM.Implementation hmmType,
final double log10globalReadMismappingRate,
final PCRErrorModel pcrErrorModel) {
this( constantGCP, dragstrParams, arguments, hmmType, log10globalReadMismappingRate, pcrErrorModel, PairHMM.BASE_QUALITY_SCORE_THRESHOLD, false, DEFAULT_DYNAMIC_DISQUALIFICATION_SCALE_FACTOR, DEFAULT_EXPECTED_ERROR_RATE_PER_BASE, true, false, true);
this( constantGCP, dragstrParams, arguments, hmmType, null, log10globalReadMismappingRate, pcrErrorModel, PairHMM.BASE_QUALITY_SCORE_THRESHOLD, false, DEFAULT_DYNAMIC_DISQUALIFICATION_SCALE_FACTOR, DEFAULT_EXPECTED_ERROR_RATE_PER_BASE, true, false, true);
}

/**
* Create a new PairHMMLikelihoodCalculationEngine using provided parameters and hmm to do its calculations
*
* @param constantGCP the gap continuation penalty to use with the PairHMM
* @param hmmType the type of the HMM to use
* @param resultsFile output file to dump per-read, per-haplotype inputs and outputs for debugging purposes (null if not enabled).
* @param log10globalReadMismappingRate the global mismapping probability, in log10(prob) units. A value of
* -3 means that the chance that a read doesn't actually belong at this
* location in the genome is 1 in 1000. The effect of this parameter is
Expand All @@ -128,6 +131,7 @@ public PairHMMLikelihoodCalculationEngine(final byte constantGCP,
final DragstrParams dragstrParams,
final PairHMMNativeArguments arguments,
final PairHMM.Implementation hmmType,
final GATKPath resultsFile,
jamesemery marked this conversation as resolved.
Show resolved Hide resolved
final double log10globalReadMismappingRate,
final PCRErrorModel pcrErrorModel,
final byte baseQualityScoreThreshold,
Expand All @@ -150,6 +154,9 @@ public PairHMMLikelihoodCalculationEngine(final byte constantGCP,
this.log10globalReadMismappingRate = log10globalReadMismappingRate;
this.pcrErrorModel = this.dragstrParams == null ? pcrErrorModel : PCRErrorModel.NONE;
this.pairHMM = hmmType.makeNewHMM(arguments);
if (resultsFile != null) {
pairHMM.setAndInitializeDebugOutputStream(new OutputStreamWriter(resultsFile.getOutputStream()));
}
this.dynamicDisqualification = dynamicReadDisqualificaiton;
this.readDisqualificationScale = readDisqualificationScale;
this.symmetricallyNormalizeAllelesToReference = symmetricallyNormalizeAllelesToReference;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package org.broadinstitute.hellbender.utils.pairhmm;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.SAMUtils;
import htsjdk.variant.variantcontext.Allele;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.gatk.nativebindings.pairhmm.PairHMMNativeArguments;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils;
import org.broadinstitute.hellbender.utils.Utils;
Expand All @@ -13,6 +15,8 @@
import org.broadinstitute.hellbender.utils.read.GATKRead;

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
Expand All @@ -31,6 +35,8 @@ public abstract class PairHMM implements Closeable{
protected byte[] previousHaplotypeBases;
protected int hapStartIndex;

protected OutputStreamWriter debugOutputStream;

public enum Implementation {
/* Very slow implementation which uses very accurate log10 sum functions. Only meant to be used as a reference test implementation */
EXACT(args -> {
Expand Down Expand Up @@ -228,6 +234,7 @@ public void computeLog10Likelihoods(final LikelihoodMatrix<GATKRead, Haplotype>
readBases, readQuals, readInsQuals, readDelQuals, overallGCP, isFirstHaplotype, nextAlleleBases);
logLikelihoods.set(a, readIndex, lk);
mLogLikelihoodArray[idx++] = lk;
writeToResultsFileIfApplicable(readBases, readQuals, readInsQuals, readDelQuals, overallGCP, alleleBases, lk);
jamesemery marked this conversation as resolved.
Show resolved Hide resolved
}
readIndex++;
}
Expand All @@ -239,6 +246,7 @@ public void computeLog10Likelihoods(final LikelihoodMatrix<GATKRead, Haplotype>
}
}


/**
* Compute the total probability of read arising from haplotypeBases given base substitution, insertion, and deletion
* probabilities.
Expand Down Expand Up @@ -350,12 +358,46 @@ public double[] getLogLikelihoodArray() {
return mLogLikelihoodArray;
}

/**
* Attach a debugOuputStream to this HMM instance
*/
public void setAndInitializeDebugOutputStream(final OutputStreamWriter writer) {
try {
debugOutputStream = writer;
debugOutputStream.write("# hap-bases read-bases read-qual read-ins-qual read-del-qual gcp expected-result");
} catch (IOException e) {
throw new GATKException("Error writing to specified HMM results output stream", e);
}
}

/**
* Method to be invoked by implementing HMM engines to output the various hmm inputs/outputs with uniform formatting.
*/
protected void writeToResultsFileIfApplicable(byte[] readBases, byte[] readQuals, byte[] readInsQuals, byte[] readDelQuals, byte[] overallGCP, byte[] alleleBases, double lk) {

if (debugOutputStream!= null) {
try {
debugOutputStream.write("\n" + new String(alleleBases) + " " + new String(readBases) + " " + SAMUtils.phredToFastq(readQuals) + " " + SAMUtils.phredToFastq(readInsQuals) + " " + SAMUtils.phredToFastq(readDelQuals) + " " + SAMUtils.phredToFastq(overallGCP) + " " + String.format("%e",lk));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Confirming that the values in this table will never contain internal whitespace?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. If the alleles or the phred to fastq mehtods are returning results with whitespaces in them we have much bigger problems...

} catch (IOException e) {
throw new GATKException("Error writing to specified HMM results output stream", e);
}
}
}

/**
* Called at the end of the program to close files, print profiling information etc
*/
@Override
public void close() {
if(doProfiling)
logger.info("Total compute time in PairHMM computeLogLikelihoods() : "+(pairHMMComputeTime*1e-9));
if(doProfiling) {
logger.info("Total compute time in PairHMM computeLogLikelihoods() : " + (pairHMMComputeTime * 1e-9));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the time unit here? Seconds?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

seconds I believe just as it has in the past for gatk. Its using nanoTime so converting to seconds should be 10^-9.

}
if(debugOutputStream != null) {
try {
debugOutputStream.close();
} catch (IOException e) {
throw new GATKException("Error closing the pairHMM debug output stream", e);
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ public void computeLog10Likelihoods(final LikelihoodMatrix<GATKRead, Haplotype>
//get idx of current haplotype in the list and use this idx to get the right likelihoodValue
final int idxInsideHaplotypeList = haplotypeToHaplotypeListIdxMap.get(haplotype);
logLikelihoods.set(hapIdx, r, mLogLikelihoodArray[readIdx + idxInsideHaplotypeList]);
writeToResultsFileIfApplicable(readDataArray[r].readBases, readDataArray[r].readQuals, readDataArray[r].insertionGOP, readDataArray[r].deletionGOP, readDataArray[r].overallGCP, haplotype.getBases(), mLogLikelihoodArray[readIdx + idxInsideHaplotypeList]);
++hapIdx;
}
readIdx += numHaplotypes;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.pairhmm.PairHMM;
import org.broadinstitute.hellbender.utils.read.GATKRead;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;
import org.broadinstitute.hellbender.utils.variant.HomoSapiensConstants;
import org.testng.Assert;
Expand Down Expand Up @@ -1606,4 +1608,58 @@ private static boolean isGVCFReferenceBlock( final VariantContext vc ) {
vc.getAlternateAlleles().size() == 1 &&
vc.getAlternateAllele(0).equals(Allele.NON_REF_ALLELE);
}


@DataProvider(name="PairHMMResultsModes")
public Object[][] PairHMMResultsModes() {
return new Object[][] {
{PairHMM.Implementation.AVX_LOGLESS_CACHING, new File(TEST_FILES_DIR, "expected.AVX.hmmresults.txt")},
{PairHMM.Implementation.LOGLESS_CACHING, new File(TEST_FILES_DIR, "expected.Java.hmmresults.txt")},
{PairHMM.Implementation.ORIGINAL, new File(TEST_FILES_DIR, "expected.Original.hmmresults.txt")},
{PairHMM.Implementation.EXACT, new File(TEST_FILES_DIR, "expected.Exact.hmmresults.txt")},

};
}

@Test(dataProvider = "PairHMMResultsModes")
public void testPairHMMResultsFile(final PairHMM.Implementation implementation, final File expected) throws Exception {
Utils.resetRandomGenerator();

final File vcfOutput = createTempFile("hmmResultFileTest", ".vcf");
final File hmmOutput = createTempFile("hmmResult", ".txt");

final String hmmOutputPath = UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ? expected.getAbsolutePath() : hmmOutput.getAbsolutePath();

final String[] args = {
"-I", NA12878_20_21_WGS_bam,
"-R", b37_reference_20_21,
"-L", "20:10000117",
"-ip", "100",
"-O", vcfOutput.getAbsolutePath(),
"--pair-hmm-results-file", hmmOutputPath,
"-pairHMM", implementation.name()
};

runCommandLine(args);

if ( ! UPDATE_EXACT_MATCH_EXPECTED_OUTPUTS ) {
// Travis instances appear to produce subtly different results for the AVX caching results. Here we ensure that
// the test is weak enough to pass even if there are some integer rounding mismatches.
// TODO It merits investigation into what exactly is mismatching on travis
if (implementation == PairHMM.Implementation.AVX_LOGLESS_CACHING) {
XReadLines actualLines = new XReadLines(hmmOutput);
XReadLines expectedLines = new XReadLines(expected);

while (actualLines.hasNext() && expectedLines.hasNext()) {
final String expectedLine = expectedLines.next();
final String actualLine = actualLines.next();
Assert.assertEquals(actualLine.split(" ").length, expectedLine.split(" ").length);
}
Assert.assertEquals(actualLines.hasNext(), expectedLines.hasNext());
// For the java HMMs we expect exact matching outputs so we assert that.
} else {
IntegrationTestSpec.assertEqualTextFiles(hmmOutput, expected);
}
}
}
}
Loading