Skip to content

Commit

Permalink
Update HTSJDK to 4.1.1 and Picard to 3.2.0
Browse files Browse the repository at this point in the history
Included a unit test to check for the presence of the fix in HTSJDK 4.1.1 for the
CRAM base corruption bug reported in #8768

Resolves #8768
  • Loading branch information
droazen committed Jun 29, 2024
1 parent 92dc4ae commit a070c47
Show file tree
Hide file tree
Showing 2 changed files with 175 additions and 2 deletions.
4 changes: 2 additions & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ repositories {
mavenLocal()
}

final htsjdkVersion = System.getProperty('htsjdk.version','4.1.0')
final picardVersion = System.getProperty('picard.version','3.1.1')
final htsjdkVersion = System.getProperty('htsjdk.version','4.1.1')
final picardVersion = System.getProperty('picard.version','3.2.0')
final barclayVersion = System.getProperty('barclay.version','5.0.0')
final sparkVersion = System.getProperty('spark.version', '3.5.0')
final hadoopVersion = System.getProperty('hadoop.version', '3.3.6')
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
package org.broadinstitute.hellbender.engine;

import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.cram.build.CRAMReferenceRegion;
import htsjdk.samtools.cram.ref.ReferenceContext;
import htsjdk.samtools.cram.ref.ReferenceSource;
import htsjdk.samtools.cram.structure.AlignmentContext;
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class CRAMSupportUnitTest extends GATKBaseTest {

private static final int REFERENCE_SEQUENCE_ZERO = 0;
private static final int REFERENCE_CONTIG_LENGTH = 10000;

private static final SAMFileHeader SAM_FILE_HEADER = createSAMFileHeader();

private static SAMFileHeader createSAMFileHeader() {
final List<SAMSequenceRecord> sequenceRecords = new ArrayList<>();
sequenceRecords.add(new SAMSequenceRecord("0", REFERENCE_CONTIG_LENGTH));
sequenceRecords.add(new SAMSequenceRecord("1", REFERENCE_CONTIG_LENGTH));
final SAMFileHeader header = new SAMFileHeader(new SAMSequenceDictionary(sequenceRecords));
header.setSortOrder(SAMFileHeader.SortOrder.coordinate);
return header;
}

/*
* This is a unit test ported from CRAMReferenceRegionTest in HTSJDK that tests for the conditions that
* trigger the CRAM base corruption bug reported in https://github.com/broadinstitute/gatk/issues/8768 and
* fixed in HTSJDK 4.1.1 / GATK 4.6.0.0.
*
* This test fails using HTSJDK version 4.1.0, which predates the fix in https://github.com/samtools/htsjdk/pull/1708
* for the bug reported in https://github.com/broadinstitute/gatk/issues/8768
*
* Simulates the state transitions that occur when writing a CRAM file; specifically, use transitions that mirror
* the ones that occur when writing a CRAM with the conditions that affect (and that fail without the fix to)
* https://github.com/broadinstitute/gatk/issues/8768, i.e., a container with one or more containers with reads
* aligned to position 1 of a given contig, followed by one or more containers with reads aligned past position 1
* of the same contig.
*/
@Test
public void testCRAMBaseCorruptionBugIssue8768() {
// start with an entire reference sequence
final CRAMReferenceRegion cramReferenceRegion = getAlternatingReferenceRegion();
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
final long fullRegionFragmentLength = cramReferenceRegion.getRegionLength();
Assert.assertEquals(fullRegionFragmentLength, REFERENCE_CONTIG_LENGTH);

// transition to a shorter reference fragment using fetchReferenceBasesByRegion, then back to the full region
final int SHORT_FRAGMENT_LENGTH = 5;
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(REFERENCE_SEQUENCE_ZERO, 0, SHORT_FRAGMENT_LENGTH);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence; this is where the bug previously would have occurred
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
// this assert would fail without the fix
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);

// transition to a shorter region fragment length using fetchReferenceBasesByRegion(AlignmentContext), then back to the full region
Assert.assertTrue(SHORT_FRAGMENT_LENGTH < fullRegionFragmentLength);
cramReferenceRegion.fetchReferenceBasesByRegion(
new AlignmentContext(
new ReferenceContext(REFERENCE_SEQUENCE_ZERO),
1,
SHORT_FRAGMENT_LENGTH));
Assert.assertEquals(cramReferenceRegion.getRegionLength(), SHORT_FRAGMENT_LENGTH);

// now transition back to the full sequence
cramReferenceRegion.fetchReferenceBases(REFERENCE_SEQUENCE_ZERO);
Assert.assertEquals(cramReferenceRegion.getRegionLength(), fullRegionFragmentLength);
}

private static CRAMReferenceRegion getAlternatingReferenceRegion() {
return new CRAMReferenceRegion(
new ReferenceSource(getReferenceFileWithAlternatingBases(REFERENCE_CONTIG_LENGTH)),
SAM_FILE_HEADER.getSequenceDictionary());
}

private static InMemoryReferenceSequenceFile getReferenceFileWithAlternatingBases(final int length) {
final InMemoryReferenceSequenceFile referenceFile = new InMemoryReferenceSequenceFile();

// one contig with repeated ACGT...
final byte[] seq0Bases = getRepeatingBaseSequence(REFERENCE_CONTIG_LENGTH, false);
referenceFile.add("0", seq0Bases);

// one contig with repeated TGCA...
final byte[] seq1Bases = getRepeatingBaseSequence(REFERENCE_CONTIG_LENGTH, true);
referenceFile.add("1", seq1Bases);

return referenceFile;
}

// fill an array with the repeated base sequence "ACGTACGTACGT...", or reversed
private static byte[] getRepeatingBaseSequence(final int length, final boolean reversed) {
byte[] bases = new byte[length];
for (int i = 0; (i + 4) < bases.length; i += 4) {
bases[i] = (byte) (reversed ? 'T' : 'A');
bases[i+1] = (byte) (reversed ? 'G' : 'C');
bases[i+2] = (byte) (reversed ? 'C' : 'G');
bases[i+3] = (byte) (reversed ? 'A' : 'T');
}
return bases;
}
private static class InMemoryReferenceSequenceFile implements
ReferenceSequenceFile {
Map<String, ReferenceSequence> map = new HashMap<String, ReferenceSequence>();
List<String> index;
int current = 0;

public void add(final String name, final byte[] bases) {
final ReferenceSequence sequence = new ReferenceSequence(name,
map.size(), bases);
map.put(sequence.getName(), sequence);
}

@Override
public void reset() {
current = 0;
}

@Override
public ReferenceSequence nextSequence() {
if (current >= index.size()) return null;
return map.get(index.get(current++));
}

@Override
public boolean isIndexed() {
return true;
}

@Override
public ReferenceSequence getSubsequenceAt(final String contig, final long start,
final long stop) {
final ReferenceSequence sequence = getSequence(contig);
if (sequence == null) return null;
final byte[] bases = new byte[(int) (stop - start) + 1];
System.arraycopy(sequence.getBases(), (int) start - 1, bases, 0,
bases.length);
return new ReferenceSequence(contig, sequence.getContigIndex(),
bases);
}

@Override
public SAMSequenceDictionary getSequenceDictionary() {
return null;
}

@Override
public ReferenceSequence getSequence(final String contig) {
return map.get(contig);
}

@Override
public void close() throws IOException {
map.clear();
index.clear();
current = 0;
}
}
}

0 comments on commit a070c47

Please sign in to comment.