Skip to content

Commit

Permalink
Fixed a bug in pileup mode relating to number of haplotypes (#8489)
Browse files Browse the repository at this point in the history
* Limited the number of haplotypes returned in pileup mode to the number requested.
* Modified pileup assembly code to be more deterministic.
  • Loading branch information
jonn-smith committed Aug 28, 2023
1 parent de371aa commit c9bf941
Show file tree
Hide file tree
Showing 8 changed files with 113 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -470,12 +470,22 @@ static Set<Haplotype> filterPileupHaplotypes(final Set<Haplotype> onlyNewHaploty
.filter(kmer -> kmerReadCounts.getOrDefault(kmer, new MutableInt(0)).intValue() > 0)
.count()));

// if there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best
final long minimumPassingScore = scores.values().stream().sorted(Comparator.reverseOrder()).skip(numPileupHaplotypes - 1).findFirst().get();
return onlyNewHaplotypes.stream().filter(h -> scores.get(h) >= minimumPassingScore).collect(Collectors.toSet());
// Get the minimum passing score from all haplotypes:
final long minimumPassingScore = scores.values().stream()
.sorted(Comparator.reverseOrder())
.skip(numPileupHaplotypes - 1)
.findFirst().get();

// If there are ties, we pass any haplotype with a score as good as the numPileupHaplotypes-th best, with
// final ordering determined by string representation (for determinism).
return onlyNewHaplotypes.stream()
.filter(h -> scores.get(h) >= minimumPassingScore)
.sorted(Comparator.<Haplotype, Long>comparing(scores::get).reversed()
.thenComparing(Haplotype::getBaseString)
).limit(numPileupHaplotypes)
.collect(Collectors.toCollection(LinkedHashSet::new));
}


private static Set<Kmer> kmerizeSequence(byte[] sequence, int kmerSize){
final Set<Kmer> allKmers = new LinkedHashSet<>();
final int stopPosition = sequence.length - kmerSize;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -675,23 +675,27 @@ public void injectPileupEvents(final AssemblyRegion region, final AssemblyBasedC
return; // nothing to do
}

if (debug) {
logger.info("Number of haplotypes pre-pileup injection: " + this.getHaplotypeCount());
}

final Haplotype refHaplotype = getReferenceHaplotype();
final Map<Integer, List<Event>> assembledEventByStart = getVariationEvents(argumentCollection.maxMnpDistance).stream()
.collect(Collectors.groupingBy(Event::getStart));
final Collection<Event> assembledIndels = getVariationEvents(argumentCollection.maxMnpDistance).stream().
filter(Event::isIndel).collect(Collectors.toList());
filter(Event::isIndel).toList();

Set<Haplotype> baseHaplotypes = new TreeSet<>();
baseHaplotypes.addAll(getHaplotypeList().stream()
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(hap -> hap.getScore()).reversed())
.sorted(Comparator.comparingInt((Haplotype hap) -> hap.isReference() ? 1 : 0).thenComparingDouble(Haplotype::getScore).reversed())
.limit(AssemblyBasedCallerUtils.NUM_HAPLOTYPES_TO_INJECT_FORCE_CALLING_ALLELES_INTO)
.collect(Collectors.toList()));
.toList());

//TODO its unclear whether the correct answer here is to use the hardclipped pileup reads (which we used in generating the pileup alleles for specificty reasons)
//TODO or if it would be more accurate to use the softclipped bases here in filtering down the haplotypes. I suspect the latter but I will evaluate later.
Map<Kmer, MutableInt> kmerReadCounts = AssemblyBasedCallerUtils.getKmerReadCounts(region.getHardClippedPileupReads(), argumentCollection.pileupDetectionArgs.filteringKmerSize);

for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).collect(Collectors.toList())) {
for (final Event event : goodPileupEvents.stream().sorted(Comparator.comparingInt(Event::getStart)).toList()) {

if (argumentCollection.pileupDetectionArgs.debugPileupStdout) System.out.println("Processing new Haplotypes for Pileup Allele that was not in the assembly: " + event);

Expand All @@ -705,7 +709,7 @@ public void injectPileupEvents(final AssemblyRegion region, final AssemblyBasedC
continue;
}

final Set<Haplotype> newPileupHaplotypes = new HashSet<>();
final Set<Haplotype> newPileupHaplotypes = new LinkedHashSet<>();
for (final Haplotype baseHaplotype : baseHaplotypes) {
final Haplotype insertedHaplotype = makeHaplotypeWithInsertedEvent(baseHaplotype, refHaplotype, event, aligner, argumentCollection.getHaplotypeToReferenceSWParameters());
if (insertedHaplotype != null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1309,28 +1309,37 @@ public Object[][] filterPileupHaplotypesDataProvider() {
put(new Kmer("GAA"), 1);
}};

final List<Haplotype> bigHaplotypeList = Arrays.asList(hapA,hapB,hapB,hapB,hapB,hapB,hapC,hapD,hapF,hapF,hapF,hapF,hapF,hapF);

// NOTE: we limit the number of haplotypes that are returned by the `numPileupHaplotypes` parameter.

Object[][] tests = new Object[][] {
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,5,3,Arrays.asList(hapA,hapB,hapC,hapD)}, //returns all when no filtering required

// These haplotypes are all equivalent, these test stability of the filtering
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,3,3,Arrays.asList(hapA,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),flatSupportAllKmers,4,3,Arrays.asList(hapA,hapB,hapC,hapD)},

// Repetitive kmers in hapF don't get double counted
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3,Arrays.asList(hapF, hapD)}, //currently repeated kmers only count as singular evidence
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,2,3,Arrays.asList(hapF, hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapD,hapF),hapFRepeatedKmers,1,3, List.of(hapD) }, //currently repeated kmers only count as singular evidence

// These tests demonstrate that the weights in the map don't matter
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapB,hapC,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapB,hapC,hapD)}, // Despite hapD having good support it is not weighted higher
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,2,3,Arrays.asList(hapA,hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD),hapDKmersHighSupport,3,3,Arrays.asList(hapA,hapC,hapD)}, // Despite hapD having good support it is not weighted higher

// Test of the output when only one hap has support
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3,Arrays.asList(hapD)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA, hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,1,3, List.of(hapD) },
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,2,3,Arrays.asList(hapD,hapA)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,3,3,Arrays.asList(hapD,hapA,hapC)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB,hapF)},
new Object[]{Arrays.asList(hapA,hapB,hapC,hapD,hapF),hapDKmers,4,3,Arrays.asList(hapD,hapA,hapC,hapB)},

// Test of when there are a LOT of haplotypes and we want to limit the number returned:
new Object[]{bigHaplotypeList, hapDKmers, 1, 3, List.of(hapD)},
new Object[]{bigHaplotypeList, hapDKmers, 2, 3, List.of(hapA, hapD)},
};

return tests;
Expand All @@ -1345,7 +1354,8 @@ public void testFilterPileupHaplotypes(final List<Haplotype> inputHaplotypes,
final List<Haplotype> expected) {
final Map<Kmer, MutableInt> counts = kmerReadCounts.entrySet().stream()
.collect(Collectors.toMap(entry -> entry.getKey(), entry -> new MutableInt(entry.getValue())));
Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

final Set<Haplotype> actual = AssemblyBasedCallerUtils.filterPileupHaplotypes(new HashSet<>(inputHaplotypes), counts, numPileupHaplotypes, kmerSize);

Assert.assertEquals(actual, new HashSet<>(expected));
}
Expand Down
Loading

0 comments on commit c9bf941

Please sign in to comment.