Commit

fixes
fcunial committed Aug 26, 2023
1 parent 1cbd43a commit 3539336
Showing 4 changed files with 35 additions and 19 deletions.
2 changes: 1 addition & 1 deletion src/de/mpi_cbg/revant/apps/CollectKmers.java
@@ -64,7 +64,7 @@ public static void main(String[] args) throws IOException {
else lastAvoidedInterval=-1;
RepeatAlphabet.loadBoundaries(str3);
readLength=Integer.parseInt(str4);
RepeatAlphabet.getKmers(str1,K,kmers,null,avoidedIntervals,lastAvoidedInterval,readLength,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,RepeatAlphabet.boundaries,-1/*argument not used*/,-1/*argument not used*/,-1.0/*argument not used*/,tmpKmer,tmpArray2,tmpArray3,tmpMap,tmpChar);
RepeatAlphabet.getKmers(str1,K,kmers,null,avoidedIntervals,lastAvoidedInterval,readLength,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,RepeatAlphabet.boundaries,-1/*argument not used*/,-1/*argument not used*/,-1.0/*argument not used*/,tmpKmer,tmpArray2,tmpArray3,tmpMap,tmpChar);
str1=br1.readLine(); str2=INTERVALS_FILE_EXISTS?br2.readLine():null;
str3=br3.readLine(); str4=br4.readLine(); row++;
}
6 changes: 4 additions & 2 deletions src/de/mpi_cbg/revant/apps/CompactKmers.java
@@ -3,6 +3,7 @@
import java.io.*;

import de.mpi_cbg.revant.util.Math;
import de.mpi_cbg.revant.util.IO;

/**
* Given a sorted file that contains the k-mers extracted by several threads, the program
@@ -32,6 +33,7 @@ public static void main(String[] args) throws IOException {
final String OUTPUT_FILE_HISTOGRAM = args[13].equalsIgnoreCase("null")?null:args[13];

final double SIGNIFICANCE_LEVEL = 0.05; // Conventional
final int MIN_MISSING_LENGTH = IO.quantum; // Arbitrary
final String SEPARATOR = ",";

boolean equal;
@@ -85,7 +87,7 @@ public static void main(String[] args) throws IOException {
else {
kmer.set(previous,K,previousCount,previousCountPartial,previousSameReadCount);
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,SIGNIFICANCE_LEVEL)!=-1))
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,MIN_MISSING_LENGTH,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+SEPARATOR);
bw.write(previousCount+SEPARATOR+previousCountPartial+SEPARATOR+previousSameReadCount+"\n");
@@ -99,7 +101,7 @@ public static void main(String[] args) throws IOException {
br.close();
kmer.set(previous,K,previousCount,previousCountPartial,previousSameReadCount);
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,SIGNIFICANCE_LEVEL)!=-1))
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,MIN_MISSING_LENGTH,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+SEPARATOR);
bw.write(previousCount+SEPARATOR+previousCountPartial+SEPARATOR+previousSameReadCount+"\n");
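
A note on this file: CompactKmers merges the sorted per-thread k-mer files by accumulating the counts of identical consecutive rows and keeps a merged k-mer only if it passes the significance filter; the functional change in this commit is simply that the new MIN_MISSING_LENGTH (= IO.quantum) is forwarded to Kmer.isUnique(). The compaction idea is sketched below in a minimal, self-contained form; the row format and class name are hypothetical, and the real program additionally tracks partial and same-read counts and applies isFrequent()/isUnique() before writing.

import java.io.*;
import java.util.Arrays;

// Hypothetical sketch: compact a sorted k-mer file whose rows are "c1,...,cK,count"
// by summing the counts of identical consecutive k-mers.
public class CompactSketch {
    public static void compact(BufferedReader br, BufferedWriter bw, int k) throws IOException {
        int[] previous = null;
        long count = 0;
        String str = br.readLine();
        while (str != null) {
            String[] tokens = str.split(",");
            int[] current = new int[k];
            for (int i = 0; i < k; i++) current[i] = Integer.parseInt(tokens[i]);
            long c = Long.parseLong(tokens[k]);
            if (previous != null && Arrays.equals(previous, current)) count += c;  // same k-mer: accumulate
            else {
                if (previous != null) write(bw, previous, count);  // flush the previous k-mer
                previous = current;
                count = c;
            }
            str = br.readLine();
        }
        if (previous != null) write(bw, previous, count);
    }

    private static void write(BufferedWriter bw, int[] kmer, long count) throws IOException {
        for (int c : kmer) bw.write(c + ",");
        bw.write(count + "\n");
    }

    public static void main(String[] args) throws IOException {
        String sorted = "5,9,1\n5,9,2\n7,3,1\n";  // k=2; two threads saw k-mer (5,9)
        BufferedReader br = new BufferedReader(new StringReader(sorted));
        BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(System.out));
        compact(br, bw, 2);
        bw.flush();  // prints "5,9,3" and "7,3,1"
    }
}
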
5 changes: 4 additions & 1 deletion src/de/mpi_cbg/revant/apps/GetShortestUniqueIntervals.java
@@ -4,6 +4,8 @@
import java.util.Arrays;
import java.util.HashMap;

import de.mpi_cbg.revant.util.IO;

/**
* Let a shortest unique intervals file contain, for every translated read, the list of
* all intervals such that: (1) every interval is the occurrence of a unique h-mer for all
@@ -36,6 +38,7 @@ public static void main(String[] args) throws IOException {
final String NEW_INTERVALS_FILE = args[15]; // Output

boolean OLD_INTERVALS_FILE_EXISTS = !OLD_INTERVALS_FILE.equalsIgnoreCase("null");
final int MIN_MISSING_LENGTH = IO.quantum; // Arbitrary

int i, p;
int row, nBlocks, nPairs, lastUniqueInterval, readLength;
@@ -95,7 +98,7 @@ public static void main(String[] args) throws IOException {
else lastUniqueInterval=-1;
RepeatAlphabet.loadBoundaries(str3);
readLength=Integer.parseInt(str4);
lastUniqueInterval=RepeatAlphabet.getKmers(str1,K,null,kmers,uniqueIntervals,lastUniqueInterval,readLength,N_READS,AVG_READ_LENGTH,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,RepeatAlphabet.boundaries,IDENTITY_THRESHOLD,DISTANCE_THRESHOLD,CHARACTER_FRACTION,tmpKmer,tmpArray2,tmpArray3,null,tmpChar);
lastUniqueInterval=RepeatAlphabet.getKmers(str1,K,null,kmers,uniqueIntervals,lastUniqueInterval,readLength,N_READS,AVG_READ_LENGTH,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,MIN_MISSING_LENGTH,RepeatAlphabet.boundaries,IDENTITY_THRESHOLD,DISTANCE_THRESHOLD,CHARACTER_FRACTION,tmpKmer,tmpArray2,tmpArray3,null,tmpChar);
if (lastUniqueInterval>0) {
nPairs=(lastUniqueInterval+1)/3;
if (pairs.length<nPairs) {
41 changes: 26 additions & 15 deletions src/de/mpi_cbg/revant/apps/RepeatAlphabet.java
@@ -2064,12 +2064,14 @@ else if (c<=lastUnique_old) {
* @param identityThreshold,distanceThreshold used only when $newKmers$ is null; see
* $isCharacterAmbiguousInBlock()$;
* @param minAlignmentLength read-repeat;
* @param minMissingLength min length that must be missing from a character for it to
* be considered a partial occurrence;
* @param tmpKmer temporary space;
* @param tmpArray2 temporary space, of size at least k;
* @param tmpArray3 temporary space, of size at least 2k;
* @param tmpMap temporary hashmap, used only if $newKmers$ is not null.
*/
public static final int getKmers(String str, int k, HashMap<Kmer,Kmer> newKmers, HashMap<Kmer,Kmer> oldKmers, int[] avoidedIntervals, int lastAvoidedInterval, int readLength, int nReads, int avgReadLength, long genomeLength, int nHaplotypes, int minAlignmentLength, int[] boundaries, int identityThreshold, int distanceThreshold, double characterFraction, Kmer tmpKmer, int[] tmpArray2, int[] tmpArray3, HashMap<Kmer,Kmer> tmpMap, Character tmpChar) {
public static final int getKmers(String str, int k, HashMap<Kmer,Kmer> newKmers, HashMap<Kmer,Kmer> oldKmers, int[] avoidedIntervals, int lastAvoidedInterval, int readLength, int nReads, int avgReadLength, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, int[] boundaries, int identityThreshold, int distanceThreshold, double characterFraction, Kmer tmpKmer, int[] tmpArray2, int[] tmpArray3, HashMap<Kmer,Kmer> tmpMap, Character tmpChar) {
final int UNIQUE_MODE = 1;
final int MAX_KMERS_TO_ENUMERATE = 500000; // Arbitrary, just for speedup.
int i, j, p;
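
The two calling modes documented above correspond to the two call sites touched by this commit: CollectKmers counts k-mers (newKmers != null) and passes -1 placeholders for every statistic that counting does not need, now including the new minMissingLength slot, while GetShortestUniqueIntervals marks unique k-mers against oldKmers and supplies the real read and genome statistics together with MIN_MISSING_LENGTH. Schematically (call shapes copied from those two files; the surrounding setup is omitted, so this is not a runnable fragment on its own):

// Counting mode (CollectKmers): statistics unused, -1 placeholders, new slot included.
RepeatAlphabet.getKmers(str1, K, kmers, null, avoidedIntervals, lastAvoidedInterval, readLength,
    -1, -1, -1, -1, -1, -1 /*minMissingLength, not used*/,
    RepeatAlphabet.boundaries, -1, -1, -1.0, tmpKmer, tmpArray2, tmpArray3, tmpMap, tmpChar);

// Marking mode (GetShortestUniqueIntervals): real statistics, including MIN_MISSING_LENGTH.
lastUniqueInterval = RepeatAlphabet.getKmers(str1, K, null, kmers, uniqueIntervals, lastUniqueInterval, readLength,
    N_READS, AVG_READ_LENGTH, GENOME_LENGTH, N_HAPLOTYPES, MIN_ALIGNMENT_LENGTH, MIN_MISSING_LENGTH,
    RepeatAlphabet.boundaries, IDENTITY_THRESHOLD, DISTANCE_THRESHOLD, CHARACTER_FRACTION,
    tmpKmer, tmpArray2, tmpArray3, null, tmpChar);
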
@@ -2103,7 +2105,7 @@ public static final int getKmers(String str, int k, HashMap<Kmer,Kmer> newKmers,
nKmers=lastInBlock_int[i]+1;
for (p=i+1; p<=i+k-1; p++) nKmers*=lastInBlock_int[p]+1;
if (nKmers<0 || nKmers>MAX_KMERS_TO_ENUMERATE) continue;
nHaplotypesPrime=getKmers_impl(i,k,nBlocks,null,oldKmers,nReads,avgReadLength,genomeLength,nHaplotypes,minAlignmentLength,tmpKmer,stack,tmpArray2,tmpArray3);
nHaplotypesPrime=getKmers_impl(i,k,nBlocks,readLength,null,oldKmers,nReads,avgReadLength,genomeLength,nHaplotypes,minAlignmentLength,minMissingLength,tmpKmer,stack,tmpArray2,tmpArray3);
if (nHaplotypesPrime==-1) continue;
if ( (i!=0 && i!=nBlocks-k) ||
(i==0 && i!=nBlocks-k && !isCharacterAmbiguousInBlock(tmpArray2[0],intBlocks[0],lastInBlock_int[0],true,boundaries,nBlocks,readLength,identityThreshold,distanceThreshold,characterFraction)) ||
@@ -2127,7 +2129,7 @@ public static final int getKmers(String str, int k, HashMap<Kmer,Kmer> newKmers,
nKmers=lastInBlock_int[i]+1;
for (p=i+1; p<=i+k-1; p++) nKmers*=lastInBlock_int[p]+1;
if (nKmers<0 || nKmers>MAX_KMERS_TO_ENUMERATE) continue;
getKmers_impl(i,k,nBlocks,tmpMap,oldKmers,-1,-1,-1,-1,-1,tmpKmer,stack,tmpArray2,tmpArray3);
getKmers_impl(i,k,nBlocks,readLength,tmpMap,oldKmers,-1,-1,-1,-1,-1,-1,tmpKmer,stack,tmpArray2,tmpArray3);
}
iterator=tmpMap.keySet().iterator();
while (iterator.hasNext()) {
@@ -2326,6 +2328,8 @@ private static final Kmer kmerPool_allocate(int k) {
* be already initialized.
*
* @param minAlignmentLength read-repeat;
* @param minMissingLength min length that must be missing from a character for it to
* be considered a partial occurrence;
* @param key temporary space;
* @param tmpArray1 temporary space, of size at least equal to 3 times the number of
* elements in $intBlocks$ plus one;
@@ -2334,37 +2338,39 @@ private static final Kmer kmerPool_allocate(int k) {
* canonization, was found in $oldKmers$;
* @param tmpArray3 temporary space, of size at least 2k.
*/
private static final int getKmers_impl(int first, int k, int nBlocks, HashMap<Kmer,Kmer> newKmers, HashMap<Kmer,Kmer> oldKmers, int nReads, int avgReadLength, long genomeLength, int nHaplotypes, int minAlignmentLength, Kmer key, int[] tmpArray1, int[] tmpArray2, int[] tmpArray3) {
private static final int getKmers_impl(int first, int k, int nBlocks, int readLength, HashMap<Kmer,Kmer> newKmers, HashMap<Kmer,Kmer> oldKmers, int nReads, int avgReadLength, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, Kmer key, int[] tmpArray1, int[] tmpArray2, int[] tmpArray3) {
final int SPANNING_BPS = (IO.quantum*3)>>1; // Arbitrary
final double SIGNIFICANCE_LEVEL = 0.05; // Conventional
final int MIN_MISSING_LENGTH = IO.quantum; // Arbitrary
int i;
int top1, top2, row, column, lastChild, length;
int top1, top2, row, column, lastChild, length, firstLength, lastLength;
Kmer value, newKey;

top1=-1; top2=-1;
tmpArray1[++top1]=-1; tmpArray1[++top1]=-1; tmpArray1[++top1]=first-1;
while (top1>=0) {
row=tmpArray1[top1]; column=tmpArray1[top1-1]; lastChild=tmpArray1[top1-2];
if (row==first+k-1) {
key.set(tmpArray2,0,k,true);
key.canonize(k,tmpArray3);
key.set(tmpArray2,0,k,true);
firstLength=alphabet[tmpArray2[0]].getLength(); lastLength=alphabet[tmpArray2[k-1]].getLength();
key.canonize(k,tmpArray3);
if (newKmers!=null) { // Counting k-mers
value=newKmers.get(key);
if (value==null) {
newKey=kmerPool_allocate(k);
newKey.set(key.sequence,0,k,true);
if (first==0 || first+k-1==nBlocks-1) { newKey.count=0; newKey.countPartial=1; }
if ((first==0 && boundaries[0]<firstLength-MIN_MISSING_LENGTH) || (first+k-1==nBlocks-1 && readLength-(nBlocks>1?boundaries[nBlocks-2]:0)<lastLength-MIN_MISSING_LENGTH)) { newKey.count=0; newKey.countPartial=1; }
else { newKey.count=1; newKey.countPartial=0; }
newKmers.put(newKey,newKey);
}
else {
if (first==0 || first+k-1==nBlocks-1) value.countPartial++;
if ((first==0 && boundaries[0]<firstLength-MIN_MISSING_LENGTH) || (first+k-1==nBlocks-1 && readLength-(nBlocks>1?boundaries[nBlocks-2]:0)<lastLength-MIN_MISSING_LENGTH)) value.countPartial++;
else value.count++;
}
}
else { // Marking k-mers
value=oldKmers.get(key);
if (value!=null) return value.isUnique(k,nReads,avgReadLength,SPANNING_BPS,genomeLength,nHaplotypes,minAlignmentLength,SIGNIFICANCE_LEVEL);
if (value!=null) return value.isUnique(k,nReads,avgReadLength,SPANNING_BPS,genomeLength,nHaplotypes,minAlignmentLength,minMissingLength,SIGNIFICANCE_LEVEL);
}
}
if (row==first+k-1 || lastChild==lastInBlock_int[row+1]) { top1-=3; top2--; }
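
The hunk above contains the substantive change of this commit: previously, every k-mer that touched the first or last block of a read was counted as a partial occurrence, whereas now it is counted as partial only if at least MIN_MISSING_LENGTH basepairs of the boundary character are actually missing from the read. A self-contained restatement of that predicate follows; the helper class and the numbers in main() are hypothetical, and character lengths are passed in directly instead of being looked up in alphabet.

// Decides whether a k-mer occurrence touching the first or last block of a read is a
// partial occurrence (>= minMissingLength bp of the boundary character are missing).
// Mirrors the condition used in getKmers_impl() above.
public class PartialOccurrence {
    public static boolean isPartial(int first, int k, int nBlocks, int readLength, int[] boundaries,
                                    int firstCharLength, int lastCharLength, int minMissingLength) {
        final boolean truncatedLeft =
            first == 0 && boundaries[0] < firstCharLength - minMissingLength;
        final boolean truncatedRight =
            first + k - 1 == nBlocks - 1 &&
            readLength - (nBlocks > 1 ? boundaries[nBlocks - 2] : 0) < lastCharLength - minMissingLength;
        return truncatedLeft || truncatedRight;
    }

    public static void main(String[] args) {
        int[] boundaries = {300, 1200};  // hypothetical boundaries of a 2000 bp read with 3 blocks
        // First block is 300 bp but the first character spans 900 bp: >= 100 bp missing -> partial.
        System.out.println(isPartial(0, 2, 3, 2000, boundaries, 900, 500, 100));  // true
        // First character is only 350 bp: at most 50 bp can be missing -> full occurrence.
        System.out.println(isPartial(0, 2, 3, 2000, boundaries, 350, 500, 100));  // false
    }
}
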
@@ -3375,32 +3381,37 @@ else if (tmpArray[i]>tmpArray[k+i]) {
* fully observed in a read;
* @param genomeLength of one haplotype;
* @param minAlignmentLength read-repeat;
* @param minMissingLength min length that must be missing from a character for it
* to be considered a partial occurrence;
* @return -1 if the p-value of $count$ fails the significance test for every
* ploidy of the k-mer (assuming a Poisson distribution for every possible
* ploidy); this happens if the k-mer is noise or a repeat; otherwise, the
* smallest ploidy of the k-mer for which the significance test does not fail.
*/
public final int isUnique(int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, int minAlignmentLength, double significanceLevel) {
public final int isUnique(int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, double significanceLevel) {
int i, length;
double p;
PoissonDistribution distribution;

length=0;
for (i=0; i<k; i++) length+=alphabet[sequence[i]].getLength();
final double surface = avgReadLength-length-(spanningBps<<1);
if (surface<0) return -1;
if (surface<=0) return -1;
final double baseFull = surface/genomeLength;
final double surfaceL = avgReadLength-spanningBps-(length-alphabet[sequence[0]].getLength()+minAlignmentLength);
final double surfaceR = avgReadLength-spanningBps-(length-alphabet[sequence[k-1]].getLength()+minAlignmentLength);
final double basePartial = (surfaceL+surfaceR)/genomeLength;
final double surfaceL = Math.max(alphabet[sequence[0]].getLength()-minAlignmentLength-minMissingLength,0);
final double surfaceR = Math.max(alphabet[sequence[k-1]].getLength()-minAlignmentLength-minMissingLength,0);
final double basePartial = (surfaceL+surfaceR==0?2:surfaceL+surfaceR)/genomeLength;
final int quantum = nReads/nHaplotypes;
boolean fabio = k==5 && (sequence[0]>=869 && sequence[0]<=877) && (sequence[1]>=1951 && sequence[1]<=1982) && (sequence[2]>=1951 && sequence[2]<=1982) && (sequence[3]>=1951 && sequence[3]<=1982) && (sequence[4]>=1951 && sequence[4]<=1982) && count==6 && countPartial==6;
for (i=0; i<nHaplotypes; i++) {
distribution=PoissonDistribution.of(baseFull*quantum*(i+1));
p=distribution.cumulativeProbability(count);
p=2.0*Math.min(p,1.0-p);
if (fabio) System.err.println("isUnique> 1 p="+p+" of="+(baseFull*quantum*(i+1))+" significanceLevel="+significanceLevel+" count="+count+" countPartial="+countPartial+" avgReadLength="+avgReadLength+" length="+length+" surface="+surface+" genomeLength="+genomeLength+" nReads="+nReads+" nHaplotypes="+nHaplotypes);
if (p<=significanceLevel) continue;
distribution=PoissonDistribution.of(basePartial*quantum*(i+1));
p=distribution.survivalProbability(countPartial);
if (fabio) System.err.println("isUnique> 2 p="+p+" of="+(basePartial*quantum*(i+1))+" significanceLevel="+significanceLevel+" count="+count+" countPartial="+countPartial);
if (p>significanceLevel) return i+1; // Excluding only repeats
}
return -1;
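
For readers who want to experiment with the new partial-occurrence model, the test can be restated outside the class as follows. This is a hedged sketch, not the project's API: character lengths are passed in directly instead of being read from RepeatAlphabet.alphabet, the numbers in main() are made up, and it assumes the PoissonDistribution used above is org.apache.commons.statistics.distribution.PoissonDistribution, whose of(), cumulativeProbability() and survivalProbability() calls match the usage in the hunk.

import org.apache.commons.statistics.distribution.PoissonDistribution;

// Sketch of the uniqueness test above: returns -1 if the observed counts are
// inconsistent with every ploidy (noise or repeat), otherwise the smallest ploidy
// that the two Poisson tests do not reject.
public class IsUniqueSketch {
    public static int isUnique(int[] charLengths, int count, int countPartial,
                               int nReads, int avgReadLength, int spanningBps, long genomeLength,
                               int nHaplotypes, int minAlignmentLength, int minMissingLength,
                               double significanceLevel) {
        int length = 0;
        for (int l : charLengths) length += l;
        final double surface = avgReadLength - length - (spanningBps << 1);
        if (surface <= 0) return -1;
        final double baseFull = surface / genomeLength;
        // New in this commit: the surface available to partial occurrences depends on how much
        // of the first/last character may be missing, not on the read length.
        final double surfaceL = Math.max(charLengths[0] - minAlignmentLength - minMissingLength, 0);
        final double surfaceR = Math.max(charLengths[charLengths.length - 1] - minAlignmentLength - minMissingLength, 0);
        final double basePartial = (surfaceL + surfaceR == 0 ? 2 : surfaceL + surfaceR) / genomeLength;
        final int quantum = nReads / nHaplotypes;
        for (int i = 0; i < nHaplotypes; i++) {
            PoissonDistribution full = PoissonDistribution.of(baseFull * quantum * (i + 1));
            double p = full.cumulativeProbability(count);
            p = 2.0 * Math.min(p, 1.0 - p);        // two-sided test on the full count
            if (p <= significanceLevel) continue;  // full count rejects ploidy i+1
            PoissonDistribution partial = PoissonDistribution.of(basePartial * quantum * (i + 1));
            if (partial.survivalProbability(countPartial) > significanceLevel) return i + 1;  // not a repeat
        }
        return -1;
    }

    public static void main(String[] args) {
        // Hypothetical 3-mer of 2000/800/2000 bp characters in 100k reads of ~15 kbp
        // over a 100 Mbp diploid genome; prints the accepted ploidy or -1.
        int[] charLengths = {2000, 800, 2000};
        System.out.println(isUnique(charLengths, 14, 1, 100000, 15000, 225, 100000000L, 2, 500, 150, 0.05));
    }
}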
