Skip to content

Commit

Permalink
added Poisson tests for unique k-mers
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Aug 13, 2023
1 parent f0cbc18 commit df0e8b6
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 26 deletions.
3 changes: 2 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#
SOURCE_DIR="./src"
SOURCE_DIR_NESTED="${SOURCE_DIR}/de/mpi_cbg/revant"
LIBRARIES="./lib/commons-statistics-distribution-1.0.jar"
BUILD_DIR="./bin"
DEBUGGING_INFO="1" # 0/1

Expand All @@ -24,4 +25,4 @@ javac ${COMPILER_FLAGS} -cp ${BUILD_DIR} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/fr
javac ${COMPILER_FLAGS} -cp ${BUILD_DIR} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/consensus/*.java
javac ${COMPILER_FLAGS} -cp ${BUILD_DIR} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/graphics/*.java
javac ${COMPILER_FLAGS} -cp ${BUILD_DIR} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/biology/*.java
javac ${COMPILER_FLAGS} -cp ${BUILD_DIR} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/apps/*.java
javac ${COMPILER_FLAGS} -cp ${BUILD_DIR}:${LIBRARIES} -d ${BUILD_DIR} ${SOURCE_DIR_NESTED}/apps/*.java
11 changes: 7 additions & 4 deletions scripts/6-repeatAlphabet/3-getUniqueSubstrings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@
# This is the only section of the script that needs to be customized.
#
INPUT_DIR=$1
N_HAPLOTYPES=$2
HAPLOTYPE_COVERAGE=$3 # Of one haplotype
GENOME_LENGTH=$2
N_HAPLOTYPES=$3
MAX_K=$4 # Stops looking for unique k-mers after this length. Should be set using the
# histogram of recoded lengths.
N_THREADS=$5
Expand All @@ -24,13 +24,16 @@ DISTANCE_THRESHOLD=$8
CHARACTER_THRESHOLD=$9
UNIQUE_MODE="1" # Non-repetitive blocks are allowed in a k-mer, except at the first/last
# position of the k-mer. Usually a good choice.
SPANNING_BPS="150" # Bps before and after a k-mer to consider it observed in a read.
# ----------------------------------------------------------------------------------------

set -o pipefail; set -e; set -u
export LC_ALL=C # To speed up the $sort$ command.
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
READ_IDS_FILE="${INPUT_DIR}/reads-ids.txt"
N_READS=$(wc -l < ${READ_IDS_FILE})
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
AVG_READ_LENGTH=$(paste -sd+ ${READ_LENGTHS_FILE} | bc)
AVG_READ_LENGTH=$(( ${AVG_READ_LENGTH} / ${N_READS} ))
TMPFILE_NAME="getUniqueSubstrings-tmp"
TMPFILE_PATH="${INPUT_DIR}/${TMPFILE_NAME}"
READS_TRANSLATED_FILE="${INPUT_DIR}/reads-translated-disambiguated.txt"
Expand Down Expand Up @@ -101,7 +104,7 @@ for K in $(seq 1 ${MAX_K}); do
UNIQUE_KMERS_FILE="${INPUT_DIR}/unique-k${K}.txt"
OUTPUT_FILE_HISTOGRAM="${INPUT_DIR}/histogram-k${K}.txt"
echo "Finding unique ${K}-mers..."
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${MIN_FREQUENCY_UNIQUE} ${MAX_FREQUENCY_UNIQUE} ${UNIQUE_KMERS_FILE} ${MAX_HISTOGRAM_COUNT} 1 ${OUTPUT_FILE_HISTOGRAM}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} 1 ${ALPHABET_FILE} ${MAX_HISTOGRAM_COUNT} ${UNIQUE_KMERS_FILE} ${OUTPUT_FILE_HISTOGRAM}
echo "Updating shortest unique intervals file..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
Expand Down
3 changes: 2 additions & 1 deletion scripts/6-repeatAlphabet/master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#
INPUT_DIR=$1
# ---------------------------- Properties of the genome ----------------------------------
GENOME_LENGTH=$(wc -c < ${INPUT_DIR}/genome.fasta)
N_HAPLOTYPES="1"
HAPLOTYPE_COVERAGE="30" # Of one haplotype
# ----------------------------- Properties of the reads ----------------------------------
Expand Down Expand Up @@ -56,7 +57,7 @@ fi
PERIODIC_ENDPOINTS_FIXED=$(cat ${INPUT_DIR}/buildAlphabet-tmp-return.txt)
MIN_K_FOR_DISAMBIGUATION="2"; MAX_K_FOR_DISAMBIGUATION="4"
./2-fixEndBlocks.sh ${INPUT_DIR} ${BROKEN_READS} ${HAPLOTYPE_COVERAGE} ${LOW_QUALITY_TYPE} ${MIN_K_FOR_DISAMBIGUATION} ${MAX_K_FOR_DISAMBIGUATION} ${N_THREADS} ${DELETE_TMP_FILES}
./3-getUniqueSubstrings.sh ${INPUT_DIR} ${N_HAPLOTYPES} ${HAPLOTYPE_COVERAGE} ${MAX_K_UNIQUE} ${N_THREADS} ${DELETE_TMP_FILES} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD}
./3-getUniqueSubstrings.sh ${INPUT_DIR} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${MAX_K_UNIQUE} ${N_THREADS} ${DELETE_TMP_FILES} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD}
./4-filterAlignments.sh ${INPUT_DIR} ${BROKEN_READS} ${PERIODIC_ENDPOINTS_FIXED} ${MIN_ALIGNMENT_LENGTH_READ_READ} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${MAX_K_UNIQUE} ${ALIGNMENT_FILTERING_MODE} ${MIN_INTERSECTION_NONREPETITIVE} ${N_THREADS} ${DELETE_TMP_FILES}
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
N_READS=$(wc -l < ${READ_LENGTHS_FILE})
Expand Down
74 changes: 55 additions & 19 deletions src/de/mpi_cbg/revant/apps/CompactKmers.java
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
package de.mpi_cbg.revant.apps;

import java.io.*;
import org.apache.commons.statistics.distribution.PoissonDistribution;

import de.mpi_cbg.revant.util.Math;

/**
* Given a sorted file that contains the k-mers extracted by several threads, the program
* sums up all the counts of the same k-mer, it discards k-mers whose total count is
* outside a given range or that occur multiple times inside the same read, and it prints
* a histogram of total counts for all k-mers.
* sums up all the counts of the same k-mer, it discards k-mers whose total count fails a
* significance test at every possible ploidy of the k-mer, or that occur multiple times
* inside the same read, and it prints a histogram of total counts for all k-mers.
*/
public class CompactKmers {
public static void main(String[] args) throws IOException {
final String INPUT_FILE = args[0];
final int K = Integer.parseInt(args[1]);
final int MIN_COUNT = Integer.parseInt(args[2]);
final int MAX_COUNT = Integer.parseInt(args[3]); // -1 to impose no max
final String OUTPUT_FILE_KMERS = args[4];
final int MAX_HISTOGRAM_COUNT = Integer.parseInt(args[5]);
final boolean DISCARD_SAME_READ_KMERS = Integer.parseInt(args[6])==1;
final String OUTPUT_FILE_HISTOGRAM = args[7].equalsIgnoreCase("null")?null:args[7];
final long GENOME_LENGTH = Long.parseLong(args[2]); // Of one haplotype
final int N_HAPLOTYPES = Integer.parseInt(args[3]);
final int N_READS = Integer.parseInt(args[4]);
final int AVG_READ_LENGTH = Integer.parseInt(args[5]);
final int SPANNING_BPS = Integer.parseInt(args[6]);
final boolean DISCARD_SAME_READ_KMERS = Integer.parseInt(args[7])==1;
final String ALPHABET_FILE = args[8];
final int MAX_HISTOGRAM_COUNT = Integer.parseInt(args[9]);
final String OUTPUT_FILE_KMERS = args[10];
final String OUTPUT_FILE_HISTOGRAM = args[11].equalsIgnoreCase("null")?null:args[11];

final double SIGNIFICANCE_LEVEL = 0.05; // Conventional

boolean equal;
int i;
int sameReadCount, previousSameReadCount;
long count, previousCount;
int count, previousCount, sameReadCount, previousSameReadCount;
String str;
BufferedReader br;
BufferedWriter bw;
int[] previous, current, tmpArray;
long[] histogram;
String[] tokens;


RepeatAlphabet.deserializeAlphabet(ALPHABET_FILE,2);
br = new BufferedReader(new FileReader(INPUT_FILE));
str=br.readLine();
if (str==null) {
Expand All @@ -49,7 +56,7 @@ public static void main(String[] args) throws IOException {
previous = new int[K];
tokens=str.split(",");
for (i=0; i<K; i++) previous[i]=Integer.parseInt(tokens[i]);
previousCount=Long.parseLong(tokens[K]);
previousCount=Integer.parseInt(tokens[K]);
previousSameReadCount=Integer.parseInt(tokens[K+1]);
current = new int[K];
str=br.readLine();
Expand All @@ -60,35 +67,64 @@ public static void main(String[] args) throws IOException {
current[i]=Integer.parseInt(tokens[i]);
if (current[i]!=previous[i]) equal=false;
}
count=Long.parseLong(tokens[K]);
count=Integer.parseInt(tokens[K]);
sameReadCount=Integer.parseInt(tokens[K+1]);
if (equal) {
previousCount+=count;
previousSameReadCount=Math.max(previousSameReadCount,sameReadCount);
}
else {
if (previousCount>=MIN_COUNT && (MAX_COUNT==-1||previousCount<=MAX_COUNT) && (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true)) {
if ((DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) && isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
}
if (OUTPUT_FILE_HISTOGRAM!=null) histogram[previousCount>MAX_HISTOGRAM_COUNT?MAX_HISTOGRAM_COUNT:(int)previousCount]++;
if (OUTPUT_FILE_HISTOGRAM!=null) histogram[previousCount>MAX_HISTOGRAM_COUNT?MAX_HISTOGRAM_COUNT:previousCount]++;
tmpArray=previous; previous=current; current=tmpArray;
previousCount=count; previousSameReadCount=sameReadCount;
}
str=br.readLine();
}
br.close();
if (previousCount>=MIN_COUNT && (MAX_COUNT==-1||previousCount<=MAX_COUNT) && (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true)) {
if ((DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) && isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
}
bw.close();
if (OUTPUT_FILE_HISTOGRAM!=null) {
histogram[previousCount>MAX_HISTOGRAM_COUNT?MAX_HISTOGRAM_COUNT:(int)previousCount]++;
histogram[previousCount>MAX_HISTOGRAM_COUNT?MAX_HISTOGRAM_COUNT:previousCount]++;
bw = new BufferedWriter(new FileWriter(OUTPUT_FILE_HISTOGRAM));
for (i=0; i<=MAX_HISTOGRAM_COUNT; i++) bw.write(i+","+histogram[i]+"\n");
bw.close();
}
}


/**
* @param nReads in the entire dataset;
* @param spanningBps basepairs before and after $kmer$ for it to be considered
* observed in a read;
* @param genomeLength of one haplotype;
* @return -1 if the p-value of $count$ fails the two-sided significance test for
* every ploidy of $kmer$ (assuming a Poisson distribution for every possible ploidy);
* this happens if $kmer$ is noise or a repeat; otherwise, the smallest ploidy of
* $kmer$ for which the significance test does not fail.
*/
private static final int isUnique(int count, int[] kmer, int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, double significanceLevel) {
int i, length;
double p;
PoissonDistribution distribution;

length=0;
for (i=0; i<k; i++) RepeatAlphabet.alphabet[kmer[i]].getLength();
final double base = ((double)(avgReadLength-((spanningBps)<<1)-length))/genomeLength;
final int quantum = nReads/nHaplotypes;
for (i=0; i<nHaplotypes; i++) {
distribution=PoissonDistribution.of(base*quantum*(i+1));
p=distribution.cumulativeProbability(count);
p=2.0*Math.min(p,1.0-p);
if (p>significanceLevel) return i+1;
}
return -1;
}

}
2 changes: 1 addition & 1 deletion src/de/mpi_cbg/revant/util/Math.java
Original file line number Diff line number Diff line change
Expand Up @@ -947,5 +947,5 @@ public static final void reverse(int[] array, int first, int last) {
a++; b--;
}
}

}

0 comments on commit df0e8b6

Please sign in to comment.