Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Aug 17, 2023
1 parent b2fe7a6 commit d13475e
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 11 deletions.
10 changes: 9 additions & 1 deletion scripts/6-repeatAlphabet/2-fixEndBlocks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ MIN_K=$5 # One plus the min length of a context used for disambiguation
MAX_K=$6 # One plus the max length of a context used for disambiguation
N_THREADS=$7
DELETE_TMP_FILES=$8
GENOME_LENGTH=$9
N_HAPLOTYPES=${10}
SPANNING_BPS="150" # Bps before and after a k-mer to consider it observed in a read.
# ------------------------------------ REVANT --------------------------------------------
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib/*.jar"
# ----------------------------------------------------------------------------------------

set -o pipefail; set -e; set -u
Expand All @@ -32,6 +37,8 @@ TMPFILE_PATH="${INPUT_DIR}/${TMPFILE_NAME}"
READ_IDS_FILE="${INPUT_DIR}/reads-ids.txt"
N_READS=$(wc -l < ${READ_IDS_FILE})
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
AVG_READ_LENGTH=$(paste -sd+ ${READ_LENGTHS_FILE} | bc)
AVG_READ_LENGTH=$(( ${AVG_READ_LENGTH} / ${N_READS} ))
READS_TRANSLATED_FILE="${INPUT_DIR}/reads-translated-new.txt"
READS_BOUNDARIES_FILE="${INPUT_DIR}/reads-translated-boundaries-new.txt"
READS_DISAMBIGUATED_FILE="${INPUT_DIR}/reads-translated-disambiguated.txt"
Expand Down Expand Up @@ -86,8 +93,9 @@ for K in $(seq ${MIN_K} ${MAX_K}); do
break
fi
FREQUENT_KMERS_FILE="${INPUT_DIR}/frequent-k${K}.txt"
OUTPUT_FILE_HISTOGRAM="${INPUT_DIR}/histogram-k${K}.txt"
echo "Finding frequent ${K}-mers..."
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${MIN_FREQUENCY_UNIQUE} -1 ${FREQUENT_KMERS_FILE} 0 0 null
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}:${REVANT_LIBRARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} 0 ${ALPHABET_FILE} 1 10000 ${FREQUENT_KMERS_FILE} ${OUTPUT_FILE_HISTOGRAM}
echo "Computing $((${K}-1))-mers..."
K_MINUS_ONE_MERS_FILE="${INPUT_DIR}/kMinusOne-k${K}.txt"
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.GetKMinusOneMers ${ALPHABET_FILE} ${FREQUENT_KMERS_FILE} ${K} ${K_MINUS_ONE_MERS_FILE}
Expand Down
7 changes: 4 additions & 3 deletions scripts/6-repeatAlphabet/3-getUniqueSubstrings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ CHARACTER_THRESHOLD=$9
UNIQUE_MODE="1" # Non-repetitive blocks are allowed in a k-mer, except at the first/last
# position of the k-mer. Usually a good choice.
SPANNING_BPS="150" # Bps before and after a k-mer to consider it observed in a read.
# ------------------------------------ REVANT --------------------------------------------
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib/*.jar"
# ----------------------------------------------------------------------------------------


set -o pipefail; set -e; set -u
export LC_ALL=C # To speed up the $sort$ command.
READ_IDS_FILE="${INPUT_DIR}/reads-ids.txt"
Expand All @@ -42,8 +45,6 @@ ALPHABET_FILE="${INPUT_DIR}/alphabet-cleaned.txt"
TANDEMS_FILE="${INPUT_DIR}/tandems.txt"
REPEAT_LENGTHS_FILE="${INPUT_DIR}/repeats-lengths.txt"
N_REPEATS=$(wc -l < ${REPEAT_LENGTHS_FILE})
MIN_FREQUENCY_UNIQUE=$(( ${HAPLOTYPE_COVERAGE} / 2 ))
MAX_FREQUENCY_UNIQUE=$(( ${HAPLOTYPE_COVERAGE}*${N_HAPLOTYPES} + ${HAPLOTYPE_COVERAGE}/2 ))
MAX_HISTOGRAM_COUNT="10000" # Arbitrary
rm -f ${TMPFILE_PATH}*
rm -f ${INPUT_DIR}/unique-*
Expand Down Expand Up @@ -104,7 +105,7 @@ for K in $(seq 1 ${MAX_K}); do
UNIQUE_KMERS_FILE="${INPUT_DIR}/unique-k${K}.txt"
OUTPUT_FILE_HISTOGRAM="${INPUT_DIR}/histogram-k${K}.txt"
echo "Finding unique ${K}-mers..."
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} 1 ${ALPHABET_FILE} ${MAX_HISTOGRAM_COUNT} ${UNIQUE_KMERS_FILE} ${OUTPUT_FILE_HISTOGRAM}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}:${REVANT_LIBRARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} 1 ${ALPHABET_FILE} 0 ${MAX_HISTOGRAM_COUNT} ${UNIQUE_KMERS_FILE} ${OUTPUT_FILE_HISTOGRAM}
echo "Updating shortest unique intervals file..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
Expand Down
2 changes: 1 addition & 1 deletion scripts/6-repeatAlphabet/master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fi
./1-buildAlphabet.sh ${INPUT_DIR} ${BROKEN_READS} ${MAX_ALIGNMENT_ERROR} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${N_HAPLOTYPES} ${HAPLOTYPE_COVERAGE} ${N_THREADS} ${DELETE_TMP_FILES} ${MAX_SPACER_LENGTH} ${WOBBLE_LENGTH} ${FIX_TANDEM_SPACERS} ${CONCATENATE_BLOCKS}
PERIODIC_ENDPOINTS_FIXED=$(cat ${INPUT_DIR}/buildAlphabet-tmp-return.txt)
MIN_K_FOR_DISAMBIGUATION="2"; MAX_K_FOR_DISAMBIGUATION="4"
./2-fixEndBlocks.sh ${INPUT_DIR} ${BROKEN_READS} ${HAPLOTYPE_COVERAGE} ${LOW_QUALITY_TYPE} ${MIN_K_FOR_DISAMBIGUATION} ${MAX_K_FOR_DISAMBIGUATION} ${N_THREADS} ${DELETE_TMP_FILES}
./2-fixEndBlocks.sh ${INPUT_DIR} ${BROKEN_READS} ${HAPLOTYPE_COVERAGE} ${LOW_QUALITY_TYPE} ${MIN_K_FOR_DISAMBIGUATION} ${MAX_K_FOR_DISAMBIGUATION} ${N_THREADS} ${DELETE_TMP_FILES} ${GENOME_LENGTH} ${N_HAPLOTYPES}
./3-getUniqueSubstrings.sh ${INPUT_DIR} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${MAX_K_UNIQUE} ${N_THREADS} ${DELETE_TMP_FILES} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD}
./4-filterAlignments.sh ${INPUT_DIR} ${BROKEN_READS} ${PERIODIC_ENDPOINTS_FIXED} ${MIN_ALIGNMENT_LENGTH_READ_READ} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${MAX_K_UNIQUE} ${ALIGNMENT_FILTERING_MODE} ${MIN_INTERSECTION_NONREPETITIVE} ${N_THREADS} ${DELETE_TMP_FILES}
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
Expand Down
36 changes: 30 additions & 6 deletions src/de/mpi_cbg/revant/apps/CompactKmers.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
* inside the same read, and it prints a histogram of total counts for all k-mers.
*/
public class CompactKmers {

/**
* @param args 9 TRUE=keep every k-mer that passes a one-sided significance test in
* the model where it occurs on just one haplotype.
*/
public static void main(String[] args) throws IOException {
final String INPUT_FILE = args[0];
final int K = Integer.parseInt(args[1]);
Expand All @@ -23,9 +26,10 @@ public static void main(String[] args) throws IOException {
final int SPANNING_BPS = Integer.parseInt(args[6]);
final boolean DISCARD_SAME_READ_KMERS = Integer.parseInt(args[7])==1;
final String ALPHABET_FILE = args[8];
final int MAX_HISTOGRAM_COUNT = Integer.parseInt(args[9]);
final String OUTPUT_FILE_KMERS = args[10];
final String OUTPUT_FILE_HISTOGRAM = args[11].equalsIgnoreCase("null")?null:args[11];
final boolean KEEP_ALL_FREQUENT = Integer.parseInt(args[9])==1;
final int MAX_HISTOGRAM_COUNT = Integer.parseInt(args[10]);
final String OUTPUT_FILE_KMERS = args[11];
final String OUTPUT_FILE_HISTOGRAM = args[12].equalsIgnoreCase("null")?null:args[12];

final double SIGNIFICANCE_LEVEL = 0.05; // Conventional

Expand Down Expand Up @@ -74,7 +78,9 @@ public static void main(String[] args) throws IOException {
previousSameReadCount=Math.max(previousSameReadCount,sameReadCount);
}
else {
if ((DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) && isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1) {
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?isFrequent(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
}
Expand All @@ -85,7 +91,9 @@ public static void main(String[] args) throws IOException {
str=br.readLine();
}
br.close();
if ((DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) && isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1) {
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?isFrequent(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
}
Expand Down Expand Up @@ -126,5 +134,21 @@ private static final int isUnique(int count, int[] kmer, int k, int nReads, int
}
return -1;
}


/**
* Just a one-sided test on the model with one haplotype.
*/
private static final boolean isFrequent(int count, int[] kmer, int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, double significanceLevel) {
int i, length;
PoissonDistribution distribution;

length=0;
for (i=0; i<k; i++) RepeatAlphabet.alphabet[kmer[i]].getLength();
final double base = ((double)(avgReadLength-((spanningBps)<<1)-length))/genomeLength;
final int quantum = nReads/nHaplotypes;
distribution=PoissonDistribution.of(base*quantum);
return distribution.cumulativeProbability(count)>significanceLevel;
}

}

0 comments on commit d13475e

Please sign in to comment.