Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Sep 3, 2023
1 parent 8040c28 commit 5b45948
Show file tree
Hide file tree
Showing 11 changed files with 265 additions and 211 deletions.
4 changes: 2 additions & 2 deletions scripts/2-intervalGraph/2-steps2345.sh
Original file line number Diff line number Diff line change
Expand Up @@ -133,10 +133,10 @@ cp ${TAGS_DIR}/tags-root.txt ${STEP4_DIR}/
# Merging all tag files
TMP_FILE="${TAGS_DIR}/tmp.txt"
rm -f ${TMP_FILE}
find "${TAGS_DIR}" -type f -name "tags-*.txt" -exec cat {} + >> ${TMP_FILE}
find "${TAGS_DIR}" -type f -maxdepth 1 -name "tags-*.txt" -exec cat {} + >> ${TMP_FILE}
sort -m -t , -k 1,1n -k 2,2n -k 3,3n ${TMP_FILE} > ${TAGS_DIR}/allTags.txt
rm -f ${TMP_FILE}
find "${STEP4_DIR}" -type f -name "tags-*.txt" -exec cat {} + >> ${TMP_FILE}
find "${STEP4_DIR}" -type f -maxdepth 1 -name "tags-*.txt" -exec cat {} + >> ${TMP_FILE}
sort -m -t , -k 1,1n -k 2,2n -k 3,3n ${TMP_FILE} > ${STEP4_DIR}/allTags.txt
rm -f ${TMP_FILE}

Expand Down
31 changes: 23 additions & 8 deletions scripts/6-repeatAlphabet/0-breakReads.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,21 @@ TMPFILE_NAME="breakReads-tmp"
TMPFILE_PATH="${INPUT_DIR}/${TMPFILE_NAME}"
rm -f ${TMPFILE_PATH}*

# Waits for every background job whose PID is stored in the array named by $1
# and terminates the script with status 1 if at least one of them failed.
#
# Arguments: $1 - NAME (not the contents) of an array of PIDs, filled by the
#                 caller with `PIDS+=($!)` after each `... &`.
# Globals:   N_FAILED (written) - number of jobs that returned nonzero.
# Outputs:   one error line on stderr when some job failed.
# Returns:   0 if every job succeeded or the array is empty; otherwise it does
#            not return, it calls `exit 1`.
function waitAndCheck() {
    local ARRAY_REF="$1[@]"
    # Indirect expansion copies the caller's array; quoting keeps one word per PID.
    local JOB_PIDS=("${!ARRAY_REF}")
    local JOB_PID

    N_FAILED=0
    # Iterating over the values instead of over `seq 0 LAST` makes an empty
    # array a no-op: with LAST=-1, a `seq` that counts downward (BSD/macOS)
    # would yield the indices 0 and -1. It also keeps the caller's own loop
    # variables untouched.
    # Every PID is waited for, even after a failure, so that no job is left
    # running when the script exits.
    for JOB_PID in "${JOB_PIDS[@]}"; do
        wait "${JOB_PID}" || N_FAILED=$(( N_FAILED + 1 ))
    done
    if [ "${N_FAILED}" -ne 0 ]; then
        echo "ERROR: ${N_FAILED} out of ${#JOB_PIDS[@]} background jobs failed" >&2
        exit 1
    fi
}


echo "Breaking reads..."
OLD2NEW_FILE="${INPUT_DIR}/unbroken2broken.txt"
NEW2OLD_FILE="${INPUT_DIR}/broken2unbroken.txt"
Expand All @@ -39,9 +54,6 @@ function alignmentsThread1() {
local ALIGNMENTS_FILE_ID=$1
local WRITE_HEADER=$2
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.BreakReads2 ${N_READS} ${READ_LENGTHS_FILE} ${OLD2NEW_FILE} 1 ${WRITE_HEADER} ${TMPFILE_PATH}-1-${ALIGNMENTS_FILE_ID}.txt ${TMPFILE_PATH}-2-${ALIGNMENTS_FILE_ID}.txt
if [ $? -ne 0 ]; then
exit
fi
}
ALIGNMENTS_FILE="${INPUT_DIR}/LAshow-reads-reads.txt"
N_ALIGNMENTS=$(( $(wc -l < ${ALIGNMENTS_FILE}) - 2 ))
Expand All @@ -53,11 +65,14 @@ if [ -e ${TMPFILE_PATH}-1-${N_THREADS}.txt ]; then
else
TO=$(( ${N_THREADS} - 1 ))
fi
PIDS=()
alignmentsThread1 0 1 &
PIDS+=($!)
for THREAD in $(seq 1 ${TO}); do
alignmentsThread1 ${THREAD} 1 & # We always write the header for future scripts.
PIDS+=($!)
done
wait
waitAndCheck PIDS
echo "Read-read alignments translated successfully"
# We do not need to concatenate the chunks in a single file, since they will be used
# directly as chunks by the following scripts.
Expand All @@ -74,9 +89,6 @@ function alignmentsThread2() {
local ALIGNMENTS_FILE_ID=$1
local WRITE_HEADER=$2
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.BreakReads2 ${N_READS} ${READ_LENGTHS_FILE} ${OLD2NEW_FILE} 0 ${WRITE_HEADER} ${TMPFILE_PATH}-3-${ALIGNMENTS_FILE_ID}.txt ${TMPFILE_PATH}-4-${ALIGNMENTS_FILE_ID}.txt
if [ $? -ne 0 ]; then
exit
fi
}
ALIGNMENTS_FILE="${INPUT_DIR}/LAshow-reads-repeats.txt"
N_ALIGNMENTS=$(( $(wc -l < ${ALIGNMENTS_FILE}) - 2 ))
Expand All @@ -88,11 +100,14 @@ if [ -e ${TMPFILE_PATH}-3-${N_THREADS}.txt ]; then
else
TO=$(( ${N_THREADS} - 1 ))
fi
PIDS=()
alignmentsThread2 0 1 &
PIDS+=($!)
for THREAD in $(seq 1 ${TO}); do
alignmentsThread2 ${THREAD} 1 & # We always write the header for future scripts.
PIDS+=($!)
done
wait
waitAndCheck PIDS
echo "Read-repeat alignments translated successfully"
# We do not need to concatenate the chunks in a single file, since they will be used
# directly as chunks by the following scripts.
Expand Down
190 changes: 104 additions & 86 deletions scripts/6-repeatAlphabet/1-buildAlphabet.sh

Large diffs are not rendered by default.

47 changes: 29 additions & 18 deletions scripts/6-repeatAlphabet/2-fixEndBlocks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,27 @@ READS_DISAMBIGUATED_FILE="${INPUT_DIR}/reads-translated-disambiguated.txt"
ALPHABET_FILE="${INPUT_DIR}/alphabet-cleaned.txt"
rm -f ${TMPFILE_PATH}*

# Waits for every background job whose PID is stored in the array named by $1
# and terminates the script with status 1 if at least one of them failed.
#
# Arguments: $1 - NAME (not the contents) of an array of PIDs, filled by the
#                 caller with `PIDS+=($!)` after each `... &`.
# Globals:   N_FAILED (written) - number of jobs that returned nonzero.
# Outputs:   one error line on stderr when some job failed.
# Returns:   0 if every job succeeded or the array is empty; otherwise it does
#            not return, it calls `exit 1`.
function waitAndCheck() {
    local ARRAY_REF="$1[@]"
    # Indirect expansion copies the caller's array; quoting keeps one word per PID.
    local JOB_PIDS=("${!ARRAY_REF}")
    local JOB_PID

    N_FAILED=0
    # Iterating over the values instead of over `seq 0 LAST` makes an empty
    # array (e.g. a `find` that matched no file) a no-op: with LAST=-1, a `seq`
    # that counts downward (BSD/macOS) would yield the indices 0 and -1.
    # Every PID is waited for, even after a failure, so that no job is left
    # running when the script exits.
    for JOB_PID in "${JOB_PIDS[@]}"; do
        wait "${JOB_PID}" || N_FAILED=$(( N_FAILED + 1 ))
    done
    if [ "${N_FAILED}" -ne 0 ]; then
        echo "ERROR: ${N_FAILED} out of ${#JOB_PIDS[@]} background jobs failed" >&2
        exit 1
    fi
}

# Runs the Java program de.mpi_cbg.revant.apps.CollectKmers, with first
# argument 0, on a single file of translated reads. The call site visible in
# this script starts it in the background (`... &`), one job per file found.
# Arguments:
#   $1 - k
#   $2 - translated reads file
#   $3 - boundaries file
#   $4 - read lengths file
#   $5 - k-mers file (the per-job files are merged with `sort -m` by the caller)
# Globals read: JAVA_RUNTIME_FLAGS, REVANT_BINARIES, MAX_KMER_LENGTH_BPS,
#               ALPHABET_FILE.
function enumerateKmersThread() {
local LOCAL_K=$1
local LOCAL_TRANSLATED_READS_FILE=$2
local LOCAL_BOUNDARIES_FILE=$3
local LOCAL_READ_LENGTHS_FILE=$4
local LOCAL_KMERS_FILE=$5
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectKmers 0 ${LOCAL_K} ${MAX_KMER_LENGTH_BPS} 1 ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} null null ${LOCAL_KMERS_FILE}
# NOTE(review): a bare `exit` reuses the status of the last command executed,
# which here is the `[ ... ]` test itself (0 whenever this branch is taken), so
# a java failure is reported as success to whoever waits on this job.
# This check looks like the pre-commit version of the function -- confirm
# against the repository before relying on it.
if [ $? -ne 0 ]; then
exit
fi
}

function countKmersThread() {
Expand All @@ -65,9 +76,6 @@ function countKmersThread() {
local LOCAL_KMERS_FILE_INPUT=$5
local LOCAL_KMERS_FILE_OUTPUT=$6
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectKmers 1 ${LOCAL_K} ${MAX_KMER_LENGTH_BPS} 1 ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} null ${LOCAL_KMERS_FILE_INPUT} ${LOCAL_KMERS_FILE_OUTPUT}
if [ $? -ne 0 ]; then
exit
fi
}

function fixThread() {
Expand All @@ -79,9 +87,6 @@ function fixThread() {
local LOCAL_NEW_TRANSLATED_FILE=$6
local LOCAL_STATS_FILE=$7
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.FixEndBlocks ${ALPHABET_FILE} ${LOCAL_OLD_TRANSLATED_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${LOCAL_KMERS_FILE} ${LOCAL_K} ${TIGHT_MODE} ${LOCAL_NEW_TRANSLATED_FILE} ${LOCAL_STATS_FILE}
if [ $? -ne 0 ]; then
exit
fi
}

rm -f "${TMPFILE_PATH}-1-*"
Expand All @@ -94,23 +99,27 @@ for K in $(seq ${MIN_K} ${MAX_K}); do
SORT_OPTIONS_KMERS="${SORT_OPTIONS_KMERS} -k ${i},${i}n"
done
echo "Enumerating distinct ${K}-mers..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-1-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-1-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-1-}
enumerateKmersThread ${K} ${FILE} ${TMPFILE_PATH}-z1-${THREAD_ID} ${TMPFILE_PATH}-z2-${THREAD_ID} ${TMPFILE_PATH}-kmers-${K}-${THREAD_ID} &
PIDS+=($!)
done
wait
sort --parallel=${N_THREADS} -m -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-kmers-${K}-* > ${TMPFILE_PATH}-${K}-distinct.txt
waitAndCheck PIDS
sort --parallel=${N_THREADS} -m -u -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-kmers-${K}-* > ${TMPFILE_PATH}-${K}-distinct.txt
if [ ! -s ${TMPFILE_PATH}-${K}-distinct.txt ]; then
MAX_K=$((${K}-1))
break
fi
echo "Counting ${K}-mer occurrences..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-1-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-1-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-1-}
rm -f ${TMPFILE_PATH}-kmers-${K}-${THREAD_ID}
countKmersThread ${K} ${FILE} ${TMPFILE_PATH}-z1-${THREAD_ID} ${TMPFILE_PATH}-z2-${THREAD_ID} ${TMPFILE_PATH}-${K}-distinct.txt ${TMPFILE_PATH}-kmers-${K}-${THREAD_ID} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
sort --parallel=${N_THREADS} -m -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-kmers-${K}-* > ${TMPFILE_PATH}-${K}.txt
rm -f ${TMPFILE_PATH}-${K}-distinct.txt
FREQUENT_KMERS_FILE="${INPUT_DIR}/frequent-k${K}.txt"
Expand All @@ -121,13 +130,15 @@ for K in $(seq ${MIN_K} ${MAX_K}); do
K_MINUS_ONE_MERS_FILE="${INPUT_DIR}/kMinusOne-k${K}.txt"
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.GetKMinusOneMers ${ALPHABET_FILE} ${FREQUENT_KMERS_FILE} ${K} ${K_MINUS_ONE_MERS_FILE}
echo "Disambiguating read ends using contexts of length $((${K}-1))..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-$((${K}-1))-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-$((${K}-1))-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-$((${K}-1))-}
fixThread ${FILE} ${TMPFILE_PATH}-z1-${THREAD_ID} ${TMPFILE_PATH}-z2-${THREAD_ID} ${K_MINUS_ONE_MERS_FILE} $((${K}-1)) ${TMPFILE_PATH}-${K}-${THREAD_ID} ${TMPFILE_PATH}-counts-${K}-${THREAD_ID} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
N_FIXED="0"; N_FIXABLE="0"; N_ENDS="0";
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-counts-${K}-*"); do
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-counts-${K}-*"); do
N_FIXED=$(( ${N_FIXED} + $(cut -d , -f 1 ${FILE}) ))
N_FIXABLE=$(( ${N_FIXABLE} + $(cut -d , -f 2 ${FILE}) ))
N_ENDS=$(( ${N_ENDS} + $(cut -d , -f 3 ${FILE}) ))
Expand All @@ -136,7 +147,7 @@ for K in $(seq ${MIN_K} ${MAX_K}); do
echo "Disambiguated ${N_FIXED} read ends out of ${N_FIXABLE} fixable (${N_ENDS} total)"
done
rm -f ${READS_DISAMBIGUATED_FILE}
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-${MAX_K}-*" ); do
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-${MAX_K}-*" ); do
cat ${FILE} >> ${READS_DISAMBIGUATED_FILE}
done
if [ ${BROKEN_READS} -eq 1 ]; then
Expand Down
56 changes: 33 additions & 23 deletions scripts/6-repeatAlphabet/3-getUniqueSubstrings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,20 @@ rm -f ${TMPFILE_PATH}*
rm -f ${INPUT_DIR}/unique-*
rm -f ${INPUT_DIR}/histogram-*

# Waits for every background job whose PID is stored in the array named by $1
# and terminates the script with status 1 if at least one of them failed.
#
# Arguments: $1 - NAME (not the contents) of an array of PIDs, filled by the
#                 caller with `PIDS+=($!)` after each `... &`.
# Globals:   N_FAILED (written) - number of jobs that returned nonzero.
# Outputs:   one error line on stderr when some job failed.
# Returns:   0 if every job succeeded or the array is empty; otherwise it does
#            not return, it calls `exit 1`.
function waitAndCheck() {
    local ARRAY_REF="$1[@]"
    # Indirect expansion copies the caller's array; quoting keeps one word per PID.
    local JOB_PIDS=("${!ARRAY_REF}")
    local JOB_PID

    N_FAILED=0
    # Iterating over the values instead of over `seq 0 LAST` makes an empty
    # array (e.g. a `find` that matched no file) a no-op: with LAST=-1, a `seq`
    # that counts downward (BSD/macOS) would yield the indices 0 and -1.
    # Every PID is waited for, even after a failure, so that no job is left
    # running when the script exits.
    for JOB_PID in "${JOB_PIDS[@]}"; do
        wait "${JOB_PID}" || N_FAILED=$(( N_FAILED + 1 ))
    done
    if [ "${N_FAILED}" -ne 0 ]; then
        echo "ERROR: ${N_FAILED} out of ${#JOB_PIDS[@]} background jobs failed" >&2
        exit 1
    fi
}

function enumerateKmersThread() {
local LOCAL_K=$1
local LOCAL_TRANSLATED_READS_FILE=$2
Expand All @@ -60,9 +74,6 @@ function enumerateKmersThread() {
local LOCAL_K_MINUS_ONE_INTERVALS_FILE=$5
local LOCAL_KMERS_FILE=$6
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectKmers 0 ${LOCAL_K} ${MAX_KMER_LENGTH_BPS} 2 ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} ${LOCAL_K_MINUS_ONE_INTERVALS_FILE} null ${LOCAL_KMERS_FILE}
if [ $? -ne 0 ]; then
exit
fi
}

function countKmersThread() {
Expand All @@ -74,9 +85,6 @@ function countKmersThread() {
local LOCAL_KMERS_FILE_INPUT=$6
local LOCAL_KMERS_FILE_OUTPUT=$7
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectKmers 1 ${LOCAL_K} ${MAX_KMER_LENGTH_BPS} 2 ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} ${LOCAL_K_MINUS_ONE_INTERVALS_FILE} ${LOCAL_KMERS_FILE_INPUT} ${LOCAL_KMERS_FILE_OUTPUT}
if [ $? -ne 0 ]; then
exit
fi
}

function intervalsThread() {
Expand All @@ -88,9 +96,6 @@ function intervalsThread() {
local LOCAL_K_MINUS_ONE_INTERVALS_FILE=$6
local LOCAL_INTERVALS_FILE=$7
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}:${REVANT_LIBRARIES}" de.mpi_cbg.revant.apps.GetShortestUniqueIntervals ${LOCAL_K} ${MAX_KMER_LENGTH_BPS} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} ${LOCAL_UNIQUE_KMERS_FILE} ${N_READS} ${AVG_READ_LENGTH} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${MIN_ALIGNMENT_LENGTH} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD} ${LOCAL_K_MINUS_ONE_INTERVALS_FILE} ${LOCAL_INTERVALS_FILE}
if [ $? -ne 0 ]; then
exit
fi
}

FINAL_INTERVALS_FILE="${INPUT_DIR}/unique-intervals-k1-${MAX_K}.txt"
Expand All @@ -103,23 +108,26 @@ for K in $(seq 1 ${MAX_K}); do
SORT_OPTIONS_KMERS="${SORT_OPTIONS_KMERS} -k ${i},${i}n"
done
echo "Enumerating distinct ${K}-mers..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
if [ ${K} -le 1 ]; then
PREVIOUS_INTERVALS="null"
else
PREVIOUS_INTERVALS="${TMPFILE_PATH}-$((${K}-1))-intervals-${THREAD_ID}"
fi
enumerateKmersThread ${K} ${FILE} ${TMPFILE_PATH}-1-${THREAD_ID} ${TMPFILE_PATH}-2-${THREAD_ID} ${PREVIOUS_INTERVALS} ${TMPFILE_PATH}-${K}-kmers-${THREAD_ID} &
PIDS+=($!)
done
wait
sort --parallel=${N_THREADS} -m -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-${K}-kmers-* > ${TMPFILE_PATH}-${K}-distinct.txt
waitAndCheck PIDS
sort --parallel=${N_THREADS} -m -u -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-${K}-kmers-* > ${TMPFILE_PATH}-${K}-distinct.txt
if [ ! -s ${TMPFILE_PATH}-${K}-distinct.txt ]; then
MAX_K=$((${K}-1))
break
fi
echo "Counting ${K}-mer occurrences..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
if [ ${K} -le 1 ]; then
PREVIOUS_INTERVALS="null"
Expand All @@ -128,28 +136,31 @@ for K in $(seq 1 ${MAX_K}); do
fi
rm -f ${TMPFILE_PATH}-${K}-kmers-${THREAD_ID}
countKmersThread ${K} ${FILE} ${TMPFILE_PATH}-1-${THREAD_ID} ${TMPFILE_PATH}-2-${THREAD_ID} ${PREVIOUS_INTERVALS} ${TMPFILE_PATH}-${K}-distinct.txt ${TMPFILE_PATH}-${K}-kmers-${THREAD_ID} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
sort --parallel=${N_THREADS} -m -t , ${SORT_OPTIONS_KMERS} ${TMPFILE_PATH}-${K}-kmers-* > ${TMPFILE_PATH}-${K}.txt
rm -f ${TMPFILE_PATH}-${K}-distinct.txt
UNIQUE_KMERS_FILE="${INPUT_DIR}/unique-k${K}.txt"
OUTPUT_FILE_HISTOGRAM="${INPUT_DIR}/histogram-k${K}.txt"
echo "Finding unique ${K}-mers..."
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}:${REVANT_LIBRARIES}" de.mpi_cbg.revant.apps.CompactKmers ${TMPFILE_PATH}-${K}.txt ${K} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} ${MIN_ALIGNMENT_LENGTH} 1 ${ALPHABET_FILE} 0 ${MAX_HISTOGRAM_COUNT} ${UNIQUE_KMERS_FILE} ${OUTPUT_FILE_HISTOGRAM}
echo "Updating shortest unique intervals file..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
if [ ${K} -le 1 ]; then
PREVIOUS_INTERVALS="null"
else
PREVIOUS_INTERVALS="${TMPFILE_PATH}-$((${K}-1))-intervals-${THREAD_ID}"
fi
intervalsThread ${K} ${FILE} ${TMPFILE_PATH}-1-${THREAD_ID} ${TMPFILE_PATH}-2-${THREAD_ID} ${UNIQUE_KMERS_FILE} ${PREVIOUS_INTERVALS} ${TMPFILE_PATH}-${K}-intervals-${THREAD_ID} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
done
rm -f ${FINAL_INTERVALS_FILE}
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-${MAX_K}-intervals-*" ); do
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-${MAX_K}-intervals-*" ); do
cat ${FILE} >> ${FINAL_INTERVALS_FILE}
done
INTERVAL_STATS_FILE="${INPUT_DIR}/unique-intervals-k1-${MAX_K}-stats.txt"
Expand All @@ -164,18 +175,17 @@ function tandemsThread() {
local LOCAL_READ_LENGTHS_FILE=$3
local LOCAL_TANDEMS_FILE=$4
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectTandems 1 1 ${NONPERIODIC_MODE} ${ALPHABET_FILE} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${LOCAL_TANDEMS_FILE}
if [ $? -ne 0 ]; then
exit
fi
}
echo "Collecting tandems..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
PIDS=()
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
tandemsThread ${FILE} ${TMPFILE_PATH}-1-${THREAD_ID} ${TMPFILE_PATH}-2-${THREAD_ID} ${TMPFILE_PATH}-tandems-${THREAD_ID} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
rm -f ${TANDEMS_FILE}
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-0-*"); do
for FILE in $(find -s ${INPUT_DIR} -maxdepth 1 -name "${TMPFILE_NAME}-0-*"); do
THREAD_ID=${FILE#${INPUT_DIR}/${TMPFILE_NAME}-0-}
cat ${TMPFILE_PATH}-tandems-${THREAD_ID} >> ${TANDEMS_FILE}
done
Expand Down
24 changes: 17 additions & 7 deletions scripts/6-repeatAlphabet/4-filterAlignments.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,20 @@ TMPFILE_NAME="filterAlignments-tmp"
TMPFILE_PATH="${INPUT_DIR}/${TMPFILE_NAME}"
rm -f ${TMPFILE_PATH}*

# Waits for every background job whose PID is stored in the array named by $1
# and terminates the script with status 1 if at least one of them failed.
#
# Arguments: $1 - NAME (not the contents) of an array of PIDs, filled by the
#                 caller with `PIDS+=($!)` after each `... &`.
# Globals:   N_FAILED (written) - number of jobs that returned nonzero.
# Outputs:   one error line on stderr when some job failed.
# Returns:   0 if every job succeeded or the array is empty; otherwise it does
#            not return, it calls `exit 1`.
function waitAndCheck() {
    local ARRAY_REF="$1[@]"
    # Indirect expansion copies the caller's array; quoting keeps one word per PID.
    local JOB_PIDS=("${!ARRAY_REF}")
    local JOB_PID

    N_FAILED=0
    # Iterating over the values instead of over `seq 0 LAST` makes an empty
    # array a no-op: with LAST=-1, a `seq` that counts downward (BSD/macOS)
    # would yield the indices 0 and -1. It also keeps the caller's own loop
    # variables untouched.
    # Every PID is waited for, even after a failure, so that no job is left
    # running when the script exits.
    for JOB_PID in "${JOB_PIDS[@]}"; do
        wait "${JOB_PID}" || N_FAILED=$(( N_FAILED + 1 ))
    done
    if [ "${N_FAILED}" -ne 0 ]; then
        echo "ERROR: ${N_FAILED} out of ${#JOB_PIDS[@]} background jobs failed" >&2
        exit 1
    fi
}

echo "Splitting the alignments file..."
if [ ${PERIODIC_ENDPOINTS_FIXED} -eq 1 ]; then
# Reusing the chunks of the read-read alignments file that are already there (we
Expand Down Expand Up @@ -79,19 +93,13 @@ ALPHABET_FILE="${INPUT_DIR}/alphabet-cleaned.txt"
# Filters one chunk of the alignments file by running the Java program
# de.mpi_cbg.revant.apps.FilterAlignments on ${TMPFILE_PATH}-1-<ID>.txt, and,
# when BROKEN_READS equals 1, runs de.mpi_cbg.revant.apps.BreakReads5 on the
# files produced for that chunk.
# The call site visible in this script starts it in the background
# (`filterThread ${THREAD} &`), one job per chunk ID.
# Arguments:
#   $1 - ID of the alignments chunk; selects the ${TMPFILE_PATH}-{1..5}-<ID>
#        files passed to the two programs.
# Globals read: JAVA_RUNTIME_FLAGS, REVANT_BINARIES, TMPFILE_PATH, INPUT_DIR,
#               BROKEN_READS and the file/threshold variables listed on the
#               two java command lines.
function filterThread() {
local ALIGNMENTS_FILE_ID=$1
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.FilterAlignments ${TMPFILE_PATH}-1-${ALIGNMENTS_FILE_ID}.txt ${N_READS} ${READ_LENGTHS_FILE} ${READ_IDS_FILE} ${READS_TRANSLATED_FILE} ${READS_TRANSLATED_BOUNDARIES} ${FULLY_UNIQUE_FILE} ${N_FULLY_UNIQUE} ${FULLY_CONTAINED_FILE} ${N_FULLY_CONTAINED} ${UNIQUE_INTERVALS_FILE} ${TANDEM_INTERVALS_FILE} ${FILTERING_MODE} ${SUFFIX_PREFIX_MODE} ${BOTH_READS_TANDEM} ${ALPHABET_FILE} ${TMPFILE_PATH}-2-${ALIGNMENTS_FILE_ID} ${TMPFILE_PATH}-3-${ALIGNMENTS_FILE_ID} ${MIN_ALIGNMENT_LENGTH_READ_READ} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${MIN_BLUE_INTERVAL_LENGTH} ${MIN_INTERSECTION_NONREPETITIVE}
# NOTE(review): a bare `exit` reuses the status of the last command executed,
# which here is the `[ ... ]` test itself (0 whenever this branch is taken), so
# the job stops before BreakReads5 but still reports success to whoever waits
# on it. This check looks like the pre-commit version -- confirm against the
# repository before relying on it.
if [ $? -ne 0 ]; then
exit
fi
if [ ${BROKEN_READS} -eq 1 ]; then
# These variables are not declared `local`; when the function runs as a
# background job the assignments stay inside that job's subshell.
NEW2OLD_FILE="${INPUT_DIR}/broken2unbroken.txt"
OLD2NEW_FILE="${INPUT_DIR}/unbroken2broken.txt"
NREADS_OLD=$(wc -l < "${INPUT_DIR}/reads-ids-unbroken.txt")
READ_LENGTHS_FILE_OLD="${INPUT_DIR}/reads-lengths-unbroken.txt"
ALIGNMENTS_FILE_OLD="${INPUT_DIR}/breakReads-tmp-1-${ALIGNMENTS_FILE_ID}.txt"
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.BreakReads5 ${TMPFILE_PATH}-2-${ALIGNMENTS_FILE_ID} ${TMPFILE_PATH}-3-${ALIGNMENTS_FILE_ID} ${TMPFILE_PATH}-1-${ALIGNMENTS_FILE_ID}.txt ${NEW2OLD_FILE} ${N_READS} ${OLD2NEW_FILE} ${NREADS_OLD} ${ALIGNMENTS_FILE_OLD} ${READ_LENGTHS_FILE_OLD} ${TMPFILE_PATH}-4-${ALIGNMENTS_FILE_ID} ${TMPFILE_PATH}-5-${ALIGNMENTS_FILE_ID}
# NOTE(review): same caveat as above -- the bare `exit` yields status 0.
if [ $? -ne 0 ]; then
exit
fi
fi
}

Expand All @@ -100,10 +108,12 @@ if [ -e ${TMPFILE_PATH}-1-${N_THREADS}.txt ]; then
else
TO=$(( ${N_THREADS} - 1 ))
fi
PIDS=()
for THREAD in $(seq 0 ${TO}); do
filterThread ${THREAD} &
PIDS+=($!)
done
wait
waitAndCheck PIDS
echo "Alignments filtered successfully"
OUTPUT_BITVECTOR="${ALIGNMENTS_FILE}.mode${FILTERING_MODE}.bitvector"
OUTPUT_TANDEM_BITVECTOR="${ALIGNMENTS_FILE}.tandem.bitvector"
Expand Down
Loading

0 comments on commit 5b45948

Please sign in to comment.