Skip to content

Commit

Permalink
towards wobbling long-period tandems
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Aug 6, 2023
1 parent 46849fa commit cec5a36
Show file tree
Hide file tree
Showing 5 changed files with 189 additions and 19 deletions.
74 changes: 63 additions & 11 deletions scripts/6-repeatAlphabet/1-buildAlphabet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,23 +204,23 @@ LONG_SPACER_LENGTH=$(( ${MIN_ALIGNMENT_LENGTH} * 20 )) # Arbitrary
if [ ${TANDEM_SPACERS_ITERATIONS} -gt 0 ]; then
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.SplitSpacers ${LAST_READA_FILE} ${N_THREADS} null ${READ_IDS_FILE} ${READ_LENGTHS_FILE} ${TMPFILE_PATH}-stash-
fi
# Runs CollectTandems on one chunk of translated reads to build its tandem
# track.
#
# Globals (read): JAVA_RUNTIME_FLAGS, REVANT_BINARIES, ALPHABET_FILE,
#                 REPEAT_LENGTHS_FILE, N_REPEATS
# Arguments:
#   $1 - translated reads file of the chunk
#   $2 - boundaries file of the chunk
#   $3 - read lengths file of the chunk
#   $4 - tandems file of the chunk (last argument of CollectTandems)
# Exit status: if java fails, the current (sub)shell exits with java's own
# non-zero status.
function tandemsThread() {
	local LOCAL_TRANSLATED_READS_FILE=$1
	local LOCAL_BOUNDARIES_FILE=$2
	local LOCAL_READ_LENGTHS_FILE=$3
	local LOCAL_TANDEMS_FILE=$4
	local EXIT_STATUS=0
	java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectTandems 0 1 2 ${ALPHABET_FILE} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${LOCAL_TANDEMS_FILE} || EXIT_STATUS=$?
	if [ ${EXIT_STATUS} -ne 0 ]; then
		# A bare `exit` would return the status of the `[` test above (zero),
		# which would hide the failure from whoever waits on this job.
		exit ${EXIT_STATUS}
	fi
}
ITER="1";
while [ ${ITER} -le ${TANDEM_SPACERS_ITERATIONS} ]; do
rm -f ${TMPFILE_PATH}-tspacers-*
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-stash-*"); do
SUFFIX=${FILE#${TMPFILE_PATH}-stash-}
cp ${FILE} ${TMPFILE_PATH}-tspacers-1-${SUFFIX}
done
# Runs CollectTandems on one chunk of translated reads to build its tandem
# track.
#
# Globals (read): JAVA_RUNTIME_FLAGS, REVANT_BINARIES, ALPHABET_FILE,
#                 REPEAT_LENGTHS_FILE, N_REPEATS
# Arguments:
#   $1 - translated reads file of the chunk
#   $2 - boundaries file of the chunk
#   $3 - read lengths file of the chunk
#   $4 - tandems file of the chunk (last argument of CollectTandems)
# Exit status: if java fails, the current (sub)shell exits with java's own
# non-zero status.
function tandemsThread() {
	local LOCAL_TRANSLATED_READS_FILE=$1
	local LOCAL_BOUNDARIES_FILE=$2
	local LOCAL_READ_LENGTHS_FILE=$3
	local LOCAL_TANDEMS_FILE=$4
	local EXIT_STATUS=0
	java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CollectTandems 0 1 2 ${ALPHABET_FILE} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${LOCAL_TANDEMS_FILE} || EXIT_STATUS=$?
	if [ ${EXIT_STATUS} -ne 0 ]; then
		# A bare `exit` would return the status of the `[` test above (zero),
		# which would hide the failure from whoever waits on this job.
		exit ${EXIT_STATUS}
	fi
}
echo "Computing tandem track..."
for FILE in $(find -s ${INPUT_DIR} -name "${TMPFILE_NAME}-8-*.txt"); do
THREAD_ID=${FILE#${TMPFILE_PATH}-8-}
Expand Down Expand Up @@ -376,6 +376,58 @@ while [ ${ITER} -le ${TANDEM_SPACERS_ITERATIONS} ]; do
ITER=$(( ${ITER} + 1 ))
fi
done
echo "Tandem spacers fixed"
# Wobbling of long-period tandems. Skipped when WOBBLE_LENGTH is zero.
#
# Steps, each parallel step running one background job per chunk:
#   1. Split the translated reads, their boundaries and the read lengths into
#      chunk files (prefixes -wobble-1-, -wobble-2-, -wobble-3-).
#   2. Compute the tandem track of every chunk (-wobble-4-).
#   3. Print one flags file per chunk (-wobble-5-flags-).
#   4. Build the wobbled alphabet and the old-to-new map from all flags files.
#   5. Rewrite every chunk with the new alphabet (-wobble-6-).
#   6. Concatenate the rewritten chunks and swap in the new alphabet, keeping
#      the previous files with suffix -prewobble.
#
# NOTE(review): `wait` without arguments returns zero whatever the exit status
# of the background jobs, so a failed chunk does not stop the steps after it.
if [ ${WOBBLE_LENGTH} -ne 0 ]; then
	echo "Wobbling..."
	java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.SplitTranslations ${READ_IDS_FILE} ${READS_TRANSLATED_FILE} ${READS_TRANSLATED_BOUNDARIES} ${LAST_READA_FILE} ${TMPFILE_PATH}-wobble-1- ${TMPFILE_PATH}-wobble-2-
	java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.SplitSpacers ${LAST_READA_FILE} ${N_THREADS} null ${READ_IDS_FILE} ${READ_LENGTHS_FILE} ${TMPFILE_PATH}-wobble-3-
	# Chunk IDs start from zero; the ID of the last chunk is N_THREADS if such
	# a chunk file exists, and N_THREADS-1 otherwise.
	if [ -e ${TMPFILE_PATH}-wobble-1-${N_THREADS}.txt ]; then
		TO=${N_THREADS}
	else
		TO=$(( ${N_THREADS} - 1 ))
	fi
	echo "Computing tandem track..."
	for THREAD in $(seq 0 ${TO}); do
		# The chunk ID must come from the loop variable: THREAD_ID is not set by
		# this loop, so using it would feed the same files to every job.
		tandemsThread ${TMPFILE_PATH}-wobble-1-${THREAD}.txt ${TMPFILE_PATH}-wobble-2-${THREAD}.txt ${TMPFILE_PATH}-wobble-3-lengths-${THREAD}.txt ${TMPFILE_PATH}-wobble-4-${THREAD}.txt &
	done
	wait
	# Prints the flags file of chunk $1. Exits the current (sub)shell with
	# java's status if java fails.
	function wobbleThreadCreate() {
		local WOBBLE_FILE_ID=$1
		local EXIT_STATUS=0
		java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.WobbleLongPeriodCreateAlphabet1 ${ALPHABET_FILE} ${TMPFILE_PATH}-wobble-1-${WOBBLE_FILE_ID}.txt ${TMPFILE_PATH}-wobble-4-${WOBBLE_FILE_ID}.txt ${TMPFILE_PATH}-wobble-3-lengths-${WOBBLE_FILE_ID}.txt ${TMPFILE_PATH}-wobble-5-flags-${WOBBLE_FILE_ID}.txt || EXIT_STATUS=$?
		if [ ${EXIT_STATUS} -ne 0 ]; then
			# A bare `exit` would return the status of the `[` test, i.e. zero.
			exit ${EXIT_STATUS}
		fi
	}
	for THREAD in $(seq 0 ${TO}); do
		wobbleThreadCreate ${THREAD} &
	done
	wait
	WOBBLE_ALPHABET="${INPUT_DIR}/alphabet-wobble.txt"
	WOBBLE_OLD2NEW="${INPUT_DIR}/alphabet-wobble-old2new.txt"
	# The 0 after ${TO} is the "wobble all periodic characters" switch of
	# WobbleCreateAlphabet2 (1=on): here only flagged characters are wobbled.
	java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.WobbleCreateAlphabet2 ${ALPHABET_FILE} ${WOBBLE_LENGTH} ${MIN_ALIGNMENT_LENGTH} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${TMPFILE_PATH}-wobble-5-flags ${TO} 0 ${WOBBLE_ALPHABET} ${WOBBLE_OLD2NEW}
	# Rewrites the translated reads of chunk $1 with the wobbled alphabet.
	# Exits the current (sub)shell with java's status if java fails.
	function wobbleThread() {
		local WOBBLE_FILE_ID=$1
		local EXIT_STATUS=0
		java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.WobbleLongPeriod ${TMPFILE_PATH}-wobble-1-${WOBBLE_FILE_ID}.txt ${WOBBLE_LENGTH} ${ALPHABET_FILE} ${WOBBLE_ALPHABET} ${WOBBLE_OLD2NEW} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${TMPFILE_PATH}-wobble-4-${WOBBLE_FILE_ID}.txt ${TMPFILE_PATH}-wobble-6-${WOBBLE_FILE_ID}.txt || EXIT_STATUS=$?
		if [ ${EXIT_STATUS} -ne 0 ]; then
			# A bare `exit` would return the status of the `[` test, i.e. zero.
			exit ${EXIT_STATUS}
		fi
	}
	for THREAD in $(seq 0 ${TO}); do
		wobbleThread ${THREAD} &
	done
	wait
	# The old translation is moved away first, so the appends below rebuild
	# the file from scratch, in chunk order.
	mv ${READS_TRANSLATED_FILE} ${READS_TRANSLATED_FILE}-prewobble
	for THREAD in $(seq 0 ${TO}); do
		cat ${TMPFILE_PATH}-wobble-6-${THREAD}.txt >> ${READS_TRANSLATED_FILE}
	done
	mv ${ALPHABET_FILE} ${ALPHABET_FILE}-prewobble
	mv ${WOBBLE_ALPHABET} ${ALPHABET_FILE}
	echo "Wobbling completed"
fi







Expand Down Expand Up @@ -510,7 +562,7 @@ if [ ${MAX_SPACER_LENGTH} -ne 0 ]; then
wait
WOBBLE_ALPHABET="${INPUT_DIR}/alphabet-wobble.txt"
WOBBLE_OLD2NEW="${INPUT_DIR}/alphabet-wobble-old2new.txt"
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.WobbleCreateAlphabet2 ${ALPHABET_FILE} ${WOBBLE_LENGTH} ${MIN_ALIGNMENT_LENGTH} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${WOBBLE_PREFIX}-flags ${TO} ${WOBBLE_ALPHABET} ${WOBBLE_OLD2NEW}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.WobbleCreateAlphabet2 ${ALPHABET_FILE} ${WOBBLE_LENGTH} ${MIN_ALIGNMENT_LENGTH} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${WOBBLE_PREFIX}-flags ${TO} 1 ${WOBBLE_ALPHABET} ${WOBBLE_OLD2NEW}
function wobbleThread() {
local WOBBLE_FILE_ID=$1
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.Wobble ${WOBBLE_PREFIX}-${WOBBLE_FILE_ID}.txt ${WOBBLE_LENGTH} ${ALPHABET_FILE} ${WOBBLE_ALPHABET} ${WOBBLE_OLD2NEW} ${REPEAT_LENGTHS_FILE} ${N_REPEATS} ${TMPFILE_PATH}-wobble-3-${WOBBLE_FILE_ID}.txt
Expand Down
10 changes: 7 additions & 3 deletions src/de/mpi_cbg/revant/apps/RepeatAlphabet.java
Original file line number Diff line number Diff line change
Expand Up @@ -10181,8 +10181,10 @@ else if (c==-1-(lastAlphabet+1)) {
private static final int isWobbleOf(int x, Character reference, int quantum_wobble, int quantum_alphabet, Character[] alphabet) {
if ( alphabet[x].repeat!=reference.repeat || alphabet[x].orientation!=reference.orientation ||
alphabet[x].start<reference.start-quantum_wobble || alphabet[x].start>reference.start+quantum_wobble ) return -1;
if ( (alphabet[x].openStart!=reference.openStart && alphabet[x].start>quantum_alphabet && reference.start>quantum_alphabet) ||
(alphabet[x].openEnd!=reference.openEnd && alphabet[x].end<repeatLengths[alphabet[x].repeat]-quantum_alphabet && reference.end<repeatLengths[alphabet[x].repeat]-quantum_alphabet)
if ( reference.start!=-1 && reference.end!=-1 &&
( (alphabet[x].openStart!=reference.openStart && alphabet[x].start>quantum_alphabet && reference.start>quantum_alphabet) ||
(alphabet[x].openEnd!=reference.openEnd && alphabet[x].end<repeatLengths[alphabet[x].repeat]-quantum_alphabet && reference.end<repeatLengths[alphabet[x].repeat]-quantum_alphabet)
)
) return 0;
return Math.abs(alphabet[x].getLength(),reference.getLength())<=quantum_wobble?1:0;
}
Expand All @@ -10204,9 +10206,10 @@ public static final void wobble_longPeriod_markAlphabet(String read2characters,
nBlocks=loadBlocks(read2characters);
loadIntBlocks(nBlocks,boundaries,readLength,tmpCharacter);
i=-1; p=0; q=read2tandems.indexOf(SEPARATOR);
while (q>=0) {
while (q>=0) {
tmpArray[++i]=Integer.parseInt(read2tandems.substring(p,q));
p=q+1; q=read2tandems.indexOf(SEPARATOR,p);
if (q<0) q=read2tandems.length();
tmpArray[++i]=Integer.parseInt(read2tandems.substring(p,q));
p=q+1; q=read2tandems.indexOf(SEPARATOR,p);
}
Expand Down Expand Up @@ -10250,6 +10253,7 @@ public static final void wobble_longPeriod(String read2characters, String read2t
while (q>=0) {
first=Integer.parseInt(read2tandems.substring(p,q));
p=q+1; q=read2tandems.indexOf(SEPARATOR,p);
if (q<0) q=read2tandems.length();
last=Integer.parseInt(read2tandems.substring(p,q));
p=q+1; q=read2tandems.indexOf(SEPARATOR,p);
for (i=first; i<=last; i++) tmpArray1[i]=1;
Expand Down
11 changes: 6 additions & 5 deletions src/de/mpi_cbg/revant/apps/WobbleCreateAlphabet2.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import java.io.*;

/**
* Expands the alphabet by wobbling existing periodic characters, as well as all the non-
* periodic characters flagged in some input bitvectors. Wobbling means creating a
* Expands the alphabet by wobbling all the characters flagged in some input bitvectors,
* as well as all existing periodic characters if needed. Wobbling means creating a
* character that is similar to an existing one but has slightly different length.
*/
public class WobbleCreateAlphabet2 {
Expand All @@ -19,8 +19,9 @@ public static void main(String[] args) throws IOException {
final int N_REPEATS = Integer.parseInt(args[4]);
final String FLAGS_FILE_PREFIX = args[5];
final int LAST_FLAG_FILE = Integer.parseInt(args[6]);
final String OUTPUT_FILE_ALPHABET = args[7];
final String OUTPUT_FILE_OLD2NEW = args[8];
final boolean WOBBLE_ALL_PERIODIC = Integer.parseInt(args[7])==1;
final String OUTPUT_FILE_ALPHABET = args[8];
final String OUTPUT_FILE_OLD2NEW = args[9];

int i, j;
int nFlags, lastUnique_old, lastPeriodic_old, lastAlphabet_old, lastUnique_new, lastPeriodic_new, lastAlphabet_new;
Expand All @@ -45,7 +46,7 @@ public static void main(String[] args) throws IOException {
System.err.println("Wobbling "+(RepeatAlphabet.lastPeriodic-RepeatAlphabet.lastUnique)+" periodic and "+nFlags+" non-periodic characters...");
RepeatAlphabet.loadRepeatLengths(REPEAT_LENGTHS_FILE,N_REPEATS);
alphabet_old=RepeatAlphabet.alphabet; lastUnique_old=RepeatAlphabet.lastUnique; lastPeriodic_old=RepeatAlphabet.lastPeriodic; lastAlphabet_old=RepeatAlphabet.lastAlphabet;
alphabet_new=RepeatAlphabet.wobble_extendAlphabet(flags,nFlags,true,WOBBLE_LENGTH,IO.quantum,MIN_ALIGNMENT_LENGTH,out);
alphabet_new=RepeatAlphabet.wobble_extendAlphabet(flags,nFlags,WOBBLE_ALL_PERIODIC,WOBBLE_LENGTH,IO.quantum,MIN_ALIGNMENT_LENGTH,out);
lastUnique_new=out[0]; lastPeriodic_new=out[1]; lastAlphabet_new=out[2];
RepeatAlphabet.alphabet=alphabet_new;
RepeatAlphabet.lastUnique=lastUnique_new; RepeatAlphabet.lastPeriodic=lastPeriodic_new; RepeatAlphabet.lastAlphabet=lastAlphabet_new;
Expand Down
55 changes: 55 additions & 0 deletions src/de/mpi_cbg/revant/apps/WobbleLongPeriod.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
package de.mpi_cbg.revant.apps;

import de.mpi_cbg.revant.util.IO;
import java.io.*;

/**
 * Like $Wobble.java$. Reads a chunk of translated reads and the corresponding tandems
 * file in lockstep (one line of each per read), and calls
 * $RepeatAlphabet.wobble_longPeriod()$ on every pair of lines, passing it the new
 * alphabet, the old-to-new character map, and the output writer.
 *
 * Arguments: see the comments on the constants at the beginning of $main()$.
 */
public class WobbleLongPeriod {

	public static void main(String[] args) throws IOException {
		final String TRANSLATED_READS_CHARACTERS_FILE = args[0];  // Of a chunk of reads
		final int WOBBLE_LENGTH = Integer.parseInt(args[1]);
		final String ALPHABET_FILE_OLD = args[2];  // Of all reads
		final String ALPHABET_FILE_NEW = args[3];  // Of all reads
		final String ALPHABET_FILE_OLD2NEW = args[4];  // Of all reads
		final String REPEAT_LENGTHS_FILE = args[5];
		final int N_REPEATS = Integer.parseInt(args[6]);
		final String TANDEMS_FILE = args[7];  // Of a chunk of reads. Non-periodic only.
		final String OUTPUT_FILE = args[8];  // Of a chunk of reads

		int i;
		int nBlocks, lastUnique_new, lastPeriodic_new, lastAlphabet_new;
		double percent;
		String str1, str2;
		BufferedReader br1, br2;
		BufferedWriter bw;
		int[] old2new, tmpArray1, tmpArray2, tmpArray3;
		RepeatAlphabet.Character[] alphabet_new;

		// The new alphabet is loaded first and saved in local variables, since loading
		// the old alphabet afterwards overwrites the static fields of $RepeatAlphabet$.
		RepeatAlphabet.loadRepeatLengths(REPEAT_LENGTHS_FILE,N_REPEATS);
		RepeatAlphabet.deserializeAlphabet(ALPHABET_FILE_NEW,2);
		alphabet_new=RepeatAlphabet.alphabet; lastUnique_new=RepeatAlphabet.lastUnique; lastPeriodic_new=RepeatAlphabet.lastPeriodic; lastAlphabet_new=RepeatAlphabet.lastAlphabet;
		RepeatAlphabet.deserializeAlphabet(ALPHABET_FILE_OLD,2);
		// The map file has one integer per line, one line per character of the old
		// alphabet.
		old2new = new int[lastAlphabet_new+1];
		br1 = new BufferedReader(new FileReader(ALPHABET_FILE_OLD2NEW));
		for (i=0; i<=RepeatAlphabet.lastAlphabet; i++) old2new[i]=Integer.parseInt(br1.readLine());
		br1.close();
		tmpArray1 = new int[100];  // Arbitrary
		tmpArray2 = new int[RepeatAlphabet.lastAlphabet+1];
		tmpArray3 = new int[] {0,0};  // Counters printed at the end: 0=wobbled blocks, 1=total blocks.
		br1 = new BufferedReader(new FileReader(TRANSLATED_READS_CHARACTERS_FILE));
		br2 = new BufferedReader(new FileReader(TANDEMS_FILE));
		bw = new BufferedWriter(new FileWriter(OUTPUT_FILE));
		str1=br1.readLine(); str2=br2.readLine();
		while (str1!=null) {
			nBlocks=1+((str1.length()+1)>>1);  // Loose upper bound
			if (tmpArray1.length<nBlocks) tmpArray1 = new int[nBlocks];
			RepeatAlphabet.wobble_longPeriod(str1,str2,WOBBLE_LENGTH,IO.quantum,old2new,alphabet_new,lastUnique_new,lastPeriodic_new,lastAlphabet_new,bw,tmpArray1,tmpArray2,tmpArray3);
			str1=br1.readLine(); str2=br2.readLine();
		}
		br1.close(); br2.close(); bw.close();
		// Without the guard, a chunk with zero blocks would print "NaN%".
		percent=tmpArray3[1]==0?0.0:(100.0*tmpArray3[0])/tmpArray3[1];
		System.err.println("Applied wobbling to "+tmpArray3[0]+" blocks out of "+tmpArray3[1]+" total ("+percent+"%)");
	}

}
58 changes: 58 additions & 0 deletions src/de/mpi_cbg/revant/apps/WobbleLongPeriodCreateAlphabet1.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package de.mpi_cbg.revant.apps;

import de.mpi_cbg.revant.util.Math;
import de.mpi_cbg.revant.util.IO;
import java.io.*;

/**
 * Prints a bitvector with one bit per character of the alphabet, in which a character
 * is marked iff, in some translation of the chunk, it is adjacent to a long-period
 * tandem or it belongs to a long-period tandem. The marking itself is delegated to
 * $RepeatAlphabet.wobble_longPeriod_markAlphabet()$.
 *
 * This is designed to work on a chunk of reads.
 */
public class WobbleLongPeriodCreateAlphabet1 {

	public static void main(String[] args) throws IOException {
		final String ALPHABET_FILE = args[0];  // Of all reads
		final String TRANSLATED_READS_CHARACTERS_FILE = args[1];  // Of a chunk of reads
		final String TANDEMS_FILE = args[2];  // Of a chunk of reads. Non-periodic only.
		final String READ_LENGTHS_FILE = args[3];  // Of a chunk of reads
		final String OUTPUT_FILE = args[4];

		final RepeatAlphabet.Character scratchCharacter = new RepeatAlphabet.Character();
		RepeatAlphabet.deserializeAlphabet(ALPHABET_FILE,2);
		boolean[] marked = new boolean[RepeatAlphabet.lastAlphabet+1];
		Math.set(marked,RepeatAlphabet.lastAlphabet,false);

		// The input files are scanned only if the alphabet has characters after the
		// last periodic one; otherwise every bit stays at zero.
		if (RepeatAlphabet.lastAlphabet>RepeatAlphabet.lastPeriodic) {
			int[] scratch = new int[100];  // Arbitrary
			BufferedReader translations = new BufferedReader(new FileReader(TRANSLATED_READS_CHARACTERS_FILE));
			BufferedReader tandems = new BufferedReader(new FileReader(TANDEMS_FILE));
			BufferedReader lengths = new BufferedReader(new FileReader(READ_LENGTHS_FILE));
			String translationLine = translations.readLine();
			String tandemsLine = tandems.readLine();
			String lengthLine = lengths.readLine();
			int nReads = 0;
			// The three files are read in lockstep, one line of each per read.
			while (translationLine!=null) {
				final int blocksBound = ((translationLine.length()+1)>>1)+1;  // Loose upper bound
				if (blocksBound>scratch.length) scratch = new int[blocksBound];
				RepeatAlphabet.wobble_longPeriod_markAlphabet(translationLine,tandemsLine,Integer.parseInt(lengthLine),marked,scratchCharacter,scratch);
				if (++nReads%10000==0) System.err.println("Processed "+nReads+" reads");
				translationLine=translations.readLine();
				tandemsLine=tandems.readLine();
				lengthLine=lengths.readLine();
			}
			translations.close();
			tandems.close();
			lengths.close();
		}

		// One line per character of the alphabet: 1=marked, 0=not marked.
		BufferedWriter output = new BufferedWriter(new FileWriter(OUTPUT_FILE));
		for (int c=0; c<=RepeatAlphabet.lastAlphabet; c++) {
			if (marked[c]) output.write("1\n");
			else output.write("0\n");
		}
		output.close();
	}

}

0 comments on commit cec5a36

Please sign in to comment.