Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Nov 25, 2023
1 parent 4cb43e9 commit aa9e441
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 27 deletions.
8 changes: 5 additions & 3 deletions scripts/6-repeatAlphabet/1-buildAlphabet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ WOBBLE_LENGTH=${10} # 0=do not wobble
FIX_TANDEM_SPACERS=${11} # 0=assume that non-repetitive blocks near tandems are real.
CONCATENATE_BLOCKS=${12} # 0=do not try to merge adjacent blocks from the same repeat.
AVG_READ_LENGTH=${13}
GENOME_LENGTH=${14} # Of one haplotype
KEEP_PERIODIC="1" # 1=do not remove rare characters if they are periodic. Usually good.
SPANNING_BPS="150" # Bps before and after a character to consider it observed in a read.
# ---------------------------------- TANDEM SPACERS --------------------------------------
TANDEM_SPACERS_ITERATIONS="1" # >=1
NONREPETITIVE_BLOCKS_MODE="2" # Should probably be set to 1 in production; 2 is the most aggressive.
Expand Down Expand Up @@ -614,14 +616,14 @@ fi
echo "Discarding rare characters..."
COUNTS_FILE="${INPUT_DIR}/alphabet-counts.txt"
HISTOGRAM_FILE="${INPUT_DIR}/alphabet-histogram.txt"
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.GetCharacterCounts ${READS_TRANSLATED_FILE} ${ALPHABET_FILE} ${COUNTS_FILE} ${HISTOGRAM_FILE}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.GetCharacterCounts ${READS_TRANSLATED_FILE} ${READS_TRANSLATED_BOUNDARIES} ${READ_LENGTHS_FILE} ${ALPHABET_FILE} ${COUNTS_FILE} ${HISTOGRAM_FILE}
# Cleans one chunk of translated reads: runs CleanTranslatedReads1 on the chunk, then
# sorts and deduplicates the file that the Java program wrote.
#
# Arguments:
#   $1  TRANSLATED_CHARACTERS  file with the translated characters of the chunk
#   $2  TRANSLATED_BOUNDARIES  file with the corresponding character boundaries
#   $3  PREFIX_1               path prefix of the raw outputs: ${PREFIX_1}${ID}.txt is the
#                              output file given to Java, ${PREFIX_1}unique-${ID}.txt
#                              receives the program's standard output
#   $4  PREFIX_2               path prefix of the sorted, deduplicated output
#   $5  ID                     chunk identifier appended to both prefixes
#
# Reads the script-level variables ALPHABET_FILE, COUNTS_FILE, N_READS, READ_IDS_FILE,
# READ_LENGTHS_FILE, KEEP_PERIODIC, JAVA_RUNTIME_FLAGS, REVANT_BINARIES and SORT_OPTIONS,
# plus the ones named in each invocation below.
# Presumably launched once per chunk produced by `split` -- confirm against the caller.
function cleaningThread1() {
local TRANSLATED_CHARACTERS=$1
local TRANSLATED_BOUNDARIES=$2
local PREFIX_1=$3
local PREFIX_2=$4
local ID=$5
# NOTE(review): two CleanTranslatedReads1 invocations are listed back to back. They look
# like the pre- and post-change versions of one command: the first passes
# MIN_CHARACTER_FREQUENCY, the second passes AVG_READ_LENGTH, SPANNING_BPS,
# MIN_ALIGNMENT_LENGTH, GENOME_LENGTH and N_HAPLOTYPES instead. Confirm that only the
# second one is meant to run; both write to the same two output files.
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CleanTranslatedReads1 ${ALPHABET_FILE} ${COUNTS_FILE} ${N_READS} ${READ_IDS_FILE} ${READ_LENGTHS_FILE} ${TRANSLATED_CHARACTERS} ${TRANSLATED_BOUNDARIES} ${MIN_CHARACTER_FREQUENCY} ${KEEP_PERIODIC} ${PREFIX_1}${ID}.txt > ${PREFIX_1}unique-${ID}.txt
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CleanTranslatedReads1 ${ALPHABET_FILE} ${COUNTS_FILE} ${N_READS} ${READ_IDS_FILE} ${READ_LENGTHS_FILE} ${AVG_READ_LENGTH} ${SPANNING_BPS} ${MIN_ALIGNMENT_LENGTH} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${TRANSLATED_CHARACTERS} ${TRANSLATED_BOUNDARIES} ${KEEP_PERIODIC} ${PREFIX_1}${ID}.txt > ${PREFIX_1}unique-${ID}.txt
# Sort the Java output file with comma as the field separator (-t ,) and drop duplicate
# records (-u); --parallel=1 keeps each sort single-threaded. The sort keys come from
# SORT_OPTIONS, which is defined outside this function.
sort --parallel=1 -t , -u ${SORT_OPTIONS} ${PREFIX_1}${ID}.txt > ${PREFIX_2}${ID}.txt
}
split -l $(( ${N_READS} / ${N_THREADS} )) ${READS_TRANSLATED_FILE} "${TMPFILE_PATH}-12-"
Expand All @@ -639,7 +641,7 @@ ALPHABET_FILE_CLEANED="${INPUT_DIR}/alphabet-cleaned.txt"
rm -f ${ALPHABET_FILE_CLEANED}
OLD2NEW_FILE="${INPUT_DIR}/alphabet-old2new.txt"
rm -f ${OLD2NEW_FILE}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CleanTranslatedReads2 ${ALPHABET_FILE} ${COUNTS_FILE} $(wc -l < ${TMPFILE_PATH}-15.txt) ${TMPFILE_PATH}-15.txt ${MIN_CHARACTER_FREQUENCY} ${KEEP_PERIODIC} ${TMPFILE_PATH}-14-unique.txt ${ALPHABET_FILE_CLEANED} ${OLD2NEW_FILE}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.CleanTranslatedReads2 ${ALPHABET_FILE} ${COUNTS_FILE} ${N_READS} ${AVG_READ_LENGTH} ${SPANNING_BPS} ${MIN_ALIGNMENT_LENGTH} ${GENOME_LENGTH} ${N_HAPLOTYPES} $(wc -l < ${TMPFILE_PATH}-15.txt) ${TMPFILE_PATH}-15.txt ${KEEP_PERIODIC} ${TMPFILE_PATH}-14-unique.txt ${ALPHABET_FILE_CLEANED} ${OLD2NEW_FILE}
function cleaningThread3() {
local TRANSLATED_CHARACTERS_OLD=$1
local TRANSLATED_BOUNDARIES_OLD=$2
Expand Down
19 changes: 13 additions & 6 deletions src/de/mpi_cbg/revant/apps/CleanTranslatedReads1.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,19 @@ public static void main(String[] args) throws IOException {
final int N_READS = Integer.parseInt(args[2]);
final String READ_IDS_FILE = args[3];
final String READ_LENGTHS_FILE = args[4];
final String TRANSLATED_READS_CHARACTERS_FILE = args[5];
final String TRANSLATED_READS_BOUNDARIES_FILE = args[6];
final int MIN_FREQUENCY = Integer.parseInt(args[7]);
final boolean KEEP_PERIODIC = Integer.parseInt(args[8])==1;
final String OUTPUT_FILE = args[9];
final int AVG_READ_LENGTH = Integer.parseInt(args[5]);
final int SPANNING_BPS = Integer.parseInt(args[6]);
final int MIN_ALIGNMENT_LENGTH = Integer.parseInt(args[7]); // Read-repeat
final long GENOME_LENGTH = Long.parseLong(args[8]); // Of one haplotype
final int N_HAPLOTYPES = Integer.parseInt(args[9]);
final String TRANSLATED_READS_CHARACTERS_FILE = args[10];
final String TRANSLATED_READS_BOUNDARIES_FILE = args[11];
final boolean KEEP_PERIODIC = Integer.parseInt(args[12])==1;
final String OUTPUT_FILE = args[13];

final double SIGNIFICANCE_LEVEL = 0.05; // Conventional
final int MIN_MISSING_LENGTH = IO.quantum; // Arbitrary

int i;
String str1, str2;
RepeatAlphabet.Character tmpChar = new RepeatAlphabet.Character();
Expand All @@ -41,7 +48,7 @@ public static void main(String[] args) throws IOException {
bw = new BufferedWriter(new FileWriter(OUTPUT_FILE));
i=0; str1=br1.readLine(); str2=br2.readLine();
while (str1!=null) {
RepeatAlphabet.cleanTranslatedRead_collectCharacterInstances(str1,str2,Reads.readLengths[i],MIN_FREQUENCY,KEEP_PERIODIC,IO.quantum,bw,tmpChar);
RepeatAlphabet.cleanTranslatedRead_collectCharacterInstances(str1,str2,Reads.readLengths[i],KEEP_PERIODIC,IO.quantum,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,MIN_MISSING_LENGTH,SIGNIFICANCE_LEVEL,bw,tmpChar);
i++; str1=br1.readLine(); str2=br2.readLine();
}
br1.close(); br2.close(); bw.close();
Expand Down
24 changes: 16 additions & 8 deletions src/de/mpi_cbg/revant/apps/CleanTranslatedReads2.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,25 @@ public class CleanTranslatedReads2 {
public static void main(String[] args) throws IOException {
final String ALPHABET_FILE = args[0];
final String ALPHABET_COUNTS_FILE = args[1];
final int N_NEW_CHARACTERS = Integer.parseInt(args[2]);
final String NEW_CHARACTERS_FILE = args[3];
final int MIN_FREQUENCY = Integer.parseInt(args[4]);
final boolean KEEP_PERIODIC = Integer.parseInt(args[5])==1;
final String NEW_UNIQUE_FILE = args[6]; // Sorted in decreasing order
final String NEW_ALPHABET_FILE = args[7];
final String OLD2NEW_FILE = args[8]; // Map oldNonUnique -> newNonUnique
final int N_READS = Integer.parseInt(args[2]);
final int AVG_READ_LENGTH = Integer.parseInt(args[3]);
final int SPANNING_BPS = Integer.parseInt(args[4]);
final int MIN_ALIGNMENT_LENGTH = Integer.parseInt(args[5]); // Read-repeat
final long GENOME_LENGTH = Long.parseLong(args[6]); // Of one haplotype
final int N_HAPLOTYPES = Integer.parseInt(args[7]);
final int N_NEW_CHARACTERS = Integer.parseInt(args[8]);
final String NEW_CHARACTERS_FILE = args[9];
final boolean KEEP_PERIODIC = Integer.parseInt(args[10])==1;
final String NEW_UNIQUE_FILE = args[11]; // Sorted in decreasing order
final String NEW_ALPHABET_FILE = args[12];
final String OLD2NEW_FILE = args[13]; // Map oldNonUnique -> newNonUnique

final double SIGNIFICANCE_LEVEL = 0.05; // Conventional
final int MIN_MISSING_LENGTH = IO.quantum; // Arbitrary

RepeatAlphabet.deserializeAlphabet(ALPHABET_FILE,2);
RepeatAlphabet.loadAlphabetCount(ALPHABET_COUNTS_FILE,RepeatAlphabet.lastAlphabet+1);
int[] old2new = RepeatAlphabet.cleanTranslatedRead_updateAlphabet(N_NEW_CHARACTERS,NEW_CHARACTERS_FILE,MIN_FREQUENCY,KEEP_PERIODIC);
int[] old2new = RepeatAlphabet.cleanTranslatedRead_updateAlphabet(N_NEW_CHARACTERS,NEW_CHARACTERS_FILE,KEEP_PERIODIC,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,MIN_ALIGNMENT_LENGTH,MIN_MISSING_LENGTH,SIGNIFICANCE_LEVEL);

BufferedReader br = new BufferedReader(new FileReader(NEW_UNIQUE_FILE));
RepeatAlphabet.maxOpenLength_unique=Integer.parseInt(br.readLine());
Expand Down
19 changes: 9 additions & 10 deletions src/de/mpi_cbg/revant/apps/RepeatAlphabet.java
Original file line number Diff line number Diff line change
Expand Up @@ -1742,8 +1742,8 @@ public static final void loadAlphabetCount(String file, int alphabetSize) throws


/**
* Appends to $bw$ the new unique closed characters that result from removing every
* character ---------->with count smaller than $minCount$ from $read2characters$, and updates
* Appends to $bw$ the new unique closed characters that result from removing from
* $read2characters$ every character whose count fails a statistical test, and updates
* $maxOpenLength_unique$ as well.
*
* Remark: the procedure assumes that global variables $alphabet,alphabetCount$ have
Expand All @@ -1756,7 +1756,7 @@ public static final void loadAlphabetCount(String file, int alphabetSize) throws
* @param read2boundaries boundaries of the characters in $read2characters$;
* @param tmpChar temporary space.
*/
public static final void cleanTranslatedRead_collectCharacterInstances(String read2characters, String read2boundaries, int readLength, boolean keepPeriodic, int quantum, BufferedWriter bw, Character tmpChar) throws IOException {
public static final void cleanTranslatedRead_collectCharacterInstances(String read2characters, String read2boundaries, int readLength, boolean keepPeriodic, int quantum, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, double significanceLevel, BufferedWriter bw, Character tmpChar) throws IOException {
boolean isUnique;
int i, j, k;
int c, length, first, nBlocks, nBoundaries;
Expand All @@ -1765,9 +1765,7 @@ public static final void cleanTranslatedRead_collectCharacterInstances(String re
if (read2characters.length()==0) return;
nBoundaries=loadBoundaries(read2boundaries)+1;
nBlocks=loadBlocks(read2characters);
--->removeRareCharacters(nBlocks,lastUnique,lastPeriodic,lastAlphabet,keepPeriodic);
removeRareCharacters(int nBlocks, int lastUnique, int lastPeriodic, int lastAlphabet, boolean keepPeriodic, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, double significanceLevel)

removeRareCharacters(nBlocks,lastUnique,lastPeriodic,lastAlphabet,keepPeriodic,nReads,avgReadLength,spanningBps,genomeLength,nHaplotypes,minAlignmentLength,minMissingLength,significanceLevel);
first=-1;
for (i=0; i<nBlocks; i++) {
if (lastInBlock[i]==-1) isUnique=true;
Expand Down Expand Up @@ -1871,8 +1869,9 @@ private static final void removeRareCharacters(int nBlocks, int lastUnique, int


/**
* Removes from $alphabet$ all characters with $alphabetCount < minCount$, and adds to
* $alphabet$ all the new unique characters in $newCharactersFile$.
* Removes from $alphabet$ all characters whose counts are too low according to a
* statistical test, and adds to $alphabet$ all the new unique characters in
* $newCharactersFile$.
*
* Remark: $alphabetCount$ is not valid after the procedure completes.
*
Expand All @@ -1882,7 +1881,7 @@ private static final void removeRareCharacters(int nBlocks, int lastUnique, int
* the new alphabet). Positions are relative to the corresponding values of
* $lastUnique+1$.
*/
public static final int[] cleanTranslatedRead_updateAlphabet(int nNewCharacters, String newCharactersFile, int minCount, boolean keepPeriodic) throws IOException {
public static final int[] cleanTranslatedRead_updateAlphabet(int nNewCharacters, String newCharactersFile, boolean keepPeriodic, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, int minAlignmentLength, int minMissingLength, double significanceLevel) throws IOException {
int i, j;
int lastPeriodicPrime;
String str;
Expand All @@ -1900,7 +1899,7 @@ public static final int[] cleanTranslatedRead_updateAlphabet(int nNewCharacters,
}
else { j=lastUnique; lastPeriodicPrime=-1; }
for (i=keepPeriodic?lastPeriodic+1:lastUnique+1; i<=lastAlphabet; i++) {
if (alphabetCount[i]<minCount) continue;
if (alphabet[i].isRare(alphabetCount[i][0],alphabetCount[i][1],nReads,avgReadLength,spanningBps,genomeLength,nHaplotypes,minAlignmentLength,minMissingLength,significanceLevel)) continue;
j++;
tmpChar=alphabet[j];
alphabet[j]=alphabet[i];
Expand Down

0 comments on commit aa9e441

Please sign in to comment.