Skip to content

Commit

Permalink
fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
fcunial committed Aug 20, 2023
1 parent a86cfac commit 44ae8ad
Show file tree
Hide file tree
Showing 11 changed files with 127 additions and 115 deletions.
2 changes: 1 addition & 1 deletion scripts/6-repeatAlphabet/1-buildAlphabet.sh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ SORT_OPTIONS=""
for i in $(seq 1 9); do # Should be in sync with the serialization of $Character$.
SORT_OPTIONS="${SORT_OPTIONS} -k ${i},${i}n"
done
MIN_CHARACTER_FREQUENCY=$(( ${HAPLOTYPE_COVERAGE} / 2 ))
MIN_CHARACTER_FREQUENCY=$(( ${HAPLOTYPE_COVERAGE} / 2 )) # Arbitrary
rm -f ${TMPFILE_PATH}*


Expand Down
21 changes: 10 additions & 11 deletions scripts/6-repeatAlphabet/2-fixEndBlocks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,19 @@
#
INPUT_DIR=$1
BROKEN_READS=$2 # 1=TRUE
HAPLOTYPE_COVERAGE=$3 # Of one haplotype
TIGHT_MODE="0"
LOW_QUALITY_TYPE=$4 # 1=replacement, 0=insertion.
LOW_QUALITY_TYPE=$3 # 1=replacement, 0=insertion.
LOW_QUALITY_LENGTH_TOLERANCE="200" # bps
MIN_K=$5 # One plus the min length of a context used for disambiguation
MAX_K=$6 # One plus the max length of a context used for disambiguation
N_THREADS=$7
DELETE_TMP_FILES=$8
GENOME_LENGTH=$9
N_HAPLOTYPES=${10}
MIN_K=$4 # One plus the min length of a context used for disambiguation
MAX_K=$5 # One plus the max length of a context used for disambiguation
N_THREADS=$6
DELETE_TMP_FILES=$7
GENOME_LENGTH=$8
N_HAPLOTYPES=$9
TIGHT_MODE="0"
SPANNING_BPS="150" # Bps before and after a k-mer to consider it observed in a read.
# ------------------------------------ REVANT --------------------------------------------
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib/*.jar"
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib"
REVANT_LIBRARIES="${REVANT_LIBRARIES}/commons-numbers-gamma-1.1.jar:${REVANT_LIBRARIES}/commons-rng-sampling-1.5.jar:${REVANT_LIBRARIES}/commons-statistics-distribution-1.0.jar"
# ----------------------------------------------------------------------------------------

set -o pipefail; set -e; set -u
Expand All @@ -43,7 +43,6 @@ READS_TRANSLATED_FILE="${INPUT_DIR}/reads-translated-new.txt"
READS_BOUNDARIES_FILE="${INPUT_DIR}/reads-translated-boundaries-new.txt"
READS_DISAMBIGUATED_FILE="${INPUT_DIR}/reads-translated-disambiguated.txt"
ALPHABET_FILE="${INPUT_DIR}/alphabet-cleaned.txt"
MIN_FREQUENCY_UNIQUE=$(( ${HAPLOTYPE_COVERAGE} / 2 ))
rm -f ${TMPFILE_PATH}*

function kmersThread() {
Expand Down
5 changes: 3 additions & 2 deletions scripts/6-repeatAlphabet/3-getUniqueSubstrings.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ UNIQUE_MODE="1" # Non-repetitive blocks are allowed in a k-mer, except at the f
# position of the k-mer. Usually a good choice.
SPANNING_BPS="150" # Bps before and after a k-mer to consider it observed in a read.
# ------------------------------------ REVANT --------------------------------------------
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib/*.jar"
REVANT_LIBRARIES="${REVANT_BINARIES}/../lib"
REVANT_LIBRARIES="${REVANT_LIBRARIES}/commons-numbers-gamma-1.1.jar:${REVANT_LIBRARIES}/commons-rng-sampling-1.5.jar:${REVANT_LIBRARIES}/commons-statistics-distribution-1.0.jar"
# ----------------------------------------------------------------------------------------


Expand Down Expand Up @@ -71,7 +72,7 @@ function intervalsThread() {
local LOCAL_UNIQUE_KMERS_FILE=$5
local LOCAL_K_MINUS_ONE_INTERVALS_FILE=$6
local LOCAL_INTERVALS_FILE=$7
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}" de.mpi_cbg.revant.apps.GetShortestUniqueIntervals ${LOCAL_K} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} ${LOCAL_UNIQUE_KMERS_FILE} ${HAPLOTYPE_COVERAGE} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD} ${LOCAL_K_MINUS_ONE_INTERVALS_FILE} ${LOCAL_INTERVALS_FILE}
java ${JAVA_RUNTIME_FLAGS} -classpath "${REVANT_BINARIES}:${REVANT_LIBRARIES}" de.mpi_cbg.revant.apps.GetShortestUniqueIntervals ${LOCAL_K} ${LOCAL_TRANSLATED_READS_FILE} ${LOCAL_BOUNDARIES_FILE} ${LOCAL_READ_LENGTHS_FILE} ${ALPHABET_FILE} ${LOCAL_UNIQUE_KMERS_FILE} ${N_READS} ${AVG_READ_LENGTH} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD} ${LOCAL_K_MINUS_ONE_INTERVALS_FILE} ${LOCAL_INTERVALS_FILE}
if [ $? -ne 0 ]; then
exit
fi
Expand Down
2 changes: 1 addition & 1 deletion scripts/6-repeatAlphabet/master.sh
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ fi
./1-buildAlphabet.sh ${INPUT_DIR} ${BROKEN_READS} ${MAX_ALIGNMENT_ERROR} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${N_HAPLOTYPES} ${HAPLOTYPE_COVERAGE} ${N_THREADS} ${DELETE_TMP_FILES} ${MAX_SPACER_LENGTH} ${WOBBLE_LENGTH} ${FIX_TANDEM_SPACERS} ${CONCATENATE_BLOCKS}
PERIODIC_ENDPOINTS_FIXED=$(cat ${INPUT_DIR}/buildAlphabet-tmp-return.txt)
MIN_K_FOR_DISAMBIGUATION="2"; MAX_K_FOR_DISAMBIGUATION="4"
./2-fixEndBlocks.sh ${INPUT_DIR} ${BROKEN_READS} ${HAPLOTYPE_COVERAGE} ${LOW_QUALITY_TYPE} ${MIN_K_FOR_DISAMBIGUATION} ${MAX_K_FOR_DISAMBIGUATION} ${N_THREADS} ${DELETE_TMP_FILES} ${GENOME_LENGTH} ${N_HAPLOTYPES}
./2-fixEndBlocks.sh ${INPUT_DIR} ${BROKEN_READS} ${LOW_QUALITY_TYPE} ${MIN_K_FOR_DISAMBIGUATION} ${MAX_K_FOR_DISAMBIGUATION} ${N_THREADS} ${DELETE_TMP_FILES} ${GENOME_LENGTH} ${N_HAPLOTYPES}
./3-getUniqueSubstrings.sh ${INPUT_DIR} ${GENOME_LENGTH} ${N_HAPLOTYPES} ${MAX_K_UNIQUE} ${N_THREADS} ${DELETE_TMP_FILES} ${IDENTITY_THRESHOLD} ${DISTANCE_THRESHOLD} ${CHARACTER_THRESHOLD}
./4-filterAlignments.sh ${INPUT_DIR} ${BROKEN_READS} ${PERIODIC_ENDPOINTS_FIXED} ${MIN_ALIGNMENT_LENGTH_READ_READ} ${MIN_ALIGNMENT_LENGTH_READ_REPEAT} ${MAX_K_UNIQUE} ${ALIGNMENT_FILTERING_MODE} ${MIN_INTERSECTION_NONREPETITIVE} ${N_THREADS} ${DELETE_TMP_FILES}
READ_LENGTHS_FILE="${INPUT_DIR}/reads-lengths.txt"
Expand Down
2 changes: 1 addition & 1 deletion src/de/mpi_cbg/revant/apps/BuildAssemblyGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public static void main(String[] args) throws IOException {



if (Alignments.readA==441) {
if (Alignments.readA==1110) {
String type2 = "";
if (Alignments.startA<=100) type2="PREFIX";
else if (Alignments.endA>=Reads.getReadLength(Alignments.readA-1)-100) type2="SUFFIX";
Expand Down
2 changes: 1 addition & 1 deletion src/de/mpi_cbg/revant/apps/CollectKmers.java
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public static void main(String[] args) throws IOException {
else lastAvoidedInterval=-1;
RepeatAlphabet.loadBoundaries(str3);
readLength=Integer.parseInt(str4);
RepeatAlphabet.getKmers(str1,K,kmers,null,avoidedIntervals,lastAvoidedInterval,-1,readLength,RepeatAlphabet.boundaries,-1/*argument not used*/,-1/*argument not used*/,-1.0/*argument not used*/,tmpKmer,tmpArray2,tmpArray3,tmpMap,tmpChar);
RepeatAlphabet.getKmers(str1,K,kmers,null,avoidedIntervals,lastAvoidedInterval,readLength,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,-1/*argument not used*/,RepeatAlphabet.boundaries,-1/*argument not used*/,-1/*argument not used*/,-1.0/*argument not used*/,tmpKmer,tmpArray2,tmpArray3,tmpMap,tmpChar);
str1=br1.readLine(); str2=INTERVALS_FILE_EXISTS?br2.readLine():null;
str3=br3.readLine(); str4=br4.readLine(); row++;
}
Expand Down
53 changes: 5 additions & 48 deletions src/de/mpi_cbg/revant/apps/CompactKmers.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package de.mpi_cbg.revant.apps;

import java.io.*;
import org.apache.commons.statistics.distribution.PoissonDistribution;

import de.mpi_cbg.revant.util.Math;

Expand Down Expand Up @@ -39,6 +38,7 @@ public static void main(String[] args) throws IOException {
String str;
BufferedReader br;
BufferedWriter bw;
RepeatAlphabet.Kmer kmer = new RepeatAlphabet.Kmer();
int[] previous, current, tmpArray;
long[] histogram;
String[] tokens;
Expand Down Expand Up @@ -78,8 +78,9 @@ public static void main(String[] args) throws IOException {
previousSameReadCount=Math.max(previousSameReadCount,sameReadCount);
}
else {
kmer.set(previous,K,previousCount);
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?isFrequent(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
Expand All @@ -91,8 +92,9 @@ public static void main(String[] args) throws IOException {
str=br.readLine();
}
br.close();
kmer.set(previous,K,previousCount);
if ( (DISCARD_SAME_READ_KMERS?previousSameReadCount==1:true) &&
(KEEP_ALL_FREQUENT?isFrequent(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(isUnique(previousCount,previous,K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
(KEEP_ALL_FREQUENT?kmer.isFrequent(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL):(kmer.isUnique(K,N_READS,AVG_READ_LENGTH,SPANNING_BPS,GENOME_LENGTH,N_HAPLOTYPES,SIGNIFICANCE_LEVEL)!=-1))
) {
for (i=0; i<K; i++) bw.write(previous[i]+",");
bw.write(previousCount+"\n");
Expand All @@ -105,50 +107,5 @@ public static void main(String[] args) throws IOException {
bw.close();
}
}


/**
* @param nReads in the entire dataset;
* @param spanningBps basepairs before and after $kmer$ for it to be considered
* observed in a read;
* @param genomeLength of one haplotype;
* @return -1 if the p-value of $count$ fails the two-sided significance test for
* every ploidy of $kmer$ (assuming a Poisson distribution for every possible ploidy);
* this happens if $kmer$ is noise or a repeat; otherwise, the smallest ploidy of
* $kmer$ for which the significance test does not fail.
*/
private static final int isUnique(int count, int[] kmer, int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, double significanceLevel) {
int i, length;
double p;
PoissonDistribution distribution;

length=0;
for (i=0; i<k; i++) RepeatAlphabet.alphabet[kmer[i]].getLength();
final double base = ((double)(avgReadLength-((spanningBps)<<1)-length))/genomeLength;
final int quantum = nReads/nHaplotypes;
for (i=0; i<nHaplotypes; i++) {
distribution=PoissonDistribution.of(base*quantum*(i+1));
p=distribution.cumulativeProbability(count);
p=2.0*Math.min(p,1.0-p);
if (p>significanceLevel) return i+1;
}
return -1;
}


/**
* Just a one-sided test on the model with one haplotype.
*/
private static final boolean isFrequent(int count, int[] kmer, int k, int nReads, int avgReadLength, int spanningBps, long genomeLength, int nHaplotypes, double significanceLevel) {
int i, length;
PoissonDistribution distribution;

length=0;
for (i=0; i<k; i++) RepeatAlphabet.alphabet[kmer[i]].getLength();
final double base = ((double)(avgReadLength-((spanningBps)<<1)-length))/genomeLength;
final int quantum = nReads/nHaplotypes;
distribution=PoissonDistribution.of(base*quantum);
return distribution.cumulativeProbability(count)>significanceLevel;
}

}
17 changes: 10 additions & 7 deletions src/de/mpi_cbg/revant/apps/GetShortestUniqueIntervals.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,15 @@ public static void main(String[] args) throws IOException {
final String READ_LENGTHS_FILE = args[3];
final String ALPHABET_FILE = args[4];
final String UNIQUE_KMERS_FILE = args[5];
final int HAPLOTYPE_COVERAGE = Integer.parseInt(args[6]);
final int IDENTITY_THRESHOLD = Integer.parseInt(args[7]);
final int DISTANCE_THRESHOLD = Integer.parseInt(args[8]);
final double CHARACTER_FRACTION = Double.parseDouble(args[9]);
final String OLD_INTERVALS_FILE = args[10]; // NULL to discard it
final String NEW_INTERVALS_FILE = args[11]; // Output
final int N_READS = Integer.parseInt(args[6]);
final int AVG_READ_LENGTH = Integer.parseInt(args[7]);
final long GENOME_LENGTH = Long.parseLong(args[8]); // One haplotype
final int N_HAPLOTYPES = Integer.parseInt(args[9]);
final int IDENTITY_THRESHOLD = Integer.parseInt(args[10]);
final int DISTANCE_THRESHOLD = Integer.parseInt(args[11]);
final double CHARACTER_FRACTION = Double.parseDouble(args[12]);
final String OLD_INTERVALS_FILE = args[13]; // NULL to discard it
final String NEW_INTERVALS_FILE = args[14]; // Output

boolean OLD_INTERVALS_FILE_EXISTS = !OLD_INTERVALS_FILE.equalsIgnoreCase("null");

Expand Down Expand Up @@ -91,7 +94,7 @@ public static void main(String[] args) throws IOException {
else lastUniqueInterval=-1;
RepeatAlphabet.loadBoundaries(str3);
readLength=Integer.parseInt(str4);
lastUniqueInterval=RepeatAlphabet.getKmers(str1,K,null,kmers,uniqueIntervals,lastUniqueInterval,HAPLOTYPE_COVERAGE,readLength,RepeatAlphabet.boundaries,IDENTITY_THRESHOLD,DISTANCE_THRESHOLD,CHARACTER_FRACTION,tmpKmer,tmpArray2,tmpArray3,null,tmpChar);
lastUniqueInterval=RepeatAlphabet.getKmers(str1,K,null,kmers,uniqueIntervals,lastUniqueInterval,readLength,N_READS,AVG_READ_LENGTH,GENOME_LENGTH,N_HAPLOTYPES,RepeatAlphabet.boundaries,IDENTITY_THRESHOLD,DISTANCE_THRESHOLD,CHARACTER_FRACTION,tmpKmer,tmpArray2,tmpArray3,null,tmpChar);
if (lastUniqueInterval>0) {
nPairs=(lastUniqueInterval+1)/3;
if (pairs.length<nPairs) {
Expand Down
Loading

0 comments on commit 44ae8ad

Please sign in to comment.