Skip to content

Commit

Permalink
changed klet counting behaviour
Browse files Browse the repository at this point in the history
  • Loading branch information
bjmt committed May 3, 2019
1 parent 7a6444a commit 5f0aa25
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 36 deletions.
5 changes: 5 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
2019-05-03 Benjamin Jean-Marie Tremblay <[email protected]>

* Removed default entries from getopt switch
* shuffler (euler) no longer calls make_klets(), making it faster and more
memory-efficient at higher k
* shuffler version bumped to 1.2
* Replaced all instances of count_klets() with count_klets2(), which does not
require a vector<string> of the actual k-lets to do counting

2019-05-02 Benjamin Jean-Marie Tremblay <[email protected]>

Expand Down
18 changes: 9 additions & 9 deletions README
Original file line number Diff line number Diff line change
Expand Up @@ -238,47 +238,47 @@ A random DNA string with 100,000 characters was used as input.
shuffler 1428 0.01

2 uShuffle 3796 0.00
shuffler (e) 3012 0.01
shuffler (e) 2808 0.01
shuffler (l) 1760 0.01
shuffler (m) 2100 0.04

3 uShuffle 3796 0.01
shuffler (e) 3036 0.02
shuffler (e) 2824 0.02
shuffler (l) 1556 0.00
shuffler (m) 2216 0.04

4 uShuffle 3780 0.01
shuffler (e) 2788 0.02
shuffler (e) 2764 0.02
shuffler (l) 1520 0.00
shuffler (m) 2268 0.06

5 uShuffle 3792 0.01
shuffler (e) 2992 0.02
shuffler (e) 2840 0.02
shuffler (l) 1644 0.01
shuffler (m) 2188 0.10

6 uShuffle 3820 0.01
shuffler (e) 3396 0.03
shuffler (e) 3052 0.03
shuffler (l) 1488 0.00
shuffler (m) 2328 0.28

7 uShuffle 3908 0.02
shuffler (e) 4768 0.04
shuffler (e) 3612 0.03
shuffler (l) 1592 0.00
shuffler (m) 3508 1.08

8 uShuffle 4296 0.05
shuffler (e) 8876 0.06
shuffler (e) 5220 0.05
shuffler (l) 1612 0.00
shuffler (m) 7224 4.20

9 uShuffle 5384 0.08
shuffler (e) 27436 0.16
shuffler (e) 14068 0.15
shuffler (l) 1468 0.00
shuffler (m) 23528 13.11

10 uShuffle 6380 0.10
shuffler (e) 102148 0.43
shuffler (e) 46660 0.32
shuffler (l) 1608 0.00
shuffler (m) 88148 2:12.22

Expand Down
2 changes: 1 addition & 1 deletion src/countlets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ int main(int argc, char **argv) {
alphlen = lets_uniq.size();

klets = make_klets(lets_uniq, k);
counts = count_klets(letters, klets, lets_uniq, k, alphlen);
counts = count_klets2(letters, lets_uniq, k, alphlen);

} else {

Expand Down
4 changes: 2 additions & 2 deletions src/countwin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ int main(int argc, char **argv) {
cerr << "Run countwin -h to see usage." << endl;
exit(EXIT_FAILURE);
}
counts = count_klets(vector<char>(seq.begin(), seq.end()), klets, lets_uniq, k, alphlen);
counts = count_klets2(vector<char>(seq.begin(), seq.end()), lets_uniq, k, alphlen);
if (has_out) {
outfile << make_row(to_string(START), to_string(STOP), counts, klets, nozero);
} else {
Expand All @@ -231,7 +231,7 @@ int main(int argc, char **argv) {

if (seq.length() < k) break;

counts = count_klets(vector<char>(seq.begin(), seq.end()), klets, lets_uniq, k, alphlen);
counts = count_klets2(vector<char>(seq.begin(), seq.end()), lets_uniq, k, alphlen);

STOP = START + seq.length() - 1;

Expand Down
4 changes: 2 additions & 2 deletions src/klets.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ vector<string> make_klets(vector<char> lets_uniq, int k) {

}

vector<int> count_klets(vector<char> letters, vector<string> klets,
vector<char> lets_uniq, int k, int alphlen) {
vector<int> count_klets2(vector<char> letters, vector<char> lets_uniq, int k,
int alphlen) {

/* Scales very well with increasing k, but requires having the entire
* sequence in memory.
Expand Down
4 changes: 2 additions & 2 deletions src/klets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

std::vector<std::string> make_klets(std::vector<char> lets_uniq, int k);

std::vector<int> count_klets(std::vector<char> letters, std::vector<std::string> klets,
std::vector<char> lets_uniq, int k, int alphlen);
std::vector<int> count_klets2(std::vector<char> letters, std::vector<char> lets_uniq,
int k, int alphlen);

#endif
26 changes: 11 additions & 15 deletions src/shuffle_euler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ vector<vector<int>> fill_vertices(vector<vector<int>> edgelist,

vector<int> walk_euler(vector<vector<int>> edgelist, int seqlen, int k,
vector<char> lets_uniq, default_random_engine gen, vector<int> last_letsi,
string firstl, string lastl, int lasti) {
string firstl, int lasti) {

vector<int> out_i;
int alphlen = lets_uniq.size();
Expand Down Expand Up @@ -207,23 +207,19 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, int k,

int seqlen = letters.size();
int alphlen, nlets, nletsm1;
int lasti{-1};
int lasti{0};
vector<int> let_counts, last_letsi, out_i;
vector<char> lets_uniq;
set<int> lets_set;
vector<string> klets, kletsm1;
vector<vector<int>> edgelist;
string firstl, lastl, out;
string firstl, out;

/* the first and last k-1 letters remain unchanged; these are special vertices
* which only have a single directed edge to them
*/
for (int i = 0; i < k - 1; ++i) {
firstl += letters[i];
}
for (int i = seqlen - k + 1; i < seqlen; ++i) {
lastl += letters[i];
}

for (int i = 0; i < seqlen; ++i) {
lets_set.insert(letters[i]);
Expand All @@ -234,14 +230,14 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, int k,
nlets = pow(alphlen, k);
nletsm1 = pow(alphlen, k - 1);

klets = make_klets(lets_uniq, k);
let_counts = count_klets(letters, klets, lets_uniq, k, alphlen);
kletsm1 = make_klets(lets_uniq, k - 1); /* these are the vertices */
let_counts = count_klets2(letters, lets_uniq, k, alphlen);

for (int i = 0; i < nletsm1; ++i) {
if (lastl.compare(kletsm1[i]) == 0) {
lasti = i;
break;
for (int i = k - 2; i >= 0; --i) {
for (int j = 0; j < alphlen; ++j) {
if (letters[seqlen - 1 - i] == lets_uniq[j]) {
lasti += pow(alphlen, i) * j;
continue;
}
}
}

Expand Down Expand Up @@ -285,7 +281,7 @@ string shuffle_euler(vector<char> letters, default_random_engine gen, int k,

/* walk new Eulerian path */
out_i = walk_euler(edgelist2, seqlen, k, lets_uniq, gen, last_letsi, firstl,
lastl, lasti);
lasti);

/* indices --> letters */
out.reserve(out_i.size());
Expand Down
8 changes: 4 additions & 4 deletions src/shuffle_markov.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ string markov_loop(vector<string> klets, vector<string> kletsm1,

/* Perhaps experiment with using ints instead of chars here. Not convinced
* it would lead to much speed increase since in the end a new string has
* to be pasted together regardless. I'm probably wrong though.
* to be pasted together regardless. I'm very probably wrong though.
*/

if (tmp_let.compare(kletsm1[j]) == 0) {
Expand All @@ -83,7 +83,7 @@ string markov_loop(vector<string> klets, vector<string> kletsm1,

if (verbose) {
vector<string> k1lets = make_klets(lets_uniq, 1);
vector<int> k1_counts = count_klets(out_split, k1lets, lets_uniq, 1, alphlen);
vector<int> k1_counts = count_klets2(out_split, lets_uniq, 1, alphlen);
int alignlen = to_string(max_element(k1_counts.begin(), k1_counts.end())[0]).length();
cerr << " After shuffling:" << endl;
for (int i = 0; i < alphlen; ++i) {
Expand Down Expand Up @@ -131,11 +131,11 @@ string shuffle_markov(vector<char> letters, default_random_engine gen, int k,

/* count k-lets */

let_counts = count_klets(letters, klets, lets_uniq, k, alphlen);
let_counts = count_klets2(letters, lets_uniq, k, alphlen);

if (verbose) {
vector<string> k1lets = make_klets(lets_uniq, 1);
vector<int> k1_counts = count_klets(letters, k1lets, lets_uniq, 1, alphlen);
vector<int> k1_counts = count_klets2(letters, lets_uniq, 1, alphlen);
int alignlen = to_string(max_element(k1_counts.begin(), k1_counts.end())[0]).length();
cerr << "Letter counts:" << endl;
cerr << " Before shuffling:" << endl;
Expand Down
2 changes: 1 addition & 1 deletion src/shuffler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ using namespace std;

void usage() {
printf(
"shuffler v1.1 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n"
"shuffler v1.2 Copyright (C) 2019 Benjamin Jean-Marie Tremblay \n"
" \n"
"Usage: shuffler [options] -i [filename] -o [filename] \n"
" echo [string] | shuffler [options] > [filename] \n"
Expand Down

0 comments on commit 5f0aa25

Please sign in to comment.