diff --git a/data/test.gff b/data/test.gff index ee33aabc..9a5f0d9f 100644 --- a/data/test.gff +++ b/data/test.gff @@ -1,7 +1,7 @@ ## GFF3 file format -test Genbank primer_binding_site 2 290 . + . ID=id-test1;Note=daft punk -test Genbank long_terminal_repeat 1 55 . + . ID=id-test2;Note=random access memories -test Genbank CDS 2 292 . + . ID=id-test3;Note=eluvite -test Genbank CDS 1 55 . + . ID=id-test4;Note=ategnato -test Genbank CDS 2 292 . + . ID=id-testedit1;Note=PinkFloyd;EditPosition=100;EditSequence=A -test Genbank CDS 2 292 . + . ID=id-testedit2;Note=AnotherBrickInTheWall;EditPosition=102;EditSequence=AA \ No newline at end of file +test Genbank primer_binding_site 2 290 . + . ID=id-test1;Note=daft punk;Parent=geneFragmentsOfTime; +test Genbank long_terminal_repeat 1 55 . + . ID=id-test2;Note=random access memories;Parent=geneGetLucky; +test Genbank CDS 2 292 . + . ID=id-test3;Note=eluvite;gene=geneKatyPerry; +test Genbank CDS 1 55 . + . ID=id-test4;Note=ategnato;gene=geneKatyPerry; +test Genbank CDS 2 292 . + . ID=id-testedit1;Note=PinkFloyd;EditPosition=100;EditSequence=A; +test Genbank CDS 2 292 . + . ID=id-testedit2;Note=AnotherBrickInTheWall;EditPosition=102;EditSequence=AA;gene=geneTaylorSwift; diff --git a/src/call_variants.cpp b/src/call_variants.cpp index 46d10555..4f7ab4d6 100644 --- a/src/call_variants.cpp +++ b/src/call_variants.cpp @@ -28,6 +28,7 @@ int call_variants_from_plup(std::istream &cin, std::string out_file, uint8_t min ref_antd refantd(ref_path, gff_path); std::ostringstream out_str; std::ofstream fout((out_file+".tsv").c_str()); + //make columns in the file fout << "REGION" "\tPOS" "\tREF" @@ -47,6 +48,7 @@ int call_variants_from_plup(std::istream &cin, std::string out_file, uint8_t min "\tREF_AA" "\tALT_CODON" "\tALT_AA" + "\tPOS_AA" << std::endl; int ctr = 0; int64_t pos = 0; @@ -118,6 +120,7 @@ int call_variants_from_plup(std::istream &cin, std::string out_file, uint8_t min freq_depth = get_frequency_depth(*it, pdepth, mdepth); if(freq_depth[0] < min_threshold) continue; + //this only adds the first bit of the information to the tsv out_str << region << "\t"; out_str << pos << "\t"; out_str << ref << "\t"; @@ -135,6 +138,7 @@ int call_variants_from_plup(std::istream &cin, std::string out_file, uint8_t min Exp | Error | Err free | Obs | AD | RD | */ + //adding pvalue related info err = pow(10, ( -1 * (it->mean_qual)/10)); kt_fisher_exact((err * mdepth), (1-err) * mdepth, it->depth, ref_it->depth, &pval_left, &pval_right, &pval_twotailed); out_str << pval_left << "\t"; @@ -143,10 +147,11 @@ int call_variants_from_plup(std::istream &cin, std::string out_file, uint8_t min } else { out_str << "FALSE" << "\t"; } + //adding codon related info, can add gene info here too if(it->nuc[0] != '+' && it->nuc[0] != '-'){ refantd.codon_aa_stream(region, out_str, fout, pos, it->nuc[0]); } else { - fout << out_str.str() << "NA\tNA\tNA\tNA\tNA" << std::endl; + fout << out_str.str() << "NA\tNA\tNA\tNA\tNA\tNA" << std::endl; } out_str.str(""); out_str.clear(); diff --git a/src/ref_seq.cpp b/src/ref_seq.cpp index 1b3d22a4..d52ef05c 100644 --- a/src/ref_seq.cpp +++ b/src/ref_seq.cpp @@ -109,23 +109,35 @@ ref_antd::~ref_antd() if (this->fai) fai_destroy(this->fai); } +//used to add codon info to variants output int ref_antd::codon_aa_stream(std::string region, std::ostringstream &line_stream, std::ofstream &fout, int64_t pos, char alt){ std::vector features = gff.query_features(pos, "CDS"); if(features.size() == 0){ // No matching CDS - fout << line_stream.str() << "NA\tNA\tNA\tNA\tNA" << std::endl; + fout << line_stream.str() << "NA\tNA\tNA\tNA\tNA\tNA" << std::endl; return 0; } std::vector::iterator it; char *ref_codon, *alt_codon; for(it = features.begin(); it != features.end(); it++){ fout << line_stream.str(); - fout << it->get_attribute("ID") << "\t"; + //add in gene level info, control for case it's not present + std::string gene = it->get_attribute("gene"); + if (gene.empty()){ + fout << it->get_attribute("ID") << "\t"; + } else { + fout << gene + ":" + it->get_attribute("ID") << "\t"; + } ref_codon = this->get_codon(pos, region, *it); fout << ref_codon[0] << ref_codon[1] << ref_codon[2] << "\t"; fout << codon2aa(ref_codon[0], ref_codon[1], ref_codon[2]) << "\t"; alt_codon = this->get_codon(pos, region, *it, alt); fout << alt_codon[0] << alt_codon[1] << alt_codon[2] << "\t"; - fout << codon2aa(alt_codon[0], alt_codon[1], alt_codon[2]); + fout << codon2aa(alt_codon[0], alt_codon[1], alt_codon[2]) << "\t"; + + //adding amino acid position + int64_t start = it->get_start(); + int64_t aa_pos = ((pos - start) / 3)+1; + fout << aa_pos; fout << std::endl; delete[] ref_codon; diff --git a/tests/test_variants.cpp b/tests/test_variants.cpp index 4d21f01c..d7d42dc4 100755 --- a/tests/test_variants.cpp +++ b/tests/test_variants.cpp @@ -12,11 +12,13 @@ int call_var_check_outfile(std::string prefix, uint8_t min_qual, uint8_t min_dep std::string l; getline(outFile, l); // Ignore first line int comp = 0, ctr = 0; + while(ctr < len){ getline(outFile, l); std::cout << l << std::endl; std::cout << out[ctr] << " -> CORRECT" << std::endl; comp += l.compare(out[ctr]); + std::cout << l.compare(out[ctr]) << "\n"; ctr++; } return comp; @@ -25,20 +27,20 @@ int call_var_check_outfile(std::string prefix, uint8_t min_qual, uint8_t min_dep int main() { int num_success = 0; // Quality threshold 20. Frequency threshold: 0.03. Total_DP = 3. Indel passes filters with total_depth 4. Has two lines. - std::string t_20_02_1[4] = {"test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-test3\tAAG\tK\tATG\tM", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit2\tAGA\tR\tTGA\t*", "test\t210\tA\t+GT\t1\t1\t41\t1\t0\t20\t0.25\t4\t0.4\tFALSE\tNA\tNA\tNA\tNA\tNA"}; + std::string t_20_02_1[4] = {"test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tgeneKatyPerry:id-test3\tAAG\tK\tATG\tM\t70", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD\t70", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tgeneTaylorSwift:id-testedit2\tAGA\tR\tTGA\t*\t70", "test\t210\tA\t+GT\t1\t1\t41\t1\t0\t20\t0.25\t4\t0.4\tFALSE\tNA\tNA\tNA\tNA\tNA\tNA"}; // Quality threshold 3-. Frequency threshold: 0.03. Total_DP = 3. Freq = 0.666667 No Indel - std::string t_20_03[3] = {"test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-test3\tAAG\tK\tATG\tM", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit2\tAGA\tR\tTGA\t*"}; + std::string t_20_03[3] = {"test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tgeneKatyPerry:id-test3\tAAG\tK\tATG\tM\t70", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD\t70", "test\t210\tA\tT\t1\t1\t41\t2\t1\t40\t0.666667\t3\t0.2\tFALSE\tgeneTaylorSwift:id-testedit2\tAGA\tR\tTGA\t*\t70"}; // Quality threshold 25. Frequency threshold: 0.03. Total_DP = 2. Freq = 0.5 No Indel. - std::string t_25_03[3] = {"test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tid-test3\tAAG\tK\tATG\tM", "test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD", "test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tid-testedit2\tAGA\tR\tTGA\t*"}; + std::string t_25_03[3] = {"test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tgeneKatyPerry:id-test3\tAAG\tK\tATG\tM\t70", "test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tid-testedit1\tGAA\tE\tGAT\tD\t70", "test\t210\tA\tT\t1\t1\t41\t1\t1\t58\t0.5\t2\t0.4\tFALSE\tgeneTaylorSwift:id-testedit2\tAGA\tR\tTGA\t*\t70"}; // Minimum depth threshold. Should be empty std::string t_25_03_20[] = {}; num_success = call_var_check_outfile("../data/test.indel", 20, 0, 0.02, t_20_02_1, 4); std::cout << num_success << std::endl; - num_success += call_var_check_outfile("../data/test.indel", 20, 0, 0.03, t_20_03, 3); + num_success += call_var_check_outfile("../data/test.indel2", 20, 0, 0.03, t_20_03, 3); std::cout << num_success << std::endl; - num_success += call_var_check_outfile("../data/test.indel", 25, 0, 0.03, t_25_03, 3); + num_success += call_var_check_outfile("../data/test.indel3", 25, 0, 0.03, t_25_03, 3); std::cout << num_success << std::endl; - num_success += call_var_check_outfile("../data/test.indel", 25, 20, 0.03, t_25_03_20, 0); + num_success += call_var_check_outfile("../data/test.indel4", 25, 20, 0.03, t_25_03_20, 0); std::cout << num_success << std::endl; if(num_success == 0) return 0;