Merge pull request #170 from DennisSchmitz/dev

Update Jovian 1.2.02
DennisSchmitz · Feb 20, 2021 · 55ccd93 · 55ccd93
2 parents 9f29963 + 0bd587b
commit 55ccd93
Show file tree

Hide file tree

Showing 15 changed files with 898 additions and 125 deletions.
diff --git a/bin/Illumina_vir_Ref.smk b/bin/Illumina_vir_Ref.smk
@@ -64,6 +64,17 @@ with open(config["sample_sheet"]) as sample_sheet_file:
 reference           =   config["reference_file"]
 reference_basename  =   os.path.splitext(os.path.basename(reference))[0]
 
+primervalue = config["primer_file"]
+
+if primervalue.upper() == "NONE":
+    with open("placeholder_primers.fasta", "w") as fileout:
+        fileout.write(""">Placeholder
+EXAMPLE""")
+    primerfile = "placeholder_primers.fasta"
+    prstatus = "NONE"
+else:
+    prstatus = "SET"
+    primerfile = primervalue
 
 #@################################################################################
 #@#### Specify Jovian's final output:                                        #####
@@ -83,12 +94,13 @@ localrules:
 
 rule all:
     input:
-        expand( "{p}{sample}_{read}.fq",
-                p       =   f"{datadir + cln}",
+        expand( "{p}{sample}_{read}.fastq",
+                p       =   f"{datadir + cln + hugo_no_rm}",
                 sample  =   SAMPLES,
                 read    =   [   'pR1',
                                 'pR2',
-                                'unpaired'
+                                'uR1',
+                                'uR2'
                                 ]
                 ), # Extract unmapped & paired reads AND unpaired from HuGo alignment; i.e. cleaned fastqs #TODO omschrijven naar betere smk syntax
         expand( "{p}{ref}{extension}",
@@ -240,15 +252,14 @@ onstart:
 
 include: f"{rls}QC_raw.smk"
 include: f"{rls}CleanData.smk"
-include: f"{rls}QC_clean.smk"
 
 #>############################################################################
 #>#### Removal of background host data                                   #####
 #>############################################################################
 
-include: f"{rls}BG_removal_1.smk"
-include: f"{rls}BG_removal_2.smk"
-include: f"{rls}BG_removal_3.smk"
+#include: f"{rls}BG_removal_1.smk"
+#include: f"{rls}BG_removal_2.smk"
+#include: f"{rls}BG_removal_3.smk"
 
 
 ###########! nuttig om contig metrics rule ook toe te voegen?
@@ -259,6 +270,12 @@ include: f"{rls}BG_removal_3.smk"
 #>############################################################################
 include: f"{rls}ILM_Ref_index.smk"
 
+
+## remove primers
+include: f"{rls}ILM_Ref_RemovePrimers.smk"
+
+
+include: f"{rls}ILM_Ref_QC_clean.smk"
 # iteration 1
 include: f"{rls}ILM_Ref_align_to_ref_it1.smk"
 include: f"{rls}ILM_Ref_extract_raw_cons_it1.smk"
@@ -292,6 +309,11 @@ include: f"{rls}ILM_Ref_igv_combi.smk"
 onsuccess:
     shell("""
         echo -e "\nCleaning up..."
+
+        if test -f "placeholder_primers.fasta"; then
+            rm placeholder_primers.fasta
+        fi
+
         
         echo -e "\tRemoving temporary files..."
         if [ "{config[remove_temp]}" != "0" ]; then

diff --git a/bin/envs/Illumina_ref_CutPrimers.yaml b/bin/envs/Illumina_ref_CutPrimers.yaml
@@ -0,0 +1,15 @@
+name: Illumina_ref_CutPrimers
+channels:
+  - bioconda
+  - conda-forge
+  - intel
+  - defaults
+dependencies:
+  - python=3.7
+  - pysam==0.16.0.1
+  - modin==0.8.3
+  - samtools==1.11
+  - biopython==1.78
+  - bedtools==2.29.2
+  - minimap2==2.17
+  - mappy==2.17
diff --git a/bin/envs/Nano_clean.yaml b/bin/envs/Nano_clean.yaml
@@ -12,10 +12,11 @@ dependencies:
   - bedtools==2.29.2
   - pysam==0.16.0.1
   - pysamstats==1.1.2
-  - pandas==1.1.3
+  - modin==0.8.3
+  - pandas
   - mkl-service
+  - mappy==2.17
   - biopython==1.78
   - openjdk==11.0.8
   - fastqc==0.11.8
-  - fastp==0.20.1
-  - cutadapt==2.10
+  - fastp==0.20.1
diff --git a/bin/includes/Primer_validation b/bin/includes/Primer_validation
@@ -9,7 +9,7 @@ elif [ "${workflow}" == "NANO_REF" ]; then
     nicename="Nanopore-reference"
 fi
 
-if [[ "${workflow}" =~ (ILM_META|ILM_REF) ]] && [ "${PRIMER_FASTA}" != "NONE" ]; then
+if [ "${workflow}" == "ILM_META" ] && [ "${PRIMER_FASTA}" != "NONE" ]; then
     echo -e "A file with primers was given but the chosen workflow: \"${nicename}\" does not support a custom primer file."
     echo -e "For Illumina-data, we expect all input files to be processed with the \"Nextera\" library prep kit."
     echo -e "Other official Illumina library prep kits are technically supported, such as the TruSeq kit, but this requires you to manually change the settings in \"config/pipeline_parameters.yaml\""

diff --git a/bin/rules/ILM_Ref_QC_clean.smk b/bin/rules/ILM_Ref_QC_clean.smk
@@ -0,0 +1,23 @@
+rule ILM_Ref_QC_clean_data:
+    input:
+        rules.RemovePrimers_pt2.output
+    output:
+        html    =   f"{datadir + qc_post}" + "{sample}_fastqc.html",
+        zip     =   f"{datadir + qc_post}" + "{sample}_fastqc.zip"
+    conda:
+        f"{conda_envs}QC_and_clean.yaml"
+    log:
+        f"{logdir}" + "QC_clean_data_{sample}.log"
+    benchmark:
+        f"{logdir + bench}" + "QC_clean_data_{sample}.txt"
+    threads: 6
+    shell:
+        """
+if [ -s "{input}" ] # If file exists and is NOT empty (i.e. filesize > 0) do...
+then
+    fastqc -t 6 --quiet --outdir data/FastQC_posttrim/ {input} > {log} 2>&1
+else
+    touch {output.html}
+    touch {output.zip}
+fi
+        """
diff --git a/bin/rules/ILM_Ref_RemovePrimers.smk b/bin/rules/ILM_Ref_RemovePrimers.smk
@@ -0,0 +1,56 @@
+rule RemovePrimers_pt1:
+    input:
+        r1  = rules.Clean_the_data.output.r1,
+        r2  = rules.Clean_the_data.output.r2,
+        ur1 = rules.Clean_the_data.output.r1_unpaired,
+        ur2 = rules.Clean_the_data.output.r2_unpaired,
+        ref = rules.Illumina_index_reference.output.reference_copy
+    output:
+        bam =   f"{datadir + cln + prdir}" + "{sample}_sorted.bam",
+        bai =   f"{datadir + cln + prdir}" + "{sample}_sorted.bam.bai"
+    conda:
+        f"{conda_envs}Illumina_ref_alignment.yaml"
+    log:
+        f"{logdir}" + "Illumina_RemovePrimers_pt1_{sample}.log"
+    benchmark:
+        f"{logdir + bench}" + "Illumina_RemovePrimers_pt1_{sample}.txt"
+    threads: config["threads"]["Illumina_align_to_reference"]
+    params:
+        aln_type    =   config["Illumina_ref"]["Alignment"]["Alignment_type"]
+    shell:
+        """
+bowtie2 --time --threads {threads} {params.aln_type} \
+-x {input.ref} \
+-1 {input.r1} \
+-2 {input.r2} \
+-U {input.ur1},{input.ur2} 2> {log} |\
+samtools view -@ {threads} -uS - 2>> {log} |\
+samtools collate -@ {threads} -O - 2>> {log} |\
+samtools fixmate -@ {threads} -m - - 2>> {log} |\
+samtools sort -@ {threads} - -o {output.bam} 2>> {log}
+samtools index -@ {threads} {output.bam} >> {log} 2>&1
+        """
+
+rule RemovePrimers_pt2:
+    input: 
+        bam = rules.RemovePrimers_pt1.output.bam,
+        ref = rules.Illumina_index_reference.output.reference_copy,
+        primers = primerfile
+    output: f"{datadir + cln + prdir}" + "{sample}.fastq"
+    conda:
+        f"{conda_envs}Illumina_ref_CutPrimers.yaml"
+    log:
+        f"{logdir}" + "Illumina_RemovePrimers_pt2_{sample}.log"
+    benchmark:
+        f"{logdir + bench}" + "Illumina_RemovePrimers_pt2_{sample}.txt"
+    threads: config["threads"]["Illumina_RemovePrimers"]
+    params:
+        primer_status = prstatus
+    shell:
+        """
+if [ {params.primer_status} == "SET" ]; then
+    python bin/scripts/RemoveIlluminaPrimers.py --input {input.bam} --reference {input.ref} --primers {input.primers} --threads {threads} --output {output}
+else
+    bedtools bamtofastq -i {input.bam} -fq {output}
+fi
+        """
diff --git a/bin/rules/ILM_Ref_align_to_ref_it1.smk b/bin/rules/ILM_Ref_align_to_ref_it1.smk
@@ -3,9 +3,7 @@
 #>##################################################################################################
 rule Illumina_align_to_reference_it1:
     input:
-        pR1         =   rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R1,
-        pR2         =   rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R2,
-        unpaired    =   rules.HuGo_removal_pt3_extract_unpaired_unmapped_reads.output,
+        reads       =   rules.RemovePrimers_pt2.output,
         reference   =   rules.Illumina_index_reference.output.reference_copy
     output:
         sorted_bam          =   f"{datadir + it1 + aln}" + "{sample}_sorted.bam",
@@ -25,11 +23,9 @@ rule Illumina_align_to_reference_it1:
         max_read_length =   config["Illumina_ref"]["Alignment"]["Max_read_length"] # This is the default value and also the max read length of Illumina in-house sequencing.
     shell: # LoFreq dindel is required for indel calling downstream
         """
-bowtie2 --time --threads {threads} {params.aln_type} \
+bowtie2 --time --quiet --threads {threads} {params.aln_type} \
 -x {input.reference} \
--1 {input.pR1} \
--2 {input.pR2} \
--U {input.unpaired} 2> {log} |\
+-U {input.reads} 2> {log} |\
 samtools view -@ {threads} -uS - 2>> {log} |\
 samtools collate -@ {threads} -O - 2>> {log} |\
 samtools fixmate -@ {threads} -m - - 2>> {log} |\

diff --git a/bin/rules/ILM_Ref_align_to_ref_it2.smk b/bin/rules/ILM_Ref_align_to_ref_it2.smk
@@ -3,9 +3,7 @@
 #>##################################################################################################
 rule Illumina_align_to_reference_it2:
     input:
-        pR1         =   rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R1,
-        pR2         =   rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R2,
-        unpaired    =   rules.HuGo_removal_pt3_extract_unpaired_unmapped_reads.output,
+        reads       =   rules.RemovePrimers_pt2.output,
         reference   =   rules.Illumina_extract_raw_consensus_it1.output.reference_copy_it2
     output:
         reference_index     =   f"{datadir + it2 + refdir + reference_basename}" + "_{sample}.fasta.1.bt2", # I've only specified ".fasta.1.bt2", but the "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2" and "rev.2.bt2" are implicitly generated. #TODO find a way to specify all output correctly (multiext snakemake syntax?)
@@ -29,9 +27,7 @@ rule Illumina_align_to_reference_it2:
 bowtie2-build --threads {threads} {input.reference} {input.reference} >> {log} 2>&1
 bowtie2 --time --threads {threads} {params.aln_type} \
 -x {input.reference} \
--1 {input.pR1} \
--2 {input.pR2} \
--U {input.unpaired} 2> {log} |\
+-U {input.reads} 2> {log} |\
 samtools view -@ {threads} -uS - 2>> {log} |\
 samtools collate -@ {threads} -O - 2>> {log} |\
 samtools fixmate -@ {threads} -m - - 2>> {log} |\

diff --git a/bin/rules/ILM_Ref_multiqc.smk b/bin/rules/ILM_Ref_multiqc.smk
@@ -11,19 +11,18 @@ rule Illumina_MultiQC_report:
                 sample  =   SAMPLES,
                 read    =   "R1 R2".split()
                 ), # TODO dit moet nog verbetert worden qua smk syntax
-        expand( "{p}{sample}_{read}_fastqc.zip",
+        expand( "{p}{sample}_fastqc.zip",
                 p       =   f"{datadir + qc_post}",
-                sample  =   SAMPLES,
-                read    =   "pR1 pR2 uR1 uR2".split()
+                sample  =   SAMPLES
                 ), # TODO dit moet nog verbetert worden qua smk syntax
         expand( "{p}Clean_the_data_{sample}.log",
                 p       = f"{logdir}",
                 sample  = SAMPLES
                 ), # TODO dit moet nog verbetert worden qua smk syntax
-        expand( "{p}HuGo_removal_pt1_alignment_{sample}.log",
-                p       = f"{logdir}",
-                sample  = SAMPLES
-                ), # TODO dit moet nog verbetert worden qua smk syntax
+        #expand( "{p}HuGo_removal_pt1_alignment_{sample}.log",
+        #        p       = f"{logdir}",
+        #        sample  = SAMPLES
+        #        ), # TODO dit moet nog verbetert worden qua smk syntax
         expand( "{p}Illumina_align_to_reference_it2_{sample}.log",
                 p       = f"{logdir}",
                 sample  = SAMPLES

diff --git a/bin/rules/Nano_Ref_Cut-primers.smk b/bin/rules/Nano_Ref_Cut-primers.smk
@@ -1,8 +1,8 @@
 rule Cut_primers:
     input:
         fastq       =   rules.Cleanup.output.qc_fastq,
-        primers_5   =   rules.Prepare_primers.output.primers_5,
-        primers_3   =   rules.Prepare_primers.output.primers_3
+        primers     =   rules.Prepare_primers.output.primers,
+        ref         =   rules.Index_ref.output.refcopy
     output: f"{datadir + cln + prdir}" + "{sample}.fastq"
     conda:
         f"{conda_envs}Nano_clean.yaml"
@@ -11,25 +11,7 @@ rule Cut_primers:
     benchmark:
         f"{logdir + bench}" + "Primer_removal_{sample}.txt"
     threads: config["threads"]["Nanopore_primer_removal"]
-    params:
-        primer_cutoff_plus  =   config["Nanopore_ref"]["Primer_cutoff_plus"],
-        primer_cutoff_minus =   config["Nanopore_ref"]["Primer_cutoff_minus"],
-        overlap             =   config["Nanopore_ref"]["Primer_min_overlap"],
-        error_rate          =   config["Nanopore_ref"]["Primer_error_rate"],
-        repeat_search       =   config["Nanopore_ref"]["Primer_repeat_search"]
     shell:
         """
-cutadapt \
---cores={threads} \
---cut {params.primer_cutoff_plus} \
---cut {params.primer_cutoff_minus} \
--g file:{input.primers_5} \
--a file:{input.primers_3} \
---revcomp \
--O {params.overlap} \
--e {params.error_rate} \
---no-indels \
---times {params.repeat_search} \
--o {output} \
-{input.fastq} > {log}
+python bin/scripts/RemoveONTPrimers.py --input {input.fastq} --reference {input.ref} --primers {input.primers} --threads {threads} --output {output}
         """