Skip to content

Commit

Permalink
Merge pull request #170 from DennisSchmitz/dev
Browse files Browse the repository at this point in the history
Update Jovian 1.2.02
  • Loading branch information
florianzwagemaker committed Feb 20, 2021
2 parents 9f29963 + 0bd587b commit 55ccd93
Show file tree
Hide file tree
Showing 15 changed files with 898 additions and 125 deletions.
36 changes: 29 additions & 7 deletions bin/Illumina_vir_Ref.smk
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ with open(config["sample_sheet"]) as sample_sheet_file:
reference = config["reference_file"]
reference_basename = os.path.splitext(os.path.basename(reference))[0]

primervalue = config["primer_file"]

if primervalue.upper() == "NONE":
with open("placeholder_primers.fasta", "w") as fileout:
fileout.write(""">Placeholder
EXAMPLE""")
primerfile = "placeholder_primers.fasta"
prstatus = "NONE"
else:
prstatus = "SET"
primerfile = primervalue

#@################################################################################
#@#### Specify Jovian's final output: #####
Expand All @@ -83,12 +94,13 @@ localrules:

rule all:
input:
expand( "{p}{sample}_{read}.fq",
p = f"{datadir + cln}",
expand( "{p}{sample}_{read}.fastq",
p = f"{datadir + cln + hugo_no_rm}",
sample = SAMPLES,
read = [ 'pR1',
'pR2',
'unpaired'
'uR1',
'uR2'
]
), # Extract unmapped & paired reads AND unpaired from HuGo alignment; i.e. cleaned fastqs #TODO omschrijven naar betere smk syntax
expand( "{p}{ref}{extension}",
Expand Down Expand Up @@ -240,15 +252,14 @@ onstart:

include: f"{rls}QC_raw.smk"
include: f"{rls}CleanData.smk"
include: f"{rls}QC_clean.smk"

#>############################################################################
#>#### Removal of background host data #####
#>############################################################################

include: f"{rls}BG_removal_1.smk"
include: f"{rls}BG_removal_2.smk"
include: f"{rls}BG_removal_3.smk"
#include: f"{rls}BG_removal_1.smk"
#include: f"{rls}BG_removal_2.smk"
#include: f"{rls}BG_removal_3.smk"


###########! nuttig om contig metrics rule ook toe te voegen?
Expand All @@ -259,6 +270,12 @@ include: f"{rls}BG_removal_3.smk"
#>############################################################################
include: f"{rls}ILM_Ref_index.smk"


## remove primers
include: f"{rls}ILM_Ref_RemovePrimers.smk"


include: f"{rls}ILM_Ref_QC_clean.smk"
# iteration 1
include: f"{rls}ILM_Ref_align_to_ref_it1.smk"
include: f"{rls}ILM_Ref_extract_raw_cons_it1.smk"
Expand Down Expand Up @@ -292,6 +309,11 @@ include: f"{rls}ILM_Ref_igv_combi.smk"
onsuccess:
shell("""
echo -e "\nCleaning up..."
if test -f "placeholder_primers.fasta"; then
rm placeholder_primers.fasta
fi
echo -e "\tRemoving temporary files..."
if [ "{config[remove_temp]}" != "0" ]; then
Expand Down
15 changes: 15 additions & 0 deletions bin/envs/Illumina_ref_CutPrimers.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
name: Illumina_ref_CutPrimers
channels:
- bioconda
- conda-forge
- intel
- defaults
dependencies:
- python=3.7
- pysam==0.16.0.1
- modin==0.8.3
- samtools==1.11
- biopython==1.78
- bedtools==2.29.2
- minimap2==2.17
- mappy==2.17
7 changes: 4 additions & 3 deletions bin/envs/Nano_clean.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,11 @@ dependencies:
- bedtools==2.29.2
- pysam==0.16.0.1
- pysamstats==1.1.2
- pandas==1.1.3
- modin==0.8.3
- pandas
- mkl-service
- mappy==2.17
- biopython==1.78
- openjdk==11.0.8
- fastqc==0.11.8
- fastp==0.20.1
- cutadapt==2.10
- fastp==0.20.1
2 changes: 1 addition & 1 deletion bin/includes/Primer_validation
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ elif [ "${workflow}" == "NANO_REF" ]; then
nicename="Nanopore-reference"
fi

if [[ "${workflow}" =~ (ILM_META|ILM_REF) ]] && [ "${PRIMER_FASTA}" != "NONE" ]; then
if [ "${workflow}" == "ILM_META" ] && [ "${PRIMER_FASTA}" != "NONE" ]; then
echo -e "A file with primers was given but the chosen workflow: \"${nicename}\" does not support a custom primer file."
echo -e "For Illumina-data, we expect all input files to be processed with the \"Nextera\" library prep kit."
echo -e "Other official Illumina library prep kits are technically supported, such as the TruSeq kit, but this requires you to manually change the settings in \"config/pipeline_parameters.yaml\""
Expand Down
23 changes: 23 additions & 0 deletions bin/rules/ILM_Ref_QC_clean.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
rule ILM_Ref_QC_clean_data:
input:
rules.RemovePrimers_pt2.output
output:
html = f"{datadir + qc_post}" + "{sample}_fastqc.html",
zip = f"{datadir + qc_post}" + "{sample}_fastqc.zip"
conda:
f"{conda_envs}QC_and_clean.yaml"
log:
f"{logdir}" + "QC_clean_data_{sample}.log"
benchmark:
f"{logdir + bench}" + "QC_clean_data_{sample}.txt"
threads: 6
shell:
"""
if [ -s "{input}" ] # If file exists and is NOT empty (i.e. filesize > 0) do...
then
fastqc -t 6 --quiet --outdir data/FastQC_posttrim/ {input} > {log} 2>&1
else
touch {output.html}
touch {output.zip}
fi
"""
56 changes: 56 additions & 0 deletions bin/rules/ILM_Ref_RemovePrimers.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
rule RemovePrimers_pt1:
input:
r1 = rules.Clean_the_data.output.r1,
r2 = rules.Clean_the_data.output.r2,
ur1 = rules.Clean_the_data.output.r1_unpaired,
ur2 = rules.Clean_the_data.output.r2_unpaired,
ref = rules.Illumina_index_reference.output.reference_copy
output:
bam = f"{datadir + cln + prdir}" + "{sample}_sorted.bam",
bai = f"{datadir + cln + prdir}" + "{sample}_sorted.bam.bai"
conda:
f"{conda_envs}Illumina_ref_alignment.yaml"
log:
f"{logdir}" + "Illumina_RemovePrimers_pt1_{sample}.log"
benchmark:
f"{logdir + bench}" + "Illumina_RemovePrimers_pt1_{sample}.txt"
threads: config["threads"]["Illumina_align_to_reference"]
params:
aln_type = config["Illumina_ref"]["Alignment"]["Alignment_type"]
shell:
"""
bowtie2 --time --threads {threads} {params.aln_type} \
-x {input.ref} \
-1 {input.r1} \
-2 {input.r2} \
-U {input.ur1},{input.ur2} 2> {log} |\
samtools view -@ {threads} -uS - 2>> {log} |\
samtools collate -@ {threads} -O - 2>> {log} |\
samtools fixmate -@ {threads} -m - - 2>> {log} |\
samtools sort -@ {threads} - -o {output.bam} 2>> {log}
samtools index -@ {threads} {output.bam} >> {log} 2>&1
"""

rule RemovePrimers_pt2:
input:
bam = rules.RemovePrimers_pt1.output.bam,
ref = rules.Illumina_index_reference.output.reference_copy,
primers = primerfile
output: f"{datadir + cln + prdir}" + "{sample}.fastq"
conda:
f"{conda_envs}Illumina_ref_CutPrimers.yaml"
log:
f"{logdir}" + "Illumina_RemovePrimers_pt2_{sample}.log"
benchmark:
f"{logdir + bench}" + "Illumina_RemovePrimers_pt2_{sample}.txt"
threads: config["threads"]["Illumina_RemovePrimers"]
params:
primer_status = prstatus
shell:
"""
if [ {params.primer_status} == "SET" ]; then
python bin/scripts/RemoveIlluminaPrimers.py --input {input.bam} --reference {input.ref} --primers {input.primers} --threads {threads} --output {output}
else
bedtools bamtofastq -i {input.bam} -fq {output}
fi
"""
10 changes: 3 additions & 7 deletions bin/rules/ILM_Ref_align_to_ref_it1.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
#>##################################################################################################
rule Illumina_align_to_reference_it1:
input:
pR1 = rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R1,
pR2 = rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R2,
unpaired = rules.HuGo_removal_pt3_extract_unpaired_unmapped_reads.output,
reads = rules.RemovePrimers_pt2.output,
reference = rules.Illumina_index_reference.output.reference_copy
output:
sorted_bam = f"{datadir + it1 + aln}" + "{sample}_sorted.bam",
Expand All @@ -25,11 +23,9 @@ rule Illumina_align_to_reference_it1:
max_read_length = config["Illumina_ref"]["Alignment"]["Max_read_length"] # This is the default value and also the max read length of Illumina in-house sequencing.
shell: # LoFreq dindel is required for indel calling downstream
"""
bowtie2 --time --threads {threads} {params.aln_type} \
bowtie2 --time --quiet --threads {threads} {params.aln_type} \
-x {input.reference} \
-1 {input.pR1} \
-2 {input.pR2} \
-U {input.unpaired} 2> {log} |\
-U {input.reads} 2> {log} |\
samtools view -@ {threads} -uS - 2>> {log} |\
samtools collate -@ {threads} -O - 2>> {log} |\
samtools fixmate -@ {threads} -m - - 2>> {log} |\
Expand Down
8 changes: 2 additions & 6 deletions bin/rules/ILM_Ref_align_to_ref_it2.smk
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
#>##################################################################################################
rule Illumina_align_to_reference_it2:
input:
pR1 = rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R1,
pR2 = rules.HuGo_removal_pt2_extract_paired_unmapped_reads.output.fastq_R2,
unpaired = rules.HuGo_removal_pt3_extract_unpaired_unmapped_reads.output,
reads = rules.RemovePrimers_pt2.output,
reference = rules.Illumina_extract_raw_consensus_it1.output.reference_copy_it2
output:
reference_index = f"{datadir + it2 + refdir + reference_basename}" + "_{sample}.fasta.1.bt2", # I've only specified ".fasta.1.bt2", but the "2.bt2", "3.bt2", "4.bt2", "rev.1.bt2" and "rev.2.bt2" are implicitly generated. #TODO find a way to specify all output correctly (multiext snakemake syntax?)
Expand All @@ -29,9 +27,7 @@ rule Illumina_align_to_reference_it2:
bowtie2-build --threads {threads} {input.reference} {input.reference} >> {log} 2>&1
bowtie2 --time --threads {threads} {params.aln_type} \
-x {input.reference} \
-1 {input.pR1} \
-2 {input.pR2} \
-U {input.unpaired} 2> {log} |\
-U {input.reads} 2> {log} |\
samtools view -@ {threads} -uS - 2>> {log} |\
samtools collate -@ {threads} -O - 2>> {log} |\
samtools fixmate -@ {threads} -m - - 2>> {log} |\
Expand Down
13 changes: 6 additions & 7 deletions bin/rules/ILM_Ref_multiqc.smk
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,18 @@ rule Illumina_MultiQC_report:
sample = SAMPLES,
read = "R1 R2".split()
), # TODO dit moet nog verbetert worden qua smk syntax
expand( "{p}{sample}_{read}_fastqc.zip",
expand( "{p}{sample}_fastqc.zip",
p = f"{datadir + qc_post}",
sample = SAMPLES,
read = "pR1 pR2 uR1 uR2".split()
sample = SAMPLES
), # TODO dit moet nog verbetert worden qua smk syntax
expand( "{p}Clean_the_data_{sample}.log",
p = f"{logdir}",
sample = SAMPLES
), # TODO dit moet nog verbetert worden qua smk syntax
expand( "{p}HuGo_removal_pt1_alignment_{sample}.log",
p = f"{logdir}",
sample = SAMPLES
), # TODO dit moet nog verbetert worden qua smk syntax
#expand( "{p}HuGo_removal_pt1_alignment_{sample}.log",
# p = f"{logdir}",
# sample = SAMPLES
# ), # TODO dit moet nog verbetert worden qua smk syntax
expand( "{p}Illumina_align_to_reference_it2_{sample}.log",
p = f"{logdir}",
sample = SAMPLES
Expand Down
24 changes: 3 additions & 21 deletions bin/rules/Nano_Ref_Cut-primers.smk
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
rule Cut_primers:
input:
fastq = rules.Cleanup.output.qc_fastq,
primers_5 = rules.Prepare_primers.output.primers_5,
primers_3 = rules.Prepare_primers.output.primers_3
primers = rules.Prepare_primers.output.primers,
ref = rules.Index_ref.output.refcopy
output: f"{datadir + cln + prdir}" + "{sample}.fastq"
conda:
f"{conda_envs}Nano_clean.yaml"
Expand All @@ -11,25 +11,7 @@ rule Cut_primers:
benchmark:
f"{logdir + bench}" + "Primer_removal_{sample}.txt"
threads: config["threads"]["Nanopore_primer_removal"]
params:
primer_cutoff_plus = config["Nanopore_ref"]["Primer_cutoff_plus"],
primer_cutoff_minus = config["Nanopore_ref"]["Primer_cutoff_minus"],
overlap = config["Nanopore_ref"]["Primer_min_overlap"],
error_rate = config["Nanopore_ref"]["Primer_error_rate"],
repeat_search = config["Nanopore_ref"]["Primer_repeat_search"]
shell:
"""
cutadapt \
--cores={threads} \
--cut {params.primer_cutoff_plus} \
--cut {params.primer_cutoff_minus} \
-g file:{input.primers_5} \
-a file:{input.primers_3} \
--revcomp \
-O {params.overlap} \
-e {params.error_rate} \
--no-indels \
--times {params.repeat_search} \
-o {output} \
{input.fastq} > {log}
python bin/scripts/RemoveONTPrimers.py --input {input.fastq} --reference {input.ref} --primers {input.primers} --threads {threads} --output {output}
"""
Loading

0 comments on commit 55ccd93

Please sign in to comment.