-
Notifications
You must be signed in to change notification settings - Fork 0
/
workflow.makefile
136 lines (119 loc) · 5.27 KB
/
workflow.makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# This makefile is used to run mzrecal on a set of files
# From the folder containing mzML files, execute as: make -f workflow.makefile
#
# Before using:
# - The following tools must be installed (in ${HOME}/tools, modify location below if needed):
# mzrecal, comet, idconvert, plot-recal.R
# - Set PPM1 to the mass error to search for PSM used for recalibration
# - Set PPM2 to the mass error for comparing the recalibrated results to the original
# - A single comet parameter file must be in the data directory.
# The 'ppm' parameter is not used, the file is copied and the parameter is set in the copy.
#
# It performs the following steps:
# Identify peptides (using comet)
# Convert .pep.XML into .mzid (using idconvert)
# Recalibrate (using mzrecal)
# Add index to mzML file (using msconvert)
# Identify peptides in recalibrated files (using comet)
# Display a histogram of ms2 precursor mass errors before and
# after recalibration
# All results and intermediate results are stored in a separate direcory
# defined by RESULT_DIR
DATA_DIR=$(shell pwd)
DATA_BASE=$(shell basename $(DATA_DIR))
RESULT_DIR=$(HOME)/results/$(DATA_BASE)
# m/z error to find PSM's for recalibration
PPM1=10
# m/z error to compare recalibrated with original
PPM2=5
KEEP_PEPXML = yes
PARAM_FILE ?= $(wildcard $(DATA_DIR)/*.params)
FASTA ?= $(wildcard $(DATA_DIR)/*.fasta)
TOOLS_DIR=$(HOME)/tools
SEARCHENGINE=$(TOOLS_DIR)/comet.2019015.linux.exe
IDCONVERT=$(TOOLS_DIR)/idconvert
MSCONVERT=$(TOOLS_DIR)/msconvert
MSCONVERT_DOCKER=docker run -it --rm -e WINEDEBUG=-all -v $(DATA_DIR):/data chambm/pwiz-skyline-i-agree-to-the-vendor-licenses wine msconvert --zlib --filter "peakPicking vendor"
MZRECAL=$(TOOLS_DIR)/mzrecal
MZRECAL_FLAGS=-ppmuncal=$(PPM1)
PLOT=$(TOOLS_DIR)/plot-recal.R
PLOT_FLAGS=-e 0.01 -m $(PPM2)
# Avoid locale errors in idconvert
export LC_ALL=C
MZMLS = $(wildcard *.mzML)
MZIDS = $(MZMLS:.mzML=.mzid)
RECALS = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-recal.mzML))
PEPS1 = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-$(PPM1)ppm.mzid))
PEPS2 = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-$(PPM2)ppm.mzid))
RECALPEPS = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-recal.mzid))
ifdef KEEP_PEPXML
PEPXMLS1 = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-$(PPM1)ppm.pep.xml))
PEPXMLS2 = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-$(PPM2)ppm.pep.xml))
RECALPEPXMLS = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=-recal.pep.xml))
endif
MZMLSLN = $(addprefix $(RESULT_DIR)/,$(MZMLS))
PLOTS = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=.png))
TXTSCORE = $(addprefix $(RESULT_DIR)/,$(MZMLS:.mzML=.txt))
INTERMEDIATES = $(MZMLSLN) $(PEPS1) $(PEPS2) $(RECALPEPS) $(RECALS) \
$(PEPXMLS1) $(PEPXMLS2) $(RECALPEPXMLS) \
$(RECALS:-recal.mzML=-recal.json)
# Main target
all: dirs $(RECALS) $(PLOTS)
# To remove generated files
clean:
rm -f $(RECALS) $(PLOTS) $(TXTSCORE) $(INTERMEDIATES)
# Prevent deletion of intermediate files
.SECONDARY: $(INTERMEDIATES)
.PHONY: dirs
dirs: $(RESULT_DIR)
$(RESULT_DIR):
mkdir -p $(RESULT_DIR)
$(RECALPEPS): $(RECALS)
# Search uncalibrated with wide mass window
# Since 'comet' always names its output file after the input file,
# and 'idconvert' always names its output file after the mzML file,
# we put intermediate files in a temproary directory to get this
# working.
$(RESULT_DIR)/%-$(PPM1)ppm.mzid: %.mzML
$(eval TMP := $(shell mktemp -d))
ln -sf $(DATA_DIR)/$< $(TMP)/$*-$(PPM1)ppm.mzML
# Create updated comet parameter file with desired PPM error
sed -E "s/^peptide_mass_tolerance *=.*/peptide_mass_tolerance = $(PPM1)/" ${PARAM_FILE} > $(TMP)/comet.params
$(SEARCHENGINE) -D$(FASTA) -P$(TMP)/comet.params $(TMP)/$*-$(PPM1)ppm.mzML
$(IDCONVERT) -o $(RESULT_DIR) $(TMP)/$*-$(PPM1)ppm.pep.xml
ifdef KEEP_PEPXML
mv $(TMP)/$*-$(PPM1)ppm.pep.xml $(RESULT_DIR)
endif
rm -rf $(TMP)
# Recalibrate
$(RESULT_DIR)/%-recal.mzML: %.mzML $(RESULT_DIR)/%-$(PPM1)ppm.mzid $(MZRECAL)
$(eval TMP := $(shell mktemp -d))
$(MZRECAL) $(MZRECAL_FLAGS) -mzid=$(RESULT_DIR)/$*-$(PPM1)ppm.mzid \
-o=$(TMP)/$*-recal.mzML -cal=$(RESULT_DIR)/$*-recal.json $<
$(MSCONVERT) -z $(TMP)/$*-recal.mzML -o $(RESULT_DIR)
# Search recalibrated with small mass window
$(RESULT_DIR)/%-recal.mzid: $(RESULT_DIR)/%-recal.mzML
$(eval TMP := $(shell mktemp -d))
ln -sf $< $(TMP)/
sed -E "s/^peptide_mass_tolerance *=.*/peptide_mass_tolerance = $(PPM2)/" ${PARAM_FILE} > $(TMP)/comet.params
$(SEARCHENGINE) -D$(FASTA) -P$(TMP)/comet.params $(TMP)/$*-recal.mzML
$(IDCONVERT) -o $(RESULT_DIR) $(TMP)/$*-recal.pep.xml
ifdef KEEP_PEPXML
mv $(TMP)/$*-recal.pep.xml $(RESULT_DIR)
endif
rm -rf $(TMP)
# Search uncalibrated with small mass window
$(RESULT_DIR)/%-$(PPM2)ppm.mzid: %.mzML
$(eval TMP := $(shell mktemp -d))
ln -sf $(DATA_DIR)/$< $(TMP)/$*-$(PPM2)ppm.mzML
sed -E "s/^peptide_mass_tolerance *=.*/peptide_mass_tolerance = $(PPM2)/" ${PARAM_FILE} > $(TMP)/comet.params
$(SEARCHENGINE) -D$(FASTA) -P$(TMP)/comet.params $(TMP)/$*-$(PPM2)ppm.mzML
$(IDCONVERT) -o $(RESULT_DIR) $(TMP)/$*-$(PPM2)ppm.pep.xml
ifdef KEEP_PEPXML
mv $(TMP)/$*-$(PPM2)ppm.pep.xml $(RESULT_DIR)
endif
rm -rf $(TMP)
# Plot small windows uncalibrated and recalibrated to .png
$(RESULT_DIR)/%.png: $(RESULT_DIR)/%-$(PPM2)ppm.mzid $(RESULT_DIR)/%-recal.mzid
$(PLOT) $(PLOT_FLAGS) "--name=$(DATA_BASE) $*" $(RESULT_DIR)/$*-$(PPM2)ppm.mzid $(RESULT_DIR)/$*-recal.mzid \
--outfile=$(RESULT_DIR)/$*