-
Notifications
You must be signed in to change notification settings - Fork 0
/
magi.py
271 lines (232 loc) · 8.6 KB
/
magi.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
__author__ = '''Roshan Padmanabhan'''
__version__ = '0.1'
'''
Main MAGI program
Absolute Requirement : pandas , numpy , biopython all compatabile with python3
Other Programs Required to run : Emboss , ProteinOrtho, Blastall for running ProteinOrtho , Prodigal
Input is a directory containing all pep and nuc files
pep and nuc files can be generated using another script called runProdigal.py
'''
#########################################################
import re
import sys
import os
import argparse
import shlex
import subprocess
from itertools import combinations as comb
import pandas as pd
import numpy as np
from SeqretFasta import SeqRet
import SeqretFasta
from preProcess import diskwalk , mergeFiles
import RunNW
#########################################################
def nC2(list_containing_headers):
    '''list -> list
    Return every 2-element combination of the given headers as a list
    of tuples, in the same order itertools.combinations produces them.

    >>> nC2(['a', 'b', 'c'])
    [('a', 'b'), ('a', 'c'), ('b', 'c')]
    '''
    # list() consumes the combinations iterator directly; the original
    # manual for/append loop built the same list one element at a time.
    return list(comb(list_containing_headers, 2))
def Mmax(l):
    '''
    Return the larger of exactly two values.

    Any input whose length is not exactly 2 yields the sentinel string
    "Problem" instead of raising.
    '''
    return max(l) if len(l) == 2 else "Problem"
def MakeNewDF(df1):
    '''(df) -> (df)
    Drop the three ProteinOrtho metadata columns ('#species',
    'proteins', 'alg.-conn.') plus the first data row, and return the
    remaining species columns as a new DataFrame.
    '''
    lookup1 = ['#species', 'proteins', 'alg.-conn.']
    # drop(columns=...) replaces the positional-axis call
    # df1.drop(lookup1, 1), which was deprecated in pandas 1.0 and
    # removed in pandas 2.0.
    df2 = df1.drop(columns=lookup1)
    # The original also discards the first row of the result table.
    df2 = df2.drop(df2.index[:1])
    return df2
def getCore(df):
    '''(df) -> (df)
    Return the "core" ortholog rows: every row in which no species
    column holds the ProteinOrtho placeholder '*'.
    Input is the DataFrame returned by MakeNewDF.
    '''
    # isin(['*']) builds the same boolean mask as the original
    # applymap(lambda x: x in ['*']) but at C speed, and applymap is
    # deprecated in modern pandas.  ~ is the supported element-wise
    # negation -- the original unary minus (-mask.any(axis=1)) raises
    # TypeError on boolean arrays under current numpy/pandas.
    mask = df.isin(['*'])
    newDF = df[~mask.any(axis=1)]
    return newDF
def getHeadNames(pdDF):
    '''df -> list
    Return the column headers of a pandas DataFrame as a plain list.

    >>> getHeadNames(DF1)
    ['sp1', 'sp2', 'sp3']
    '''
    # list() copies the Index directly; the original identity
    # comprehension ([h for h in pdDF.columns]) did the same work
    # element by element.
    return list(pdDF.columns)
def ConcatHeaders(lst):
    '''
    list -> list
    Join every 2-combination of species names with '_vs_' and return
    the resulting strings, in combination order.

    >>> ConcatHeaders(['a','b','c','d'])
    ['a_vs_b', 'a_vs_c', 'a_vs_d', 'b_vs_c', 'b_vs_d', 'c_vs_d']
    >>> ConcatHeaders([1,2,3,4])
    ['1_vs_2', '1_vs_3', '1_vs_4', '2_vs_3', '2_vs_4', '3_vs_4']
    '''
    # comb() (itertools.combinations, imported at module top) yields
    # the same pairs nC2() would; non-string names are stringified by
    # format() just as str() did in the original.
    return ["{}_vs_{}".format(left, right) for left, right in comb(lst, 2)]
def ConcatIndex(lst):
    '''
    list -> list
    Join the first two sequence ids with '_vs_' and return the result
    wrapped in a single-element list.

    >>> ConcatIndex(['seqid1', 'seqid2'])
    ['seqid1_vs_seqid2']
    >>> ConcatIndex([1, 2])
    ['1_vs_2']
    '''
    first, second = lst[0], lst[1]
    return ["{}_vs_{}".format(first, second)]
def ConcatIndexString(lst):
    '''
    list -> str
    Join the first two sequence ids with '_vs_' and return the joined
    string (not a list).

    >>> ConcatIndexString(['seqid1', 'seqid2'])
    'seqid1_vs_seqid2'
    >>> ConcatIndexString([1, 2])
    '1_vs_2'
    '''
    return "{}_vs_{}".format(lst[0], lst[1])
def running_emboss(emboss_parameters):
    '''(list) -> (str)
    Run an EMBOSS alignment and return the similarity value it reports.

    emboss_parameters is [seq1filename, seq2filename, tool] where tool
    selects the EMBOSS program (e.g. 'needle' or 'stretcher').
    '''
    aligner = RunNW.EmbossRun(emboss_parameters)
    cline = aligner.EmbossCline2()
    aligner.runCline(cline)
    return aligner.GetSimilarity()
def create_pSeries(data, name):
    '''(dict, str) -> Series
    Build a float-typed pandas Series from *data*, labelled *name*.
    '''
    # Return directly instead of rebinding the *name* parameter to the
    # Series, which shadowed the label argument in the original.  The
    # explicit index=None was a no-op and is dropped.
    return pd.Series(data, name=name, dtype=float)
def create_pDataFrame(list_of_series, list_of_index_names):
    '''list, list -> DataFrame
    Stack the series as rows labelled by list_of_index_names, then
    return the transpose so each input series becomes a column.
    '''
    stacked = pd.DataFrame(list_of_series, index=list_of_index_names)
    return stacked.T
def getMean(pdSeries):
    '''pandas series --> number
    Mean of the non-NA values rounded to 2 decimals; falls through to
    an implicit None when the series has no countable values.
    '''
    # count() (the same check the sibling getLen() performs) guards
    # against taking the mean of an empty series.
    if pdSeries.count() >= 1:
        return round(pdSeries.mean(), 2)
def getMin(pdSeries):
    '''pandas series --> number
    Smallest value in the series.
    '''
    smallest = pdSeries.min()
    return smallest
def getMax(pdSeries):
    '''pandas series --> number
    Largest value in the series.
    '''
    largest = pdSeries.max()
    return largest
def getLen(pdSeries):
    '''pandas series --> number
    Number of non-NA values in the series (pandas count(), not len()).
    '''
    non_na_count = pdSeries.count()
    return non_na_count
def garbage(anything):
    # Drop this function's local binding only.  `del` on a parameter
    # does NOT free or affect the caller's object or variable -- the
    # call is effectively a no-op hint that the value is done with.
    del anything
def dots():
    '''Emit a single progress dot to stdout with no trailing newline.'''
    sys.stdout.write(".")
def get_agios_for_two_dict(dfcolumn1, dfcolumn2):
    '''(Series, Series) -> dict
    Align each ortholog pair from the two species columns with EMBOSS
    and return {"<id1>_vs_<id2>": similarity} for every pair where
    neither side is the ProteinOrtho placeholder '*'.

    NOTE(review): relies on the module-global SeqRet instance `s`
    created in the __main__ block -- this function is not usable
    stand-alone; verify before reusing it elsewhere.
    NOTE(review): iteration runs over labels range(1, count());
    presumably this matches the index labels left after MakeNewDF drops
    row 0, but it would miss/KeyError on gapped labels (e.g. after
    getCore filtering) -- confirm against the caller.
    '''
    templistof_org1_org2 = {}
    # `i` from enumerate is never used; only the label `en` matters.
    for i, en in enumerate(range(1, dfcolumn1.count())):
        # Scratch FASTA files in /tmp, rewritten on every iteration.
        seq1outfile = open("/tmp/.agiosseq1.fasta", 'w')
        seq2outfile = open("/tmp/.agiosseq2.fasta", 'w')
        # Skip pairs where either species lacks an ortholog ('*').
        # NOTE(review): on the skip path the two files above are opened
        # but never closed/written this iteration (handle leak).
        if '*' not in [dfcolumn1[en], dfcolumn2[en]]:
            seq1outfile.write(s.getSingleFasta(dfcolumn1[en]))
            seq2outfile.write(s.getSingleFasta(dfcolumn2[en]))
            seq1outfile.close()
            seq2outfile.close()
            # NOTE(review): both branches of this length test invoke
            # 'needle', so the >1500 bp check currently has no effect.
            # Presumably 'stretcher' was intended for long sequences --
            # confirm intent before changing.
            if not (len(SeqretFasta.SeqIO.read(seq1outfile.name, 'fasta'))) > 1500:
                templistof_org1_org2[(ConcatIndexString([dfcolumn1[en], dfcolumn2[en]]))] = running_emboss([seq1outfile.name, seq2outfile.name, 'needle'])
            else:
                templistof_org1_org2[(ConcatIndexString([dfcolumn1[en], dfcolumn2[en]]))] = running_emboss([seq1outfile.name, seq2outfile.name, 'needle'])
    return templistof_org1_org2
#########################################################
if __name__ == '__main__':
    ####################### Command Line Argument Parsing ##########################
    des = """
    The main MAGI script.\n
    Inputs are ProteinOrtho result file, path to the cds files and output directory path \n
    """
    parser = argparse.ArgumentParser(description=des, formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('-g', help='path to input directory', action='store', dest='genome_files', required=True)
    parser.add_argument('-p', help='path to ProteinOrtho result file', action='store', dest='proteinOrtho_Result_file', type=argparse.FileType('r'), required=True)
    parser.add_argument('-o', help='Out put directory path', action='store', dest='out_path', required=True)
    # -c accepts only 0 or 1 (choices=range(0,2)); used as a truthy flag below.
    parser.add_argument('-c', help='calculate agios for core genes (0 for no cores 1 for yes)', type=int, choices=range(0, 2), dest='core', required=False)
    args = parser.parse_args()
    glist = args.genome_files
    output_dir = args.out_path
    # FileType('r') already opened the file; only its path is used here.
    po_res_file = args.proteinOrtho_Result_file.name
    core_agios = args.core
    # ==========================TO DO==============================
    # check if the directory is correct
    # ==========================TO DO==============================
    print("The input nucleotide files are processing now")
    # Walk the genome directory and merge all nucleotide files into one.
    d = diskwalk(glist)
    try:
        allnuc = mergeFiles(d.sepNucs())
        nucsmerged = allnuc.mergeflist()
        #print("the path to the meged file ",nucsmerged)
    except Exception as e:
        # NOTE(review): `raise e` just re-raises -- the try/except adds
        # nothing here.
        raise e
    # Results file lives in the requested output directory.
    o = diskwalk(output_dir)
    outfile = o.fullFilePath('AGIOSresults.csv')
    ############################### START PROCESSING FILES IN THE INPUT FOLDER #########################################
    # Now start the module SeqretFasta.SeqRet and get the sequence from the file nucsmeged
    # NOTE: `s` is read as a global by get_agios_for_two_dict() below.
    s = SeqRet(nucsmerged)
    recd = s.makeFastaDict()
    ################################# START PROCESSING PROTEINORTHO RESULT FILE #######################################
    pOrthoresfn = po_res_file
    # The initial dataframe
    pOResults = pd.read_csv(pOrthoresfn, sep='\t')
    # corrected working DF with or without core genes
    if core_agios:
        # Keep only rows where every species has an ortholog.
        pdres = getCore(MakeNewDF(pOResults))
    else:
        pdres = MakeNewDF(pOResults)
    garbage(pOrthoresfn)
    Headers = getHeadNames(pdres)
    #print("The headers",Headers)
    # All species pairs to compare.
    header_groups = nC2(Headers)
    #print("The header groups",ConcatHeaders(Headers))
    # final df to hold the results
    finalDF = pd.DataFrame(columns=ConcatHeaders(Headers), index=None)
    # looping tru all the header combinations
    for OrgTup in header_groups:
        org1 = OrgTup[0]  # org1 is the column header
        org2 = OrgTup[1]
        colOrg1 = pdres[org1]  # colOrg1 is the contents of the column org1
        colOrg2 = pdres[org2]
        print("============== For the pair {} == {} ============".format(org1, org2))
        dots()
        # Column label for this pair, e.g. "sp1_vs_sp2".
        pSname = ConcatIndexString(OrgTup)
        #print("Calculating AGIOS for the pair {} and {}".format(org1,org2 ))
        #make a series for each header_group
        eachSeries = create_pSeries(get_agios_for_two_dict(colOrg1, colOrg2), pSname)
        # put it in the final df
        # Row 0: mean AGIOS value; row 1: ortholog count (as string).
        finalDF[pSname] = [getMean(eachSeries), str(getLen(eachSeries))]
    print('')
    # print the results
    # Transpose so each species pair becomes a row in the CSV.
    FDF = pd.DataFrame(finalDF.T)
    FDF.columns = ['agios_value', 'number_of_orthologs']
    FDF.to_csv(outfile)
    print("Please check the file {} for results".format(outfile))
    print("")