-
Notifications
You must be signed in to change notification settings - Fork 0
/
Protien_File_Processor.py
57 lines (49 loc) · 1.84 KB
/
Protien_File_Processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# Team Members
# - Shubhendu Vimal - 11915067
# - Dharani Kiran Kavuri - 11915033
# - Anmol More - 11915043
# Data preparation in plain Python.
# Read raw FASTA protein sequence files and convert them to parsable CSV files using Biopython.
# Embedded block of code for converting to pdf only; run separately (time-consuming).
import sys
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis
# Command-line usage:
#   arg 1 - FASTA file name
#   arg 2 - number of records to process
#
# Reads the FASTA file line by line, joins the sequence lines that belong to
# each record, runs Biopython's ProteinAnalysis on every sequence and writes
# one CSV row per record (sequence, type, per-amino-acid counts, molecular
# weight, GRAVY score) to "data/<enzyme_type>.csv".


def _iter_sequences(lines):
    """Yield each record's sequence as one string, in file order.

    A line containing "sp|" or "tr|" is treated as a record header
    (presumably UniProt Swiss-Prot / TrEMBL entries - verify against the
    input files). Every other line is stripped of trailing whitespace and
    appended to the current record. Empty sequences are never yielded.
    """
    parts = []
    for line in lines:
        if "sp|" in line or "tr|" in line:
            sequence = "".join(parts)
            parts = []
            if sequence:
                yield sequence
        else:
            parts.append(line.rstrip())
    # The final record has no following header, so flush it explicitly;
    # otherwise the last sequence in the file would be silently dropped.
    sequence = "".join(parts)
    if sequence:
        yield sequence


file = str(sys.argv[1])
limit = int(sys.argv[2])
print("Processing file : ", file)

# Drops the last six characters of the argument - assumes the file name ends
# in ".fasta". The result is used both as the "Type" label and as the CSV name.
enzyme_type = file[:-6]
print("Group all as : ", enzyme_type)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and was quadratic anyway.
rows = []
with open(file) as fileobject:
    for sequence in _iter_sequences(fileobject):
        # Stop once exactly `limit` records have been collected.
        if len(rows) >= limit:
            break
        try:
            # Use the Biopython library to analyse the enzyme sequence.
            analysed_seq = ProteinAnalysis(sequence)
            row_dict = {"Sequence": sequence, "Type": enzyme_type}
            row_dict.update(analysed_seq.count_amino_acids())
            row_dict["weight"] = analysed_seq.molecular_weight()
            row_dict["gravy"] = analysed_seq.gravy()
        except (KeyError, ValueError):
            # Sequences with residues Biopython cannot score are skipped
            # (best effort), without hiding unrelated failures or Ctrl-C.
            print("Error")
            continue
        rows.append(row_dict)
        # Progress output: (rows collected so far, columns per row).
        print((len(rows), len(row_dict)))

df = pd.DataFrame(rows)
print(df.head())
df.to_csv("data/" + enzyme_type + ".csv", index=False)