-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast_setup.py
239 lines (204 loc) · 9.21 KB
/
blast_setup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
# -*- coding: utf-8 -*-
"""
File Name: blast_setup.py
Description: This script 1) updates/downloads the refseq_rna blast db files,
2) creates a list of taxonomy ids based on the list of organisms, and 3) create
a csv file with only human accessions and genes for downstream usage. Check the
ReadMe file for a first time setup of a blast database. Both steps occur on the
head node of the MCSR because they need internet access.
@author: Shaurita Hutchins
Date Created: Tue Feb 28 19:05:41 2017
Project Name: KARG Project
"""
# Modules used
from ftplib import FTP, error_perm
import os
import fnmatch
import sys
import logging as log
from datetime import datetime as d
from Bio import Entrez
from time import sleep as pause
import pandas as pd
import subprocess
#------------------------------------------------------------------------------
# Set up the logger for logging
format1 = '%a %b %d %I:%M:%S %p %Y' # Used to add as a date
format2 = '%m-%d-%Y@%I:%M:%S-%p' # Used to append to archives
format3 = '%m-%d-%Y' # Use for Windows
log.basicConfig(
filename="blast_setup_" +
d.now().strftime(format3) +
".log",
level=log.INFO)
log.info("#------------------------------------------------------------------")
log.info("The script name is %s" % os.path.basename(__file__))
log.info("The date and time is currently %s" % str(d.now().strftime(format1)))
log.info("#------------------------------------------------------------------")
log.info("PART 1")
log.info("Download and update the refseq_rna blast database.")
#------------------------------------------------------------------------------
home = os.getcwd() # My home directory for this script/project
dbpath = 'databases/refseq_rna_db' # My current dbpath
# Create a directory for the database if one doesn't exist
try:
# If the directory exists,
if os.path.exists(dbpath) == True:
log.info("The %s directory exists & will be archived." % str(dbpath))
# Move any files that are in the directory to a dated archive folder.
# Moving a directory in linux/unix essentially renames it.
os.system(
'mv ' +
dbpath +
' /databases/refseqrnadb_archive_' +
d.now().strftime(format2))
os.mkdir(dbpath) # Recreate the database directory
log.info("The %s directory was created." % str(dbpath))
pass
else: # If the directory does not exist
os.mkdir(dbpath)
log.info("The %s directory was created." % str(dbpath))
except os.error:
log.info("Error.")
sys.exit("There has been an error.")
os.chdir(dbpath) # Change to the database directory
#------------------------------------------------------------------------------
# Connect to the NCBI ftp site
try:
ncbi = 'ftp.ncbi.nlm.nih.gov/'
blastdb = '/blast/db/' # Set variable for the blastdb subdirectory
ftp = FTP("ftp.ncbi.nlm.nih.gov", timeout=None)
# Login using email as password
ftp.login(user='anonymous', passwd='[email protected]')
log.info("Successful FTP login.")
except error_perm: # This error will be thrown if there's a connection issue.
log.info("FTP connection error.")
sys.exit()
# Change to the desired directory
ftp.cwd(blastdb)
# Use ftp.pwd() to find out the current directory
# Use ftp.retrlines('LIST') to get a list of all the files in the directory
# This is a list of the file names in the current directory
filenames = ftp.nlst()
#------------------------------------------------------------------------------
# Create a for loop that writes the list/text file of files wanted
with open('downloadlist.txt', 'w') as downloads:
for filename in filenames:
if fnmatch.fnmatch(filename, 'refseq_rna*'): # Get only those files.
refseq_file = os.path.join(filename)
# Write the url of each refseq_rna db file to a text file.
downloads.writelines(ncbi + blastdb + refseq_file + '\n')
# use elif here to get the taxdb.tar.gz file.
elif fnmatch.fnmatch(filename, 'taxdb*'):
taxdb_file = os.path.join(filename)
downloads.writelines(ncbi + blastdb + taxdb_file + '\n')
# Download the list of files using 'wget' on linux/unix
try:
cmd = 'cat downloadlist.txt | xargs -n 1 -P 8 wget'
status = subprocess.call([cmd], shell=True)
if status == 0:
log.info("The refseqrna blast db files have downloaded.")
# Unzip all of the files and remove unneccessary files
# Unzip the database files
os.system("for file in *.tar.gz; do tar xvf $file; done")
os.system("rm -r *.tar.gz")
log.info("The files have been unzipped, and Part 1 has finished.")
log.info(
"#------------------------------------------------------------------")
else:
log.info("Something went wrong.")
ftp.quit()
except os.error:
log.info("There has been an error.")
log.info(sys.exit())
log.info(ftp.quit())
ftp.close()
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# Mark start of PART 2 with logging information
log.info("#------------------------------------------------------------------")
log.info("PART 2")
log.info("Create a list of taxonomy ids based on the organisms list.")
#------------------------------------------------------------------------------
# Change to the home directory
os.chdir(home)
#------------------------------------------------------------------------------
# Open Orgnanisms.csv which is a comma delimited list of all species names to be used.
# Read .csv into pandas. If no columns for the data, set header as None or 0 (int)
# If the data has columns, simply just read in the csv without the header
# option
orgsfile = pd.read_csv('data/initial-data/organisms.csv', header=None)
# Create a list name/variable and use list()
orgs = list(orgsfile[0])
#------------------------------------------------------------------------------
# Get a list of taxonomy ids for a corresponding organisms list
# Always tell ncbi whom you are
Entrez.email = "[email protected]"
# Create a short variable for esearch & record reading
esearch = Entrez.esearch
read = Entrez.read
# Initialize a list
idlist = []
# Create a for loop to compile a list of taxonomy ids
for species in orgs:
# Create a handle for the search
searchresult = esearch(db="taxonomy", term=species)
# Read and print the record or results of the search
record = read(searchresult)
# Replace unwanted characters
idresult = record['IdList']
x = str(idresult)
x = x.replace("'", "")
x = x.replace("[", "")
x = x.replace("]", "")
# Append the id to the id list
idlist.append(x)
# Keep updated on how this is progressing
print("Taxonomy ID: " + x, "| Species Name: " + species)
# Make a short pause (2 seconds) as to not break the entrez rules
pause(2)
log.info("This is the id list: %s" % idlist)
if len(idlist) == len(orgs):
log.info('There are the same number of taxonomy ids as organisms.')
else:
log.info("Something is wrong.")
# Turn the list into a pandas dataframe and save it
df = pd.DataFrame(idlist, dtype=str)
df.to_csv('data/initial-data/taxids.csv', header=False, index=False)
log.info('Your file taxids.csv has been saved.')
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
# Mark start of PART 3 with logging information
log.info("#------------------------------------------------------------------")
log.info("PART 3")
log.info("Create a csv file with only human accessions and genes for downstream usage.")
#------------------------------------------------------------------------------
# Open the file with the list of all the genes, accessions, and tiers
mal = pd.read_csv('data/initial-data/master_accessions_list.csv')
# Create a list of human accessions from column
human_acc = list(mal.Homo_sapiens)
genes_list = list(mal.Gene) # Create a list of genes from the column
tiers_list = list(mal.Tier)
# Turn the lists into dataframes
accs = pd.DataFrame(human_acc, dtype=str)
genes = pd.DataFrame(genes_list, dtype=str)
tiers = pd.DataFrame(tiers_list, dtype=str)
# List of dataframes I want to combine
frames = [tiers, genes, accs]
# Merge the dataframes into 1 dataframe
human_acc_info = pd.concat(frames, axis=1)
# Save the dataframe as a csv file which will be used in AccCollect.py
human_acc_info.to_csv(
'data/initial-data/homo_sapiens_accessions.csv',
index=False,
header=None)
log.info("homo_sapiens_accessions.csv file was created.")
log.info("The script has finished.")
log.info("#------------------------------------------------------------------")
log.info("#------------------------------------------------------------------")
log.info("#------------------------------------------------------------------")
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------
#------------------------------------------------------------------------------