-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetBlastDB.py
103 lines (92 loc) · 4.01 KB
/
GetBlastDB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
This script 1) updates/downloads the refseq_rna blast db files,
2) creates a list of taxonomy ids based on the list of organisms, and 3) create
a csv file with only human accessions and genes for downstream usage. Check the
ReadMe file for a first time setup of a blast database. Both steps occur on the
head node of the MCSR because they need internet access.
"""
from ftplib import FTP, error_perm
import os
import fnmatch
import sys
import logging as log
from datetime import datetime as d
import subprocess
#------------------------------------------------------------------------------
# Set up the logger for logging
format1 = '%a %b %d %I:%M:%S %p %Y' # Used to add as a date
format2 = '%m-%d-%Y@%I:%M:%S-%p' # Used to append to archives
format3 = '%m-%d-%Y'
#------------------------------------------------------------------------------
home = os.getcwd() # My home directory for this script/project
dbpath = '/work5/r2295/bin/databases/refseq_rna_db' # My current dbpath
# Create a directory for the database if one doesn't exist
try:
# If the directory exists,
if os.path.exists(dbpath) == True:
# Move any files that are in the directory to a dated archive folder.
# Moving a directory in linux/unix essentially renames it.
os.system(
'mv ' +
dbpath +
' /work5/r2295/bin/databases/refseqrnadb_archive_' +
d.now().strftime(format2))
os.mkdir(dbpath) # Recreate the database directory
else: # If the directory does not exist
os.mkdir(dbpath)
log.info("The %s directory was created." % str(dbpath))
except os.error:
log.info("Error.")
sys.exit("There has been an error.")
os.chdir(dbpath) # Change to the database directory
#------------------------------------------------------------------------------
# Connect to the NCBI ftp site
try:
ncbi = 'ftp.ncbi.nlm.nih.gov/'
blastdb = '/blast/db/' # Set variable for the blastdb subdirectory
ftp = FTP("ftp.ncbi.nlm.nih.gov", timeout=None)
# Login using email as password
ftp.login(user='anonymous', passwd='')
log.info("Successful FTP login.")
except error_perm: # This error will be thrown if there's a connection issue.
log.info("FTP connection error.")
sys.exit()
# Change to the desired directory
ftp.cwd(blastdb)
# Use ftp.pwd() to find out the current directory
# Use ftp.retrlines('LIST') to get a list of all the files in the directory
# This is a list of the file names in the current directory
filenames = ftp.nlst()
#------------------------------------------------------------------------------
# Create a for loop that writes the list/text file of files wanted
with open('downloadlist.txt', 'w') as downloads:
for filename in filenames:
if fnmatch.fnmatch(filename, 'refseq_rna*'): # Get only those files.
refseq_file = os.path.join(filename)
# Write the url of each refseq_rna db file to a text file.
downloads.writelines(ncbi + blastdb + refseq_file + '\n')
# use elif here to get the taxdb.tar.gz file.
elif fnmatch.fnmatch(filename, 'taxdb*'):
taxdb_file = os.path.join(filename)
downloads.writelines(ncbi + blastdb + taxdb_file + '\n')
# Download the list of files using 'wget' on linux/unix
try:
cmd = 'cat downloadlist.txt | xargs -n 1 -P 8 wget'
status = subprocess.call([cmd], shell=True)
if status == 0:
log.info("The refseqrna blast db files have downloaded.")
else:
log.info("Something went wrong.")
ftp.quit()
except os.error:
log.info("There has been an error.")
log.info(sys.exit())
log.info(ftp.quit())
ftp.close()
#------------------------------------------------------------------------------
# Unzip all of the files and remove unneccessary files
# Unzip the database files
os.system("for file in *.tar.gz; do tar xvf $file; done")
os.system("rm -r *.tar.gz")
log.info("The files have been unzipped, and Part 1 has finished.")
log.info("#------------------------------------------------------------------")