Skip to content

Commit

Permalink
Merge pull request #250 from jodyphelan/dev
Browse files Browse the repository at this point in the history
Dev
  • Loading branch information
jodyphelan committed Oct 25, 2022
2 parents 21793b5 + 858e0d4 commit 988d83f
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 49 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ cat results/ERR1664619.results.txt

#### Whole genome analysis

By default, `tb-profiler` only analyses drug resistance candidate genes. It is also possible to perform variant calling across the whole genome using the `--call_whole_genome` argument. When this is enabled it is also possible to compare your sample to previous runs to find those which are close to your new sample. Close samples are found using a SNP distance cutoff which is enabled using the `--snp_dist` argument. Close samples are stored in the json output and are also found in the text report. Additionally a neighbour joining tree can be built using the `--nj` argument. Please note that the `--snp_dist` and `--nj` functions are experimental and could produce unexpected results.
By default, `tb-profiler` only analyses drug resistance candidate genes. It is also possible to perform variant calling across the whole genome using the `--call_whole_genome` argument. When this is enabled it is also possible to compare your sample to previous runs to find those which are close to your new sample. Close samples are found using a SNP distance cutoff which is enabled using the `--snp_dist` argument. Close samples are stored in the json output and are also found in the text report. Please note that the `--snp_dist` function is experimental and could produce unexpected results.

#### Spoligotyping

Expand Down
2 changes: 1 addition & 1 deletion tb-profiler
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ algorithm.add_argument('--suspect',action="store_true",help="Use the suspect sui
algorithm.add_argument('--spoligotype',action="store_true",help="Perform in-silico spoligotyping (experimental feature)")
algorithm.add_argument('--call_whole_genome',action="store_true",help="Call variant across the whole genome")
algorithm.add_argument('--snp_dist',type=int,help="Store variant set and get all samples with snp distance less than this cutoff (experimental feature)")
algorithm.add_argument('--nj',action="store_true",help="Make an nj tree from snp distances (experimental feature)")
# algorithm.add_argument('--nj',action="store_true",help="Make an nj tree from snp distances (experimental feature)")
algorithm.add_argument('--no_trim',action="store_true",help="Don't trim files using trimmomatic")
algorithm.add_argument('--no_flagstat',action="store_true",help="Don't collect flagstats")
algorithm.add_argument('--no_clip',action="store_false",help="Don't clip reads")
Expand Down
17 changes: 10 additions & 7 deletions tbprofiler/output.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,19 @@ def write_outputs(args,results,template_file = None):
args.add_columns = None
extra_columns = [x.lower() for x in args.add_columns.split(",")] if args.add_columns else []
results["timestamp"] = datetime.now().strftime("%d-%m-%Y %H:%M:%S")

# if args.nj:
# # nj_tree = tbp.make_nj_tree(args,results)
# if "_tree" in results and results["_tree"]!=None:
# nj_tree = results['_tree']
# results['tree'] = nj_tree.ascii_art()
# infolog(f"Writing tree file: {tree_output}")
# nj_tree.write(tree_output)
# del results["_tree"]
infolog(f"Writing json file: {json_output}")
json.dump(results,open(json_output,"w"))
# extra_columns = [x.lower() for x in args.add_columns.split(",")] if args.add_columns else []

if args.nj:
nj_tree = tbp.make_nj_tree(args,results)
if nj_tree:
results['tree'] = nj_tree.ascii_art()
infolog(f"Writing tree file: {tree_output}")
nj_tree.write(tree_output)

if "pdf" in vars(args) and args.pdf:
infolog(f"Writing pdf file: {pdf_output}")
Expand All @@ -40,4 +43,4 @@ def write_outputs(args,results,template_file = None):
if args.csv:
infolog(f"Writing csv file: {csv_output}")
write_text(results,args.conf,csv_output,extra_columns,reporting_af=args.reporting_af,sep=",",template_file = template_file,use_suspect=args.suspect)

94 changes: 54 additions & 40 deletions tbprofiler/snp_dists.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import quickle
from pathogenprofiler import cmd_out,debug,infolog
from pathogenprofiler import cmd_out,debug,infolog,errlog
import os
import json
from .output import write_outputs
Expand Down Expand Up @@ -58,8 +58,16 @@ def get_close_samples(self,dir,cutoff=20):
})
return [x for x in self.sample_dists if x["distance"]<=cutoff]

def read_json(filename):
debug("Reading %s" % filename)
lock = filelock.FileLock(filename + ".lock")
with lock:
data = json.load(open(filename))
debug("Finished reading %s" % filename)
return data

def sample_in_json(sample,result_file):
results = json.load(open(result_file))
results = read_json(result_file)
if sample in [d['sample'] for d in results['close_samples']]:
return True
else:
Expand All @@ -80,6 +88,8 @@ def run_snp_dists(args,results):
dt = int(t2-t1)
infolog(f"\nFound {len(results['close_samples'])} close samples in {dt} seconds")
infolog("-------------------------------------------------------------\n")
# if args.nj:
# results['_tree'] = make_nj_tree(args,results)

def update_neighbour_snp_dist_output(args,results):
for s in results['close_samples']:
Expand All @@ -88,54 +98,58 @@ def update_neighbour_snp_dist_output(args,results):
if not sample_in_json(args.prefix,f):
lock = filelock.FileLock(f + ".lock")
with lock:
debug("Acquiring lock for %s" % f)
data = json.load(open(f))
data['close_samples'].append({
"sample":args.prefix,
"distance":s['distance']
})
temp_args = copy(args)
temp_args.prefix = s['sample']
# if args.nj:
# results['_tree'] = make_nj_tree(temp_args,data)
write_outputs(temp_args,data,template_file=args.output_template)
debug("Finished with lock for %s" % f)

def make_nj_tree(args,results):
# def make_nj_tree(args,results):

neighbours = [s['sample'] for s in results['close_samples']]
if len(neighbours)<2:
return None
all_samps = neighbours + [results['id']]
if len(neighbours)==0: return
dist_dict = {}
for d in results['close_samples']:
dist_dict[(results['id'],d['sample'])] = d['distance']
dist_dict[(d['sample'],results['id'])] = d['distance']
for si in neighbours:
data = json.load(open(os.path.join(args.dir,"results",f"{si}.results.json")))
for d in data['close_samples']:
sj = d['sample']
if sj not in all_samps: continue
dist_dict[(si,sj)] = d['distance']
dist_dict[(sj,si)] = d['distance']
dists = []
# neighbours = [s['sample'] for s in results['close_samples']]
# if len(neighbours)<2:
# return None
# all_samps = neighbours + [results['id']]
# if len(neighbours)==0: return
# dist_dict = {}
# for d in results['close_samples']:
# dist_dict[(results['id'],d['sample'])] = d['distance']
# dist_dict[(d['sample'],results['id'])] = d['distance']
# for si in neighbours:
# data = read_json(os.path.join(args.dir,"results",f"{si}.results.json"))
# for d in data['close_samples']:
# sj = d['sample']
# if sj not in all_samps: continue
# dist_dict[(si,sj)] = d['distance']
# dist_dict[(sj,si)] = d['distance']
# dists = []

for si in all_samps:
row = []
for sj in all_samps:
if si==sj:
row.append(0)
elif (si,sj) in dist_dict:
row.append(dist_dict[(si,sj)])
else:
si_set_file = os.path.join(args.dir,"results",f"{si}.non_ref.qkl")
sj_set_file = os.path.join(args.dir,"results",f"{sj}.non_ref.qkl")
si_set = variant_set(si_set_file)
d = si_set.get_snp_dist(sj_set_file)
row.append(d)
dists.append(row)
from skbio import DistanceMatrix
from skbio.tree import nj
dm = DistanceMatrix(dists, all_samps)
tree = nj(dm)
tree = tree.root_at_midpoint()
return tree
# for si in all_samps:
# row = []
# for sj in all_samps:
# if si==sj:
# row.append(0)
# elif (si,sj) in dist_dict:
# row.append(dist_dict[(si,sj)])
# else:
# si_set_file = os.path.join(args.dir,"results",f"{si}.non_ref.qkl")
# sj_set_file = os.path.join(args.dir,"results",f"{sj}.non_ref.qkl")
# si_set = variant_set(si_set_file)
# d = si_set.get_snp_dist(sj_set_file)
# row.append(d)
# dists.append(row)
# from skbio import DistanceMatrix
# from skbio.tree import nj
# dm = DistanceMatrix(dists, all_samps)
# tree = nj(dm)
# tree = tree.root_at_midpoint()
# return tree


0 comments on commit 988d83f

Please sign in to comment.