Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More robustly handle sample names in indel VCF headers #32

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 47 additions & 39 deletions deTiN/deTiN_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,16 +513,56 @@ def read_indel_vcf(vcf,seg_table,indel_type):
with open(vcf) as f:
content = f.readlines()

# if the sample names were not present in the header (this VCF is technically
# malformed), then fallback to generic "normal" and "tumor"
normal_sample = "normal"
tumor_sample = "tumor"
cols_type = {0: str}
for line in content:
# if we can get the sample names from the header, use them instead
if line[0:15] == '##normal_sample':
normal_sample = line.split('=')[1][0:-1]
if line[0:14] == '##tumor_sample':
tumor_sample = line.split('=')[1][0:-1]
if line[0] == '#' and line[1] != '#':
headerline = line.split('\t')
headerline = line.rstrip().split('\t')
break

indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
indel_table.rename(columns = {
0: 'contig',
1: 'position',
2: 'ID',
3: 'REF',
4: 'ALT',
5: 'QUAL',
6: 'filter',
7: 'INFO',
8: 'format',
9: headerline[9],
10: headerline[10]
}, inplace = True)

# determine which columns corresponds to the tumor/normal
# if the sample names don't match the column names, we assume it's because
# we set it to generic "tumor"/"normal" (lowercase) but the column name is
# capitalized. if this assumption isn't satisfied, then we fail.
try:
t_samp_ix = indel_table.columns.get_loc(tumor_sample)
except KeyError:
if "TUMOR" in indel_table.columns:
t_samp_ix = indel_table.columns.get_loc("TUMOR")
else:
raise KeyError("Could not infer which VCF column corresponds to the tumor!")
try:
n_samp_ix = indel_table.columns.get_loc(normal_sample)
except KeyError:
if "NORMAL" in indel_table.columns:
n_samp_ix = indel_table.columns.get_loc("NORMAL")
else:
raise KeyError("Could not infer which VCF column corresponds to the normal!")

if indel_type.lower() == 'strelka':
indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO', 8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()},
inplace=True)
counts_format = indel_table['format'][0].split(':')
depth_ix = counts_format.index('DP')
alt_indel_ix = counts_format.index('TIR')
Expand All @@ -531,44 +571,12 @@ def read_indel_vcf(vcf,seg_table,indel_type):
indel_table.reset_index(inplace=True, drop=True)

elif indel_type.lower() == 'mutect2':
indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT TUMOR NORMAL
normal_sample = 'normal'
tumor_sample = 'tumor'
for line in content:
if line[0:15] == '##normal_sample':
normal_sample = line.split('=')[1][0:-1]
if line[0:14] == '##tumor_sample':
tumor_sample = line.split('=')[1][0:-1]
if tumor_sample == 'tumor' and normal_sample == 'normal':
indel_table.rename(
columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
6: 'filter', 9: 'tumor', 10: 'normal'},
inplace=True)
else:
if tumor_sample == headerline[9]:
indel_table.rename(
columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
6: 'filter', 9: 'tumor', 10: 'normal'},
inplace=True)
elif tumor_sample == headerline[10][0:-1]:
indel_table.rename(
columns={0: 'contig', 1: 'position', 2: 'ID', 3: 'REF', 4: 'ALT', 5: 'QUAL', 7: 'INFO', 8: 'format',
6: 'filter', 9: 'normal', 10: 'tumor'},
inplace=True)
else:
print('failed to read MuTect 2 indels VCF')
sys.exit()
counts_format = indel_table['format'][0].split(':')
depth_ix = counts_format.index('AD')
indel_table = indel_table[np.isfinite(is_member(indel_table['filter'], ['PASS', 'alt_allele_in_normal','artifact_in_normal']))]
indel_table.reset_index(inplace=True, drop=True)

elif indel_type.lower() == 'sanger':
indel_table = pd.read_csv(vcf, sep='\t', comment='#', header=None, low_memory=False, dtype=cols_type)
# CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOUR
indel_table.rename(columns={0: 'contig', 1: 'position',2:'ID',3:'REF',4:'ALT',5:'QUAL',7:'INFO',8: 'format', 6: 'filter', 9: headerline[9].lower(), 10: headerline[10][0:-1].lower()},
inplace=True)
b1 = np.logical_or.reduce([indel_table['filter'] == 'F012', indel_table['filter'] == 'F012;F015'])
b2 = np.logical_or.reduce([indel_table['filter'] == 'PASS', indel_table['filter'] == 'F015'])
indel_table = indel_table[np.logical_or.reduce([b1, b2])]
Expand All @@ -588,8 +596,8 @@ def read_indel_vcf(vcf,seg_table,indel_type):
t_ref_count = np.zeros([len(indel_table), 1])

for index, row in indel_table.iterrows():
spl_n = row['normal'].split(':')
spl_t = row['tumor'].split(':')
spl_n = row.iloc[n_samp_ix].split(':')
spl_t = row.iloc[t_samp_ix].split(':')
if indel_type.lower() == 'strelka':
n_depth[index] = int(spl_n[depth_ix])
n_alt_count[index] = int(spl_n[alt_indel_ix].split(',')[0])
Expand All @@ -612,7 +620,7 @@ def read_indel_vcf(vcf,seg_table,indel_type):
t_alt_count[index] = np.sum([int(spl_t[i]) for i in alt_count_idx])
t_ref_count[index] = t_depth[index] - t_alt_count[index]
if len(indel_table) == 0:
indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9].lower(), headerline[10][0:-1].lower(),
indel_table = pd.DataFrame(index=[0],columns=['contig', 'position','ID','REF','ALT','QUAL','INFO','format', 'filter',headerline[9], headerline[10],
't_depth','t_alt_count','t_ref_count','n_alt_count','n_depth','n_ref_count','tau','f_acs','Chromosome','genomic_coord_x'])
else:
indel_table['t_depth'] = t_alt_count + t_ref_count
Expand Down