-
Notifications
You must be signed in to change notification settings - Fork 0
/
mnovautils.py
274 lines (227 loc) · 9.09 KB
/
mnovautils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
import json
import pathlib
import pandas as pd
from shutil import copyfile
import datetime
from qtutils import warning_dialog
def make_excel_backup(fn_excel: pathlib.Path) -> bool:
    """Copy *fn_excel* into an ``excel_backup`` sibling directory.

    The copy is timestamped (``<stem>_DDMonYYYY_HHMMSS.xlsx``) so repeated
    backups never overwrite each other.

    Args:
        fn_excel: Path of the Excel file to back up.

    Returns:
        True if the file existed and was copied, False if it does not exist.
    """
    if not fn_excel.exists():
        return False
    timestamp = datetime.datetime.now().strftime("%d%b%Y_%H%M%S")
    fn_excel_bckup = (
        fn_excel.parent / "excel_backup" / f"{fn_excel.stem}_{timestamp}.xlsx"
    )
    # make sure the backup directory exists before copying
    fn_excel_bckup.parent.mkdir(parents=True, exist_ok=True)
    # copyfile raises on failure, so reaching the return means success;
    # the original "len(str(ret)) > 0" dance was always True here
    copyfile(fn_excel, fn_excel_bckup)
    return True
def return_nonempty_mnova_datasets(data: dict) -> dict:
    """Drop datasets whose multiplet, peak and integral counts are all zero.

    Non-dict values (e.g. a SMILES string entry) are kept unconditionally.

    Args:
        data: mapping of dataset name -> dataset dict (or plain value).

    Returns:
        A new dict containing only non-empty datasets and non-dict entries.
    """
    kept = {}
    for name, entry in data.items():
        if not isinstance(entry, dict):
            # must be a smiles string; maybe needs an explicit check some day
            kept[name] = entry
            continue
        counts = (
            entry["multiplets"]["count"],
            entry["peaks"]["count"],
            entry["integrals"]["count"],
        )
        if any(c > 0 for c in counts):
            kept[name] = entry
    return kept
def add_technique(key: str, technique_keys: dict, technique_counts: dict, expt: str):
    """Map the raw dataset *key* to the technique name *expt*.

    The first dataset of a technique gets the bare name (e.g. "HSQC");
    subsequent ones get a numeric suffix ("HSQC_1", "HSQC_2", ...).

    Args:
        key: original key of the dataset in the JSON data.
        technique_keys: mapping of original key -> assigned technique name (mutated).
        technique_counts: per-technique duplicate counter (mutated).
        expt: canonical technique name, e.g. "HSQC".
    """
    seen_before = expt in technique_keys.values()
    if seen_before:
        technique_counts[expt] += 1
        technique_keys[key] = f"{expt}_{technique_counts[expt]}"
    else:
        technique_keys[key] = expt
def read_in_mesrenova_json(fn: pathlib.Path) -> dict:
    """Read the JSON file exported from MestReNova and canonicalize its keys.

    Empty datasets are dropped, then each remaining dataset is classified by
    its "subtype"/"type" fields and renamed to a canonical technique name
    (HSQC, HMBC, COSY, C13_1D, H1_1D, NOESY, H1_pureshift), with numeric
    suffixes for duplicates.

    Args:
        fn: path of the MestReNova JSON export.

    Returns:
        The dataset dict with technique-named keys.
    """
    with open(fn, "r") as file:
        data_orig = json.load(file)

    data = return_nonempty_mnova_datasets(data_orig)

    # Identify the technique keys present in the JSON data
    technique_keys = {}
    technique_counts = {
        "HSQC": 0,
        "HMBC": 0,
        "COSY": 0,
        "C13_1D": 0,
        "H1_1D": 0,
        "NOESY": 0,
        "H1_pureshift": 0,
    }
    for key in data:
        if not isinstance(data[key], dict):
            continue
        # hoist the repeated lookups; all comparisons are case-insensitive
        subtype = data[key].get("subtype", "").lower()
        dim = data[key].get("type", "").lower()
        pulse_sequence = data[key].get("pulsesequence", "").lower()
        if "hsqc" in subtype and dim == "2d":
            add_technique(key, technique_keys, technique_counts, "HSQC")
        elif "hmbc" in subtype and dim == "2d":
            add_technique(key, technique_keys, technique_counts, "HMBC")
        elif "cosy" in subtype and dim == "2d":
            add_technique(key, technique_keys, technique_counts, "COSY")
        elif "13c" in subtype and dim == "1d":
            add_technique(key, technique_keys, technique_counts, "C13_1D")
        elif "1h" in subtype and dim == "1d" and "psyche" in pulse_sequence:
            # PSYCHE pulse sequence marks a pure-shift 1H experiment;
            # must be tested before the plain 1H 1D branch below
            add_technique(key, technique_keys, technique_counts, "H1_pureshift")
        elif "1h" in subtype and dim == "1d":
            add_technique(key, technique_keys, technique_counts, "H1_1D")
        elif "1h" in subtype and dim == "2d":
            # assumes any remaining 1H 2D experiment is a NOESY — TODO confirm
            add_technique(key, technique_keys, technique_counts, "NOESY")

    # Rename datasets to their technique names. pop() guarded by the
    # inequality avoids deleting an entry whose original key already
    # equals its technique name (the old "assign then del" lost it).
    for orig_key, new_key in technique_keys.items():
        if new_key != orig_key:
            data[new_key] = data.pop(orig_key)
    return data
def get_2D_dataframe_from_json(json_data: dict, technique: str) -> pd.DataFrame:
    """Build a peak-list dataframe for a 2D *technique* from the JSON data.

    Args:
        json_data: parsed MestReNova export.
        technique: key of the 2D experiment (e.g. "HSQC").

    Returns:
        DataFrame with columns ["f2 (ppm)", "f1 (ppm)", "Intensity", "Type"],
        sorted by f2 descending, 1-based index.
    """
    peaks = json_data[technique]["peaks"]
    rows = [
        [
            peaks[str(idx)]["delta2"],
            peaks[str(idx)]["delta1"],
            peaks[str(idx)]["intensity"],
            peaks[str(idx)]["type"],
        ]
        for idx in range(peaks["count"])
    ]
    df = pd.DataFrame(rows, columns=["f2 (ppm)", "f1 (ppm)", "Intensity", "Type"])
    # descending f2 order with a fresh index starting at 1
    df = df.sort_values(by=["f2 (ppm)"], ascending=False).reset_index(drop=True)
    df.index += 1
    return df
def get_1d_dataframe_from_json(json_data: dict, technique: str) -> pd.DataFrame:
    """Build a 1D dataframe for *technique* from the JSON data.

    If the experiment has no multiplets, the peak list is used and the result
    has columns ["ppm", "Intensity", "Type"]; otherwise the multiplet list is
    used, giving ["ppm", "Integral", "H's", "Class", "J's"] with the integral
    normalized by the multiplets' "normValue".

    Args:
        json_data: parsed MestReNova export.
        technique: key of the 1D experiment (e.g. "H1_1D").

    Returns:
        DataFrame sorted by ppm descending, 1-based index.
    """
    df_data = []
    if json_data[technique]["multiplets"]["count"] == 0:
        # no multiplet analysis present: fall back to the raw peak list
        peaks = json_data[technique]["peaks"]
        for i in range(peaks["count"]):
            key = str(i)
            if key in peaks:
                df_data.append(
                    [
                        peaks[key]["delta1"],
                        peaks[key]["intensity"],
                        peaks[key]["type"],
                    ]
                )
        df = pd.DataFrame(df_data, columns=["ppm", "Intensity", "Type"])
    else:
        # use the multiplet analysis: shift, integral, nH, class and J values
        multiplets = json_data[technique]["multiplets"]
        count = multiplets["count"]
        normValue = multiplets["normValue"]
        for key in map(str, range(count)):
            if key in multiplets:
                m = multiplets[key]
                # J couplings rendered as a comma-separated string,
                # 3 significant digits each
                j_string = ", ".join(f"{j:1.3}" for j in m["jvals"])
                df_data.append(
                    [
                        m["delta1"],
                        m["integralValue"],
                        m["nH"],
                        m["category"],
                        j_string,
                    ]
                )
        df = pd.DataFrame(df_data, columns=["ppm", "Integral", "H's", "Class", "J's"])
        df["Integral"] = df["Integral"] / normValue
    # sort by ppm descending, reset the index and start it at 1
    df = df.sort_values(by=["ppm"], ascending=False).reset_index(drop=True)
    df.index += 1
    return df
def create_dataframes_from_mresnova_json(data: dict) -> dict:
    """Build a dataframe per recognized technique in *data*.

    2D experiments go through get_2D_dataframe_from_json, 1D experiments
    through get_1d_dataframe_from_json; a "smiles" entry becomes a one-row
    "molecule" dataframe.

    Args:
        data: dataset dict keyed by canonical technique names.

    Returns:
        Dict mapping technique name (or "molecule") -> DataFrame.
    """
    known_techniques = (
        "H1_1D",
        "C13_1D",
        "HSQC",
        "HMBC",
        "HSQC_CH",
        "COSY",
        "NOESY",
        "H1_pureshift",
    )
    dataframes = {}
    for name, dataset in data.items():
        if name in known_techniques:
            dim = dataset["type"].lower()
            if dim == "2d":
                dataframes[name] = get_2D_dataframe_from_json(data, name)
            elif dim == "1d":
                dataframes[name] = get_1d_dataframe_from_json(data, name)
        elif name == "smiles":
            dataframes["molecule"] = pd.DataFrame([data["smiles"]], columns=["smiles"])
    return dataframes
def write_excel_file_from_mresnova_df_dict(
    df_frames: dict, excel_path: pathlib.Path, qtstarted: bool = False
) -> bool:
    """Write each dataframe in *df_frames* to its own sheet of an Excel file.

    Args:
        df_frames: mapping of sheet name -> DataFrame.
        excel_path: destination .xlsx path; its parent directory must exist.
        qtstarted: forwarded to warning_dialog so it knows whether a Qt
            application is running.

    Returns:
        True on success, False if the parent directory is missing or the
        write raised.
    """
    # check if path is valid
    if not excel_path.parent.exists():
        return False
    try:
        with pd.ExcelWriter(excel_path) as writer:
            for sheet_name, df in df_frames.items():
                df.to_excel(writer, sheet_name=sheet_name)
    except Exception:
        # narrowed from a bare "except:" so KeyboardInterrupt/SystemExit
        # are no longer swallowed; user is warned via dialog instead
        warning_dialog(
            f"Exception occurred attempting to write excel file\n{str(excel_path)}",
            "Exception occurred attempting to write excel file",
            qtstarted,
        )
        return False
    return True
def check_for_multiple_HSQC_expts(data: dict) -> dict:
    """Check for multiple HSQC experiments and rename them.

    When exactly two HSQC-named datasets exist, the one with fewer peaks is
    assumed to be the multiplicity-edited (CH-only) experiment and is renamed
    to "HSQC_CH"; the one with more peaks keeps/receives the name "HSQC".

    Args:
        data: dataset dict keyed by technique names (mutated in place).

    Returns:
        The same dict, with at most one "HSQC" and one "HSQC_CH" entry.
    """
    hsqc_expts = [e for e in data.keys() if e.find("HSQC") != -1]
    # (debug print of hsqc_expts removed)
    if len(hsqc_expts) == 2:
        # compare the number of peaks in each experiment
        if (
            data[hsqc_expts[0]]["peaks"]["count"]
            >= data[hsqc_expts[1]]["peaks"]["count"]
        ):
            # first has more peaks: the 2nd experiment becomes HSQC_CH
            data["HSQC_CH"] = data.pop(hsqc_expts[1])
        else:
            # second has more peaks: 1st becomes HSQC_CH, 2nd becomes HSQC
            data["HSQC_CH"] = data.pop(hsqc_expts[0])
            data["HSQC"] = data.pop(hsqc_expts[1])
    return data
if __name__ == "__main__":
    # Library module: no command-line behavior defined.
    pass