forked from zaritskylab/SLGC-tissue-similarity-and-grading
-
Notifications
You must be signed in to change notification settings - Fork 0
/
chip_types_data_analysis.py
530 lines (479 loc) · 17.8 KB
/
chip_types_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
"""Chip types data analysis
The script should be ran and will read imzML files, process data and create
relevant plot such as feature counts per Chip types.
"""
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
from typing import Dict
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib.patches import Patch
from pyimzml.ImzMLParser import ImzMLParser
from processing import process
from correlation import correlation_analysis
from utils import read_msi
from liver_data_analysis import msi_mean_num_features
def plot_msi_segmentation(
metadata_df: pd.DataFrame, masks, save_path: Path
) -> None:
"""Plots and saves an image that represents the segmentation mask of each msi.
Args:
metadata (pd.DataFrame): Data frame of metadata.
masks (Dict[str, np.ndarray]): A dictionary containing masks for each
sample.
save_path (Path): Path where the correlation plot will be saved.
"""
# Determine the layout of the subplot grid based on the data
grouped = metadata_df.groupby("sample_number")
# Determine the layout of the subplot grid based on the data
max_rows = grouped.size().max()
num_groups = len(grouped)
# Creating a grid of subplots
_, axs = plt.subplots(
max_rows, num_groups, figsize=(num_groups * 2, max_rows * 2)
)
# Ensuring axs is always a 2D array for consistency in indexing
if num_groups == 1:
axs = np.expand_dims(axs, axis=-1)
if max_rows == 1:
axs = np.expand_dims(axs, axis=0)
# Plotting each mask image in the appropriate subplot
for group_index, (_, group) in enumerate(grouped):
for image_index, (_, row) in enumerate(group.iterrows()):
# Determine the subplot index
subplot_index = image_index if group_index == 0 else image_index + 1
# Access the subplot axis
ax = axs[subplot_index, group_index]
try:
# Get the mask image
img = masks[row.sample_file_name]
# Display the image
ax.imshow(img, cmap="gray_r")
# Customizing the ticks for better readability
ax.set_xticks([0, img.shape[1] - 1])
ax.set_yticks([0, img.shape[0] - 1])
ax.tick_params(axis='both', length=0)
# Customizing tick labels
for yticklabel in ax.get_yticklabels():
yticklabel.set_fontweight('bold')
yticklabel.set_fontsize(14)
yticklabel.set_color('0.2')
for xticklabel in ax.get_xticklabels():
xticklabel.set_fontweight('bold')
xticklabel.set_fontsize(14)
xticklabel.set_color('0.2')
# Customizing the spines for aesthetics
for axis in ['bottom', 'left']:
ax.spines[axis].set_linewidth(2.5)
ax.spines[axis].set_color('0.2')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
except FileNotFoundError:
# Hide the axis if the file is not found and print a warning
ax.axis('off')
print(f"File not found: {row['ImagePath']}")
# Turning off unused subplots to maintain a clean grid
for ax in axs[len(group) + (0 if group_index == 0 else 1):, group_index]:
ax.axis('off')
# Specifically turning off the skipped subplot for non-first groups
if group_index != 0:
axs[0, group_index].axis('off')
# Add space between subplots
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("segmentations.png"), bbox_inches='tight', dpi=1200,
transparent=True
)
# Show the plot
plt.show()
def plot_spectras_corr(processed_path: str, output_path: str) -> None:
"""Plots a correlation matrix for the chip types and saves the figure.
Args:
processed_path (str): Path to processed continuos imzML files.
output_path (str): Path to save correlation matrices.
"""
# Apply correlation analysis
correlation_analysis(processed_path, output_path)
# Load the correlation matrix from the file
corr_matrix = pd.read_csv(
output_path / 'meaningful-signal_tissue_tissue.csv', index_col=0
)
# Filter for 'replica' (-r) and 'sections' (-s) spectras
replica_columns = [col for col in corr_matrix.columns if col.endswith('-r')]
section_rows = corr_matrix.index[corr_matrix.index.str.endswith('-s')]
# Create a new correlation matrix with the specified conditions
filtered_corr_matrix = corr_matrix.loc[section_rows, replica_columns]
# Generate the heatmap
_, ax = plt.subplots(1, 1, figsize=(15, 5), tight_layout=True)
# Plot correlation matrix
ax = sns.heatmap(
filtered_corr_matrix, annot=True, cbar=False, cmap="YlGn", fmt=".2f",
vmin=-1, vmax=1, linewidth=2, linecolor='w', square=True, ax=ax,
annot_kws={
"fontsize": 14,
"fontweight": 'bold',
'color': '0.2'
}
)
# Rename x and y ticks labels
x_ticks_labels = [
label.replace('_', '\n').replace("-r", "") for label in replica_columns
]
y_ticks_labels = []
for label in section_rows:
tick_label = label[0]
try:
tick_label += "_" + str(int(label.replace("-s", "")[-1]))
except:
pass
y_ticks_labels.append(tick_label)
# Set x and y ticks labels
ax.set_xticklabels(x_ticks_labels, rotation=0, ha='center')
ax.set_yticklabels(y_ticks_labels, va='center')
# Customize the ticks
for yticklabel in ax.get_yticklabels():
yticklabel
yticklabel.set_fontweight('bold')
yticklabel.set_fontsize(14)
yticklabel.set_color('0.2')
for xticklabel in ax.get_xticklabels():
xticklabel.set_fontweight('bold')
xticklabel.set_fontsize(14)
xticklabel.set_color('0.2')
# Remove ticks line
ax.tick_params(axis='both', length=0)
# Customize the axis labels
ax.set_ylabel("Tissue", fontsize=14, weight='bold', color='0.2')
ax.set_xlabel("Chips", fontsize=14, weight='bold', color='0.2')
# Add space between subplots
plt.tight_layout()
# Save the plot
plt.savefig(
output_path.joinpath("correlations.png"), bbox_inches='tight', dpi=1200,
transparent=True
)
# Show plot
plt.show()
def get_sample_type(sample_file_name) -> str:
"""Retrieves the sample type given its sample_file_name
Args:
sample_file_name (str): sample file name from metadata.
Returns:
str: sample type.
"""
vals = sample_file_name.split("_")[1:]
vals[-1] = vals[-1].split("-")[0]
try:
int(vals[-1])
vals.pop()
except:
pass
return "_".join(vals)
def num_features_df(
parsers: Dict[str, ImzMLParser], masks: Dict[str, np.ndarray],
percentages: np.ndarray
) -> pd.DataFrame:
"""Creates a dataframe containing the number of features in each percentage
threshold for each parser.
Args:
parser (Dict[str, ImzMLParser]): A dictionary containing ImzMLParser
objects with keys as sample names.
masks (Dict[str, np.ndarray]): A dictionary containing masks for each
sample.
save_path (np.ndarray): Array of percentages to calculate features.
"""
# Dictionary to store number of features for each sample
num_features = {}
# Loop through each parser and calculate number of features
for name, p in parsers.items():
name_num_features = []
for percentage in percentages:
name_num_features.append(
msi_mean_num_features(p, masks[name], percentage)
)
num_features[name] = name_num_features
# Creating a DataFrame from the dictionary
df = pd.DataFrame.from_dict(num_features)
# Transposing the DataFrame and setting the columns
df_transposed = df.T
df_transposed.columns = percentages
# Resetting the index to make the original column names a regular column
df_reset = df_transposed.reset_index()
# Applying the function to extract the type
types = [get_sample_type(name) for name in df.columns]
df_reset['Type'] = types
# Melting the DataFrame for a suitable format
df_melted = df_reset.melt(
id_vars='Type', var_name='Peak Percentage',
value_name='Number of Features'
)
# Removing non numeric values
df_melted['Peak Percentage'] = pd.to_numeric(
df_melted['Peak Percentage'], errors='coerce'
)
df_melted = df_melted.dropna(subset=['Peak Percentage'])
return df_melted
def plot_num_features_thresholds(num_features: pd.DataFrame, save_path: Path):
"""Plots and saves a line plot comparing the number of features between chip
types in each percentage threshold.
Args:
num_features_df (pd.DataFrame): dataframe containing the number of features
in each percentage threshold for each parser.
save_path (Path): Path object where the output image will be saved.
"""
# Creating the line plot with the corrected x-ticks
plt.figure(figsize=(7, 7))
ax = sns.lineplot(
data=num_features, x='Peak Percentage', y='Number of Features',
hue='Type', linewidth=2
)
plt.xticks(sorted(num_features['Peak Percentage'].unique()))
# Customize the spines
for axis in ['bottom', 'left']:
ax.spines[axis].set_linewidth(2.5)
ax.spines[axis].set_color('0.2')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Customize the ticks
for yticklabel in ax.get_yticklabels():
yticklabel
yticklabel.set_fontweight('bold')
yticklabel.set_fontsize(14)
yticklabel.set_color('0.2')
for xticklabel in ax.get_xticklabels():
xticklabel.set_fontweight('bold')
xticklabel.set_fontsize(14)
xticklabel.set_color('0.2')
xticklabel.set_rotation(90)
# Remove ticks line
ax.tick_params(axis='both', length=0)
# Customize the axis labels
ax.set_ylabel("Number of Features", fontsize=14, weight='bold', color='0.2')
ax.set_xlabel("Peak Percentage", fontsize=14, weight='bold', color='0.2')
# Creating legend and Customizing it
leg = plt.legend(loc='upper right', prop={"size": 14})
leg.set_frame_on(False)
for line in leg.get_lines():
line.set_linewidth(3.5)
for text in leg.get_texts():
text.set_text(" ".join(text.get_text().split("_")).capitalize())
text.set_weight('bold')
text.set_color('0.2')
# Add space between subplots
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("num_features_by_threshold.png"), bbox_inches='tight',
dpi=1200, transparent=True
)
# Show plot
plt.show()
def plot_num_features(
num_features: pd.DataFrame, percentage: float, save_path: Path
):
"""Plots and saves a bar chart comparing the number of features between chip
types in a certain percentage.
Args:
num_features_df (pd.DataFrame): dataframe containing the number of features
in each percentage threshold for each parser.
percentage (float): percentage to get number of features. (should be in the
already calculated percentages)
save_path (Path): Path object where the output image will be saved.
"""
#
num_features = num_features[num_features["Peak Percentage"] == percentage]
# Now we plot the bars for the mean and add error bars for the SEM.
_, ax = plt.subplots(1, 1, figsize=(10, 12))
# Create a bar plot using seaborn with custom error bars
ax = sns.barplot(
data=num_features, x="Type", y="Number of Features", errorbar=lambda x:
(x.mean() - x.sem(), x.mean() + x.sem()), capsize=0.15
)
sns.scatterplot(
data=num_features, x="Type", y="Number of Features", legend=False,
zorder=10, color='0.2', edgecolor='0.2', marker='s', s=70
)
# Customize the spines
for axis in ['bottom', 'left']:
ax.spines[axis].set_linewidth(2.5)
ax.spines[axis].set_color('0.2')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Customize ticks
plt.xticks(fontsize=14, fontweight='bold', color='0.2')
plt.xticks(rotation=45)
plt.yticks(fontsize=14, fontweight='bold', color='0.2')
ax.set_xticklabels([])
# Add legend to the plot with a title
legend_patches = [
Patch(color=color, label=label) for label, color in
zip(num_features['Type'].unique(), sns.color_palette())
]
leg = ax.legend(
loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, fancybox=True,
shadow=True, handles=legend_patches, prop={"size": 14}
)
leg.set_frame_on(False)
for line in leg.get_lines():
line.set_linewidth(3.5)
for text in leg.get_texts():
text.set_text(" ".join(text.get_text().split("_")).capitalize())
text.set_weight('bold')
text.set_color('0.2')
# Remove ticks line
ax.tick_params(axis='both', length=0)
# Set labels font size to 14
ax.set_ylabel('Number of Features', fontsize=14, weight='bold', color='0.2')
# Improve Aesthetics
plt.xlabel('')
plt.ylabel('Number of Features')
# Add space between subplots and show the plot
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("number_of_features_bar_graph.png"),
bbox_inches='tight', dpi=1200, transparent=True
)
# Display the plot
plt.show()
def plot_area_ratio(masks: Dict[str, np.ndarray], save_path: Path):
"""Plots and saves a bar chart comparing the ratio area of chip
types and tissue sections.
Args:
masks (Dict[str, np.ndarray]): A dictionary containing masks for each
sample.
save_path (Path): Path object where the output image will be saved.
"""
#
areas = []
for mask_name_1, mask_1 in masks.items():
for mask_name_2, mask_2 in masks.items():
mask_1_type = get_sample_type(mask_name_1)
mask_2_type = get_sample_type(mask_name_2)
if (mask_name_1[0]
== mask_name_2[0]) and (mask_1_type != "tissue_section"
) and (mask_2_type == "tissue_section"):
areas.append([mask_1_type, mask_1.sum() / mask_2.sum()])
areas = pd.DataFrame(areas, columns=["Type", "Area Ratio"])
# Now we plot the bars for the mean and add error bars for the SEM.
_, ax = plt.subplots(1, 1, figsize=(10, 10))
# Create a bar plot using seaborn with custom error bars
ax = sns.barplot(
data=areas, x="Type", y="Area Ratio", errorbar=lambda x:
(x.mean() - x.sem(), x.mean() + x.sem()), capsize=0.15
)
sns.scatterplot(
data=areas, x="Type", y="Area Ratio", legend=False, zorder=10,
color='0.2', edgecolor='0.2', marker='s', s=70
)
# Customize the spines
for axis in ['bottom', 'left']:
ax.spines[axis].set_linewidth(2.5)
ax.spines[axis].set_color('0.2')
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Customize ticks
plt.xticks(fontsize=14, fontweight='bold', color='0.2')
plt.xticks(rotation=45)
plt.yticks(fontsize=14, fontweight='bold', color='0.2')
ax.set_xticklabels([])
# Add legend to the plot with a title
legend_patches = [
Patch(color=color, label=label)
for label, color in zip(areas['Type'].unique(), sns.color_palette())
]
leg = ax.legend(
loc='upper center', bbox_to_anchor=(0.5, 1.05), ncol=2, fancybox=True,
shadow=True, handles=legend_patches, prop={"size": 14}
)
leg.set_frame_on(False)
for line in leg.get_lines():
line.set_linewidth(3.5)
for text in leg.get_texts():
text.set_text(" ".join(text.get_text().split("_")).capitalize())
text.set_weight('bold')
text.set_color('0.2')
# Remove ticks line
ax.tick_params(axis='both', length=0)
# Set labels font size to 14
ax.set_ylabel('Area Ratio', fontsize=14, weight='bold', color='0.2')
# Improve Aesthetics
plt.xlabel('')
plt.ylabel('Replica Area / Tissue Area')
# Add space between subplots and show the plot
plt.tight_layout()
# Save the plot
plt.savefig(
save_path.joinpath("area_ratio_bar_graph.png"), bbox_inches='tight',
dpi=1200, transparent=True
)
# Display the plot
plt.show()
def main():
"""Function containing main code"""
# Define current folder using this file
CWD = Path(os.path.dirname(os.path.abspath(__file__)))
# Define folder that contains the revision chip type dataset
CHIP_TYPES_PATH = Path(os.path.join(CWD, "..", "data", "CHIP_TYPES"))
# Define folder that contains raw data
RAW_DATA = CHIP_TYPES_PATH.joinpath("raw")
# Define folder to save processed data
PROCESSED_DATA = CHIP_TYPES_PATH.joinpath("processed")
# Define file that contains dhg metadata
METADATA_PATH = CHIP_TYPES_PATH.joinpath("metadata.csv")
# Define mass range start value
MZ_START = 50
# Define mass range end value
MZ_END = 1200
# Define mass resolution of the data
MASS_RESOLUTION = 0.025
# Define representative peaks
REPRESENTATIVE_PEAKS = [
611.5, 682.58, 736.64, 844.64, 860.63, 888.62, 600.49, 834.53
]
# Define random seed
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Read metadata csv
metadata_df = pd.read_csv(METADATA_PATH)
metadata_df["sample_number"] = metadata_df.sample_file_name.apply(
lambda s: s.split("_")[0]
)
# Loop over each ROI in data frame
for _, roi in metadata_df.iterrows():
# Define path to msi imzML file
msi_path = os.path.join(RAW_DATA, f"{roi.file_name}.imzML")
# Define path to new msi imzML file after processing
output_path = os.path.join(PROCESSED_DATA, f"{roi.sample_file_name}")
# Create output folder if doesn't exist
Path(output_path).mkdir(parents=True, exist_ok=True)
# Process msi
process(
msi_path, output_path, roi.x_min, roi.x_max, roi.y_min, roi.y_max,
MZ_START, MZ_END, MASS_RESOLUTION, REPRESENTATIVE_PEAKS
)
# Define path to save figures
PLOT_PATH = Path(CWD / "chip_types/")
# Create dirs
PLOT_PATH.mkdir(parents=True, exist_ok=True)
# Create dict of msi parsers and masks
parsers = {}
masks = {}
for folder in PROCESSED_DATA.iterdir():
name = folder.stem
parsers[name] = ImzMLParser(folder.joinpath("meaningful_signal.imzML"))
masks[name] = np.load(folder.joinpath("segmentation.npy"), mmap_mode='r')
# Plot figures
plot_msi_segmentation(metadata_df, masks, PLOT_PATH)
plot_spectras_corr(PROCESSED_DATA, PLOT_PATH)
num_features = num_features_df(parsers, masks, np.arange(0.05, 0.55, 0.05))
plot_num_features_thresholds(num_features, PLOT_PATH)
plot_num_features(num_features, 0.3, PLOT_PATH)
plot_area_ratio(masks, PLOT_PATH)
if __name__ == '__main__':
main()