-
Notifications
You must be signed in to change notification settings - Fork 0
/
Bootstraping.py
161 lines (142 loc) · 8.26 KB
/
Bootstraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
MAIN_DATA_DIRECTORY_PATH = '/Users/yishaiazabary/PycharmProjects/MucsleCellFusionSimulation/Data'
class BootstrappingPerExperiment:
def __init__(self, exp_prefix, type_of_simulation, col_to_calc_by, number_of_replications, frame_num):
self.exp_prefix = exp_prefix
self.type_of_simulation = type_of_simulation
self.col_to_calc_by = col_to_calc_by
type_of_simulation = '{0}_{1}'.format(int(number_of_replications), type_of_simulation)
self.path_to_simulations_directory = os.sep.join([MAIN_DATA_DIRECTORY_PATH, exp_prefix, type_of_simulation])
self.number_of_replications = number_of_replications
self.frame_num = frame_num
self.path_to_true_exp_file = os.sep.join([MAIN_DATA_DIRECTORY_PATH, exp_prefix, ''.join([exp_prefix, '_processed.csv'])])
self.all_simulations_data_points = np.ndarray((self.number_of_replications, self.frame_num))
self.simulations_dist_averages = np.ndarray(self.frame_num)
self.p_values_per_frame = np.zeros(frame_num)
entire_exp_df = pd.read_csv(self.path_to_true_exp_file)
# sum_of_totals = entire_exp_df.loc[:, 'CTRn=1'].values + \
# entire_exp_df.loc[:, 'CTRn=2'].values + \
# entire_exp_df.loc[:, 'CTRn=3'].values + \
# entire_exp_df.loc[:, 'CTRn=4'].values
self.initial_number_of_nuclei = np.max(entire_exp_df.loc[:, 'CTRn=1'].values)
self.exp_data_points_normalized = entire_exp_df.loc[:, self.col_to_calc_by].values / self.initial_number_of_nuclei
def read_simulations_col_data_points(self):
files = (file for file in os.listdir(self.path_to_simulations_directory)
if os.path.isfile(os.path.join(self.path_to_simulations_directory, file)))
idx = 0
for filename in files:
if filename.endswith('.csv'):
path_to_single_file = os.sep.join([self.path_to_simulations_directory, filename])
entire_df = pd.read_csv(path_to_single_file)
single_file_col_data = entire_df.loc[:, self.col_to_calc_by].values
# sum_of_totals = entire_df.loc[:, 'CTRn=1'].values + \
# entire_df.loc[:, 'CTRn=2'].values + \
# entire_df.loc[:, 'CTRn=3'].values + \
# entire_df.loc[:, 'CTRn=4'].values
single_file_data_normalized = single_file_col_data / self.initial_number_of_nuclei
self.all_simulations_data_points[idx] = single_file_data_normalized
idx += 1
def calc_p_values_per_frame(self):
for frame_idx, exp_frame_dist in enumerate(self.exp_data_points_normalized):
frame_data_from_simulations = self.all_simulations_data_points[:, frame_idx]
self.simulations_dist_averages[frame_idx] = np.average(frame_data_from_simulations)
num_of_equal_or_above = len(frame_data_from_simulations[frame_data_from_simulations >= exp_frame_dist])
self.p_values_per_frame[frame_idx] = 0.001 if num_of_equal_or_above / self.number_of_replications <= 0.001 else num_of_equal_or_above / self.number_of_replications
def plot_col_data_with_p_values(self, save_csv: bool = True):
fig, ax = plt.subplots()
lines_and_legends = {}
# plot simulation dist averages
coding = 'gx'
sim_data = self.simulations_dist_averages
ax.plot(sim_data, coding)
lines_and_legends['{} Simulation Distribution Averages'.format(self.type_of_simulation.capitalize())] = coding
# plot true exp dist
coding = 'r+'
exp_data = self.exp_data_points_normalized
ax.plot(exp_data, coding)
lines_and_legends['True Experiment Distribution'] = coding
# plot p values
coding = 'b*'
pval = self.p_values_per_frame
ax.plot(pval, coding)
lines_and_legends['P-Values<='] = coding
ax.set_title('EXP:{0}\nBootstrap P-Values of Simulation type:{1}\nReplications:{2}'.format(self.exp_prefix, self.type_of_simulation, self.number_of_replications))
ax.set_ylabel('fraction')
ax.set_xlabel('time (h)')
artist_patches = []
for leg_label, coding in lines_and_legends.items():
artist_patches.append(mlines.Line2D([], [],
color=coding[:1],
marker=coding[1:],
label=leg_label))
# annotate p_values
for idx, p_value in enumerate(self.p_values_per_frame):
text = '{:.3f}≥'.format(p_value) if p_value != 1 else str(1)
ax.annotate(text, xy=(idx, p_value), xytext=(idx+0.05, p_value), fontsize=6, rotation=40)
ax.legend(handles=artist_patches, loc=6)
plt.show()
# path_to_plot = os.sep.join([MAIN_DATA_DIRECTORY_PATH, self.exp_prefix, 'plots', '{0}_{1}_{2}.png'.format(self.exp_prefix, self.type_of_simulation, 'pvalues')])
# fig.savefig(path_to_plot, dpi=300)
# save to csv
if save_csv:
df = pd.DataFrame({
'Frame Number': ['#{}'.format(x) for x in range(len(exp_data))],
'Fraction of N≥4 in Experiment': exp_data,
'Average Fraction of N≥4 in simulations': self.simulations_dist_averages,
'P-Values': self.p_values_per_frame
})
df.to_csv(os.sep.join([MAIN_DATA_DIRECTORY_PATH, self.exp_prefix, 'Summarized exp: {0}, type:{1}.csv'.format(self.exp_prefix, self.type_of_simulation, 'pvalues')]))
def plot_experiments_pvalues(exp_prefixes):
types = ['randoms', 'weighted']
coding = ['g+', 'rx', 'b*', 'c,', 'y>', 'm<', 'k^']
for art_idx, sim_type in enumerate(types):
# read p-values file
fig, ax = plt.subplots()
lines_and_legends = {}
for exp_idx, exp_prefix in enumerate(exp_prefixes):
path_to_np_file = os.sep.join([MAIN_DATA_DIRECTORY_PATH, exp_prefix, '{0}_{1}_pvals.npy'.format(exp_prefix, sim_type)])
p_vals = np.load(path_to_np_file)
ax.plot(p_vals, coding[exp_idx], alpha=0.5)
lines_and_legends['Exp:{}'.format(exp_prefix)] = coding[exp_idx]
ax.set_title('Bootstrap P-Values Comparison\nof Simulation Type:{0}\n'.format(sim_type))
ax.set_ylabel('p-value')
ax.set_xlabel('time (h)')
art_idx = 0
artist_patches = []
for leg_label, ax_coding in lines_and_legends.items():
artist_patches.append(mlines.Line2D([], [],
color=ax_coding[:1],
marker=ax_coding[1:],
label=leg_label))
art_idx += 1
ax.legend(handles=artist_patches, loc=6)
plt.show()
path_to_plot = os.sep.join([MAIN_DATA_DIRECTORY_PATH, '{0}_{1}.png'.format(sim_type, 'pvalues')])
fig.savefig(path_to_plot, dpi=300)
# plot_experiments_pvalues(['200203_S24', '200604_S06', '200604_S09'])
for exp_prefix in ['200203_S17', '200203_S19', '200203_S22', '200203_S24', '200604_S06', '200604_S09']:
# randoms
type_of_simulation = 'randoms'
col_to_calc_by = 'CTR4total'
number_of_replications = 1000
frame_num = 16
bspe = BootstrappingPerExperiment(exp_prefix, type_of_simulation, col_to_calc_by, number_of_replications, frame_num)
bspe.read_simulations_col_data_points()
bspe.calc_p_values_per_frame()
bspe.plot_col_data_with_p_values()
# save p-values as numpy array
np.save(os.sep.join([MAIN_DATA_DIRECTORY_PATH, exp_prefix, '{0}_{1}_pvals'.format(exp_prefix, type_of_simulation)]),
bspe.p_values_per_frame)
# weighted
type_of_simulation = 'weighted'
bspe = BootstrappingPerExperiment(exp_prefix, type_of_simulation, col_to_calc_by, number_of_replications, frame_num)
bspe.read_simulations_col_data_points()
bspe.calc_p_values_per_frame()
bspe.plot_col_data_with_p_values()
# save p-values as numpy array
np.save(os.sep.join([MAIN_DATA_DIRECTORY_PATH, exp_prefix, '{0}_{1}_pvals'.format(exp_prefix, type_of_simulation)]),
bspe.p_values_per_frame)