experiments.py
from multiprocessing import Pool, Manager
from multiprocessing.managers import SyncManager
from typing import List, Literal, Tuple
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from deep_rl.environments.flappy_bird import FlappyBird
from tqdm import trange, tqdm
from src import TreeBasedAgent
def launch_multiple_experiments(
environment: FlappyBird,
n_experiments: int,
alpha: float = 0.0,
beta: float = 0.3,
min_tree_depth: int = 20,
heuristic: Literal["convex", "geometric", "exact"] = "convex",
max_steps: int = 1000,
gravity: float = 0.05,
force_push: float = 0.1,
vx: float = 0.05,
verbose: bool = False,
disable_progress_bar: bool = False,
) -> None:
"""
Launches multiple runs for a single set of settings to obtain averaged results.
"""
agent = TreeBasedAgent(
gravity,
force_push,
vx,
alpha=alpha,
beta=beta,
max_bars=-1,
heuristic=heuristic,
min_tree_depth=min_tree_depth,
)
environment.reset()
observation = environment.step(0)[0]
rewards = np.zeros(n_experiments)
n_steps = np.zeros(n_experiments)
for exp in trange(n_experiments, desc="Number of finished runs", position=1):
step, total_reward = 0, 0
for step in trange(
1,
max_steps + 1,
desc="Steps within an episode",
disable=disable_progress_bar,
position=2,
leave=False,
):
action = agent.sample_action(observation)
observation, reward, done = environment.step(action)
if verbose:
print(
f"action: {action}, reward: {reward}, observation: {str(observation)}"
)
total_reward += reward
if verbose:
print(f"Cumulated reward at step {step}: {total_reward:>3}.")
agent.print_outcomes_stats()
if done:
if verbose:
print(
f" Simulation ended after {step} steps for a total reward of {total_reward}."
)
break
rewards[exp] = total_reward
n_steps[exp] = step
print(
f"\rCurrent mean reward / number of steps: {rewards[:exp + 1].mean():.2f} / {n_steps[:exp + 1].mean():.2f}",
end="\r",
flush=True,
)
print(
f"\n\nReward over {n_experiments} experiments: {rewards.mean():.2f} +/- {1.96 * rewards.std():.2f} "
f"[{rewards.min(initial=max_steps)}, {rewards.max(initial=0)}]"
)
print(
f"Number of steps: {n_steps.mean():.2f} +/- {1.96 * n_steps.std():.2f} "
f"[{n_steps.min(initial=max_steps)}, {n_steps.max(initial=0)}]"
)
sns.set_theme()
# showing the score function computed as a matrix
if heuristic == "exact":
agent.visualize_score_function()
# noinspection PyArgumentEqualDefault
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    # roughly one bin per 10 units of range, with at least one bin
    ax1.hist(
        rewards,
        bins=((int(rewards.max(initial=0)) - int(rewards.min(initial=max_steps))) // 10 or 1),
        color="cornflowerblue",
        edgecolor="cornflowerblue",
    )
    ax2.hist(
        n_steps,
        bins=((int(n_steps.max(initial=0)) - int(n_steps.min(initial=max_steps))) // 10 or 1),
        color="cornflowerblue",
        edgecolor="cornflowerblue",
    )
ax1.set(xlabel="rewards", ylabel="number of occurrences")
ax2.set(xlabel="n_steps", ylabel="number of occurrences")
plt.show()
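# Illustrative usage of launch_multiple_experiments above.  The FlappyBird constructor
# call and the parameter values are assumptions, not taken from the original experiments:
#     env = FlappyBird()
#     launch_multiple_experiments(env, n_experiments=100, alpha=0.0, beta=0.3, max_steps=1000)
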
def parallel_experiment(
environment: FlappyBird,
agent: TreeBasedAgent,
n_experiments: int,
max_steps: int,
p_id: int,
lock_manager: SyncManager,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Launches an experiment. Allows for easy multiprocessing as it is a pure function.
"""
rewards = np.zeros(n_experiments)
n_steps = np.zeros(n_experiments)
with lock_manager:
global_progress_bar = tqdm(
desc=f"Total over all runs - alpha: {agent.alpha}, beta: {agent.beta}",
total=n_experiments,
position=2 * p_id,
leave=False,
)
inner_progress_bar = tqdm(
desc=f"Progress within run - alpha: {agent.alpha}, beta: {agent.beta}",
total=max_steps,
position=2 * p_id + 1,
leave=False,
)
for exp in range(n_experiments):
step, total_reward = 0, 0
observation = environment.reset()
for step in range(1, max_steps + 1):
action = agent.sample_action(observation)
observation, reward, done = environment.step(action)
total_reward += reward
if done:
break
with lock_manager:
inner_progress_bar.update()
with lock_manager:
inner_progress_bar.reset()
global_progress_bar.update()
rewards[exp] = total_reward
n_steps[exp] = step
with lock_manager:
global_progress_bar.close()
inner_progress_bar.close()
return rewards, n_steps
def launch_cross_validation(
alphas: List[float],
betas: List[float],
environment: FlappyBird,
n_experiments: int,
max_steps: int,
min_tree_depth: int = 20,
heuristic: Literal["convex", "geometric", "exact"] = "convex",
gravity: float = 0.05,
force_push: float = 0.1,
vx: float = 0.05,
n_processes: int = 8,
) -> Tuple[np.ndarray, np.ndarray]:
"""
Launches a series of experiment to cross-validate on the values of alpha and beta.
The experiments are multiprocessed with one process for each value of (alpha, beta), which is fit for the current
experiment that has 7 values of (alpha, beta) for 8 processes.
Be careful with the number of processes, if you use one less process than there are combinations of (alpha, beta)
it will take twice the time (the same applies to one less process than a divisor of the number of combinations).
"""
lock, process_id = Manager().Lock(), 0
all_rewards = np.zeros((len(alphas), len(betas), n_experiments))
all_n_steps = np.zeros((len(alphas), len(betas), n_experiments))
async_results = []
with Pool(processes=n_processes) as pool:
for alpha_i, alpha_val in enumerate(alphas):
for beta_i, beta_val in enumerate(betas):
# keeping only one computation for null alpha
if alpha_val > 0 or beta_i == 0:
# one big process for each (alpha, beta) instead of n_alpha x n_beta x args.n_experiments
alpha_beta_result = pool.apply_async(
parallel_experiment,
args=(
environment,
TreeBasedAgent(
gravity,
force_push,
vx,
max_bars=-1,
alpha=alpha_val,
beta=beta_val,
heuristic=heuristic,
min_tree_depth=min_tree_depth,
),
n_experiments,
max_steps,
process_id,
lock,
),
)
async_results.append((alpha_i, beta_i, alpha_beta_result))
process_id += 1
for alpha_i, beta_i, res in async_results:
rewards, n_steps = res.get(timeout=86400)
all_rewards[alpha_i, beta_i, :] = rewards
all_n_steps[alpha_i, beta_i, :] = n_steps
# copying the values for the experiments we skipped
for alpha_i, alpha_val in enumerate(alphas):
for beta_i, beta_val in enumerate(betas):
if alpha_val == 0 and beta_i > 0:
all_rewards[alpha_i, beta_i, :] = all_rewards[alpha_i, 0, :]
all_n_steps[alpha_i, beta_i, :] = all_n_steps[alpha_i, 0, :]
return all_rewards, all_n_steps
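# Illustrative helper, not part of the original code: it counts how many (alpha, beta)
# tasks launch_cross_validation actually submits, given that a null alpha is only
# computed once (the `alpha_val > 0 or beta_i == 0` filter above).  Choosing
# n_processes greater than or equal to this value avoids the slowdown mentioned in the
# docstring; for instance, a hypothetical grid of 3 alphas (one of them 0) and 3 betas
# yields 7 tasks, matching the "7 combinations for 8 processes" example.
def count_launched_combinations(alphas: List[float], betas: List[float]) -> int:
    launched = 0
    for alpha_val in alphas:
        for beta_i, _ in enumerate(betas):
            # same condition as in launch_cross_validation: a null alpha is launched
            # only for the first beta
            if alpha_val > 0 or beta_i == 0:
                launched += 1
    return launched
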
def launch_concurrent_runs(
environment: FlappyBird,
first_agent,
second_agent,
n_experiments: int,
max_steps: int,
) -> float:
"""
Launches a series of concurrent runs, where two agents compete against one another in order to identify the
strengths and weaknesses of each agent through an analysis of the situations where one of the agent fails and the
other one succeeds.
One agent should be using the environment, and a parallel environment that has the same bars as the first one but
whose position and velocity is controlled by the actions of the second agent. This can be done by reproducing the
dynamic of the problem (see TreeBasedAgent._build_tree for the dynamic and TreeBasedAgent._is_bird_crashing for the
condition of a crash).
TODO: implement concurrent runs.
"""
    raise NotImplementedError("Concurrent runs are not implemented yet.")
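# A minimal sketch of the concurrent-run loop described in the docstring of
# launch_concurrent_runs.  It assumes that copy.deepcopy(environment) yields an
# independent environment with the same bars as the original, and that
# environment.reset() returns the initial observation (as in parallel_experiment);
# both are assumptions about the FlappyBird API.  This illustrates the intended
# approach and is not the implementation announced by the TODO above.
def concurrent_runs_sketch(
    environment: FlappyBird,
    first_agent,
    second_agent,
    n_experiments: int,
    max_steps: int,
) -> float:
    import copy

    first_agent_lasted_longer = 0
    for _ in range(n_experiments):
        first_obs = environment.reset()
        # duplicate the environment right after the reset so both agents face the
        # same bars, then let each agent control its own copy
        second_env = copy.deepcopy(environment)
        second_obs = first_obs
        first_steps, second_steps = 0, 0
        first_done, second_done = False, False
        for step in range(1, max_steps + 1):
            if not first_done:
                first_obs, _, first_done = environment.step(first_agent.sample_action(first_obs))
                first_steps = step
            if not second_done:
                second_obs, _, second_done = second_env.step(second_agent.sample_action(second_obs))
                second_steps = step
            if first_done and second_done:
                break
        # the situations worth analysing are those where only one agent crashes;
        # here we simply report how often the first agent survives longer
        if first_steps > second_steps:
            first_agent_lasted_longer += 1
    return first_agent_lasted_longer / n_experiments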