import numpy as np
import torch
import torch.nn as nn
from torch import optim
from replay_buffer import Batch


def get_net(
        num_in: int,
        num_out: int,
        final_activation,  # e.g. nn.Tanh() (a module instance), or None for no final activation
        num_hidden_layers: int = 6,
        num_neurons_per_hidden_layer: int = 64
) -> nn.Sequential:
    layers = []
    layers.extend([
        nn.Linear(num_in, num_neurons_per_hidden_layer),
        nn.ReLU(),
    ])
    for _ in range(num_hidden_layers):
        layers.extend([
            nn.Linear(num_neurons_per_hidden_layer, num_neurons_per_hidden_layer),
            nn.ReLU(),
        ])
    layers.append(nn.Linear(num_neurons_per_hidden_layer, num_out))
    if final_activation is not None:
        layers.append(final_activation)
    return nn.Sequential(*layers)
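
# A quick shape sanity check (illustrative sketch, not part of the original module):
# get_net(num_in=3, num_out=1, final_activation=None) builds the MLP
#   Linear(3, 64) -> ReLU -> [Linear(64, 64) -> ReLU] x 6 -> Linear(64, 1)
# which can be verified with:
#   net = get_net(num_in=3, num_out=1, final_activation=None)
#   assert net(torch.zeros(5, 3)).shape == (5, 1)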


class ParamsPool:

    """
    Description: ParamsPool stands for "parameter pool"; the name comes from the fact that
    everything in this class depends heavily on a large number of parameters. It also provides
    methods for updating those parameters in the face of new data.

    Caution: This class can deal with multi-dimensional actions, but each action component has
    to lie within [-1, 1]. To ensure this in gym, you can use wrappers. A list of official
    wrappers is available at https://github.com/openai/gym/tree/master/gym/wrappers; pay
    particular attention to rescale_action.py. If you are coming from the future, note that
    this link might be deprecated.

    Exposed arguments:
        input_dim (int): dimension of the state (the q networks take input_dim + action_dim inputs)
        action_dim (int): dimension of the actions (the output of the q maximizing network)
        noise_var (float): scale of exploration noise added to greedy actions
        noise_var_multiplier (float): multiplicative decay factor applied by decay_noise_var()
        polyak (float): polyak-averaging coefficient; interpreted as the proportion of target parameters to keep

    Un-exposed arguments (that you might want to play with):
        learning rates
        number of layers and neurons in each layer
        exploration noise decay schedule
    """
    def __init__(self,
                 input_dim: int,
                 action_dim: int,
                 gamma: float = 0.95,
                 noise_var: float = 0.1,
                 noise_var_multiplier: float = 0.95,
                 noise_var_min: float = 0,
                 polyak: float = 0.90
                 ):

        # ===== networks =====
        # q_prediction_net: (s, a_) --- network --> scalar
        # q_target_net    : (s, a_) --- network --> scalar
        # q_maximizing_net: s --- network --> a_ (in (-1, 1) due to the final tanh; rescale outside if the env's action range differs)
        self.q_prediction_net = get_net(num_in=input_dim + action_dim, num_out=1, final_activation=None)
        self.q_target_net     = get_net(num_in=input_dim + action_dim, num_out=1, final_activation=None)
        self.q_maximizing_net = get_net(num_in=input_dim, num_out=action_dim, final_activation=nn.Tanh())

        # eval() only changes the behavior of layers like dropout and batch-norm;
        # gradients are actually kept out of this network via TARGETS.detach() below
        self.q_target_net.eval()
        self.q_target_net.load_state_dict(self.q_prediction_net.state_dict())

        # ===== optimizers =====
        # ref: https://pytorch.org/docs/stable/optim.html
        self.q_prediction_net_optimizer = optim.Adam(self.q_prediction_net.parameters(), lr=1e-3)
        self.q_maximizing_net_optimizer = optim.Adam(self.q_maximizing_net.parameters(), lr=1e-3)

        # ===== hyper-parameters =====
        # for discounting
        self.gamma = gamma

        # for exploration during training
        self.action_dim = action_dim
        self.noise_var = noise_var
        self.noise_var_multiplier = noise_var_multiplier
        self.noise_var_min = noise_var_min

        # for updating the q target network
        self.polyak = polyak
    def update_q_prediction_net_and_q_maximizing_net(self, batch: Batch) -> tuple:

        # ==================================================
        # bellman equation loss (just like Q-learning)
        # ==================================================

        PREDICTIONS = self.q_prediction_net(torch.cat([batch.s, batch.a], dim=1))
        q_maximizing_a_prime = self.q_maximizing_net(batch.s_prime)

        # oh my, the bug in the following line took me about 2 days to find:
        # if batch.mask has shape (64,) and the tensor it multiplies has shape (64, 1),
        # the result is a (64, 64) tensor, and this does not even raise an error!!!
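        # A concrete illustration of the pitfall (added sketch, not original code):
        #   torch.ones(64) * torch.ones(64, 1)  # silently broadcasts to shape (64, 64)
        # so batch.r and batch.mask are assumed to arrive with shape (batch_size, 1).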
        TARGETS = batch.r + \
                  self.gamma * self.q_target_net(torch.cat([batch.s_prime, q_maximizing_a_prime], dim=1)) * batch.mask
        Q_LEARNING_LOSS = torch.mean((PREDICTIONS - TARGETS.detach()) ** 2)

        # ==================================================
        # policy loss (not present in Q-learning)
        # ==================================================

        q_maximizing_a = self.q_maximizing_net(batch.s)
        Q_VALUES = self.q_prediction_net(torch.cat([batch.s, q_maximizing_a], dim=1))
        ACTOR_LOSS = - torch.mean(Q_VALUES)  # minimizing this loss is maximizing the q values

        # ==================================================
        # backpropagation and gradient descent
        # ==================================================

        self.q_maximizing_net_optimizer.zero_grad()
        ACTOR_LOSS.backward()  # inconveniently, this also back-props into the prediction net, but (see next line)
        self.q_prediction_net_optimizer.zero_grad()  # clear the prediction net's gradient accumulated by ACTOR_LOSS.backward()
        Q_LEARNING_LOSS.backward()

        # clipping gradients to [-1, 1] has a stabilizing effect in the spirit of the
        # Huber loss (though clipping parameter gradients is not strictly equivalent),
        # so there is little harm in using it
        for param in self.q_prediction_net.parameters():
            param.grad.data.clamp_(-1, 1)
        for param in self.q_maximizing_net.parameters():
            param.grad.data.clamp_(-1, 1)

        self.q_prediction_net_optimizer.step()
        self.q_maximizing_net_optimizer.step()

        return float(Q_LEARNING_LOSS), float(ACTOR_LOSS)
    def update_q_target_net(self) -> None:
        for target_param, param in zip(self.q_target_net.parameters(), self.q_prediction_net.parameters()):
            target_param.data.copy_(target_param.data * self.polyak + param.data * (1 - self.polyak))
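
    # Illustrative arithmetic (added comment, not original code): with polyak=0.90,
    # each call performs, per weight,  target <- 0.90 * target + 0.10 * online,
    # i.e. the target network keeps 90% of its old values and absorbs 10% of the new ones.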
    def act(self, state: np.ndarray, noisy: bool) -> np.ndarray:
        """
        For training (exploration): call with noisy=True
        For testing (greedy): call with noisy=False
        """
        state = torch.tensor(state).unsqueeze(0).float()
        greedy_action = self.q_maximizing_net(state).detach().numpy()[0]
        # index with [0] instead of squeeze() because squeeze would drop ALL singleton
        # dimensions, and for a 1-d action we still need the action-dimension axis
        if noisy:
            return np.clip(greedy_action + self.noise_var * np.random.randn(self.action_dim), -1.0, 1.0)
        else:
            return greedy_action

    def decay_noise_var(self) -> None:
        if self.noise_var > self.noise_var_min:
            self.noise_var *= self.noise_var_multiplier
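

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch of the training interface above. This is an added
# illustration, not part of the original module: it assumes Batch is a simple
# container (e.g. a namedtuple) accepting keyword fields s, a, r, s_prime, mask
# with shapes (B, obs_dim), (B, act_dim), (B, 1), (B, obs_dim), (B, 1), which
# matches how those fields are used in
# update_q_prediction_net_and_q_maximizing_net().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    obs_dim, act_dim, batch_size = 3, 2, 64
    pool = ParamsPool(input_dim=obs_dim, action_dim=act_dim)

    # act greedily and with exploration noise on a dummy state
    dummy_state = np.zeros(obs_dim, dtype=np.float32)
    print("greedy action:", pool.act(dummy_state, noisy=False))
    print("noisy action: ", pool.act(dummy_state, noisy=True))

    # one gradient step on a batch of random transitions (hypothetical Batch fields)
    fake_batch = Batch(
        s=torch.randn(batch_size, obs_dim),
        a=torch.rand(batch_size, act_dim) * 2 - 1,
        r=torch.randn(batch_size, 1),
        s_prime=torch.randn(batch_size, obs_dim),
        mask=torch.ones(batch_size, 1),
    )
    q_loss, actor_loss = pool.update_q_prediction_net_and_q_maximizing_net(fake_batch)
    pool.update_q_target_net()
    pool.decay_noise_var()
    print("q-learning loss:", q_loss, "| actor loss:", actor_loss)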