RecurrentPPO: 9x speedup - whole sequence batching #118

Draft
wants to merge 28 commits into base: master

Commits (28)
6467c79
added whole sequence batching functionality to PPORecurrent
b-vm Nov 12, 2022
ff8cb9d
added masking and fixed some bugs
b-vm Nov 14, 2022
cfc0f70
implemented for non dict obs + fixed bugs + added basic script showin…
b-vm Nov 23, 2022
8b60954
bug fix episode starts after first update
b-vm Nov 28, 2022
59f4a7a
updated testing script
b-vm Nov 28, 2022
9196049
fixed NaNs due to supersmall batch sizes occurring in edge case by dro…
b-vm Nov 28, 2022
a74be36
Merge branch 'master' into sequence_batching
araffin Nov 28, 2022
0f5a0f7
Merge branch 'master' into sequence_batching
araffin Dec 16, 2022
f445d4d
Merge branch 'master' of https://github.com/Stable-Baselines-Team/sta…
b-vm Jan 8, 2023
b83d5f8
Merge branch 'Stable-Baselines-Team-master' into sequence_batching
b-vm Jan 8, 2023
d3af84d
refactoring, made code simpler.
b-vm Jan 8, 2023
18ace01
improved indexing to sample all sequences
b-vm Jan 8, 2023
de092ba
integrated whole sequence train function with existing
b-vm Mar 1, 2023
d66bcaa
Merge pull request #2 from b-vm/master
b-vm Mar 1, 2023
5dc8bc9
improvement of instance checking
b-vm Mar 1, 2023
9a268be
Merge branch 'master' into sequence_batching
araffin Apr 3, 2023
e543b31
Reformat and simplify
araffin Apr 3, 2023
194f758
bug fix flatten extractor instance check
b-vm Apr 12, 2023
8f18c9c
simplified if statement
b-vm Apr 12, 2023
5e6f371
Merge branch 'master' into sequence_batching
araffin Apr 27, 2023
ad0d3ed
Update comment
araffin Apr 27, 2023
b43e9b5
Re-add drop last, was causing NaN
araffin Apr 27, 2023
ef37cc7
Fix NaN
araffin Apr 27, 2023
a2382c8
fixed bug: append correct index, and only once.
b-vm May 29, 2023
2837be5
Merge pull request #3 from Stable-Baselines-Team/master
b-vm Jun 29, 2023
79fbd3f
Merge branch 'master' into sequence_batching
araffin Sep 3, 2023
7edb731
Merge branch 'master' into sequence_batching
araffin Oct 6, 2023
9898e27
Merge branch 'master' into sequence_batching
araffin Jan 10, 2024
2 changes: 2 additions & 0 deletions .gitignore
@@ -47,3 +47,5 @@ src
*.prof

MUJOCO_LOG.TXT

temp/
188 changes: 187 additions & 1 deletion sb3_contrib/common/recurrent/buffers.py
@@ -1,15 +1,19 @@
from functools import partial
from typing import Callable, Generator, Optional, Tuple, Union
from typing import Callable, Generator, List, Optional, Tuple, Union

import numpy as np
import torch as th
from gymnasium import spaces
from stable_baselines3.common.buffers import DictRolloutBuffer, RolloutBuffer
from stable_baselines3.common.vec_env import VecNormalize
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler

from sb3_contrib.common.recurrent.type_aliases import (
    RecurrentDictRolloutBufferSamples,
    RecurrentDictRolloutBufferSequenceSamples,
    RecurrentRolloutBufferSamples,
    RecurrentRolloutBufferSequenceSamples,
    RNNStates,
)

@@ -94,6 +98,30 @@ def create_sequencers(
    return seq_start_indices, local_pad, local_pad_and_flatten


def create_sequence_slicer(
    episode_start_indices: np.ndarray, device: Union[th.device, str]
) -> Callable[[np.ndarray, List[int]], th.Tensor]:
    def create_sequence_minibatch(tensor: np.ndarray, seq_indices: List[int]) -> th.Tensor:
        """
        Create a minibatch of whole sequences.

        :param tensor: Tensor that will be sliced (e.g. observations, rewards)
        :param seq_indices: Sequences to be used.
        :return: (max_sequence_length, batch_size=n_seq, features_size)
        """
        return pad_sequence(
            [
                th.tensor(
                    tensor[episode_start_indices[i] : episode_start_indices[i + 1]],
                    device=device,
                )
                for i in seq_indices
            ]
        )

    return create_sequence_minibatch


class RecurrentRolloutBuffer(RolloutBuffer):
"""
Rollout buffer that also stores the LSTM cell and hidden states.
Expand Down Expand Up @@ -382,3 +410,161 @@ def _get_samples(
episode_starts=self.pad_and_flatten(self.episode_starts[batch_inds]),
mask=self.pad_and_flatten(np.ones_like(self.returns[batch_inds])),
)


class RecurrentSequenceRolloutBuffer(RecurrentRolloutBuffer):
"""
Sequence Rollout buffer used in on-policy algorithms like A2C/PPO.
Overrides the RecurrentRolloutBuffer to yield 3d batches of whole sequences

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param hidden_state_shape: Shape of the buffer that will collect lstm states
:param device: PyTorch device
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
Equivalent to classic advantage when set to 1.
:param gamma: Discount factor
:param n_envs: Number of parallel environments
"""

    def __init__(
        self,
        buffer_size: int,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        hidden_state_shape: Tuple[int, int, int, int],
        device: Union[th.device, str] = "auto",
        gae_lambda: float = 1,
        gamma: float = 0.99,
        n_envs: int = 1,
    ):
        self.hidden_state_shape = hidden_state_shape
        self.seq_start_indices, self.seq_end_indices = None, None
        super().__init__(
            buffer_size, observation_space, action_space, hidden_state_shape, device, gae_lambda, gamma, n_envs=n_envs
        )

    def get(self, batch_size: Optional[int] = None) -> Generator[RecurrentRolloutBufferSequenceSamples, None, None]:
        assert self.full, "Rollout buffer must be full before sampling from it"
        # Prepare the data
        if not self.generator_ready:
            self.episode_starts[0, :] = 1
            for tensor in [
                "observations",
                "actions",
                "values",
                "log_probs",
                "advantages",
                "returns",
                "episode_starts",
            ]:
                self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor])

            self.episode_start_indices = np.where(self.episode_starts == 1)[0]
            self.generator_ready = True

        random_indices = SubsetRandomSampler(range(len(self.episode_start_indices)))
        # Do not drop last batch so we are sure we sample at least one sequence
        # TODO: allow to change that parameter
        batch_sampler = BatchSampler(random_indices, batch_size, drop_last=False)
        # add a dummy index to make the code below simpler
        episode_start_indices = np.concatenate([self.episode_start_indices, np.array([len(self.episode_starts)])])

        create_minibatch = create_sequence_slicer(episode_start_indices, self.device)

        # yields batches of whole sequences, shape: (max_sequence_length, batch_size=n_seq, features_size)
        for indices in batch_sampler:
            returns_batch = create_minibatch(self.returns, indices)
            masks_batch = pad_sequence([th.ones_like(returns) for returns in th.swapaxes(returns_batch, 0, 1)])

            yield RecurrentRolloutBufferSequenceSamples(
                observations=create_minibatch(self.observations, indices),
                actions=create_minibatch(self.actions, indices),
                old_values=create_minibatch(self.values, indices),
                old_log_prob=create_minibatch(self.log_probs, indices),
                advantages=create_minibatch(self.advantages, indices),
                returns=returns_batch,
                mask=masks_batch,
            )


class RecurrentSequenceDictRolloutBuffer(RecurrentDictRolloutBuffer):
"""
Sequence Dict Rollout buffer used in on-policy algorithms like A2C/PPO.
Overrides the DictRecurrentRolloutBuffer to yield 3d batches of whole sequences

:param buffer_size: Max number of element in the buffer
:param observation_space: Observation space
:param action_space: Action space
:param hidden_state_shape: Shape of the buffer that will collect lstm states
:param device: PyTorch device
:param gae_lambda: Factor for trade-off of bias vs variance for Generalized Advantage Estimator
Equivalent to classic advantage when set to 1.
:param gamma: Discount factor
:param n_envs: Number of parallel environments
"""

    def __init__(
        self,
        buffer_size: int,
        observation_space: spaces.Space,
        action_space: spaces.Space,
        hidden_state_shape: Tuple[int, int, int, int],
        device: Union[th.device, str] = "auto",
        gae_lambda: float = 1,
        gamma: float = 0.99,
        n_envs: int = 1,
    ):
        self.hidden_state_shape = hidden_state_shape
        self.seq_start_indices, self.seq_end_indices = None, None
        super().__init__(
            buffer_size, observation_space, action_space, hidden_state_shape, device, gae_lambda, gamma, n_envs=n_envs
        )

    def get(self, batch_size: Optional[int] = None) -> Generator[RecurrentDictRolloutBufferSequenceSamples, None, None]:
        assert self.full, "Rollout buffer must be full before sampling from it"
        # Prepare the data
        if not self.generator_ready:
            self.episode_starts[0, :] = 1
            for key, obs in self.observations.items():
                self.observations[key] = self.swap_and_flatten(obs)

            for tensor in [
                "actions",
                "values",
                "log_probs",
                "advantages",
                "returns",
                "episode_starts",
            ]:
                self.__dict__[tensor] = self.swap_and_flatten(self.__dict__[tensor])

            self.episode_start_indices = np.where(self.episode_starts == 1)[0]
            self.generator_ready = True

        random_indices = SubsetRandomSampler(range(len(self.episode_start_indices)))
        # drop last batch to prevent extremely small batches causing spurious updates
        batch_sampler = BatchSampler(random_indices, batch_size, drop_last=True)
        # add a dummy index to make the code below simpler
        episode_start_indices = np.concatenate([self.episode_start_indices, np.array([len(self.episode_starts)])])

        create_minibatch = create_sequence_slicer(episode_start_indices, self.device)

        # yields batches of whole sequences, shape: (sequence_length, batch_size=n_seq, features_size)
        for indices in batch_sampler:
            obs_batch = {}
            for key in self.observations:
                obs_batch[key] = create_minibatch(self.observations[key], indices)
            returns_batch = create_minibatch(self.returns, indices)
            masks_batch = pad_sequence([th.ones_like(returns) for returns in th.swapaxes(returns_batch, 0, 1)])

            yield RecurrentDictRolloutBufferSequenceSamples(
                observations=obs_batch,
                actions=create_minibatch(self.actions, indices),
                old_values=create_minibatch(self.values, indices),
                old_log_prob=create_minibatch(self.log_probs, indices),
                advantages=create_minibatch(self.advantages, indices),
                returns=returns_batch,
                mask=masks_batch,
            )
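Note on the data layout above: the comments in `get()` describe minibatches of shape `(max_sequence_length, batch_size=n_seq, features_size)`. The minimal sketch below (not part of this PR; the rollout values and episode boundaries are made up) shows how `pad_sequence` produces that layout from variable-length episode slices, in the same way `create_sequence_slicer` does:

```python
# Illustration only: mimics one minibatch built by create_sequence_slicer.
import numpy as np
import torch as th
from torch.nn.utils.rnn import pad_sequence

# Flattened rollout of 10 steps, with episodes starting at steps 0, 3 and 7
returns = np.arange(10, dtype=np.float32).reshape(10, 1)
episode_start_indices = np.array([0, 3, 7])
# Dummy index appended so that episode i spans [start[i], start[i + 1])
episode_start_indices = np.concatenate([episode_start_indices, np.array([len(returns)])])

seq_indices = [0, 1]  # sample the first episode (3 steps) and the second (4 steps)
batch = pad_sequence(
    [
        th.tensor(returns[episode_start_indices[i] : episode_start_indices[i + 1]])
        for i in seq_indices
    ]
)
# The shorter sequence is zero-padded at the end:
print(batch.shape)  # torch.Size([4, 2, 1]) == (max_sequence_length, n_seq, features_size)
```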
48 changes: 48 additions & 0 deletions sb3_contrib/common/recurrent/policies.py
@@ -344,6 +344,54 @@ def evaluate_actions(
        values = self.value_net(latent_vf)
        return values, log_prob, distribution.entropy()

    def evaluate_actions_whole_sequence(
        self,
        obs: th.Tensor,
        actions: th.Tensor,
    ) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
        """
        Evaluate actions of batches of whole sequences according to the current policy,
        given the observations.

        :param obs: Observations, as a padded batch of whole sequences
            of shape (max_sequence_length, n_seq, features_size).
        :param actions: Actions whose log likelihood and entropy will be evaluated.
        :return: estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        # Preprocess the observation if needed

        # temporary fix to disable the flattening that stable_baselines3 feature extractors do by default
        # flattening would turn the sequences in the batch into one long sequence without properly resetting the lstm hidden states
        if self.features_extractor_class == FlattenExtractor:
            features = obs
        else:
            features = self.extract_features(obs)
        latent_pi, _ = self.lstm_actor(features)

        if self.lstm_critic is not None:
            latent_vf, _ = self.lstm_critic(features)
        elif self.shared_lstm:
            latent_vf = latent_pi.detach()
        else:
            latent_vf = self.critic(features)

        latent_pi = self.mlp_extractor.forward_actor(latent_pi)
        latent_vf = self.mlp_extractor.forward_critic(latent_vf)

        values = self.value_net(latent_vf)

        distribution = self._get_action_dist_from_latent(latent_pi)
        log_prob = distribution.distribution.log_prob(actions).sum(dim=-1)
        log_prob = log_prob.reshape((*log_prob.shape, 1))

        entropy = distribution.distribution.entropy().sum(dim=-1)
        entropy = entropy.reshape((*entropy.shape, 1))

        return values, log_prob, entropy

    def _predict(
        self,
        observation: th.Tensor,
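To sketch how these whole-sequence evaluations could be consumed during the PPO update, here is a hedged example of a masked, clipped policy loss. The helper name `masked_policy_loss`, the `policy`/`rollout_data`/`clip_range` arguments, and the assumption that every field has shape `(max_sequence_length, n_seq, 1)` with `mask == 0` on padded steps are illustrative only and not taken from this PR's `train()` implementation:

```python
# Hypothetical sketch: masked clipped PPO policy loss over whole-sequence batches.
# Assumes every field of rollout_data has shape (max_sequence_length, n_seq, 1)
# and that padded timesteps carry mask == 0, so they do not contribute to the loss.
import torch as th


def masked_policy_loss(policy, rollout_data, clip_range: float) -> th.Tensor:
    values, log_prob, entropy = policy.evaluate_actions_whole_sequence(
        rollout_data.observations, rollout_data.actions
    )
    ratio = th.exp(log_prob - rollout_data.old_log_prob)
    loss_1 = rollout_data.advantages * ratio
    loss_2 = rollout_data.advantages * th.clamp(ratio, 1 - clip_range, 1 + clip_range)
    mask = rollout_data.mask
    # Masked mean: average only over real (non-padded) timesteps;
    # the value and entropy losses would be masked the same way.
    return -(th.min(loss_1, loss_2) * mask).sum() / mask.sum()
```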
20 changes: 20 additions & 0 deletions sb3_contrib/common/recurrent/type_aliases.py
@@ -31,3 +31,23 @@ class RecurrentDictRolloutBufferSamples(NamedTuple):
    lstm_states: RNNStates
    episode_starts: th.Tensor
    mask: th.Tensor


class RecurrentRolloutBufferSequenceSamples(NamedTuple):
    observations: th.Tensor
    actions: th.Tensor
    old_values: th.Tensor
    old_log_prob: th.Tensor
    advantages: th.Tensor
    returns: th.Tensor
    mask: th.Tensor


class RecurrentDictRolloutBufferSequenceSamples(NamedTuple):
    observations: TensorDict
    actions: th.Tensor
    old_values: th.Tensor
    old_log_prob: th.Tensor
    advantages: th.Tensor
    returns: th.Tensor
    mask: th.Tensor
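As a usage sketch for these sequence sample types (assumptions, not code from this PR): with whole-sequence batching, the `batch_size` passed to `get()` counts sequences per minibatch rather than individual transitions. The environment, `hidden_state_shape` values, and hyper-parameters below are illustrative only:

```python
# Hypothetical usage sketch of the whole-sequence buffer; all values are illustrative.
import gymnasium as gym
from sb3_contrib.common.recurrent.buffers import RecurrentSequenceRolloutBuffer

env = gym.make("CartPole-v1")
n_envs, n_steps, n_lstm_layers, lstm_hidden_size = 1, 128, 1, 64

buffer = RecurrentSequenceRolloutBuffer(
    n_steps,
    env.observation_space,
    env.action_space,
    hidden_state_shape=(n_steps, n_lstm_layers, n_envs, lstm_hidden_size),
    device="cpu",
    n_envs=n_envs,
)

# Once collect_rollouts() has filled the buffer, each minibatch contains
# up to 8 whole episodes, padded to the longest episode in the batch.
if buffer.full:
    for rollout_data in buffer.get(batch_size=8):
        obs = rollout_data.observations  # (max_sequence_length, n_seq, *obs_shape)
        mask = rollout_data.mask  # padding mask for the batch
```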