
Implemented CrossQ #243

Open

Wants to merge 29 commits into base: master

29 commits
9afecf5
Implemented CrossQ
danielpalen May 3, 2024
4fa78a7
Fixed code style
danielpalen May 5, 2024
7ce57de
Clean up, comments and refactored to sbx variable names
danielpalen May 12, 2024
9c339b8
1024 neuron Q function (sbx default)
danielpalen May 12, 2024
2b1ff5e
batch norm parameters as function arguments
danielpalen May 12, 2024
aace2ac
clean up. reshape instead of split
danielpalen May 12, 2024
4df7111
Added policy delay
danielpalen May 12, 2024
5583225
fixed commit-checks
danielpalen May 12, 2024
567c2fb
Fix f-string
araffin May 13, 2024
8970ed0
Update documentation
araffin May 13, 2024
8792621
Rename to torch layers
araffin May 13, 2024
230a948
Fix for policy delay and minor edits
araffin May 13, 2024
cd8bd7d
Update tests
araffin May 13, 2024
27a96f6
Update documentation
araffin May 13, 2024
7d6c642
Merge branch 'master' into feat/crossq
araffin Jul 1, 2024
3927a70
Update doc
araffin Jul 6, 2024
2019327
Add more tests for crossQ
araffin Jul 6, 2024
b0213ec
Improve doc and expose batchnorm params
araffin Jul 6, 2024
9772ecf
Merge branch 'master' into feat/crossq
araffin Jul 6, 2024
454224d
Add some comments and todos and fix type check
araffin Jul 6, 2024
a7bbac9
Merge branch 'feat/crossq' of github.com:danielpalen/stable-baselines…
araffin Jul 6, 2024
bbd654c
Use torch module for BN
araffin Jul 19, 2024
bb80218
Re-organize losses
araffin Jul 19, 2024
a717d13
Add set_bn_training_mode
araffin Jul 19, 2024
cb1bc8f
Simplify network creation with new SB3 version, and fix default momentum
araffin Jul 20, 2024
a88a19b
Use different b1 for Adam as in original implementation
araffin Jul 20, 2024
32f66fe
Reformat TOML file
araffin Jul 20, 2024
03db09e
Update CI workflow, skip mypy for 3.8
araffin Jul 22, 2024
244b930
Merge branch 'master' into feat/crossq
araffin Aug 13, 2024
1 change: 1 addition & 0 deletions README.md
@@ -31,6 +31,7 @@ See documentation for the full list of included features.
- [PPO with recurrent policy (RecurrentPPO aka PPO LSTM)](https://ppo-details.cleanrl.dev//2021/11/05/ppo-implementation-details/)
- [Truncated Quantile Critics (TQC)](https://arxiv.org/abs/2005.04269)
- [Trust Region Policy Optimization (TRPO)](https://arxiv.org/abs/1502.05477)
- [Batch Normalization in Deep Reinforcement Learning (CrossQ)](https://openreview.net/forum?id=PczQtTsTIX)

**Gym Wrappers**:
- [Time Feature Wrapper](https://arxiv.org/abs/1712.00378)
7 changes: 7 additions & 0 deletions docs/common/torch_layers.rst
@@ -0,0 +1,7 @@
.. _th_layers:

Torch Layers
============

.. automodule:: sb3_contrib.common.torch_layers
:members:
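
A minimal usage sketch of the ``BatchRenorm1d`` layer provided by this module (illustrative only; see the class docstring in the ``sb3_contrib/common/torch_layers.py`` diff below for the exact warmup and running-statistics behavior):

.. code-block:: python

import torch

from sb3_contrib.common.torch_layers import BatchRenorm1d

layer = BatchRenorm1d(32)  # 32 input features
x = torch.randn(256, 32)

layer.train()  # training mode: batch statistics during warmup, running stats are updated
y_train = layer(x)

layer.eval()  # eval mode: running statistics are used and not updated
y_eval = layer(x)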
13 changes: 13 additions & 0 deletions docs/guide/examples.rst
@@ -113,3 +113,16 @@ Train a PPO agent with a recurrent policy on the CartPole environment.
obs, rewards, dones, info = vec_env.step(action)
episode_starts = dones
vec_env.render("human")

CrossQ
------

Train a CrossQ agent on the Pendulum environment.

.. code-block:: python

from sb3_contrib import CrossQ

model = CrossQ("MlpPolicy", "Pendulum-v1", verbose=1, policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[1024, 1024])))
model.learn(total_timesteps=5_000, log_interval=4)
model.save("crossq_pendulum")
Binary file added docs/images/crossQ_performance.png
2 changes: 2 additions & 0 deletions docs/index.rst
@@ -32,6 +32,7 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
:caption: RL Algorithms

modules/ars
modules/crossq
modules/ppo_mask
modules/ppo_recurrent
modules/qrdqn
@@ -42,6 +43,7 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
:maxdepth: 1
:caption: Common

common/torch_layers
common/utils
common/wrappers

26 changes: 25 additions & 1 deletion docs/misc/changelog.rst
@@ -3,6 +3,30 @@
Changelog
==========


Release 2.4.0a0 (WIP)
--------------------------

Breaking Changes:
^^^^^^^^^^^^^^^^^

New Features:
^^^^^^^^^^^^^
- Added CrossQ (@danielpalen)

Bug Fixes:
^^^^^^^^^^

Deprecations:
^^^^^^^^^^^^^

Others:
^^^^^^^

Documentation:
^^^^^^^^^^^^^^


Release 2.3.0 (2024-03-31)
--------------------------

@@ -554,4 +578,4 @@ Contributors:
-------------

@ku2482 @guyk1971 @minhlong94 @ayeright @kronion @glmcdona @cyprienc @sgillen @Gregwar @rnederstigt @qgallouedec
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua @jonasreiher @icheered @Armandpl
@mlodel @CppMaster @burakdmb @honglu2875 @ZikangXiong @AlexPasqua @jonasreiher @icheered @Armandpl @danielpalen
100 changes: 100 additions & 0 deletions docs/modules/crossq.rst
@@ -0,0 +1,100 @@
.. _crossq:

.. automodule:: sb3_contrib.crossq


CrossQ
======

Implementation of CrossQ proposed in:

`Bhatt A.* & Palenicek D.* et al. Batch Normalization in Deep Reinforcement Learning for Greater Sample Efficiency and Simplicity. ICLR 2024.`

CrossQ is a simple and efficient algorithm that uses batch normalization to improve the sample efficiency of off-policy deep reinforcement learning algorithms.
It is based on the idea of carefully introducing batch normalization layers in the critic network and dropping target networks.
This yields a simpler and more sample-efficient algorithm that does not require high update-to-data ratios.
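
The key trick can be sketched as follows. This is a simplified, illustrative snippet, not this PR's actual ``train()`` implementation; the dimensions and the stand-in ``q_net`` are hypothetical. Current and next state-action pairs are passed through the batch-normalized critic in a single joint forward pass, so that the normalization statistics cover both distributions, and the output is split back afterwards.

.. code-block:: python

import torch
import torch.nn as nn

# Hypothetical dimensions: obs dim 17, action dim 6, batch of 256 transitions
obs, next_obs = torch.randn(256, 17), torch.randn(256, 17)
actions, next_actions = torch.randn(256, 6), torch.randn(256, 6)

# Stand-in critic with batch norm (this PR uses BatchRenorm1d instead of BatchNorm1d)
q_net = nn.Sequential(
    nn.Linear(17 + 6, 1024), nn.BatchNorm1d(1024), nn.ReLU(), nn.Linear(1024, 1)
)

# Joint forward pass: the normalization statistics cover both the current
# and the next state-action distribution
cat_obs = torch.cat([obs, next_obs], dim=0)
cat_actions = torch.cat([actions, next_actions], dim=0)
q_joint = q_net(torch.cat([cat_obs, cat_actions], dim=1))

# Split back into Q(s, a) and Q(s', a'); the TD target is built from the
# detached q_next, so no separate target network is needed
q_current, q_next = torch.split(q_joint, obs.shape[0], dim=0)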

.. rubric:: Available Policies

.. autosummary::
:nosignatures:

MlpPolicy
Member

Could you add at least the multi input policy? (so we can try it in combination with HER)
Only the feature extractor should be changed normally.

And what do you think about adding CnnPolicy?

Author

This is a good point. I looked into it but have not added it yet. If I am not mistaken, this would also require some changes to the CrossQ train() function, since concatenating and splitting the batches would then need control flow that depends on the policy being used.
For simplicity's sake (for now), and since I did not have time to try and evaluate the multi-input policy, I have not added it yet.


.. note::

Compared to the original implementation, the default network architecture for the q-value function is ``[1024, 1024]``
instead of ``[2048, 2048]``, which provides a good compromise between speed and performance.
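
To recover the critic size used in the paper, the architecture can be overridden via ``policy_kwargs``, for example (a minimal sketch):

.. code-block:: python

from sb3_contrib import CrossQ

# Use the paper's [2048, 2048] critic instead of the default [1024, 1024]
model = CrossQ(
    "MlpPolicy",
    "Walker2d-v4",
    policy_kwargs=dict(net_arch=dict(pi=[256, 256], qf=[2048, 2048])),
)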

Notes
-----

- Original paper: https://openreview.net/pdf?id=PczQtTsTIX
- Original Implementation: https://github.com/adityab/CrossQ
- SBX Implementation: https://github.com/araffin/sbx


Can I use?
----------

- Recurrent policies: ❌
- Multi processing: ✔️
- Gym spaces:


============= ====== ===========
Space Action Observation
============= ====== ===========
Discrete ❌ ✔️
Box ✔️ ✔️
MultiDiscrete ❌ ✔️
MultiBinary ❌ ✔️
Dict ❌ ✔️
============= ====== ===========


Example
-------

.. code-block:: python

from sb3_contrib import CrossQ

model = CrossQ("MlpPolicy", "Walker2d-v4")
model.learn(total_timesteps=1_000_000)
model.save("crossq_walker")


Results
-------

Performance evaluation of CrossQ on six MuJoCo environments, compared to the results from the original paper as well as to the `SBX <https://github.com/araffin/sbx>`_ implementation.

.. image:: ../images/crossQ_performance.png

Comments
--------

This implementation is based on the SB3 SAC implementation.


Parameters
----------

.. autoclass:: CrossQ
:members:
:inherited-members:

.. _crossq_policies:

CrossQ Policies
---------------

.. autoclass:: MlpPolicy
:members:
:inherited-members:

.. autoclass:: sb3_contrib.crossq.policies.CrossQPolicy
:members:
:noindex:
2 changes: 2 additions & 0 deletions sb3_contrib/__init__.py
@@ -1,6 +1,7 @@
import os

from sb3_contrib.ars import ARS
from sb3_contrib.crossq import CrossQ
from sb3_contrib.ppo_mask import MaskablePPO
from sb3_contrib.ppo_recurrent import RecurrentPPO
from sb3_contrib.qrdqn import QRDQN
@@ -14,6 +15,7 @@

__all__ = [
"ARS",
"CrossQ",
"MaskablePPO",
"RecurrentPPO",
"QRDQN",
114 changes: 114 additions & 0 deletions sb3_contrib/common/torch_layers.py
@@ -0,0 +1,114 @@
import torch

__all__ = ["BatchRenorm1d", "BatchRenorm"]


class BatchRenorm(torch.jit.ScriptModule):
"""
BatchRenorm Module (https://arxiv.org/abs/1702.03275).
Adapted to PyTorch from sbx.sbx.common.jax_layers.BatchRenorm

BatchRenorm is an improved version of vanilla BatchNorm. Contrary to BatchNorm,
BatchRenorm uses the running statistics for normalizing the batches after a warmup phase.
This makes it less prone to suffering from "outlier" batches that can happen
during very long training runs and, therefore, more robust over long training runs.

During the warmup phase, it behaves exactly like a BatchNorm layer. After the warmup phase,
the running statistics are used for normalization. The running statistics are updated during
training mode. During evaluation mode, the running statistics are used for normalization but
not updated.

:param num_features: Number of features in the input tensor.
:param eps: A value added to the variance for numerical stability.
:param momentum: The value used for the ra_mean and ra_var computation.
:param affine: A boolean value that when set to True, this module has learnable
affine parameters. Default: True
:param warmup_steps: Number of warmup steps that are performed before the running statistics
are used for normalization. During the warmup phase, the batch statistics are used.
"""

def __init__(
self,
num_features: int,
eps: float = 0.001,
momentum: float = 0.01,
affine: bool = True,
warmup_steps: int = 100_000,
):
super().__init__()
# Running average mean and variance
self.register_buffer("ra_mean", torch.zeros(num_features, dtype=torch.float))
self.register_buffer("ra_var", torch.ones(num_features, dtype=torch.float))
self.register_buffer("steps", torch.tensor(0, dtype=torch.long))
self.scale = torch.nn.Parameter(torch.ones(num_features, dtype=torch.float))
self.bias = torch.nn.Parameter(torch.zeros(num_features, dtype=torch.float))

self.affine = affine
self.eps = eps
self.step = 0
self.momentum = momentum
self.rmax = 3.0
self.dmax = 5.0
self.warmup_steps = warmup_steps

def _check_input_dim(self, x: torch.Tensor) -> None:
raise NotImplementedError()

def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Normalize the input tensor.

:param x: Input tensor
:return: Normalized tensor.
"""

if self.training:
batch_mean = x.mean(0)
batch_var = x.var(0)
batch_std = (batch_var + self.eps).sqrt()

# Use batch statistics during initial warm up phase.
# Note: in the original paper, after some warmup phase (batch norm phase of 5k steps)
# the constraints are linearly relaxed to r_max/d_max over 40k steps
# Here we only have a warmup phase
if self.steps > self.warmup_steps:

running_std = (self.ra_var + self.eps).sqrt()
# scale
r = (batch_std / running_std).detach()
r = r.clamp(1 / self.rmax, self.rmax)
# bias
d = ((batch_mean - self.ra_mean) / running_std).detach()
d = d.clamp(-self.dmax, self.dmax)

# BatchNorm normalization, using minibatch stats and running average stats
# Because we normalize with custom_mean and custom_var below, this is equivalent to
# ((x - x_mean) / sigma) * r + d = ((x - x_mean) * r + d * sigma) / sigma
# where sigma = sqrt(var)
custom_mean = batch_mean - d * batch_var.sqrt() / r
custom_var = batch_var / (r**2)

else:
custom_mean, custom_var = batch_mean, batch_var

# Update Running Statistics
self.ra_mean += self.momentum * (batch_mean.detach() - self.ra_mean)
self.ra_var += self.momentum * (batch_var.detach() - self.ra_var)
self.steps += 1

else:
custom_mean, custom_var = self.ra_mean, self.ra_var

# Normalize
x = (x - custom_mean[None]) / (custom_var[None] + self.eps).sqrt()

if self.affine:
x = self.scale * x + self.bias

return x


class BatchRenorm1d(BatchRenorm):
def _check_input_dim(self, x: torch.Tensor) -> None:
if x.dim() == 1:
raise ValueError(f"Expected 2D or 3D input (got {x.dim()}D input)")
4 changes: 4 additions & 0 deletions sb3_contrib/crossq/__init__.py
@@ -0,0 +1,4 @@
from sb3_contrib.crossq.crossq import CrossQ
from sb3_contrib.crossq.policies import MlpPolicy

__all__ = ["CrossQ", "MlpPolicy"]