Implemented CrossQ #243

Open

danielpalen wants to merge 29 commits into base: master from danielpalen:feat/crossq

Changes from 2 commits

Commits (29)
9afecf5
Implemented CrossQ
danielpalen May 3, 2024
4fa78a7
Fixed code style
danielpalen May 5, 2024
7ce57de
Clean up, comments and refactored to sbx variable names
danielpalen May 12, 2024
9c339b8
1024 neuron Q function (sbx default)
danielpalen May 12, 2024
2b1ff5e
batch norm parameters as function arguments
danielpalen May 12, 2024
aace2ac
clean up. reshape instead of split
danielpalen May 12, 2024
4df7111
Added policy delay
danielpalen May 12, 2024
5583225
fixed commit-checks
danielpalen May 12, 2024
567c2fb
Fix f-string
araffin May 13, 2024
8970ed0
Update documentation
araffin May 13, 2024
8792621
Rename to torch layers
araffin May 13, 2024
230a948
Fix for policy delay and minor edits
araffin May 13, 2024
cd8bd7d
Update tests
araffin May 13, 2024
27a96f6
Update documentation
araffin May 13, 2024
7d6c642
Merge branch 'master' into feat/crossq
araffin Jul 1, 2024
3927a70
Update doc
araffin Jul 6, 2024
2019327
Add more tests for crossQ
araffin Jul 6, 2024
b0213ec
Improve doc and expose batchnorm params
araffin Jul 6, 2024
9772ecf
Merge branch 'master' into feat/crossq
araffin Jul 6, 2024
454224d
Add some comments and todos and fix type check
araffin Jul 6, 2024
a7bbac9
Merge branch 'feat/crossq' of github.com:danielpalen/stable-baselines…
araffin Jul 6, 2024
bbd654c
Use torch module for BN
araffin Jul 19, 2024
bb80218
Re-organize losses
araffin Jul 19, 2024
a717d13
Add set_bn_training_mode
araffin Jul 19, 2024
cb1bc8f
Simplify network creation with new SB3 version, and fix default momentum
araffin Jul 20, 2024
a88a19b
Use different b1 for Adam as in original implementation
araffin Jul 20, 2024
32f66fe
Reformat TOML file
araffin Jul 20, 2024
03db09e
Update CI workflow, skip mypy for 3.8
araffin Jul 22, 2024
244b930
Merge branch 'master' into feat/crossq
araffin Aug 13, 2024
1 change: 1 addition & 0 deletions README.md
@@ -31,6 +31,7 @@ See documentation for the full list of included features.
- [PPO with recurrent policy (RecurrentPPO aka PPO LSTM)](https://ppo-details.cleanrl.dev//2021/11/05/ppo-implementation-details/)
- [Truncated Quantile Critics (TQC)](https://arxiv.org/abs/2005.04269)
- [Trust Region Policy Optimization (TRPO)](https://arxiv.org/abs/1502.05477)
- [Batch Normalization in Deep Reinforcement Learning (CrossQ)](https://openreview.net/forum?id=PczQtTsTIX)

**Gym Wrappers**:
- [Time Feature Wrapper](https://arxiv.org/abs/1712.00378)
Binary file added docs/images/crossQ_performance.png
1 change: 1 addition & 0 deletions docs/index.rst
@@ -32,6 +32,7 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
:caption: RL Algorithms

modules/ars
modules/crossq
modules/ppo_mask
modules/ppo_recurrent
modules/qrdqn
99 changes: 99 additions & 0 deletions docs/modules/crossq.rst
@@ -0,0 +1,99 @@
.. _crossq:

.. automodule:: sb3_contrib.crossq


CrossQ
======

Implementation of CrossQ proposed in:

`Bhatt A.* & Palenicek D.* et al. Batch Normalization in Deep Reinforcement Learning for Greater Sample Efficiency and Simplicity. ICLR 2024.`

CrossQ is a simple and efficient algorithm that uses batch normalization to improve the sample efficiency of off-policy deep reinforcement learning algorithms.
It is based on the idea of carefully introducing batch normalization layers in the critic network and dropping target networks.
This yields a simpler and more sample-efficient algorithm that does not require high update-to-data ratios.
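
A minimal, illustrative sketch of this idea (the helper and variable names below are not the actual SB3 code): current and next transitions are concatenated into one batch, passed through the batch-normalized critic in a single forward pass, and then split again, so the normalization statistics are computed over both distributions at once.

.. code-block:: python

  import torch as th

  def joint_critic_forward(critic, obs, actions, next_obs, next_actions):
      # Concatenate current and next transitions so that the normalization
      # layers compute their statistics over both distributions at once
      all_obs = th.cat([obs, next_obs], dim=0)
      all_actions = th.cat([actions, next_actions], dim=0)
      all_q_values = critic(all_obs, all_actions)
      # Split back: the first half is Q(s, a), the second half is Q(s', a')
      q_values, next_q_values = th.split(all_q_values, obs.shape[0], dim=0)
      # No target network: the next Q-values are simply detached
      return q_values, next_q_values.detach()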

.. rubric:: Available Policies

.. autosummary::
  :nosignatures:

  MlpPolicy
Member


Could you add at least the multi input policy? (so we can try it in combination with HER)
Normally, only the feature extractor should need to be changed.

And what do you think about adding CnnPolicy?

Author


This is a good point. I looked into it but have not added it yet. If I am not mistaken, this would also require some changes to the CrossQ train() function, since concatenating and splitting the batches would then need some control flow depending on the policy used.
For simplicity's sake (for now), and since I did not have time to try and evaluate the multi input policy, I did not add it yet.
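
For illustration, a rough sketch of the kind of control flow the reply refers to (a hypothetical helper, not part of this PR): with a Dict observation space, each entry would need to be concatenated (and later split) separately before the joint forward pass through the batch-normalized critic.

.. code-block:: python

  import torch as th

  def cat_observations(obs, next_obs):
      # Dict observation spaces (e.g. when combined with HER) need per-key handling
      if isinstance(obs, dict):
          return {key: th.cat([obs[key], next_obs[key]], dim=0) for key in obs}
      return th.cat([obs, next_obs], dim=0)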



Notes
-----

- Original paper: https://openreview.net/pdf?id=PczQtTsTIX
- Original Implementation: https://github.com/adityab/CrossQ


Can I use?
----------

- Recurrent policies: ❌
- Multi processing: ✔️
- Gym spaces:


============= ====== ===========
Space Action Observation
============= ====== ===========
Discrete ❌ ✔️
Box ✔️ ✔️
MultiDiscrete ❌ ✔️
MultiBinary ❌ ✔️
Dict ❌ ✔️
============= ====== ===========


Example
-------

.. code-block:: python

  import gymnasium as gym
  import numpy as np

  from sb3_contrib import CrossQ

  model = CrossQ("MlpPolicy", "Walker2d-v4")
  model.learn(total_timesteps=1_000_000)
  model.save("crossq_walker")
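
The commits in this PR also add a policy delay and expose the batch normalization parameters. A hedged sketch of how these might be passed (the keyword names below are inferred from the commit messages and are assumptions, not a confirmed API; see the Parameters section below for the actual signature):

.. code-block:: python

  from sb3_contrib import CrossQ

  # NOTE: `policy_delay` and the batch-norm keyword below are illustrative
  # names inferred from the commit messages, not a confirmed API
  model = CrossQ(
      "MlpPolicy",
      "Walker2d-v4",
      policy_delay=3,
      policy_kwargs=dict(batch_norm_momentum=0.01),
  )
  model.learn(total_timesteps=1_000_000)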


Results
-------

Performance evaluation of CrossQ on six MuJoCo environments, compared to the results from the original paper as well as the SBX implementation.

.. image:: ../images/crossQ_performance.png

Comments
--------

This implementation is based on the SB3 SAC implementation.


Parameters
----------

.. autoclass:: CrossQ
  :members:
  :inherited-members:

.. _crossq_policies:

CrossQ Policies
---------------

.. autoclass:: MlpPolicy
  :members:
  :inherited-members:

.. autoclass:: sb3_contrib.crossq.policies.CrossQPolicy
  :members:
  :noindex:

2 changes: 2 additions & 0 deletions sb3_contrib/__init__.py
@@ -1,6 +1,7 @@
import os

from sb3_contrib.ars import ARS
from sb3_contrib.crossq import CrossQ
from sb3_contrib.ppo_mask import MaskablePPO
from sb3_contrib.ppo_recurrent import RecurrentPPO
from sb3_contrib.qrdqn import QRDQN
@@ -14,6 +15,7 @@

__all__ = [
    "ARS",
    "CrossQ",
    "MaskablePPO",
    "RecurrentPPO",
    "QRDQN",
94 changes: 94 additions & 0 deletions sb3_contrib/common/network_layers.py
@@ -0,0 +1,94 @@
import torch

__all__ = ["BatchRenorm1d"]


class BatchRenorm(torch.jit.ScriptModule):
    """
    BatchRenorm Module (https://arxiv.org/abs/1702.03275).
    Adapted from flax.linen.normalization.BatchNorm.

    BatchRenorm is an improved version of vanilla BatchNorm. In contrast to BatchNorm,
    BatchRenorm uses the running statistics for normalizing the batches after a warmup phase.
    This makes it less prone to suffering from "outlier" batches that can occur
    during very long training runs and therefore more robust over such runs.

    During the warmup phase, it behaves exactly like a BatchNorm layer.

    Args:
        num_features: Number of features in the input tensor.
        eps: A value added to the variance for numerical stability.
        momentum: The value used for the running_mean and running_var computation.
        affine: A boolean value that when set to True, this module has learnable
            affine parameters. Default: True
    """

    def __init__(
        self,
        num_features: int,
        eps: float = 0.001,
        momentum: float = 0.01,
        affine: bool = True,
    ):
        super().__init__()
        self.register_buffer("running_mean", torch.zeros(num_features, dtype=torch.float))
        self.register_buffer("running_var", torch.ones(num_features, dtype=torch.float))
        self.register_buffer("num_batches_tracked", torch.tensor(0, dtype=torch.long))
        self.scale = torch.nn.Parameter(torch.ones(num_features, dtype=torch.float))
        self.bias = torch.nn.Parameter(torch.zeros(num_features, dtype=torch.float))

        self.affine = affine
        self.eps = eps
        self.step = 0
        self.momentum = momentum
        self.rmax = 3.0
        self.dmax = 5.0

    def _check_input_dim(self, x: torch.Tensor) -> None:
        raise NotImplementedError()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            batch_mean = x.mean(0)
            batch_var = x.var(0)
            batch_std = (batch_var + self.eps).sqrt()

            if self.num_batches_tracked > 100_000:
                # After the warmup phase, apply the Batch Renormalization correction:
                # clamp the ratio and offset between batch and running statistics so that
                # normalization effectively relies on the running statistics.
                running_std = (self.running_var + self.eps).sqrt()
                running_mean = self.running_mean

                r = (batch_std / running_std).detach()
                r = r.clamp(1 / self.rmax, self.rmax)
                d = ((batch_mean - running_mean) / running_std).detach()
                d = d.clamp(-self.dmax, self.dmax)

                # Corrected mean/variance such that (x - m) / sqrt(v) matches the
                # renormalized output r * (x - batch_mean) / batch_std + d (up to the eps term)
                m = batch_mean - d * batch_var.sqrt() / r
                v = batch_var / (r**2)
            else:
                # Use batch statistics during the initial warmup phase
                m, v = batch_mean, batch_var

            # Update running statistics
            self.running_mean += self.momentum * (batch_mean.detach() - self.running_mean)
            self.running_var += self.momentum * (batch_var.detach() - self.running_var)
            self.num_batches_tracked += 1
        else:
            # In evaluation mode, always normalize with the running statistics
            m, v = self.running_mean, self.running_var

        # Normalize
        x = (x - m[None]) / (v[None] + self.eps).sqrt()

        if self.affine:
            x = self.scale * x + self.bias

        return x


class BatchRenorm1d(BatchRenorm):
    def _check_input_dim(self, x: torch.Tensor) -> None:
        if x.dim() == 1:
            raise ValueError(f"expected 2D or 3D input (got {x.dim()}D input)")
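
For illustration, a minimal sketch of how BatchRenorm1d could be dropped into a small critic MLP, toggling between batch statistics (training/warmup) and running statistics (evaluation); the layer sizes and inputs here are arbitrary, not taken from the PR:

    import torch

    critic = torch.nn.Sequential(
        torch.nn.Linear(8, 256),
        BatchRenorm1d(256),
        torch.nn.ReLU(),
        torch.nn.Linear(256, 1),
    )

    features = torch.randn(32, 8)
    critic.train()  # warmup/training: normalize with batch statistics
    q_train = critic(features)
    critic.eval()  # evaluation: normalize with running statistics
    q_eval = critic(features)
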
4 changes: 4 additions & 0 deletions sb3_contrib/crossq/__init__.py
@@ -0,0 +1,4 @@
from sb3_contrib.crossq.crossq import CrossQ
from sb3_contrib.crossq.policies import MlpPolicy

__all__ = ["CrossQ", "MlpPolicy"]