diff --git a/README.md b/README.md
index 5d7e7746..5593d4a4 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,7 @@ See documentation for the full list of included features.
 **RL Algorithms**:
 - [Truncated Quantile Critics (TQC)](https://arxiv.org/abs/2005.04269)
 - [Quantile Regression DQN (QR-DQN)](https://arxiv.org/abs/1710.10044)
+- [Bootstrapped Dual Policy Iteration (BDPI)](https://arxiv.org/abs/1903.04193)
 
 **Gym Wrappers**:
 - [Time Feature Wrapper](https://arxiv.org/abs/1712.00378)
diff --git a/docs/guide/algos.rst b/docs/guide/algos.rst
index f1b0ed55..6dd4a4e5 100644
--- a/docs/guide/algos.rst
+++ b/docs/guide/algos.rst
@@ -10,11 +10,12 @@ Name ``Box`` ``Discrete`` ``MultiDiscrete`` ``MultiBinary`` Multi Pr
 ============ =========== ============ ================= =============== ================
 TQC          ✔️           ❌            ❌                 ❌               ❌
 QR-DQN       ❌           ✔️            ❌                 ❌               ❌
+BDPI         ❌           ✔️            ❌                 ❌               ✔️
 ============ =========== ============ ================= =============== ================
 
 .. note::
-  Non-array spaces such as ``Dict`` or ``Tuple`` are not currently supported by any algorithm.
+  ``Dict`` observation spaces are supported by BDPI when using ``MultiInputPolicy`` instead of ``MlpPolicy``; ``Tuple`` spaces are not supported.
 
 Actions ``gym.spaces``:
diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
index 12b7d718..d77e5003 100644
--- a/docs/guide/examples.rst
+++ b/docs/guide/examples.rst
@@ -30,6 +30,19 @@ Train a Quantile Regression DQN (QR-DQN) agent on the CartPole environment.
     model.learn(total_timesteps=10000, log_interval=4)
     model.save("qrdqn_cartpole")
 
+BDPI
+----
+
+Train a Bootstrapped Dual Policy Iteration (BDPI) agent on the LunarLander environment.
+
+.. code-block:: python
+
+    from sb3_contrib import BDPI
+
+    policy_kwargs = dict(n_critics=8)
+    model = BDPI("MlpPolicy", "LunarLander-v2", policy_kwargs=policy_kwargs, verbose=1)
+    model.learn(total_timesteps=50000, log_interval=4)
+    model.save("bdpi_lunarlander")
 
 .. PyBullet: Normalizing input features
 .. ------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index f86c47ee..0b7aec85 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -33,6 +33,7 @@ RL Baselines3 Zoo also offers a simple interface to train, evaluate agents and d
 
   modules/tqc
   modules/qrdqn
+  modules/bdpi
 
 .. toctree::
   :maxdepth: 1
diff --git a/docs/modules/bdpi.rst b/docs/modules/bdpi.rst
new file mode 100644
index 00000000..941c99ab
--- /dev/null
+++ b/docs/modules/bdpi.rst
@@ -0,0 +1,136 @@
+.. _bdpi:
+
+.. automodule:: sb3_contrib.bdpi
+
+BDPI
+====
+
+`Bootstrapped Dual Policy Iteration <https://arxiv.org/abs/1903.04193>`_ is an actor-critic algorithm for
+discrete action spaces. The distinctive components of BDPI are as follows:
+
+- Like Bootstrapped DQN, it uses several critics, with each critic having a Qa and a Qb network
+  (as in Clipped DQN).
+- The BDPI critics, inspired by the DQN literature, are therefore off-policy. They know nothing
+  about the actor and do not use any form of off-policy correction to evaluate it. They
+  instead directly approximate Q*, the optimal value function.
+- The actor is trained with an update rule inspired by Conservative Policy Iteration, instead of
+  Policy Gradient (as used by A2C, PPO, SAC or DDPG). This use of Conservative Policy Iteration
+  is what makes the BDPI actor compatible with off-policy critics.
+
+As a result, BDPI can be configured to be highly sample-efficient, at the cost of compute efficiency.
+The off-policy critics can learn aggressively (many samples, many gradient steps), as they don't have
+to remain close to the actor. The actor then learns from a mixture of high-quality critics, leading
+to good exploration even in challenging environments (see the Table environment described in the paper
+linked above).
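+
+For intuition, the two updates can be summarised with the tabular sketch below. This is an
+illustration only (plain NumPy arrays, a single transition, and made-up names), not the actual
+implementation, which trains neural networks by regression on these targets, samples from a replay
+buffer, updates several critics in parallel and swaps Qa and Qb after each training step:
+
+.. code-block:: python
+
+    import numpy as np
+
+    def bdpi_update(qa, qb, pi, s, a, r, s_next, done, gamma=0.99, critic_lr=0.2, actor_lr=0.05):
+        """One BDPI update of a single critic (qa, qb) and of the actor pi, for one transition."""
+        # Clipped-DQN target: greedy action of Qa, evaluated with min(Qa, Qb)
+        a_star = np.argmax(qa[s_next])
+        target = r + (1.0 - done) * gamma * min(qa[s_next, a_star], qb[s_next, a_star])
+
+        # Q-Learning step of the critic (it ignores the actor and approximates Q*)
+        qa[s, a] += critic_lr * (target - qa[s, a])
+
+        # Conservative Policy Iteration: move the actor towards the greedy policy of the critic
+        greedy = np.zeros_like(pi[s])
+        greedy[np.argmax(qa[s])] = 1.0
+        pi[s] = (1.0 - actor_lr) * pi[s] + actor_lr * greedy
+        pi[s] /= pi[s].sum()
+
+    # Toy usage: 3 states, 2 actions
+    qa, qb = np.zeros((3, 2)), np.zeros((3, 2))
+    pi = np.full((3, 2), 0.5)
+    bdpi_update(qa, qb, pi, s=0, a=1, r=1.0, s_next=2, done=0.0)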
+
+Can I use?
+----------
+
+- Recurrent policies: ❌
+- Multi processing: ✔️
+- Gym spaces:
+
+============= ====== ===========
+Space         Action Observation
+============= ====== ===========
+Discrete      ✔️      ✔️
+Box           ❌      ✔️
+MultiDiscrete ❌      ✔️
+MultiBinary   ❌      ✔️
+Dict          ❌      ✔️
+============= ====== ===========
+
+Example
+-------
+
+Train a BDPI agent on ``LunarLander-v2``, with hyper-parameters tuned by Optuna in rl-baselines3-zoo:
+
+.. code-block:: python
+
+    import gym
+
+    from sb3_contrib import BDPI
+
+    env = gym.make("LunarLander-v2")
+
+    model = BDPI(
+        "MlpPolicy",
+        env,
+        actor_lr=0.01,  # How fast the actor pursues the greedy policy of the critics
+        critic_lr=0.234,  # Q-Learning learning rate of the critics
+        batch_size=256,  # 256 experiences sampled from the buffer every time-step, for every critic
+        buffer_size=100000,
+        gradient_steps=64,  # Actor and critics fit for 64 gradient steps per time-step
+        learning_rate=0.00026,  # Adam optimizer learning rate
+        policy_kwargs=dict(net_arch=[64, 64], n_critics=8),  # 8 critics
+        verbose=1,
+        tensorboard_log="./tb_log",
+    )
+
+    model.learn(total_timesteps=50000)
+    model.save("bdpi_lunarlander")
+
+    del model  # remove to demonstrate saving and loading
+
+    model = BDPI.load("bdpi_lunarlander")
+
+    obs = env.reset()
+    while True:
+        action, _states = model.predict(obs)
+        obs, reward, done, info = env.step(action)
+        env.render()
+        if done:
+            obs = env.reset()
+
+
+Results
+-------
+
+LunarLander
+^^^^^^^^^^^
+
+Results for BDPI are available in `this GitHub issue `_.
+
+How to replicate the results?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Clone the `rl-zoo repo <https://github.com/DLR-RM/rl-baselines3-zoo>`_:
+
+.. code-block:: bash
+
+    git clone https://github.com/DLR-RM/rl-baselines3-zoo
+    cd rl-baselines3-zoo/
+
+
+Run the benchmark (replace ``$ENV_ID`` with the environment ID used above, here ``LunarLander-v2``,
+and ``$N`` with the number of CPU cores in your machine):
+
+.. code-block:: bash
+
+    python train.py --algo bdpi --env $ENV_ID --eval-episodes 10 --eval-freq 10000 -params threads:$N
+
+
+Plot the results (here for LunarLander only):
+
+.. code-block:: bash
+
+    python scripts/all_plots.py -a bdpi -e LunarLander -f logs/ -o logs/bdpi_results
+    python scripts/plot_from_file.py -i logs/bdpi_results.pkl -latex -l BDPI
+
+
+Parameters
+----------
+
+.. autoclass:: BDPI
+    :members:
+    :inherited-members:
+
+
+BDPI Policies
+-------------
+
+.. autoclass:: MlpPolicy
+    :members:
+
+.. autoclass:: CnnPolicy
+    :members:
+
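+For ``Dict`` observation spaces, use ``MultiInputPolicy`` instead of ``MlpPolicy``. The snippet
+below is a minimal sketch: ``ToyDictEnv`` is a hypothetical toy environment, not part of the
+library, and the small hyper-parameters only keep the example fast:
+
+.. code-block:: python
+
+    import gym
+    import numpy as np
+    from gym import spaces
+
+    from sb3_contrib import BDPI
+
+    class ToyDictEnv(gym.Env):
+        """Toy environment with a Dict observation space and discrete actions (illustration only)."""
+
+        def __init__(self):
+            super().__init__()
+            self.observation_space = spaces.Dict(
+                {"vec": spaces.Box(low=-1.0, high=1.0, shape=(4,), dtype=np.float32)}
+            )
+            self.action_space = spaces.Discrete(2)
+            self._t = 0
+
+        def reset(self):
+            self._t = 0
+            return self.observation_space.sample()
+
+        def step(self, action):
+            self._t += 1
+            return self.observation_space.sample(), 0.0, self._t >= 10, {}
+
+    model = BDPI(
+        "MultiInputPolicy",
+        ToyDictEnv(),
+        buffer_size=1000,
+        learning_starts=100,
+        gradient_steps=4,
+        policy_kwargs=dict(n_critics=2, net_arch=[32]),
+        verbose=1,
+    )
+    model.learn(total_timesteps=500)
+
+..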
autoclass:: MultiInputPolicy + :members: diff --git a/sb3_contrib/__init__.py b/sb3_contrib/__init__.py index 8f253e12..a3c83525 100644 --- a/sb3_contrib/__init__.py +++ b/sb3_contrib/__init__.py @@ -1,5 +1,6 @@ import os +from sb3_contrib.bdpi import BDPI from sb3_contrib.qrdqn import QRDQN from sb3_contrib.tqc import TQC diff --git a/sb3_contrib/bdpi/__init__.py b/sb3_contrib/bdpi/__init__.py new file mode 100644 index 00000000..7dc9b2ef --- /dev/null +++ b/sb3_contrib/bdpi/__init__.py @@ -0,0 +1,2 @@ +from sb3_contrib.bdpi.bdpi import BDPI +from sb3_contrib.bdpi.policies import CnnPolicy, MlpPolicy, MultiInputPolicy diff --git a/sb3_contrib/bdpi/bdpi.py b/sb3_contrib/bdpi/bdpi.py new file mode 100644 index 00000000..6ee4a160 --- /dev/null +++ b/sb3_contrib/bdpi/bdpi.py @@ -0,0 +1,281 @@ +import random +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +import gym +import torch as th +import torch.multiprocessing as mp +from stable_baselines3.common.buffers import ReplayBuffer +from stable_baselines3.common.off_policy_algorithm import OffPolicyAlgorithm +from stable_baselines3.common.type_aliases import GymEnv, MaybeCallback, Schedule, TensorDict + +from sb3_contrib.bdpi.policies import BDPIPolicy + +# Because BDPI uses many critics per agent, and each critic has 2 Q-Networks, sharing them with file descriptors +# exhausts the maximum number of open file descriptors on many Linux distributions. The file_system sharing method +# creates many small files in /dev/shm, that are then shared by file-name. This avoids reaching the maximum number +# of open file descriptors. +mp.set_sharing_strategy("file_system") + + +def train_model( + model: th.nn.Module, + inp: Union[th.Tensor, TensorDict], + outp: th.Tensor, + gradient_steps: int, +) -> float: + """Train a PyTorch module on inputs and outputs, minimizing the MSE loss for gradient_steps steps. + + :param model: PyTorch module to be trained. It must have a ".optimizer" attribute with an instance of Optimizer in it. + :param inp: Input tensor (or dictionary of tensors if model is a MultiInput model) + :param outp: Expected outputs tensor + :param gradient_steps: Number of gradient steps to execute when minimizing the MSE. + :return: MSE loss (with the 'sum' reduction) after the last gradient step, as a float. + """ + mse_loss = th.nn.MSELoss(reduction="sum") + + for i in range(gradient_steps): + predicted = model(inp) + loss = mse_loss(predicted, outp) + + model.optimizer.zero_grad() + loss.backward() + model.optimizer.step() + + return float(loss.item()) + + +class BDPI(OffPolicyAlgorithm): + """ + Bootstrapped Dual Policy Iteration + + Sample-efficient discrete-action RL algorithm, built on one actor trained + to imitate the greedy policy of several Q-Learning critics. + + Paper: https://arxiv.org/abs/1903.04193 + + :param policy: The policy model to use (MlpPolicy, CnnPolicy, ...) 
+ :param env: The environment to learn from (if registered in Gym, can be str) + :param learning_rate: learning rate for adam optimizer, + the same learning rate will be used for all networks (Q-Values and Actor) + it can be a function of the current progress remaining (from 1 to 0) + :param actor_lr: Conservative Policy Iteration learning rate for the actor (used in a formula, not for Adam gradient steps) + :param critic_lr: Q-Learning "alpha" learning rate for the critics + :param buffer_size: size of the replay buffer + :param learning_starts: how many steps of the model to collect transitions for before learning starts + :param batch_size: Minibatch size for each gradient update + :param gamma: the discount factor + :param train_freq: Update the model every ``train_freq`` steps. Alternatively pass a tuple of frequency and unit + like ``(5, "step")`` or ``(2, "episode")``. + :param gradient_steps: How many gradient steps to do after each rollout (see ``train_freq``) + :param threads: Number of threads to use to train the actor and critics in parallel + :param replay_buffer_class: Replay buffer class to use (for instance ``HerReplayBuffer``). + If ``None``, it will be automatically selected. + :param replay_buffer_kwargs: Keyword arguments to pass to the replay buffer on creation. + :param optimize_memory_usage: Enable a memory efficient variant of the replay buffer + at a cost of more complexity. + See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195 + :param create_eval_env: Whether to create a second environment that will be + used for evaluating the agent periodically. (Only available when passing string for the environment) + :param policy_kwargs: additional arguments to be passed to the policy on creation + :param verbose: the verbosity level: 0 no output, 1 info, 2 debug + :param seed: Seed for the pseudo random generators + :param device: Device (cpu, cuda, ...) on which the code should be run. + Setting it to auto, the code will be run on the GPU if possible. + :param _init_setup_model: Whether or not to build the network at the creation of the instance. 
+ """ + + def __init__( + self, + policy: Union[str, Type[BDPIPolicy]], + env: Union[GymEnv, str], + learning_rate: Union[float, Schedule] = 1e-4, + actor_lr: float = 0.05, + critic_lr: float = 0.2, + buffer_size: int = 1000000, # 1e6 + learning_starts: int = 256, + batch_size: int = 256, + gamma: float = 0.99, + train_freq: Union[int, Tuple[int, str]] = 1, + gradient_steps: int = 20, + threads: int = 1, + replay_buffer_class: Optional[ReplayBuffer] = None, + replay_buffer_kwargs: Optional[Dict[str, Any]] = None, + optimize_memory_usage: bool = False, + tensorboard_log: Optional[str] = None, + create_eval_env: bool = False, + policy_kwargs: Dict[str, Any] = None, + verbose: int = 0, + seed: Optional[int] = None, + device: Union[th.device, str] = "auto", + _init_setup_model: bool = True, + ): + super(BDPI, self).__init__( + policy, + env, + BDPIPolicy, + learning_rate, + buffer_size, + learning_starts, + batch_size, + 0.0, + gamma, + train_freq, + gradient_steps, + None, + replay_buffer_class=replay_buffer_class, + replay_buffer_kwargs=replay_buffer_kwargs, + policy_kwargs=policy_kwargs, + tensorboard_log=tensorboard_log, + verbose=verbose, + device=device, + create_eval_env=create_eval_env, + seed=seed, + use_sde=False, + sde_sample_freq=1, + use_sde_at_warmup=False, + optimize_memory_usage=optimize_memory_usage, + supported_action_spaces=(gym.spaces.Discrete), + sde_support=False, + ) + + self.actor_lr = actor_lr + self.critic_lr = critic_lr + self.threads = threads + self.pool = mp.get_context("spawn").Pool(threads) + + if _init_setup_model: + self._setup_model() + + def _setup_model(self) -> None: + """Create the BDPI actor and critics, and make their memory shared across processes.""" + super(BDPI, self)._setup_model() + + self.actor = self.policy.actor + self.criticsA = self.policy.criticsA + self.criticsB = self.policy.criticsB + + self.actor.share_memory() + + for cA, cB in zip(self.criticsA, self.criticsB): + cA.share_memory() + cB.share_memory() + + def _excluded_save_params(self) -> List[str]: + """Process pools cannot be pickled, so exclude "self.pool" from the saved parameters of BDPI.""" + return super()._excluded_save_params() + ["pool"] + + def train(self, gradient_steps: int, batch_size: int = 64) -> None: + """BDPI Training procedure. + + This method is called every time-step (if train_freq=1, as in the original paper). + Every time this method is called, the following steps are performed: + + - Every critic, in random order, gets updated with the Clipped DQN equation on its own batch of experiences + - Every critic, just after being updated, computes its greedy policy and updates the actor towards it + - After every critic has been updated, their QA and QB networks are swapped. + + This method implements some basic multi-processing: + + - Every critic and the actor are PyTorch modules with share_memory() called on them + - A process pool is used to perform the neural network training operations (gradient descent steps) + + This approach only has a minimal impact on code, but does not scale very well: + + - On the plus side, the actor is trained concurrently by several workers, ala HOGWILD + - However, the predictions (getting Q(next state), computing updated Q-Values and the greedy policy) + all happen sequentially in the main process. With self.threads>8, the bottleneck therefore becomes + the main process, that has to perform all the updates and predictions. The worker processes only + fit neural networks. 
+ """ + + # Update optimizers learning rate + optimizers = [self.actor.optimizer] + [c.optimizer for c in self.criticsA] + [c.optimizer for c in self.criticsB] + self._update_learning_rate(optimizers) + + # Update every critic (and the actor after each critic), in a random order + critic_losses = [] + actor_losses = [] + critics = list(zip(self.criticsA, self.criticsB)) + + random.shuffle(critics) + + for criticA, criticB in critics: + # Sample replay buffer + with th.no_grad(): + replay_data = self.replay_buffer.sample(batch_size, env=self._vec_normalize_env) + + # Update the critic (code taken from DQN) + with th.no_grad(): + qvA = criticA(replay_data.next_observations) + qvB = criticB(replay_data.next_observations) + qv = th.min(qvA, qvB) + + QN = th.arange(replay_data.rewards.shape[0]) + next_q_values = qv[QN, qvA.argmax(1)].reshape(-1, 1) + + # 1-step TD target + target_values = replay_data.rewards + (1 - replay_data.dones) * self.gamma * next_q_values + + # Make real supervised learning target Q-Values (even for non-taken actions) + target_q_values = criticA(replay_data.observations) + actions = replay_data.actions.long().flatten() + target_q_values[QN, actions] += self.critic_lr * (target_values.flatten() - target_q_values[QN, actions]) + + critic_losses.append( + self.pool.apply_async(train_model, (criticA, replay_data.observations, target_q_values, gradient_steps)) + ) + + self.logger.record("train/avg_q", float(target_q_values.mean())) + + # Update the actor + with th.no_grad(): + greedy_actions = target_q_values.argmax(1) + + train_probas = th.zeros_like(target_q_values) + train_probas[QN, greedy_actions] = 1.0 + + # Normalize the direction to be pursued + train_probas /= 1e-6 + train_probas.sum(1)[:, None] + actor_probas = self.actor(replay_data.observations) + + # Imitation learning (or distillation, or reward-penalty Pursuit, all these are the same thing) + alr = self.actor_lr + train_probas = (1.0 - alr) * actor_probas + alr * train_probas + train_probas /= train_probas.sum(-1, keepdim=True) + + actor_losses.append( + self.pool.apply_async(train_model, (self.actor, replay_data.observations, train_probas, gradient_steps)) + ) + + # Log losses + for aloss, closs in zip(actor_losses, critic_losses): + self.logger.record("train/critic_loss", closs.get()) + self.logger.record("train/actor_loss", aloss.get()) + + # Swap QA and QB + self.criticsA, self.criticsB = self.criticsB, self.criticsA + + def learn( + self, + total_timesteps: int, + callback: MaybeCallback = None, + log_interval: int = 4, + eval_env: Optional[GymEnv] = None, + eval_freq: int = -1, + n_eval_episodes: int = 5, + tb_log_name: str = "BDPI", + eval_log_path: Optional[str] = None, + reset_num_timesteps: bool = True, + ) -> OffPolicyAlgorithm: + + return super(BDPI, self).learn( + total_timesteps=total_timesteps, + callback=callback, + log_interval=log_interval, + eval_env=eval_env, + eval_freq=eval_freq, + n_eval_episodes=n_eval_episodes, + tb_log_name=tb_log_name, + eval_log_path=eval_log_path, + reset_num_timesteps=reset_num_timesteps, + ) diff --git a/sb3_contrib/bdpi/policies.py b/sb3_contrib/bdpi/policies.py new file mode 100644 index 00000000..e24f43b0 --- /dev/null +++ b/sb3_contrib/bdpi/policies.py @@ -0,0 +1,320 @@ +from typing import Any, Dict, List, Optional, Type, Union + +import gym +import torch as th +from stable_baselines3.common.policies import BasePolicy, register_policy +from stable_baselines3.common.torch_layers import ( + BaseFeaturesExtractor, + CombinedExtractor, + FlattenExtractor, + 
NatureCNN, + create_mlp, +) +from stable_baselines3.common.type_aliases import Schedule +from stable_baselines3.dqn.policies import QNetwork +from torch import nn + + +class Actor(BasePolicy): + """ + Actor network (policy) for BDPI. + + :param observation_space: Obervation space + :param action_space: Action space + :param net_arch: Network architecture + :param features_extractor: Network to extract features + (a CNN when using images, a nn.Flatten() layer otherwise) + :param features_dim: Number of features + :param activation_fn: Activation function + :param normalize_images: Whether to normalize images or not, + dividing by 255.0 (True by default) + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + net_arch: List[int], + features_extractor: nn.Module, + features_dim: int, + activation_fn: Type[nn.Module] = nn.ReLU, + normalize_images: bool = True, + ): + super(Actor, self).__init__( + observation_space, action_space, features_extractor=features_extractor, normalize_images=normalize_images + ) + + # Save arguments to re-create object at loading + self.net_arch = net_arch + self.features_dim = features_dim + self.activation_fn = activation_fn + + # One neural network in the policy, that maps a state to a discrete distribution over actions + actor_net = create_mlp(features_dim, self.action_space.n, net_arch, activation_fn) + actor_net.append(nn.Softmax(1)) + + self.actor = nn.Sequential(*actor_net) + + def _get_constructor_parameters(self) -> Dict[str, Any]: + data = super()._get_constructor_parameters() + + data.update( + dict( + net_arch=self.net_arch, + features_dim=self.features_dim, + activation_fn=self.activation_fn, + features_extractor=self.features_extractor, + ) + ) + return data + + def forward(self, obs: th.Tensor) -> th.Tensor: + """Output of the neural network (here: probabilities)""" + return self.actor(self.extract_features(obs)) + + def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: + """Action to be executed by the actor""" + probas = self.forward(observation) + + if deterministic: + # Take the argmax + return th.max(probas, 1)[1].flatten() + else: + # Sample according to probabilities + return th.multinomial(probas, num_samples=1).flatten() + + +class BDPIPolicy(BasePolicy): + """ + Policy class for BDPI, with one actor and several critics (the critics are DQN QNetworks). + + :param observation_space: Observation space + :param action_space: Action space + :param lr_schedule: Learning rate schedule (could be constant) + :param net_arch: The specification of the policy and value networks. + :param activation_fn: Activation function + :param features_extractor_class: Features extractor to use. + :param features_extractor_kwargs: Keyword arguments to pass to the features extractor. + :param normalize_images: Whether to normalize images or not, dividing by 255.0 (True by default) + :param optimizer_class: The optimizer to use, ``th.optim.Adam`` by default + :param optimizer_kwargs: Additional keyword arguments, excluding the learning rate, to pass to the optimizer + :param n_critics: Number of critic networks to create. 
+ :param share_features_extractor: Whether to share or not the features extractor + between the actor and the critics (this saves computation time) + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + lr_schedule: Schedule, + net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, + activation_fn: Type[nn.Module] = nn.ReLU, + features_extractor_class: Type[BaseFeaturesExtractor] = FlattenExtractor, + features_extractor_kwargs: Optional[Dict[str, Any]] = None, + normalize_images: bool = True, + optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[Dict[str, Any]] = None, + n_critics: int = 16, + share_features_extractor: bool = True, + ): + super(BDPIPolicy, self).__init__( + observation_space, + action_space, + features_extractor_class, + features_extractor_kwargs, + optimizer_class=optimizer_class, + optimizer_kwargs=optimizer_kwargs, + ) + + if net_arch is None: + if features_extractor_class == NatureCNN: + net_arch = [] + else: + net_arch = [256, 256] + + self.net_arch = net_arch + self.activation_fn = activation_fn + self.net_args = { + "observation_space": self.observation_space, + "action_space": self.action_space, + "net_arch": net_arch, + "activation_fn": self.activation_fn, + "normalize_images": normalize_images, + } + + self.actor = None + self.criticsA = nn.ModuleList() + self.criticsB = nn.ModuleList() + self.share_features_extractor = share_features_extractor + self.n_critics = n_critics + + self._build(lr_schedule) + + def _build(self, lr_schedule: Schedule) -> None: + # Make the actor + self.actor = self.make_actor() + self.actor.optimizer = self.optimizer_class(self.actor.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs) + + # Make the critics + for i in range(self.n_critics * 2): + if self.share_features_extractor: + critic = self.make_critic(features_extractor=self.actor.features_extractor) + else: + # Create a separate features extractor for the critic + # this requires more memory and computation + critic = self.make_critic(features_extractor=None) + + critic.optimizer = self.optimizer_class(critic.parameters(), lr=lr_schedule(1), **self.optimizer_kwargs) + + if i < self.n_critics: + self.criticsA.append(critic) + else: + self.criticsB.append(critic) + + def _get_constructor_parameters(self) -> Dict[str, Any]: + data = super()._get_constructor_parameters() + + data.update( + dict( + net_arch=self.net_arch, + activation_fn=self.net_args["activation_fn"], + n_critics=self.n_critics, + lr_schedule=self._dummy_schedule, # dummy lr schedule, not needed for loading policy alone + optimizer_class=self.optimizer_class, + optimizer_kwargs=self.optimizer_kwargs, + features_extractor_class=self.features_extractor_class, + features_extractor_kwargs=self.features_extractor_kwargs, + ) + ) + return data + + def make_actor(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> Actor: + actor_kwargs = self._update_features_extractor(self.net_args, features_extractor) + return Actor(**actor_kwargs).to(self.device) + + def make_critic(self, features_extractor: Optional[BaseFeaturesExtractor] = None) -> QNetwork: + critic_kwargs = self._update_features_extractor(self.net_args, features_extractor) + return QNetwork(**critic_kwargs).to(self.device) + + def forward(self, obs: th.Tensor, deterministic: bool = False) -> th.Tensor: + return self._predict(obs, deterministic=deterministic) + + def _predict(self, observation: th.Tensor, deterministic: bool = False) -> th.Tensor: 
+ return self.actor._predict(observation, deterministic) + + +MlpPolicy = BDPIPolicy + + +class CnnPolicy(BDPIPolicy): + """ + Policy class for BDPI, with one actor and several critics (the critics are DQN QNetworks). CNN version + + :param observation_space: Observation space + :param action_space: Action space + :param lr_schedule: Learning rate schedule (could be constant) + :param net_arch: The specification of the policy and value networks. + :param activation_fn: Activation function + :param features_extractor_class: Features extractor to use. + :param features_extractor_kwargs: Keyword arguments + to pass to the features extractor. + :param normalize_images: Whether to normalize images or not, + dividing by 255.0 (True by default) + :param optimizer_class: The optimizer to use, + ``th.optim.Adam`` by default + :param optimizer_kwargs: Additional keyword arguments, + excluding the learning rate, to pass to the optimizer + :param n_critics: Number of critic networks to create. + :param share_features_extractor: Whether to share or not the features extractor + between the actor and the critics (this saves computation time) + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + lr_schedule: Schedule, + net_arch: Optional[List[int]] = None, + activation_fn: Type[nn.Module] = nn.ReLU, + features_extractor_class: Type[BaseFeaturesExtractor] = NatureCNN, + features_extractor_kwargs: Optional[Dict[str, Any]] = None, + normalize_images: bool = True, + optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[Dict[str, Any]] = None, + n_critics: int = 16, + share_features_extractor: bool = True, + ): + super(CnnPolicy, self).__init__( + observation_space, + action_space, + lr_schedule, + net_arch, + activation_fn, + features_extractor_class, + features_extractor_kwargs, + normalize_images, + optimizer_class, + optimizer_kwargs, + n_critics, + share_features_extractor, + ) + + +class MultiInputPolicy(BDPIPolicy): + """ + Policy class for BDPI, with one actor and several critics (the critics are DQN QNetworks). Multi-input version + + :param observation_space: Observation space + :param action_space: Action space + :param lr_schedule: Learning rate schedule (could be constant) + :param net_arch: The specification of the policy and value networks. + :param activation_fn: Activation function + :param features_extractor_class: Features extractor to use. + :param features_extractor_kwargs: Keyword arguments + to pass to the features extractor. + :param normalize_images: Whether to normalize images or not, + dividing by 255.0 (True by default) + :param optimizer_class: The optimizer to use, + ``th.optim.Adam`` by default + :param optimizer_kwargs: Additional keyword arguments, + excluding the learning rate, to pass to the optimizer + :param n_critics: Number of critic networks to create. 
+ :param share_features_extractor: Whether to share or not the features extractor + between the actor and the critics (this saves computation time) + """ + + def __init__( + self, + observation_space: gym.spaces.Space, + action_space: gym.spaces.Space, + lr_schedule: Schedule, + net_arch: Optional[Union[List[int], Dict[str, List[int]]]] = None, + activation_fn: Type[nn.Module] = nn.ReLU, + features_extractor_class: Type[BaseFeaturesExtractor] = CombinedExtractor, + features_extractor_kwargs: Optional[Dict[str, Any]] = None, + normalize_images: bool = True, + optimizer_class: Type[th.optim.Optimizer] = th.optim.Adam, + optimizer_kwargs: Optional[Dict[str, Any]] = None, + n_critics: int = 16, + share_features_extractor: bool = True, + ): + super(MultiInputPolicy, self).__init__( + observation_space, + action_space, + lr_schedule, + net_arch, + activation_fn, + features_extractor_class, + features_extractor_kwargs, + normalize_images, + optimizer_class, + optimizer_kwargs, + n_critics, + share_features_extractor, + ) + + +register_policy("MlpPolicy", MlpPolicy) +register_policy("CnnPolicy", CnnPolicy) +register_policy("MultiInputPolicy", MultiInputPolicy) diff --git a/setup.cfg b/setup.cfg index 3ddbd033..72faf2a0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,6 +24,7 @@ per-file-ignores = ./sb3_contrib/__init__.py:F401 ./sb3_contrib/qrdqn/__init__.py:F401 ./sb3_contrib/tqc/__init__.py:F401 + ./sb3_contrib/bdpi/__init__.py:F401 ./sb3_contrib/common/wrappers/__init__.py:F401 exclude = # No need to traverse our git directory diff --git a/tests/test_cnn.py b/tests/test_cnn.py index 6c277856..58f40c13 100644 --- a/tests/test_cnn.py +++ b/tests/test_cnn.py @@ -8,10 +8,10 @@ from stable_baselines3.common.utils import zip_strict from stable_baselines3.common.vec_env import VecTransposeImage, is_vecenv_wrapped -from sb3_contrib import QRDQN, TQC +from sb3_contrib import BDPI, QRDQN, TQC -@pytest.mark.parametrize("model_class", [TQC, QRDQN]) +@pytest.mark.parametrize("model_class", [TQC, QRDQN, BDPI]) def test_cnn(tmp_path, model_class): SAVE_NAME = "cnn_model.zip" # Fake grayscale with frameskip @@ -24,7 +24,15 @@ def test_cnn(tmp_path, model_class): discrete=model_class not in {TQC}, ) kwargs = {} - if model_class in {TQC, QRDQN}: + + if model_class in [BDPI]: + kwargs = dict( + buffer_size=250, + policy_kwargs=dict( + features_extractor_kwargs=dict(features_dim=32), + ), + ) + elif model_class in [TQC, QRDQN]: # Avoid memory error when using replay buffer # Reduce the size of the features and the number of quantiles kwargs = dict( @@ -34,6 +42,7 @@ def test_cnn(tmp_path, model_class): features_extractor_kwargs=dict(features_dim=32), ), ) + model = model_class("CnnPolicy", env, **kwargs).learn(250) obs = env.reset() @@ -94,6 +103,7 @@ def test_feature_extractor_target_net(model_class, share_features_extractor): learning_starts=100, policy_kwargs=dict(n_quantiles=25, features_extractor_kwargs=dict(features_dim=32)), ) + if model_class != QRDQN: kwargs["policy_kwargs"]["share_features_extractor"] = share_features_extractor diff --git a/tests/test_dict_env.py b/tests/test_dict_env.py index fda27259..116667f2 100644 --- a/tests/test_dict_env.py +++ b/tests/test_dict_env.py @@ -6,7 +6,7 @@ from stable_baselines3.common.evaluation import evaluate_policy from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack, VecNormalize -from sb3_contrib import QRDQN, TQC +from sb3_contrib import BDPI, QRDQN, TQC class DummyDictEnv(gym.Env): @@ -78,13 +78,13 @@ def render(self, mode="human"): pass 
-@pytest.mark.parametrize("model_class", [QRDQN, TQC]) +@pytest.mark.parametrize("model_class", [QRDQN, TQC, BDPI]) def test_consistency(model_class): """ Make sure that dict obs with vector only vs using flatten obs is equivalent. This ensures notable that the network architectures are the same. """ - use_discrete_actions = model_class == QRDQN + use_discrete_actions = model_class in [QRDQN, BDPI] dict_env = DummyDictEnv(use_discrete_actions=use_discrete_actions, vec_only=True) dict_env = gym.wrappers.TimeLimit(dict_env, 100) env = gym.wrappers.FlattenObservation(dict_env) @@ -124,7 +124,7 @@ def test_consistency(model_class): assert np.allclose(action_1, action_2) -@pytest.mark.parametrize("model_class", [QRDQN, TQC]) +@pytest.mark.parametrize("model_class", [QRDQN, TQC, BDPI]) @pytest.mark.parametrize("channel_last", [False, True]) def test_dict_spaces(model_class, channel_last): """ @@ -138,9 +138,9 @@ def test_dict_spaces(model_class, channel_last): kwargs = {} n_steps = 256 - if model_class in {}: + if model_class in [BDPI]: kwargs = dict( - n_steps=128, + buffer_size=250, policy_kwargs=dict( net_arch=[32], features_extractor_kwargs=dict(cnn_output_dim=32), @@ -169,7 +169,7 @@ def test_dict_spaces(model_class, channel_last): evaluate_policy(model, env, n_eval_episodes=5, warn=False) -@pytest.mark.parametrize("model_class", [QRDQN, TQC]) +@pytest.mark.parametrize("model_class", [QRDQN, TQC, BDPI]) @pytest.mark.parametrize("channel_last", [False, True]) def test_dict_vec_framestack(model_class, channel_last): """ @@ -187,9 +187,9 @@ def test_dict_vec_framestack(model_class, channel_last): kwargs = {} n_steps = 256 - if model_class in {}: + if model_class in [BDPI]: kwargs = dict( - n_steps=128, + buffer_size=250, policy_kwargs=dict( net_arch=[32], features_extractor_kwargs=dict(cnn_output_dim=32), @@ -218,13 +218,14 @@ def test_dict_vec_framestack(model_class, channel_last): evaluate_policy(model, env, n_eval_episodes=5, warn=False) -@pytest.mark.parametrize("model_class", [QRDQN, TQC]) +@pytest.mark.parametrize("model_class", [QRDQN, TQC, BDPI]) def test_vec_normalize(model_class): """ Additional tests to check observation space support for GoalEnv and VecNormalize using MultiInputPolicy. 
""" - env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=4, continuous=not (model_class == QRDQN))]) + use_discrete_actions = model_class not in [TQC] + env = DummyVecEnv([lambda: BitFlippingEnv(n_bits=4, continuous=not use_discrete_actions)]) env = VecNormalize(env) kwargs = {} @@ -233,6 +234,7 @@ def test_vec_normalize(model_class): if model_class in {}: kwargs = dict( n_steps=128, + buffer_size=250, policy_kwargs=dict( net_arch=[32], ), diff --git a/tests/test_run.py b/tests/test_run.py index 195d0114..86268b64 100644 --- a/tests/test_run.py +++ b/tests/test_run.py @@ -1,6 +1,6 @@ import pytest -from sb3_contrib import QRDQN, TQC +from sb3_contrib import BDPI, QRDQN, TQC @pytest.mark.parametrize("ent_coef", ["auto", 0.01, "auto_0.01"]) @@ -56,3 +56,18 @@ def test_qrdqn(): create_eval_env=True, ) model.learn(total_timesteps=500, eval_freq=250) + + +@pytest.mark.parametrize("n_critics", [1, 3]) +def test_bdpi(n_critics): + model = BDPI( + "MlpPolicy", + "CartPole-v1", + policy_kwargs=dict(n_critics=n_critics, net_arch=[64, 64]), + learning_starts=0, + buffer_size=500, + learning_rate=3e-4, + verbose=1, + create_eval_env=True, + ) + model.learn(total_timesteps=500, eval_freq=250) diff --git a/tests/test_save_load.py b/tests/test_save_load.py index d2ee3a21..4eb6c9a4 100644 --- a/tests/test_save_load.py +++ b/tests/test_save_load.py @@ -12,16 +12,16 @@ from stable_baselines3.common.utils import get_device from stable_baselines3.common.vec_env import DummyVecEnv -from sb3_contrib import QRDQN, TQC +from sb3_contrib import BDPI, QRDQN, TQC -MODEL_LIST = [TQC, QRDQN] +MODEL_LIST = [TQC, QRDQN, BDPI] def select_env(model_class: BaseAlgorithm) -> gym.Env: """ - Selects an environment with the correct action space as QRDQN only supports discrete action space + Selects an environment with the correct action space as QRDQN and BDPI only support discrete action space """ - if model_class == QRDQN: + if model_class in [QRDQN, BDPI]: return IdentityEnv(10) else: return IdentityEnvBox(10) @@ -228,7 +228,7 @@ def test_exclude_include_saved_params(tmp_path, model_class): os.remove(tmp_path / "test_save.zip") -@pytest.mark.parametrize("model_class", [TQC, QRDQN]) +@pytest.mark.parametrize("model_class", [TQC, QRDQN, BDPI]) def test_save_load_replay_buffer(tmp_path, model_class): path = pathlib.Path(tmp_path / "logs/replay_buffer.pkl") path.parent.mkdir(exist_ok=True, parents=True) # to not raise a warning @@ -269,7 +269,7 @@ def test_save_load_policy(tmp_path, model_class, policy_str): if policy_str == "MlpPolicy": env = select_env(model_class) else: - if model_class in [TQC, QRDQN]: + if model_class in [TQC, QRDQN, BDPI]: # Avoid memory error when using replay buffer # Reduce the size of the features kwargs = dict( @@ -277,7 +277,7 @@ def test_save_load_policy(tmp_path, model_class, policy_str): learning_starts=100, policy_kwargs=dict(features_extractor_kwargs=dict(features_dim=32)), ) - env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2, discrete=model_class == QRDQN) + env = FakeImageEnv(screen_height=40, screen_width=40, n_channels=2, discrete=(model_class in [QRDQN, BDPI])) # Reduce number of quantiles for faster tests if model_class in [TQC, QRDQN]: @@ -368,7 +368,7 @@ def test_save_load_q_net(tmp_path, model_class, policy_str): if policy_str == "MlpPolicy": env = select_env(model_class) else: - if model_class in [QRDQN]: + if model_class in [QRDQN, BDPI]: # Avoid memory error when using replay buffer # Reduce the size of the features kwargs = dict(