Skip to content

Commit 1eccb49

Browse files
Ibinarriaga8jorge.ibinarriaga.robles.becasvmoens
authored
[Algorithm] SOTA discrete offline CQL (#3098)
Co-authored-by: jorge.ibinarriaga.robles.becas <[email protected]> Co-authored-by: vmoens <[email protected]>
1 parent 009f4ce commit 1eccb49

File tree

9 files changed

+344
-7
lines changed

9 files changed

+344
-7
lines changed

.github/unittest/linux_sota/scripts/environment.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,5 @@ dependencies:
2929
- coverage
3030
- vmas
3131
- transformers
32+
- minari
33+
- minari[create]

.github/unittest/linux_sota/scripts/test_sota.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,14 @@
105105
collector.env_per_collector=2 \
106106
replay_buffer.size=120 \
107107
logger.backend=
108+
""",
109+
"discrete_cql_offline": """python sota-implementations/cql/discrete_cql_offline.py \
110+
collector.total_frames=48 \
111+
collector.init_random_frames=10 \
112+
collector.frames_per_batch=16 \
113+
collector.env_per_collector=2 \
114+
replay_buffer.batch_size=10 \
115+
logger.backend=
108116
""",
109117
"redq": """python sota-implementations/redq/redq.py \
110118
num_workers=4 \

sota-check/run_discrete_cql.sh

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/bin/bash

#SBATCH --job-name=cql_discrete_offline
#SBATCH --ntasks=32
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:1
#SBATCH --output=slurm_logs/cql_discrete_offline_%j.txt
#SBATCH --error=slurm_errors/cql_discrete_offline_%j.txt

# Tag the run with the current commit so the logged runs can be traced back.
current_commit=$(git rev-parse --short HEAD)
project_name="torchrl-example-check-$current_commit"
group_name="cql_discrete_offline"

# Make the repository root importable for the sota-implementations scripts.
export PYTHONPATH=$(dirname $(dirname $PWD))
python $PYTHONPATH/sota-implementations/cql/discrete_cql_offline.py \
  logger.backend=wandb \
  logger.project_name="$project_name" \
  logger.group_name="$group_name"

# Capture the exit status of the Python command
exit_status=$?
# Write the exit status to a file
if [ $exit_status -eq 0 ]; then
  # FIX: ">>>" is a bash syntax error; ">>" is the append redirection
  # (consistent with the else branch below).
  echo "${group_name}_${SLURM_JOB_ID}=success" >> report.log
else
  echo "${group_name}_${SLURM_JOB_ID}=error" >> report.log
fi
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
"""CQL Example.
7+
8+
This is a self-contained example of a discrete offline CQL training script.
9+
10+
The helper functions are coded in the utils.py associated with this script.
11+
"""
12+
from __future__ import annotations
13+
14+
import warnings
15+
16+
import hydra
17+
import numpy as np
18+
import torch
19+
import tqdm
20+
from tensordict.nn import CudaGraphModule
21+
from torchrl._utils import timeit
22+
from torchrl.envs.utils import ExplorationType, set_exploration_type
23+
from torchrl.record.loggers import generate_exp_name, get_logger
24+
from utils import (
25+
dump_video,
26+
log_metrics,
27+
make_discrete_cql_optimizer,
28+
make_discrete_loss,
29+
make_discretecql_model,
30+
make_environment,
31+
make_offline_discrete_replay_buffer,
32+
)
33+
34+
torch.set_float32_matmul_precision("high")
35+
36+
37+
@hydra.main(version_base="1.1", config_path="", config_name="discrete_offline_config")
def main(cfg):  # noqa: F821
    """Train a discrete CQL agent offline on a Minari-backed replay buffer.

    The script builds the dataset, model, loss and optimizer via the helpers
    in ``utils.py``, runs ``cfg.optim.gradient_steps`` gradient updates and
    periodically evaluates the greedy policy on ``eval_env``.
    """
    # Resolve the device: fall back to the first CUDA device when available.
    device = cfg.optim.device
    if device in ("", None):
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

    # Create logger (only when a backend is configured).
    exp_name = generate_exp_name("DiscreteCQL", cfg.logger.exp_name)
    logger = None
    if cfg.logger.backend:
        logger = get_logger(
            logger_type=cfg.logger.backend,
            logger_name="discretecql_logging",
            experiment_name=exp_name,
            wandb_kwargs={
                "mode": cfg.logger.mode,
                "config": dict(cfg),
                "project": cfg.logger.project_name,
                "group": cfg.logger.group_name,
            },
        )

    # Set seeds. The env-level seed is deprecated but still honored.
    torch.manual_seed(cfg.env.seed)
    np.random.seed(cfg.env.seed)
    if cfg.env.seed is not None:
        warnings.warn(
            "The seed in the environment config is deprecated. "
            "Please set the seed in the optim config instead."
        )

    # Create replay buffer (generates a random-policy Minari dataset on the fly).
    replay_buffer = make_offline_discrete_replay_buffer(cfg.replay_buffer)

    # Create env. The train env is only needed to build the model specs.
    train_env, eval_env = make_environment(
        cfg, train_num_envs=1, eval_num_envs=cfg.logger.eval_envs, logger=logger
    )

    # Create agent
    model, explore_policy = make_discretecql_model(cfg, train_env, eval_env, device)

    # The train env served its purpose (spec extraction) — release it.
    del train_env

    # Create loss
    loss_module, target_net_updater = make_discrete_loss(cfg.loss, model, device)

    # Create optimizers
    optimizer = make_discrete_cql_optimizer(cfg, loss_module)  # optimizer for CQL loss

    def update(data):
        """One gradient step: Q-learning loss + CQL regularizer, then soft target update."""
        loss_vals = loss_module(data)

        q_loss = loss_vals["loss_qvalue"]
        cql_loss = loss_vals["loss_cql"]

        # Total loss = Q-learning loss + CQL regularization
        loss = q_loss + cql_loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

        # Soft update of target Q-network
        target_net_updater.step()

        # Detach to avoid keeping computation graph in logging
        return loss.detach(), loss_vals.detach()

    # Optional torch.compile / CUDA-graph acceleration of the update step.
    compile_mode = None
    if cfg.compile.compile:
        if cfg.compile.compile_mode not in (None, ""):
            compile_mode = cfg.compile.compile_mode
        elif cfg.compile.cudagraphs:
            compile_mode = "default"
        else:
            compile_mode = "reduce-overhead"
        update = torch.compile(update, mode=compile_mode)
    if cfg.compile.cudagraphs:
        # NOTE(review): this warning message is in Spanish — it should be
        # translated to English for consistency with the rest of the codebase.
        warnings.warn(
            "CudaGraphModule es experimental y puede llevar a resultados incorrectos silenciosamente. Úsalo con precaución.",
            category=UserWarning,
        )
        update = CudaGraphModule(update, warmup=50)

    pbar = tqdm.tqdm(total=cfg.optim.gradient_steps)

    gradient_steps = cfg.optim.gradient_steps
    evaluation_interval = cfg.logger.eval_iter
    eval_steps = cfg.logger.eval_steps
    # NOTE: cfg.optim.policy_eval_start is not used in the discrete offline
    # setting (it is an actor-evaluation knob for continuous CQL); the dead
    # read-and-convert-to-tensor of it has been removed.

    # Training loop
    for i in range(gradient_steps):
        timeit.printevery(1000, gradient_steps, erase=True)
        pbar.update(1)
        # sample data
        with timeit("sample"):
            data = replay_buffer.sample()

        with timeit("update"):
            torch.compiler.cudagraph_mark_step_begin()
            loss, loss_vals = update(data.to(device))

        # log metrics
        metrics_to_log = {
            "loss": loss.cpu(),
            **loss_vals.cpu(),
        }

        # evaluation
        with timeit("log/eval"):
            if i % evaluation_interval == 0:
                with set_exploration_type(
                    ExplorationType.DETERMINISTIC
                ), torch.no_grad():
                    eval_td = eval_env.rollout(
                        max_steps=eval_steps,
                        policy=explore_policy,
                        auto_cast_to_device=True,
                    )
                    eval_env.apply(dump_video)

                # eval_td: batch of shape [num_episodes, max_steps, ...];
                # mean over episodes of the per-episode reward sum.
                eval_reward = eval_td["next", "reward"].sum(1).mean().item()
                metrics_to_log["evaluation_reward"] = eval_reward

        with timeit("log"):
            metrics_to_log.update(timeit.todict(prefix="time"))
            metrics_to_log["time/speed"] = pbar.format_dict["rate"]
            log_metrics(logger, metrics_to_log, i)

    pbar.close()
    if not eval_env.is_closed:
        eval_env.close()


if __name__ == "__main__":
    main()

sota-implementations/cql/discrete_cql_online.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
torch.set_float32_matmul_precision("high")
3737

3838

39-
@hydra.main(version_base="1.1", config_path="", config_name="discrete_cql_config")
39+
@hydra.main(version_base="1.1", config_path="", config_name="discrete_online_config")
4040
def main(cfg: DictConfig): # noqa: F821
4141
device = cfg.optim.device
4242
if device in ("", None):
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# env and task
2+
env:
3+
name: CartPole-v1 # CartPole environment for discrete action space
4+
task: ""
5+
library: minari
6+
n_samples_stats: 1000
7+
seed: 0
8+
backend: gymnasium
9+
10+
# Collector
11+
collector:
12+
frames_per_batch: 200
13+
total_frames: 1_000_000
14+
multi_step: 0
15+
init_random_frames: 1000
16+
env_per_collector: 1
17+
device:
18+
max_frames_per_traj: 200
19+
annealing_frames: 10000
20+
eps_start: 1.0
21+
eps_end: 0.01
22+
23+
24+
# logger
25+
logger:
26+
backend: wandb
27+
project_name: torchrl_example_cql
28+
group_name: null
29+
exp_name: cql_${replay_buffer.dataset}
30+
eval_iter: 5000 # eval interval in gradient steps
31+
eval_steps: 1000 # evaluation steps per eval
32+
mode: online
33+
eval_envs: 5 # number of evaluation environments
34+
video: True
35+
36+
# replay buffer
37+
replay_buffer:
38+
env: CartPole-v1
39+
  dataset: CartPole-v1-random-v1  # script-generated dataset id; "v1" matches the CartPole-v1 env above (there is no CartPole-v2)
40+
batch_size: 128
41+
episodes: 10000
42+
43+
# optimization
44+
optim:
45+
device: null
46+
lr: 3e-4 # learning rate
47+
weight_decay: 0.0
48+
gradient_steps: 100_000
49+
policy_eval_start: 40_000
50+
51+
# model
52+
model:
53+
hidden_sizes: [256, 256]
54+
activation: relu
55+
56+
# loss
57+
loss:
58+
loss_function: l2
59+
gamma: 0.99
60+
tau: 0.005
61+
action_space: categorical
62+
63+
compile:
64+
compile: False
65+
compile_mode:
66+
cudagraphs: False

sota-implementations/cql/utils.py

Lines changed: 58 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,49 @@ def make_offline_replay_buffer(rb_cfg):
195195
return data
196196

197197

198+
def make_offline_discrete_replay_buffer(rb_cfg):
    """Build an offline replay buffer from a freshly generated Minari dataset.

    A random policy is rolled out in ``rb_cfg.env`` for ``rb_cfg.episodes``
    episodes; the transitions are recorded into a local Minari dataset named
    ``rb_cfg.dataset``, loaded into a :class:`MinariExperienceReplay`, and the
    on-disk dataset is deleted afterwards.

    Args:
        rb_cfg: replay-buffer config with ``env``, ``dataset``, ``episodes``
            and ``batch_size`` entries.

    Returns:
        A replay buffer yielding float-converted transition batches.
    """
    import gymnasium as gym
    import minari
    from minari import DataCollector

    # Create custom minari dataset from environment
    env = gym.make(rb_cfg.env)
    env = DataCollector(env)

    for episode in range(rb_cfg.episodes):
        # FIX: vary the seed per episode. A constant seed would reset the
        # environment to the identical initial state every episode, hurting
        # the diversity of the generated offline dataset.
        env.reset(seed=123 + episode)
        while True:
            action = env.action_space.sample()
            _, _, terminated, truncated, _ = env.step(action)
            if terminated or truncated:
                break

    env.create_dataset(
        dataset_id=rb_cfg.dataset,
        algorithm_name="Random-Policy",
        code_permalink="https://github.com/Farama-Foundation/Minari",
        author="Farama",
        author_email="contact@farama.org",
    )

    data = MinariExperienceReplay(
        dataset_id=rb_cfg.dataset,
        split_trajs=False,
        batch_size=rb_cfg.batch_size,
        load_from_local_minari=True,
        sampler=SamplerWithoutReplacement(drop_last=True),
        prefetch=4,
    )

    # Observations come out as float64 from gymnasium; cast for the model.
    data.append_transform(DoubleToFloat())

    # Clean up the on-disk dataset (assumes the replay buffer has already
    # materialized the data into its own storage — TODO confirm for the
    # installed minari/torchrl versions).
    minari.delete_dataset(rb_cfg.dataset)

    return data
239+
240+
198241
# ====================================================================
199242
# Model
200243
# -----
@@ -354,11 +397,21 @@ def make_continuous_loss(loss_cfg, model, device: torch.device | None = None):
354397

355398

356399
def make_discrete_loss(loss_cfg, model, device: torch.device | None = None):
357-
loss_module = DiscreteCQLLoss(
358-
model,
359-
loss_function=loss_cfg.loss_function,
360-
delay_value=True,
361-
)
400+
401+
if "action_space" in loss_cfg: # especify action space
402+
loss_module = DiscreteCQLLoss(
403+
model,
404+
loss_function=loss_cfg.loss_function,
405+
action_space=loss_cfg.action_space,
406+
delay_value=True,
407+
)
408+
else:
409+
loss_module = DiscreteCQLLoss(
410+
model,
411+
loss_function=loss_cfg.loss_function,
412+
delay_value=True,
413+
)
414+
362415
loss_module.make_value_estimator(gamma=loss_cfg.gamma, device=device)
363416
target_net_updater = SoftUpdate(loss_module, tau=loss_cfg.tau)
364417

test/llm/test_wrapper.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2386,7 +2386,7 @@ def test_batching_continuous_throughput(
23862386
assert len(processing_events) > 0, "No processing occurred"
23872387

23882388
# Check that processing happened across multiple threads (indicating concurrent processing)
2389-
thread_ids = set(event["thread_id"] for event in processing_events)
2389+
thread_ids = {event["thread_id"] for event in processing_events} # noqa
23902390
assert (
23912391
len(thread_ids) > 1
23922392
), f"All processing happened in single thread: {thread_ids}"

0 commit comments

Comments
 (0)