
Commit 9704198 ("Fixes")

1 parent 4e7cc85 commit 9704198

File tree

5 files changed: +25 −40 lines


docs/source/openenv.md

Lines changed: 7 additions & 28 deletions
````diff
@@ -316,41 +316,16 @@ That's it! Let's unpack how the main pieces fit together:
 
 You can run the example in either colocate mode (1 GPU) or server mode (2 GPUs):
 
-<hfoptions id="vllm_mode">
-
-<hfoption id="colocate">
-
-**Colocate mode (1 GPU, recommended)**
-
 ```bash
-python examples/scripts/openenv/echo.py --vllm-mode colocate
+python examples/scripts/openenv/echo.py
 ```
 
-This runs vLLM in the same process as training, requiring only a single GPU.
-
-</hfoption>
-
-<hfoption id="server">
-
-**Server mode (2+ GPUs, scalable)**
+You can customize the model and environment URL:
 
 ```bash
-# Terminal 1: Start vLLM inference server
-CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen3-0.6B --host 0.0.0.0 --port 8000
-
-# Terminal 2: Run GRPO training with OpenEnv
-CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py --vllm-mode server --vllm-server-url http://localhost:8000
+python examples/scripts/openenv/echo.py --model Qwen/Qwen3-0.6B --env-host https://openenv-echo-env.hf.space
 ```
 
-This runs vLLM as a separate server process, useful when you want to:
-- Share the inference server across multiple training jobs
-- Use multiple GPUs for the vLLM server (via `--tensor-parallel-size`)
-- Scale up training to many GPUs while sharing a single inference endpoint
-
-</hfoption>
-
-</hfoptions>
-
 Below is the reward curve from training:
 
 <iframe src="https://trl-lib-trackio.hf.space?project=openenv&metrics=train/rewards/reward_from_env/mean&runs=qgallouedec-1761202871&sidebar=hidden&navbar=hidden" style="width:600px; height:500px; border:0;"></iframe>
````
```diff
@@ -512,6 +487,9 @@ The resulting model improves its performance on the game, both by reducing the n
 
 <iframe src="https://burtenshaw-wordle-grpo.hf.space?project=group-Qwen-Qwen3-17B&metrics=reward&runs=run-2025-10-26_09-39-49,run-2025-10-26_08-04-49&sidebar=hidden&navbar=hidden" style="width:1600px; height:500px; border:0;"></iframe>
 
+> [!NOTE]
+> With `enable_thinking=False` (the default in these examples), small models like Qwen3-1.7B can learn to improve their guesses but should not be expected to consistently solve the game. For significantly better results, use larger models or enable thinking mode (`enable_thinking=True`), which allows the model to reason before making a guess at the cost of longer completions.
+
 We experimented with larger models like `gpt-oss-20b` and found that the model was able to consistently win the game. However, this requires a lot of compute to train the model. Why not try this out yourself?
 
 ## Multi-Environment Training
```
```diff
@@ -587,6 +565,7 @@ Key patterns:
 - **Lazy client initialization**: Create clients in `reset()`, not `__init__()`, to avoid unnecessary WebSocket connections.
 - **Close before reopen**: Close the previous client before creating a new one to avoid server capacity errors.
 - **`kwargs` routing**: The `"env"` column from the dataset is passed to `reset()` as a keyword argument.
+- **All tools are exposed simultaneously**: The model sees `guess`, `move`, and `stay` as available tools regardless of the active environment. If it calls the wrong tool (e.g., `move` during Wordle), the method raises a `ValueError` that the trainer catches gracefully. In practice, models learn to use the correct tools based on the system prompt.
 
 ### Per-environment reward functions
```
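The patterns listed in this hunk can be sketched as a minimal, hypothetical multi-environment wrapper. Everything here is illustrative, not TRL's actual API: `_FakeClient` stands in for a real environment client, and the class and method names are invented for the sketch.

```python
class _FakeClient:
    """Stand-in for a real environment client (illustrative only)."""

    def __init__(self, env: str):
        self.env = env
        self.closed = False

    def close(self):
        self.closed = True

    def step(self, action: str) -> str:
        return f"{self.env} observed {action!r}"


class MultiEnv:
    def __init__(self):
        # Lazy initialization: no client is created here, so constructing
        # the wrapper never opens a connection.
        self._client = None
        self._active = None

    def reset(self, env: str = "wordle", **kwargs):
        # The "env" dataset column is routed here as a keyword argument.
        # Close the previous client before opening a new one so the server
        # does not run out of capacity.
        if self._client is not None:
            self._client.close()
        self._client = _FakeClient(env)
        self._active = env

    def guess(self, word: str) -> str:
        # All tools stay exposed at all times; calling one that does not
        # match the active environment raises a ValueError that the trainer
        # can catch gracefully.
        if self._active != "wordle":
            raise ValueError(f"guess() is not available in {self._active}")
        return self._client.step(word)
```

Calling `reset(env="catch")` after a Wordle episode closes the old client first, and a subsequent `guess(...)` raises rather than hitting the wrong environment.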

examples/scripts/openenv/multi_env.py

Lines changed: 8 additions & 7 deletions
```diff
@@ -237,13 +237,14 @@ def main() -> None:
     CATCH_URL = args.catch_url
 
     n = 500  # samples per environment
-    dataset = Dataset.from_dict({
-        "prompt": (
-            [[{"role": "user", "content": wordle_prompt}]] * n
-            + [[{"role": "user", "content": catch_prompt}]] * n
-        ),
-        "env": ["wordle"] * n + ["catch"] * n,
-    })
+    dataset = Dataset.from_dict(
+        {
+            "prompt": (
+                [[{"role": "user", "content": wordle_prompt}]] * n + [[{"role": "user", "content": catch_prompt}]] * n
+            ),
+            "env": ["wordle"] * n + ["catch"] * n,
+        }
+    )
 
     trainer = GRPOTrainer(
         model="Qwen/Qwen3-1.7B",
```
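The change above is purely cosmetic (black-style) reformatting; the structure being built — `n` Wordle prompts followed by `n` Catch prompts with a parallel `env` column — can be sketched with plain Python lists, using placeholder prompts and a small `n` instead of the script's 500:

```python
n = 2  # the script uses n = 500 per environment
wordle_prompt = "Play Wordle."  # placeholder prompts for illustration
catch_prompt = "Play Catch."

data = {
    # One single-turn chat per sample: the first n are Wordle, the next n Catch.
    "prompt": [[{"role": "user", "content": wordle_prompt}]] * n
    + [[{"role": "user", "content": catch_prompt}]] * n,
    # Parallel column telling the environment wrapper which game reset() should start.
    "env": ["wordle"] * n + ["catch"] * n,
}
```

The two columns stay index-aligned, which is what lets the trainer route row `i`'s `env` value into `reset()` for row `i`'s prompt.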

examples/scripts/openenv/sudoku.py

Lines changed: 4 additions & 2 deletions
```diff
@@ -170,7 +170,9 @@ def parse_args() -> argparse.Namespace:
     )
 
     # LoRA / PEFT
-    parser.add_argument("--use-lora", action="store_true", default=False, help="Use LoRA for memory-efficient training")
+    parser.add_argument(
+        "--use-lora", action="store_true", default=False, help="Use LoRA for memory-efficient training"
+    )
     parser.add_argument("--lora-r", type=int, default=16, help="LoRA rank")
     parser.add_argument("--lora-alpha", type=int, default=32, help="LoRA alpha")
 
@@ -499,7 +501,7 @@ def place(self, row: int, col: int, number: int) -> str:
 
         # Only check the NEW content for feedback (messages are cumulative)
         full_content = observation.messages[0].content if observation.messages else ""
-        new_content = full_content[len(self._last_full_content):]
+        new_content = full_content[len(self._last_full_content) :]
         self._last_full_content = full_content
 
         new_content_lower = new_content.lower()
```
trl/generation/vllm_generation.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -523,6 +523,7 @@ def generate(
         images: list[list | None] | None,
         num_generations: int,
         profiler: ProfilingContext | None = None,
+        tools: list | None = None,
     ) -> tuple:
         """Generate completions using vLLM.
 
```

trl/trainer/grpo_trainer.py

Lines changed: 5 additions & 3 deletions
```diff
@@ -15,9 +15,9 @@
 import asyncio
 import atexit
 import copy
-import math
 import importlib.resources as pkg_resources
 import inspect
+import math
 import os
 import sys
 import textwrap
@@ -1249,6 +1249,7 @@ def _generate_single_turn(self, prompt_ids, images, multimodal_fields):
             images=images,
             num_generations=num_generations,
             profiler=profiling_context(self, "vLLM.generate"),
+            tools=self.tools,
         )
         # vLLM returns per-token top-k logprobs; keep only the top-1 (sampled token) logprob
         logprobs = [[lp[0] for lp in seq] for seq in logprobs]
@@ -1504,8 +1505,9 @@ async def _run_async_tools(async_coros):
         for idx, tool_call in zip(idxs_with_tool, tool_calls, strict=True):
             if not tool_call:
                 continue
-            # If the environment has a _done attribute and it's True, stop calling tools for it
-            if hasattr(self.environments[idx], "_done") and self.environments[idx]._done:
+            # If the environment signals it's done, stop calling tools for it
+            env = self.environments[idx]
+            if getattr(env, "_done", False) or getattr(env, "done", False):
                 continue
             filtered_idxs.append(idx)
             filtered_tool_calls.append(tool_call)
```
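The last hunk swaps a `hasattr`-then-access check for `getattr` with a default, which also recognizes environments that expose `done` instead of `_done` and tolerates environments with neither. A standalone sketch with dummy environment classes (names invented for illustration):

```python
class LegacyEnv:
    _done = True  # the convention the old hasattr-based code checked


class NewEnv:
    done = True  # alternate attribute the new check also recognizes


class RunningEnv:
    pass  # no flag at all: getattr's default of False keeps it running


def is_done(env) -> bool:
    # getattr with a default covers both spellings and missing attributes
    # in one expression, instead of pairing hasattr with a second lookup.
    return getattr(env, "_done", False) or getattr(env, "done", False)
```

The old code would have skipped `NewEnv` entirely (no `_done` attribute), so its tool calls would keep firing after the episode ended; the `getattr` form treats both flags uniformly.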
