
Commit 7d57148

Update vLLM to 0.11 (#350)
1 parent dbaeb73 commit 7d57148

14 files changed: +52 −37 lines changed


.github/workflows/docker/docker-compose.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 services:
   trinity-node-1:
-    image: trinity-rft-unittest:20250924
+    image: trinity-rft-unittest:20251030
     pull_policy: never
     command: sh -c "pip install -e .[dev] && ray start --head --dashboard-host 0.0.0.0 --include-dashboard true --block"
     environment:
@@ -29,7 +29,7 @@ services:
           capabilities: [gpu]

   trinity-node-2:
-    image: trinity-rft-unittest:20250924
+    image: trinity-rft-unittest:20251030
     pull_policy: never
     command: sh -c "pip install -e .[dev] && ray start --address=trinity-node-1:6379 --block"
     environment:

.github/workflows/unittest.yaml

Lines changed: 9 additions & 0 deletions
@@ -97,6 +97,15 @@ jobs:
           fi
         fi

+      - name: Convert report.json time to ms
+        working-directory: trinity-${{ github.run_id }}
+        if: env.tests_run == 'true' || failure()
+        run: |
+          REPORT=report.json
+          if [ -f "$REPORT" ]; then
+            jq '(.results.tests[] | .duration, .start, .stop) |= (. * 1000) | (.results.summary.start, .results.summary.stop) |= (. * 1000)' "$REPORT" > "$REPORT.tmp" && mv "$REPORT.tmp" "$REPORT"
+          fi
+
       - name: Clean checkpoint dir
         working-directory: trinity-${{ github.run_id }}/.github/workflows/docker
         if: always()
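For reference, a minimal Python sketch of the same conversion the jq filter performs (not part of the commit; it assumes the report.json layout used by the filter above, i.e. results.tests[].duration/start/stop and results.summary.start/stop expressed in seconds):

import json

def convert_report_to_ms(path: str = "report.json") -> None:
    # Scale second-based timing fields to milliseconds, mirroring the jq step above.
    with open(path) as f:
        report = json.load(f)
    for test in report["results"]["tests"]:
        for key in ("duration", "start", "stop"):
            test[key] *= 1000
    for key in ("start", "stop"):
        report["results"]["summary"][key] *= 1000
    with open(path, "w") as f:
        json.dump(report, f)

if __name__ == "__main__":
    convert_report_to_ms()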

benchmark/config/countdown-template.yaml

Lines changed: 0 additions & 1 deletion
@@ -54,7 +54,6 @@ explorer:
   rollout_model:
     engine_num: 2
     tensor_parallel_size: 1
-    use_v1: true
     enforce_eager: true
     enable_prefix_caching: false
     enable_chunked_prefill: false

benchmark/config/gsm8k-template.yaml

Lines changed: 0 additions & 1 deletion
@@ -59,7 +59,6 @@ explorer:
   rollout_model:
     engine_num: 2
     tensor_parallel_size: 1
-    use_v1: true
     enforce_eager: false
     enable_prefix_caching: false
     enable_chunked_prefill: false

docs/sphinx_doc/source/conf.py

Lines changed: 15 additions & 2 deletions
@@ -5,6 +5,8 @@

 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+import subprocess
+
 from trinity import __version__ as version

 project = "Trinity-RFT"
@@ -58,11 +60,22 @@
 apidoc_excluded_paths = ["tests", "build"]
 apidoc_separate_modules = True

+
 # Multiversion configs
-smv_tag_whitelist = r"^v\d+\.\d+\.\d+$"  # match v1.0.0 pattern
+def get_recent_tags(n: int) -> list:
+    """Retrieve the most recent n git tags."""
+    try:
+        tags = subprocess.check_output(
+            ["git", "tag", "--sort=-creatordate"], universal_newlines=True
+        ).splitlines()
+        return tags[:n]
+    except subprocess.CalledProcessError:
+        return []
+
+
+smv_tag_whitelist = r"^(" + "|".join(get_recent_tags(4)) + r")$"
 smv_branch_whitelist = r"^(main)$"  # included branches
 smv_remote_whitelist = None
-smv_released_pattern = r"^tags/.*$"

 smv_prefer_remote_refs = False
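To illustrate the new whitelist (the tag names below are hypothetical, not taken from the repo): sphinx-multiversion matches tag names against smv_tag_whitelist, so joining the four most recent tags yields an exact-match alternation in place of the old catch-all v\d+\.\d+\.\d+ pattern:

import re

recent_tags = ["v0.3.1", "v0.3.0", "v0.2.1", "v0.2.0"]  # stand-in for get_recent_tags(4)
smv_tag_whitelist = r"^(" + "|".join(recent_tags) + r")$"  # e.g. ^(v0.3.1|v0.3.0|v0.2.1|v0.2.0)$

assert re.match(smv_tag_whitelist, "v0.3.1")          # recent tag: included in the build
assert re.match(smv_tag_whitelist, "v0.1.0") is None  # older tag: skipped

Plain vX.Y.Z tag names contain only the dot as a regex metacharacter, which at worst matches slightly too loosely; re.escape would only become necessary for more exotic tag names.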

pyproject.toml

Lines changed: 2 additions & 2 deletions
@@ -23,7 +23,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "verl==0.5.0",
     "ray[default]>=2.48.0",
-    "vllm>=0.9.1,<=0.10.2",
+    "vllm>=0.9.1,<=0.11.0",
     "tensordict",
     "wandb",
     "omegaconf",
@@ -73,7 +73,7 @@ dev = [
 ]
 megatron = [
     "megatron-core[mlm]==0.13.1",
-    "transformer_engine[pytorch]==2.6.0.post1",
+    "transformer_engine[pytorch]==2.8.0",
     "mbridge>=0.13.0",
 ]
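A quick way to check a local environment against the widened pin (a hypothetical snippet, not part of the commit; it assumes the packaging library is available, which ships with most pip-based setups):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet
from packaging.version import Version

spec = SpecifierSet(">=0.9.1,<=0.11.0")  # mirrors the new vllm pin in pyproject.toml
installed = Version(version("vllm"))
print(f"vllm {installed} satisfies '{spec}': {installed in spec}")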

scripts/docker/Dockerfile

Lines changed: 4 additions & 4 deletions
@@ -5,7 +5,7 @@
 # docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft:latest


-FROM nvcr.io/nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04

 WORKDIR /workspace

@@ -20,13 +20,13 @@ RUN apt update && apt install -y \


 # For Aliyun users: update pip mirror to aliyun to speed up pip install
-RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
+#     && pip config set install.trusted-host mirrors.cloud.aliyuncs.com

 # copy the Trinity-RFT dir into the workspace
 COPY . .

-RUN pip install --upgrade pip && pip install -e .[dev] && pip install flash-attn
+RUN pip install --upgrade pip && pip install -e .[dev] && pip install flash_attn==2.8.1 --no-build-isolation

 # Set Env variables

scripts/docker_for_megatron/Dockerfile

Lines changed: 9 additions & 8 deletions
@@ -5,13 +5,10 @@
 # docker run -it --gpus all --shm-size="64g" --rm -v $PWD:/workspace -v <root_path_of_data_and_checkpoints>:/data trinity-rft-megatron:latest


-FROM nvcr.io/nvidia/cuda:12.6.3-cudnn-devel-ubuntu22.04
+FROM nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04

 WORKDIR /workspace

-# copy the Trinity-RFT dir into the workspace
-COPY . .
-
 RUN apt update && apt install -y \
     build-essential \
     curl git wget vim tmux net-tools \
@@ -22,17 +19,21 @@ RUN apt update && apt install -y \
     && ln -sf /usr/bin/pip3 /usr/bin/pip

 # For Aliyun users: update pip mirror to aliyun to speed up pip install
-RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
-    && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+# RUN pip config set global.index-url http://mirrors.cloud.aliyuncs.com/pypi/simple/ \
+#     && pip config set install.trusted-host mirrors.cloud.aliyuncs.com
+
+# copy the Trinity-RFT dir into the workspace
+COPY . .

 # Install Trinity-RFT with Megatron
 RUN pip install --upgrade pip \
+    && pip install -e .[dev] \
+    && pip install flash_attn==2.8.1 --no-build-isolation \
     && pip install -e .[megatron] \
-    && pip install flash-attn==2.8.1 \
     && pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
     --config-settings "--build-option=--cpp_ext" \
     --config-settings "--build-option=--cuda_ext" \
-    --resume-retries 999 git+https://github.com/NVIDIA/apex.git
+    --resume-retries 20 git+https://github.com/NVIDIA/apex.git

 # Set Env variables

tests/common/vllm_test.py

Lines changed: 0 additions & 3 deletions
@@ -276,7 +276,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True

@@ -368,7 +367,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True

@@ -578,7 +576,6 @@ def setUp(self):
         self.config.explorer.rollout_model.engine_type = "vllm"
         self.config.explorer.rollout_model.engine_num = 1
         self.config.explorer.rollout_model.tensor_parallel_size = 1
-        self.config.explorer.rollout_model.use_v1 = True
         self.config.explorer.rollout_model.chat_template = CHAT_TEMPLATE
         self.config.explorer.rollout_model.enable_openai_api = True
         # added for toolcalls

tests/template/config.yaml

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@ explorer:
     enforce_eager: true
     dtype: bfloat16
     seed: 42
-    use_v1: true
 trainer:
   trainer_type: verl
   save_interval: 100
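The use_v1 flag is removed across the templates, tests, and configs above, presumably because it is no longer needed after the vLLM 0.11 upgrade. A hypothetical migration helper (not from the repo) that tolerates older YAML configs still carrying the flag might look like this, assuming PyYAML and the explorer.rollout_model layout shown in the diffs:

import yaml

def load_explorer_config(path: str) -> dict:
    """Load a config file and drop the obsolete rollout_model.use_v1 flag if present."""
    with open(path) as f:
        cfg = yaml.safe_load(f)
    cfg.get("explorer", {}).get("rollout_model", {}).pop("use_v1", None)
    return cfg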
