Commit 9389dfb

chore: upgrade vllm to v0.10.0 (#766)
Signed-off-by: Yuki Huang <yukih@nvidia.com>
1 parent 5604024 commit 9389dfb

5 files changed: +386 -298 lines changed

docs/guides/use-custom-vllm.md

Lines changed: 2 additions & 2 deletions
@@ -28,8 +28,8 @@ dependencies = [
 
 [project.optional-dependencies]
 vllm = [
-    #"vllm==0.9.0", # <-- BEFORE
-    "vllm", # <-- AFTER
+    #"vllm==0.10.0", # <-- BEFORE
+    "vllm", # <-- AFTER
 ]
 
 # ...<OMITTED>
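
After switching to an unpinned `vllm`, it is worth confirming at runtime that the custom build is the one Python actually imports. A minimal sketch, not part of the guide, assuming the custom wheel is already installed in the active environment:

import vllm

# The custom build should report its own version string and resolve to a path
# inside your environment rather than a previously cached install.
print(f"vllm version: {vllm.__version__}")
print(f"installed at: {vllm.__file__}")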

nemo_rl/models/generation/vllm.py

Lines changed: 4 additions & 1 deletion
@@ -325,7 +325,9 @@ def _patch_vllm_init_workers_ray():
         llm_kwargs = dict(
             model=self.model_name,
             load_format=load_format,
-            skip_tokenizer_init=self.cfg["vllm_cfg"]["skip_tokenizer_init"],
+            # vllm==0.10.0 breaks skip_tokenizer_init=True.
+            # This will be reverted to `self.cfg["vllm_cfg"]["skip_tokenizer_init"]` once https://github.com/NVIDIA-NeMo/RL/issues/818 is resolved.
+            skip_tokenizer_init=False,
             tensor_parallel_size=self.tensor_parallel_size,
             pipeline_parallel_size=self.pipeline_parallel_size,
             gpu_memory_utilization=self.gpu_memory_utilization,
@@ -338,6 +340,7 @@ def _patch_vllm_init_workers_ray():
             worker_extension_cls="nemo_rl.models.generation.vllm_backend.VllmInternalWorkerExtension",
             enable_sleep_mode=True,
             disable_log_stats=True,
+            logprobs_mode="raw_logprobs",
             **vllm_kwargs,
         )
 
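
The hard-coded `skip_tokenizer_init=False` is a stop-gap for the breakage tracked in https://github.com/NVIDIA-NeMo/RL/issues/818; the added `logprobs_mode="raw_logprobs"` is simply passed through with the other engine kwargs. A hedged sketch of an alternative that keeps the `vllm_cfg` knob by gating it on the installed vllm version; this is not what the commit does, and the helper name is invented for illustration:

from packaging.version import Version as PkgVersion

import vllm

def resolve_skip_tokenizer_init(cfg_value: bool) -> bool:
    # Hypothetical helper, not part of nemo_rl: force False on vllm>=0.10.0,
    # where skip_tokenizer_init=True is broken, until issue #818 is resolved.
    if PkgVersion(vllm.__version__) >= PkgVersion("0.10.0"):
        return False
    return cfg_value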

nemo_rl/models/generation/vllm_backend.py

Lines changed: 1 addition & 49 deletions
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
 from collections import defaultdict
-from typing import Any, Iterable, Optional
+from typing import Any, Optional
 
 import torch
 from torch.multiprocessing.reductions import rebuild_cuda_tensor
@@ -28,49 +27,6 @@
 )
 
 
-def _patch_gemma3_mm():
-    """Patch gemma3_mm.py to support new HF multimodal format (post transformers v4.52).
-
-    Patch taken from:https://github.com/vllm-project/vllm/pull/19151/files#diff-5890909300e4e6c3160444e4587ec3fd80498bb83f598b22ce81337f75992b06
-    """
-    from packaging.version import Version as PkgVersion
-
-    assert PkgVersion(vllm.__version__) < PkgVersion("0.9.2"), (
-        f"You are using vllm version {vllm.__version__}. "
-        "Please remove this patch (_patch_gemma3_mm in nemo_rl/models/generation/vllm_backend.py) "
-        "since it is included in vllm>=0.9.2."
-    )
-
-    from vllm.logger import init_logger
-    from vllm.model_executor.models import gemma3_mm
-    from vllm.model_executor.models.utils import (
-        AutoWeightsLoader,
-        WeightsMapper,
-    )
-
-    logger = init_logger("gemma3_mm_patch")
-
-    gemma3_mm.Gemma3ForConditionalGeneration.hf_to_vllm_mapper = WeightsMapper(
-        orig_to_new_prefix={
-            # mapping for new names in checkpoint saved after transformers v4.52
-            "model.language_model.": "language_model.model.",
-            "model.vision_tower.": "vision_tower.",
-            "model.multi_modal_projector.": "multi_modal_projector.",
-            "lm_head.": "language_model.lm_head.",
-        }
-    )
-
-    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
-        loader = AutoWeightsLoader(self)
-        return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper)
-
-    gemma3_mm.Gemma3ForConditionalGeneration.load_weights = load_weights
-    logger.info("Successfully patched gemma3_mm.py in vllm_backend.")
-
-
-_patch_gemma3_mm()
-
-
 class VllmInternalWorkerExtension:
     def init_collective(
         self, rank_prefix: int, ip: str, port: int, world_size: int
@@ -82,10 +38,6 @@ def init_collective(
         local_rank = torch.distributed.get_rank()
         rank = rank_prefix + local_rank + 1 # 1 is the head node of the train cluster
 
-        # Temporary fix for vllm==0.9.0 which overrides the NCCL_CUMEM_ENABLE to 0 and causes
-        # https://github.com/NVIDIA-NeMo/RL/issues/564. This can be removed after it is upgraded to vllm>=0.9.1rc1.
-        os.environ["NCCL_CUMEM_ENABLE"] = "1"
-
         pg = StatelessProcessGroup.create(
             host=ip, port=port, rank=rank, world_size=world_size
         )
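
Both deletions drop workarounds that the upgrade makes unnecessary: the gemma3_mm weight-mapping patch guarded itself with an assertion on vllm<0.9.2 because the fix ships in vllm>=0.9.2, and the NCCL_CUMEM_ENABLE override was only needed for vllm==0.9.0 (per its comment, removable from vllm>=0.9.1rc1). A hedged sketch, not in the commit, of an import-time guard that would document this if the environment ever regresses to an older vllm:

from packaging.version import Version as PkgVersion

import vllm

# Sketch only: fail loudly if run against an older vllm where the removed
# workarounds (gemma3_mm patch, NCCL_CUMEM_ENABLE=1) would still be required.
assert PkgVersion(vllm.__version__) >= PkgVersion("0.10.0"), (
    f"vllm {vllm.__version__} detected; this module assumes vllm>=0.10.0, which already "
    "includes the gemma3 weight mapping (>=0.9.2) and the NCCL_CUMEM_ENABLE fix (>=0.9.1rc1)."
)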

pyproject.toml

Lines changed: 5 additions & 5 deletions
@@ -21,12 +21,12 @@ license = {text = "Apache 2.0"}
 dependencies = [
     "setuptools",
     "ninja", # for flash-attn parallel build
-    "torch==2.7.0",
+    "torch==2.7.1",
     "triton",
     "colored==2.2.3",
     "ray[default]==2.46.0",
-    # vllm<0.10.0 and transformers>=4.54.0 has a conflict
-    # Remove this once we upgrade vllm to >=0.10.0
+    # transformers==4.54.0/4.54.1 both fail on rm models
+    # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/811 resolved
     "transformers>=4.51.0,<4.54.0",
     "wandb",
     "numpy",
@@ -57,7 +57,7 @@ automodel = [
     "flash-attn==2.7.4.post1",
 ]
 vllm = [
-    "vllm==0.9.0",
+    "vllm==0.10.0",
     # Remove this once https://github.com/NVIDIA-NeMo/RL/issues/501 resolved
     "flash-attn==2.7.4.post1",
 ]
@@ -81,7 +81,7 @@ mcore = [
 # This is a default group so that we install these even with bare `uv sync`
 build = [
     # Build requirement for TE
-    "torch==2.7.0",
+    "torch==2.7.1",
     # Build requirement for TE
     "setuptools",
     "packaging",
