
Commit 5bf56a9

yfw, guyueh1, chtruong814, pjin-nvidia, and thomasdhc authored
chore: Bump vllm to 0.11.2, torch to 2.9, transformers to 4.57.1 (#1563)
Signed-off-by: Yi-Fu Wu <yifu.wu@gmail.com>
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Peter Jin <pjin@nvidia.com>
Signed-off-by: Dong Hyuk Chang <donghyukc@nvidia.com>
Co-authored-by: Guyue Huang <guyueh@nvidia.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
Co-authored-by: Peter Jin <pjin@nvidia.com>
Co-authored-by: Dong Hyuk Chang <donghyukc@nvidia.com>
1 parent 441f745 commit 5bf56a9

File tree: 15 files changed, +764 −454 lines


.github/workflows/cicd-main.yml

Lines changed: 1 addition & 1 deletion

@@ -208,7 +208,7 @@ jobs:
         build-contexts: |
           nemo-rl=${{ github.run_id }}/
         build-args: |
-          MAX_JOBS=32
+          MAX_JOBS=4
           NEMO_RL_COMMIT=${{ github.sha }}

   cicd-doc-tests:

.gitmodules

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 [submodule "3rdparty/Automodel-workspace/Automodel"]
     path = 3rdparty/Automodel-workspace/Automodel
     url = https://github.com/NVIDIA-NeMo/Automodel.git
-    branch = nemo-rl-submodule
+    branch = yifu/bump-torch-and-hf
     shallow = true
 [submodule "3rdparty/Gym-workspace/Gym"]
     path = 3rdparty/Gym-workspace/Gym

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e.yaml renamed to examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml

Lines changed: 4 additions & 3 deletions

@@ -6,7 +6,7 @@ grpo:
   loss_fn:
     use_importance_sampling_correction: true
 checkpointing:
-  checkpoint_dir: results/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
@@ -48,11 +48,12 @@ policy:
 data:
   max_input_seq_length: 4096
 logger:
-  log_dir: logs/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
   wandb_enabled: true
   tensorboard_enabled: true
   wandb:
     project: nemo-rl
-    name: grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+    name: grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
 cluster:
+  num_nodes: 2
   gpus_per_node: 8

nemo_rl/models/generation/fp8.py

Lines changed: 35 additions & 2 deletions

@@ -272,6 +272,7 @@ def init_fp8(vllm_cfg, model_name, model_parallel_size):

     if vllm_cfg.get("use_deep_gemm", False):
         os.environ["VLLM_USE_DEEP_GEMM"] = "1"
+        os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "0"

     if vllm_cfg["async_engine"]:
         # for async engine, vllm spawns a process for each DP, so we patch
@@ -541,14 +542,46 @@ def cast_tensor_to_fp8_blockwise(
     return fp_data, descale_fp


+# Ref: https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1175
+# Patches this method to not create new torch.nn.Parameter for layer weights
+# to maintain weight loaders.
+def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
+    assert layer.weight_block_size is not None
+
+    from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+        deepgemm_post_process_fp8_weight_block,
+    )
+    from vllm.utils.deep_gemm import (
+        is_deep_gemm_e8m0_used,
+        should_use_deepgemm_for_fp8_linear,
+    )
+
+    # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
+    # requantize the weight and input to the specific scale
+    # at the same time.
+    should_use_deepgemm = should_use_deepgemm_for_fp8_linear(
+        layer.orig_dtype, layer.weight
+    )
+    if should_use_deepgemm:
+        dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
+            wq=layer.weight.data,
+            ws=layer.weight_scale.data,
+            quant_block_shape=tuple(layer.weight_block_size),
+            use_e8m0=is_deep_gemm_e8m0_used(),
+        )
+        # This is the only part we change from the original function (https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1196-L1197)
+        # Instead of creating new torch.nn.Parameter, we update the data in place.
+        layer.weight.data.copy_(dg_weight)
+        layer.weight_scale.data.copy_(dg_weight_scale)
+
+
 def process_weights_after_loading(self, layer) -> None:
     """This function is used to process the weights after loading for a Linear layer.

     Compared to the original process_weights_after_loading in vllm, we just avoid creation of
     new torch.nn.Parameter objects, because that removes the weight_loader attribute which we need for refit.
     """
     from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-        maybe_post_process_fp8_weight_block,
         process_fp8_weight_block_strategy,
     )

@@ -566,7 +599,7 @@ def process_weights_after_loading(self, layer) -> None:
     layer.weight_scale = torch.nn.Parameter(weight_scale.data, requires_grad=False)
     layer.update_param_tp_status()

-    maybe_post_process_fp8_weight_block(layer, self.cutlass_block_fp8_supported)
+    maybe_post_process_fp8_weight_block(layer)


 def process_weights_after_loading_moe(self, layer) -> None:
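
The in-place copy_ calls above are the whole point of the patched helper: as the docstring notes, rebuilding torch.nn.Parameter objects would drop attributes such as the weight_loader hook needed for refit. A minimal standalone sketch of that behavior (not NeMo RL code; the weight_loader lambda is a hypothetical stand-in for vLLM's loader hook):

    import torch

    layer = torch.nn.Linear(4, 4)
    # Hypothetical loader hook attached to the existing Parameter object.
    layer.weight.weight_loader = lambda param, w: param.data.copy_(w)

    new_data = torch.randn_like(layer.weight)

    # Re-creating the Parameter produces a fresh object without the hook.
    rebuilt = torch.nn.Parameter(new_data, requires_grad=False)
    assert not hasattr(rebuilt, "weight_loader")

    # Copying in place keeps the original Parameter (and its attributes) intact.
    layer.weight.data.copy_(new_data)
    assert hasattr(layer.weight, "weight_loader")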

nemo_rl/models/generation/vllm/vllm_worker.py

Lines changed: 116 additions & 44 deletions

@@ -16,6 +16,7 @@
 import gc
 import os
 import sys
+from importlib.util import find_spec
 from typing import Any, Optional, cast

 import ray
@@ -157,63 +158,134 @@ def __init__(
         self.rank = 0
         self.world_size = 1

-        # Monkey patch for vLLM to ensure RAY_ADDRESS is set in Ray actors.
-        try:
-            from vllm.logger import init_logger
+        # Monkey patches for vLLM behavior. We avoid importing vllm modules
+        # here to prevent side effects during initialization and instead
+        # locate the files via importlib metadata.

-            logger = init_logger("vllm_patch")
+        from vllm.logger import init_logger

-            def _patch_vllm_init_workers_ray():
-                """Patch the vLLM ray_distributed_executor.py file.
+        logger = init_logger("vllm_patch")

-                1. Pass custom runtime_env in _init_workers_ray call.
-                   - This allows passing custom py_executable to worker initialization.
-                2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
-                   - This is a workaround to fix async vllm in some scenarios.
-                   - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
-                """
-                try:
-                    import vllm.executor.ray_distributed_executor as ray_executor_module
+        def _get_vllm_file(relative_path: str) -> str:
+            """Return absolute path to a vLLM file or raise if it cannot be found.
+
+            The relative_path should be a POSIX-style path under the vllm
+            package root, e.g. "v1/executor/ray_executor.py" or
+            "attention/layer.py".
+            """
+            spec = find_spec("vllm")
+            if spec is None or not spec.submodule_search_locations:
+                raise RuntimeError(
+                    "vLLM package not found while attempting to patch "
+                    f"'{relative_path}'. Ensure vLLM is installed and "
+                    "available in this environment."
+                )

-                    file_to_patch = ray_executor_module.__file__
+            base_dir = next(iter(spec.submodule_search_locations))
+            file_path = os.path.join(base_dir, *relative_path.split("/"))

-                    with open(file_to_patch, "r") as f:
-                        content = f.read()
+            if not os.path.exists(file_path):
+                raise RuntimeError(
+                    "Failed to locate expected vLLM file to patch. "
+                    f"Looked for '{relative_path}' at '{file_path}'. "
+                    "This likely indicates an unexpected vLLM installation "
+                    "layout or version mismatch."
+                )
+
+            return file_path
+
+        def _patch_vllm_init_workers_ray():
+            """Patch the vLLM ray_distributed_executor.py file.
+
+            1. Pass custom runtime_env in _init_workers_ray call.
+               - This allows passing custom py_executable to worker initialization.
+            2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
+               - This is a workaround to fix async vllm in some scenarios.
+               - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
+            """
+            file_to_patch = _get_vllm_file("v1/executor/ray_executor.py")

-                    old_lines = [
-                        "self._init_workers_ray(placement_group)",
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
-                    ]
+            with open(file_to_patch, "r") as f:
+                content = f.read()

-                    new_lines = [
-                        f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
-                    ]
+            old_lines = [
+                "self._init_workers_ray(placement_group)",
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
+            ]

-                    need_replace = False
-                    for old_line, new_line in zip(old_lines, new_lines):
-                        if new_line in content or old_line not in content:
-                            continue
-                        content = content.replace(old_line, new_line)
-                        need_replace = True
+            new_lines = [
+                f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
+            ]
+
+            need_replace = False
+            for old_line, new_line in zip(old_lines, new_lines):
+                if new_line in content or old_line not in content:
+                    continue
+                content = content.replace(old_line, new_line)
+                need_replace = True
+
+            if not need_replace:
+                return
+
+            # Write back the patched content
+            with open(file_to_patch, "w") as f:
+                f.write(content)
+
+        def _patch_vllm_vit_flash_attn_backend():
+            """Patch vLLM vision attention backend selection logic.
+
+            Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
+            vllm.attention.layer to avoid overriding the backend when it
+            is already set to XFORMERS. This avoids flash attention related
+            errors when the ViT head dimension is not a multiple of 32.
+
+            Related issues:
+            - https://github.com/vllm-project/vllm/issues/27562
+            - https://github.com/vllm-project/vllm/issues/26989
+
+            This is properly fixed in https://github.com/vllm-project/vllm/pull/28763. We can remove this patch once we upgrade to a version of vllm that contains this fix.
+            """
+            file_to_patch = _get_vllm_file("attention/layer.py")
+            with open(file_to_patch, "r") as f:
+                content = f.read()
+
+            old_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )
+
+            new_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and attn_backend != AttentionBackendEnum.XFORMERS\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )

-                    if not need_replace:
-                        return
+            # Only patch if the file still has the old snippet and
+            # hasn't been patched already.
+            if new_snippet in content or old_snippet not in content:
+                return

-                    # Write back the patched content
-                    with open(file_to_patch, "w") as f:
-                        f.write(content)
+            content = content.replace(old_snippet, new_snippet)

-                except (ImportError, FileNotFoundError, PermissionError):
-                    # Allow failures gracefully
-                    pass
+            with open(file_to_patch, "w") as f:
+                f.write(content)

-            _patch_vllm_init_workers_ray()
-            logger.info("Successfully patched vllm _init_workers_ray.")
+        _patch_vllm_init_workers_ray()
+        logger.info("Successfully patched vllm _init_workers_ray.")

-        except (ImportError, AttributeError):
-            # vllm not installed or has a different structure, skipping patch.
-            pass
+        _patch_vllm_vit_flash_attn_backend()
+        logger.info("Successfully patched vllm vit flash attention backend.")

         try:
             import vllm
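
The new _get_vllm_file helper above resolves the files to patch from the installed package's metadata rather than importing vLLM submodules first. A reduced sketch of the same lookup pattern, written as a generic helper for illustration (the function name and arguments here are not from the repository):

    import os
    from importlib.util import find_spec

    def locate_package_file(package: str, relative_path: str) -> str:
        """Resolve a source file inside an installed package without importing it."""
        # find_spec on a top-level package reads import metadata only,
        # so the package's __init__ side effects are not triggered.
        spec = find_spec(package)
        if spec is None or not spec.submodule_search_locations:
            raise RuntimeError(f"{package} is not installed in this environment")
        base_dir = next(iter(spec.submodule_search_locations))
        file_path = os.path.join(base_dir, *relative_path.split("/"))
        if not os.path.exists(file_path):
            raise RuntimeError(f"{relative_path} not found under {base_dir}")
        return file_path

    # Example (assuming vLLM is installed):
    #   locate_package_file("vllm", "attention/layer.py")

The text patches themselves stay idempotent by checking that the new snippet is absent and the old one present before writing, so re-running a worker against an already patched installation is a no-op.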

nemo_rl/models/generation/vllm/vllm_worker_async.py

Lines changed: 0 additions & 6 deletions

@@ -471,9 +471,6 @@ class NeMoRLOpenAIServingChat(NeMoRLOpenAIServingMixin, OpenAIServingChat):
                 return_tokens_as_token_ids=True,
             )
         )
-        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main bumping to vLLM 0.11.2
-        if vllm_version < "0.11.1":
-            serving_chat_kwargs["model_config"] = model_config
         openai_serving_chat = NeMoRLOpenAIServingChat(**serving_chat_kwargs)

         generation_config = self.cfg
@@ -538,9 +535,6 @@ class NeMoRLOpenAIServingTokenization(
             engine_client=serving_chat_kwargs["engine_client"],
             models=serving_chat_kwargs["models"],
         )
-        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main bumping to vLLM 0.11.2
-        if vllm_version < "0.11.1":
-            serving_tokenization_kwargs["model_config"] = model_config
         openai_serving_tokenization = NeMoRLOpenAIServingTokenization(
             **serving_tokenization_kwargs
         )

nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py

Lines changed: 1 addition & 1 deletion

@@ -1848,7 +1848,7 @@ def move_buffer_to_device(
 ) -> nn.Module:
     # FSDP modules do not move buffers to the device automatically
     for v in model.buffers():
-        v.data = v.data.to(device)
+        v = v.to(device)

     return model

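
For context on the hunk above: the comment notes that FSDP modules do not move buffers to the target device automatically, so buffer placement has to be handled explicitly. A hedged standalone sketch of one way to re-register module buffers on a device (an illustration only, not the repository's implementation):

    import torch
    import torch.nn as nn

    def move_buffers_to_device(model: nn.Module, device: torch.device) -> nn.Module:
        # setattr on a registered buffer name updates the owning module's
        # buffer registry, so the module itself sees the moved tensor.
        for name, buf in model.named_buffers():
            module_path, _, buf_name = name.rpartition(".")
            owner = model.get_submodule(module_path) if module_path else model
            setattr(owner, buf_name, buf.to(device))
        return model

    # Example:
    #   bn = nn.BatchNorm1d(8)
    #   move_buffers_to_device(bn, torch.device("cpu"))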
