Commit 75e916f

fix: mcore generation config restored in nightly test (#1720)

Signed-off-by: Terry Kong <terryk@nvidia.com>

Parent: ba46741

File tree: 5 files changed, +53 −18 lines

examples/configs/grpo_math_1B.yaml
Lines changed: 9 additions & 0 deletions

@@ -216,6 +216,15 @@ policy:
     top_k: null
     stop_token_ids: null
     stop_strings: null
+    mcore_generation_config:
+      buffer_size_gb: 20 # Total GPU memory (in GB) allocated for KV cache buffers
+      buffer_guaranteed_fraction: 0.1 # Fraction of buffer reserved for guaranteed active requests
+      num_cuda_graphs: 16 # Number of CUDA graphs to pre-compile for different batch sizes
+      block_size_tokens: 256 # Size of each KV cache block in tokens (affects memory granularity)
+      use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
+      enable_chunked_prefill: true # Split long prefills into chunks for better memory management
+      unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
     vllm_cfg:
       async_engine: false
       precision: ${policy.precision}
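
For reference, a minimal sketch (not part of the commit) that loads this example config and inspects the new block. It assumes PyYAML is installed, that the block sits under policy.generation as the hunk context suggests, and that it is run from the repository root:

import yaml  # PyYAML

with open("examples/configs/grpo_math_1B.yaml") as f:
    cfg = yaml.safe_load(f)

# Assumed nesting: policy -> generation -> mcore_generation_config
mcore_cfg = cfg["policy"]["generation"]["mcore_generation_config"]
for key, value in mcore_cfg.items():
    print(f"{key}: {value}")

# Back-of-the-envelope check: the slice of the KV-cache buffer reserved for
# guaranteed active requests, in GB (20 GB * 0.1 = 2 GB with the values above).
print(mcore_cfg["buffer_size_gb"] * mcore_cfg["buffer_guaranteed_fraction"])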

examples/configs/grpo_math_1B_megatron.yaml
Lines changed: 1 addition & 1 deletion

@@ -150,7 +150,7 @@ policy:
       use_cuda_graphs_for_non_decode_steps: true # Enable CUDA graphs for prefill/context processing
       enable_chunked_prefill: true # Split long prefills into chunks for better memory management
       unified_memory_level: 0 # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
-      max_tokens: 16384 # Maximum number of tokens to use in a single step
+      max_tokens: 16384 # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens
 
     vllm_cfg:
       tensor_parallel_size: 1

nemo_rl/models/policy/workers/megatron_policy_worker.py
Lines changed: 37 additions & 16 deletions

@@ -19,7 +19,7 @@
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext
 from functools import partial
-from typing import Any, Iterator, Optional, TypeVar, cast
+from typing import Any, Iterator, Optional, TypedDict, TypeVar, cast
 
 import ray
 import torch

@@ -145,6 +145,27 @@
 TokenizerType = TypeVar("TokenizerType", bound=PreTrainedTokenizerBase)
 
 
+class MegatronGenerationConfig(TypedDict):
+    # Total GPU memory (in GB) allocated for KV cache buffers
+    buffer_size_gb: int
+    # Fraction of buffer reserved for guaranteed active requests
+    buffer_guaranteed_fraction: float
+    # Number of CUDA graphs to pre-compile for different batch sizes
+    num_cuda_graphs: int
+    # Size of each KV cache block in tokens (affects memory granularity)
+    block_size_tokens: int
+    # Enable CUDA graphs for prefill/context processing
+    use_cuda_graphs_for_non_decode_steps: bool
+    # Split long prefills into chunks for better memory management
+    enable_chunked_prefill: bool
+    # Unified memory usage level (0=disabled, higher values enable more aggressive paging)
+    unified_memory_level: int
+    # Maximum number of tokens to use in a single step. Analogous to vllm's max_num_batched_tokens.
+    # Can cause OOM if set too high so should be tuned with buffer_size_gb if OOMing. If set too
+    # low, then will only do 512 tokens at a time, which can be slow.
+    max_tokens: int
+
+
 def broadcast_object_across_pp_ranks(obj):
     """Broadcast an object across pipeline parallel ranks.
 

@@ -1820,22 +1841,22 @@ def generate(
         )
         from megatron.core.inference.sampling_params import SamplingParams
 
-        mcore_generation_config = self.cfg["generation"]["mcore_generation_config"]
-        buffer_size_gb = mcore_generation_config.get("buffer_size_gb", 20)
-
-        num_cuda_graphs = mcore_generation_config.get("num_cuda_graphs", 16)
-        block_size_tokens = mcore_generation_config.get("block_size_tokens", 256)
-        use_cuda_graphs_for_non_decode_steps = mcore_generation_config.get(
-            "use_cuda_graphs_for_non_decode_steps", True
-        )
-        enable_chunked_prefill = mcore_generation_config.get(
-            "enable_chunked_prefill", True
+        mcore_generation_config = cast(
+            MegatronGenerationConfig, self.cfg["generation"]["mcore_generation_config"]
         )
-        unified_memory_level = mcore_generation_config.get("unified_memory_level", 0)
-        buffer_guaranteed_fraction = mcore_generation_config.get(
-            "buffer_guaranteed_fraction", 0.1
-        )
-        max_tokens = mcore_generation_config.get("max_tokens", 16384)
+        buffer_size_gb = mcore_generation_config["buffer_size_gb"]
+
+        num_cuda_graphs = mcore_generation_config["num_cuda_graphs"]
+        block_size_tokens = mcore_generation_config["block_size_tokens"]
+        use_cuda_graphs_for_non_decode_steps = mcore_generation_config[
+            "use_cuda_graphs_for_non_decode_steps"
+        ]
+        enable_chunked_prefill = mcore_generation_config["enable_chunked_prefill"]
+        unified_memory_level = mcore_generation_config["unified_memory_level"]
+        buffer_guaranteed_fraction = mcore_generation_config[
+            "buffer_guaranteed_fraction"
+        ]
+        max_tokens = mcore_generation_config["max_tokens"]
 
         model_config = self.model.config
         model_config.cuda_graph_impl = "local"
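
A standalone sketch (not part of the commit) of the pattern this change adopts: typing.cast marks the config dict as a MegatronGenerationConfig for static checkers, and direct indexing fails fast on a missing key instead of silently falling back to a .get() default. The partial dict below is hypothetical:

from typing import TypedDict, cast

class MegatronGenerationConfig(TypedDict):
    buffer_size_gb: int
    buffer_guaranteed_fraction: float
    num_cuda_graphs: int
    block_size_tokens: int
    use_cuda_graphs_for_non_decode_steps: bool
    enable_chunked_prefill: bool
    unified_memory_level: int
    max_tokens: int

# Deliberately incomplete config, e.g. a YAML that predates the new keys.
raw_cfg = {"buffer_size_gb": 20, "buffer_guaranteed_fraction": 0.1}

# cast() is a no-op at runtime; it only tells the type checker which keys to expect.
cfg = cast(MegatronGenerationConfig, raw_cfg)

# Old style: a missing key silently falls back to a hard-coded default.
print(raw_cfg.get("max_tokens", 16384))  # prints 16384 even though the config never set it

# New style: a missing key raises KeyError immediately, surfacing the stale config.
try:
    _ = cfg["max_tokens"]
except KeyError:
    print("max_tokens missing from mcore_generation_config")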

tests/test_suites/llm/grpo-llama3.2-1b-instruct-1n8g-megatron_generation.sh
Lines changed: 2 additions & 1 deletion

@@ -34,11 +34,12 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    # total_step_time observed around ~16, so 17.5 for buffer
     uv run tests/check_metrics.py $JSON_METRICS \
        'median(data["train/token_mult_prob_error"]) < 1.1' \
        'data["train/token_mult_prob_error"]["500"] < 1.1' \
        'data["train/reward"]["500"] > 0.1' \
-       'mean(data["timing/train/total_step_time"], -6, -1) < 10.5'
+       'mean(data["timing/train/total_step_time"], -6, -1) < 17.5'
 
     # Clean up checkpoint directory after successful run to save space.
     rm -rf "$CKPT_DIR"

tests/unit/models/policy/test_megatron_worker.py
Lines changed: 4 additions & 0 deletions

@@ -90,6 +90,10 @@ def create_megatron_test_config(
                 "buffer_size_gb": 20,
                 "buffer_guaranteed_fraction": 0.1,
                 "num_cuda_graphs": 16,
+                "block_size_tokens": 256,
+                "use_cuda_graphs_for_non_decode_steps": True,
+                "enable_chunked_prefill": True,
+                "unified_memory_level": 0,
                 "max_tokens": 16384,
             },
             "colocated": {
