Commit 4923403

feat: add vllm enable_expert_parallel (#997)
Signed-off-by: Yuki Huang <yukih@nvidia.com>
1 parent a84f3b4 commit 4923403

28 files changed (+29, −0 lines)

examples/configs/evals/eval.yaml (1 addition, 0 deletions)

@@ -21,6 +21,7 @@ generation:
 precision: "bfloat16"
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.9
 max_model_len: 2048
 enforce_eager: False
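The diff above adds a single boolean key next to the existing parallelism settings. As a rough sketch of how such a config fragment could be forwarded to vLLM engine keyword arguments, the snippet below filters the keys that match vLLM engine-argument names (`enable_expert_parallel` is a real vLLM engine argument for expert parallelism in MoE models); the `forward_to_vllm` helper and the exact key set are illustrative assumptions, not part of this repository.

```python
# Config fragment mirroring examples/configs/evals/eval.yaml after this commit.
generation = {
    "precision": "bfloat16",
    "tensor_parallel_size": 1,
    "pipeline_parallel_size": 1,
    "enable_expert_parallel": False,  # flag added by this commit
    "gpu_memory_utilization": 0.9,
    "max_model_len": 2048,
    "enforce_eager": False,
}

# Keys whose names match vLLM engine arguments and can be passed through
# directly; other keys (e.g. "precision") would need separate handling.
VLLM_KEYS = {
    "tensor_parallel_size",
    "pipeline_parallel_size",
    "enable_expert_parallel",
    "gpu_memory_utilization",
    "max_model_len",
    "enforce_eager",
}

def forward_to_vllm(cfg: dict) -> dict:
    """Select the subset of config keys passed straight through to vLLM."""
    return {k: v for k, v in cfg.items() if k in VLLM_KEYS}

kwargs = forward_to_vllm(generation)
print(kwargs["enable_expert_parallel"])  # prints: False
```

Defaulting the new flag to `false` in every config keeps existing non-MoE runs unchanged; only configs that opt in would shard experts across GPUs.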

examples/configs/grpo_math_1B.yaml (1 addition, 0 deletions)

@@ -175,6 +175,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: ${policy.max_total_sequence_length}
 enforce_eager: False

examples/configs/grpo_sliding_puzzle.yaml (1 addition, 0 deletions)

@@ -40,6 +40,7 @@ policy:
 async_engine: false
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: ${policy.max_total_sequence_length}

examples/configs/recipes/llm/grpo-deepscaler-1.5b-24K.yaml (1 addition, 0 deletions)

@@ -42,6 +42,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.8
 enforce_eager: True
 max_model_len: ${policy.max_total_sequence_length}

examples/configs/recipes/llm/grpo-deepscaler-1.5b-8K.yaml (1 addition, 0 deletions)

@@ -99,6 +99,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: ${policy.max_total_sequence_length}
 enforce_eager: True

examples/configs/recipes/llm/grpo-gemma3-1b-it-1n8g-fsdp2tp1.yaml (1 addition, 0 deletions)

@@ -93,6 +93,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: 512
 enforce_eager: False

examples/configs/recipes/llm/grpo-gemma3-27b-it-16n8g-fsdp2tp8sp-actckpt-long.yaml (1 addition, 0 deletions)

@@ -94,6 +94,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 4
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: 16384
 enforce_eager: False

examples/configs/recipes/llm/grpo-gspo-deepscaler-1.5b-8K.yaml (1 addition, 0 deletions)

@@ -100,6 +100,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: ${policy.max_total_sequence_length}
 enforce_eager: True

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8.yaml (1 addition, 0 deletions)

@@ -122,6 +122,7 @@ policy:
 precision: 'fp8'
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: 4096
 enforce_eager: False

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-4n8g-fsdp2tp1-long.v3.yaml (1 addition, 0 deletions)

@@ -94,6 +94,7 @@ policy:
 precision: ${policy.precision}
 tensor_parallel_size: 1
 pipeline_parallel_size: 1
+enable_expert_parallel: false
 gpu_memory_utilization: 0.6
 max_model_len: 4096
 enforce_eager: False
