
Commit e6334c4

address review comments

Signed-off-by: h-guo18 <[email protected]>

1 parent: c79e8e2

File tree

5 files changed: +17 −194 lines

- examples/speculative_decoding/README.md
- examples/speculative_decoding/main.py
- examples/speculative_decoding/server_generate.py
- examples/speculative_decoding/train_eagle3_and_export.sh
- examples/speculative_decoding/vllm_inference_demo.py (deleted)

examples/speculative_decoding/README.md

Lines changed: 4 additions & 10 deletions

````diff
@@ -184,16 +184,6 @@ This will export the model from a modelopt checkpoint to a deployment-compatible
 
 The exported checkpoint can be deployed on TRT-LLM or vLLM.
 
-#### vLLM
-
-To test AR on MT-bench with vLLM:
-
-```python
-python vllm_inference_demo.py --base_model $BASE_MODEL --eagle_model $EXPORT_PATH --mode mt-bench
-```
-
-Please refer to [vLLM Doc: Speculative Decoding](https://docs.vllm.ai/en/v0.9.0/features/spec_decode.html) for detailed usage.
-
 #### TRT-LLM
 
 To serve the checkpoint with trtllm, we can run trtllm-serve with:
@@ -223,6 +213,10 @@ kv_cache_config:
 
 Please refer to [TRT-LLM Doc: Speculative Decoding](https://nvidia.github.io/TensorRT-LLM/examples/llm_speculative_decoding.html) for detailed usage.
 
+#### vLLM
+
+Please refer to [vLLM Doc: Speculative Decoding](https://docs.vllm.ai/en/v0.9.0/features/spec_decode.html) for detailed usage.
+
 #### Deploying Quantized model
 
 See more details on deployment of quantized model to TRTLLM [here](../llm_ptq/README.md).
````
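Since the README now defers vLLM usage entirely to the linked docs, here is a minimal offline-inference sketch of what deploying the exported checkpoint there can look like, following the `speculative_config` API in the vLLM v0.9 docs linked above. The base model name, export path, and draft-token count are placeholder assumptions, not values from this repo:

```python
# Hypothetical sketch (not part of this commit): running the exported EAGLE
# draft checkpoint with vLLM's offline API, per the v0.9 speculative decoding
# docs linked above. Model name, path, and token count are placeholders.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",   # placeholder base model
    speculative_config={
        "method": "eagle3",                     # assumed; matches the EAGLE-3 export
        "model": "export/my-eagle-checkpoint",  # placeholder for $EXPORT_PATH
        "num_speculative_tokens": 3,            # placeholder draft length
    },
)
outputs = llm.generate(
    ["Write a short story about a cat."],
    SamplingParams(max_tokens=256),
)
print(outputs[0].outputs[0].text)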

examples/speculative_decoding/main.py

Lines changed: 9 additions & 12 deletions

```diff
@@ -46,7 +46,13 @@
 import modelopt.torch.opt as mto
 import modelopt.torch.speculative as mtsp
 from modelopt.torch.utils import print_rank_0
-from modelopt.torch.utils.distributed import is_master
+
+try:
+    import wandb
+
+    wandb.init()
+except ImportError:
+    wandb = None
 
 torch.manual_seed(0)
 mto.enable_huggingface_checkpointing()
@@ -205,15 +211,6 @@ def train():
     class ARValidationCallback(TrainerCallback):
         def __init__(self, ar_validate_steps: int = 500):
             self.ar_validate_steps = ar_validate_steps
-            self.wandb = None
-            if is_master():
-                try:
-                    import wandb
-
-                    self.wandb = wandb
-                    self.wandb.init()
-                except ImportError:
-                    pass
 
         def on_step_end(self, args, state, control, **kwargs):
             if state.global_step % self.ar_validate_steps == 0 and state.global_step > 0:
@@ -225,8 +222,8 @@ def on_step_end(self, args, state, control, **kwargs):
                     device=kwargs["model"].device,
                 )
                 print_rank_0(f"Step {state.global_step} AR: {sum(ars) / len(ars):.4f}")
-                if self.wandb:
-                    self.wandb.log({"validate_ar": sum(ars) / len(ars)}, step=state.global_step)
+                if wandb:
+                    wandb.log({"validate_ar": sum(ars) / len(ars)}, step=state.global_step)
             return control
 
     trainer = Trainer(
```
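The main.py change swaps the callback's per-instance wandb state for a module-level optional import, so every call site reduces to a truthiness check on the `wandb` name. A standalone sketch of that pattern; the `log_metric` helper is an illustrative addition, not code from the repo:

```python
# Optional-dependency pattern from the diff above: bind the module (or None)
# once at import time, then gate call sites on truthiness. The log_metric
# helper is a hypothetical illustration, not part of the commit.
try:
    import wandb

    wandb.init()  # starts a run as soon as the module is imported
except ImportError:
    wandb = None  # wandb not installed; logging becomes a no-op


def log_metric(name: str, value: float, step: int) -> None:
    """Log to Weights & Biases when it is available; otherwise do nothing."""
    if wandb:
        wandb.log({name: value}, step=step)
```

One trade-off visible in the diff: `wandb.init()` now runs in every process that imports main.py, whereas the old callback guarded initialization behind `is_master()`.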

examples/speculative_decoding/server_generate.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -46,7 +46,7 @@
 parser.add_argument(
     "--max_tokens", type=int, default=2048, help="Maximum number of tokens to generate"
 )
-parser.add_argument("--chat", action="store_true", default=True, help="Use chat mode")
+parser.add_argument("--chat", default=True, type=bool, help="Use chat mode")
 parser.add_argument("--model", type=str, default="model", help="Model name")
 parser.add_argument("--url", type=str, default="http://localhost:8000/v1", help="URL of the API")
 parser.add_argument("--api_key", type=str, default="token-abc123", help="API key (if any)")
```

examples/speculative_decoding/train_eagle3_and_export.sh

Lines changed: 3 additions & 6 deletions

```diff
@@ -55,7 +55,7 @@ fi
 
 MODEL_BASENAME=$(basename "$BASE_MODEL")
 
-echo "==== [1/4] Training draft model ===="
+echo "==== [1/3] Training draft model ===="
 OUTPUT_DIR=ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)
 ./launch_train.sh --model $BASE_MODEL \
     --output_dir $OUTPUT_DIR \
@@ -64,12 +64,9 @@ OUTPUT_DIR=ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)
     --num_epochs 2 \
     --eagle_config eagle_config.json
 
-echo "==== [2/4] Evaluating ModelOpt checkpoint on MT-Bench ===="
+echo "==== [2/3] Evaluating ModelOpt checkpoint on MT-Bench ===="
 python ar_validate.py --model_path $OUTPUT_DIR
 
-echo "==== [3/4] Exporting checkpoint to deployment format ===="
+echo "==== [3/3] Exporting checkpoint to deployment format ===="
 EXPORT_PATH=export/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)
 python export_hf_checkpoint.py --model_path $OUTPUT_DIR --export_path $EXPORT_PATH
-
-echo "==== [4/4] Text Generation with speculative decoding in vLLM===="
-python vllm_inference_demo.py --base-model $BASE_MODEL --eagle-model $EXPORT_PATH --mode generate --prompt "Write a short story about a cat."
```

examples/speculative_decoding/vllm_inference_demo.py

Lines changed: 0 additions & 165 deletions
This file was deleted.
