Commit 5d9f76e (parent: cf6f1d4)

feat: update eagle3 example; add export

Signed-off-by: h-guo18 <[email protected]>

12 files changed: +519 −191 lines

examples/speculative_decoding/README.md

Lines changed: 144 additions & 132 deletions
Large diffs are not rendered by default.

examples/speculative_decoding/ar_validate.py

Lines changed: 4 additions & 4 deletions
@@ -26,7 +26,7 @@
 mto.enable_huggingface_checkpointing()
 
 
-def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=20, device=None):
+def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=80, device=None):
     validator = HFARValidation(model, tokenizer)
     num_samples = min(num_samples, len(ds))
     ars = []
@@ -54,12 +54,12 @@ def validate_ar(model, tokenizer, ds, steps=3, osl=20, num_samples=20, device=No
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_path", type=str, required=True, help="Path to model directory")
-    parser.add_argument("--steps", type=int, default=1, help="Steps for AR validation")
+    parser.add_argument("--steps", type=int, default=3, help="Steps for AR validation")
     parser.add_argument(
-        "--osl", type=int, default=100, help="Output sequence length for AR validation"
+        "--osl", type=int, default=32, help="Output sequence length for AR validation"
     )
     parser.add_argument(
-        "--num_samples", type=int, default=20, help="Number of MT-Bench samples to use"
+        "--num_samples", type=int, default=80, help="Number of MT-Bench samples to use"
     )
     parser.add_argument(
         "--ar_lower_bound",

examples/speculative_decoding/calibrate_draft_vocab.py

Lines changed: 4 additions & 11 deletions
@@ -28,11 +28,10 @@ def main():
     parser.add_argument("--model", type=str, required=True, help="Model name or path for tokenizer")
     parser.add_argument("--data", type=str, required=True, help="Path to training data (jsonl)")
     parser.add_argument(
-        "--eagle_config",
-        type=str,
+        "--draft_vocab_size",
+        type=int,
         required=True,
-        default="eagle_config.json",
-        help="Path to eagle_config.json",
+        help="Draft vocab size",
     )
     parser.add_argument(
         "--calibrate_size",
@@ -45,12 +44,6 @@ def main():
     )
     args = parser.parse_args()
 
-    with open(args.eagle_config) as f:
-        eagle_config = json.load(f)
-    if "draft_vocab_size" not in eagle_config:
-        print("No draft vocab size specified in eagle_config.json, no need to calibrate for d2t.")
-        return
-
     print("Calibrating vocab...")
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     with open(args.data) as f:
@@ -59,7 +52,7 @@ def main():
     conversations = conversations[: args.calibrate_size]
     conversations = [item for sublist in conversations for item in sublist]
 
-    d2t = calibrate_frequent_vocab(tokenizer, conversations, eagle_config["draft_vocab_size"])
+    d2t = calibrate_frequent_vocab(tokenizer, conversations, args.draft_vocab_size)
     model_name = os.path.basename(os.path.normpath(args.model))
     vocab_path = os.path.join(args.save_dir, model_name, "d2t.pt")
     os.makedirs(os.path.dirname(vocab_path), exist_ok=True)
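For reference, a minimal sketch of what frequency-based vocab calibration boils down to. This is an assumption about calibrate_frequent_vocab's behavior, not ModelOpt's actual implementation, and the function name below is hypothetical: count token frequencies over the calibration conversations, keep the draft_vocab_size most frequent ids as the draft vocab, and store a draft-to-target mapping d2t.

from collections import Counter

import torch


def sketch_calibrate_frequent_vocab(tokenizer, conversations, draft_vocab_size):
    # Count how often each token id occurs in the calibration text.
    counter = Counter()
    for text in conversations:
        counter.update(tokenizer(text)["input_ids"])
    # Keep the most frequent ids as the draft vocab, in ascending id order.
    top_ids = sorted(tid for tid, _ in counter.most_common(draft_vocab_size))
    # d2t[draft_id] = target_id - draft_id, so target_id = draft_id + d2t[draft_id].
    return torch.tensor(top_ids) - torch.arange(len(top_ids))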
examples/speculative_decoding/eagle_config.json

Lines changed: 8 additions & 1 deletion

@@ -1,3 +1,10 @@
 {
-    "draft_vocab_size": 32000
+    "rope_scaling": {
+        "factor": 32.0,
+        "low_freq_factor": 1.0,
+        "high_freq_factor": 4.0,
+        "original_max_position_embeddings": 8192,
+        "rope_type": "llama3"
+    },
+    "initializer_range": 0.02
 }
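The "llama3" rope_type follows the Llama 3 frequency-smoothing scheme used by HF transformers. As a rough sketch of how these four fields rescale the rotary inverse frequencies (paraphrased from my reading of the transformers implementation, not code from this commit):

import math

import torch


def llama3_scaled_inv_freq(inv_freq, factor=32.0, low_freq_factor=1.0,
                           high_freq_factor=4.0, original_max_position_embeddings=8192):
    low_freq_wavelen = original_max_position_embeddings / low_freq_factor
    high_freq_wavelen = original_max_position_embeddings / high_freq_factor
    wavelen = 2 * math.pi / inv_freq
    # Long wavelengths are scaled down by `factor`; short ones are kept as-is.
    scaled = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)
    # In between, interpolate smoothly between the scaled and unscaled regimes.
    smooth = (original_max_position_embeddings / wavelen - low_freq_factor) / (
        high_freq_factor - low_freq_factor
    )
    smoothed = (1 - smooth) * inv_freq / factor + smooth * inv_freq
    is_medium = (wavelen >= high_freq_wavelen) & (wavelen <= low_freq_wavelen)
    return torch.where(is_medium, smoothed, scaled)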
examples/speculative_decoding/export_hf_checkpoint.py (new file)

Lines changed: 41 additions & 0 deletions

# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import torch
from transformers import AutoModelForCausalLM

import modelopt.torch.opt as mto
from modelopt.torch.export import export_hf_checkpoint


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="")
    parser.add_argument("--export_path", type=str, default="")
    return parser.parse_args()


mto.enable_huggingface_checkpointing()

args = parse_args()
model = AutoModelForCausalLM.from_pretrained(args.model_path, torch_dtype="auto")
with torch.inference_mode():
    export_hf_checkpoint(
        model,  # The ModelOpt-trained model.
        export_dir=args.export_path,  # The directory where the exported files will be stored.
    )
print(f"Exported checkpoint to {args.export_path}")

examples/speculative_decoding/main.py

Lines changed: 11 additions & 0 deletions
@@ -47,6 +47,13 @@
 import modelopt.torch.speculative as mtsp
 from modelopt.torch.utils import print_rank_0
 
+try:
+    import wandb
+
+    wandb.init()
+except ImportError:
+    wandb = None
+
 torch.manual_seed(0)
 mto.enable_huggingface_checkpointing()
 
@@ -170,6 +177,8 @@ def train():
             {
                 "hidden_size": model.config.hidden_size,
                 "vocab_size": model.config.vocab_size,
+                # We also overwrite max_position_embeddings for deployment compatibility.
+                "max_position_embeddings": model.config.max_position_embeddings,
                 "draft_vocab_size": custom_config["draft_vocab_size"]
                 if eagle_args.eagle_config and "draft_vocab_size" in custom_config
                 else model.config.vocab_size,
@@ -213,6 +222,8 @@ def on_step_end(self, args, state, control, **kwargs):
                 device=kwargs["model"].device,
             )
             print_rank_0(f"Step {state.global_step} AR: {sum(ars) / len(ars):.4f}")
+            if wandb:
+                wandb.log({"validate_ar": sum(ars) / len(ars)}, step=state.global_step)
             return control
 
     trainer = Trainer(

examples/speculative_decoding/server_generate.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@
 parser.add_argument(
     "--max_tokens", type=int, default=2048, help="Maximum number of tokens to generate"
 )
-parser.add_argument("--chat", action="store_true", help="Use chat mode")
+parser.add_argument("--chat", default=True, action=argparse.BooleanOptionalAction, help="Use chat mode")
 parser.add_argument("--model", type=str, default="model", help="Model name")
 parser.add_argument("--url", type=str, default="http://localhost:8000/v1", help="URL of the API")
 parser.add_argument("--api_key", type=str, default="token-abc123", help="API key (if any)")
Lines changed: 72 additions & 0 deletions
(new shell script)

#!/bin/bash

# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -eo pipefail

# Set default values for BASE_MODEL, NUM_GPU, and DATA
BASE_MODEL=meta-llama/Llama-3.2-1B-Instruct
NUM_GPU=1
DATA=Daring-Anteater/train.jsonl

# Parse input arguments --base_model, --num_gpu, and --data
while [[ $# -gt 0 ]]; do
  key="$1"
  case $key in
    --base_model)
      BASE_MODEL="$2"
      shift; shift
      ;;
    --num_gpu)
      NUM_GPU="$2"
      shift; shift
      ;;
    --data)
      DATA="$2"
      shift; shift
      ;;
    *)
      echo "Unknown argument: $1"
      exit 1
      ;;
  esac
done

if [[ "$NUM_GPU" == 1 ]]; then
  export CUDA_VISIBLE_DEVICES=0
else
  # Export as 0,1,...,N-1 for NUM_GPU GPUs
  export CUDA_VISIBLE_DEVICES=$(seq -s, 0 $((NUM_GPU-1)))
fi

MODEL_BASENAME=$(basename "$BASE_MODEL")

echo "==== [1/3] Training draft model ===="
OUTPUT_DIR=ckpts/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)
./launch_train.sh --model $BASE_MODEL \
  --output_dir $OUTPUT_DIR \
  --data $DATA \
  --num_gpu $NUM_GPU \
  --num_epochs 2 \
  --eagle_config eagle_config.json

echo "==== [2/3] Evaluating ModelOpt checkpoint on MT-Bench ===="
python ar_validate.py --model_path $OUTPUT_DIR

echo "==== [3/3] Exporting checkpoint to deployment format ===="
EXPORT_PATH=export/${MODEL_BASENAME}-$(date +%Y%m%d_%H%M)
python export_hf_checkpoint.py --model_path $OUTPUT_DIR --export_path $EXPORT_PATH

modelopt/torch/export/plugins/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -19,3 +19,5 @@
 
 with import_plugin("megatron_importer"):
     from .megatron_importer import *
+with import_plugin("transformers"):
+    from .hf_spec_export import *
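This guard pattern works because a context manager can suppress the ImportError raised by the import inside its body. A minimal sketch of an import_plugin-style helper, assuming (without having checked the real one) that it simply swallows missing optional dependencies:

from contextlib import contextmanager


@contextmanager
def optional_plugin(plugin_name):
    """Skip the guarded block when its optional dependency is missing."""
    try:
        yield
    except ImportError:
        # The import inside the `with` body failed; treat the plugin as disabled.
        pass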
modelopt/torch/export/plugins/hf_spec_export.py (new file)

Lines changed: 151 additions & 0 deletions

# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Modify state_dict and config for exporting speculative decoding in official format."""

import torch
import torch.nn as nn

from modelopt.torch.speculative.plugins.transformers import HFEagleModel

SPECULATIVE_DECODING_MODES = ["eagle", "medusa"]

EAGLE_MODELOPT_TO_OFFICIAL = {
    "required": {
        "layers.0.self_attn.q_proj.weight": "midlayer.self_attn.q_proj.weight",
        "layers.0.self_attn.k_proj.weight": "midlayer.self_attn.k_proj.weight",
        "layers.0.self_attn.v_proj.weight": "midlayer.self_attn.v_proj.weight",
        "layers.0.self_attn.o_proj.weight": "midlayer.self_attn.o_proj.weight",
        "layers.0.mlp.gate_proj.weight": "midlayer.mlp.gate_proj.weight",
        "layers.0.mlp.up_proj.weight": "midlayer.mlp.up_proj.weight",
        "layers.0.mlp.down_proj.weight": "midlayer.mlp.down_proj.weight",
        "hidden_norm.weight": "midlayer.hidden_norm.weight",
        "input_embeds_norm.weight": "midlayer.input_layernorm.weight",
        "layers.0.post_attention_layernorm.weight": "midlayer.post_attention_layernorm.weight",
        "norm.weight": "norm.weight",
        "fc.weight": "fc.weight",
    },
    "optional": {
        "d2t": "d2t",
        "eagle_lm_head.weight": "lm_head.weight",
    },
}


def _check_state_dict_keys_match(draft_model: nn.Module, required_items: dict):
    """Check if the state dict keys match."""
    draft_keys = set(draft_model.state_dict().keys())
    for required_key in required_items:
        if required_key not in draft_keys:
            raise ValueError(f"State dict keys mismatch!\nMissing in draft model: {required_key}")


def rename_and_prune_if_spec_decoding(model: nn.Module, post_state_dict: dict):
    """Only return the state dict of the draft model in official format and ignore the base model."""
    # Check that speculative decoding is the model's only ModelOpt mode.
    opt_modes = model._modelopt_state
    if len(opt_modes) != 1 or opt_modes[0][0] != "eagle":
        # If there are other opts, return as is.
        return post_state_dict

    assert isinstance(model, HFEagleModel)
    # Check if the state dict keys match
    _check_state_dict_keys_match(model.eagle_module, EAGLE_MODELOPT_TO_OFFICIAL["required"])

    # Convert key names and save the state dict
    export_state_dict = {}
    for ours_key, export_key in {
        **EAGLE_MODELOPT_TO_OFFICIAL["required"],
        **EAGLE_MODELOPT_TO_OFFICIAL["optional"],
    }.items():
        if ours_key in model.eagle_module.state_dict():
            export_state_dict[export_key] = model.eagle_module.state_dict()[ours_key]

    # TODO: (hg) this is a temp fix. Find cleaner way to do this.
    if "eagle_lm_head.weight" not in model.eagle_module.state_dict():
        export_state_dict["lm_head.weight"] = model.state_dict()["lm_head.weight"]

    return export_state_dict


def set_config_if_spec_decoding(model: nn.Module, config_data: dict):
    """Return the config of draft model in official format."""
    if len(model._modelopt_state) != 1 or model._modelopt_state[0][0] != "eagle":
        # Return as is.
        return config_data

    assert isinstance(model, HFEagleModel)

    # These are the config keys in the official checkpoint.
    template_config = {
        "architectures": ["LlamaForCausalLM"],
        "bos_token_id": None,
        "eos_token_id": None,
        "hidden_act": None,
        "hidden_size": None,
        "initializer_range": None,
        "intermediate_size": None,
        "max_position_embeddings": None,
        "model_type": "llama",
        "num_attention_heads": None,
        "num_key_value_heads": None,
        "num_hidden_layers": None,
        "pad_token_id": None,
        "rms_norm_eps": None,
        "tie_word_embeddings": False,
        "torch_dtype": None,
        "transformers_version": None,
        "use_cache": None,
        "vocab_size": None,
        "draft_vocab_size": None,
        "rope_scaling": None,
        "attention_bias": None,
        "attention_dropout": None,
        "head_dim": None,
        "mlp_bias": None,
        "pretraining_tp": None,
        "rope_theta": None,
        "eagle_config": {
            "eagle_aux_hidden_state_layer_ids": None,
            "use_aux_hidden_state": None,
            "use_input_layernorm_in_first_layer": None,
            "use_last_layernorm": None,
            "use_mtp_layernorm": None,
        },
    }

    def _get_config_from_eagle_config_or_base_config(key: str, model: nn.Module):
        if getattr(model.eagle_config, key, None) is not None:
            return getattr(model.eagle_config, key)
        elif getattr(model.config, key, None) is not None:
            return getattr(model.config, key)
        else:
            return None

    for key in template_config:
        value = template_config[key]
        if isinstance(value, dict):
            # For the nested eagle config, look the sub-keys up in model.eagle_config.
            for sub_key in value:
                value[sub_key] = _get_config_from_eagle_config_or_base_config(sub_key, model)
        elif value is None:
            # First, try to load the value from the eagle config, then the base config.
            new_value = _get_config_from_eagle_config_or_base_config(key, model)
            # If the value is a torch.dtype, convert to string for serialization.
            if isinstance(new_value, torch.dtype):
                new_value = str(new_value).replace("torch.", "")
            template_config[key] = new_value

    return template_config
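To illustrate how these two hooks compose, a hypothetical sketch of their use during export (the actual wiring lives inside export_hf_checkpoint; the variable names here are illustrative):

# model: a trained HFEagleModel whose only ModelOpt mode is "eagle".
export_state_dict = rename_and_prune_if_spec_decoding(model, model.state_dict())
config_data = set_config_if_spec_decoding(model, model.config.to_dict())
# export_state_dict now holds only the draft weights under official names
# (e.g. "midlayer.self_attn.q_proj.weight"), and config_data follows the
# official draft-model config template above.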
