Skip to content

Commit 29a6ad5

Browse files
yizhang2077 and sleepcoo
authored and committed
support qwen3 coder draft model, add opc dataset process (sgl-project#73)
* support qwen3 coder draft model, add opc dataset process * rename config --------- Co-authored-by: lukec <118525388+sleepcoo@users.noreply.github.com>
1 parent 391cd8e commit 29a6ad5

File tree

6 files changed

+76
-1
lines changed

6 files changed

+76
-1
lines changed

benchmarks/run_gsm8k.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@ def few_shot_gsm8k(s, question):
8484
states = few_shot_gsm8k.run_batch(
8585
arguments,
8686
temperature=0,
87+
max_new_tokens=2048,
8788
num_threads=args.parallel,
8889
progress_bar=True,
8990
)

benchmarks/run_humaneval.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ def get_humaneval_answer(s, question):
6868
states = get_humaneval_answer.run_batch(
6969
questions,
7070
temperature=0,
71+
max_new_tokens=2048,
7172
num_threads=args.parallel,
7273
progress_bar=True,
7374
)
@@ -77,6 +78,7 @@ def get_humaneval_answer(s, question):
7778
num_output_tokens = sum(
7879
s.get_meta_info("answer")["completion_tokens"] for s in states
7980
)
81+
8082
output_throughput = num_output_tokens / latency
8183

8284
has_verify = "spec_verify_ct" in states[0].get_meta_info("answer")

benchmarks/run_math500.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ def get_humaneval_answer(s, question):
3939
states = get_humaneval_answer.run_batch(
4040
questions,
4141
temperature=0,
42+
max_new_tokens=2048,
4243
num_threads=args.parallel,
4344
progress_bar=True,
4445
)
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"architectures": [
3+
"LlamaForCausalLMEagle3"
4+
],
5+
"attention_bias": false,
6+
"attention_dropout": 0.0,
7+
"bos_token_id": 151643,
8+
"eos_token_id": 151645,
9+
"head_dim": 128,
10+
"hidden_act": "silu",
11+
"hidden_size": 6144,
12+
"initializer_range": 0.02,
13+
"intermediate_size": 16384,
14+
"max_position_embeddings": 262144,
15+
"max_window_layers": 62,
16+
"model_type": "llama",
17+
"num_attention_heads": 96,
18+
"num_hidden_layers": 1,
19+
"num_key_value_heads":8,
20+
"rms_norm_eps": 1e-06,
21+
"rope_scaling": null,
22+
"rope_theta": 1000000,
23+
"sliding_window": null,
24+
"tie_word_embeddings": false,
25+
"torch_dtype": "bfloat16",
26+
"transformers_version": "4.51.0",
27+
"use_cache": true,
28+
"use_sliding_window": false,
29+
"vocab_size": 151936,
30+
"draft_vocab_size": 32000
31+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
2+
ROOT_DIR=$(dirname $SCRIPT_DIR)
3+
4+
# train eagle3 for qwen3-coder
5+
NUM_GPUS=${1:-8}
6+
7+
torchrun \
8+
--standalone \
9+
--nproc_per_node $NUM_GPUS \
10+
$ROOT_DIR/scripts/train_eagle3_offline.py \
11+
--target-model-path Qwen/Qwen3-Coder-480B-A35B-Instruct \
12+
--draft-model-config $ROOT_DIR/configs/qwen3-coder-480B-A35B-instruct-eagle3.json \
13+
--train-data-path $ROOT_DIR/cache/dataset/opc.jsonl \
14+
--train-hidden-states-path $ROOT_DIR/cache/hidden_states \
15+
--output-dir $ROOT_DIR/outputs/Qwen3-Coder-480B-A35B-Instruct \
16+
--num-epochs 10 \
17+
--batch-size 1 \
18+
--learning-rate 1e-4 \
19+
--max-length 2048 \
20+
--chat-template qwen \
21+
--resume

scripts/prepare_data.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def parse_args():
3434
parser.add_argument(
3535
"--dataset",
3636
type=str,
37-
choices=["ultrachat", "sharegpt"],
37+
choices=["ultrachat", "sharegpt", "opc"],
3838
help="The demo dataset to quickly run the training for speculative decoding",
3939
)
4040
parser.add_argument(
@@ -108,6 +108,20 @@ def load_dataset_from_path(data_path: Path):
108108
return ds
109109

110110

111+
import hashlib
112+
113+
114+
def process_opc_sft_stage1(row) -> Dict:
115+
row_id = hashlib.md5((row["instruction"] + row["output"]).encode()).hexdigest()
116+
return {
117+
"id": row_id,
118+
"conversations": [
119+
{"role": "user", "content": row["instruction"]},
120+
{"role": "assistant", "content": row["output"]},
121+
],
122+
}
123+
124+
111125
def main():
112126
args = parse_args()
113127
# load dataset
@@ -121,6 +135,11 @@ def main():
121135
print("Loading dataset from custom data path: ", args.data_path)
122136
ds = load_dataset_from_path(Path(args.data_path))
123137
proc_fn = process_sharegpt_row
138+
elif args.dataset == "opc":
139+
ds = load_dataset(
140+
"OpenCoder-LLM/opc-sft-stage1", "largescale_diverse_instruct"
141+
)["train"]
142+
proc_fn = process_opc_sft_stage1
124143
else:
125144
raise ValueError(
126145
f"This script only supports ultrachat_200k and sharegpt datasets for demo purpose, if you wish to use other datasets, please modify this script."

0 commit comments

Comments (0)