Minor tweaks to data preparation

benchislett · benchislett · commit f74bf59f618c · 2025-09-17T10:24:11.000-04:00
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
diff --git a/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py b/examples/speculative_decoding/collect_hidden_states/compute_hidden_states_hf.py
@@ -24,6 +24,10 @@
 from tqdm import tqdm as tqdm
 from transformers import AutoModel, AutoTokenizer
 
+REMOVE_THINK_CHAT_TEMPLATE = (
+    "{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}"
+)
+
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
@@ -92,6 +96,7 @@ async def main(args: argparse.Namespace) -> None:
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.chat_template = tokenizer.chat_template.replace(REMOVE_THINK_CHAT_TEMPLATE, "")
 
     output_dir = args.output_dir
     output_dir.mkdir(parents=True, exist_ok=True)
@@ -132,7 +137,12 @@ async def main(args: argparse.Namespace) -> None:
                 )
             # Extract hidden states from layers with index (2, N/2, N-3), and the output hidden states
             hidden_states = outputs.hidden_states
-            selected_layer_indices = [2, num_hidden_layers // 2, num_hidden_layers - 3]
+            selected_layer_indices = [
+                2,
+                max(0, num_hidden_layers // 2),
+                max(1, num_hidden_layers - 3),
+            ]
+            selected_layer_indices = sorted(set(selected_layer_indices))
             aux_hidden_states = torch.cat(
                 [hidden_states[i].squeeze(0).cpu() for i in selected_layer_indices], dim=-1
             )
diff --git a/examples/speculative_decoding/collect_hidden_states/run_hf_compute_hiddens.sh b/examples/speculative_decoding/collect_hidden_states/run_hf_compute_hiddens.sh
@@ -17,7 +17,7 @@
 # This script computes hidden states using a Hugging Face model and saves them to
 # the specified output directory.
 
-python3 collect_hidden_states/compute_hiddens_hf.py \
+python3 collect_hidden_states/compute_hidden_states_hf.py \
   --model meta-llama/Llama-3.2-1B-Instruct \
   --input-file synthetic_conversations/daring-anteater.jsonl \
   --output-dir /mnt/md0/eagle-hidden-states/llama1b/daring_anteater/
diff --git a/examples/speculative_decoding/collect_hidden_states/send_conversations_for_hiddens.py b/examples/speculative_decoding/collect_hidden_states/send_conversations_for_hiddens.py
@@ -152,7 +152,7 @@ async def main(args: argparse.Namespace) -> None:
             )
 
         input_ids = tokenizer.apply_chat_template(
-            conversations, return_tensors=None, add_generation_template=False
+            conversations, return_tensors=None, add_generation_template=False, tokenize=True
         )
         num_input_tokens = len(input_ids)
         if num_input_tokens <= 10 or num_input_tokens > args.max_seq_len:
diff --git a/examples/speculative_decoding/prepare_input_conversations/example_make_prompt_dataset.sh b/examples/speculative_decoding/prepare_input_conversations/example_make_prompt_dataset.sh
@@ -16,10 +16,10 @@
 # Example script to prepare a dataset of prompts for generation
 # Lines in this script can be uncommented to include specific datasets/splits in the prompt dataset.
 
-python3 make_prompts_for_gen/add_sharegpt.py --output-split eval --output-file data/mtbench_prompts_dataset.json
-# python3 make_prompts_for_gen/add_ultrachat.py --ultrachat-split train_sft --output-split train
-# python3 make_prompts_for_gen/add_ultrachat.py --ultrachat-split train_gen --output-split train
-# python3 make_prompts_for_gen/add_ultrachat.py --ultrachat-split test_sft --output-split mix_test
-# python3 make_prompts_for_gen/add_ultrachat.py --ultrachat-split test_gen --output-split mix_test
-python3 make_prompts_for_gen/add_mtbench.py --output-split train --output-file data/mtbench_prompts_dataset.json
-# python3 make_prompts_for_gen/add_mtbench.py --output-split eval --output-file data/mtbench_prompts_dataset.json
+python3 prepare_input_conversations/add_daring_anteater.py --output-split-name train
+# python3 prepare_input_conversations/add_sharegpt.py --output-split-name train
+# python3 prepare_input_conversations/add_ultrachat.py --ultrachat-split train_sft --output-split-name train
+# python3 prepare_input_conversations/add_ultrachat.py --ultrachat-split train_gen --output-split-name train
+# python3 prepare_input_conversations/add_ultrachat.py --ultrachat-split test_sft --output-split-name mix_test
+# python3 prepare_input_conversations/add_ultrachat.py --ultrachat-split test_gen --output-split-name mix_test
+python3 prepare_input_conversations/add_mtbench.py --output-split-name mix_test
diff --git a/examples/speculative_decoding/prepare_input_conversations/utils.py b/examples/speculative_decoding/prepare_input_conversations/utils.py
@@ -25,6 +25,7 @@
 
 async def download_file(url: str, destination: Path) -> None:
     """Download a file from a URL to a specified destination."""
+    destination.parent.mkdir(parents=True, exist_ok=True)
     async with aiohttp.ClientSession() as session, session.get(url) as response:
         if response.status != 200:
             msg = f"Failed to download {url}: {response.status}"
@@ -83,7 +84,8 @@ def add_conversations_to_split(conversations: list, dataset_dir: Path, split: st
     else:
         print(f"Added {num_new_entries} new conversations to {dataset_file}.")
 
-    with open(dataset_file, "w", encoding="utf-8") as f:
+    dataset_dir.mkdir(parents=True, exist_ok=True)
+    with dataset_file.open("w", encoding="utf-8") as f:
         for entry in all_conversations:
             f.write(json.dumps(entry, ensure_ascii=False) + "\n")
 
diff --git a/modelopt/torch/speculative/plugins/transformers.py b/modelopt/torch/speculative/plugins/transformers.py
@@ -336,10 +336,10 @@ class HFEagleModel(EagleModel):
 
     def _set_default_aux_hidden_state_layers(self):
         # Read a custom config attribute since we override num_hidden_layers for offline training
-        if self.eagle_offline:
-            num_layers = self.config.num_orig_hidden_layers
-        else:
-            num_layers = self.config.num_hidden_layers
+        num_layers = self.config.num_hidden_layers
+        if self.eagle_offline and (num_layers is None or num_layers <= 0):
+            num_layers = getattr(self.config, "num_orig_hidden_layers", 0)
+
         self.eagle_config.eagle_aux_hidden_state_layer_ids = [
             1,
             max(0, num_layers // 2 - 1),

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ async def main(args: argparse.Namespace) -> None:`
`152`	`152`	`)`
`153`	`153`
`154`	`154`	`input_ids = tokenizer.apply_chat_template(`
`155`		`- conversations, return_tensors=None, add_generation_template=False`
	`155`	`+ conversations, return_tensors=None, add_generation_template=False, tokenize=True`
`156`	`156`	`)`
`157`	`157`	`num_input_tokens = len(input_ids)`
`158`	`158`	`if num_input_tokens <= 10 or num_input_tokens > args.max_seq_len:`