sgl-project
diff --git a/benchmarks/bench_eagle3.py b/benchmarks/bench_eagle3.py
Lines changed: 2 additions & 0 deletions
diff --git a/scripts/prepare_hidden_states.py b/scripts/prepare_hidden_states.py
Lines changed: 61 additions & 13 deletions
diff --git a/scripts/regenerate_train_data.py b/scripts/regenerate_train_data.py
Lines changed: 53 additions & 9 deletions
@@ -201,7 +201,9 @@ def main():
     assert len(benchmark_list) != 0, "the number of benchmark list is 0"
 
     base_url = f"http://localhost:{args.port}"
+
     results = {}
+    results["model"] = server_args.speculative_draft_model_path
 
     def run_benchmarks(batch_size: int, steps: int, topk: int, num_draft_tokens: int):
         for benchmark_name, num_prompts, subset in benchmark_list:
 
@@ -34,6 +34,7 @@
 
 import argparse
 import gc
+import gzip
 import hashlib
 import os
 from concurrent.futures import ThreadPoolExecutor
@@ -46,7 +47,7 @@
 from tqdm import tqdm
 from transformers import AutoConfig, AutoProcessor, AutoTokenizer
 
-from datasets import load_dataset
+from datasets import Dataset
 from specforge.args import SGLangBackendArgs
 from specforge.data import build_eagle3_dataset, prepare_dp_dataloaders
 from specforge.distributed import (
@@ -57,7 +58,12 @@
     is_tp_rank_0,
 )
 from specforge.modeling.target import Eagle3TargetModel, get_eagle3_target_model
-from specforge.utils import print_with_rank, rank_0_priority
+from specforge.utils import (
+    print_args_with_dots,
+    print_with_rank,
+    rank_0_priority,
+    safe_conversations_generator,
+)
 
 
 @dataclass
@@ -119,8 +125,8 @@ def parse_args():
     others_group.add_argument(
         "--num-io-threads",
         type=int,
-        default=4,
-        help="Number of threads for async I/O operations",
+        default=None,
+        help="Number of threads for async I/O operations (default: all of CPU cores).",
     )
     others_group.add_argument(
         "--num-workers", type=int, default=4, help="Number of workers for DataLoader"
@@ -137,6 +143,17 @@ def parse_args():
         default=2000,
         help="Number of files per subdirectory.",
     )
+    others_group.add_argument(
+        "--compress",
+        action="store_true",
+        help="Compress hidden state files on disk (gzip).",
+    )
+    others_group.add_argument(
+        "--compression-level",
+        type=int,
+        default=6,
+        help="Gzip compression level (1-9).",
+    )
 
     sglang_group = parser.add_argument_group("sglang")
     SGLangBackendArgs.add_args(sglang_group)
@@ -211,6 +228,8 @@ def __init__(
         num_io_threads: int = 4,
         io_queue_size: int = 50,
         file_group_size: int = 2000,
+        compress: bool = False,
+        compression_level: int = 6,
     ):
         """
         Args:
@@ -227,6 +246,9 @@ def __init__(
         self.num_io_threads = num_io_threads
         self.io_queue_size = io_queue_size
         self.file_group_size = file_group_size
+        self.compress = compress
+        self.compression_level = compression_level
+        self.file_extension = ".ckpt.gz" if self.compress else ".ckpt"
 
         # The progress bar should only be shown on TP rank 0
         self.show_progress = dist.get_rank(get_tp_group()) == 0
@@ -278,7 +300,13 @@ def _save_tensor_sync(self, data_point: DataPoint, output_file: str) -> None:
             )
             return
 
-        torch.save(asdict(data_point), output_file)
+        if self.compress:
+            with gzip.open(
+                output_file, "wb", compresslevel=self.compression_level
+            ) as f:
+                torch.save(asdict(data_point), f)
+        else:
+            torch.save(asdict(data_point), output_file)
 
     def _save_tensor_async(self, data_point: DataPoint, output_file: str) -> None:
         """
@@ -361,14 +389,22 @@ def _check_existing_files_batch(
             return [False] * len(global_indices)
 
         def check_single_file(idx):
-            return os.path.exists(self._get_file_path(output_path, idx))
+            if os.path.exists(self._get_file_path(output_path, idx)):
+                return True
+            legacy_ckpt = self._get_file_path(output_path, idx, extension=".ckpt")
+            compressed_ckpt = self._get_file_path(
+                output_path, idx, extension=".ckpt.gz"
+            )
+            return os.path.exists(legacy_ckpt) or os.path.exists(compressed_ckpt)
 
         # Parallel file existence check
         with ThreadPoolExecutor(max_workers=self.num_io_threads) as executor:
             exists = list(executor.map(check_single_file, global_indices))
         return exists
 
-    def _get_file_path(self, output_path: str, idx: int) -> str:
+    def _get_file_path(
+        self, output_path: str, idx: int, extension: Optional[str] = None
+    ) -> str:
         """
         A helper function to get the standard file path for the data point with the given index.
 
@@ -379,9 +415,10 @@ def _get_file_path(self, output_path: str, idx: int) -> str:
         Returns:
             str: The file path for the data point.
         """
+        ext = self.file_extension if extension is None else extension
         group_idx = (idx // self.file_group_size) * self.file_group_size
         grouped_subdir = f"rows_{group_idx}-{group_idx + self.file_group_size}"
-        return os.path.join(output_path, grouped_subdir, f"data_{idx}.ckpt")
+        return os.path.join(output_path, grouped_subdir, f"data_{idx}{ext}")
 
     @torch.no_grad()
     def generate(
@@ -469,7 +506,6 @@ def generate(
             filtered_batch_gpu = {
                 k: v.cuda(non_blocking=True) for k, v in filtered_batch.items()
             }
-
             _, _, aux_hidden_states_list, last_hidden_states_list = self.model.extend(
                 **filtered_batch_gpu,
                 return_last_hidden_states=True,
@@ -550,9 +586,12 @@ def main():
         args.aux_hidden_states_layers = [
             int(x) for x in args.aux_hidden_states_layers.split(",")
         ]
-
+    if args.num_io_threads is None:
+        cpu_cores = os.cpu_count() or 1
+        args.num_io_threads = max(1, cpu_cores)
     # Initialize distributed environment (TP + DP)
     init_distributed(timeout=args.dist_timeout, tp_size=args.tp_size)
+    print_args_with_dots(args)
 
     # Build target model (with TP)
     target_model_config = AutoConfig.from_pretrained(
@@ -574,10 +613,17 @@ def main():
     assert os.path.exists(
         args.data_path
     ), f"Dataset path {args.data_path} does not exist"
-    dataset = load_dataset("json", data_files=args.data_path)["train"]
+    dataset = Dataset.from_generator(
+        generator=safe_conversations_generator,
+        gen_kwargs={"file_path": args.data_path},
+        cache_dir=os.path.join(
+            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
+            "cache",
+            "hf_dataset",
+        ),
+    )
     if args.num_samples is not None:
         dataset = dataset.select(range(args.num_samples))
-
     # Tokenizer and cache key
     tokenizer = AutoTokenizer.from_pretrained(
         args.target_model_path, trust_remote_code=True
@@ -643,10 +689,12 @@ def main():
         # Pass configurable arguments from args if needed
         with HiddenStatesGenerator(
             target_model,
-            args.enable_aux_hidden_states,
+            enable_aux_hidden_states=args.enable_aux_hidden_states,
             num_io_threads=args.num_io_threads,
             io_queue_size=args.io_queue_size,
             file_group_size=args.file_group_size,
+            compress=args.compress,
+            compression_level=args.compression_level,
             # Every keyword above is read from args; further constructor params can be wired through argparse the same way
         ) as hidden_states_generator:
 
 
@@ -1,4 +1,4 @@
-"""
+"""
 This script will re-generate the dataset from target model,
 which better aligns the draft model with the target model’s output distribution.
 
@@ -29,6 +29,7 @@
 
 import argparse
 import json
+import os
 import random
 from concurrent.futures import ThreadPoolExecutor
 from typing import Any, Dict, List
@@ -113,6 +114,11 @@ def parse_arguments():
         default=None,
         help="The number of samples to regenerate, if not provided, all samples will be regenerated",
     )
+    data_group.add_argument(
+        "--resume",
+        action="store_true",
+        help="Resume from existing output file, skip already processed samples",
+    )
 
     # sglang server
     server_group = parser.add_argument_group("sglang server")
@@ -252,9 +258,29 @@ def main():
     print(f"  API URL: {args.server_address}")
     print(f"  Input file: {args.input_file_path}")
     print(f"  Output file: {args.output_file_path}")
+    print(f"  Resume mode: {args.resume}")
     print("-" * 50)
     total_lines = sum(1 for _ in open(args.input_file_path))
 
+    skip_lines = 0
+    error_file_path = args.output_file_path.replace(".jsonl", "_error.jsonl")
+
+    if args.resume and os.path.exists(args.output_file_path):
+        existing_success = sum(1 for _ in open(args.output_file_path))
+        existing_error = 0
+        if os.path.exists(error_file_path):
+            existing_error = sum(1 for _ in open(error_file_path))
+        skip_lines = existing_success + existing_error
+        print(f"Resume mode enabled:")
+        print(f"  Found {existing_success} successful samples in output file")
+        print(f"  Found {existing_error} error samples in error file")
+        print(f"  Skipping first {skip_lines} input samples")
+        print("-" * 50)
+
+        if skip_lines >= total_lines:
+            print(f"All {total_lines} samples already processed. Nothing to do.")
+            return
+
     # test all server addresses
     valid_server_addresses = []
     for server_address in args.server_address:
@@ -279,11 +305,14 @@ def main():
     )
     print("-" * 50)
 
-    # create error file path if not exists
-    error_file_path = args.output_file_path.replace(".jsonl", "_error.jsonl")
+    # Determine file open mode based on resume flag
+    file_mode = "a" if (args.resume and skip_lines > 0) else "w"
     print(
         f"Regenerating dataset and saving the output to {args.output_file_path} and error log to {error_file_path}"
     )
+    print(
+        f"File open mode: {file_mode} ({'append' if file_mode == 'a' else 'overwrite'})"
+    )
     print("-" * 50)
     context_token_sum = 0
     context_token_min = None
@@ -294,18 +323,24 @@ def main():
     # Create progress bar
     with (
         open(args.input_file_path, "r") as input_file,
-        open(args.output_file_path, "w") as output_file_handle,
-        open(error_file_path, "w") as error_file_handle,
+        open(args.output_file_path, file_mode) as output_file_handle,
+        open(error_file_path, file_mode) as error_file_handle,
     ):
         executor = ThreadPoolExecutor(
             max_workers=args.concurrency * len(valid_server_addresses)
         )
         waiting_queue = {
             server_address: [] for server_address in valid_server_addresses
         }
-        pbar = tqdm(total=total_lines, desc="Processing")
+        pbar = tqdm(total=total_lines, desc="Processing", initial=skip_lines)
         start_server_index = 0
 
+        if skip_lines > 0:
+            print(f"Skipping {skip_lines} already processed samples...")
+            for _ in range(skip_lines):
+                next(input_file, None)
+            print(f"Resuming from sample {skip_lines + 1}")
+
         for line in input_file:
             if (
                 args.num_samples is not None
@@ -398,9 +433,18 @@ def main():
     else:
         print("No successful examples to compute context length statistics.")
 
-    print(
-        f"\nProcessing completed! {success_samples} samples regenerated, {error_samples} samples failed."
-    )
+    total_processed = success_samples + error_samples
+    if skip_lines > 0:
+        print(f"\nResume processing completed!")
+        print(f"  Previously processed: {skip_lines}")
+        print(
+            f"  Newly processed: {total_processed} ({success_samples} success, {error_samples} failed)"
+        )
+        print(f"  Total: {skip_lines + total_processed}")
+    else:
+        print(
+            f"\nProcessing completed! {success_samples} samples regenerated, {error_samples} samples failed."
+        )
 
 
 if __name__ == "__main__":