@@ -3,22 +3,43 @@
 import os
 import re
 from pathlib import Path
+from typing import Callable, Optional

 import numpy as np
 from datasets import load_dataset
 from transformers import AutoTokenizer

 MIN_CHAR = 10
 MAX_CHAR = 1000
-_tokenizer = None


-def estimate_num_tokens(text: str) -> int:
-    _tokenizer: AutoTokenizer
-    if _tokenizer is None:
-        os.environ["TOKENIZERS_PARALLELISM"] = "false"
-        _tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
-    return len(_tokenizer(text, return_tensors=None))
+def create_token_estimator(
+    model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
+) -> Callable[[str], int]:
+    _tokenizer: Optional[AutoTokenizer] = None
+
+    def initialize() -> None:
+        nonlocal _tokenizer
+        if _tokenizer is None:
+            os.environ["TOKENIZERS_PARALLELISM"] = "false"
+            try:
+                _tokenizer = AutoTokenizer.from_pretrained(model_name)
+            except (OSError, ImportError, ValueError) as e:
+                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e
+
+    def estimate_num_tokens(text: str) -> int:
+        initialize()
+
+        if _tokenizer is None:
+            return 0
+
+        try:
+            encoding = _tokenizer(text, return_tensors=None)
+            return len(encoding["input_ids"])
+        except (AttributeError, TypeError, RuntimeError) as e:
+            raise ValueError(f"Error processing text: {e}") from e
+
+    return estimate_num_tokens


 def extract_and_save_with_filtering(file):
@@ -72,6 +93,7 @@ def extract_and_save_with_filtering(file):
     with Path(sharegpt_file).open("r", encoding="utf-8") as file:
         data = json.load(file)

+    estimate_tokens = create_token_estimator()
     num_of_ids = len(data)
     data = data[: int(num_of_ids * args.parse)]
     for d in data:
@@ -80,9 +102,9 @@ def extract_and_save_with_filtering(file):
         gpt_tokens = []
         for conv in d["conversations"]:
             if conv["from"] == "human":
-                human_tokens.append(estimate_num_tokens(conv["value"]))
+                human_tokens.append(estimate_tokens(conv["value"]))
             if conv["from"] == "gpt":
-                token_number = estimate_num_tokens(conv["value"])
+                token_number = estimate_tokens(conv["value"])
                 conv["num_tokens"] = token_number
                 gpt_tokens.append(token_number)
         if len(human_tokens) == 0:
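A minimal usage sketch of the refactored helper, assuming the hunks above are applied. The model name is just the factory's default from the diff; loading it from the Hugging Face Hub may require network access and credentials, and any other tokenizer repo id could be passed instead.

```python
# Hypothetical usage of create_token_estimator() from the diff above.
estimate_tokens = create_token_estimator()

# The tokenizer is loaded lazily on the first call and cached in the
# closure, so repeated calls pay the initialization cost only once.
n = estimate_tokens("How many tokens is this sentence?")
print(n)
```

Keeping the tokenizer in a closure rather than a module-level global also lets callers build independent estimators for different models.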