Skip to content

Commit d863ca8

Browse files
committed
update
Signed-off-by: guangli.bao <[email protected]>
1 parent 42e7f4c commit d863ca8

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

src/guidellm/utils/preprocessing_sharegpt_data.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010

1111
MIN_CHAR = 10
1212
MAX_CHAR = 1000
13+
# Lazily-initialized module-level tokenizer cache so the (expensive)
# Hugging Face model download/load happens at most once per process.
_tokenizer = None


def estimate_num_tokens(text: str) -> int:
    """Return the number of tokens in *text* per the Mistral-7B-Instruct tokenizer.

    The tokenizer is loaded on first use and cached in the module-level
    ``_tokenizer`` so subsequent calls are cheap.

    Args:
        text: The string to tokenize.

    Returns:
        The count of tokens produced by the tokenizer for ``text``.
    """
    # Bug fix: declare the module-level name. Without `global`, the
    # assignment below makes `_tokenizer` function-local and the
    # `is None` check raises UnboundLocalError on every call.
    global _tokenizer
    if _tokenizer is None:
        # Silence the fork/parallelism warning emitted by the tokenizers
        # library when used after process forking.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        _tokenizer = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.2"
        )
    # Bug fix: calling the tokenizer returns a BatchEncoding mapping, so
    # len(...) on it counts its keys (e.g. input_ids, attention_mask),
    # not tokens. tokenize() yields the actual token list.
    return len(_tokenizer.tokenize(text))
1322

1423

1524
def extract_and_save_with_filtering(file):
@@ -63,14 +72,6 @@ def extract_and_save_with_filtering(file):
6372
with Path(sharegpt_file).open("r", encoding="utf-8") as file:
6473
data = json.load(file)
6574

66-
def estimate_num_tokens(text: str) -> int:
67-
if not hasattr(estimate_num_tokens, "tokenizer"):
68-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
69-
estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
70-
"mistralai/Mistral-7B-Instruct-v0.2"
71-
)
72-
return len(estimate_num_tokens.tokenizer.tokenize(text))
73-
7475
num_of_ids = len(data)
7576
data = data[: int(num_of_ids * args.parse)]
7677
for d in data:

0 commit comments

Comments
 (0)