Skip to content

Commit d863ca8

Browse files
committed
update
Signed-off-by: guangli.bao <[email protected]>
1 parent 42e7f4c commit d863ca8

File tree

1 file changed

+9
-8
lines changed

1 file changed

+9
-8
lines changed

src/guidellm/utils/preprocessing_sharegpt_data.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,15 @@
1010

1111
MIN_CHAR = 10
1212
MAX_CHAR = 1000
13+
# Lazily-initialized module-level tokenizer cache so the (expensive)
# Hugging Face model download/load happens at most once per process.
_tokenizer = None


def estimate_num_tokens(text: str) -> int:
    """Return the number of tokens in *text* per the Mistral-7B-Instruct tokenizer.

    The tokenizer is loaded on first use and cached in the module-level
    ``_tokenizer`` so subsequent calls are cheap.

    Args:
        text: The string to tokenize.

    Returns:
        The count of tokens produced by the tokenizer for ``text``.
    """
    # Bug fix: declare the module-level name. Without `global`, the
    # assignment below makes `_tokenizer` function-local and the
    # `is None` check raises UnboundLocalError on every call.
    global _tokenizer
    if _tokenizer is None:
        # Silence the fork/parallelism warning emitted by the tokenizers
        # library when used after process forking.
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        _tokenizer = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.2"
        )
    # Bug fix: calling the tokenizer returns a BatchEncoding mapping, so
    # len(...) on it counts its keys (e.g. input_ids, attention_mask),
    # not tokens. tokenize() yields the actual token list.
    return len(_tokenizer.tokenize(text))
1322

1423

1524
def extract_and_save_with_filtering(file):
@@ -63,14 +72,6 @@ def extract_and_save_with_filtering(file):
6372
with Path(sharegpt_file).open("r", encoding="utf-8") as file:
6473
data = json.load(file)
6574

66-
def estimate_num_tokens(text: str) -> int:
67-
if not hasattr(estimate_num_tokens, "tokenizer"):
68-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
69-
estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
70-
"mistralai/Mistral-7B-Instruct-v0.2"
71-
)
72-
return len(estimate_num_tokens.tokenizer.tokenize(text))
73-
7475
num_of_ids = len(data)
7576
data = data[: int(num_of_ids * args.parse)]
7677
for d in data:

0 commit comments

Comments
 (0)