1 file changed: +9 −8 lines changed
lines changed Original file line number Diff line number Diff line change 1010
1111MIN_CHAR = 10
1212MAX_CHAR = 1000
# Tokenizer shared by every estimate_num_tokens() call; loaded on first use.
_tokenizer = None


def estimate_num_tokens(text: str) -> int:
    """Return the number of tokens the tokenizer produces for ``text``.

    The "mistralai/Mistral-7B-Instruct-v0.2" tokenizer is loaded on the
    first call and stored in the module-level ``_tokenizer`` so later calls
    reuse it instead of loading it again.

    Args:
        text: The string to tokenize.

    Returns:
        The length of the token list returned by ``tokenize(text)``.
    """
    # ``global`` is required: a bare ``_tokenizer: AutoTokenizer`` annotation
    # would make the name local to this function, so the ``is None`` check
    # below would raise UnboundLocalError and the cache would never be used.
    global _tokenizer
    if _tokenizer is None:
        # Set before the tokenizer is created, as the previous nested
        # implementation did (presumably to silence the tokenizers
        # fork/parallelism warning -- confirm).
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        _tokenizer = AutoTokenizer.from_pretrained(
            "mistralai/Mistral-7B-Instruct-v0.2"
        )
    # Count tokens via ``tokenize``. ``len(_tokenizer(text))`` would measure
    # the returned encoding mapping (its number of fields), not the tokens.
    return len(_tokenizer.tokenize(text))
1322
1423
1524def extract_and_save_with_filtering (file ):
@@ -63,14 +72,6 @@ def extract_and_save_with_filtering(file):
6372 with Path (sharegpt_file ).open ("r" , encoding = "utf-8" ) as file :
6473 data = json .load (file )
6574
66- def estimate_num_tokens (text : str ) -> int :
67- if not hasattr (estimate_num_tokens , "tokenizer" ):
68- os .environ ["TOKENIZERS_PARALLELISM" ] = "false"
69- estimate_num_tokens .tokenizer = AutoTokenizer .from_pretrained (
70- "mistralai/Mistral-7B-Instruct-v0.2"
71- )
72- return len (estimate_num_tokens .tokenizer .tokenize (text ))
73-
7475 num_of_ids = len (data )
7576 data = data [: int (num_of_ids * args .parse )]
7677 for d in data :
You can’t perform that action at this time.
0 commit comments