File tree Expand file tree Collapse file tree 1 file changed +6
-4
lines changed
Expand file tree Collapse file tree 1 file changed +6
-4
def tokenize(
    examples: Dict[str, List[Any]],
    tokenizer: 'PreTrainedTokenizer',
) -> Dict[str, List[Any]]:
    """Tokenize a batch of text samples and report a bits-per-token ratio.

    The samples are read from the ``'text'`` column of ``examples`` if it is
    present, otherwise from the ``'content'`` column.

    Args:
        examples: Batch mapping column names to lists of values. Must contain
            a ``'text'`` or ``'content'`` key whose value is a list of strings.
        tokenizer: Callable invoked as ``tokenizer(samples)``; the result is
            indexed with ``'input_ids'`` and is expected to yield one token-id
            sequence per sample.

    Returns:
        A dict with two parallel lists:
        ``'input_ids'`` - the token-id sequences returned by the tokenizer;
        ``'bits_per_token'`` - for each sample, the size of its UTF-8 encoding
        in bits divided by its number of tokens (``0.0`` when the sample
        produced no tokens).

    Raises:
        ValueError: If neither ``'text'`` nor ``'content'`` is in ``examples``.
    """
    if 'text' in examples:
        samples = examples['text']
    elif 'content' in examples:
        samples = examples['content']
    else:
        raise ValueError(f'No "text" or "content" field found in examples:\n{examples}')

    input_ids = tokenizer(samples)['input_ids']

    # A sample can legitimately tokenize to zero tokens (e.g. an empty string
    # with a tokenizer that adds no special tokens). The ratio is undefined in
    # that case, so report 0.0 instead of letting ZeroDivisionError abort the
    # whole batch.
    bits_per_token = [
        len(sample.encode(encoding='utf-8')) * 8 / len(ids) if len(ids) > 0 else 0.0
        for sample, ids in zip(samples, input_ids)
    ]
    return {'input_ids': input_ids, 'bits_per_token': bits_per_token}
2426
2527
2628if __name__ == '__main__' :
You can’t perform that action at this time.
0 commit comments