@@ -114,7 +114,7 @@ def __init__(
         # however the requests can be sent as a string if the API doesn't support token inputs.
         # use tokenized_requests=False
         tokenizer_backend: Optional[
-            Literal["tiktoken", "huggingface", "None", "none"]
+            Literal["tiktoken", "huggingface", "remote", "None", "none"]
         ] = "huggingface",
         truncate: bool = False,
         # number of concurrent requests. More useful if not batching
@@ -132,6 +132,8 @@ def __init__(
         revision: Optional[str] = "main",
         use_fast_tokenizer: bool = True,
         verify_certificate: bool = True,
+        ca_cert_path: Optional[str] = None,
+        auth_token: Optional[str] = None,
         eos_string: str = None,
         # timeout in seconds
         timeout: int = 300,
@@ -182,6 +184,8 @@ def __init__(
         self.tokenized_requests = tokenized_requests
         self.max_retries = int(max_retries)
         self.verify_certificate = verify_certificate
+        self.ca_cert_path = ca_cert_path
+        self.auth_token = auth_token
         self._eos_string = eos_string
         self.timeout = int(timeout)
         self.max_images = int(max_images)
@@ -218,6 +222,21 @@ def __init__(
218222 f"Passed `base_url={ self .base_url } ` but using (OpenAI) Tiktoken tokenizer backend. "
219223 "Pass `tokenizer_backend=huggingface` and provide the HF tokenizer name if your model does not use Tiktoken."
220224 )
225+ elif self .tokenizer_backend == "remote" :
226+ from lm_eval .utils import RemoteTokenizer
227+
228+ if not self .base_url :
229+ raise ValueError (
230+ "base_url is required for remote tokenizer backend"
231+ )
232+ self .tokenizer = RemoteTokenizer (
233+ self .base_url ,
234+ self .timeout ,
235+ self .verify_certificate ,
236+ self .ca_cert_path ,
237+ self .auth_token ,
238+ )
239+ eval_logger .info (f"Using remote tokenizer from { self .base_url } " )
221240 else :
222241 import transformers
223242
@@ -310,7 +329,7 @@ def tokenizer_name(self) -> str:
 
     def apply_chat_template(
         self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
-    ) -> Union[str, JsonChatStr]:
+    ) -> Union[str, JsonChatStr, List[Dict]]:
         """Applies a chat template to a list of chat history between user and model."""
         if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
             return self.tokenizer.apply_chat_template(
@@ -319,6 +338,8 @@ def apply_chat_template(
                 add_generation_prompt=add_generation_prompt,
                 continue_final_message=not add_generation_prompt,
             )
+        elif self.tokenizer_backend == "remote" and self.tokenized_requests:
+            return chat_history
         else:
             # bit of a hack. We'll load back before sending to the API
             return JsonChatStr(
@@ -337,6 +358,8 @@ def eot_token_id(self) -> Optional[int]:
             return self.tokenizer.eos_token_id
         elif self.tokenizer_backend == "tiktoken":
             return self.tokenizer.eot_token
+        elif self.tokenizer_backend == "remote":
+            return self.tokenizer.eos_token_id
 
     @cached_property
     def eos_string(self) -> Optional[str]:
@@ -347,6 +370,8 @@ def eos_string(self) -> Optional[str]:
             return self.tokenizer.eos_token
         elif self.tokenizer_backend == "tiktoken":
             return self.tokenizer.decode([self.tokenizer.eot_token])
+        elif self.tokenizer_backend == "remote":
+            return self.tokenizer.eos_token
         else:
             eval_logger.warning(
                 "Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
@@ -364,6 +389,8 @@ def prefix_token_id(self) -> Optional[int]:
             if self.tokenizer.bos_token_id is not None:
                 return self.tokenizer.bos_token_id
             return self.tokenizer.eos_token_id
+        elif self.tokenizer_backend == "remote":
+            return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
         else:
             return self.tokenizer.eot_token
 
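One subtlety in the remote branch: unlike the explicit `is not None` check in the huggingface branch above it, `or` also falls through to `eos_token_id` when `bos_token_id` is `0`, which is a valid token id in some vocabularies. A two-line demonstration:

bos_token_id = 0  # hypothetical model whose BOS id is 0
print(bos_token_id or 42)  # prints 42: `or` treats 0 the same as None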
@@ -396,7 +423,19 @@ def tok_encode(
                     encoding = encoding[-left_truncate_len:]
 
             return encoding
+        elif self.tokenizer_backend == "remote":
+            if isinstance(string, str):
+                encoding = self.tokenizer.encode(string)
+            else:
+                encoding = [self.tokenizer.encode(s) for s in string]
 
+            if left_truncate_len:
+                if isinstance(string, str):
+                    encoding = encoding[-left_truncate_len:]
+                else:
+                    encoding = [enc[-left_truncate_len:] for enc in encoding]
+
+            return encoding
         else:
             try:
                 encoding = self.tokenizer.encode(string)
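The remote branch mirrors the huggingface left-truncation semantics for both input shapes. Assuming `lm` is an instance configured with `tokenizer_backend=remote` (hypothetical values):

ids = lm.tok_encode("a long prompt", left_truncate_len=512)    # List[int]
batch = lm.tok_encode(["p1", "p2"], left_truncate_len=512)     # List[List[int]]
# In both cases only the last 512 tokens of each encoding are kept.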
@@ -409,6 +448,8 @@ def decode_batch(self, tokens: List[List[int]]) -> List[str]:
             return self.tokenizer.batch_decode(tokens)
         elif self.tokenizer_backend == "tiktoken":
             return self.tokenizer.decode_batch(tokens)
+        elif self.tokenizer_backend == "remote":
+            return self.tokenizer.batch_decode(tokens)
 
     def model_call(
         self,
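Putting the new arguments together: keys in `model_args` map one-to-one onto the `__init__` kwargs added above. A hypothetical invocation sketch (the model name, URL, certificate path, and token are placeholders, not values from the PR):

import lm_eval

results = lm_eval.simple_evaluate(
    model="local-completions",
    model_args=(
        "base_url=https://my-host:8080/v1/completions,"
        "model=my-model,"
        "tokenizer_backend=remote,"
        "ca_cert_path=/etc/ssl/certs/my_ca.pem,"
        "auth_token=MY_TOKEN"
    ),
    tasks=["hellaswag"],
)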