From 6441dd9125e9dd78f26c734ff21e935561dfca12 Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Mon, 7 Apr 2025 16:38:42 +0530
Subject: [PATCH 1/2] Fix tokenizer.encode() to respect add_special_tokens=False parameter

---
 olmo/tokenizer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index 7f5026302..2bab7a6bc 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import inspect
 from pathlib import Path
 from typing import List, Optional, Union
 
@@ -180,7 +181,12 @@ def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> Li
         if truncate_to is not None and add_special_tokens:
             truncate_to -= self.num_special_tokens_to_add(False)
 
-        batch_encoding = self.base_tokenizer.encode_batch(inputs)
+        # Check if the base tokenizer's encode_batch method supports add_special_tokens parameter
+        if 'add_special_tokens' in inspect.signature(self.base_tokenizer.encode_batch).parameters:
+            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=False)
+        else:
+            # Fallback to original behavior if the parameter isn't supported
+            batch_encoding = self.base_tokenizer.encode_batch(inputs)
 
         all_input_ids = []
         for encoding in batch_encoding:

From 357e140ef6cd8c5b5f5fe39b3973f191af29177c Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Thu, 15 May 2025 11:38:23 +0530
Subject: [PATCH 2/2] required change

---
 olmo/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index 2bab7a6bc..885594999 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -183,7 +183,7 @@ def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> Li
 
         # Check if the base tokenizer's encode_batch method supports add_special_tokens parameter
         if 'add_special_tokens' in inspect.signature(self.base_tokenizer.encode_batch).parameters:
-            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=False)
+            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=add_special_tokens)
         else:
             # Fallback to original behavior if the parameter isn't supported
             batch_encoding = self.base_tokenizer.encode_batch(inputs)