From 6441dd9125e9dd78f26c734ff21e935561dfca12 Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Mon, 7 Apr 2025 16:38:42 +0530
Subject: [PATCH 1/2] Fix tokenizer.encode() to respect add_special_tokens=False parameter

---
 olmo/tokenizer.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index 7f5026302..2bab7a6bc 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import inspect
 from pathlib import Path
 from typing import List, Optional, Union
 
@@ -180,7 +181,12 @@ def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> Li
         if truncate_to is not None and add_special_tokens:
             truncate_to -= self.num_special_tokens_to_add(False)
 
-        batch_encoding = self.base_tokenizer.encode_batch(inputs)
+        # Check if the base tokenizer's encode_batch method supports add_special_tokens parameter
+        if 'add_special_tokens' in inspect.signature(self.base_tokenizer.encode_batch).parameters:
+            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=False)
+        else:
+            # Fallback to original behavior if the parameter isn't supported
+            batch_encoding = self.base_tokenizer.encode_batch(inputs)
 
         all_input_ids = []
         for encoding in batch_encoding:

From 357e140ef6cd8c5b5f5fe39b3973f191af29177c Mon Sep 17 00:00:00 2001
From: V-E-D
Date: Thu, 15 May 2025 11:38:23 +0530
Subject: [PATCH 2/2] required change

---
 olmo/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/olmo/tokenizer.py b/olmo/tokenizer.py
index 2bab7a6bc..885594999 100644
--- a/olmo/tokenizer.py
+++ b/olmo/tokenizer.py
@@ -183,7 +183,7 @@ def encode_batch(self, inputs: List[str], add_special_tokens: bool = True) -> Li
 
         # Check if the base tokenizer's encode_batch method supports add_special_tokens parameter
         if 'add_special_tokens' in inspect.signature(self.base_tokenizer.encode_batch).parameters:
-            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=False)
+            batch_encoding = self.base_tokenizer.encode_batch(inputs, add_special_tokens=add_special_tokens)
         else:
             # Fallback to original behavior if the parameter isn't supported
             batch_encoding = self.base_tokenizer.encode_batch(inputs)