schema/py/litertlm_builder.py (2 additions, 0 deletions)

@@ -509,6 +509,8 @@ def add_hf_tokenizer(
def read_and_compress(path: str) -> bytes:
  with litertlm_core.open_file(path, "rb") as f:
    content = f.read()
  if path.endswith(".zlib"):
    # Input is already zlib-compressed; store it as-is (no re-compression,
    # no size prefix).
    return content
  uncompressed_size = len(content)
  compressed_content = zlib.compress(content)
  return uncompressed_size.to_bytes(8, "little") + compressed_content
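For reference, a non-.zlib input is stored as an 8-byte little-endian
uncompressed size followed by the zlib stream. A minimal read-side sketch of
the inverse operation (the helper name read_and_decompress is hypothetical,
not part of the builder):

  import zlib

  def read_and_decompress(payload: bytes) -> bytes:
    # First 8 bytes: the uncompressed size, little-endian, as written by
    # read_and_compress.
    uncompressed_size = int.from_bytes(payload[:8], "little")
    # The rest is the zlib stream; zlib.decompress stops at end of stream,
    # so any trailing section padding is ignored.
    content = zlib.decompress(payload[8:])
    assert len(content) == uncompressed_size
    return content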
schema/py/litertlm_builder_test.py (46 additions, 1 deletion)

@@ -15,6 +15,7 @@
import io
import os
import pathlib
import zlib
from absl.testing import absltest
from absl.testing import parameterized
from google.protobuf import text_format
@@ -304,7 +305,8 @@ def test_add_sentencepiece_tokenizer(self):

  def test_add_hf_tokenizer(self):
    """Tests that a HuggingFace tokenizer can be added correctly."""
    hf_content = b'{"version": "1.0"}'
    hf_path = self._create_dummy_file("tokenizer.json", hf_content)
    additional_metadata = [
        litertlm_builder.Metadata(
            key="test_key",
@@ -320,6 +322,49 @@ def test_add_hf_tokenizer(self):
    self.assertIn("Data Type: HF_Tokenizer_Zlib", ss)
    self.assertIn("Key: test_key, Value (String): test_value", ss)

    # Verify content compression.
    with litertlm_core.open_file(
        os.path.join(self.temp_dir, "litertlm.litertlm"), "rb"
    ) as f:
      f.seek(litertlm_core.BLOCK_SIZE)
      # Read the uncompressed size (8 bytes, little-endian).
      uncompressed_size = int.from_bytes(f.read(8), "little")
      self.assertLen(hf_content, uncompressed_size)
      # Read the remaining (compressed) data.
      compressed_data = f.read()
      # Decompress and verify. zlib.decompress stops at the end of the
      # stream, ignoring any block padding that follows.
      decompressed = zlib.decompress(compressed_data)
      self.assertEqual(decompressed, hf_content)

  def test_add_hf_tokenizer_zlib(self):
    """Tests that a zlib-compressed HuggingFace tokenizer is handled correctly."""
    zlib_content = b"dummy zlib content"
    hf_path = self._create_dummy_file("tokenizer.zlib", zlib_content)
    additional_metadata = [
        litertlm_builder.Metadata(
            key="test_key",
            value="test_value",
            dtype=litertlm_builder.DType.STRING,
        )
    ]
    builder = litertlm_builder.LitertLmFileBuilder()
    self._add_system_metadata(builder)
    builder.add_hf_tokenizer(hf_path, additional_metadata=additional_metadata)
    ss = self._build_and_read_litertlm(builder)
    self.assertIn("Sections (1)", ss)
    self.assertIn("Data Type: HF_Tokenizer_Zlib", ss)
    self.assertIn("Key: test_key, Value (String): test_value", ss)

    # Verify the content is stored raw (not re-compressed, no size prefix).
    with litertlm_core.open_file(
        os.path.join(self.temp_dir, "litertlm.litertlm"), "rb"
    ) as f:
      f.seek(litertlm_core.BLOCK_SIZE)
      # The section payload should match the input bytes exactly.
      read_content = f.read(len(zlib_content))
      self.assertEqual(read_content, zlib_content)

  def test_add_tokenizer_already_added(self):
    """Tests that adding a tokenizer more than once raises an AssertionError."""
    sp_path = self._create_dummy_file("sp.model", b"")
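The passthrough test above only checks that the bytes are stored verbatim;
the builder keys off the .zlib filename suffix and does not validate the
contents. Assuming a pre-compressed input is expected to match the same
layout read_and_compress produces for plain files (an assumption, not
something this diff states), one could prepare such a file offline as
follows; the file names are illustrative:

  import zlib

  with open("tokenizer.json", "rb") as f:
    content = f.read()
  # Same layout read_and_compress writes: 8-byte LE size + zlib stream.
  payload = len(content).to_bytes(8, "little") + zlib.compress(content)
  with open("tokenizer.zlib", "wb") as f:
    f.write(payload)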
schema/py/litertlm_peek.py (1 addition, 1 deletion)

@@ -306,7 +306,7 @@ def _get_generic_section_file_extension(data_type_str: str) -> str:
  if data_type_str == "SP_Tokenizer":
    return ".spiece"
  elif data_type_str == "HF_Tokenizer_Zlib":
-   return ".json"
+   return ".zlib"
  else:
    return ".bin"

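The extension fix matters because litertlm_peek presumably dumps the section
payload as-is: for an HF_Tokenizer_Zlib section that payload is the
size-prefixed zlib stream, not JSON, so .zlib is the accurate suffix. To
recover the original tokenizer.json from a dumped section, something like
the read_and_decompress sketch above should work, assuming the section was
built from a plain tokenizer.json input; file names are again illustrative:

  with open("section_0.zlib", "rb") as f:  # hypothetical peek output name
    payload = f.read()
  with open("tokenizer.json", "wb") as f:
    f.write(read_and_decompress(payload))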