schema/py/litertlm_builder.py (2 additions, 0 deletions)

@@ -509,6 +509,8 @@ def add_hf_tokenizer(
def read_and_compress(path: str) -> bytes:
  with litertlm_core.open_file(path, "rb") as f:
    content = f.read()
  if path.endswith(".zlib"):
    # Input is already zlib-compressed; store it as-is (no re-compression,
    # no size prefix).
    return content
  uncompressed_size = len(content)
  compressed_content = zlib.compress(content)
  return uncompressed_size.to_bytes(8, "little") + compressed_content
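For reference, a non-.zlib input is stored as an 8-byte little-endian
uncompressed size followed by the zlib stream. A minimal read-side sketch of
the inverse operation (the helper name read_and_decompress is hypothetical,
not part of the builder):

  import zlib

  def read_and_decompress(payload: bytes) -> bytes:
    # First 8 bytes: the uncompressed size, little-endian, as written by
    # read_and_compress.
    uncompressed_size = int.from_bytes(payload[:8], "little")
    # The rest is the zlib stream; zlib.decompress stops at end of stream,
    # so any trailing section padding is ignored.
    content = zlib.decompress(payload[8:])
    assert len(content) == uncompressed_size
    return content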
schema/py/litertlm_builder_test.py (46 additions, 1 deletion)

@@ -15,6 +15,7 @@
import io
import os
import pathlib
import zlib
from absl.testing import absltest
from absl.testing import parameterized
from google.protobuf import text_format
@@ -304,7 +305,8 @@ def test_add_sentencepiece_tokenizer(self):

  def test_add_hf_tokenizer(self):
    """Tests that a HuggingFace tokenizer can be added correctly."""
    hf_content = b'{"version": "1.0"}'
    hf_path = self._create_dummy_file("tokenizer.json", hf_content)
    additional_metadata = [
        litertlm_builder.Metadata(
            key="test_key",
@@ -320,6 +322,49 @@ def test_add_hf_tokenizer(self):
    self.assertIn("Data Type: HF_Tokenizer_Zlib", ss)
    self.assertIn("Key: test_key, Value (String): test_value", ss)

    # Verify content compression.
    with litertlm_core.open_file(
        os.path.join(self.temp_dir, "litertlm.litertlm"), "rb"
    ) as f:
      f.seek(litertlm_core.BLOCK_SIZE)
      # Read the uncompressed size (8 bytes, little-endian).
      uncompressed_size = int.from_bytes(f.read(8), "little")
      self.assertLen(hf_content, uncompressed_size)
      # Read the remaining (compressed) data.
      compressed_data = f.read()
      # Decompress and verify. zlib.decompress stops at the end of the
      # stream, ignoring any block padding that follows.
      decompressed = zlib.decompress(compressed_data)
      self.assertEqual(decompressed, hf_content)

  def test_add_hf_tokenizer_zlib(self):
    """Tests that a zlib-compressed HuggingFace tokenizer is handled correctly."""
    zlib_content = b"dummy zlib content"
    hf_path = self._create_dummy_file("tokenizer.zlib", zlib_content)
    additional_metadata = [
        litertlm_builder.Metadata(
            key="test_key",
            value="test_value",
            dtype=litertlm_builder.DType.STRING,
        )
    ]
    builder = litertlm_builder.LitertLmFileBuilder()
    self._add_system_metadata(builder)
    builder.add_hf_tokenizer(hf_path, additional_metadata=additional_metadata)
    ss = self._build_and_read_litertlm(builder)
    self.assertIn("Sections (1)", ss)
    self.assertIn("Data Type: HF_Tokenizer_Zlib", ss)
    self.assertIn("Key: test_key, Value (String): test_value", ss)

    # Verify the content is stored raw (not re-compressed, no size prefix).
    with litertlm_core.open_file(
        os.path.join(self.temp_dir, "litertlm.litertlm"), "rb"
    ) as f:
      f.seek(litertlm_core.BLOCK_SIZE)
      # The section payload should match the input bytes exactly.
      read_content = f.read(len(zlib_content))
      self.assertEqual(read_content, zlib_content)

  def test_add_tokenizer_already_added(self):
    """Tests that adding a tokenizer more than once raises an AssertionError."""
    sp_path = self._create_dummy_file("sp.model", b"")
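The passthrough test above only checks that the bytes are stored verbatim;
the builder keys off the .zlib filename suffix and does not validate the
contents. Assuming a pre-compressed input is expected to match the same
layout read_and_compress produces for plain files (an assumption, not
something this diff states), one could prepare such a file offline as
follows; the file names are illustrative:

  import zlib

  with open("tokenizer.json", "rb") as f:
    content = f.read()
  # Same layout read_and_compress writes: 8-byte LE size + zlib stream.
  payload = len(content).to_bytes(8, "little") + zlib.compress(content)
  with open("tokenizer.zlib", "wb") as f:
    f.write(payload)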
schema/py/litertlm_peek.py (1 addition, 1 deletion)

@@ -306,7 +306,7 @@ def _get_generic_section_file_extension(data_type_str: str) -> str:
  if data_type_str == "SP_Tokenizer":
    return ".spiece"
  elif data_type_str == "HF_Tokenizer_Zlib":
-   return ".json"
+   return ".zlib"
  else:
    return ".bin"

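The extension fix matters because litertlm_peek presumably dumps the section
payload as-is: for an HF_Tokenizer_Zlib section that payload is the
size-prefixed zlib stream, not JSON, so .zlib is the accurate suffix. To
recover the original tokenizer.json from a dumped section, something like
the read_and_decompress sketch above should work, assuming the section was
built from a plain tokenizer.json input; file names are again illustrative:

  with open("section_0.zlib", "rb") as f:  # hypothetical peek output name
    payload = f.read()
  with open("tokenizer.json", "wb") as f:
    f.write(read_and_decompress(payload))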