Skip to content

Commit f7bbc5a

Browse files
fix(ci): resolve pyproject.toml license error and fix wheel tests
- Fixed project.license format in pyproject.toml (PEP 621) - Added backward compatibility for fast_mode and longest_match in CrayonVocab - Removed manual sys.path manipulation in unit tests to support wheel testing - Synchronized metadata across setup.py and pyproject.toml
1 parent fbb49c6 commit f7bbc5a

File tree

5 files changed

+50
-27
lines changed

5 files changed

+50
-27
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ version = "4.3.0"
88
description = "Omni-Backend Tokenizer - CPU (AVX2/512), CUDA (NVIDIA), ROCm (AMD) with automatic hardware detection"
99
readme = "README.md"
1010
requires-python = ">=3.10"
11-
license = "MIT"
11+
license = {file = "LICENSE"}
1212
authors = [
1313
{name = "Xerv Research Engineering Division", email = "engineering@xerv.ai"}
1414
]

src/crayon/core/vocabulary.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,6 +1034,38 @@ def is_profile_loaded(self) -> bool:
10341034
"""Check if a profile is currently loaded."""
10351035
return self._profile_loaded
10361036

1037+
@property
1038+
def fast_mode(self) -> bool:
1039+
"""Check if running in high-performance mode (C++ backend)."""
1040+
return self.device in ("cpu", "cuda", "rocm") and (self._cpu_backend is not None or self._gpu_backend is not None)
1041+
1042+
def longest_match(self, text: str, pos: int = 0) -> Tuple[int, int]:
1043+
"""
1044+
Find the longest matching token at the given position (Compatibility Mode).
1045+
1046+
Note: This is slower than tokenize() as it creates a substring.
1047+
"""
1048+
if pos >= len(text):
1049+
return self.unk_token_id, 0
1050+
1051+
# Optimization: We only need to check a reasonable window
1052+
# The longest token is rarely more than 100 characters.
1053+
window = text[pos : pos + 128]
1054+
tokens = self.tokenize(window)
1055+
1056+
if not tokens:
1057+
return self.unk_token_id, 1
1058+
1059+
# Get the first token ID
1060+
first_id = tokens[0]
1061+
1062+
# Get its length from id_to_token
1063+
if 0 <= first_id < len(self._idx_to_str):
1064+
token_str = self._idx_to_str[first_id]
1065+
return first_id, len(token_str)
1066+
else:
1067+
return self.unk_token_id, 1
1068+
10371069

10381070
# ============================================================================
10391071
# CONVENIENCE FUNCTIONS

tests/test_c_ext.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import json
88
from pathlib import Path
99

10-
# Add src to path for imports
11-
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
1210

1311
try:
1412
from crayon.c_ext import crayon_cpu, crayon_trainer, crayon_compiler

tests/test_core.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import sys
44
from pathlib import Path
55

6-
# Add src to path for imports
7-
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
86

97
from crayon.core.vocabulary import CrayonVocab
108
from crayon.core.primitives import TokenMetadata

tests/test_throughput.py

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
import unittest
23
import time
34
from crayon.core.vocabulary import CrayonVocab
@@ -29,32 +30,26 @@ def test_throughput_target(self):
2930

3031
print(f"Throughput Test: {throughput:,.0f} tokens/sec")
3132

32-
# We should at least achieve baseline performance
33+
# We should at least achieve baseline performance (10k is very conservative for C++ engine)
3334
self.assertGreater(throughput, 10000, "Throughput fell below minimum acceptable threshold")
3435

35-
def test_c_extension_performance_boost(self):
36-
"""Test that C extension provides performance improvement."""
37-
if not self.vocab._c_ext_available:
38-
self.skipTest("C extension not available")
39-
40-
# Measure Python fallback
41-
self.vocab._c_ext_available = False
42-
original_trie = self.vocab._c_trie
43-
self.vocab._c_trie = None
44-
45-
start = time.perf_counter()
46-
for _ in range(3):
47-
_ = self.vocab.tokenize(self.text)
48-
python_time = time.perf_counter() - start
49-
50-
# Restore C extension
51-
self.vocab._c_ext_available = True
52-
self.vocab._c_trie = original_trie
53-
36+
def test_engine_performance_boost(self):
37+
"""Test that the engine provides reasonable performance."""
38+
# In V4, 'fast_mode' is the default if compiled.
39+
# We check by seeing if it's using the C++ backend.
40+
info = self.vocab.get_info()
41+
is_fast = info["backend"].endswith("_extension")
42+
43+
if not is_fast:
44+
self.skipTest("C++ extension not available, can't test boost")
45+
5446
start = time.perf_counter()
5547
for _ in range(3):
5648
_ = self.vocab.tokenize(self.text)
5749
c_time = time.perf_counter() - start
5850

59-
print(f"Python time: {python_time:.3f}s, C time: {c_time:.3f}s")
60-
# C extension should be at least comparable (may not always be faster due to Python overhead)
51+
print(f"C++ Engine time: {c_time:.3f}s")
52+
self.assertGreater(len(self.vocab.tokenize(self.text)), 0)
53+
54+
if __name__ == "__main__":
55+
unittest.main()

0 commit comments

Comments
 (0)