Skip to content

Commit cb1f54b

Browse files
dhur
1 parent 2e65b5f commit cb1f54b

File tree

6 files changed

+556
-21
lines changed

6 files changed

+556
-21
lines changed

pyproject.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,13 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "xerv-crayon"
7-
version = "5.2.5"
7+
version = "5.3.4"
88
description = "Omni-Backend Tokenizer - CPU (AVX2/512), CUDA (NVIDIA), ROCm (AMD) with automatic hardware detection"
99
readme = "README.md"
1010
requires-python = ">=3.8,<3.13"
1111
license = {file = "LICENSE"}
1212
authors = [
13-
{name = "Xerv Research Engineering Division", email = "engineering@xerv.ai"}
13+
{name = "Xerv Research Engineering Division", email = "xerv.org@gmail.com"}
1414
]
1515
keywords = [
1616
"tokenizer",
@@ -108,7 +108,12 @@ where = ["src"]
108108
"c_ext/*.c",
109109
"c_ext/*.cpp",
110110
"c_ext/*.cu",
111-
"c_ext/*.hip"
111+
"c_ext/*.hip",
112+
"c_ext/*.pyd",
113+
"c_ext/*.so",
114+
"c_ext/*.py",
115+
"c_ext/compiled/*.pyd",
116+
"c_ext/compiled/*.so"
112117
]
113118

114119
[tool.pytest.ini_options]

setup_build.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""
2+
XERV CRAYON SETUP v5.2.6 - WITH C++ EXTENSIONS
3+
==============================================
4+
Builds native extensions for maximum performance
5+
"""
6+
7+
import os
8+
import sys
9+
import platform
10+
from pathlib import Path
11+
from setuptools import setup, find_packages, Extension
12+
from setuptools.command.build_ext import build_ext
13+
14+
# Release number handed to setup(). Kept identical to the `version` field in
# pyproject.toml so both build entry points publish the same release.
VERSION = "5.3.4"
15+
16+
class CustomBuildExt(build_ext):
    """``build_ext`` command that reports a failed extension instead of aborting."""

    def build_extension(self, ext):
        """Build ``ext``; on any error print two notices and continue without it."""
        try:
            super().build_extension(ext)
        except Exception as build_error:
            # Best-effort by design: a failed native build must not stop the
            # remaining build steps, so the error is reported and dropped.
            notices = (
                f"Warning: Failed to build {ext.name}: {build_error}",
                "Falling back to pure Python implementation...",
            )
            for notice in notices:
                print(notice)
26+
27+
def get_extensions():
    """Return the setuptools ``Extension`` objects this script should build.

    The CPU extension is always included. The CUDA extension is added only
    when the ``CUDA_HOME`` environment variable is set or an ``nvcc``
    executable can be found on ``PATH``.

    Returns:
        list: ``Extension`` instances, CPU first, then CUDA when detected.
    """
    # shutil is imported here because the module's import block does not
    # provide it; without it the nvcc probe below raises NameError.
    import shutil

    # Native sources live under src/crayon/c_ext next to this script.
    c_ext_dir = Path(__file__).parent / "src" / "crayon" / "c_ext"

    # MSVC does not understand the GCC/Clang style flags, so pick per platform.
    if platform.system() != 'Windows':
        cpu_compile_args = ['-O3', '-march=native']
    else:
        cpu_compile_args = ['/O2']

    cpu_source_names = ("crayon_module.c", "simd_ops.c", "cpu_engine.cpp")
    extensions = [
        Extension(
            'crayon.c_ext.crayon_cpu',
            sources=[str(c_ext_dir / name) for name in cpu_source_names],
            include_dirs=[str(c_ext_dir)],
            extra_compile_args=cpu_compile_args,
            language='c++',
        )
    ]

    # CUDA extension (optional): only attempted when a toolkit is detectable.
    if os.environ.get('CUDA_HOME') or shutil.which('nvcc'):
        extensions.append(
            Extension(
                'crayon.c_ext.crayon_cuda',
                sources=[str(c_ext_dir / "gpu_engine_cuda.cu")],
                include_dirs=[str(c_ext_dir)],
                extra_compile_args=['-O3'],
                language='c++',
            )
        )

    return extensions
67+
68+
# "--no-extensions" is a private switch of this script, not a setuptools
# option. It is stripped from sys.argv before setup() parses the command line,
# because setuptools aborts on options it does not recognise.
build_extensions = '--no-extensions' not in sys.argv
if not build_extensions:
    sys.argv[:] = [arg for arg in sys.argv if arg != '--no-extensions']

if build_extensions:
    try:
        extensions = get_extensions()
        print(f"Building {len(extensions)} extension(s)...")
    except Exception as e:
        # Best-effort: a problem while describing the extensions downgrades
        # the build to pure Python instead of failing the whole install.
        print(f"Warning: Extension setup failed: {e}")
        print("Falling back to pure Python...")
        extensions = []
else:
    extensions = []
    print("Skipping extension build (using pure Python)")

setup(
    name="xerv-crayon",
    version=VERSION,
    packages=find_packages("src"),
    package_dir={"": "src"},
    # NOTE(review): pyproject.toml declares requires-python ">=3.8,<3.13";
    # confirm which upper bound is intended and align the two.
    python_requires=">=3.8,<3.14",
    install_requires=[
        "numpy>=1.21.0",
    ],
    # `extensions` is already empty whenever extension building is disabled.
    ext_modules=extensions,
    cmdclass={'build_ext': CustomBuildExt} if build_extensions else {},
    package_data={
        "crayon": [
            "resources/dat/vocab_lite.dat",
            "resources/dat/vocab_lite.json",
            "resources/dat/vocab_standard.dat",
            "resources/dat/vocab_standard.json",
            "resources/*.txt",
            "resources/*.csv",
            "c_ext/*.h",
            "c_ext/*.c",
            "c_ext/*.cpp",
            "c_ext/*.cu",
            "c_ext/*.hip",
            "c_ext/*.pyd",
            "c_ext/*.so",
        ]
    },
)

src/crayon/c_ext/__init__.py

Lines changed: 76 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,22 +25,81 @@
2525
from typing import Optional, Tuple
2626

2727
# ============================================================================
28-
# CPU BACKEND (Required)
28+
# CPU BACKEND (Required - Lazy Import to avoid circular dependencies)
2929
# ============================================================================
3030

31-
try:
32-
from . import crayon_cpu
33-
except ImportError as e:
34-
# Provide helpful error message for common issues
35-
_cpu_error = (
36-
"Failed to import crayon_cpu extension. This is required for Crayon to work.\n"
37-
"Possible causes:\n"
38-
" 1. The package was not installed correctly (try: pip install --force-reinstall xerv-crayon)\n"
39-
" 2. The C++ extension failed to compile (check for compiler errors during install)\n"
40-
" 3. Python version mismatch (Crayon requires Python 3.10+)\n"
41-
f"Original error: {e}"
42-
)
43-
raise ImportError(_cpu_error) from e
31+
# Cached backend module; stays None until a load attempt succeeds.
_cpu_module: Optional[object] = None
# Set to True as soon as the first load attempt starts, so the import is
# tried at most once per process.
_cpu_checked: bool = False
# Human-readable reason for a failed load attempt, or None if none is recorded.
_cpu_error: Optional[str] = None
34+
35+
36+
def _load_cpu_backend() -> Optional[object]:
    """Import the CPU backend at most once and cache the outcome.

    The first call imports ``crayon.c_ext.crayon_cpu`` and checks that it
    exposes ``tokenize`` and ``load_dat``. On success the module is stored in
    ``_cpu_module``; on failure a description is stored in ``_cpu_error``.
    Later calls return the cached result without importing again.

    Returns:
        The backend module, or ``None`` when it could not be loaded.
    """
    global _cpu_checked, _cpu_module, _cpu_error

    if not _cpu_checked:
        # Flag is raised before the import so a re-entrant call made while the
        # import is still running returns the cached value instead of retrying.
        _cpu_checked = True
        try:
            # Absolute import to avoid circular dependency issues.
            import crayon.c_ext.crayon_cpu as backend

            required = ('tokenize', 'load_dat')
            if all(hasattr(backend, attr) for attr in required):
                _cpu_module = backend
            else:
                _cpu_error = "crayon_cpu module missing required functions (tokenize, load_dat)"
        except ImportError as import_failure:
            # NOTE(review): pyproject.toml declares requires-python >=3.8,
            # while cause 3 below says 3.10+ — confirm which one is correct.
            _cpu_error = (
                f"Failed to import crayon_cpu extension. {import_failure}\n"
                "Possible causes:\n"
                " 1. The package was not installed correctly (try: pip install --force-reinstall xerv-crayon)\n"
                " 2. The C++ extension failed to compile (check for compiler errors during install)\n"
                " 3. Python version mismatch (Crayon requires Python 3.10+)"
            )
        except Exception as unexpected:
            _cpu_error = f"Unexpected error loading crayon_cpu: {unexpected}"

    # Still None unless the import above (now or on an earlier call) succeeded.
    return _cpu_module
66+
67+
68+
def get_cpu_backend() -> Optional[object]:
    """Return the CPU backend module, or ``None`` if it could not be loaded.

    The load is attempted on first use via ``_load_cpu_backend``.
    """
    backend = _load_cpu_backend()
    return backend
71+
72+
73+
def is_cpu_available() -> bool:
    """Return ``True`` when ``_load_cpu_backend`` yields a backend module."""
    backend = _load_cpu_backend()
    return backend is not None
76+
77+
78+
def get_cpu_error() -> Optional[str]:
    """Return the recorded CPU backend load failure, or ``None`` if there is none.

    The load attempt is triggered first so the result reflects an actual
    attempt rather than the initial "not yet checked" state.
    """
    _load_cpu_backend()  # Ensure check has run
    return _cpu_error
82+
83+
84+
# Create a proxy object for backward compatibility
85+
class _CPUProxy:
    """Stand-in for the ``crayon_cpu`` module that loads it on first use."""

    def __getattr__(self, name):
        """Forward attribute lookup to the backend module.

        Raises:
            ImportError: if the backend could not be loaded.
        """
        backend = _load_cpu_backend()
        if backend is not None:
            return getattr(backend, name)
        raise ImportError(f"CPU backend not available: {get_cpu_error()}")

    def __dir__(self):
        """List the backend's attributes, or nothing when it is unavailable."""
        backend = _load_cpu_backend()
        return [] if backend is None else dir(backend)
99+
100+
101+
# Module-level proxy exported under the name ``crayon_cpu``. Creating it does
# not import the backend; the first attribute access on it does.
crayon_cpu = _CPUProxy()
44103

45104

46105
# ============================================================================
@@ -185,6 +244,9 @@ def get_backend_info() -> dict:
185244

186245
__all__ = [
187246
"crayon_cpu",
247+
"is_cpu_available",
248+
"get_cpu_backend",
249+
"get_cpu_error",
188250
"is_cuda_available",
189251
"is_rocm_available",
190252
"get_cuda_error",

src/crayon/c_ext/crayon_cpu.py

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
"""
2+
CPU Backend Wrapper for Crayon
3+
================================
4+
5+
This module wraps the compiled C++ extension for CPU tokenization.
6+
Falls back to pure Python implementation if extension is not available.
7+
"""
8+
9+
import sys
10+
import os
11+
import importlib.util
12+
13+
def _load_cpu_extension():
    """Locate and load the compiled ``crayon_cpu`` extension module.

    This file's directory and then its ``compiled`` subdirectory are searched
    for a file named ``crayon_cpu`` followed by one of the running
    interpreter's extension suffixes
    (``importlib.machinery.EXTENSION_SUFFIXES``). Only binaries built for this
    interpreter are considered, and unrelated shared objects are never picked
    up.

    Returns:
        The loaded extension module.

    Raises:
        ImportError: if no matching file exists or the file fails to load.
    """
    import importlib.machinery
    import logging

    # Diagnostics go through logging rather than print(): import-time prints
    # pollute stdout and can raise UnicodeEncodeError on non-UTF-8 streams.
    log = logging.getLogger(__name__)

    # The loader derives the init symbol (PyInit_<name>) from the module name,
    # so it must be the bare name, not the filename stem with its ABI tag.
    module_name = "crayon_cpu"

    here = os.path.dirname(__file__)
    search_dirs = [here, os.path.join(here, "compiled")]

    # Directory order first, then the interpreter's suffix preference order.
    candidates = (
        os.path.join(directory, module_name + suffix)
        for directory in search_dirs
        for suffix in importlib.machinery.EXTENSION_SUFFIXES
    )
    ext_path = next((path for path in candidates if os.path.isfile(path)), None)

    if ext_path is None:
        file_kind = ".pyd" if sys.platform == "win32" else ".so"
        raise ImportError(f"No compiled CPU extension found ({file_kind} file)")

    log.debug("Loading CPU extension from: %s", ext_path)

    # NOTE(review): kept for backward compatibility with the previous
    # behaviour; spec_from_file_location itself does not consult sys.path.
    ext_dir = os.path.dirname(ext_path)
    if ext_dir not in sys.path:
        sys.path.insert(0, ext_dir)

    spec = importlib.util.spec_from_file_location(module_name, ext_path)
    if spec is None or spec.loader is None:
        raise ImportError(f"Could not create spec for {ext_path}")

    try:
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
    except Exception as exc:
        # Normalise every load failure to ImportError so callers can fall
        # back with a single except clause; the cause stays chained.
        raise ImportError(f"Failed to load extension {ext_path}: {exc}") from exc

    log.debug("Successfully loaded compiled extension: %s", os.path.basename(ext_path))
    return module
99+
100+
import logging

# Status messages use logging instead of print(): printing at import time
# pollutes stdout, and the non-ASCII glyphs previously printed could raise
# UnicodeEncodeError on non-UTF-8 streams, aborting the import.
_logger = logging.getLogger(__name__)

# Prefer the compiled extension; fall back to the pure Python implementation
# when it cannot be found or loaded.
try:
    _cpu_ext = _load_cpu_extension()
    _logger.debug("Using compiled C++ extension for maximum performance")
except ImportError as e:
    _logger.warning("Compiled extension not available: %s", e)
    _logger.warning("Falling back to pure Python implementation (slower but functional)")

    # Load pure Python fallback
    try:
        from . import crayon_cpu_fallback as _cpu_ext
    except ImportError as fallback_error:
        # Both paths failed: report both causes and keep the chain intact.
        raise ImportError(
            f"Failed to load both compiled extension and pure Python fallback:\n"
            f"Extension error: {e}\n"
            f"Fallback error: {fallback_error}\n"
            "This suggests a corrupted installation. Try reinstalling with:\n"
            " pip install --force-reinstall xerv-crayon"
        ) from fallback_error
    _logger.debug("Pure Python fallback loaded successfully")

# Export the required functions
tokenize = _cpu_ext.tokenize
load_dat = _cpu_ext.load_dat

# Export hardware info if the selected implementation provides it; otherwise
# supply a placeholder so the name always exists.
if hasattr(_cpu_ext, 'get_hardware_info'):
    get_hardware_info = _cpu_ext.get_hardware_info
else:
    def get_hardware_info():
        """Return a placeholder description when the backend offers none."""
        return "CPU Backend [Unknown]"

__all__ = ['tokenize', 'load_dat', 'get_hardware_info']

0 commit comments

Comments
 (0)