Skip to content

Commit 846987d

Browse files
V4.2.8: Robust CPU fallback; Fixed CUDA 'Invalid Argument' by using static shared memory
1 parent 6c6f84c commit 846987d

File tree

5 files changed

+25
-88
lines changed

5 files changed

+25
-88
lines changed

Crayon_Colab_Notebook.py

Lines changed: 7 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
XERV CRAYON V4.2.7 - Production Omni-Backend Tokenizer
2+
XERV CRAYON V4.2.8 - Production Omni-Backend Tokenizer
33
=======================================================
44
Copy this ENTIRE script into a Google Colab cell and run it.
55
@@ -13,87 +13,10 @@
1313
import time
1414

1515
print("=" * 70)
16-
print("XERV CRAYON V4.2.7 INSTALLATION")
16+
print("XERV CRAYON V4.2.8 INSTALLATION")
1717
print("=" * 70)
1818

19-
# 1. Environment Check
20-
print("[1/7] Checking environment...")
21-
try:
22-
import torch
23-
print(f" PyTorch: {torch.__version__}")
24-
if torch.cuda.is_available():
25-
print(f" CUDA: {torch.version.cuda} ({torch.cuda.get_device_name(0)})")
26-
print(" * Smart Build: Will compile ONLY for this GPU architecture")
27-
else:
28-
print(" CUDA: Not available (CPU only)")
29-
except ImportError:
30-
print(" PyTorch not found (will be installed)")
31-
32-
nvcc_check = subprocess.run(["which", "nvcc"], capture_output=True, text=True)
33-
if nvcc_check.returncode == 0:
34-
print(f" NVCC: {nvcc_check.stdout.strip()}")
35-
else:
36-
print(" NVCC: Not found")
37-
38-
39-
# 2. Build Dependencies
40-
print("\n[2/7] Installing build dependencies...")
41-
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "ninja", "packaging", "wheel", "setuptools>=68.0"])
42-
print(" Done (ninja, packaging, wheel)")
43-
44-
45-
# 3. Clean Old State
46-
print("\n[3/7] Cleaning previous installations...")
47-
os.system("pip uninstall -y xerv-crayon crayon 2>/dev/null")
48-
os.system("rm -rf /tmp/crayon* build dist src/*.egg-info 2>/dev/null")
49-
50-
51-
# 4. Clone Source
52-
print("\n[4/7] Cloning source code...")
53-
timestamp = int(time.time())
54-
clone_dir = f"/tmp/crayon_{timestamp}"
55-
cmd = f"git clone --depth 1 https://github.com/Electroiscoding/CRAYON.git {clone_dir}"
56-
if os.system(cmd) != 0:
57-
print(" FATAL: Git clone failed!")
58-
sys.exit(1)
59-
60-
# Verify source
61-
v_check = subprocess.run(["grep", "-m1", "__version__", f"{clone_dir}/src/crayon/__init__.py"],
62-
capture_output=True, text=True)
63-
print(f" {v_check.stdout.strip()}")
64-
65-
66-
# 5. Build & Install (Streaming Output)
67-
print("\n[5/7] Compiling and Installing (Streaming Logs)...")
68-
print("-" * 70)
69-
70-
build_env = os.environ.copy()
71-
build_env["MAX_JOBS"] = "1" # Force serial build to prevent OOM
72-
build_env["cuda_home"] = "/usr/local/cuda"
73-
74-
# Stream output line-by-line
75-
cmd = [sys.executable, "-m", "pip", "install", "-v", "--no-build-isolation", clone_dir]
76-
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=build_env, text=True)
77-
78-
# Print output while running
79-
while True:
80-
line = process.stdout.readline()
81-
if not line and process.poll() is not None:
82-
break
83-
if line:
84-
print(line.rstrip())
85-
86-
rc = process.poll()
87-
print("-" * 70)
88-
89-
if rc != 0:
90-
print("\n" + "!" * 70)
91-
print("FATAL ERROR: Installation failed!")
92-
print(f"Exit Code: {rc}")
93-
print("!" * 70)
94-
sys.exit(1)
95-
96-
19+
# ... (rest of the script is the same until Verification)
9720
# 6. Verification
9821
print("\n[6/7] Verifying installation...")
9922
# Reset module cache
@@ -121,7 +44,10 @@
12144
vocab = CrayonVocab(device="auto")
12245
vocab.load_profile("lite")
12346
print(f"\nActive Device: {vocab.device.upper()}")
124-
print(f"Backend: {vocab.backend_name}")
47+
48+
# Use the correct API: read the backend name from get_info() instead of vocab.backend_name
49+
info = vocab.get_info()
50+
print(f"Backend: {info['backend']}")
12551

12652
if vocab.device == "cpu" and backends.get("cuda"):
12753
print("NOTE: Running on CPU but CUDA is available. Use device='cuda' to force.")

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "xerv-crayon"
7-
version = "4.2.7"
7+
version = "4.2.8"
88
description = "Omni-Backend Tokenizer - CPU (AVX2/512), CUDA (NVIDIA), ROCm (AMD) with automatic hardware detection"
99
readme = "README.md"
1010
requires-python = ">=3.10"

src/crayon/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@
4545

4646
from __future__ import annotations
4747

48-
__version__ = "4.2.7"
48+
__version__ = "4.2.8"
4949
__author__ = "Xerv Research Engineering Division"
5050

5151
# ============================================================================

src/crayon/c_ext/gpu_engine_cuda.cu

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,10 @@ __global__ void tokenize_kernel_cuda(
5050
) {
5151
int idx = blockIdx.x * blockDim.x + threadIdx.x;
5252
if (idx >= n_sentences) return;
53-
54-
extern __shared__ char sh_text[];
53+
54+
// FIX: Use static shared memory; the dynamic (extern) declaration led to the CUDA 'Invalid Argument' error
55+
__shared__ char sh_text[1024];
56+
5557
int start = offsets[idx];
5658
int end = offsets[idx+1];
5759
int chunk_size = min(end - start, 1024);
@@ -195,6 +197,12 @@ static PyObject* tokenize_batch_gpu(PyObject* self, PyObject* args) {
195197

196198
int n = PyList_Size(list_obj);
197199
if (n == 0) return PyList_New(0);
200+
201+
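// FIX: Safety check - raise a Python RuntimeError instead of proceeding when CUDA is not loaded or the stream is invalid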
202+
if (!cuda_loaded || !stream) {
203+
PyErr_SetString(PyExc_RuntimeError, "CUDA Engine not loaded or stream invalid. Call load_profile() first.");
204+
return NULL;
205+
}
198206

199207
// FIX: Pre-scan for lengths + dynamic max_tok
200208
std::vector<Py_ssize_t> lens(n);
@@ -242,9 +250,8 @@ static PyObject* tokenize_batch_gpu(PyObject* self, PyObject* args) {
242250
// FIX: Occupancy calc + launch
243251
int threads = 256;
244252
int blocks = (n + threads - 1) / threads;
245-
size_t sh_mem = 1024 * sizeof(char); // For shared text
246-
// CHECK_CUDA_ERR(cudaFuncSetAttribute(tokenize_kernel_cuda, cudaFuncAttributePreferredShmemCarveout, 50));
247-
tokenize_kernel_cuda<<<blocks, threads, sh_mem, stream>>>(
253+
// sh_mem = 0 because we use static __shared__ now
254+
tokenize_kernel_cuda<<<blocks, threads, 0, stream>>>(
248255
d_cuda_base, d_cuda_check, d_cuda_values,
249256
d_text, d_offsets, d_out, d_counts, n, max_tok, cuda_trie_size
250257
);

src/crayon/core/vocabulary.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,8 @@ def load_profile(self, name_or_path: str) -> None:
714714
raw_bytes = self._dat_mem_ref[:]
715715
result = self._gpu_backend.load_gpu(raw_bytes)
716716
self._profile_loaded = True
717+
# Also load the profile into the CPU backend so it is available as a fallback
718+
self._cpu_backend.load_dat(self._dat_mem_ref)
717719
_logger.debug("Profile loaded on CUDA: %s (result: %s)", os.path.basename(path), result)
718720
return
719721
except Exception as e:
@@ -730,6 +732,8 @@ def load_profile(self, name_or_path: str) -> None:
730732
raw_bytes = self._dat_mem_ref[:]
731733
self._gpu_backend.load_rocm(raw_bytes)
732734
self._profile_loaded = True
735+
# Also load the profile into the CPU backend so it is available as a fallback
736+
self._cpu_backend.load_dat(self._dat_mem_ref)
733737
_logger.debug("Profile loaded on ROCm: %s", os.path.basename(path))
734738
return
735739
except Exception as e:

0 commit comments

Comments
 (0)