Skip to content

Commit 5fdba77

Browse files
fix(ci): fix wheel building and unit tests
- Unified C++ module testing around new crayon_cpu/compiler/trainer
- Added load_from_list() to CrayonVocab for ad-hoc vocab testing
- Fixed C++ includes for non-x86 architectures (macOS ARM64)
- Forced CPU-only mode for universal distribution wheels
- Updated unit tests to match V2 engine behavior
1 parent 1462e19 commit 5fdba77

File tree

5 files changed

+165
-214
lines changed

5 files changed

+165
-214
lines changed

.github/workflows/build_wheels.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,11 @@ jobs:
3333
# We explicitly pin build architectures (x86_64 on Linux/Windows; x86_64 and arm64 on macOS) to avoid failures on ARM64 runners.
3434
CIBW_ARCHS_LINUX: x86_64
3535
CIBW_ARCHS_WINDOWS: AMD64
36-
CIBW_ARCHS_MACOS: x86_64
36+
CIBW_ARCHS_MACOS: x86_64 arm64
37+
38+
# 3. Environment
39+
# Universal wheels should be CPU-only (CUDA/ROCm are for custom local builds)
40+
CIBW_ENVIRONMENT: CRAYON_FORCE_CPU=1
3741

3842
# 4. Quality Assurance
3943
# Run the test suite against the installed wheel to verify the C-extension

src/crayon/c_ext/cpu_engine.cpp

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,13 +14,12 @@
1414
#include <cstring>
1515

1616
// --- SIMD INTRINSICS & CPU DETECTION ---
17-
#ifdef _MSC_VER
18-
#include <intrin.h>
19-
#else
20-
#include <cpuid.h>
21-
#endif
22-
2317
#if defined(__x86_64__) || defined(_M_X64)
18+
#ifdef _MSC_VER
19+
#include <intrin.h>
20+
#else
21+
#include <cpuid.h>
22+
#endif
2423
#include <immintrin.h> // AVX2
2524
#define USE_AVX2 1
2625
#else

src/crayon/core/vocabulary.py

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import os
2727
import platform
2828
import sys
29+
import tempfile
2930
import threading
3031
from dataclasses import dataclass, field
3132
from enum import Enum
@@ -364,16 +365,23 @@ class CrayonVocab:
364365
"_hardware_info",
365366
)
366367

367-
def __init__(self, device: DeviceType = "auto") -> None:
368+
def __init__(
369+
self,
370+
vocab_list: Optional[List[str]] = None,
371+
device: DeviceType = "auto",
372+
unk_token: str = "<UNK>"
373+
) -> None:
368374
"""
369375
Initialize the tokenizer engine.
370376
371377
Args:
378+
vocab_list: Optional list of strings to build an ad-hoc vocabulary.
372379
device: Device selection mode.
373380
- "auto": Detects GPU. If available, uses it. Else CPU.
374381
- "cpu": Forces AVX2/AVX-512 CPU backend (best for latency).
375382
- "cuda": Forces NVIDIA GPU backend (best for batch throughput).
376383
- "rocm": Forces AMD GPU backend (best for batch throughput).
384+
unk_token: String to use as the unknown token placeholder.
377385
378386
Raises:
379387
ImportError: If the CPU backend extension is not available.
@@ -395,6 +403,11 @@ def __init__(self, device: DeviceType = "auto") -> None:
395403
self._idx_to_str: List[str] = []
396404
self.current_profile_path: Optional[str] = None
397405
self._profile_loaded: bool = False
406+
self._temp_dat_path: Optional[str] = None
407+
408+
# Public properties for test compatibility
409+
self.unk_token = unk_token
410+
self.unk_token_id = 1 # Hardware convention in Crayon v2
398411

399412
# Device state
400413
self._requested_device: DeviceType = device
@@ -413,6 +426,10 @@ def __init__(self, device: DeviceType = "auto") -> None:
413426
# --- Resolve and Initialize Device ---
414427
self.device = self._resolve_device(device)
415428
self._init_selected_backend()
429+
430+
# --- Load ad-hoc vocab if provided ---
431+
if vocab_list:
432+
self.load_from_list(vocab_list)
416433

417434
def _load_cpu_backend(self) -> None:
418435
"""Load the CPU extension (required as fallback for all modes)."""
@@ -610,6 +627,49 @@ def _resolve_profile_path(self, name_or_path: str) -> str:
610627
f"You can specify the full path or set CRAYON_PROFILE_DIR environment variable."
611628
)
612629

630+
@property
631+
def id_to_token(self) -> List[str]:
632+
"""Get the ID-to-token mapping list (for compatibility)."""
633+
return self._idx_to_str
634+
635+
def __len__(self) -> int:
636+
"""Return the total number of tokens in the active vocabulary."""
637+
return len(self._idx_to_str)
638+
639+
def __contains__(self, token: str) -> bool:
640+
"""Check if a token exists in the active vocabulary (O(N) fallback)."""
641+
return token in self._idx_to_str
642+
643+
def load_from_list(self, vocab: List[str]) -> None:
644+
"""Build and load a temporary DAT profile from a list of strings."""
645+
try:
646+
from ..c_ext import crayon_compiler
647+
except ImportError:
648+
raise ImportError("crayon_compiler extension required for load_from_list()")
649+
650+
with self._lock:
651+
# Create a secure temporary file
652+
fd, path = tempfile.mkstemp(suffix=".dat")
653+
os.close(fd)
654+
655+
try:
656+
# Compile to the temp file
657+
crayon_compiler.compile_dat(vocab, path)
658+
659+
# IMPORTANT: load_profile() expects a .json file from which to populate _idx_to_str.
660+
# No such file exists for this temporary DAT, so rather than writing a dummy JSON
661+
# we set _idx_to_str manually once load_profile() has returned.
662+
self.load_profile(path)
663+
664+
# Override _idx_to_str, which load_profile() could not populate because no .json exists
665+
self._idx_to_str = list(vocab)
666+
self._temp_dat_path = path
667+
668+
except Exception as e:
669+
if os.path.exists(path):
670+
os.unlink(path)
671+
raise RuntimeError(f"Failed to build ad-hoc vocabulary: {e}")
672+
613673
def _close_profile_handles(self) -> None:
614674
"""Safely close any open file handles."""
615675
if self._dat_mem_ref is not None:
@@ -625,6 +685,14 @@ def _close_profile_handles(self) -> None:
625685
except Exception:
626686
pass
627687
self._dat_file_ref = None
688+
689+
# Clean up temporary DAT if exists
690+
if hasattr(self, '_temp_dat_path') and self._temp_dat_path and os.path.exists(self._temp_dat_path):
691+
try:
692+
os.unlink(self._temp_dat_path)
693+
except Exception:
694+
pass
695+
self._temp_dat_path = None
628696

629697
def close(self) -> None:
630698
"""Release all resources and close file handles."""

0 commit comments

Comments
 (0)