Skip to content

Commit fbb49c6

Browse files
fix: resolve CrayonVocab __slots__ conflict and update universal demo
1 parent 5fdba77 commit fbb49c6

File tree

2 files changed

+79
-47
lines changed

2 files changed

+79
-47
lines changed

demo_tokenize.py

Lines changed: 76 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,101 @@
"""
CRAYON UNIVERSAL TOKENIZER DEMO
===============================
This script demonstrates the production-ready Crayon Tokenizer API.
It is designed to work seamlessly across:
- Local Machine (Windows/Linux/Mac)
- Google Colab / Jupyter Notebooks
- CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm)
"""

import os
import sys
from pathlib import Path

# --- 1. Environment Setup ---
# Add 'src' to path so we can run without installing the package
REPO_ROOT = Path(__file__).resolve().parent
SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    sys.path.insert(0, str(SRC_PATH))

try:
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError:
    print("❌ Error: CRAYON source not found. Make sure you are running this from the repo root.")
    sys.exit(1)


def run_universal_demo():
    """Run the end-to-end demo: init engine, load the 'lite' profile, tokenize, decode.

    Each stage is wrapped in its own try/except so a failure prints a
    diagnostic and returns early instead of raising, keeping the demo
    friendly in notebook environments.
    """
    # Optional: Enable verbose logging to see hardware detection in action
    # enable_verbose_logging()

    print("=" * 60)
    print("🚀 CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print("=" * 60)

    # --- 2. Initialize Engine ---
    # device="auto" automatically picks CUDA > ROCm > CPU
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        vocab = CrayonVocab(device="auto")
        info = vocab.get_info()

        # get_info() may omit the "hardware" section; fall back gracefully.
        hw_name = info.get("hardware", {}).get("name", "Unknown")
        hw_feat = info.get("hardware", {}).get("features", "")
        print(f" ✓ Device: {info['device'].upper()}")
        print(f" ✓ Backend: {info['backend']}")
        print(f" ✓ Hardware: {hw_name} [{hw_feat}]")
    except Exception as e:
        print(f" ❌ Initialization failed: {e}")
        return

    # --- 3. Load Profile ---
    # We load 'lite' which is bundled with the repository
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        vocab.load_profile("lite")
        print(f" ✓ Profile Loaded: {vocab.current_profile_path}")
        print(f" ✓ Vocabulary Size: {vocab.vocab_size:,} tokens")
    except Exception as e:
        print(f" ❌ Load failed: {e}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # --- 4. Performance Tokenization ---
    text = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )

    print("\n[STEP 3] Tokenizing Text...")
    print(f" Input: \"{text[:50]}...\"")

    # Tokenize (returns a list of IDs)
    tokens = vocab.tokenize(text)

    print(f" Result: {tokens[:10]}... ({len(tokens)} tokens)")

    # --- 5. Reconstruction (Decoding) ---
    print("\n[STEP 4] Decoding back to text...")
    try:
        decoded = vocab.decode(tokens)
        print(f" Output: \"{decoded[:50]}...\"")

        # Verify success
        # Note: BPE usually preserves whitespace and case
        if decoded.strip().lower() == text.strip().lower():
            print("\n✅ SUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nℹ️ INFO: Tokenization complete (approximate reconstruction).")

    except Exception as e:
        print(f" Decode failed: {e}")

    print("\n" + "=" * 60)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print("=" * 60)


if __name__ == "__main__":
    run_universal_demo()

src/crayon/core/vocabulary.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -359,6 +359,9 @@ class CrayonVocab:
359359
"_idx_to_str",
360360
"current_profile_path",
361361
"_profile_loaded",
362+
"_temp_dat_path",
363+
"unk_token",
364+
"unk_token_id",
362365
"device",
363366
"_requested_device",
364367
"_device_state",

0 commit comments

Comments
 (0)