|
| 1 | + |
1 | 2 | """ |
2 | | -Crayon Tokenizer Demo |
3 | | ---------------------- |
4 | | -Simple script to demonstrate loading a profile and tokenizing text. |
| 3 | +CRAYON UNIVERSAL TOKENIZER DEMO |
| 4 | +=============================== |
| 5 | +This script demonstrates the production-ready Crayon Tokenizer API. |
| 6 | +It is designed to work seamlessly across: |
| 7 | +- Local Machine (Windows/Linux/Mac) |
| 8 | +- Google Colab / Jupyter Notebooks |
| 9 | +- CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm) |
5 | 10 | """ |
6 | | -import sys |
| 11 | + |
7 | 12 | import os |
| 13 | +import sys |
8 | 14 | from pathlib import Path |
9 | 15 |
|
10 | | -# Add paths to use local build if running from source |
11 | | -sys.path.insert(0, os.path.join(os.getcwd(), "build", "lib.win-amd64-cpython-313")) |
12 | | -sys.path.insert(0, os.path.join(os.getcwd(), "src")) |
# --- 1. Environment Setup ---
# Add 'src' to sys.path so the demo runs from a source checkout without
# the package being pip-installed. Must happen before importing `crayon`.
REPO_ROOT = Path(__file__).resolve().parent
SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    # Prepend (not append) so the in-repo build shadows any installed copy.
    sys.path.insert(0, str(SRC_PATH))

try:
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError:
    # Fail fast with a friendly message instead of a raw traceback.
    print("❌ Error: CRAYON source not found. Make sure you are running this from the repo root.")
    sys.exit(1)
| 29 | + |
def run_universal_demo():
    """Run the end-to-end Crayon demo: init engine, load profile, tokenize, decode.

    Prints progress to stdout and returns early (without raising) when a step
    fails, since this is a user-facing demo script. No return value.
    """
    # Optional: Enable verbose logging to see hardware detection in action
    # enable_verbose_logging()

    print("=" * 60)
    print("🚀 CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print("=" * 60)

    # --- 2. Initialize Engine ---
    # device="auto" automatically picks CUDA > ROCm > CPU
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        vocab = CrayonVocab(device="auto")
        info = vocab.get_info()

        # Fetch the hardware sub-dict once instead of repeating the lookup.
        hardware = info.get("hardware", {})
        hw_name = hardware.get("name", "Unknown")
        hw_feat = hardware.get("features", "")
        print(f" ✓ Device: {info['device'].upper()}")
        print(f" ✓ Backend: {info['backend']}")
        print(f" ✓ Hardware: {hw_name} [{hw_feat}]")
    except Exception as e:
        # Broad catch is deliberate: this is a top-level demo boundary.
        print(f" ❌ Initialization failed: {e}")
        return

    # --- 3. Load Profile ---
    # We load 'lite' which is bundled with the repository
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        vocab.load_profile("lite")
        print(f" ✓ Profile Loaded: {vocab.current_profile_path}")
        print(f" ✓ Vocabulary Size: {vocab.vocab_size:,} tokens")
    except Exception as e:
        print(f" ❌ Load failed: {e}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # --- 4. Performance Tokenization ---
    text = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )

    print("\n[STEP 3] Tokenizing Text...")
    print(f" Input: \"{text[:50]}...\"")

    # Tokenize (returns a list of IDs)
    tokens = vocab.tokenize(text)

    print(f" Result: {tokens[:10]}... ({len(tokens)} tokens)")

    # --- 5. Reconstruction (Decoding) ---
    print("\n[STEP 4] Decoding back to text...")
    try:
        decoded = vocab.decode(tokens)
        print(f" Output: \"{decoded[:50]}...\"")

        # Verify round-trip success. Case/whitespace-insensitive compare:
        # BPE-style vocabularies may normalize case or spacing on decode.
        if decoded.strip().lower() == text.strip().lower():
            print("\n✅ SUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nℹ️ INFO: Tokenization complete (approximate reconstruction).")

    except Exception as e:
        print(f" ❌ Decode failed: {e}")

    print("\n" + "=" * 60)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print("=" * 60)
70 | 99 |
|
if __name__ == "__main__":
    # Script entry point: run the full demo when executed directly.
    run_universal_demo()
0 commit comments