-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdemo_tokenize.py
More file actions
101 lines (83 loc) · 3.35 KB
/
demo_tokenize.py
File metadata and controls
101 lines (83 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
CRAYON UNIVERSAL TOKENIZER DEMO
===============================
This script demonstrates the production-ready Crayon Tokenizer API.
It is designed to work seamlessly across:
- Local Machine (Windows/Linux/Mac)
- Google Colab / Jupyter Notebooks
- CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm)
"""
import os
import sys
from pathlib import Path
# --- 1. Environment Setup ---
# Add 'src' to path so we can run without installing the package.
REPO_ROOT = Path(__file__).resolve().parent  # directory containing this script
SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    # Prepend (not append) so the in-repo sources win over any installed copy.
    sys.path.insert(0, str(SRC_PATH))
try:
    # enable_verbose_logging is imported for optional use inside the demo.
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError:
    # Fail fast with a friendly hint instead of a raw traceback.
    print("❌ Error: CRAYON source not found. Make sure you are running this from the repo root.")
    sys.exit(1)
def run_universal_demo():
    """Walk the full Crayon workflow: init engine, load profile, tokenize, decode.

    Progress is printed to stdout; the function returns early if a stage fails.
    """
    # Uncomment to watch hardware detection as it happens:
    # enable_verbose_logging()
    bar = "=" * 60
    print(bar)
    print("🚀 CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print(bar)

    # --- 2. Initialize Engine ---
    # device="auto" automatically picks CUDA > ROCm > CPU
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        engine = CrayonVocab(device="auto")
        details = engine.get_info()
        hardware = details.get("hardware", {})
        print(f" ✓ Device: {details['device'].upper()}")
        print(f" ✓ Backend: {details['backend']}")
        print(f" ✓ Hardware: {hardware.get('name', 'Unknown')} [{hardware.get('features', '')}]")
    except Exception as err:
        print(f" ❌ Initialization failed: {err}")
        return

    # --- 3. Load Profile ---
    # The 'lite' profile is bundled with the repository.
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        engine.load_profile("lite")
        print(f" ✓ Profile Loaded: {engine.current_profile_path}")
        print(f" ✓ Vocabulary Size: {engine.vocab_size:,} tokens")
    except Exception as err:
        print(f" ❌ Load failed: {err}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # --- 4. Performance Tokenization ---
    sample = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )
    print("\n[STEP 3] Tokenizing Text...")
    print(f" Input: \"{sample[:50]}...\"")
    # tokenize() returns a list of token IDs.
    token_ids = engine.tokenize(sample)
    print(f" Result: {token_ids[:10]}... ({len(token_ids)} tokens)")

    # --- 5. Reconstruction (Decoding) ---
    print("\n[STEP 4] Decoding back to text...")
    try:
        restored = engine.decode(token_ids)
        print(f" Output: \"{restored[:50]}...\"")
        # Normalized comparison: only claim a "perfect" round trip when the
        # decoded text matches modulo surrounding whitespace and case.
        if restored.strip().lower() == sample.strip().lower():
            print("\n✅ SUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nℹ️ INFO: Tokenization complete (approximate reconstruction).")
    except Exception as err:
        print(f" ❌ Decode failed: {err}")

    print("\n" + bar)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print(bar)
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    run_universal_demo()