-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdemo_tokenize.py
More file actions
101 lines (83 loc) · 3.35 KB
/
demo_tokenize.py
File metadata and controls
101 lines (83 loc) · 3.35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
CRAYON UNIVERSAL TOKENIZER DEMO
===============================
This script demonstrates the production-ready Crayon Tokenizer API.
It is designed to work seamlessly across:
- Local Machine (Windows/Linux/Mac)
- Google Colab / Jupyter Notebooks
- CPU, NVIDIA GPU (CUDA), and AMD GPU (ROCm)
"""
import os
import sys
from pathlib import Path
# --- 1. Environment Setup ---
# Add 'src' to path so we can run without installing the package.
REPO_ROOT = Path(__file__).resolve().parent  # directory containing this script
SRC_PATH = REPO_ROOT / "src"
if SRC_PATH.exists():
    # Prepend (not append) so the in-repo sources win over any installed copy.
    sys.path.insert(0, str(SRC_PATH))
try:
    # enable_verbose_logging is imported for optional use inside the demo.
    from crayon import CrayonVocab
    from crayon.core.vocabulary import enable_verbose_logging
except ImportError:
    # Fail fast with a friendly hint instead of a raw traceback.
    print("❌ Error: CRAYON source not found. Make sure you are running this from the repo root.")
    sys.exit(1)
def run_universal_demo():
    """Walk the full Crayon workflow: init engine, load profile, tokenize, decode.

    Progress is printed to stdout; the function returns early if a stage fails.
    """
    # Uncomment to watch hardware detection as it happens:
    # enable_verbose_logging()
    bar = "=" * 60
    print(bar)
    print("🚀 CRAYON: UNIVERSAL TOKENIZATION DEMO")
    print(bar)

    # --- 2. Initialize Engine ---
    # device="auto" automatically picks CUDA > ROCm > CPU
    print("\n[STEP 1] Initializing Engine (Auto-Detecting Hardware)...")
    try:
        engine = CrayonVocab(device="auto")
        details = engine.get_info()
        hardware = details.get("hardware", {})
        print(f" ✓ Device: {details['device'].upper()}")
        print(f" ✓ Backend: {details['backend']}")
        print(f" ✓ Hardware: {hardware.get('name', 'Unknown')} [{hardware.get('features', '')}]")
    except Exception as err:
        print(f" ❌ Initialization failed: {err}")
        return

    # --- 3. Load Profile ---
    # The 'lite' profile is bundled with the repository.
    print("\n[STEP 2] Loading 'lite' profile...")
    try:
        engine.load_profile("lite")
        print(f" ✓ Profile Loaded: {engine.current_profile_path}")
        print(f" ✓ Vocabulary Size: {engine.vocab_size:,} tokens")
    except Exception as err:
        print(f" ❌ Load failed: {err}")
        print(" (Note: If you haven't built/downloaded the profiles, run train_code_profile.py first)")
        return

    # --- 4. Performance Tokenization ---
    sample = (
        "CRAYON is a hyper-fast tokenizer designed for modern AI. "
        "It supports AVX2 on CPUs, CUDA on NVIDIA, and ROCm on AMD."
    )
    print("\n[STEP 3] Tokenizing Text...")
    print(f" Input: \"{sample[:50]}...\"")
    # tokenize() returns a list of token IDs.
    token_ids = engine.tokenize(sample)
    print(f" Result: {token_ids[:10]}... ({len(token_ids)} tokens)")

    # --- 5. Reconstruction (Decoding) ---
    print("\n[STEP 4] Decoding back to text...")
    try:
        restored = engine.decode(token_ids)
        print(f" Output: \"{restored[:50]}...\"")
        # Normalized comparison: only claim a "perfect" round trip when the
        # decoded text matches modulo surrounding whitespace and case.
        if restored.strip().lower() == sample.strip().lower():
            print("\n✅ SUCCESS: Tokenization and Decoding were perfect!")
        else:
            print("\nℹ️ INFO: Tokenization complete (approximate reconstruction).")
    except Exception as err:
        print(f" ❌ Decode failed: {err}")

    print("\n" + bar)
    print("DEMO COMPLETE - CRAYON IS READY FOR PRODUCTION")
    print(bar)
# Script entry point: run the demo only when executed directly, not on import.
if __name__ == "__main__":
    run_universal_demo()