|
1 | 1 | """ |
2 | | -XERV CRAYON V4.2.3 - Omni-Backend Tokenizer |
| 2 | +XERV CRAYON V4.2.4 - Omni-Backend Tokenizer |
3 | 3 | ============================================= |
4 | 4 | Copy this ENTIRE script into a Google Colab cell and run it. |
5 | 5 |
|
|
13 | 13 | import time |
14 | 14 |
|
15 | 15 | print("=" * 70) |
16 | | -print("XERV CRAYON INSTALLATION V4.2.3") |
| 16 | +print("XERV CRAYON INSTALLATION V4.2.4") |
17 | 17 | print("=" * 70) |
18 | 18 |
|
19 | 19 | # Step 1: GPU Detection |
20 | | -print("\n[1/6] Detecting GPU hardware...") |
| 20 | +print("\n[1/7] Detecting GPU hardware...") |
21 | 21 | try: |
22 | 22 | result = subprocess.run(["nvidia-smi", "--query-gpu=name,compute_cap", "--format=csv,noheader"], |
23 | 23 | capture_output=True, text=True, timeout=10) |
|
33 | 33 | has_gpu = False |
34 | 34 |
|
35 | 35 | # Step 2: NVCC Detection |
36 | | -print("\n[2/6] Checking CUDA compiler...") |
| 36 | +print("\n[2/7] Checking CUDA compiler...") |
37 | 37 | nvcc_check = subprocess.run(["which", "nvcc"], capture_output=True, text=True) |
38 | 38 | if nvcc_check.returncode == 0: |
39 | 39 | nvcc_path = nvcc_check.stdout.strip() |
40 | 40 | print(f" NVCC: {nvcc_path}") |
41 | | - nvcc_v = subprocess.run([nvcc_path, "--version"], capture_output=True, text=True) |
42 | | - for line in nvcc_v.stdout.split("\n"): |
43 | | - if "release" in line.lower(): |
44 | | - print(f" {line.strip()}") |
45 | 41 | has_nvcc = True |
46 | 42 | else: |
47 | 43 | print(" NVCC not found") |
48 | 44 | has_nvcc = False |
49 | 45 |
|
50 | | -# Step 3: Clean ALL Caches |
51 | | -print("\n[3/6] Cleaning ALL caches...") |
| 46 | +# Step 3: Ensure PyTorch is installed (required for CUDAExtension) |
| 47 | +print("\n[3/7] Checking PyTorch...") |
| 48 | +try: |
| 49 | + import torch |
| 50 | + print(f" PyTorch {torch.__version__}") |
| 51 | + print(f" CUDA available: {torch.cuda.is_available()}") |
| 52 | +except ImportError: |
| 53 | + print(" Installing PyTorch...") |
| 54 | + subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "torch"]) |
| 55 | + import torch |
| 56 | + print(f" PyTorch {torch.__version__} installed") |
| 57 | + |
| 58 | +# Step 4: Clean ALL Caches |
| 59 | +print("\n[4/7] Cleaning ALL caches...") |
52 | 60 | os.system("pip uninstall -y xerv-crayon crayon 2>/dev/null") |
53 | 61 | os.system("pip cache purge 2>/dev/null") |
54 | | -os.system("rm -rf /tmp/crayon /tmp/crayon_build ~/.cache/pip 2>/dev/null") |
| 62 | +os.system("rm -rf /tmp/crayon* ~/.cache/pip 2>/dev/null") |
55 | 63 | print(" Done") |
56 | 64 |
|
57 | | -# Step 4: Fresh Clone with timestamp to avoid caching |
58 | | -print("\n[4/6] Cloning from GitHub (fresh)...") |
| 65 | +# Step 5: Fresh Clone |
| 66 | +print("\n[5/7] Cloning from GitHub...") |
59 | 67 | timestamp = int(time.time()) |
60 | 68 | clone_dir = f"/tmp/crayon_{timestamp}" |
61 | 69 | os.system(f"git clone --depth 1 https://github.com/Electroiscoding/CRAYON.git {clone_dir}") |
62 | 70 |
|
63 | | -# Verify version in cloned repo |
64 | | -version_check = subprocess.run(["grep", "__version__", f"{clone_dir}/src/crayon/__init__.py"], |
| 71 | +version_check = subprocess.run(["grep", "-m1", "__version__", f"{clone_dir}/src/crayon/__init__.py"], |
65 | 72 | capture_output=True, text=True) |
66 | | -print(f" Cloned version: {version_check.stdout.strip()}") |
| 73 | +print(f" {version_check.stdout.strip()}") |
67 | 74 |
|
68 | | -# Step 5: Install with verbose output and no cache |
69 | | -print("\n[5/6] Building and installing...") |
| 75 | +# Step 6: Build and Install |
| 76 | +print("\n[6/7] Building with CUDA support (this takes ~2 min)...") |
70 | 77 | print("-" * 70) |
71 | 78 |
|
| 79 | +env = os.environ.copy() |
| 80 | +env["CUDA_HOME"] = "/usr/local/cuda" |
| 81 | + |
72 | 82 | result = subprocess.run( |
73 | 83 | [sys.executable, "-m", "pip", "install", "-v", "--no-cache-dir", "--no-build-isolation", clone_dir], |
74 | | - env={**os.environ, "CUDA_HOME": "/usr/local/cuda"} |
| 84 | + env=env |
75 | 85 | ) |
76 | 86 |
|
77 | 87 | print("-" * 70) |
78 | 88 |
|
79 | | -# Step 6: Verify Installation |
80 | | -print("\n[6/6] Verifying installation...") |
| 89 | +# Step 7: Verify |
| 90 | +print("\n[7/7] Verifying installation...") |
81 | 91 |
|
82 | | -# Force reimport |
83 | | -if "crayon" in sys.modules: |
84 | | - del sys.modules["crayon"] |
85 | 92 | for key in list(sys.modules.keys()): |
86 | | - if key.startswith("crayon"): |
| 93 | + if "crayon" in key: |
87 | 94 | del sys.modules[key] |
88 | 95 |
|
89 | 96 | import crayon |
90 | | -print(f"\n Crayon Version: {crayon.get_version()}") |
| 97 | +print(f"\n Version: {crayon.get_version()}") |
91 | 98 | backends = crayon.check_backends() |
92 | 99 | print(f" Backends: {backends}") |
93 | 100 |
|
94 | 101 | if backends.get("cuda"): |
95 | | - print(" CUDA backend: READY") |
| 102 | + print(" CUDA: READY", "\u2705") |
96 | 103 | elif has_gpu and has_nvcc: |
97 | | - print("\n WARNING: GPU + NVCC detected but CUDA backend not available!") |
98 | | - print(" Check the build output above for errors.") |
| 104 | + print(" WARNING: GPU detected but CUDA not compiled!") |
| 105 | + print(" Check build output above for nvcc errors") |
99 | 106 |
|
100 | 107 | print("\n" + "=" * 70) |
101 | | -print("INITIALIZATION") |
| 108 | +print("TOKENIZER TEST") |
102 | 109 | print("=" * 70) |
103 | 110 |
|
104 | 111 | from crayon import CrayonVocab |
|
107 | 114 | vocab.load_profile("lite") |
108 | 115 |
|
109 | 116 | info = vocab.get_info() |
110 | | -print(f"\nActive Device: {info['device'].upper()}") |
| 117 | +print(f"\nDevice: {info['device'].upper()}") |
111 | 118 | print(f"Backend: {info['backend']}") |
112 | 119 | print(f"Vocabulary: {vocab.vocab_size:,} tokens") |
113 | 120 |
|
114 | | -# Quick test |
115 | | -text = "Hello, Crayon tokenizer!" |
| 121 | +text = "Hello, Crayon!" |
116 | 122 | tokens = vocab.tokenize(text) |
117 | | -print(f"\nTest: '{text}' -> {len(tokens)} tokens") |
| 123 | +print(f"\nTest: '{text}' -> {tokens}") |
118 | 124 |
|
119 | 125 | print("\n" + "=" * 70) |
120 | | -print("BENCHMARKS") |
| 126 | +print("BENCHMARKS") |
121 | 127 | print("=" * 70) |
122 | 128 |
|
123 | | -import time |
124 | | - |
125 | 129 | base_text = "The quick brown fox jumps over the lazy dog." |
126 | 130 |
|
127 | | -print("\n--- Batch Throughput ---") |
| 131 | +print("\n--- Throughput ---") |
128 | 132 | for batch_size in [1000, 10000, 50000]: |
129 | 133 | batch = [base_text] * batch_size |
130 | 134 | vocab.tokenize(batch[:10]) |
|
135 | 139 | print(f"{batch_size:>8}: {batch_size/duration:>12,.0f} docs/sec | {total_tokens/duration:>14,.0f} tokens/sec") |
136 | 140 |
|
137 | 141 | if vocab.device != "cpu": |
138 | | - print(f"\n--- GPU Stress Test ({vocab.device.upper()}) ---") |
| 142 | + print(f"\n--- GPU Stress Test ---") |
139 | 143 | for batch_size in [100000, 500000]: |
140 | 144 | batch = [base_text] * batch_size |
141 | 145 | start = time.time() |
142 | 146 | results = vocab.tokenize(batch) |
143 | 147 | duration = time.time() - start |
144 | | - total_tokens = sum(len(r) for r in results) |
145 | 148 | print(f"{batch_size:>8}: {batch_size/duration:>12,.0f} docs/sec in {duration:.3f}s") |
146 | 149 |
|
147 | 150 | vocab.close() |
148 | | -print("\nDone!") |
| 151 | +print("\n" + "=" * 70) |
| 152 | +print("DONE!") |
| 153 | +print("=" * 70) |
0 commit comments