Skip to content

Commit 5274d65

Browse files
Fix Colab caching: timestamped clone dir, cache purge, CUDA_HOME
1 parent 0af1374 commit 5274d65

File tree

1 file changed

+63
-103
lines changed

1 file changed

+63
-103
lines changed

Crayon_Colab_Notebook.py

Lines changed: 63 additions & 103 deletions
Original file line numberDiff line numberDiff line change
@@ -7,19 +7,17 @@
77
Runtime -> Change runtime type -> GPU (T4/V100/A100)
88
"""
99

10-
# ═══════════════════════════════════════════════════════════════════════════════
11-
# CELL 1: CLEAN INSTALL FROM SOURCE (FORCES CUDA COMPILATION)
12-
# ═══════════════════════════════════════════════════════════════════════════════
13-
1410
# Cell 1 setup: stdlib-only imports, then the installation banner.
import subprocess
import sys
import os
import time

# Reuse one ruler string for the banner instead of rebuilding it.
ruler = "=" * 70
print(ruler)
print("XERV CRAYON INSTALLATION V4.2.3")
print(ruler)
2118

22-
print("\n[1/5] Detecting GPU hardware...")
19+
# Step 1: GPU Detection
20+
print("\n[1/6] Detecting GPU hardware...")
2321
try:
2422
result = subprocess.run(["nvidia-smi", "--query-gpu=name,compute_cap", "--format=csv,noheader"],
2523
capture_output=True, text=True, timeout=10)
@@ -34,120 +32,107 @@
3432
print(" No NVIDIA GPU detected")
3533
has_gpu = False
3634

37-
print("\n[2/5] Checking CUDA compiler...")
35+
# Step 2: NVCC Detection
36+
print("\n[2/6] Checking CUDA compiler...")
3837
nvcc_check = subprocess.run(["which", "nvcc"], capture_output=True, text=True)
3938
if nvcc_check.returncode == 0:
4039
nvcc_path = nvcc_check.stdout.strip()
41-
print(f" NVCC found: {nvcc_path}")
42-
nvcc_version = subprocess.run([nvcc_path, "--version"], capture_output=True, text=True)
43-
for line in nvcc_version.stdout.split("\n"):
40+
print(f" NVCC: {nvcc_path}")
41+
nvcc_v = subprocess.run([nvcc_path, "--version"], capture_output=True, text=True)
42+
for line in nvcc_v.stdout.split("\n"):
4443
if "release" in line.lower():
4544
print(f" {line.strip()}")
45+
has_nvcc = True
4646
else:
47-
print(" NVCC not found - CUDA backend will not be available")
47+
print(" NVCC not found")
48+
has_nvcc = False
4849

49-
print("\n[3/5] Removing old installations...")
50+
# Step 3: Clean ALL Caches
51+
print("\n[3/6] Cleaning ALL caches...")
5052
os.system("pip uninstall -y xerv-crayon crayon 2>/dev/null")
51-
os.system("rm -rf /tmp/crayon ~/.cache/pip/wheels/*crayon* 2>/dev/null")
52-
53-
print("\n[4/5] Cloning latest source from GitHub...")
54-
os.system("rm -rf /tmp/crayon")
55-
clone_result = os.system("git clone --depth 1 https://github.com/Electroiscoding/CRAYON.git /tmp/crayon")
56-
if clone_result != 0:
57-
print(" ERROR: Git clone failed!")
58-
sys.exit(1)
59-
60-
print("\n[5/5] Building and installing (with CUDA compilation)...")
61-
print(" This may take 1-2 minutes on first run...")
53+
os.system("pip cache purge 2>/dev/null")
54+
os.system("rm -rf /tmp/crayon /tmp/crayon_build ~/.cache/pip 2>/dev/null")
55+
print(" Done")
56+
57+
# Step 4: clone into a unique timestamp-named directory so no previous
# checkout can ever be reused by later steps.
print("\n[4/6] Cloning from GitHub (fresh)...")
timestamp = int(time.time())
clone_dir = f"/tmp/crayon_{timestamp}"
clone_result = os.system(
    f"git clone --depth 1 https://github.com/Electroiscoding/CRAYON.git {clone_dir}"
)
if clone_result != 0:
    # Fail fast: without this check a failed clone would fall through to
    # grep/pip-install against a directory that does not exist.
    print(" ERROR: Git clone failed!")
    sys.exit(1)

# Sanity-check: show the __version__ line from the fresh checkout so the
# user can confirm the expected release was actually cloned.
version_check = subprocess.run(
    ["grep", "__version__", f"{clone_dir}/src/crayon/__init__.py"],
    capture_output=True, text=True,
)
print(f" Cloned version: {version_check.stdout.strip()}")
67+
68+
# Step 5: build + install from the fresh clone.
# --no-cache-dir defeats pip's wheel cache, --no-build-isolation reuses the
# Colab environment's toolchain, and CUDA_HOME points the extension build at
# the Colab CUDA toolkit.
print("\n[5/6] Building and installing...")
print("-" * 70)

result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-v", "--no-cache-dir", "--no-build-isolation", clone_dir],
    env={**os.environ, "CUDA_HOME": "/usr/local/cuda"},
)

print("-" * 70)

if result.returncode != 0:
    # Abort rather than letting step 6 "verify" a half-installed package.
    print("\nERROR: Installation failed!")
    sys.exit(1)
79+
# Step 6: verify the install by (re)importing the freshly built package.
print("\n[6/6] Verifying installation...")

# Evict every cached crayon module so the import below loads the new build.
# The startswith loop already removes the top-level "crayon" entry itself,
# so no separate `del sys.modules["crayon"]` is needed beforehand.
for key in list(sys.modules):
    if key.startswith("crayon"):
        del sys.modules[key]

import crayon
print(f"\n Crayon Version: {crayon.get_version()}")
backends = crayon.check_backends()
print(f" Backends: {backends}")

if backends.get("cuda"):
    print(" CUDA backend: READY")
elif has_gpu and has_nvcc:
    # Hardware and compiler are both present, so a missing CUDA backend
    # means the extension build failed — point the user at the build log.
    print("\n WARNING: GPU + NVCC detected but CUDA backend not available!")
    print(" Check the build output above for errors.")
9399

94100
print("\n" + "=" * 70)
95-
print("TOKENIZER TEST")
101+
print("INITIALIZATION")
96102
print("=" * 70)
97103

104+
from crayon import CrayonVocab
105+
98106
vocab = CrayonVocab(device="auto")
99107
vocab.load_profile("lite")
100108

101109
info = vocab.get_info()
102110
print(f"\nActive Device: {info['device'].upper()}")
103111
print(f"Backend: {info['backend']}")
104-
print(f"Vocabulary Size: {vocab.vocab_size:,} tokens")
112+
print(f"Vocabulary: {vocab.vocab_size:,} tokens")
105113

106-
text = "Hello, world! Crayon is a high-performance tokenizer."
114+
# Quick test
115+
text = "Hello, Crayon tokenizer!"
107116
tokens = vocab.tokenize(text)
108-
print(f"\nTest Input: {text}")
109-
print(f"Tokens: {tokens}")
110-
print(f"Token Count: {len(tokens)}")
111-
112-
# ═══════════════════════════════════════════════════════════════════════════════
113-
# CELL 3: BENCHMARKS
114-
# ═══════════════════════════════════════════════════════════════════════════════
115-
116-
import time
117+
print(f"\nTest: '{text}' -> {len(tokens)} tokens")
117118

118119
print("\n" + "=" * 70)
119-
print("PERFORMANCE BENCHMARKS")
120+
print("BENCHMARKS")
120121
print("=" * 70)
121122

123+
import time
124+
122125
base_text = "The quick brown fox jumps over the lazy dog."
123126

124-
print("\n--- Latency Test (Single String) ---")
125-
iterations = 10000
126-
for _ in range(100):
127-
vocab.tokenize(base_text)
128-
start = time.perf_counter()
129-
for _ in range(iterations):
130-
vocab.tokenize(base_text)
131-
elapsed = time.perf_counter() - start
132-
print(f"Latency: {(elapsed/iterations)*1e6:.2f} microseconds/call")
133-
print(f"Throughput: {iterations/elapsed:,.0f} calls/second")
134-
135-
print("\n--- Batch Throughput Test ---")
136-
print(f"{'Batch Size':>12} | {'Docs/sec':>14} | {'Tokens/sec':>16}")
137-
print("-" * 50)
138-
139-
for batch_size in [100, 1000, 10000, 50000]:
127+
print("\n--- Batch Throughput ---")
128+
for batch_size in [1000, 10000, 50000]:
140129
batch = [base_text] * batch_size
141130
vocab.tokenize(batch[:10])
142-
143131
start = time.time()
144132
results = vocab.tokenize(batch)
145133
duration = time.time() - start
146-
147134
total_tokens = sum(len(r) for r in results)
148-
docs_sec = batch_size / duration
149-
toks_sec = total_tokens / duration
150-
print(f"{batch_size:>12,} | {docs_sec:>14,.0f} | {toks_sec:>16,.0f}")
135+
print(f"{batch_size:>8}: {batch_size/duration:>12,.0f} docs/sec | {total_tokens/duration:>14,.0f} tokens/sec")
151136

152137
if vocab.device != "cpu":
153138
print(f"\n--- GPU Stress Test ({vocab.device.upper()}) ---")
@@ -157,32 +142,7 @@
157142
results = vocab.tokenize(batch)
158143
duration = time.time() - start
159144
total_tokens = sum(len(r) for r in results)
160-
print(f"{batch_size:>12,} docs in {duration:.3f}s = {batch_size/duration:,.0f} docs/sec, {total_tokens/duration:,.0f} tokens/sec")
161-
162-
# ═══════════════════════════════════════════════════════════════════════════════
163-
# CELL 4: ROUND-TRIP VERIFICATION
164-
# ═══════════════════════════════════════════════════════════════════════════════
165-
166-
print("\n" + "=" * 70)
167-
print("ENCODE/DECODE VERIFICATION")
168-
print("=" * 70)
169-
170-
test_strings = [
171-
"Hello, Crayon!",
172-
"The quick brown fox jumps over the lazy dog.",
173-
"def forward(self, x): return torch.relu(x)",
174-
]
175-
176-
all_passed = True
177-
for s in test_strings:
178-
tokens = vocab.tokenize(s)
179-
decoded = vocab.decode(tokens)
180-
passed = s == decoded
181-
all_passed = all_passed and passed
182-
status = "PASS" if passed else "FAIL"
183-
print(f"[{status}] '{s[:40]}...' -> {len(tokens)} tokens")
184-
185-
print(f"\nAll tests passed: {all_passed}")
145+
print(f"{batch_size:>8}: {batch_size/duration:>12,.0f} docs/sec in {duration:.3f}s")
186146

187147
# Release the tokenizer's resources before the notebook cell finishes.
vocab.close()
print("\nDone!")

0 commit comments

Comments
 (0)