-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathverify_dat_engine.py
More file actions
125 lines (100 loc) · 3.44 KB
/
verify_dat_engine.py
File metadata and controls
125 lines (100 loc) · 3.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
XERV CRAYON V2.0 - Production Verification Script
Verifies the DAT engine with actual trained vocabularies.
"""
import sys
import os
import json
# Add paths
sys.path.insert(0, os.path.join(os.getcwd(), "build", "lib.win-amd64-cpython-313"))
sys.path.insert(0, os.path.join(os.getcwd(), "src"))
import time
import tempfile
import mmap
from crayon.c_ext.dat_builder import DATBuilder
from crayon.c_ext import crayon_fast
print("=" * 70)
print("XERV CRAYON V2.0 - HYPER-PRODUCTION DAT ENGINE VERIFICATION")
print("=" * 70)
# Load the trained vocabulary (lite version for speed)
vocab_path = os.path.join(os.getcwd(), "trained_vocab_lite.json")
if not os.path.exists(vocab_path):
# Fallback to full vocab
vocab_path = os.path.join(os.getcwd(), "trained_vocab.json")
print(f"Loading vocabulary from: {vocab_path}")
with open(vocab_path, 'r', encoding='utf-8') as f:
vocab_data = json.load(f)
# Handle both list and dict formats
if isinstance(vocab_data, list):
vocab = vocab_data
elif isinstance(vocab_data, dict):
vocab = [k for k, v in sorted(vocab_data.items(), key=lambda x: x[1])]
else:
raise ValueError("Unknown vocab format")
print(f"Vocabulary Size: {len(vocab):,} tokens")
# Build DAT
# Construct the double-array trie over the full vocabulary in memory.
builder = DATBuilder()
builder.build(vocab)
# Save to temp file
# Persisted so it can be memory-mapped below (zero-copy load path).
dat_path = os.path.join(tempfile.gettempdir(), "trained_vocab.dat")
builder.save(dat_path)
print(f"DAT Nodes: {builder.size:,}")
print(f"DAT File Size: {os.path.getsize(dat_path)/1024:.1f} KB")
# Load via mmap (zero-copy)
# The C++ engine reads directly from the mapped pages, so fh/mm must stay
# open until the engine is detached (see cleanup at the end of the script).
fh = open(dat_path, 'rb')
mm = mmap.mmap(fh.fileno(), 0, access=mmap.ACCESS_READ)
size = crayon_fast.load_dat(mm)
print(f"Loaded into C++ engine: {size:,} nodes")
# Build id_to_token for decoding
# Token ids are positions in the vocab list (id -> token string).
id_to_token = {i: t for i, t in enumerate(vocab)}
# Tokenization smoke test over a few representative inputs
# (prose, technical prose, and code with punctuation/newlines).
test_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning and artificial intelligence are transforming industries.",
    "def hello_world():\n print('Hello, World!')",
]
print("-" * 70)
print("TOKENIZATION SAMPLES:")
print("-" * 70)
for sample in test_texts:
    token_ids = crayon_fast.tokenize(sample)
    # Map the first handful of ids back to token strings; unknown ids
    # render as "[id]".
    preview = [id_to_token.get(tid, f"[{tid}]") for tid in token_ids[:10]]
    if len(sample) > 50:
        print(f"Input: \"{sample[:50]}...\"")
    else:
        print(f"Input: \"{sample}\"")
    print(f"Tokens ({len(token_ids)}): {token_ids[:10]}...")
    print(f"Decoded: {preview}")
    print()
# Benchmark with substantial text (~13k-character sample repeated 5000x).
benchmark_text = " ".join(test_texts) * 5000
# Fix: the original also computed an unused text_size_kb local — removed.
text_size_mb = len(benchmark_text) / (1024 * 1024)
print("=" * 70)
print(f"BENCHMARK: {text_size_mb:.2f} MB of text")
print("=" * 70)
# Warmup
# One small call first so one-time costs (page faults, lazy init in the
# extension) don't skew the timed run.
_ = crayon_fast.tokenize(benchmark_text[:1000])
# Actual benchmark
start = time.perf_counter()
result = crayon_fast.tokenize(benchmark_text)
elapsed = time.perf_counter() - start
tokens_per_sec = len(result) / elapsed
mb_per_sec = text_size_mb / elapsed
print(f"Tokens generated: {len(result):,}")
print(f"Time: {elapsed*1000:.2f} ms")
print(f"Throughput: {tokens_per_sec:,.0f} tokens/sec")
print(f"Throughput: {mb_per_sec:.2f} MB/sec")
print("=" * 70)
# Verdict thresholds for the verification report.
if tokens_per_sec > 1_000_000:
    print("STATUS: ✅ HYPER-PRODUCTION READY (>1M tokens/sec)")
elif tokens_per_sec > 500_000:
    print("STATUS: ✅ PRODUCTION READY (>500K tokens/sec)")
else:
    print("STATUS: ⚠️ Performance below target")
# Cleanup
# Detach the C++ engine from the mmap'd pages by loading a tiny inline DAT,
# so closing/unlinking the mapping below is safe.
# NOTE(review): assumes the 12-byte payload is a valid minimal header
# (b'CRAY' magic + two LE u32 fields) — confirm against dat_builder's format.
try:
    crayon_fast.load_dat(b'CRAY' + b'\x02\x00\x00\x00' + b'\x00\x00\x00\x00')
except Exception:
    # Fix: was a bare `except:` (also swallowed SystemExit/KeyboardInterrupt).
    # Detach is best-effort; failure just means mm.close() may complain.
    pass
mm.close()
fh.close()
os.unlink(dat_path)