-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathverify_code_vocab.py
More file actions
48 lines (39 loc) · 1.84 KB
/
verify_code_vocab.py
File metadata and controls
48 lines (39 loc) · 1.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
"""Quick verification of the updated vocabulary with code tokens."""
from crayon import CrayonVocab
# Read the trained vocabulary from its JSON file and report its basic stats.
v = CrayonVocab.from_json('trained_vocab.json')
vocab_size = len(v)
print(f"Vocabulary Size: {vocab_size:,} tokens")
if v._c_ext_available:
    ext_status = 'Enabled'
else:
    ext_status = 'Disabled'
print(f"C-Extension: {ext_status}")
# Code samples covering several languages and libraries.
# Each entry is a (label, source text) pair; multi-line snippets are written
# one source line per literal and joined by implicit string concatenation.
test_cases = [
    (
        "Python",
        "def fibonacci(n: int) -> int:\n"
        " return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
    ),
    (
        "JavaScript",
        "const fetchData = async (url) => { const res = await fetch(url); return res.json(); }",
    ),
    (
        "TypeScript",
        "interface User { id: number; name: string; email: string; }",
    ),
    (
        "Java",
        'public static void main(String[] args) { System.out.println("Hello World"); }',
    ),
    (
        "C++",
        "#include <iostream>\n"
        'int main() { std::cout << "Hello" << std::endl; return 0; }',
    ),
    (
        "Rust",
        'fn main() { let x: i32 = 42; println!("Value: {}", x); }',
    ),
    (
        "Go",
        'func main() { fmt.Println("Hello, World!") }',
    ),
    (
        "NumPy",
        "import numpy as np\n"
        "df = pd.DataFrame(data)",
    ),
]
# Round-trip every sample through tokenize/decode and report whether the
# decoded text matches the input exactly.
banner = "=" * 50
print("\n" + banner)
print("Verification Tests")
print(banner)
for language, snippet in test_cases:
    encoded = v.tokenize(snippet)
    round_trip = v.decode(encoded)
    if round_trip == snippet:
        status = "[OK]"
    else:
        status = "[FAIL]"
    # Keep the preview on one line: cut long samples at 45 characters and
    # show newlines as a literal backslash-n.
    preview = snippet if len(snippet) <= 45 else snippet[:45] + "..."
    preview = preview.replace('\n', '\\n')
    print(f"\n[{language}] {status}")
    print(f" Input: '{preview}'")
    print(f" Tokens: {len(encoded)}")
# Print a short sample of the entries stored at IDs 50000 and above
# (the tokens added after the original 50k), followed by the total size.
rule = "=" * 50
print("\n" + rule)
print("Sample Code Tokens (IDs 50000+)")
print(rule)
print("\nNew code tokens (sample):")
# Show at most 30 entries, and never index past the end of the vocabulary.
sample_end = min(50030, len(v))
for token_id in range(50000, sample_end):
    entry = v.id_to_token[token_id]
    shown = repr(entry)
    if len(shown) >= 30:
        # Overlong representation: show only the first 25 characters.
        shown = repr(entry[:25] + "...")
    print(f" ID {token_id}: {shown}")
print(f"\nTotal vocabulary: {len(v):,} tokens")