Skip to content

Commit 84a86a8

Browse files
ok
1 parent 24500f1 commit 84a86a8

File tree

1 file changed

+47
-0
lines changed

1 file changed

+47
-0
lines changed

simple_demo.py

Lines changed: 47 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,47 @@
1+
from crayon import CrayonVocab
2+
3+
def main():
    """Run the Crayon tokenizer round-trip demo and print the results.

    Builds a ``CrayonVocab`` with ``device="auto"``, loads the "lite"
    profile, tokenizes a fixed sample sentence, prints every token ID next
    to the text it decodes to, and finally decodes the whole ID sequence to
    report whether it matches the input exactly.
    """
    # Banner (two literal lines, emitted in one call).
    print("Crayon Tokenizer Demo", "=======================\n", sep="\n")

    # Step 1 - build the vocabulary and load a profile.
    # Per the original author's note, "auto" uses the GPU if available,
    # otherwise the CPU.
    vocab = CrayonVocab(device="auto")
    vocab.load_profile("lite")
    print(f"Loaded Profile: 'lite' on {vocab.device.upper()}")

    # Step 2 - the sentence we are going to encode.
    sample = "Hello, Crayon! This is a simple test."

    # Step 3 - encode the sentence into token IDs.
    token_ids = vocab.tokenize(sample)

    print(f"\nInput Text: '{sample}'")
    print(f"Token IDs: {token_ids}")
    print(f"Count: {len(token_ids)} tokens\n")

    # Step 4 - per-token breakdown: title, column header, separator rule.
    print(
        "Token Breakdown:",
        f"{'ID':<8} | {'Substring':<20}",
        "-" * 30,
        sep="\n",
    )

    for token_id in token_ids:
        # decode() is handed a one-element list so that each ID is decoded
        # on its own, showing which piece of text it stands for.
        piece = vocab.decode([token_id])
        print(f"{token_id:<8} | '{piece}'")

    # Step 5 - decode the complete ID sequence back into text.
    round_trip = vocab.decode(token_ids)
    print(f"\nFull Decode check: '{round_trip}'")

    # Verification: does decoding reproduce the input exactly?
    verdict = (
        "[MATCH] Exact Match!"
        if sample == round_trip
        else "[MISMATCH] Mismatch (canonicalization might differ)"
    )
    print(verdict)
45+
46+
if __name__ == "__main__":
    # Only run the demo when this file is executed as a script,
    # not when it is imported as a module.
    main()

0 commit comments

Comments
 (0)