ok

Electroiscoding · Electroiscoding · commit 84a86a821fa6 · 2026-01-28T23:49:51.000+05:30
diff --git a/simple_demo.py b/simple_demo.py
@@ -0,0 +1,47 @@
+from crayon import CrayonVocab
+
+def main():
+    print("Crayon Tokenizer Demo")
+    print("=======================\n")
+
+    # 1. Initialize & Load Profile
+    # 'auto' will use GPU if available, else CPU
+    vocab = CrayonVocab(device="auto")
+    vocab.load_profile("lite") 
+    print(f"Loaded Profile: 'lite' on {vocab.device.upper()}")
+
+    # 2. Define Input Text
+    text = "Hello, Crayon! This is a simple test."
+
+    # 3. Tokenize
+    # This converts the string into a list of integer IDs
+    tokens = vocab.tokenize(text)
+
+    print(f"\nInput Text:  '{text}'")
+    print(f"Token IDs:   {tokens}")
+    print(f"Count:       {len(tokens)} tokens\n")
+
+    # 4. Analyze Each Token
+    # We decode each ID individually to show exactly what substring it represents
+    print("Token Breakdown:")
+    print(f"{'ID':<8} | {'Substring':<20}")
+    print("-" * 30)
+
+    for tid in tokens:
+        # We pass a list [tid] because decode expects a sequence
+        substring = vocab.decode([tid])
+        print(f"{tid:<8} | '{substring}'")
+
+    # 5. Full Decode
+    # Convert the list of IDs back to the original string
+    decoded_text = vocab.decode(tokens)
+    print(f"\nFull Decode check: '{decoded_text}'")
+    
+    # Verification
+    if text == decoded_text:
+        print("[MATCH] Exact Match!")
+    else:
+        print("[MISMATCH] Mismatch (canonicalization might differ)")
+
+if __name__ == "__main__":
+    main()