|
| 1 | +from crayon import CrayonVocab |
| 2 | + |
def main():
    """Run a small end-to-end demonstration of the Crayon tokenizer.

    Loads the 'lite' profile, tokenizes one fixed sample sentence, prints
    the resulting IDs alongside the text each ID decodes to, and finally
    reports whether decoding the whole ID list reproduces the input exactly.
    Everything is written to stdout; nothing is returned.
    """
    print("Crayon Tokenizer Demo")
    print("=======================\n")

    # Step 1 - build the vocabulary and load a profile.
    # NOTE(review): device="auto" is expected to pick GPU when available and
    # fall back to CPU otherwise - confirm against the crayon documentation.
    profile_name = "lite"
    vocab = CrayonVocab(device="auto")
    vocab.load_profile(profile_name)
    print(f"Loaded Profile: '{profile_name}' on {vocab.device.upper()}")

    # Step 2 - the sample sentence to round-trip.
    text = "Hello, Crayon! This is a simple test."

    # Step 3 - turn the sentence into token IDs.
    token_ids = vocab.tokenize(text)
    token_count = len(token_ids)

    print(f"\nInput Text: '{text}'")
    print(f"Token IDs: {token_ids}")
    print(f"Count: {token_count} tokens\n")

    # Step 4 - show which piece of text each individual ID stands for.
    print("Token Breakdown:")
    print(f"{'ID':<8} | {'Substring':<20}")
    print("-" * 30)

    for token_id in token_ids:
        # decode is handed a one-element list so a single ID can be rendered
        # on its own row.
        print(f"{token_id:<8} | '{vocab.decode([token_id])}'")

    # Step 5 - decode the complete ID list back into text.
    decoded_text = vocab.decode(token_ids)
    print(f"\nFull Decode check: '{decoded_text}'")

    # Round-trip verdict: an inexact result is reported, not treated as an error.
    verdict = (
        "[MATCH] Exact Match!"
        if text == decoded_text
        else "[MISMATCH] Mismatch (canonicalization might differ)"
    )
    print(verdict)
| 45 | + |
# Run the demo only when this file is executed as a script, so importing the
# module does not trigger tokenizer loading or any output.
if __name__ == "__main__":
    main()
0 commit comments