Skip to content

Commit c85b17f

Browse files
Update streamlit_app.py
1 parent a73844f commit c85b17f

File tree

1 file changed

+55
-57
lines changed

1 file changed

+55
-57
lines changed

streamlit_app.py

Lines changed: 55 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -1,59 +1,57 @@
import streamlit as st
from crayon import CrayonVocab


# Cached loader: st.cache_resource keeps one CrayonVocab alive across
# Streamlit reruns/sessions so the profile is loaded only once per process.
@st.cache_resource
def load_crayon_vocab_cached():
    """Build a CrayonVocab on auto-selected device and load the 'lite' profile."""
    vocab = CrayonVocab(device="auto")
    vocab.load_profile("lite")
    return vocab


vocab = load_crayon_vocab_cached()

st.set_page_config(page_title='CRAYON Tokenizer Demo', layout='wide')

st.title('CRAYON Tokenizer Demonstration')

default_text = "CRAYON is a hyper-fast tokenizer designed for modern AI. It offers unparalleled speed and efficiency in processing large volumes of text data."
user_text = st.text_area('Enter text here:', default_text, height=200)

if user_text:
    # Tokenize the whole input, then decode each ID individually so every
    # token's substring can be highlighted separately below.
    tokens_ids = vocab.tokenize(user_text)
    decoded_tokens = [vocab.decode([token_id]) for token_id in tokens_ids]

    # Whitespace-split word count vs. tokenizer token count.
    word_count = len([word for word in user_text.split() if word.strip()])
    token_count = len(tokens_ids)

    # Cycle through four translucent background colors so adjacent tokens
    # are visually distinguishable in the rendered HTML.
    colors = ["rgba(173, 216, 230, 0.4)", "rgba(144, 238, 144, 0.4)", "rgba(255, 255, 153, 0.4)", "rgba(255, 192, 203, 0.4)"]
    highlighted_tokens_html = []
    for i, token in enumerate(decoded_tokens):
        color = colors[i % len(colors)]
        # NOTE(review): token text is interpolated into HTML unescaped;
        # presumably safe for demo input, but '<'/'>' in tokens would break markup.
        highlighted_tokens_html.append(f"<span style='background-color: {color}; padding: 2px; margin: 0 1px;'>{token}</span>")

    display_tokens_html = "".join(highlighted_tokens_html)

    st.subheader('Tokenized Text')
    # unsafe_allow_html is required for the inline-styled spans to render.
    st.markdown(f"<div style='border: 1px solid #ccc; padding: 10px; border-radius: 5px;'>{display_tokens_html}</div>", unsafe_allow_html=True)

    st.subheader('Word Count')
    st.write(word_count)

    st.subheader('Token Count')
    st.write(token_count)

    # Collapsible per-token detail: raw ID list plus ID -> substring pairs.
    with st.expander("Show Detailed Token Information (IDs and Decoded Parts)"):
        st.write("--- ")
        st.markdown("### Token IDs:")
        st.code(str(tokens_ids))
        st.markdown("### Decoded Token Parts:")
        for i, token in enumerate(decoded_tokens):
            st.markdown(f"- ID: {tokens_ids[i]}, Part: `{token}`")
        st.write("--- ")

else:
    # Empty input: render the same sections with placeholder values.
    st.subheader('Tokenized Text')
    st.write("Please enter some text to tokenize.")
    st.subheader('Word Count')
    st.write("0")
    st.subheader('Token Count')
    st.write("0")
import time

st.set_page_config(page_title="CRAYON Tokenizer Demo", layout="wide")

st.title("🖍️ CRAYON Tokenizer Demo")
st.markdown("Interactive tokenization with CRAYON—the hyper-fast specialized tokenizer.")

# Initialize session state.
# Build the vocab in a local variable and publish it to session_state only
# AFTER load_profile() succeeds. Assigning st.session_state.vocab before
# loading (as a naive version would) caches a half-initialized object when
# load_profile() raises: every rerun then sees "vocab" present, skips this
# branch, and uses the broken vocab.
if "vocab" not in st.session_state:
    with st.spinner("Loading vocabulary profile..."):
        _vocab = CrayonVocab(device="cpu")  # Use CPU for cloud compatibility
        _vocab.load_profile("lite")
        st.session_state.vocab = _vocab
    st.success("✓ Profile loaded!")

vocab = st.session_state.vocab

# User input
st.subheader("Input Text")
text_input = st.text_area(
    "Enter text to tokenize:",
    value="Hello, CRAYON! This is a production-grade tokenizer.",
    height=100,
)

if text_input:
    # Tokenize, timing only the tokenize() call (reported in milliseconds).
    start = time.perf_counter()
    tokens = vocab.tokenize(text_input)
    elapsed = (time.perf_counter() - start) * 1000

    # Decode the full token sequence back to text in one call.
    decoded = vocab.decode(tokens)

    # Display results side by side: raw IDs on the left, metrics on the right.
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Tokens")
        st.code(str(tokens), language="python")

    with col2:
        st.subheader("Statistics")
        st.metric("Token Count", len(tokens))
        st.metric("Processing Time", f"{elapsed:.3f}ms")

    st.subheader("Decoded Output")
    st.write(decoded)

    # Per-token breakdown: ID -> decoded substring, one row per token.
    # NOTE(review): st.write renders markdown, which collapses the padded
    # spaces; the columns will not align unless switched to st.text — kept
    # as-is to preserve current behavior.
    with st.expander("📋 Token Breakdown"):
        st.write(f"{'ID':<8} | {'Substring':<20}")
        st.write("-" * 30)
        for tid in tokens:
            substring = vocab.decode([tid])
            st.write(f"{tid:<8} | '{substring}'")

0 commit comments

Comments
 (0)