Skip to content

Commit a73844f

Browse files
Create streamlit_app.py
1 parent c456090 commit a73844f

File tree

1 file changed

+59
-0
lines changed

1 file changed

+59
-0
lines changed

streamlit_app.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
2+
import html

import streamlit as st

from crayon import CrayonVocab
4+
5+
@st.cache_resource
def load_crayon_vocab_cached():
    """Build, load, and memoize the CRAYON vocabulary.

    The ``st.cache_resource`` decorator makes Streamlit construct the
    vocabulary once per server process and hand the same object to every
    rerun/session, so the (presumably expensive) ``load_profile`` call
    does not repeat on each UI interaction.

    Returns:
        CrayonVocab: a vocabulary with the "lite" profile loaded,
        constructed with ``device="auto"``.
    """
    tokenizer = CrayonVocab(device="auto")
    tokenizer.load_profile("lite")
    return tokenizer
10+
11+
# --- Page setup ------------------------------------------------------------
# FIX: st.set_page_config must be the FIRST Streamlit command executed in the
# script; the original called load_crayon_vocab_cached() (a Streamlit-managed
# cached resource) before it, which Streamlit can reject with a
# StreamlitAPIException. Configure the page first, then load the vocabulary.
st.set_page_config(page_title='CRAYON Tokenizer Demo', layout='wide')

vocab = load_crayon_vocab_cached()

st.title('CRAYON Tokenizer Demonstration')

default_text = "CRAYON is a hyper-fast tokenizer designed for modern AI. It offers unparalleled speed and efficiency in processing large volumes of text data."
user_text = st.text_area('Enter text here:', default_text, height=200)

if user_text:
    # Tokenize once; decode each id individually so every highlighted chunk
    # below corresponds to exactly one token id.
    tokens_ids = vocab.tokenize(user_text)
    decoded_tokens = [vocab.decode([token_id]) for token_id in tokens_ids]

    # Word count ignores whitespace-only fragments; token count is simply the
    # number of ids the tokenizer produced.
    word_count = len([word for word in user_text.split() if word.strip()])
    token_count = len(tokens_ids)

    # Rotate through translucent background colors so adjacent tokens stay
    # visually distinguishable.
    colors = ["rgba(173, 216, 230, 0.4)", "rgba(144, 238, 144, 0.4)", "rgba(255, 255, 153, 0.4)", "rgba(255, 192, 203, 0.4)"]
    highlighted_tokens_html = []
    for i, token in enumerate(decoded_tokens):
        color = colors[i % len(colors)]
        # FIX: escape the decoded token before embedding it in HTML that is
        # rendered with unsafe_allow_html=True. Without html.escape, user
        # input containing '<', '>' or '&' would be interpreted as markup
        # (broken rendering / HTML injection into the page).
        highlighted_tokens_html.append(
            f"<span style='background-color: {color}; padding: 2px; margin: 0 1px;'>{html.escape(token)}</span>"
        )

    display_tokens_html = "".join(highlighted_tokens_html)

    st.subheader('Tokenized Text')
    st.markdown(f"<div style='border: 1px solid #ccc; padding: 10px; border-radius: 5px;'>{display_tokens_html}</div>", unsafe_allow_html=True)

    st.subheader('Word Count')
    st.write(word_count)

    st.subheader('Token Count')
    st.write(token_count)

    # Collapsible per-token breakdown: raw id list plus an id->decoded-part
    # line for each token.
    with st.expander("Show Detailed Token Information (IDs and Decoded Parts)"):
        st.write("--- ")
        st.markdown("### Token IDs:")
        st.code(str(tokens_ids))
        st.markdown("### Decoded Token Parts:")
        for i, token in enumerate(decoded_tokens):
            st.markdown(f"- ID: {tokens_ids[i]}, Part: `{token}`")
        st.write("--- ")

else:
    # Empty input: render the same section headings with zero/placeholder
    # values so the page layout stays stable.
    st.subheader('Tokenized Text')
    st.write("Please enter some text to tokenize.")
    st.subheader('Word Count')
    st.write("0")
    st.subheader('Token Count')
    st.write("0")

0 commit comments

Comments
 (0)