Skip to content

Commit 7549959

Browse files
Add files via upload
1 parent dacec08 commit 7549959

File tree

7 files changed

+478364
-0
lines changed

7 files changed

+478364
-0
lines changed

mtokenizer/index.html

Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
6+
<title>Mistral Transformers Tokenizers</title>
7+
<style>
8+
/* Page shell: black background, content stacked and centered in the viewport. */
body {
  font-family: Arial, sans-serif;
  display: flex;
  justify-content: center;
  align-items: center;
  height: 100vh;
  margin: 0;
  background-color: #000;
  color: #fff;
  width: 100%;
  flex-direction: column;
}

/* The contenteditable region the user types into. */
.input-container {
  padding: 10px;
  font-size: 14px;
  min-width: 30vw;
  max-width: 60vw;
  box-sizing: border-box;
  min-height: 40px;
  outline: none;
  white-space: pre-wrap;
  word-wrap: break-word;
  background-color: #1e1e1e;
  color: #fff;
  border-radius: 5px;
  overflow-wrap: break-word;
  line-height: 1.5;
}

/* Replacement focus indicator for the default outline removed above, so
   keyboard users can still see where the caret is. */
.input-container:focus-visible {
  outline: 2px solid #fff;
}

/* A contenteditable element has no native placeholder; render the value of
   its `placeholder` attribute while the element has no content. */
.input-container:empty::before {
  content: attr(placeholder);
  color: #9e9e9e;
  pointer-events: none;
}

/* One highlighted token (used both in the input and in the token list). */
.highlight {
  border-radius: 4px;
  padding: 2px;
  margin: 1px;
}

/* A token with its id stacked underneath. */
.token-container {
  display: inline-block;
  text-align: center;
  margin: 5px;
  position: relative;
}

.token-id {
  font-size: 10px;
  color: #ccc;
  margin-top: 5px;
}

/* Orange frame holding the fixed <s> marker and the editable region. */
.input-section {
  display: flex;
  align-items: center;
  border: 1px solid #ccc;
  border-radius: 5px;
  background-color: orange;
}

.tokens-display {
  display: flex;
  flex-wrap: wrap;
  justify-content: center;
  margin-top: 20px;
  width: 60vw;
  font-size: 12px;
}

.dropdown-container {
  margin-top: 5px;
}

.dropdown {
  padding: 5px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  margin: 10px;
  cursor: pointer;
}

.input-box {
  padding: 5px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  margin: 10px;
}

.error-message {
  color: red;
  margin-top: 20px;
}

.examples-container {
  margin-top: 16px;
  display: flex;
  flex-wrap: wrap;
  justify-content: center;
  gap: 8px;
}

/* Clickable example prompt. `font-family: inherit` keeps the page font even
   if the element is a form control such as <button>. */
.example {
  font-family: inherit;
  font-size: 0.9em;
  flex: 1 1 calc(33.333% - 20px);
  margin: 4px;
  padding: 8px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  cursor: pointer;
  text-align: center;
  box-sizing: border-box;
}

.example:hover {
  background-color: #2e2e2e;
}

footer {
  position: absolute;
  bottom: 10px;
  width: 100%;
  text-align: center;
  padding: 3px 0px;
  background-color: transparent;
  color: #9e9e9e;
  font-size: 12px;
  border-top: none;
}

footer a {
  color: #b04100;
  text-decoration: none;
}

footer a:hover {
  text-decoration: underline;
}
133+
</style>
134+
</head>
135+
<body>
136+
<h1 style="color: orange">Tokenize!</h1>
<div class="dropdown-container">
  <select class="dropdown" id="tokenizerDropdown" aria-label="Tokenizer version">
    <option value="mistral-v7">Mistral V7</option>
    <option value="mistral-v3">Mistral V3</option>
    <option value="mistral-v1">Mistral V1</option>
  </select>
</div>
<section class="input-section">
  <div class="token-container">
    <span class="highlight">&lt;s&gt;</span>
  </div>
  <!-- Must stay free of whitespace children so it counts as empty until the user types. -->
  <div class="input-container" id="highlightInput" contenteditable="true" placeholder="Type something..." role="textbox" aria-multiline="true" aria-label="Text to tokenize"></div>
</section>
<div class="tokens-display" id="tokensDisplay"></div>
<!-- role="alert" so tokenizer loading errors are announced when the script fills this in. -->
<div class="error-message" id="errorMessage" role="alert"></div>
<!--
  Example prompts. Real buttons so they are reachable and activatable from the keyboard.
  NOTE(review): the one-shot V7 and V1 strings were swapped relative to the
  "Simple Chat" rows (spaced template = V1, unspaced = V3/V7); aligned here.
  Confirm against the published Mistral chat templates.
-->
<div class="examples-container">
  <button class="example" data-example="[INST] user message[/INST] assistant message</s>" data-tokenizer="mistral-v7" type="button">One Shot with Mistral V7</button>
  <button class="example" data-example="[INST] user message[/INST] assistant message</s>" data-tokenizer="mistral-v3" type="button">One Shot with Mistral V3</button>
  <button class="example" data-example=" [INST] user message [/INST] assistant message</s>" data-tokenizer="mistral-v1" type="button">One Shot with Mistral V1</button>
  <button class="example" data-example=" [INST] user message [/INST] assistant message</s> [INST] new user message [/INST] new assistant message</s>" data-tokenizer="mistral-v1" type="button">Simple Chat with Mistral V1</button>
  <button class="example" data-example="[INST] user message[/INST] assistant message</s>[INST] new user message[/INST] new assistant message</s>" data-tokenizer="mistral-v3" type="button">Simple Chat with Mistral V3</button>
  <button class="example" data-example="[INST] user message[/INST] assistant message</s>[INST] new user message[/INST] new assistant message</s>" data-tokenizer="mistral-v7" type="button">Simple Chat with Mistral V7</button>
  <button class="example" data-example="[SYSTEM_PROMPT] system prompt[/SYSTEM_PROMPT][INST] user message[/INST] assistant message</s>[INST] new user message[/INST] new assistant message</s>" data-tokenizer="mistral-v7" type="button">System Prompt with Mistral V7</button>
</div>
<footer>
  <!-- TODO: the href below is still a placeholder URL; point it at the real repository. -->
  <p>This is a work in progress. Contributions are welcome in the <a href="https://github.com/your-github-url">GitHub repository</a>.</p>
</footer>
164+
<script type="module">
165+
// NOTE(review): the package segment of this URL had been mangled into
// "[email protected]" (an e-mail obfuscation artifact), which is not a resolvable
// specifier. The exact version originally pinned is unknown; "@3" is an
// assumption. Pin the precise version you tested with.
import { AutoTokenizer, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3';

// Tokenizer files are loaded from this site rather than a remote hub.
env.allowLocalModels = true;
env.localModelPath = "/tokenizers/";

// DOM handles used throughout the script.
const input = document.getElementById('highlightInput');
const tokensDisplay = document.getElementById('tokensDisplay');
const tokenizerDropdown = document.getElementById('tokenizerDropdown');
const errorMessage = document.getElementById('errorMessage');

// Palette cycled through to color consecutive tokens.
const colors = ['#ff9b00', '#001bff', '#ff0000', '#9b00ff', '#00ff17', '#00e4ff', '#e4ff00', '#ff00d1'];
176+
177+
/**
 * Convert a hex color string into a CSS `rgba(...)` string.
 *
 * Accepts 6-digit (`#rrggbb`) and 3-digit shorthand (`#rgb`) forms, with or
 * without the leading `#`.
 *
 * @param {string} hex - Hex color, e.g. `#ff9b00`, `ff9b00` or `#fa0`.
 * @param {number|string} alpha - Alpha value, inserted into the result as-is.
 * @returns {string} A string of the form `rgba(r, g, b, alpha)`.
 */
function hexToRgba(hex, alpha) {
  let digits = hex.replace(/^#/, '');
  // Expand shorthand: "fa0" -> "ffaa00". Without this the three digits would
  // be read as the low 12 bits of a 24-bit value and give the wrong color.
  if (digits.length === 3) {
    digits = digits.replace(/./g, '$&$&');
  }
  const value = parseInt(digits, 16);
  const r = (value >> 16) & 255;
  const g = (value >> 8) & 255;
  const b = value & 255;
  return `rgba(${r}, ${g}, ${b}, ${alpha})`;
}
185+
186+
// Currently active tokenizer, or null while none could be loaded.
// A failed initial load must not abort the module: an uncaught rejection from
// a top-level await would stop the script before any event listener below is
// registered, leaving the page unusable. loadTokenizer has already shown the
// error to the user; the dropdown can still be used to load another tokenizer.
let tokenizer = null;
try {
  tokenizer = await loadTokenizer('./mistral-v7', false);
} catch (error) {
  console.error('Initial tokenizer load failed:', error);
}
187+
188+
/**
 * Load a tokenizer via AutoTokenizer.from_pretrained.
 *
 * On failure the error text is written to the page's error element and the
 * error is re-thrown so the caller can react as well.
 *
 * @param {string} tokenizerPath - Path handed to `from_pretrained`.
 * @param {boolean} useAuthToken - Forwarded as the `useAuthToken` option.
 * @returns {Promise<object>} Resolves with the loaded tokenizer.
 */
async function loadTokenizer(tokenizerPath, useAuthToken) {
  const options = { useAuthToken };
  try {
    const loaded = await AutoTokenizer.from_pretrained(tokenizerPath, options);
    return loaded;
  } catch (error) {
    errorMessage.textContent = `Error loading tokenizer: ${error.message}`;
    throw error;
  }
}
196+
197+
// When another tokenizer is picked, load it, make it the active one and
// re-tokenize whatever is currently in the input.
tokenizerDropdown.addEventListener('change', () => {
  const selected = tokenizerDropdown.value;
  return loadTokenizer(`./${selected}`, false)
    .then((loaded) => {
      tokenizer = loaded;
      triggerInputEvent();
    })
    .catch((error) => {
      errorMessage.textContent = `Error loading tokenizer: ${error.message}`;
    });
});
206+
207+
/**
 * Fire a synthetic, bubbling `input` event on the editable region so the
 * `input` listener re-runs as if the user had typed.
 */
function triggerInputEvent() {
  input.dispatchEvent(new Event('input', { bubbles: true }));
}
211+
212+
/**
 * Escape the characters that are significant in HTML so arbitrary token text
 * can be interpolated into markup safely.
 *
 * @param {string} value - Raw text.
 * @returns {string} Text safe to place inside element content or attributes.
 */
function escapeHtml(value) {
  return value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}

// Re-tokenize on every edit: rebuild the token/id list below the input and
// rewrite the input itself as one colored span per token.
input.addEventListener('input', () => {
  errorMessage.textContent = '';
  if (input.textContent === '') {
    tokensDisplay.innerHTML = '';
    return;
  }

  // Drop the zero-width spaces inserted as caret anchors and turn a literal
  // "<0x0A>" back into a real newline before tokenizing.
  const text = input.textContent.replace(/\u200B/g, '').replace(/<0x0A>/g, '\n');

  if (!tokenizer) return;

  // "<s>" is prepended for tokenization and its token sliced off again; the
  // first id returned by encode() is dropped the same way so that tokens[i]
  // and tokenIds[i] stay aligned.
  const tokens = tokenizer.tokenize('<s>' + text).slice(1);
  if (tokens[tokens.length - 1] === '<0x0A>') {
    tokens.pop();
  }
  const tokenIds = tokenizer.encode(text).slice(1);

  // The list always starts with the fixed <s> marker, shown with id 1.
  let tokenListHTML = `
    <div class="token-container">
      <span class="highlight" style="color: orange;">&lt;s&gt;</span>
      <div class="token-id">1</div>
    </div>
  `;
  let highlightedText = '';

  tokens.forEach((token, index) => {
    if (token === '') return;

    const color = colors[(index + 1) % colors.length];

    // Text written back into the editable region: newline tokens become a real
    // newline plus a zero-width caret anchor, and the word-boundary marker
    // becomes a space.
    // NOTE(review): the regex body here had been lost (the line read
    // `.replace(//g, ' ')`, which does not parse). U+2581 is assumed to be the
    // missing character — confirm against the tokenizer's vocabulary.
    const inputText = token.replace(/<0x0A>/g, '\n\u200B').replace(/\u2581/g, ' ');

    // Text shown in the token list: newline tokens are displayed as "\n".
    // NOTE(review): the second replace is reproduced exactly as found and is a
    // space-to-space no-op as written; one side may originally have been a
    // different character — verify.
    const listText = token.replace(/<0x0A>/g, '\\n').replace(/ /g, ' ');

    // All token text is escaped before it reaches innerHTML, so tokens such as
    // "<s>", "</s>", "</" or any "<"/"&" the user typed render as text instead
    // of being parsed as markup.
    tokenListHTML += `
      <div class="token-container">
        <span class="highlight" style="color: ${color};">${escapeHtml(listText)}</span>
        <div class="token-id">${tokenIds[index]}</div>
      </div>
    `;

    highlightedText += `<span class="highlight" style="background-color: ${hexToRgba(color, 0.3)};">${escapeHtml(inputText)}</span>`;
  });

  tokensDisplay.innerHTML = tokenListHTML;
  input.innerHTML = highlightedText;

  // Rewriting innerHTML discards the selection; put the caret at the very end.
  const range = document.createRange();
  const selection = window.getSelection();
  range.selectNodeContents(input);
  range.collapse(false);
  selection.removeAllRanges();
  selection.addRange(range);
});
275+
276+
// Enter: suppress the browser's default line-break handling, append a newline
// to the editable region ourselves, and park the caret after a zero-width
// space placed behind it.
input.addEventListener('keydown', (event) => {
  if (event.key !== 'Enter') return;

  input.innerHTML += '\n';
  event.preventDefault();

  const caretAnchor = input.appendChild(document.createTextNode('\u200b'));

  const caret = document.createRange();
  caret.setStart(caretAnchor, 1);
  caret.collapse(true);

  const currentSelection = window.getSelection();
  currentSelection.removeAllRanges();
  currentSelection.addRange(caret);
});
291+
292+
// Paste: block the default paste and insert the clipboard's 'text' data
// through the insertText command instead.
input.addEventListener('paste', (event) => {
  event.preventDefault();
  const clipboard = event.clipboardData || window.clipboardData;
  document.execCommand('insertText', false, clipboard.getData('text'));
});
297+
298+
// Clicking an example selects its tokenizer in the dropdown, copies its text
// into the input, and re-tokenizes.
document.querySelectorAll('.example').forEach((example) => {
  example.addEventListener('click', () => {
    tokenizerDropdown.value = example.getAttribute('data-tokenizer');
    input.textContent = example.getAttribute('data-example');

    // Assigning `.value` from script does not fire `change`, so the newly
    // selected tokenizer would never be loaded and the example would be
    // tokenized with whichever tokenizer was active before. Dispatch `change`
    // explicitly: its listener loads the selected tokenizer and then triggers
    // the `input` event that re-tokenizes the text.
    tokenizerDropdown.dispatchEvent(new Event('change', { bubbles: true }));
  });
});
307+
</script>
308+
</body>
309+
</html>

0 commit comments

Comments
 (0)