<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<!-- Single-page tokenizer playground; styles and script are inlined below. -->
<title>Mistral Transformers Tokenizers</title>
<style>
/* Page shell: dark theme, children stacked in a centred column. */
body {
  font-family: Arial, sans-serif;
  display: flex;
  flex-direction: column;
  justify-content: center;
  align-items: center;
  /* min-height rather than height: with a fixed 100vh box, centred content
     taller than the viewport overflows above the top edge and cannot be
     scrolled to. Letting the body grow keeps the whole page reachable. */
  min-height: 100vh;
  width: 100%;
  margin: 0;
  background-color: #000;
  color: #fff;
}

/* The contenteditable region the user types into. */
.input-container {
  padding: 10px;
  font-size: 14px;
  min-width: 30vw;
  max-width: 60vw;
  box-sizing: border-box;
  min-height: 40px;
  outline: none;
  white-space: pre-wrap;
  overflow-wrap: break-word;
  background-color: #1e1e1e;
  color: #fff;
  border-radius: 5px;
  line-height: 1.5;
}

/* Replacement focus indicator for the outline removed above. */
.input-container:focus-visible {
  outline: 2px solid #fff;
  outline-offset: -2px;
}

/* A <div> has no native placeholder; render the attribute while it is empty. */
.input-container:empty::before {
  content: attr(placeholder);
  color: #9e9e9e;
  pointer-events: none;
}

/* Coloured chip wrapped around a single token. */
.highlight {
  border-radius: 4px;
  padding: 2px;
  margin: 1px;
}

/* One token with its id underneath. */
.token-container {
  display: inline-block;
  text-align: center;
  margin: 5px;
  position: relative;
}

.token-id {
  font-size: 10px;
  color: #ccc;
  margin-top: 5px;
}

/* Orange bar holding the fixed start token and the editor. */
.input-section {
  display: flex;
  align-items: center;
  border: 1px solid #ccc;
  border-radius: 5px;
  background-color: orange;
}

/* Wrapping row of token/id pairs below the editor. */
.tokens-display {
  display: flex;
  flex-wrap: wrap;
  justify-content: center;
  margin-top: 20px;
  width: 60vw;
  font-size: 12px;
}

.dropdown-container {
  margin-top: 5px;
}

.dropdown {
  padding: 5px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  margin: 10px;
  cursor: pointer;
}

.input-box {
  padding: 5px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  margin: 10px;
}

.error-message {
  color: red;
  margin-top: 20px;
}

/* Grid of clickable example prompts. */
.examples-container {
  margin-top: 16px;
  display: flex;
  flex-wrap: wrap;
  justify-content: center;
  gap: 8px;
}

.example {
  /* Inherit the page font so the rule also suits form controls. */
  font-family: inherit;
  font-size: 0.9em;
  flex: 1 1 calc(33.333% - 20px);
  margin: 4px;
  padding: 8px;
  background-color: #1e1e1e;
  color: #fff;
  border: 1px solid #ccc;
  border-radius: 5px;
  cursor: pointer;
  text-align: center;
  box-sizing: border-box;
}

/* Same feedback for pointer hover and keyboard focus. */
.example:hover,
.example:focus-visible {
  background-color: #2e2e2e;
}

footer {
  position: absolute;
  bottom: 10px;
  width: 100%;
  text-align: center;
  padding: 3px 0px;
  background-color: transparent;
  color: #9e9e9e;
  font-size: 12px;
  border-top: none;
}

footer a {
  color: #b04100;
  text-decoration: none;
}

footer a:hover {
  text-decoration: underline;
}
</style>
</head>
<body>
<h1 style="color: orange">Tokenize!</h1>

<!-- Tokenizer picker; the script reloads the tokenizer on "change". -->
<div class="dropdown-container">
  <select class="dropdown" id="tokenizerDropdown" aria-label="Tokenizer version">
    <option value="mistral-v7">Mistral V7</option>
    <option value="mistral-v3">Mistral V3</option>
    <option value="mistral-v1">Mistral V1</option>
  </select>
</div>

<!-- Editor row: a fixed start-of-sequence chip followed by the editable text.
     The chip text is entity-encoded because a literal "<s>" would be parsed
     as a strikethrough element. -->
<div class="input-section">
  <div class="token-container">
    <span class="highlight">&lt;s&gt;</span>
  </div>
  <div class="input-container" id="highlightInput" contenteditable="true" placeholder="Type something..." role="textbox" aria-multiline="true" aria-label="Text to tokenize"></div>
</div>

<!-- Filled by the script with one token/id pair per token. -->
<div class="tokens-display" id="tokensDisplay"></div>

<!-- Error text written by the script; role="alert" announces it when it changes. -->
<div class="error-message" id="errorMessage" role="alert"></div>

<!-- Example prompts. data-example is the text to insert and data-tokenizer is
     the dropdown value to select. The V1 strings carry spaces around the
     [INST] markers, while the V3/V7 strings do not; the one-shot examples now
     follow the same convention as the multi-turn examples for each tokenizer. -->
<div class="examples-container">
  <button type="button" class="example" data-example="[INST] user message[/INST] assistant message&lt;/s&gt;" data-tokenizer="mistral-v7">One Shot with Mistral V7</button>
  <button type="button" class="example" data-example="[INST] user message[/INST] assistant message&lt;/s&gt;" data-tokenizer="mistral-v3">One Shot with Mistral V3</button>
  <button type="button" class="example" data-example=" [INST] user message [/INST] assistant message&lt;/s&gt;" data-tokenizer="mistral-v1">One Shot with Mistral V1</button>
  <button type="button" class="example" data-example=" [INST] user message [/INST] assistant message&lt;/s&gt; [INST] new user message [/INST] new assistant message&lt;/s&gt;" data-tokenizer="mistral-v1">Simple Chat with Mistral V1</button>
  <button type="button" class="example" data-example="[INST] user message[/INST] assistant message&lt;/s&gt;[INST] new user message[/INST] new assistant message&lt;/s&gt;" data-tokenizer="mistral-v3">Simple Chat with Mistral V3</button>
  <button type="button" class="example" data-example="[INST] user message[/INST] assistant message&lt;/s&gt;[INST] new user message[/INST] new assistant message&lt;/s&gt;" data-tokenizer="mistral-v7">Simple Chat with Mistral V7</button>
  <button type="button" class="example" data-example="[SYSTEM_PROMPT] system prompt[/SYSTEM_PROMPT][INST] user message[/INST] assistant message&lt;/s&gt;[INST] new user message[/INST] new assistant message&lt;/s&gt;" data-tokenizer="mistral-v7">System Prompt with Mistral V7</button>
</div>

<footer>
  <!-- TODO: the href below is still a placeholder URL. -->
  <p>This is a work in progress. Contributions are welcome: <a href="https://github.com/your-github-url">view the repository on GitHub</a>.</p>
</footer>
<script type="module">
// Tokenizer playground: tokenizes the editor text in the browser and renders
// the tokens together with their ids.
//
// NOTE(review): the version pin in this URL was lost when the page was
// extracted (it read "@huggingface/[email protected]"). "@3" is an assumption;
// restore the exact version that was originally pinned.
import { AutoTokenizer, env } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3';

// Allow tokenizer files to be resolved under /tokenizers/.
env.allowLocalModels = true;
env.localModelPath = "/tokenizers/";

// DOM handles shared by the handlers below.
const input = document.getElementById('highlightInput');
const tokensDisplay = document.getElementById('tokensDisplay');
const tokenizerDropdown = document.getElementById('tokenizerDropdown');
const errorMessage = document.getElementById('errorMessage');

// Highlight palette, cycled by token position.
const colors = ['#ff9b00', '#001bff', '#ff0000', '#9b00ff', '#00ff17', '#00e4ff', '#e4ff00', '#ff00d1'];
/**
 * Convert a hex colour into a CSS rgba() string.
 *
 * Accepts "#rrggbb" / "rrggbb" and the 3-digit shorthand "#rgb" / "rgb".
 * Input that does not parse as hexadecimal yields black, because NaN is
 * coerced to 0 by the bitwise operators.
 *
 * @param {string} hex - Hex colour, with or without the leading "#".
 * @param {number} alpha - Opacity placed in the fourth rgba() slot.
 * @returns {string} For example "rgba(255, 155, 0, 0.3)".
 */
function hexToRgba(hex, alpha) {
  let digits = hex.replace(/^#/, '');
  if (digits.length === 3) {
    // Expand shorthand by doubling each digit: "f90" -> "ff9900".
    digits = digits.replace(/./g, '$&$&');
  }
  const value = parseInt(digits, 16);
  const r = (value >> 16) & 255;
  const g = (value >> 8) & 255;
  const b = value & 255;
  return `rgba(${r}, ${g}, ${b}, ${alpha})`;
}
// Active tokenizer, or null while none is loaded.
// The initial load is wrapped in try/catch: a rejected top-level await would
// abort the module, so none of the listeners below would be registered and the
// user could not even switch to another tokenizer. loadTokenizer has already
// written the failure to #errorMessage before rethrowing.
let tokenizer = null;
try {
  tokenizer = await loadTokenizer('./mistral-v7', false);
} catch {
  // Keep going with tokenizer === null; the input handler checks for it.
}
/**
 * Load a tokenizer through AutoTokenizer.from_pretrained.
 *
 * On failure the message is shown in #errorMessage and the error is rethrown
 * so the caller can react as well.
 *
 * @param {string} tokenizerPath - Path or id passed to from_pretrained.
 * @param {boolean} useAuthToken - Forwarded in the options object.
 *   NOTE(review): confirm the library actually honours this option.
 * @returns {Promise<object>} The tokenizer returned by from_pretrained.
 */
async function loadTokenizer(tokenizerPath, useAuthToken) {
  const options = { useAuthToken };
  try {
    const loaded = await AutoTokenizer.from_pretrained(tokenizerPath, options);
    return loaded;
  } catch (err) {
    errorMessage.textContent = `Error loading tokenizer: ${err.message}`;
    throw err;
  }
}
// When another tokenizer is picked: load it, make it the active one, then
// re-run the input handler so the current text is re-tokenized. A failed load
// leaves the previous tokenizer in place and shows the error.
tokenizerDropdown.addEventListener('change', () => {
  const selected = tokenizerDropdown.value;
  loadTokenizer(`./${selected}`, false)
    .then((loaded) => {
      tokenizer = loaded;
      triggerInputEvent();
    })
    .catch((err) => {
      errorMessage.textContent = `Error loading tokenizer: ${err.message}`;
    });
});
/**
 * Fire a bubbling synthetic "input" event on the editor so the input listener
 * re-tokenizes whatever text it currently holds.
 */
function triggerInputEvent() {
  input.dispatchEvent(new Event('input', { bubbles: true }));
}
// Characters that must be entity-encoded before text is placed in innerHTML.
const HTML_ESCAPES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };

/**
 * Escape a string so it can be interpolated into an innerHTML template.
 * @param {string} text - Raw text.
 * @returns {string} Text with & < > " ' replaced by entities.
 */
function escapeHtml(text) {
  return text.replace(/[&<>"']/g, (ch) => HTML_ESCAPES[ch]);
}

// Re-tokenize on every edit: rebuild the token/id list and re-render the
// editor content as one highlighted span per token.
input.addEventListener('input', () => {
  errorMessage.textContent = '';
  if (input.textContent === "") {
    tokensDisplay.innerHTML = "";
    return;
  }

  // Drop the zero-width spaces used as caret anchors, and turn a literal
  // "<0x0A>" in the text into a real newline.
  const text = input.textContent.replace(/\u200B/g, '').replace(/<0x0A>/g, '\n');

  if (!tokenizer) return;

  let tokens;
  let tokenIds;
  try {
    // "<s>" is prepended for tokenize() and the first token dropped again;
    // encode() output likewise has its first id dropped.
    tokens = tokenizer.tokenize("<s>" + text).slice(1);
    if (tokens[tokens.length - 1] === "<0x0A>") {
      tokens.pop();
    }
    tokenIds = tokenizer.encode(text).slice(1);
  } catch (error) {
    // Show tokenizer failures instead of losing them.
    errorMessage.textContent = `Error tokenizing input: ${error.message}`;
    return;
  }

  // The list always starts with the fixed start-of-sequence entry.
  let tokenListHTML = `
    <div class="token-container">
      <span class="highlight" style="color: orange;">&lt;s&gt;</span>
      <div class="token-id">1</div>
    </div>
  `;
  let highlightedText = '';

  tokens.forEach((token, index) => {
    if (token === '') return;

    const color = colors[(index + 1) % colors.length];

    // Every token is escaped after the display substitutions, so tokens such
    // as "<s>", "</s>", "</" or anything containing < > & render as text
    // instead of being parsed as markup.
    // Editor view: newline token -> real newline plus a zero-width caret
    // anchor; the "▁" marker -> a plain space.
    const editorText = escapeHtml(token.replace(/<0x0A>/g, '\n\u200B').replace(/▁/g, ' '));
    // List view: newline token shown as the two characters "\n".
    // NOTE(review): the second replacement was garbled in extraction (it reads
    // as space -> space); a non-breaking space is assumed so that
    // whitespace-only tokens stay visible. Confirm.
    const listText = escapeHtml(token.replace(/<0x0A>/g, '\\n')).replace(/ /g, '&nbsp;');

    tokenListHTML += `
      <div class="token-container">
        <span class="highlight" style="color: ${color};">${listText}</span>
        <div class="token-id">${tokenIds[index]}</div>
      </div>
    `;

    highlightedText += `<span class="highlight" style="background-color: ${hexToRgba(color, 0.3)};">${editorText}</span>`;
  });

  tokensDisplay.innerHTML = tokenListHTML;
  input.innerHTML = highlightedText;

  // Replacing innerHTML discards the selection; put the caret at the end.
  const range = document.createRange();
  const selection = window.getSelection();
  range.selectNodeContents(input);
  range.collapse(false);
  selection.removeAllRanges();
  selection.addRange(range);
});
// Enter appends a newline at the end of the editor instead of letting the
// browser insert its own line-break markup.
input.addEventListener('keydown', (e) => {
  // Leave Enter alone while an IME composition is in progress: there it
  // confirms the composition rather than requesting a new line.
  if (e.key !== 'Enter' || e.isComposing) return;
  e.preventDefault();

  // Append text nodes directly; `innerHTML +=` would serialize and re-parse
  // the entire editor content just to add one character.
  input.appendChild(document.createTextNode('\n'));

  // A zero-width space gives the caret a node to sit in on the new line.
  const emptyTextNode = document.createTextNode('\u200b');
  input.appendChild(emptyTextNode);

  const range = document.createRange();
  range.setStart(emptyTextNode, 1);
  range.collapse(true);

  const selection = window.getSelection();
  selection.removeAllRanges();
  selection.addRange(range);
});
// Paste as plain text: cancel the default paste and insert only the clipboard's
// text through the insertText command.
input.addEventListener('paste', (event) => {
  event.preventDefault();
  const clipboard = event.clipboardData || window.clipboardData;
  const plainText = clipboard.getData('text');
  document.execCommand('insertText', false, plainText);
});
// Clicking an example puts its text in the editor and switches to the
// tokenizer named in its data-tokenizer attribute.
for (const example of document.querySelectorAll('.example')) {
  example.addEventListener('click', () => {
    input.textContent = example.getAttribute('data-example');
    tokenizerDropdown.value = example.getAttribute('data-tokenizer');

    // Assigning .value does not fire "change", so the selected tokenizer was
    // never loaded and the example was tokenized with the previous one.
    // Dispatch the event explicitly: the dropdown's change listener loads the
    // tokenizer and then re-runs the input handler on the new text.
    tokenizerDropdown.dispatchEvent(new Event('change', { bubbles: true }));
  });
}
</script>
</body>
</html>