|
23 | 23 | "| Encoding name | OpenAI models |\n",
|
24 | 24 | "|-------------------------|-----------------------------------------------------|\n",
|
25 | 25 | "| `o200k_base` | `gpt-4o`, `gpt-4o-mini` |\n",
|
26 | | - "| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
| 26 | + "| `cl100k_base` | `gpt-4-turbo`, `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
27 | 27 | "| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003`|\n",
|
28 | 28 | "| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |\n",
|
29 | 29 | "\n",
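The table above is a static mapping; tiktoken can also look up the encoding for a model name at runtime. A minimal sketch (which model names are recognized depends on the installed tiktoken version):

```python
import tiktoken

# Resolve the encoding from a model name rather than hard-coding it.
# Raises a KeyError if the installed tiktoken doesn't know the model.
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding.name)  # "o200k_base" on a recent tiktoken release
```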
|
|
36 | 36 | "\n",
|
37 | 37 | "## Tokenizer libraries by language\n",
|
38 | 38 | "\n",
|
39 | | - "For `cl100k_base` and `p50k_base` encodings:\n",
| 39 | + "For `o200k_base`, `cl100k_base` and `p50k_base` encodings:\n",
40 | 40 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md)\n",
|
41 | 41 | "- .NET / C#: [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), [TiktokenSharp](https://github.com/aiqinxuancai/TiktokenSharp)\n",
|
42 | 42 | "- Java: [jtokkit](https://github.com/knuddelsgmbh/jtokkit)\n",
|
|
81 | 81 | "text": [
|
82 | 82 | "\n",
|
83 | 83 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
|
84 | | - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
| 84 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
85 | 85 | "Note: you may need to restart the kernel to use updated packages.\n",
|
86 | 86 | "\n",
|
87 | 87 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
|
88 | | - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
| 88 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
89 | 89 | "Note: you may need to restart the kernel to use updated packages.\n"
|
90 | 90 | ]
|
91 | 91 | }
|
|
184 | 184 | }
|
185 | 185 | ],
|
186 | 186 | "source": [
|
187 | | - "encoding.encode(\"tiktoken is great!\")\n"
| 187 | + "encoding.encode(\"tiktoken is great!\")"
188 | 188 | ]
|
189 | 189 | },
|
190 | 190 | {
|
|
205 | 205 | "    \"\"\"Returns the number of tokens in a text string.\"\"\"\n",

206 | 206 | "    encoding = tiktoken.get_encoding(encoding_name)\n",

207 | 207 | "    num_tokens = len(encoding.encode(string))\n",
|
208 | | - "    return num_tokens\n"
| 208 | + "    return num_tokens"
209 | 209 | ]
|
210 | 210 | },
|
211 | 211 | {
|
|
225 | 225 | }
|
226 | 226 | ],
|
227 | 227 | "source": [
|
228 | | - "num_tokens_from_string(\"tiktoken is great!\", \"cl100k_base\")\n"
| 228 | + "num_tokens_from_string(\"tiktoken is great!\", \"o200k_base\")"
229 | 229 | ]
|
230 | 230 | },
|
231 | 231 | {
|
|
252 | 252 | {
|
253 | 253 | "data": {
|
254 | 254 | "text/plain": [
|
255 | | - "'turesറلás!'"
| 255 | + "'tiktoken is great!'"
256 | 256 | ]
|
257 | 257 | },
|
258 | 258 | "execution_count": 8,
|
|
261 | 261 | }
|
262 | 262 | ],
|
263 | 263 | "source": [
|
264 | | - "encoding.decode([83, 1609, 5963, 374, 2294, 0])\n"
| 264 | + "encoding.decode([83, 8251, 2488, 382, 2212, 0])"
265 | 265 | ]
|
266 | 266 | },
|
267 | 267 | {
|
|
288 | 288 | {
|
289 | 289 | "data": {
|
290 | 290 | "text/plain": [
|
291 | | - "[b't', b'ures', b'\\xe0\\xb4\\xb1', b'\\xd9\\x84', b'\\xc3\\xa1s', b'!']"
| 291 | + "[b't', b'ikt', b'oken', b' is', b' great', b'!']"
292 | 292 | ]
|
293 | 293 | },
|
294 | 294 | "execution_count": 9,
|
|
297 | 297 | }
|
298 | 298 | ],
|
299 | 299 | "source": [
|
300 | | - "[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]\n"
| 300 | + "[encoding.decode_single_token_bytes(token) for token in [83, 8251, 2488, 382, 2212, 0]]\n"
301 | 301 | ]
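The changed expected outputs above reflect that `o200k_base` assigns different token ids than `cl100k_base` to the same text. Whatever the encoding, encode and decode are lossless inverses, and the per-token byte chunks concatenate back to the original UTF-8 bytes; a small sketch, assuming a tiktoken version that ships `o200k_base`:

```python
import tiktoken

encoding = tiktoken.get_encoding("o200k_base")
text = "tiktoken is great!"

# Round trip: decode(encode(s)) recovers the original string.
tokens = encoding.encode(text)
assert encoding.decode(tokens) == text

# Single-token byte chunks join back into the string's UTF-8 bytes,
# even when an individual token is not valid UTF-8 on its own.
chunks = [encoding.decode_single_token_bytes(t) for t in tokens]
assert b"".join(chunks) == text.encode("utf-8")
```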
|
302 | 302 | },
|
303 | 303 | {
|
|
337 | 337 | "        print()\n",

338 | 338 | "        print(f\"{encoding_name}: {num_tokens} tokens\")\n",

339 | 339 | "        print(f\"token integers: {token_integers}\")\n",
|
340 | | - "        print(f\"token bytes: {token_bytes}\")\n",
341 | | - "    "
| 340 | + "        print(f\"token bytes: {token_bytes}\")"
342 | 341 | ]
|
343 | 342 | },
|
344 | 343 | {
|
|
372 | 371 | }
|
373 | 372 | ],
|
374 | 373 | "source": [
|
375 | | - "compare_encodings(\"antidisestablishmentarianism\")\n"
| 374 | + "compare_encodings(\"antidisestablishmentarianism\")"
376 | 375 | ]
|
377 | 376 | },
|
378 | 377 | {
|
|
406 | 405 | }
|
407 | 406 | ],
|
408 | 407 | "source": [
|
409 | | - "compare_encodings(\"2 + 2 = 4\")\n"
| 408 | + "compare_encodings(\"2 + 2 = 4\")"
410 | 409 | ]
|
411 | 410 | },
|
412 | 411 | {
|
|
440 | 439 | }
|
441 | 440 | ],
|
442 | 441 | "source": [
|
443 | | - "compare_encodings(\"お誕生日おめでとう\")\n"
| 442 | + "compare_encodings(\"お誕生日おめでとう\")"
444 | 443 | ]
|
445 | 444 | },
|
446 | 445 | {
|
|
450 | 449 | "source": [
|
451 | 450 | "## 6. Counting tokens for chat completions API calls\n",
|
452 | 451 | "\n",
|
453 | | - "ChatGPT models like `gpt-3.5-turbo` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
| 452 | + "ChatGPT models like `gpt-4o-mini` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
454 | 453 | "\n",
|
455 | | - "Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo` or `gpt-4`.\n",
| 454 | + "Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo`, `gpt-4`, `gpt-4o` and `gpt-4o-mini`.\n",
456 | 455 | "\n",
|
457 | 456 | "Note that the exact way that tokens are counted from messages may change from model to model. Consider the counts from the function below an estimate, not a timeless guarantee.\n",
|
458 | 457 | "\n",
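For context, the helper being patched below is called on a list of chat messages. A hypothetical usage sketch (the example messages are illustrative; `num_tokens_from_messages` is the function defined in the cells that follow):

```python
# Assumes num_tokens_from_messages has been defined as in the notebook below.
example_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many tokens will this conversation use?"},
]
print(num_tokens_from_messages(example_messages, model="gpt-4o-mini"))
```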
|
|
470 | 469 | "    try:\n",

471 | 470 | "        encoding = tiktoken.encoding_for_model(model)\n",

472 | 471 | "    except KeyError:\n",
|
473 | | - "        print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
474 | | - "        encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
| 472 | + "        print(\"Warning: model not found. Using o200k_base encoding.\")\n",
| 473 | + "        encoding = tiktoken.get_encoding(\"o200k_base\")\n",
475 | 474 | "    if model in {\n",

476 | 475 | "        \"gpt-3.5-turbo-0125\",\n",

477 | 476 | "        \"gpt-4-0314\",\n",
|
|
486 | 485 | "    elif \"gpt-3.5-turbo\" in model:\n",

487 | 486 | "        print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.\")\n",

488 | 487 | "        return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0125\")\n",
|
489 | | - "    elif \"gpt-4\" in model:\n",
490 | | - "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
491 | | - "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
492 | | - "    elif \"gpt-4o\" in model:\n",
493 | | - "        print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
494 | | - "        return num_tokens_from_messages(messages, model=\"ggpt-4o-2024-08-06\")\n",
495 | 488 | "    elif \"gpt-4o-mini\" in model:\n",

496 | 489 | "        print(\"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\")\n",

497 | 490 | "        return num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\")\n",
|
| 491 | + "    elif \"gpt-4o\" in model:\n",
| 492 | + "        print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
| 493 | + "        return num_tokens_from_messages(messages, model=\"gpt-4o-2024-08-06\")\n",
| 494 | + "    elif \"gpt-4\" in model:\n",
| 495 | + "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
| 496 | + "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
498 | 497 | "    else:\n",

499 | 498 | "        raise NotImplementedError(\n",

500 | 499 | "            f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
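The reordering above is the substantive fix: Python's `in` operator does substring matching, so under the old branch order every `gpt-4o` and `gpt-4o-mini` request hit the `"gpt-4" in model` branch first (visible in the old outputs below, which warn about gpt-4 and return 129 tokens instead of 124). Testing the most specific name first routes each model correctly; the change also removes the old `"ggpt-4o-2024-08-06"` typo. A quick illustration:

```python
model = "gpt-4o-mini"
# Substring checks are not exact matches: all three are True.
print("gpt-4" in model)        # True
print("gpt-4o" in model)       # True
print("gpt-4o-mini" in model)  # True
# Hence the elif chain must test "gpt-4o-mini" before "gpt-4o" before "gpt-4".
```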
|
|
534 | 533 | "129 prompt tokens counted by the OpenAI API.\n",
|
535 | 534 | "\n",
|
536 | 535 | "gpt-4o\n",
|
537 | | - "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
538 | | - "129 prompt tokens counted by num_tokens_from_messages().\n",
| 536 | + "Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\n",
| 537 | + "124 prompt tokens counted by num_tokens_from_messages().\n",
539 | 538 | "124 prompt tokens counted by the OpenAI API.\n",
|
540 | 539 | "\n",
|
541 | 540 | "gpt-4o-mini\n",
|
542 | | - "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
543 | | - "129 prompt tokens counted by num_tokens_from_messages().\n",
| 541 | + "Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\n",
| 542 | + "124 prompt tokens counted by num_tokens_from_messages().\n",
544 | 543 | "124 prompt tokens counted by the OpenAI API.\n",
|
545 | 544 | "\n"
|
546 | 545 | ]
|
|
621 | 620 | "name": "python",
|
622 | 621 | "nbconvert_exporter": "python",
|
623 | 622 | "pygments_lexer": "ipython3",
|
624 | | - "version": "3.12.1"
| 623 | + "version": "3.11.7"
625 | 624 | },
|
626 | 625 | "vscode": {
|
627 | 626 | "interpreter": {
|
|