Skip to content

Commit ab69ac5

Browse files
committed
corrected
1 parent 18273cb commit ab69ac5

File tree

1 file changed

+30
-31
lines changed

1 file changed

+30
-31
lines changed

examples/How_to_count_tokens_with_tiktoken.ipynb

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
"| Encoding name | OpenAI models |\n",
2424
"|-------------------------|-----------------------------------------------------|\n",
2525
"| `o200k_base` | `gpt-4o`, `gpt-4o-mini` |\n",
26-
"| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
26+
"| `cl100k_base` | `gpt-4-turbo`, `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
2727
"| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003`|\n",
2828
"| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |\n",
2929
"\n",
@@ -36,7 +36,7 @@
3636
"\n",
3737
"## Tokenizer libraries by language\n",
3838
"\n",
39-
"For `cl100k_base` and `p50k_base` encodings:\n",
39+
"For `o200k_base`, `cl100k_base` and `p50k_base` encodings:\n",
4040
"- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md)\n",
4141
"- .NET / C#: [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), [TiktokenSharp](https://github.com/aiqinxuancai/TiktokenSharp)\n",
4242
"- Java: [jtokkit](https://github.com/knuddelsgmbh/jtokkit)\n",
@@ -81,11 +81,11 @@
8181
"text": [
8282
"\n",
8383
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
84-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
84+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
8585
"Note: you may need to restart the kernel to use updated packages.\n",
8686
"\n",
8787
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
88-
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
88+
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
8989
"Note: you may need to restart the kernel to use updated packages.\n"
9090
]
9191
}
@@ -184,7 +184,7 @@
184184
}
185185
],
186186
"source": [
187-
"encoding.encode(\"tiktoken is great!\")\n"
187+
"encoding.encode(\"tiktoken is great!\")"
188188
]
189189
},
190190
{
@@ -205,7 +205,7 @@
205205
" \"\"\"Returns the number of tokens in a text string.\"\"\"\n",
206206
" encoding = tiktoken.get_encoding(encoding_name)\n",
207207
" num_tokens = len(encoding.encode(string))\n",
208-
" return num_tokens\n"
208+
" return num_tokens"
209209
]
210210
},
211211
{
@@ -225,7 +225,7 @@
225225
}
226226
],
227227
"source": [
228-
"num_tokens_from_string(\"tiktoken is great!\", \"cl100k_base\")\n"
228+
"num_tokens_from_string(\"tiktoken is great!\", \"o200k_base\")"
229229
]
230230
},
231231
{
@@ -252,7 +252,7 @@
252252
{
253253
"data": {
254254
"text/plain": [
255-
"'turesറلás!'"
255+
"'tiktoken is great!'"
256256
]
257257
},
258258
"execution_count": 8,
@@ -261,7 +261,7 @@
261261
}
262262
],
263263
"source": [
264-
"encoding.decode([83, 1609, 5963, 374, 2294, 0])\n"
264+
"encoding.decode([83, 8251, 2488, 382, 2212, 0])"
265265
]
266266
},
267267
{
@@ -288,7 +288,7 @@
288288
{
289289
"data": {
290290
"text/plain": [
291-
"[b't', b'ures', b'\\xe0\\xb4\\xb1', b'\\xd9\\x84', b'\\xc3\\xa1s', b'!']"
291+
"[b't', b'ikt', b'oken', b' is', b' great', b'!']"
292292
]
293293
},
294294
"execution_count": 9,
@@ -297,7 +297,7 @@
297297
}
298298
],
299299
"source": [
300-
"[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]\n"
300+
"[encoding.decode_single_token_bytes(token) for token in [83, 8251, 2488, 382, 2212, 0]]\n"
301301
]
302302
},
303303
{
@@ -337,8 +337,7 @@
337337
" print()\n",
338338
" print(f\"{encoding_name}: {num_tokens} tokens\")\n",
339339
" print(f\"token integers: {token_integers}\")\n",
340-
" print(f\"token bytes: {token_bytes}\")\n",
341-
" "
340+
" print(f\"token bytes: {token_bytes}\")"
342341
]
343342
},
344343
{
@@ -372,7 +371,7 @@
372371
}
373372
],
374373
"source": [
375-
"compare_encodings(\"antidisestablishmentarianism\")\n"
374+
"compare_encodings(\"antidisestablishmentarianism\")"
376375
]
377376
},
378377
{
@@ -406,7 +405,7 @@
406405
}
407406
],
408407
"source": [
409-
"compare_encodings(\"2 + 2 = 4\")\n"
408+
"compare_encodings(\"2 + 2 = 4\")"
410409
]
411410
},
412411
{
@@ -440,7 +439,7 @@
440439
}
441440
],
442441
"source": [
443-
"compare_encodings(\"お誕生日おめでとう\")\n"
442+
"compare_encodings(\"お誕生日おめでとう\")"
444443
]
445444
},
446445
{
@@ -450,9 +449,9 @@
450449
"source": [
451450
"## 6. Counting tokens for chat completions API calls\n",
452451
"\n",
453-
"ChatGPT models like `gpt-3.5-turbo` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
452+
"ChatGPT models like `gpt-4o-mini` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
454453
"\n",
455-
"Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo` or `gpt-4`.\n",
454+
"Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo`, `gpt-4`, `gpt-4o` and `gpt-4o-mini`.\n",
456455
"\n",
457456
"Note that the exact way that tokens are counted from messages may change from model to model. Consider the counts from the function below an estimate, not a timeless guarantee.\n",
458457
"\n",
@@ -470,8 +469,8 @@
470469
" try:\n",
471470
" encoding = tiktoken.encoding_for_model(model)\n",
472471
" except KeyError:\n",
473-
" print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
474-
" encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
472+
" print(\"Warning: model not found. Using o200k_base encoding.\")\n",
473+
" encoding = tiktoken.get_encoding(\"o200k_base\")\n",
475474
" if model in {\n",
476475
" \"gpt-3.5-turbo-0125\",\n",
477476
" \"gpt-4-0314\",\n",
@@ -486,15 +485,15 @@
486485
" elif \"gpt-3.5-turbo\" in model:\n",
487486
" print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.\")\n",
488487
" return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0125\")\n",
489-
" elif \"gpt-4\" in model:\n",
490-
" print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
491-
" return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
492-
" elif \"gpt-4o\" in model:\n",
493-
" print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
494-
" return num_tokens_from_messages(messages, model=\"ggpt-4o-2024-08-06\")\n",
495488
" elif \"gpt-4o-mini\" in model:\n",
496489
" print(\"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\")\n",
497490
" return num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\")\n",
491+
" elif \"gpt-4o\" in model:\n",
492+
" print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
493+
" return num_tokens_from_messages(messages, model=\"gpt-4o-2024-08-06\")\n",
494+
" elif \"gpt-4\" in model:\n",
495+
" print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
496+
" return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
498497
" else:\n",
499498
" raise NotImplementedError(\n",
500499
" f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
@@ -534,13 +533,13 @@
534533
"129 prompt tokens counted by the OpenAI API.\n",
535534
"\n",
536535
"gpt-4o\n",
537-
"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
538-
"129 prompt tokens counted by num_tokens_from_messages().\n",
536+
"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\n",
537+
"124 prompt tokens counted by num_tokens_from_messages().\n",
539538
"124 prompt tokens counted by the OpenAI API.\n",
540539
"\n",
541540
"gpt-4o-mini\n",
542-
"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
543-
"129 prompt tokens counted by num_tokens_from_messages().\n",
541+
"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\n",
542+
"124 prompt tokens counted by num_tokens_from_messages().\n",
544543
"124 prompt tokens counted by the OpenAI API.\n",
545544
"\n"
546545
]
@@ -621,7 +620,7 @@
621620
"name": "python",
622621
"nbconvert_exporter": "python",
623622
"pygments_lexer": "ipython3",
624-
"version": "3.12.1"
623+
"version": "3.11.7"
625624
},
626625
"vscode": {
627626
"interpreter": {

0 commit comments

Comments
 (0)