|
23 | 23 | "| Encoding name | OpenAI models |\n",
|
24 | 24 | "|-------------------------|-----------------------------------------------------|\n",
|
25 | 25 | "| `o200k_base` | `gpt-4o`, `gpt-4o-mini` |\n",
|
26 | | - "| `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
| 26 | + "| `cl100k_base` | `gpt-4-turbo`, `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large` |\n",
27 | 27 | "| `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003`|\n",
|
28 | 28 | "| `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` |\n",
|
29 | 29 | "\n",
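The table above is a static mapping; tiktoken can also look up the encoding for a model name at runtime. A minimal sketch (which model names are recognized depends on the installed tiktoken version):

```python
import tiktoken

# Resolve the encoding from a model name rather than hard-coding it.
# Raises a KeyError if the installed tiktoken doesn't know the model.
encoding = tiktoken.encoding_for_model("gpt-4o")
print(encoding.name)  # "o200k_base" on a recent tiktoken release
```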
|
|
36 | 36 | "\n",
|
37 | 37 | "## Tokenizer libraries by language\n",
|
38 | 38 | "\n",
|
39 | | - "For `cl100k_base` and `p50k_base` encodings:\n",
| 39 | + "For `o200k_base`, `cl100k_base` and `p50k_base` encodings:\n",
40 | 40 | "- Python: [tiktoken](https://github.com/openai/tiktoken/blob/main/README.md)\n",
|
41 | 41 | "- .NET / C#: [SharpToken](https://github.com/dmitry-brazhenko/SharpToken), [TiktokenSharp](https://github.com/aiqinxuancai/TiktokenSharp)\n",
|
42 | 42 | "- Java: [jtokkit](https://github.com/knuddelsgmbh/jtokkit)\n",
|
|
81 | 81 | "text": [
|
82 | 82 | "\n",
|
83 | 83 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
|
84 | | - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
| 84 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
85 | 85 | "Note: you may need to restart the kernel to use updated packages.\n",
|
86 | 86 | "\n",
|
87 | 87 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m24.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n",
|
88 | | - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3 -m pip install --upgrade pip\u001b[0m\n",
| 88 | + "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n",
89 | 89 | "Note: you may need to restart the kernel to use updated packages.\n"
|
90 | 90 | ]
|
91 | 91 | }
|
|
184 | 184 | }
|
185 | 185 | ],
|
186 | 186 | "source": [
|
187 | | - "encoding.encode(\"tiktoken is great!\")\n"
| 187 | + "encoding.encode(\"tiktoken is great!\")"
188 | 188 | ]
|
189 | 189 | },
|
190 | 190 | {
|
|
205 | 205 | "    \"\"\"Returns the number of tokens in a text string.\"\"\"\n",

206 | 206 | "    encoding = tiktoken.get_encoding(encoding_name)\n",

207 | 207 | "    num_tokens = len(encoding.encode(string))\n",
|
208 | | - "    return num_tokens\n"
| 208 | + "    return num_tokens"
209 | 209 | ]
|
210 | 210 | },
|
211 | 211 | {
|
|
225 | 225 | }
|
226 | 226 | ],
|
227 | 227 | "source": [
|
228 | | - "num_tokens_from_string(\"tiktoken is great!\", \"cl100k_base\")\n"
| 228 | + "num_tokens_from_string(\"tiktoken is great!\", \"o200k_base\")"
229 | 229 | ]
|
230 | 230 | },
|
231 | 231 | {
|
|
252 | 252 | {
|
253 | 253 | "data": {
|
254 | 254 | "text/plain": [
|
255 | | - "'turesറلás!'"
| 255 | + "'tiktoken is great!'"
256 | 256 | ]
|
257 | 257 | },
|
258 | 258 | "execution_count": 8,
|
|
261 | 261 | }
|
262 | 262 | ],
|
263 | 263 | "source": [
|
264 | | - "encoding.decode([83, 1609, 5963, 374, 2294, 0])\n"
| 264 | + "encoding.decode([83, 8251, 2488, 382, 2212, 0])"
265 | 265 | ]
|
266 | 266 | },
|
267 | 267 | {
|
|
288 | 288 | {
|
289 | 289 | "data": {
|
290 | 290 | "text/plain": [
|
291 | | - "[b't', b'ures', b'\\xe0\\xb4\\xb1', b'\\xd9\\x84', b'\\xc3\\xa1s', b'!']"
| 291 | + "[b't', b'ikt', b'oken', b' is', b' great', b'!']"
292 | 292 | ]
|
293 | 293 | },
|
294 | 294 | "execution_count": 9,
|
|
297 | 297 | }
|
298 | 298 | ],
|
299 | 299 | "source": [
|
300 | | - "[encoding.decode_single_token_bytes(token) for token in [83, 1609, 5963, 374, 2294, 0]]\n"
| 300 | + "[encoding.decode_single_token_bytes(token) for token in [83, 8251, 2488, 382, 2212, 0]]\n"
301 | 301 | ]
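The changed expected outputs above reflect that `o200k_base` assigns different token ids than `cl100k_base` to the same text. Whatever the encoding, encode and decode are lossless inverses, and the per-token byte chunks concatenate back to the original UTF-8 bytes; a small sketch, assuming a tiktoken version that ships `o200k_base`:

```python
import tiktoken

encoding = tiktoken.get_encoding("o200k_base")
text = "tiktoken is great!"

# Round trip: decode(encode(s)) recovers the original string.
tokens = encoding.encode(text)
assert encoding.decode(tokens) == text

# Single-token byte chunks join back into the string's UTF-8 bytes,
# even when an individual token is not valid UTF-8 on its own.
chunks = [encoding.decode_single_token_bytes(t) for t in tokens]
assert b"".join(chunks) == text.encode("utf-8")
```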
|
302 | 302 | },
|
303 | 303 | {
|
|
337 | 337 | "        print()\n",

338 | 338 | "        print(f\"{encoding_name}: {num_tokens} tokens\")\n",

339 | 339 | "        print(f\"token integers: {token_integers}\")\n",
|
340 | | - "        print(f\"token bytes: {token_bytes}\")\n",
341 | | - "    "
| 340 | + "        print(f\"token bytes: {token_bytes}\")"
342 | 341 | ]
|
343 | 342 | },
|
344 | 343 | {
|
|
372 | 371 | }
|
373 | 372 | ],
|
374 | 373 | "source": [
|
375 | | - "compare_encodings(\"antidisestablishmentarianism\")\n"
| 374 | + "compare_encodings(\"antidisestablishmentarianism\")"
376 | 375 | ]
|
377 | 376 | },
|
378 | 377 | {
|
|
406 | 405 | }
|
407 | 406 | ],
|
408 | 407 | "source": [
|
409 | | - "compare_encodings(\"2 + 2 = 4\")\n"
| 408 | + "compare_encodings(\"2 + 2 = 4\")"
410 | 409 | ]
|
411 | 410 | },
|
412 | 411 | {
|
|
440 | 439 | }
|
441 | 440 | ],
|
442 | 441 | "source": [
|
443 | | - "compare_encodings(\"お誕生日おめでとう\")\n"
| 442 | + "compare_encodings(\"お誕生日おめでとう\")"
444 | 443 | ]
|
445 | 444 | },
|
446 | 445 | {
|
|
450 | 449 | "source": [
|
451 | 450 | "## 6. Counting tokens for chat completions API calls\n",
|
452 | 451 | "\n",
|
453 | | - "ChatGPT models like `gpt-3.5-turbo` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
| 452 | + "ChatGPT models like `gpt-4o-mini` and `gpt-4` use tokens in the same way as older completions models, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.\n",
454 | 453 | "\n",
|
455 | | - "Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo` or `gpt-4`.\n",
| 454 | + "Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo`, `gpt-4`, `gpt-4o` and `gpt-4o-mini`.\n",
456 | 455 | "\n",
|
457 | 456 | "Note that the exact way that tokens are counted from messages may change from model to model. Consider the counts from the function below an estimate, not a timeless guarantee.\n",
|
458 | 457 | "\n",
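For context, the helper being patched below is called on a list of chat messages. A hypothetical usage sketch (the example messages are illustrative; `num_tokens_from_messages` is the function defined in the cells that follow):

```python
# Assumes num_tokens_from_messages has been defined as in the notebook below.
example_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How many tokens will this conversation use?"},
]
print(num_tokens_from_messages(example_messages, model="gpt-4o-mini"))
```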
|
|
470 | 469 | "    try:\n",

471 | 470 | "        encoding = tiktoken.encoding_for_model(model)\n",

472 | 471 | "    except KeyError:\n",
|
473 | | - "        print(\"Warning: model not found. Using cl100k_base encoding.\")\n",
474 | | - "        encoding = tiktoken.get_encoding(\"cl100k_base\")\n",
| 472 | + "        print(\"Warning: model not found. Using o200k_base encoding.\")\n",
| 473 | + "        encoding = tiktoken.get_encoding(\"o200k_base\")\n",
475 | 474 | "    if model in {\n",

476 | 475 | "        \"gpt-3.5-turbo-0125\",\n",

477 | 476 | "        \"gpt-4-0314\",\n",
|
|
486 | 485 | "    elif \"gpt-3.5-turbo\" in model:\n",

487 | 486 | "        print(\"Warning: gpt-3.5-turbo may update over time. Returning num tokens assuming gpt-3.5-turbo-0125.\")\n",

488 | 487 | "        return num_tokens_from_messages(messages, model=\"gpt-3.5-turbo-0125\")\n",
|
489 | | - "    elif \"gpt-4\" in model:\n",
490 | | - "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
491 | | - "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
492 | | - "    elif \"gpt-4o\" in model:\n",
493 | | - "        print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
494 | | - "        return num_tokens_from_messages(messages, model=\"ggpt-4o-2024-08-06\")\n",
495 | 488 | "    elif \"gpt-4o-mini\" in model:\n",

496 | 489 | "        print(\"Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\")\n",

497 | 490 | "        return num_tokens_from_messages(messages, model=\"gpt-4o-mini-2024-07-18\")\n",
|
| 491 | + "    elif \"gpt-4o\" in model:\n",
| 492 | + "        print(\"Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\")\n",
| 493 | + "        return num_tokens_from_messages(messages, model=\"gpt-4o-2024-08-06\")\n",
| 494 | + "    elif \"gpt-4\" in model:\n",
| 495 | + "        print(\"Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\")\n",
| 496 | + "        return num_tokens_from_messages(messages, model=\"gpt-4-0613\")\n",
498 | 497 | "    else:\n",

499 | 498 | "        raise NotImplementedError(\n",

500 | 499 | "            f\"\"\"num_tokens_from_messages() is not implemented for model {model}.\"\"\"\n",
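The reordering above is the substantive fix: Python's `in` operator does substring matching, so under the old branch order every `gpt-4o` and `gpt-4o-mini` request hit the `"gpt-4" in model` branch first (visible in the old outputs below, which warn about gpt-4 and return 129 tokens instead of 124). Testing the most specific name first routes each model correctly; the change also removes the old `"ggpt-4o-2024-08-06"` typo. A quick illustration:

```python
model = "gpt-4o-mini"
# Substring checks are not exact matches: all three are True.
print("gpt-4" in model)        # True
print("gpt-4o" in model)       # True
print("gpt-4o-mini" in model)  # True
# Hence the elif chain must test "gpt-4o-mini" before "gpt-4o" before "gpt-4".
```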
|
|
534 | 533 | "129 prompt tokens counted by the OpenAI API.\n",
|
535 | 534 | "\n",
|
536 | 535 | "gpt-4o\n",
|
537 | | - "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
538 | | - "129 prompt tokens counted by num_tokens_from_messages().\n",
| 536 | + "Warning: gpt-4o and gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-2024-08-06.\n",
| 537 | + "124 prompt tokens counted by num_tokens_from_messages().\n",
539 | 538 | "124 prompt tokens counted by the OpenAI API.\n",
|
540 | 539 | "\n",
|
541 | 540 | "gpt-4o-mini\n",
|
542 | | - "Warning: gpt-4 may update over time. Returning num tokens assuming gpt-4-0613.\n",
543 | | - "129 prompt tokens counted by num_tokens_from_messages().\n",
| 541 | + "Warning: gpt-4o-mini may update over time. Returning num tokens assuming gpt-4o-mini-2024-07-18.\n",
| 542 | + "124 prompt tokens counted by num_tokens_from_messages().\n",
544 | 543 | "124 prompt tokens counted by the OpenAI API.\n",
|
545 | 544 | "\n"
|
546 | 545 | ]
|
|
621 | 620 | "name": "python",
|
622 | 621 | "nbconvert_exporter": "python",
|
623 | 622 | "pygments_lexer": "ipython3",
|
624 | | - "version": "3.12.1"
| 623 | + "version": "3.11.7"
625 | 624 | },
|
626 | 625 | "vscode": {
|
627 | 626 | "interpreter": {
|
|