|
67 | 67 | "name": "stdout", |
68 | 68 | "output_type": "stream", |
69 | 69 | "text": [ |
70 | | - "tiktoken version: 0.5.1\n" |
| 70 | + "tiktoken version: 0.7.0\n" |
71 | 71 | ] |
72 | 72 | } |
73 | 73 | ], |
|
180 | 180 | "name": "stderr", |
181 | 181 | "output_type": "stream", |
182 | 182 | "text": [ |
183 | | - "Fetching encoder.json: 1.04Mit [00:00, 3.14Mit/s] \n", |
184 | | - "Fetching vocab.bpe: 457kit [00:00, 1.67Mit/s] \n" |
| 183 | + "Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n", |
| 184 | + "Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n" |
185 | 185 | ] |
186 | 186 | } |
187 | 187 | ], |
|
259 | 259 | { |
260 | 260 | "data": { |
261 | 261 | "text/plain": [ |
262 | | - "'4.34.0'" |
| 262 | + "'4.48.0'" |
263 | 263 | ] |
264 | 264 | }, |
265 | 265 | "execution_count": 12, |
|
278 | 278 | "execution_count": 13, |
279 | 279 | "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8", |
280 | 280 | "metadata": {}, |
281 | | - "outputs": [ |
282 | | - { |
283 | | - "data": { |
284 | | - "application/vnd.jupyter.widget-view+json": { |
285 | | - "model_id": "e4df871bb797435787143a3abe6b0231", |
286 | | - "version_major": 2, |
287 | | - "version_minor": 0 |
288 | | - }, |
289 | | - "text/plain": [ |
290 | | - "Downloading tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]" |
291 | | - ] |
292 | | - }, |
293 | | - "metadata": {}, |
294 | | - "output_type": "display_data" |
295 | | - }, |
296 | | - { |
297 | | - "data": { |
298 | | - "application/vnd.jupyter.widget-view+json": { |
299 | | - "model_id": "f11b27a4aabf43af9bf57f929683def6", |
300 | | - "version_major": 2, |
301 | | - "version_minor": 0 |
302 | | - }, |
303 | | - "text/plain": [ |
304 | | - "Downloading vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]" |
305 | | - ] |
306 | | - }, |
307 | | - "metadata": {}, |
308 | | - "output_type": "display_data" |
309 | | - }, |
310 | | - { |
311 | | - "data": { |
312 | | - "application/vnd.jupyter.widget-view+json": { |
313 | | - "model_id": "d3aa9a24aacc43108ef2ed72e7bacd33", |
314 | | - "version_major": 2, |
315 | | - "version_minor": 0 |
316 | | - }, |
317 | | - "text/plain": [ |
318 | | - "Downloading merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]" |
319 | | - ] |
320 | | - }, |
321 | | - "metadata": {}, |
322 | | - "output_type": "display_data" |
323 | | - }, |
324 | | - { |
325 | | - "data": { |
326 | | - "application/vnd.jupyter.widget-view+json": { |
327 | | - "model_id": "f9341bc23b594bb68dcf8954bff6d9bd", |
328 | | - "version_major": 2, |
329 | | - "version_minor": 0 |
330 | | - }, |
331 | | - "text/plain": [ |
332 | | - "Downloading tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]" |
333 | | - ] |
334 | | - }, |
335 | | - "metadata": {}, |
336 | | - "output_type": "display_data" |
337 | | - }, |
338 | | - { |
339 | | - "data": { |
340 | | - "application/vnd.jupyter.widget-view+json": { |
341 | | - "model_id": "c5f55f2f1dbc4152acc9b2061167ee0a", |
342 | | - "version_major": 2, |
343 | | - "version_minor": 0 |
344 | | - }, |
345 | | - "text/plain": [ |
346 | | - "Downloading config.json: 0%| | 0.00/665 [00:00<?, ?B/s]" |
347 | | - ] |
348 | | - }, |
349 | | - "metadata": {}, |
350 | | - "output_type": "display_data" |
351 | | - } |
352 | | - ], |
| 281 | + "outputs": [], |
353 | 282 | "source": [ |
354 | 283 | "from transformers import GPT2Tokenizer\n", |
355 | 284 | "\n", |
|
377 | 306 | "hf_tokenizer(strings)[\"input_ids\"]" |
378 | 307 | ] |
379 | 308 | }, |
| 309 | + { |
| 310 | + "cell_type": "markdown", |
| 311 | + "id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9", |
| 312 | + "metadata": {}, |
| 313 | + "source": [ |
| 314 | + "<br>\n", |
| 315 | + " \n", |
| 316 | + "\n", |
| 317 | + "## Using my own from-scratch BPE tokenizer" |
| 318 | + ] |
| 319 | + }, |
| 320 | + { |
| 321 | + "cell_type": "code", |
| 322 | + "execution_count": 15, |
| 323 | + "id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c", |
| 324 | + "metadata": {}, |
| 325 | + "outputs": [], |
| 326 | + "source": [ |
| 327 | + "import os\n", |
| 328 | + "import sys\n", |
| 329 | + "import io\n", |
| 330 | + "import nbformat\n", |
| 331 | + "import types\n", |
| 332 | + "\n", |
| 333 | + "def import_from_notebook():\n", |
| 334 | + " def import_definitions_from_notebook(fullname, names):\n", |
| 335 | + " current_dir = os.getcwd()\n", |
| 336 | + " path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n", |
| 337 | + " path = os.path.normpath(path)\n", |
| 338 | + "\n", |
| 339 | + " # Load the notebook\n", |
| 340 | + " if not os.path.exists(path):\n", |
| 341 | + " raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n", |
| 342 | + "\n", |
| 343 | + " with io.open(path, \"r\", encoding=\"utf-8\") as f:\n", |
| 344 | + " nb = nbformat.read(f, as_version=4)\n", |
| 345 | + "\n", |
| 346 | + " # Create a module to store the imported functions and classes\n", |
| 347 | + " mod = types.ModuleType(fullname)\n", |
| 348 | + " sys.modules[fullname] = mod\n", |
| 349 | + "\n", |
| 350 | + " # Go through the notebook cells and only execute function or class definitions\n", |
| 351 | + " for cell in nb.cells:\n", |
| 352 | + " if cell.cell_type == \"code\":\n", |
| 353 | + " cell_code = cell.source\n", |
| 354 | + " for name in names:\n", |
| 355 | + " # Check for function or class definitions\n", |
| 356 | + " if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n", |
| 357 | + " exec(cell_code, mod.__dict__)\n", |
| 358 | + " return mod\n", |
| 359 | + "\n", |
| 360 | + " fullname = \"bpe-from-scratch\"\n", |
| 361 | + " names = [\"BPETokenizerSimple\"]\n", |
| 362 | + "\n", |
| 363 | + " return import_definitions_from_notebook(fullname, names)" |
| 364 | + ] |
| 365 | + }, |
| 366 | + { |
| 367 | + "cell_type": "code", |
| 368 | + "execution_count": 16, |
| 369 | + "id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e", |
| 370 | + "metadata": {}, |
| 371 | + "outputs": [], |
| 372 | + "source": [ |
| 373 | + "imported_module = import_from_notebook()\n", |
| 374 | + "BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n", |
| 375 | + "\n", |
| 376 | + "tokenizer_gpt2 = BPETokenizerSimple()\n", |
| 377 | + "tokenizer_gpt2.load_vocab_and_merges_from_openai(\n", |
| 378 | + " vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n", |
| 379 | + " bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n", |
| 380 | + ")" |
| 381 | + ] |
| 382 | + }, |
| 383 | + { |
| 384 | + "cell_type": "code", |
| 385 | + "execution_count": 17, |
| 386 | + "id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7", |
| 387 | + "metadata": {}, |
| 388 | + "outputs": [ |
| 389 | + { |
| 390 | + "name": "stdout", |
| 391 | + "output_type": "stream", |
| 392 | + "text": [ |
| 393 | + "[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n" |
| 394 | + ] |
| 395 | + } |
| 396 | + ], |
| 397 | + "source": [ |
| 398 | + "integers = tokenizer_gpt2.encode(text)\n", |
| 399 | + "\n", |
| 400 | + "print(integers)" |
| 401 | + ] |
| 402 | + }, |
380 | 403 | { |
381 | 404 | "cell_type": "markdown", |
382 | 405 | "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98", |
|
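The added cells above pull `BPETokenizerSimple` out of the `05_bpe-from-scratch` notebook and load OpenAI's published `encoder.json` / `vocab.bpe` files into it before encoding the sample text. A natural follow-up in the same notebook session is a side-by-side look at the from-scratch tokenization versus tiktoken. The sketch below is not part of the diff: it assumes `tik_tokenizer`, `tokenizer_gpt2`, and `text` from earlier cells are in scope, and it assumes `BPETokenizerSimple` exposes a `decode` method mirroring `encode` (not shown in this diff).

```python
# Minimal sketch (assumes tik_tokenizer, tokenizer_gpt2, and text are defined
# by earlier notebook cells).
ids_tiktoken = tik_tokenizer.encode(text, allowed_special={"<|endoftext|>"})  # tiktoken reference
ids_scratch  = tokenizer_gpt2.encode(text)                                    # from-scratch implementation

print("tiktoken:    ", ids_tiktoken)
print("from scratch:", ids_scratch)

# Assumes BPETokenizerSimple provides a decode method; the decoded string
# should reproduce the original input even if the ID sequences differ.
print(tokenizer_gpt2.decode(ids_scratch))
```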
390 | 413 | }, |
391 | 414 | { |
392 | 415 | "cell_type": "code", |
393 | | - "execution_count": 15, |
| 416 | + "execution_count": 18, |
394 | 417 | "id": "a61bb445-b151-4a2f-8180-d4004c503754", |
395 | 418 | "metadata": {}, |
396 | 419 | "outputs": [], |
|
399 | 422 | " raw_text = f.read()" |
400 | 423 | ] |
401 | 424 | }, |
| 425 | + { |
| 426 | + "cell_type": "markdown", |
| 427 | + "id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e", |
| 428 | + "metadata": {}, |
| 429 | + "source": [ |
| 430 | + "### Original OpenAI GPT-2 tokenizer" |
| 431 | + ] |
| 432 | + }, |
402 | 433 | { |
403 | 434 | "cell_type": "code", |
404 | | - "execution_count": 16, |
| 435 | + "execution_count": 19, |
405 | 436 | "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653", |
406 | 437 | "metadata": {}, |
407 | 438 | "outputs": [ |
408 | 439 | { |
409 | 440 | "name": "stdout", |
410 | 441 | "output_type": "stream", |
411 | 442 | "text": [ |
412 | | - "4.29 ms ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
| 443 | + "3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
413 | 444 | ] |
414 | 445 | } |
415 | 446 | ], |
416 | 447 | "source": [ |
417 | 448 | "%timeit orig_tokenizer.encode(raw_text)" |
418 | 449 | ] |
419 | 450 | }, |
| 451 | + { |
| 452 | + "cell_type": "markdown", |
| 453 | + "id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90", |
| 454 | + "metadata": {}, |
| 455 | + "source": [ |
| 456 | + "### Tiktoken OpenAI GPT-2 tokenizer" |
| 457 | + ] |
| 458 | + }, |
420 | 459 | { |
421 | 460 | "cell_type": "code", |
422 | | - "execution_count": 17, |
| 461 | + "execution_count": 20, |
423 | 462 | "id": "036dd628-3591-46c9-a5ce-b20b105a8062", |
424 | 463 | "metadata": {}, |
425 | 464 | "outputs": [ |
426 | 465 | { |
427 | 466 | "name": "stdout", |
428 | 467 | "output_type": "stream", |
429 | 468 | "text": [ |
430 | | - "1.4 ms ± 9.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" |
| 469 | + "1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" |
431 | 470 | ] |
432 | 471 | } |
433 | 472 | ], |
434 | 473 | "source": [ |
435 | 474 | "%timeit tik_tokenizer.encode(raw_text)" |
436 | 475 | ] |
437 | 476 | }, |
| 477 | + { |
| 478 | + "cell_type": "markdown", |
| 479 | + "id": "0c748de8-273e-42df-b078-3a510106da60", |
| 480 | + "metadata": {}, |
| 481 | + "source": [ |
| 482 | + "### Hugging Face OpenAI GPT-2 tokenizer" |
| 483 | + ] |
| 484 | + }, |
438 | 485 | { |
439 | 486 | "cell_type": "code", |
440 | | - "execution_count": 18, |
| 487 | + "execution_count": 21, |
441 | 488 | "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90", |
442 | 489 | "metadata": {}, |
443 | 490 | "outputs": [ |
|
452 | 499 | "name": "stdout", |
453 | 500 | "output_type": "stream", |
454 | 501 | "text": [ |
455 | | - "8.46 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
| 502 | + "10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
456 | 503 | ] |
457 | 504 | } |
458 | 505 | ], |
|
462 | 509 | }, |
463 | 510 | { |
464 | 511 | "cell_type": "code", |
465 | | - "execution_count": 19, |
| 512 | + "execution_count": 22, |
466 | 513 | "id": "7117107f-22a6-46b4-a442-712d50b3ac7a", |
467 | 514 | "metadata": {}, |
468 | 515 | "outputs": [ |
469 | 516 | { |
470 | 517 | "name": "stdout", |
471 | 518 | "output_type": "stream", |
472 | 519 | "text": [ |
473 | | - "8.36 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
| 520 | + "10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" |
474 | 521 | ] |
475 | 522 | } |
476 | 523 | ], |
477 | 524 | "source": [ |
478 | 525 | "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]" |
479 | 526 | ] |
| 527 | + }, |
| 528 | + { |
| 529 | + "cell_type": "markdown", |
| 530 | + "id": "91ac2876-f36e-498c-bd75-8597a39f2d4b", |
| 531 | + "metadata": {}, |
| 532 | + "source": [ |
| 533 | + "### My own GPT-2 tokenizer (for educational purposes)" |
| 534 | + ] |
| 535 | + }, |
| 536 | + { |
| 537 | + "cell_type": "code", |
| 538 | + "execution_count": 23, |
| 539 | + "id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429", |
| 540 | + "metadata": {}, |
| 541 | + "outputs": [ |
| 542 | + { |
| 543 | + "name": "stdout", |
| 544 | + "output_type": "stream", |
| 545 | + "text": [ |
| 546 | + "1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n" |
| 547 | + ] |
| 548 | + } |
| 549 | + ], |
| 550 | + "source": [ |
| 551 | + "%timeit tokenizer_gpt2.encode(raw_text)" |
| 552 | + ] |
480 | 553 | } |
481 | 554 | ], |
482 | 555 | "metadata": { |
|
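The four `%timeit` cells above benchmark the same `raw_text` with the original OpenAI implementation, tiktoken, the Hugging Face tokenizer, and the from-scratch class. If one wanted the numbers programmatically (e.g., to tabulate them) rather than reading the magic's printed output, the `timeit` module can be used directly. The sketch below assumes `orig_tokenizer`, `tik_tokenizer`, `hf_tokenizer`, `tokenizer_gpt2`, and `raw_text` from the cells above are in scope; `candidates` and `n_runs` are illustrative names and an arbitrary repeat count, not part of the notebook.

```python
# Minimal sketch: time each tokenizer on raw_text with timeit instead of %timeit.
import timeit

candidates = {
    "original OpenAI": lambda: orig_tokenizer.encode(raw_text),
    "tiktoken":        lambda: tik_tokenizer.encode(raw_text),
    "Hugging Face":    lambda: hf_tokenizer(raw_text, max_length=5145, truncation=True)["input_ids"],
    "from scratch":    lambda: tokenizer_gpt2.encode(raw_text),
}

n_runs = 20  # arbitrary; %timeit above picks its own loop counts
for name, fn in candidates.items():
    seconds = timeit.timeit(fn, number=n_runs) / n_runs
    print(f"{name:16s} {seconds * 1e3:.2f} ms per call")
```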