
Commit 0d4967e

Implementing the BPE Tokenizer from Scratch (#487)
1 parent 2fef211 commit 0d4967e

File tree

4 files changed, +1463 -86 lines changed


README.md

Lines changed: 1 addition & 0 deletions
@@ -102,6 +102,7 @@ Several folders contain optional materials as a bonus for interested readers:
 - [Installing Python Packages and Libraries Used In This Book](setup/02_installing-python-libraries)
 - [Docker Environment Setup Guide](setup/03_optional-docker-environment)
 - **Chapter 2: Working with text data**
+- [Byte Pair Encoding (BPE) Tokenizer From Scratch](ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb)
 - [Comparing Various Byte Pair Encoding (BPE) Implementations](ch02/02_bonus_bytepair-encoder)
 - [Understanding the Difference Between Embedding Layers and Linear Layers](ch02/03_bonus_embedding-vs-matmul)
 - [Dataloader Intuition with Simple Numbers](ch02/04_bonus_dataloader-intuition)

ch02/01_main-chapter-code/ch02.ipynb

Lines changed: 3 additions & 1 deletion
@@ -1900,7 +1900,9 @@
 "source": [
 "See the [./dataloader.ipynb](./dataloader.ipynb) code notebook, which is a concise version of the data loader that we implemented in this chapter and will need for training the GPT model in upcoming chapters.\n",
 "\n",
-"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions."
+"See [./exercise-solutions.ipynb](./exercise-solutions.ipynb) for the exercise solutions.\n",
+"\n",
+"See the [Byte Pair Encoding (BPE) Tokenizer From Scratch](../02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb) notebook if you are interested in learning how the GPT-2 tokenizer can be implemented and trained from scratch."
 ]
 }
 ],

ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb

Lines changed: 158 additions & 85 deletions
@@ -67,7 +67,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"tiktoken version: 0.5.1\n"
+"tiktoken version: 0.7.0\n"
 ]
 }
 ],
@@ -180,8 +180,8 @@
 "name": "stderr",
 "output_type": "stream",
 "text": [
-"Fetching encoder.json: 1.04Mit [00:00, 3.14Mit/s] \n",
-"Fetching vocab.bpe: 457kit [00:00, 1.67Mit/s] \n"
+"Fetching encoder.json: 1.04Mit [00:00, 3.47Mit/s] \n",
+"Fetching vocab.bpe: 457kit [00:00, 2.07Mit/s] \n"
 ]
 }
 ],
@@ -259,7 +259,7 @@
 {
 "data": {
 "text/plain": [
-"'4.34.0'"
+"'4.48.0'"
 ]
 },
 "execution_count": 12,
@@ -278,78 +278,7 @@
 "execution_count": 13,
 "id": "a9839137-b8ea-4a2c-85fc-9a63064cf8c8",
 "metadata": {},
-"outputs": [
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "e4df871bb797435787143a3abe6b0231",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading tokenizer_config.json: 0%| | 0.00/26.0 [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "f11b27a4aabf43af9bf57f929683def6",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading vocab.json: 0%| | 0.00/1.04M [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "d3aa9a24aacc43108ef2ed72e7bacd33",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading merges.txt: 0%| | 0.00/456k [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "f9341bc23b594bb68dcf8954bff6d9bd",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading tokenizer.json: 0%| | 0.00/1.36M [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-},
-{
-"data": {
-"application/vnd.jupyter.widget-view+json": {
-"model_id": "c5f55f2f1dbc4152acc9b2061167ee0a",
-"version_major": 2,
-"version_minor": 0
-},
-"text/plain": [
-"Downloading config.json: 0%| | 0.00/665 [00:00<?, ?B/s]"
-]
-},
-"metadata": {},
-"output_type": "display_data"
-}
-],
+"outputs": [],
 "source": [
 "from transformers import GPT2Tokenizer\n",
 "\n",
@@ -377,6 +306,100 @@
 "hf_tokenizer(strings)[\"input_ids\"]"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "9d0f2e95-8ae8-4606-a8e0-b0fce91cfac9",
+"metadata": {},
+"source": [
+"<br>\n",
+"&nbsp;\n",
+"\n",
+"## Using my own from-scratch BPE tokenizer"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 15,
+"id": "b6e6b1a5-9dc0-4b20-9a8b-c02aa0e3191c",
+"metadata": {},
+"outputs": [],
+"source": [
+"import os\n",
+"import sys\n",
+"import io\n",
+"import nbformat\n",
+"import types\n",
+"\n",
+"def import_from_notebook():\n",
+"    def import_definitions_from_notebook(fullname, names):\n",
+"        current_dir = os.getcwd()\n",
+"        path = os.path.join(current_dir, \"..\", \"05_bpe-from-scratch\", fullname + \".ipynb\")\n",
+"        path = os.path.normpath(path)\n",
+"\n",
+"        # Load the notebook\n",
+"        if not os.path.exists(path):\n",
+"            raise FileNotFoundError(f\"Notebook file not found at: {path}\")\n",
+"\n",
+"        with io.open(path, \"r\", encoding=\"utf-8\") as f:\n",
+"            nb = nbformat.read(f, as_version=4)\n",
+"\n",
+"        # Create a module to store the imported functions and classes\n",
+"        mod = types.ModuleType(fullname)\n",
+"        sys.modules[fullname] = mod\n",
+"\n",
+"        # Go through the notebook cells and only execute function or class definitions\n",
+"        for cell in nb.cells:\n",
+"            if cell.cell_type == \"code\":\n",
+"                cell_code = cell.source\n",
+"                for name in names:\n",
+"                    # Check for function or class definitions\n",
+"                    if f\"def {name}\" in cell_code or f\"class {name}\" in cell_code:\n",
+"                        exec(cell_code, mod.__dict__)\n",
+"        return mod\n",
+"\n",
+"    fullname = \"bpe-from-scratch\"\n",
+"    names = [\"BPETokenizerSimple\"]\n",
+"\n",
+"    return import_definitions_from_notebook(fullname, names)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 16,
+"id": "04fbd764-ec98-44f1-9b0a-e9db9a3bb91e",
+"metadata": {},
+"outputs": [],
+"source": [
+"imported_module = import_from_notebook()\n",
+"BPETokenizerSimple = getattr(imported_module, \"BPETokenizerSimple\", None)\n",
+"\n",
+"tokenizer_gpt2 = BPETokenizerSimple()\n",
+"tokenizer_gpt2.load_vocab_and_merges_from_openai(\n",
+"    vocab_path=os.path.join(\"gpt2_model\", \"encoder.json\"),\n",
+"    bpe_merges_path=os.path.join(\"gpt2_model\", \"vocab.bpe\")\n",
+")"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 17,
+"id": "5a5def88-1d2c-4550-a5e8-ee82b72b92d7",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
+]
+}
+],
+"source": [
+"integers = tokenizer_gpt2.encode(text)\n",
+"\n",
+"print(integers)"
+]
+},
 {
 "cell_type": "markdown",
 "id": "907a1ade-3401-4f2e-9017-7f58a60cbd98",
@@ -390,7 +413,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 15,
+"execution_count": 18,
 "id": "a61bb445-b151-4a2f-8180-d4004c503754",
 "metadata": {},
 "outputs": [],
@@ -399,45 +422,69 @@
 " raw_text = f.read()"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "9c0ae9f0-47a1-4e7f-a210-e1d2721f4d1e",
+"metadata": {},
+"source": [
+"### Original OpenAI GPT-2 tokenizer"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 16,
+"execution_count": 19,
 "id": "57f7c0a3-c1fd-4313-af34-68e78eb33653",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"4.29 ms ± 46.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+"3.44 ms ± 54 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 ]
 }
 ],
 "source": [
 "%timeit orig_tokenizer.encode(raw_text)"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "ef2ce3f3-1f81-47ce-b563-99fe2c7a1e90",
+"metadata": {},
+"source": [
+"### Tiktoken OpenAI GPT-2 tokenizer"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 17,
+"execution_count": 20,
 "id": "036dd628-3591-46c9-a5ce-b20b105a8062",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"1.4 ms ± 9.71 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+"1.08 ms ± 4.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
 ]
 }
 ],
 "source": [
 "%timeit tik_tokenizer.encode(raw_text)"
 ]
 },
+{
+"cell_type": "markdown",
+"id": "0c748de8-273e-42df-b078-3a510106da60",
+"metadata": {},
+"source": [
+"### Hugging Face OpenAI GPT-2 tokenizer"
+]
+},
 {
 "cell_type": "code",
-"execution_count": 18,
+"execution_count": 21,
 "id": "b9c85b58-bfbc-465e-9a7e-477e53d55c90",
 "metadata": {},
 "outputs": [
@@ -452,7 +499,7 @@
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"8.46 ms ± 48.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+"10.3 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 ]
 }
 ],
@@ -462,21 +509,47 @@
 },
 {
 "cell_type": "code",
-"execution_count": 19,
+"execution_count": 22,
 "id": "7117107f-22a6-46b4-a442-712d50b3ac7a",
 "metadata": {},
 "outputs": [
 {
 "name": "stdout",
 "output_type": "stream",
 "text": [
-"8.36 ms ± 184 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
+"10.2 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
 ]
 }
 ],
 "source": [
 "%timeit hf_tokenizer(raw_text, max_length=5145, truncation=True)[\"input_ids\"]"
 ]
+},
+{
+"cell_type": "markdown",
+"id": "91ac2876-f36e-498c-bd75-8597a39f2d4b",
+"metadata": {},
+"source": [
+"### My own GPT-2 tokenizer (for educational purposes)"
+]
+},
+{
+"cell_type": "code",
+"execution_count": 23,
+"id": "3b4ff4d5-f2d9-4ea6-a51c-023dbba15429",
+"metadata": {},
+"outputs": [
+{
+"name": "stdout",
+"output_type": "stream",
+"text": [
+"1.74 ms ± 48.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
+]
+}
+],
+"source": [
+"%timeit tokenizer_gpt2.encode(raw_text)"
+]
 }
 ],
 "metadata": {
