rasbt
diff --git a/‎appendix-D/01_main-chapter-code/appendix-D.ipynb‎
Lines changed: 23 additions & 2 deletions b/‎appendix-D/01_main-chapter-code/appendix-D.ipynb‎
Lines changed: 23 additions & 2 deletions
diff --git a/‎appendix-E/01_main-chapter-code/appendix-E.ipynb‎
Lines changed: 10 additions & 2 deletions b/‎appendix-E/01_main-chapter-code/appendix-E.ipynb‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎appendix-E/01_main-chapter-code/previous_chapters.py‎
Lines changed: 7 additions & 4 deletions b/‎appendix-E/01_main-chapter-code/previous_chapters.py‎
Lines changed: 7 additions & 4 deletions
diff --git a/‎ch02/01_main-chapter-code/ch02.ipynb‎
Lines changed: 26 additions & 1 deletion b/‎ch02/01_main-chapter-code/ch02.ipynb‎
Lines changed: 26 additions & 1 deletion
diff --git a/‎ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb‎
Lines changed: 9 additions & 3 deletions b/‎ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎ch05/01_main-chapter-code/ch05.ipynb‎
Lines changed: 28 additions & 4 deletions b/‎ch05/01_main-chapter-code/ch05.ipynb‎
Lines changed: 28 additions & 4 deletions
diff --git a/‎ch05/01_main-chapter-code/exercise-solutions.ipynb‎
Lines changed: 21 additions & 1 deletion b/‎ch05/01_main-chapter-code/exercise-solutions.ipynb‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎ch05/01_main-chapter-code/gpt_download.py‎
Lines changed: 22 additions & 27 deletions b/‎ch05/01_main-chapter-code/gpt_download.py‎
Lines changed: 22 additions & 27 deletions
@@ -121,19 +121,40 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "file_path = \"the-verdict.txt\"\n",
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
+    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "        file.write(text_data)\n",
+    "else:\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "        text_data = file.read()\n",
+    "\n",
+    "# The book originally used the code shown below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
+    "import os\n",
+    "import urllib.request\n",
+    "\n",
+    "if not os.path.exists(file_path):\n",
     "    with urllib.request.urlopen(url) as response:\n",
     "        text_data = response.read().decode('utf-8')\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
     "        file.write(text_data)\n",
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "        text_data = file.read()"
+    "        text_data = file.read()\n",
+    "\"\"\""
    ]
   },
   {
 
@@ -190,7 +190,8 @@
     }
    ],
    "source": [
-    "import urllib\n",
+    "# import urllib\n",
+    "import requests\n",
     "from pathlib import Path\n",
     "import pandas as pd\n",
     "from previous_chapters import (\n",
@@ -215,13 +216,20 @@
     "extracted_path = \"sms_spam_collection\"\n",
     "data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
     "\n",
+    "\n",
     "try:\n",
     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
-    "except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "except (requests.exceptions.RequestException, TimeoutError) as e:\n",
     "    print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
     "    url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
     "    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
     "\n",
+    "# The book originally used\n",
+    "# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
+    "# in the code above.\n",
+    "# However, some VPN users reported issues with `urllib`, so the code was updated\n",
+    "# to use `requests` instead.\n",
+    "\n",
     "df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
     "balanced_df = create_balanced_dataset(df)\n",
     "balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",
 
@@ -9,12 +9,12 @@
 
 import os
 from pathlib import Path
-import urllib
 import zipfile
 
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
+import requests
 import tiktoken
 import torch
 import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
         return
 
     # Downloading the file
-    with urllib.request.urlopen(url) as response:
-        with open(zip_path, "wb") as out_file:
-            out_file.write(response.read())
+    response = requests.get(url, stream=True, timeout=60)
+    response.raise_for_status()
+    with open(zip_path, "wb") as out_file:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                out_file.write(chunk)
 
     # Unzipping the file
     with zipfile.ZipFile(zip_path, "r") as zip_ref:
 
@@ -163,6 +163,30 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import os\n",
+    "import requests\n",
+    "\n",
+    "if not os.path.exists(\"the-verdict.txt\"):\n",
+    "    url = (\n",
+    "        \"https://raw.githubusercontent.com/rasbt/\"\n",
+    "        \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
+    "        \"the-verdict.txt\"\n",
+    "    )\n",
+    "    file_path = \"the-verdict.txt\"\n",
+    "\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    with open(file_path, \"wb\") as f:\n",
+    "        f.write(response.content)\n",
+    "\n",
+    "\n",
+    "# The book originally used the code shown below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
     "import os\n",
     "import urllib.request\n",
     "\n",
@@ -171,7 +195,8 @@
     "           \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
     "           \"the-verdict.txt\")\n",
     "    file_path = \"the-verdict.txt\"\n",
-    "    urllib.request.urlretrieve(url, file_path)"
+    "    urllib.request.urlretrieve(url, file_path)\n",
+    "\"\"\""
    ]
   },
   {
 
@@ -823,7 +823,7 @@
    ],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "def download_file_if_absent(url, filename, search_dirs):\n",
     "    for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
     "\n",
     "    target_path = os.path.join(search_dirs[0], filename)\n",
     "    try:\n",
-    "        with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
-    "            out_file.write(response.read())\n",
+    "        response = requests.get(url, stream=True, timeout=60)\n",
+    "        response.raise_for_status()\n",
+    "        with open(target_path, \"wb\") as out_file:\n",
+    "            for chunk in response.iter_content(chunk_size=8192):\n",
+    "                if chunk:\n",
+    "                    out_file.write(chunk)\n",
     "        print(f\"Downloaded {filename} to {target_path}\")\n",
     "    except Exception as e:\n",
     "        print(f\"Failed to download {filename}. Error: {e}\")\n",
+    "\n",
     "    return target_path\n",
     "\n",
+    "\n",
     "verdict_path = download_file_if_absent(\n",
     "    url=(\n",
     "         \"https://raw.githubusercontent.com/rasbt/\"\n",
 
@@ -793,19 +793,43 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "\n",
     "file_path = \"the-verdict.txt\"\n",
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
-    "    with urllib.request.urlopen(url) as response:\n",
-    "        text_data = response.read().decode('utf-8')\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
     "        file.write(text_data)\n",
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "        text_data = file.read()"
+    "        text_data = file.read()\n",
+    "\n",
+    "\n",
+    "# The book originally used the code shown below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "        \n",
+    "# import os\n",
+    "# import urllib.request\n",
+    "\n",
+    "# file_path = \"the-verdict.txt\"\n",
+    "# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
+    "\n",
+    "# if not os.path.exists(file_path):\n",
+    "#     with urllib.request.urlopen(url) as response:\n",
+    "#         text_data = response.read().decode('utf-8')\n",
+    "#     with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "#         file.write(text_data)\n",
+    "# else:\n",
+    "#     with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "#         text_data = file.read()"
    ]
   },
   {
 
@@ -491,21 +491,41 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "import urllib.request\n",
+    "import requests\n",
     "from previous_chapters import create_dataloader_v1\n",
     "\n",
     "\n",
     "file_path = \"the-verdict.txt\"\n",
     "url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
     "\n",
     "if not os.path.exists(file_path):\n",
+    "    response = requests.get(url, timeout=30)\n",
+    "    response.raise_for_status()\n",
+    "    text_data = response.text\n",
+    "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
+    "        file.write(text_data)\n",
+    "else:\n",
+    "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
+    "        text_data = file.read()\n",
+    "\n",
+    "# The book originally used the code shown below.\n",
+    "# However, urllib uses older protocol settings that\n",
+    "# can cause problems for some readers using a VPN.\n",
+    "# The `requests` version above is more robust\n",
+    "# in that regard.\n",
+    "\n",
+    "\"\"\"\n",
+    "import urllib.request\n",
+    "\n",
+    "if not os.path.exists(file_path):\n",
     "    with urllib.request.urlopen(url) as response:\n",
     "        text_data = response.read().decode('utf-8')\n",
     "    with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
     "        file.write(text_data)\n",
     "else:\n",
     "    with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
     "        text_data = file.read()\n",
+    "\"\"\"\n",
     "\n",
     "\n",
     "# Train/validation ratio\n",
 
@@ -5,9 +5,8 @@
 
 
 import os
-import urllib.request
 
-# import requests
+import requests
 import json
 import numpy as np
 import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
 
 def download_file(url, destination, backup_url=None):
     def _attempt_download(download_url):
-        with urllib.request.urlopen(download_url) as response:
-            # Get the total file size from headers, defaulting to 0 if not present
-            file_size = int(response.headers.get("Content-Length", 0))
-
-            # Check if file exists and has the same size
-            if os.path.exists(destination):
-                file_size_local = os.path.getsize(destination)
-                if file_size == file_size_local:
-                    print(f"File already exists and is up-to-date: {destination}")
-                    return True  # Indicate success without re-downloading
-
-            block_size = 1024  # 1 Kilobyte
-
-            # Initialize the progress bar with total file size
-            progress_bar_description = os.path.basename(download_url)
-            with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
-                with open(destination, "wb") as file:
-                    while True:
-                        chunk = response.read(block_size)
-                        if not chunk:
-                            break
+        response = requests.get(download_url, stream=True, timeout=60)
+        response.raise_for_status()
+
+        file_size = int(response.headers.get("Content-Length", 0))
+
+        # Check if file exists and has same size
+        if os.path.exists(destination):
+            file_size_local = os.path.getsize(destination)
+            if file_size and file_size == file_size_local:
+                print(f"File already exists and is up-to-date: {destination}")
+                return True
+
+        block_size = 1024  # 1 KB
+        desc = os.path.basename(download_url)
+        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
+            with open(destination, "wb") as file:
+                for chunk in response.iter_content(chunk_size=block_size):
+                    if chunk:
                         file.write(chunk)
                         progress_bar.update(len(chunk))
-            return True
+        return True
 
     try:
         if _attempt_download(url):
             return
-    except (urllib.error.HTTPError, urllib.error.URLError):
+    except requests.exceptions.RequestException:
         if backup_url is not None:
             print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
             try:
                 if _attempt_download(backup_url):
                     return
-            except urllib.error.HTTPError:
+            except requests.exceptions.RequestException:
                 pass
 
-        # If we reach here, both attempts have failed
         error_message = (
             f"Failed to download from both primary URL ({url})"
             f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."