Skip to content

Commit fe17f21

Browse files
committed
Switch from urllib to requests to improve reliability
1 parent 8552565 commit fe17f21

File tree

39 files changed

+492
-350
lines changed

39 files changed

+492
-350
lines changed

appendix-D/01_main-chapter-code/appendix-D.ipynb

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,19 +121,40 @@
121121
"outputs": [],
122122
"source": [
123123
"import os\n",
124-
"import urllib.request\n",
124+
"import requests\n",
125125
"\n",
126126
"file_path = \"the-verdict.txt\"\n",
127127
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
128128
"\n",
129129
"if not os.path.exists(file_path):\n",
130+
" response = requests.get(url, timeout=30)\n",
131+
" response.raise_for_status()\n",
132+
" text_data = response.text\n",
133+
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
134+
" file.write(text_data)\n",
135+
"else:\n",
136+
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
137+
" text_data = file.read()\n",
138+
"\n",
139+
"# The book originally used the code below.\n",
140+
"# However, urllib uses older protocol settings that\n",
141+
"# can cause problems for some readers using a VPN.\n",
142+
"# The `requests` version above is more robust\n",
143+
"# in that regard.\n",
144+
"\n",
145+
"\"\"\"\n",
146+
"import os\n",
147+
"import urllib.request\n",
148+
"\n",
149+
"if not os.path.exists(file_path):\n",
130150
" with urllib.request.urlopen(url) as response:\n",
131151
" text_data = response.read().decode('utf-8')\n",
132152
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
133153
" file.write(text_data)\n",
134154
"else:\n",
135155
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
136-
" text_data = file.read()"
156+
" text_data = file.read()\n",
157+
"\"\"\""
137158
]
138159
},
139160
{

appendix-E/01_main-chapter-code/appendix-E.ipynb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,8 @@
190190
}
191191
],
192192
"source": [
193-
"import urllib\n",
193+
"# import urllib\n",
194+
"import requests\n",
194195
"from pathlib import Path\n",
195196
"import pandas as pd\n",
196197
"from previous_chapters import (\n",
@@ -215,13 +216,20 @@
215216
"extracted_path = \"sms_spam_collection\"\n",
216217
"data_file_path = Path(extracted_path) / \"SMSSpamCollection.tsv\"\n",
217218
"\n",
219+
"\n",
218220
"try:\n",
219221
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
220-
"except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
222+
"except (requests.exceptions.RequestException, TimeoutError) as e:\n",
221223
" print(f\"Primary URL failed: {e}. Trying backup URL...\")\n",
222224
" url = \"https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip\"\n",
223225
" download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)\n",
224226
"\n",
227+
"# The book originally used\n",
228+
"# except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:\n",
229+
"# in the code above.\n",
230+
"# However, some VPN users reported issues with `urllib`, so the code was updated\n",
231+
"# to use `requests` instead\n",
232+
"\n",
225233
"df = pd.read_csv(data_file_path, sep=\"\\t\", header=None, names=[\"Label\", \"Text\"])\n",
226234
"balanced_df = create_balanced_dataset(df)\n",
227235
"balanced_df[\"Label\"] = balanced_df[\"Label\"].map({\"ham\": 0, \"spam\": 1})\n",

appendix-E/01_main-chapter-code/previous_chapters.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99

1010
import os
1111
from pathlib import Path
12-
import urllib
1312
import zipfile
1413

1514
import matplotlib.pyplot as plt
1615
import numpy as np
1716
import pandas as pd
17+
import requests
1818
import tiktoken
1919
import torch
2020
import torch.nn as nn
@@ -367,9 +367,12 @@ def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
367367
return
368368

369369
# Downloading the file
370-
with urllib.request.urlopen(url) as response:
371-
with open(zip_path, "wb") as out_file:
372-
out_file.write(response.read())
370+
response = requests.get(url, stream=True, timeout=60)
371+
response.raise_for_status()
372+
with open(zip_path, "wb") as out_file:
373+
for chunk in response.iter_content(chunk_size=8192):
374+
if chunk:
375+
out_file.write(chunk)
373376

374377
# Unzipping the file
375378
with zipfile.ZipFile(zip_path, "r") as zip_ref:

ch02/01_main-chapter-code/ch02.ipynb

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,30 @@
163163
"metadata": {},
164164
"outputs": [],
165165
"source": [
166+
"import os\n",
167+
"import requests\n",
168+
"\n",
169+
"if not os.path.exists(\"the-verdict.txt\"):\n",
170+
" url = (\n",
171+
" \"https://raw.githubusercontent.com/rasbt/\"\n",
172+
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
173+
" \"the-verdict.txt\"\n",
174+
" )\n",
175+
" file_path = \"the-verdict.txt\"\n",
176+
"\n",
177+
" response = requests.get(url, timeout=30)\n",
178+
" response.raise_for_status()\n",
179+
" with open(file_path, \"wb\") as f:\n",
180+
" f.write(response.content)\n",
181+
"\n",
182+
"\n",
183+
"# The book originally used the code below.\n",
184+
"# However, urllib uses older protocol settings that\n",
185+
"# can cause problems for some readers using a VPN.\n",
186+
"# The `requests` version above is more robust\n",
187+
"# in that regard.\n",
188+
"\n",
189+
"\"\"\"\n",
166190
"import os\n",
167191
"import urllib.request\n",
168192
"\n",
@@ -171,7 +195,8 @@
171195
" \"LLMs-from-scratch/main/ch02/01_main-chapter-code/\"\n",
172196
" \"the-verdict.txt\")\n",
173197
" file_path = \"the-verdict.txt\"\n",
174-
" urllib.request.urlretrieve(url, file_path)"
198+
" urllib.request.urlretrieve(url, file_path)\n",
199+
"\"\"\""
175200
]
176201
},
177202
{

ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,7 @@
823823
],
824824
"source": [
825825
"import os\n",
826-
"import urllib.request\n",
826+
"import requests\n",
827827
"\n",
828828
"def download_file_if_absent(url, filename, search_dirs):\n",
829829
" for directory in search_dirs:\n",
@@ -834,13 +834,19 @@
834834
"\n",
835835
" target_path = os.path.join(search_dirs[0], filename)\n",
836836
" try:\n",
837-
" with urllib.request.urlopen(url) as response, open(target_path, \"wb\") as out_file:\n",
838-
" out_file.write(response.read())\n",
837+
" response = requests.get(url, stream=True, timeout=60)\n",
838+
" response.raise_for_status()\n",
839+
" with open(target_path, \"wb\") as out_file:\n",
840+
" for chunk in response.iter_content(chunk_size=8192):\n",
841+
" if chunk:\n",
842+
" out_file.write(chunk)\n",
839843
" print(f\"Downloaded {filename} to {target_path}\")\n",
840844
" except Exception as e:\n",
841845
" print(f\"Failed to download {filename}. Error: {e}\")\n",
846+
"\n",
842847
" return target_path\n",
843848
"\n",
849+
"\n",
844850
"verdict_path = download_file_if_absent(\n",
845851
" url=(\n",
846852
" \"https://raw.githubusercontent.com/rasbt/\"\n",

ch05/01_main-chapter-code/ch05.ipynb

Lines changed: 28 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -793,19 +793,43 @@
793793
"outputs": [],
794794
"source": [
795795
"import os\n",
796-
"import urllib.request\n",
796+
"import requests\n",
797797
"\n",
798798
"file_path = \"the-verdict.txt\"\n",
799799
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
800800
"\n",
801801
"if not os.path.exists(file_path):\n",
802-
" with urllib.request.urlopen(url) as response:\n",
803-
" text_data = response.read().decode('utf-8')\n",
802+
" response = requests.get(url, timeout=30)\n",
803+
" response.raise_for_status()\n",
804+
" text_data = response.text\n",
804805
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
805806
" file.write(text_data)\n",
806807
"else:\n",
807808
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
808-
" text_data = file.read()"
809+
" text_data = file.read()\n",
810+
"\n",
811+
"\n",
812+
"# The book originally used the code below.\n",
813+
"# However, urllib uses older protocol settings that\n",
814+
"# can cause problems for some readers using a VPN.\n",
815+
"# The `requests` version above is more robust\n",
816+
"# in that regard.\n",
817+
"\n",
818+
" \n",
819+
"# import os\n",
820+
"# import urllib.request\n",
821+
"\n",
822+
"# file_path = \"the-verdict.txt\"\n",
823+
"# url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
824+
"\n",
825+
"# if not os.path.exists(file_path):\n",
826+
"# with urllib.request.urlopen(url) as response:\n",
827+
"# text_data = response.read().decode('utf-8')\n",
828+
"# with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
829+
"# file.write(text_data)\n",
830+
"# else:\n",
831+
"# with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
832+
"# text_data = file.read()"
809833
]
810834
},
811835
{

ch05/01_main-chapter-code/exercise-solutions.ipynb

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -491,21 +491,41 @@
491491
"outputs": [],
492492
"source": [
493493
"import os\n",
494-
"import urllib.request\n",
494+
"import requests\n",
495495
"from previous_chapters import create_dataloader_v1\n",
496496
"\n",
497497
"\n",
498498
"file_path = \"the-verdict.txt\"\n",
499499
"url = \"https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt\"\n",
500500
"\n",
501501
"if not os.path.exists(file_path):\n",
502+
" response = requests.get(url, timeout=30)\n",
503+
" response.raise_for_status()\n",
504+
" text_data = response.text\n",
505+
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
506+
" file.write(text_data)\n",
507+
"else:\n",
508+
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
509+
" text_data = file.read()\n",
510+
"\n",
511+
"# The book originally used the code below.\n",
512+
"# However, urllib uses older protocol settings that\n",
513+
"# can cause problems for some readers using a VPN.\n",
514+
"# The `requests` version above is more robust\n",
515+
"# in that regard.\n",
516+
"\n",
517+
"\"\"\"\n",
518+
"import urllib.request\n",
519+
"\n",
520+
"if not os.path.exists(file_path):\n",
502521
" with urllib.request.urlopen(url) as response:\n",
503522
" text_data = response.read().decode('utf-8')\n",
504523
" with open(file_path, \"w\", encoding=\"utf-8\") as file:\n",
505524
" file.write(text_data)\n",
506525
"else:\n",
507526
" with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
508527
" text_data = file.read()\n",
528+
"\"\"\"\n",
509529
"\n",
510530
"\n",
511531
"# Train/validation ratio\n",

ch05/01_main-chapter-code/gpt_download.py

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@
55

66

77
import os
8-
import urllib.request
98

10-
# import requests
9+
import requests
1110
import json
1211
import numpy as np
1312
import tensorflow as tf
@@ -48,44 +47,40 @@ def download_and_load_gpt2(model_size, models_dir):
4847

4948
def download_file(url, destination, backup_url=None):
5049
def _attempt_download(download_url):
51-
with urllib.request.urlopen(download_url) as response:
52-
# Get the total file size from headers, defaulting to 0 if not present
53-
file_size = int(response.headers.get("Content-Length", 0))
54-
55-
# Check if file exists and has the same size
56-
if os.path.exists(destination):
57-
file_size_local = os.path.getsize(destination)
58-
if file_size == file_size_local:
59-
print(f"File already exists and is up-to-date: {destination}")
60-
return True # Indicate success without re-downloading
61-
62-
block_size = 1024 # 1 Kilobyte
63-
64-
# Initialize the progress bar with total file size
65-
progress_bar_description = os.path.basename(download_url)
66-
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
67-
with open(destination, "wb") as file:
68-
while True:
69-
chunk = response.read(block_size)
70-
if not chunk:
71-
break
50+
response = requests.get(download_url, stream=True, timeout=60)
51+
response.raise_for_status()
52+
53+
file_size = int(response.headers.get("Content-Length", 0))
54+
55+
# Check if file exists and has same size
56+
if os.path.exists(destination):
57+
file_size_local = os.path.getsize(destination)
58+
if file_size and file_size == file_size_local:
59+
print(f"File already exists and is up-to-date: {destination}")
60+
return True
61+
62+
block_size = 1024 # 1 KB
63+
desc = os.path.basename(download_url)
64+
with tqdm(total=file_size, unit="iB", unit_scale=True, desc=desc) as progress_bar:
65+
with open(destination, "wb") as file:
66+
for chunk in response.iter_content(chunk_size=block_size):
67+
if chunk:
7268
file.write(chunk)
7369
progress_bar.update(len(chunk))
74-
return True
70+
return True
7571

7672
try:
7773
if _attempt_download(url):
7874
return
79-
except (urllib.error.HTTPError, urllib.error.URLError):
75+
except requests.exceptions.RequestException:
8076
if backup_url is not None:
8177
print(f"Primary URL ({url}) failed. Attempting backup URL: {backup_url}")
8278
try:
8379
if _attempt_download(backup_url):
8480
return
85-
except urllib.error.HTTPError:
81+
except requests.exceptions.RequestException:
8682
pass
8783

88-
# If we reach here, both attempts have failed
8984
error_message = (
9085
f"Failed to download from both primary URL ({url})"
9186
f"{' and backup URL (' + backup_url + ')' if backup_url else ''}."

0 commit comments

Comments
 (0)