Skip to content

Commit 280667c

Browse files
authored
Add support for 404/410 status codes
- Handled 404/410 status codes to skip removed or inaccessible URLs - Improved readability and code clarity - Added descriptive comments - Renamed constants and refactored the code structure
1 parent 45d46b0 commit 280667c

File tree

6 files changed

+73
-56
lines changed

6 files changed

+73
-56
lines changed

album_downloader.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
The script validates the provided album URL, collects links to the media files, and
44
downloads them to a specified local directory.
55
"""
6+
67
from __future__ import annotations
78

89
import argparse
@@ -24,9 +25,12 @@
2425
from requests.models import Response
2526

2627

27-
def extract_download_links(album_url: str) -> list[str]:
28+
def extract_download_links(album_url: str) -> list[str] | None:
2829
"""Extract download links for video and image sources from the album URL."""
2930
soup = fetch_page(album_url)
31+
if soup is None:
32+
return None
33+
3034
videos = [
3135
video_source["src"] for video_source in soup.find_all("source")
3236
]
@@ -42,12 +46,14 @@ def download_album(
4246
profile: str | None = None,
4347
) -> None:
4448
"""Download an album from the given URL."""
45-
download_links = extract_download_links(album_url)
46-
47-
album_id = album_url.split("/")[-1]
49+
album_id = album_url.rstrip("/").split("/")[-1]
4850
album_path = album_id if not profile else Path(profile) / album_id
4951
download_path = create_download_directory(album_path)
5052

53+
download_links = extract_download_links(album_url)
54+
if download_links is None:
55+
return
56+
5157
run_in_parallel(
5258
download, download_links, live_manager, album_id, download_path, album_url,
5359
)
@@ -119,14 +125,13 @@ def setup_parser() -> ArgumentParser:
119125
metavar="album_url",
120126
help="Album URL to process",
121127
)
122-
return parser
128+
return parser.parse_args()
123129

124130

125131
def main() -> None:
126132
"""Initiate the download process."""
127133
clear_terminal()
128-
parser = setup_parser()
129-
args = parser.parse_args()
134+
args = setup_parser()
130135

131136
live_manager = initialize_managers()
132137
validated_url = validate_url(args.url)

helpers/config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,10 @@
4242

4343
# Default chunk size for files larger than the largest threshold.
4444
LARGE_FILE_CHUNK_SIZE = 64 * KB
45+
46+
# ============================
47+
# HTTP / Network
48+
# ============================
49+
# HTTP status codes
50+
HTTP_STATUS_NOT_FOUND = 404
51+
HTTP_STATUS_GONE = 410

helpers/general_utils.py

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
across projects.
66
"""
77

8+
from __future__ import annotations
9+
810
import logging
911
import os
1012
import sys
@@ -13,36 +15,45 @@
1315
import requests
1416
from bs4 import BeautifulSoup
1517

16-
from .config import DOWNLOAD_FOLDER
18+
from .config import DOWNLOAD_FOLDER, HTTP_STATUS_GONE, HTTP_STATUS_NOT_FOUND
1719

1820

19-
def fetch_page(url: str, timeout: int = 10) -> BeautifulSoup:
21+
def fetch_page(url: str, timeout: int = 10) -> BeautifulSoup | None:
2022
"""Fetch the HTML content of a webpage."""
2123
# Create a new session per worker
2224
session = requests.Session()
2325

2426
try:
2527
response = session.get(url, timeout=timeout)
28+
if response.status_code in (HTTP_STATUS_NOT_FOUND, HTTP_STATUS_GONE):
29+
log_message = f"Page not found or permanently removed: {url}"
30+
logging.warning(log_message)
31+
return None
32+
2633
response.raise_for_status()
27-
return BeautifulSoup(response.text, "html.parser")
2834

2935
except requests.RequestException as req_err:
3036
message = f"Error fetching page {url}: {req_err}"
3137
logging.exception(message)
3238
sys.exit(1)
3339

40+
return BeautifulSoup(response.text, "html.parser")
41+
42+
3443
def create_download_directory(directory_path: str) -> str:
3544
"""Construct a download path for the given title."""
3645
download_path = Path(DOWNLOAD_FOLDER) / Path(directory_path)
3746

3847
try:
3948
Path(download_path).mkdir(parents=True, exist_ok=True)
40-
return download_path
4149

4250
except OSError:
4351
logging.exception("Error creating directory")
4452
sys.exit(1)
4553

54+
return download_path
55+
56+
4657
def clear_terminal() -> None:
4758
"""Clear the terminal screen based on the operating system."""
4859
commands = {

helpers/managers/live_manager.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ def __init__(
3838
self.live = Live(
3939
self._render_live_view(), refresh_per_second=refresh_per_second,
4040
)
41-
4241
self.start_time = time.time()
4342
self.update_log("Script started", "The script has started execution.")
4443

helpers/profile_crawler.py

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
python profile_crawler.py https://www.erome.com/marieanita
1414
1515
"""
16+
from __future__ import annotations
1617

1718
import logging
1819
import re
@@ -32,55 +33,57 @@ def fetch_profile_page(url: str) -> BeautifulSoup:
3233
try:
3334
response = requests.get(url, timeout=10)
3435
response.raise_for_status()
35-
return BeautifulSoup(response.text, "html.parser")
3636

3737
except requests.RequestException as req_err:
3838
message = f"Error fetching the page: {req_err}"
3939
logging.exception(message)
4040
sys.exit(1)
4141

42+
return BeautifulSoup(response.text, "html.parser")
43+
44+
45+
def extract_page_number(page_link: dict[str, str]) -> int | None:
46+
"""Extract the page number from a URL."""
47+
try:
48+
# Extract page number using regex and convert to integer
49+
return int(re.search(r"page=(\d+)", page_link["href"]).group(1))
50+
51+
except (AttributeError, ValueError, TypeError) as err:
52+
message = f"Error extracting page index from {page_link['href']}: {err}"
53+
logging.exception(message)
54+
return None
55+
4256

4357
def get_profile_page_links(
4458
soup: BeautifulSoup,
4559
profile: str,
46-
next_page_tag: str = "?page=",
4760
) -> list[str]:
4861
"""Extract and profile page links from a BeautifulSoup object."""
4962
try:
5063
# Regular expression to find all 'a' tags with href that match "?page="
5164
# followed by a number
5265
page_links = soup.find_all(
5366
"a",
54-
{"href": re.compile(f"/{profile}\\{next_page_tag}\\d+")},
67+
{"href": re.compile(f"/{profile}\\?page=\\d+")},
5568
)
5669

57-
page_numbers = []
58-
for page_link in page_links:
59-
try:
60-
# Extract page number using regex and convert to integer
61-
page_number = int(re.search(r"page=(\d+)", page_link["href"]).group(1))
62-
page_numbers.append(page_number)
63-
64-
except (AttributeError, ValueError, TypeError) as err:
65-
message = f"Error extracting page index from {page_link['href']}: {err}"
66-
logging.exception(message)
67-
68-
max_page_number = max(page_numbers) if page_numbers else None
69-
70-
formatted_page_links = []
71-
if max_page_number is not None:
72-
# The last item of the page_links list isn't useful, so it is discarded
73-
formatted_page_links = [
74-
HOST_PAGE + page_link["href"] for page_link in page_links[:-1]
75-
]
76-
77-
return formatted_page_links
78-
7970
except (AttributeError, TypeError, KeyError) as err:
8071
message = f"An error occurred while processing the soup: {err}"
8172
logging.exception(message)
8273
return []
8374

75+
page_numbers = [extract_page_number(page_link) for page_link in page_links]
76+
max_page_number = max(page_numbers) if page_numbers else None
77+
78+
formatted_page_links = []
79+
if max_page_number is not None:
80+
# The last item of the page_links list isn't useful, so it is discarded
81+
formatted_page_links = [
82+
HOST_PAGE + page_link["href"] for page_link in page_links[:-1]
83+
]
84+
85+
return formatted_page_links
86+
8487

8588
def extract_album_links_in_page(soup: BeautifulSoup) -> list[str]:
8689
"""Extract album links from a BeautifulSoup object representing a webpage."""
@@ -125,19 +128,15 @@ def process_profile_url(url: str) -> None:
125128
generate_profile_dump(profile_album_links)
126129

127130
except ValueError as val_err:
128-
message = f"Value error: {val_err}"
131+
message = f"Error occurred processing profile URL: {val_err}"
129132
logging.exception(message)
130133

131-
finally:
132-
console.print("[green]✓[/green] Dump file successfully generated.")
134+
else:
135+
console.print("[green]✓[/green] Dump file successfully generated.\n")
133136

134137

135138
def main() -> None:
136139
"""Execute the profile album extraction process."""
137-
if len(sys.argv) != 2:
138-
logging.error("Usage: python profile_crawler.py <profile_page_url>")
139-
sys.exit(1)
140-
141140
url = sys.argv[1]
142141
process_profile_url(url)
143142

main.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,7 @@
1919
setup_parser,
2020
validate_url,
2121
)
22-
from helpers.config import (
23-
DUMP_FILE,
24-
)
25-
from helpers.config import (
26-
FILE as DEFAULT_FILE,
27-
)
22+
from helpers.config import DUMP_FILE, URLS_FILE
2823
from helpers.file_utils import read_file, write_file
2924
from helpers.general_utils import clear_terminal
3025
from helpers.profile_crawler import process_profile_url
@@ -33,6 +28,7 @@
3328
def process_urls(urls: list[str], profile_name: str) -> None:
3429
"""Validate and processes a list of URLs to download items."""
3530
live_manager = initialize_managers()
31+
3632
with live_manager.live:
3733
for url in urls:
3834
validated_url = validate_url(url)
@@ -52,19 +48,19 @@ def handle_profile_processing(profile_url: str) -> str | None:
5248

5349
def main() -> None:
5450
"""Run the script."""
51+
# Clear the terminal and profile dump file
5552
clear_terminal()
5653
write_file(DUMP_FILE)
5754

58-
parser = setup_parser()
59-
args = parser.parse_args()
60-
61-
file_to_read = DUMP_FILE if args.profile else DEFAULT_FILE
55+
# Parse arguments, determine which file to read, and handle profile processing
56+
args = setup_parser()
57+
file_to_read = DUMP_FILE if args.profile else URLS_FILE
6258
profile_name = handle_profile_processing(args.profile)
6359

60+
# Read the content from the determined file, processes the URLs, and clear the files
6461
urls = read_file(file_to_read)
6562
process_urls(urls, profile_name)
66-
67-
write_file(DEFAULT_FILE)
63+
write_file(URLS_FILE)
6864

6965

7066
if __name__ == "__main__":

0 commit comments

Comments
 (0)