    python profile_crawler.py https://www.erome.com/marieanita

"""
16+ from __future__ import annotations
1617
1718import logging
1819import re
def fetch_profile_page(url: str) -> BeautifulSoup:
    """Download the profile page at *url* and return its parsed HTML.

    Exits the process with status 1 if the HTTP request fails for any
    reason (connection error, timeout, or non-2xx status).
    """
    try:
        response = requests.get(url, timeout=10)
        # Raise for 4xx/5xx responses so they are handled like network errors.
        response.raise_for_status()
    except requests.RequestException as req_err:
        logging.exception(f"Error fetching the page: {req_err}")
        sys.exit(1)

    # Reached only on success: parse the body with the stdlib HTML parser.
    return BeautifulSoup(response.text, "html.parser")
43+
44+
45+ def extract_page_number (page_link : dict [str , str ]) -> int | None :
46+ """Extract the page number from a URL."""
47+ try :
48+ # Extract page number using regex and convert to integer
49+ return int (re .search (r"page=(\d+)" , page_link ["href" ]).group (1 ))
50+
51+ except (AttributeError , ValueError , TypeError ) as err :
52+ message = f"Error extracting page index from { page_link ['href' ]} : { err } "
53+ logging .exception (message )
54+ return None
55+
4256
def get_profile_page_links(
    soup: BeautifulSoup,
    profile: str,
) -> list[str]:
    """Extract and profile page links from a BeautifulSoup object.

    Returns absolute URLs for the profile's pagination links, or an empty
    list if none are found or the soup cannot be processed.
    """
    try:
        # Regular expression to find all 'a' tags with href that match "?page="
        # followed by a number. re.escape keeps regex metacharacters in the
        # profile name (e.g. '.') from being interpreted as patterns.
        page_links = soup.find_all(
            "a",
            {"href": re.compile(f"/{re.escape(profile)}\\?page=\\d+")},
        )
    except (AttributeError, TypeError, KeyError) as err:
        message = f"An error occurred while processing the soup: {err}"
        logging.exception(message)
        return []

    # extract_page_number returns None for unparsable hrefs; drop those so
    # max() below cannot fail comparing None against int.
    page_numbers = [
        number
        for page_link in page_links
        if (number := extract_page_number(page_link)) is not None
    ]
    max_page_number = max(page_numbers) if page_numbers else None

    formatted_page_links = []
    if max_page_number is not None:
        # The last item of the page_links list isn't useful, so it is discarded
        formatted_page_links = [
            HOST_PAGE + page_link["href"] for page_link in page_links[:-1]
        ]

    return formatted_page_links
86+
8487
8588def extract_album_links_in_page (soup : BeautifulSoup ) -> list [str ]:
8689 """Extract album links from a BeautifulSoup object representing a webpage."""
@@ -125,19 +128,15 @@ def process_profile_url(url: str) -> None:
125128 generate_profile_dump (profile_album_links )
126129
127130 except ValueError as val_err :
128- message = f"Value error : { val_err } "
131+ message = f"Error occurred processing profile URL : { val_err } "
129132 logging .exception (message )
130133
131- finally :
132- console .print ("[green]✓[/green] Dump file successfully generated." )
134+ else :
135+ console .print ("[green]✓[/green] Dump file successfully generated.\n " )
133136
134137
def main() -> None:
    """Execute the profile album extraction process."""
    # Guard against a missing CLI argument; without this, sys.argv[1]
    # raises an uncaught IndexError when the script is run with no URL.
    if len(sys.argv) != 2:
        logging.error("Usage: python profile_crawler.py <profile_page_url>")
        sys.exit(1)

    url = sys.argv[1]
    process_profile_url(url)
143142
0 commit comments