Fix encoding issue for Russian letters

Lysagxra · web-flow · commit 987cb31cd893 · 2025-08-21T16:23:27.000+02:00
diff --git a/downloader.py b/downloader.py
@@ -52,6 +52,7 @@ async def handle_download_process(
     host_page = get_host_page(url)
     identifier = get_identifier(url, soup=soup)
 
+    # Album download
     if check_url_type(url):
         item_pages = extract_item_pages(soup, host_page)
         album_downloader = AlbumDownloader(
@@ -61,12 +62,13 @@ async def handle_download_process(
         )
         await album_downloader.download_album()
 
+    # Single item download
     else:
         download_link, filename = await get_download_info(url, soup)
         live_manager.add_overall_task(identifier, num_tasks=1)
         task = live_manager.add_task()
 
-        downloader = MediaDownloader(
+        media_downloader = MediaDownloader(
             session_info=session_info,
             download_info=DownloadInfo(
                 download_link=download_link,
@@ -75,7 +77,7 @@ async def handle_download_process(
             ),
             live_manager=live_manager,
         )
-        downloader.download()
+        media_downloader.download()
 
 
 async def validate_and_download(
diff --git a/helpers/config.py b/helpers/config.py
@@ -18,7 +18,7 @@
 
 DOWNLOAD_FOLDER = "Downloads"               # The folder where downloaded files will be
                                             # stored.
-FILE = "URLs.txt"                           # The name of the file containing the list
+URLS_FILE = "URLs.txt"                      # The name of the file containing the list
                                             # of URLs to process.
 SESSION_LOG = "session_log.txt"             # The file used to log errors.
 MIN_DISK_SPACE_GB = 3                       # Minimum free disk space (in GB) required.
@@ -65,8 +65,7 @@
 # Headers used for general HTTP requests.
 HEADERS = {
     "User-Agent": (
-        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) "
-        "Gecko/20100101 Firefox/136.0"
+        "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"
     ),
 }
 
diff --git a/helpers/managers/live_manager.py b/helpers/managers/live_manager.py
@@ -40,13 +40,11 @@ def __init__(
         self.progress_table = self.progress_manager.create_progress_table()
         self.logger = logger
         self.disable_ui = disable_ui
-
         self.live = (
             Live(self._render_live_view(), refresh_per_second=refresh_per_second)
             if not self.disable_ui
             else nullcontext()
         )
-
         self.start_time = time.time()
         self.update_log("Script started", "The script has started execution.")
 
@@ -72,7 +70,6 @@ def update_task(
     def update_log(self, event: str, details: str) -> None:
         """Log an event and refreshes the live display."""
         self.logger.log(event, details, disable_ui=self.disable_ui)
-
         if not self.disable_ui:
             self.live.update(self._render_live_view())
 
diff --git a/helpers/url_utils.py b/helpers/url_utils.py
@@ -6,6 +6,7 @@
 
 from __future__ import annotations
 
+import contextlib
 import html
 import logging
 import re
@@ -122,18 +123,30 @@ def get_media_slug(url: str, soup: BeautifulSoup) -> str:
 def get_album_name(soup: BeautifulSoup) -> str | None:
     """Extract the album name from the HTML of a page.
 
-    If the album name cannot be found, a message is printed, and `None` is returned.
+    Handles potential mojibake issues (UTF-8 decoded as Latin-1).
+    If the album name cannot be found, returns None.
     """
     name_container = soup.find(
         "div",
         {"class": "text-subs font-semibold flex text-base sm:text-lg"},
     )
 
-    if name_container:
-        album_name = name_container.find("h1").get_text(strip=True)
-        return html.unescape(album_name)
+    if not name_container:
+        return None
 
-    return None
+    raw_album_name = name_container.find("h1").get_text(strip=True)
+    unescaped_album_name = html.unescape(raw_album_name)
+
+    # Attempt to fix mojibake (UTF-8 bytes mis-decoded as Latin-1).
+    # Start from the unescaped name so it is returned unchanged whenever the
+    # round-trip fails: a correctly decoded name (e.g. real Cyrillic text)
+    # cannot be encoded as Latin-1 and must be left untouched.
+    fixed_album_name = unescaped_album_name
+
+    with contextlib.suppress(UnicodeEncodeError, UnicodeDecodeError):
+        fixed_album_name = unescaped_album_name.encode("latin1").decode("utf-8")
+
+    return fixed_album_name
 
 
 def get_item_type(item_page: str) -> str | None:
diff --git a/main.py b/main.py
@@ -15,7 +15,7 @@
 
 from downloader import initialize_managers, validate_and_download
 from helpers.bunkr_utils import get_bunkr_status
-from helpers.config import FILE, SESSION_LOG
+from helpers.config import SESSION_LOG, URLS_FILE
 from helpers.file_utils import (
     check_python_version,
     read_file,
@@ -64,11 +64,11 @@ async def main() -> None:
     args = parse_arguments()
 
     # Read and process URLs
-    urls = read_file(FILE)
+    urls = read_file(URLS_FILE)
     await process_urls(urls, disable_ui=args.disable_ui)
 
     # Clear URLs file
-    write_file(FILE)
+    write_file(URLS_FILE)
 
 
 if __name__ == "__main__":