Skip to content

Commit 987cb31

Browse files
authored
Fix encoding issue for Russian letters
1 parent 2f270a6 commit 987cb31

File tree

5 files changed

+27
-16
lines changed

5 files changed

+27
-16
lines changed

downloader.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ async def handle_download_process(
5252
host_page = get_host_page(url)
5353
identifier = get_identifier(url, soup=soup)
5454

55+
# Album download
5556
if check_url_type(url):
5657
item_pages = extract_item_pages(soup, host_page)
5758
album_downloader = AlbumDownloader(
@@ -61,12 +62,13 @@ async def handle_download_process(
6162
)
6263
await album_downloader.download_album()
6364

65+
# Single item download
6466
else:
6567
download_link, filename = await get_download_info(url, soup)
6668
live_manager.add_overall_task(identifier, num_tasks=1)
6769
task = live_manager.add_task()
6870

69-
downloader = MediaDownloader(
71+
media_downloader = MediaDownloader(
7072
session_info=session_info,
7173
download_info=DownloadInfo(
7274
download_link=download_link,
@@ -75,7 +77,7 @@ async def handle_download_process(
7577
),
7678
live_manager=live_manager,
7779
)
78-
downloader.download()
80+
media_downloader.download()
7981

8082

8183
async def validate_and_download(

helpers/config.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
DOWNLOAD_FOLDER = "Downloads" # The folder where downloaded files will be
2020
# stored.
21-
FILE = "URLs.txt" # The name of the file containing the list
21+
URLS_FILE = "URLs.txt" # The name of the file containing the list
2222
# of URLs to process.
2323
SESSION_LOG = "session_log.txt" # The file used to log errors.
2424
MIN_DISK_SPACE_GB = 3 # Minimum free disk space (in GB) required.
@@ -65,8 +65,7 @@
6565
# Headers used for general HTTP requests.
6666
HEADERS = {
6767
"User-Agent": (
68-
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) "
69-
"Gecko/20100101 Firefox/136.0"
68+
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"
7069
),
7170
}
7271

helpers/managers/live_manager.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,11 @@ def __init__(
4040
self.progress_table = self.progress_manager.create_progress_table()
4141
self.logger = logger
4242
self.disable_ui = disable_ui
43-
4443
self.live = (
4544
Live(self._render_live_view(), refresh_per_second=refresh_per_second)
4645
if not self.disable_ui
4746
else nullcontext()
4847
)
49-
5048
self.start_time = time.time()
5149
self.update_log("Script started", "The script has started execution.")
5250

@@ -72,7 +70,6 @@ def update_task(
7270
def update_log(self, event: str, details: str) -> None:
7371
"""Log an event and refresh the live display."""
7472
self.logger.log(event, details, disable_ui=self.disable_ui)
75-
7673
if not self.disable_ui:
7774
self.live.update(self._render_live_view())
7875

helpers/url_utils.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
from __future__ import annotations
88

9+
import contextlib
910
import html
1011
import logging
1112
import re
@@ -122,18 +123,30 @@ def get_media_slug(url: str, soup: BeautifulSoup) -> str:
122123
def get_album_name(soup: BeautifulSoup) -> str | None:
123124
"""Extract the album name from the HTML of a page.
124125
125-
If the album name cannot be found, a message is printed, and `None` is returned.
126+
Handles potential mojibake issues (UTF-8 decoded as Latin-1).
127+
If the album name cannot be found, returns None.
126128
"""
127129
name_container = soup.find(
128130
"div",
129131
{"class": "text-subs font-semibold flex text-base sm:text-lg"},
130132
)
131133

132-
if name_container:
133-
album_name = name_container.find("h1").get_text(strip=True)
134-
return html.unescape(album_name)
134+
if not name_container:
135+
return None
135136

136-
return None
137+
raw_album_name = name_container.find("h1").get_text(strip=True)
138+
unescaped_album_name = html.unescape(raw_album_name)
139+
140+
# Attempt to fix mojibake (UTF-8 bytes mis-decoded as Latin-1)
141+
# If encoding/decoding fails, keep the HTML-unescaped version
142+
with contextlib.suppress(UnicodeEncodeError, UnicodeDecodeError):
143+
fixed_album_name = unescaped_album_name.encode("latin1").decode("utf-8")
144+
145+
# Only replace if the repaired string differs
146+
if fixed_album_name != unescaped_album_name:
147+
return fixed_album_name
148+
149+
return unescaped_album_name
137150

138151

139152
def get_item_type(item_page: str) -> str | None:

main.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
from downloader import initialize_managers, validate_and_download
1717
from helpers.bunkr_utils import get_bunkr_status
18-
from helpers.config import FILE, SESSION_LOG
18+
from helpers.config import SESSION_LOG, URLS_FILE
1919
from helpers.file_utils import (
2020
check_python_version,
2121
read_file,
@@ -64,11 +64,11 @@ async def main() -> None:
6464
args = parse_arguments()
6565

6666
# Read and process URLs
67-
urls = read_file(FILE)
67+
urls = read_file(URLS_FILE)
6868
await process_urls(urls, disable_ui=args.disable_ui)
6969

7070
# Clear URLs file
71-
write_file(FILE)
71+
write_file(URLS_FILE)
7272

7373

7474
if __name__ == "__main__":

0 commit comments

Comments
 (0)