|
6 | 6 |
|
7 | 7 | from __future__ import annotations |
8 | 8 |
|
| 9 | +import contextlib |
9 | 10 | import html |
10 | 11 | import logging |
11 | 12 | import re |
@@ -122,18 +123,30 @@ def get_media_slug(url: str, soup: BeautifulSoup) -> str: |
def get_album_name(soup: BeautifulSoup) -> str | None:
    """Extract the album name from the parsed HTML of a page.

    The name is read from the first ``<h1>`` inside the album title
    container. HTML entities are unescaped, and a best-effort repair is
    applied for mojibake (UTF-8 bytes that were mis-decoded as Latin-1).

    Args:
        soup: Parsed page to search.

    Returns:
        The album name, or ``None`` when the title container or its
        ``<h1>`` heading is missing.
    """
    name_container = soup.find(
        "div",
        {"class": "text-subs font-semibold flex text-base sm:text-lg"},
    )
    if not name_container:
        return None

    # `find` returns None when there is no <h1>; bail out instead of
    # raising AttributeError on `.get_text`.
    heading = name_container.find("h1")
    if heading is None:
        return None

    unescaped_album_name = html.unescape(heading.get_text(strip=True))

    # Mojibake repair: re-encode as Latin-1 and decode as UTF-8. Pure ASCII
    # round-trips unchanged. Text that is not representable in Latin-1, or
    # whose Latin-1 bytes are not valid UTF-8, was not mis-decoded this way,
    # so it is returned untouched.
    try:
        return unescaped_album_name.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        return unescaped_album_name
137 | 150 |
|
138 | 151 |
|
139 | 152 | def get_item_type(item_page: str) -> str | None: |
|
0 commit comments