Skip to content

Commit 379b8f0

Browse files
committed
Handle HTML download interstitials
1 parent 79cc99c commit 379b8f0

File tree

2 files changed

+138
-0
lines changed

2 files changed

+138
-0
lines changed

src/glossapi/gloss_downloader.py

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,77 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes
765765

766766
# 5) Fall back to URL ext if any, otherwise 'bin'
767767
return url_ext if url_ext else 'bin'
768+
769+
def _url_looks_like_file_endpoint(self, url: str) -> bool:
    """Return True when the URL shape suggests a direct file download endpoint.

    The check is a case-insensitive substring search for a fixed set of
    hints (file extensions, ``download``/``attachment`` style keywords,
    ``getfile.php`` ...).  Only the path, query string and fragment are
    inspected: the scheme and host are ignored so that a host name such as
    ``files.example.org`` or ``downloads.example.org`` does not make every
    page served from it look like a file endpoint.

    Args:
        url: The URL to inspect.  ``None`` and other falsy values are
            treated as an empty URL.

    Returns:
        ``True`` if any hint occurs in the inspected part of the URL,
        ``False`` otherwise (including when the URL is empty or cannot be
        converted to a string).
    """
    # Local import keeps this method self-contained regardless of what the
    # module already imports at the top of the file.
    from urllib.parse import urlsplit

    try:
        lowered = str(url or "").lower()
    except Exception:
        return False
    if not lowered:
        return False

    try:
        parts = urlsplit(lowered)
    except ValueError:
        # urlsplit rejects some malformed URLs (e.g. broken IPv6 literals);
        # fall back to scanning the raw string rather than giving up.
        haystack = lowered
    else:
        # Everything except scheme and netloc.  Scheme-less inputs end up
        # entirely in ``path`` and are therefore still scanned in full.
        haystack = f"{parts.path}?{parts.query}#{parts.fragment}"

    hints = (
        ".pdf",
        ".docx",
        ".pptx",
        ".xml",
        ".csv",
        "/pdf",
        "format=pdf",
        "type=pdf",
        "download",
        "attachment",
        "/file",
        "getfile.php",
    )
    return any(token in haystack for token in hints)
790+
791+
def _detect_html_interstitial(self, url: str, headers: Dict[str, str], content: bytes) -> Optional[str]:
    """
    Detect HTML challenge/viewer pages that should not count as successful downloads.

    Regular HTML documents are still allowed; only common interstitials are
    rejected, such as WAF challenge pages and JavaScript-only document viewers.

    Args:
        url: The requested URL; used only for the "file-like endpoint" check.
        headers: Response headers.  Names and values are compared
            case-insensitively.  ``None`` or a malformed value is treated as
            "no headers".
        content: The response body, or a prefix of it.  Only the first
            128 KiB are inspected, decoded as UTF-8 with undecodable bytes
            dropped.

    Returns:
        A human-readable reason string when the response looks like an
        interstitial, otherwise ``None``.
    """
    # Normalise headers and body independently so that a malformed value in
    # one of them does not throw away the evidence carried by the other.
    try:
        lower_headers = {str(k).lower(): str(v).lower() for k, v in (headers or {}).items()}
    except Exception:
        lower_headers = {}
    try:
        lower_body = (content or b"")[: 1 << 17].decode("utf-8", errors="ignore").lower()
    except Exception:
        lower_body = ""

    # WAF challenge: the response header alone is sufficient evidence, so this
    # check runs before the empty-body shortcut below.
    challenge_markers = (
        "awswafintegration",
        "challenge.js",
        "verify that you're not a robot",
    )
    if "x-amzn-waf-action" in lower_headers or any(
        marker in lower_body for marker in challenge_markers
    ):
        return (
            "HTML challenge page returned instead of a document; "
            "browser automation or cookie bootstrap is required"
        )

    # Nothing left to inspect: every remaining heuristic needs body text.
    if not lower_body:
        return None

    # JavaScript-only document viewers: require two independent markers so
    # that a single generic one (e.g. "javascript/main.js") is not enough.
    viewer_markers = (
        "fliphtml5_pages",
        "monitor:player:html5",
        "javascript/loadingjs.js",
        "javascript/main.js",
        "bookconfig.totalpagecount",
        "getfile.php?lib=",
    )
    viewer_hits = sum(1 for marker in viewer_markers if marker in lower_body)
    if viewer_hits >= 2:
        return (
            "HTML document viewer returned instead of a downloadable file; "
            "a source-specific fetcher with persisted cookies/redirect handling is required"
        )

    # A URL that advertises a file but answers with an HTML content type.
    content_type = lower_headers.get("content-type", "")
    if self._url_looks_like_file_endpoint(url) and "text/html" in content_type:
        return "Expected a file-like response but received HTML instead"

    return None
768839

769840
async def download_file(self, row_index: int, url: str, semaphore: Optional[asyncio.Semaphore],
770841
rate_limiter: RateLimiter, retry_count: int = 0,
@@ -916,6 +987,15 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn
916987
await f.write(chunk)
917988
# Infer extension using URL, headers and first bytes
918989
file_ext = self.infer_file_extension(url, resp_headers, bytes(head))
990+
if file_ext == 'html':
991+
html_issue = self._detect_html_interstitial(url, resp_headers, bytes(head))
992+
if html_issue:
993+
try:
994+
os.remove(tmp_path)
995+
except Exception:
996+
pass
997+
self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}")
998+
return False, "", file_ext, html_issue, retry_count
919999
if not self.is_supported_format(file_ext):
9201000
# Clean up temp and report
9211001
try:
@@ -946,6 +1026,11 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn
9461026
session, requester, url, headers, timeout
9471027
)
9481028
file_ext = self.infer_file_extension(url, resp_headers, content)
1029+
if file_ext == 'html':
1030+
html_issue = self._detect_html_interstitial(url, resp_headers, content)
1031+
if html_issue:
1032+
self.logger.warning(f"HTML interstitial detected for {url}: {html_issue}")
1033+
return False, "", file_ext, html_issue, retry_count
9491034
if not self.is_supported_format(file_ext):
9501035
self.logger.warning(f"Unsupported file format after inference: {file_ext}. Supported formats: {', '.join(self.supported_formats)}")
9511036
return False, "", file_ext or "", f"Unsupported file format: {file_ext}", retry_count
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from glossapi.gloss_downloader import GlossDownloader
2+
3+
4+
def test_detects_waf_challenge_html(tmp_path):
    """A WAF challenge response is inferred as HTML and reported as a challenge page."""
    target = "https://eur-lex.europa.eu/legal-content/EL/TXT/PDF/?uri=OJ:L_202502360"
    response_headers = {
        "Content-Type": "text/html; charset=UTF-8",
        "x-amzn-waf-action": "challenge",
    }
    page = b"""<!DOCTYPE html><html><body>
<script>AwsWafIntegration.getToken()</script>
<noscript>verify that you're not a robot</noscript>
</body></html>"""
    dl = GlossDownloader(output_dir=str(tmp_path))

    inferred_ext = dl.infer_file_extension(target, response_headers, page)
    assert inferred_ext == "html"

    reason = dl._detect_html_interstitial(target, response_headers, page)
    assert reason is not None
    assert "challenge page" in reason.lower()
21+
22+
23+
def test_detects_js_document_viewer_html(tmp_path):
    """A JavaScript flip-book viewer page is inferred as HTML and reported as a document viewer."""
    target = "https://freader.ekt.gr/eadd/index.php?doc=60819&lang=el"
    response_headers = {"Content-Type": "text/html; charset=UTF-8"}
    page = b"""<!DOCTYPE html><html><head>
<meta name="monitor-signature" content="monitor:player:html5">
<script>bookConfig.totalPageCount = 236;</script>
<script>var fliphtml5_pages = [{"l":"../getfile.php?lib=eadd&path=large&item=1.jpg"}];</script>
<script src="javascript/LoadingJS.js"></script>
</head></html>"""
    dl = GlossDownloader(output_dir=str(tmp_path))

    inferred_ext = dl.infer_file_extension(target, response_headers, page)
    assert inferred_ext == "html"

    reason = dl._detect_html_interstitial(target, response_headers, page)
    assert reason is not None
    assert "document viewer" in reason.lower()
41+
42+
43+
def test_regular_html_document_is_still_allowed(tmp_path):
    """An ordinary HTML article is inferred as HTML and is not flagged as an interstitial."""
    target = "https://example.org/article"
    response_headers = {"Content-Type": "text/html; charset=UTF-8"}
    page = b"""<!DOCTYPE html><html><head><title>Article</title></head>
<body><main><article><h1>Normal HTML document</h1><p>Body text.</p></article></main></body></html>"""
    dl = GlossDownloader(output_dir=str(tmp_path))

    inferred_ext = dl.infer_file_extension(target, response_headers, page)
    assert inferred_ext == "html"

    verdict = dl._detect_html_interstitial(target, response_headers, page)
    assert verdict is None

0 commit comments

Comments
 (0)