@@ -765,6 +765,77 @@ def infer_file_extension(self, url: str, headers: Dict[str, str], content: bytes
765765
766766 # 5) Fall back to URL ext if any, otherwise 'bin'
767767 return url_ext if url_ext else 'bin'
768+
769+ def _url_looks_like_file_endpoint (self , url : str ) -> bool :
770+ """Return True when the URL shape suggests a direct file download endpoint."""
771+ try :
772+ lowered = str (url or "" ).lower ()
773+ except Exception :
774+ return False
775+ hints = (
776+ ".pdf" ,
777+ ".docx" ,
778+ ".pptx" ,
779+ ".xml" ,
780+ ".csv" ,
781+ "/pdf" ,
782+ "format=pdf" ,
783+ "type=pdf" ,
784+ "download" ,
785+ "attachment" ,
786+ "/file" ,
787+ "getfile.php" ,
788+ )
789+ return any (token in lowered for token in hints )
790+
791+ def _detect_html_interstitial (self , url : str , headers : Dict [str , str ], content : bytes ) -> Optional [str ]:
792+ """
793+ Detect HTML challenge/viewer pages that should not count as successful downloads.
794+
795+ We still allow regular HTML documents, but fail fast on common interstitials
796+ such as WAF challenge pages and JavaScript-only document viewers.
797+ """
798+ try :
799+ lower_headers = {str (k ).lower (): str (v ).lower () for k , v in (headers or {}).items ()}
800+ lower_body = (content or b"" )[: 1 << 17 ].decode ("utf-8" , errors = "ignore" ).lower ()
801+ except Exception :
802+ lower_headers = {}
803+ lower_body = ""
804+
805+ if not lower_body :
806+ return None
807+
808+ if (
809+ "x-amzn-waf-action" in lower_headers
810+ or "awswafintegration" in lower_body
811+ or "challenge.js" in lower_body
812+ or "verify that you're not a robot" in lower_body
813+ ):
814+ return (
815+ "HTML challenge page returned instead of a document; "
816+ "browser automation or cookie bootstrap is required"
817+ )
818+
819+ viewer_markers = (
820+ "fliphtml5_pages" ,
821+ "monitor:player:html5" ,
822+ "javascript/loadingjs.js" ,
823+ "javascript/main.js" ,
824+ "bookconfig.totalpagecount" ,
825+ "getfile.php?lib=" ,
826+ )
827+ viewer_hits = sum (1 for marker in viewer_markers if marker in lower_body )
828+ if viewer_hits >= 2 :
829+ return (
830+ "HTML document viewer returned instead of a downloadable file; "
831+ "a source-specific fetcher with persisted cookies/redirect handling is required"
832+ )
833+
834+ content_type = lower_headers .get ("content-type" , "" )
835+ if self ._url_looks_like_file_endpoint (url ) and "text/html" in content_type :
836+ return "Expected a file-like response but received HTML instead"
837+
838+ return None
768839
769840 async def download_file (self , row_index : int , url : str , semaphore : Optional [asyncio .Semaphore ],
770841 rate_limiter : RateLimiter , retry_count : int = 0 ,
@@ -916,6 +987,15 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn
916987 await f .write (chunk )
917988 # Infer extension using URL, headers and first bytes
918989 file_ext = self .infer_file_extension (url , resp_headers , bytes (head ))
990+ if file_ext == 'html' :
991+ html_issue = self ._detect_html_interstitial (url , resp_headers , bytes (head ))
992+ if html_issue :
993+ try :
994+ os .remove (tmp_path )
995+ except Exception :
996+ pass
997+ self .logger .warning (f"HTML interstitial detected for { url } : { html_issue } " )
998+ return False , "" , file_ext , html_issue , retry_count
919999 if not self .is_supported_format (file_ext ):
9201000 # Clean up temp and report
9211001 try :
@@ -946,6 +1026,11 @@ async def download_file(self, row_index: int, url: str, semaphore: Optional[asyn
9461026 session , requester , url , headers , timeout
9471027 )
9481028 file_ext = self .infer_file_extension (url , resp_headers , content )
1029+ if file_ext == 'html' :
1030+ html_issue = self ._detect_html_interstitial (url , resp_headers , content )
1031+ if html_issue :
1032+ self .logger .warning (f"HTML interstitial detected for { url } : { html_issue } " )
1033+ return False , "" , file_ext , html_issue , retry_count
9491034 if not self .is_supported_format (file_ext ):
9501035 self .logger .warning (f"Unsupported file format after inference: { file_ext } . Supported formats: { ', ' .join (self .supported_formats )} " )
9511036 return False , "" , file_ext or "" , f"Unsupported file format: { file_ext } " , retry_count
0 commit comments