 from requests_cache import CachedSession
 
 READABILITY_JS_COMMIT = "8e8ec27cd2013940bc6f3cc609de10e35a1d9d86"
-READABILITY_JS_URL = f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
+READABILITY_JS_URL = (
+    f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
+)
 REQUESTS_TIMEOUT = 10
 
 
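Note: pinning the downloads to READABILITY_JS_COMMIT keeps the crawler reproducible. With this commit, the second fetch below resolves to https://raw.githubusercontent.com/mozilla/readability/8e8ec27cd2013940bc6f3cc609de10e35a1d9d86/Readability-readerable.js (the first fetch presumably grabs Readability.js itself).
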
@@ -30,7 +32,9 @@ def get_readability_js():
     res.raise_for_status()
     js_code.append(res.text)
 
-    res = session.get(f"{READABILITY_JS_URL}/Readability-readerable.js", timeout=REQUESTS_TIMEOUT)
+    res = session.get(
+        f"{READABILITY_JS_URL}/Readability-readerable.js", timeout=REQUESTS_TIMEOUT
+    )
     res.raise_for_status()
     js_code.append(res.text)
 
@@ -51,9 +55,15 @@ def url_arg_handler(url):
         return parsed_path.as_uri()
 
     # Handle Google Docs URLs
-    if (parsed_url.hostname == "docs.google.com"
-            and not parsed_url.path.endswith("/pub")
-            and (m := re.match(r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", parsed_url.path))):
+    if (
+        parsed_url.hostname == "docs.google.com"
+        and not parsed_url.path.endswith("/pub")
+        and (
+            m := re.match(
+                r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", parsed_url.path
+            )
+        )
+    ):
         logging.info("Exporting HTML from Google Docs URL...")
 
         export_url = f"https://docs.google.com/feeds/download/documents/export/Export?id={m[1]}&exportFormat=html"
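
For reference, the pattern in the reflowed condition matches the canonical 44-character Google Docs document ID: a leading "1", 42 URL-safe characters, and a restricted final character. A minimal sketch of what the branch extracts, using a made-up document ID:

    import re

    # Hypothetical ID, for illustration only: "1", 42 filler chars, valid final char.
    path = "/document/d/1" + "A" * 42 + "Q" + "/edit"
    m = re.match(r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", path)
    assert m is not None
    export_url = (
        "https://docs.google.com/feeds/download/documents/export/Export"
        f"?id={m[1]}&exportFormat=html"
    )
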
@@ -78,15 +88,12 @@ def url_arg_handler(url):
     return url
 
 
-def main():
-    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("url", help="Input URL or path")
-    parser.add_argument("output", help="Output dir")
-    parser.add_argument("--no-readability-js", action="store_true", help="Disable readability.js")
-    args = parser.parse_args()
+def main(url, output):
+    logging.basicConfig(
+        format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO
+    )
 
+    args = argparse.Namespace(url=url, output=output, no_readability_js=False)
     access_url = url_arg_handler(args.url)
 
     if access_url is None:
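
argparse.Namespace is a plain attribute container, so building one by hand keeps the rest of main() untouched: args.url, args.output, and args.no_readability_js resolve exactly as they did when parse_args() produced them. The no_readability_js=False default mirrors the removed store_true flag being absent, and keeps the page.evaluate() call further down from raising AttributeError. A quick check:

    from argparse import Namespace

    args = Namespace(url="https://example.com", output="out", no_readability_js=False)
    assert args.url == "https://example.com" and not args.no_readability_js
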
@@ -129,7 +136,10 @@ def error_cleanup(msg):
     url_status = dict()
     navigated_urls = []
     page.on("response", lambda r: url_status.update({r.url: r.status}))
-    page.on("framenavigated", lambda f: f.parent_frame is None and navigated_urls.append(f.url))
+    page.on(
+        "framenavigated",
+        lambda f: f.parent_frame is None and navigated_urls.append(f.url),
+    )
 
     page.goto(access_url)
 
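
The reflowed "framenavigated" handler keeps the original short-circuit trick: "and" only evaluates the append when f.parent_frame is None, i.e. for the top-level frame. Written as a regular function, the callback would read:

    def on_frame_navigated(frame):
        # Child frames have a parent; only record top-level navigations.
        if frame.parent_frame is None:
            navigated_urls.append(frame.url)
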
@@ -146,7 +156,8 @@ def error_cleanup(msg):
     # Apply readability.js
     page.evaluate("window.stop()")
     page.add_script_tag(content=get_readability_js())
-    readability_info = page.evaluate(r"""(no_readability_js) => {
+    readability_info = page.evaluate(
+        r"""(no_readability_js) => {
         window.stop();
 
         const documentClone = document.cloneNode(true);
@@ -168,11 +179,13 @@ def error_cleanup(msg):
             elem.remove();
 
         return article;
-    }""", [args.no_readability_js])
+    }""",
+        [args.no_readability_js],
+    )
     cleaned_html = page.content()
 
     # Check language
-    soup = bs4.BeautifulSoup(cleaned_html, 'lxml')
+    soup = bs4.BeautifulSoup(cleaned_html, "lxml")
     soup_text = soup.body.text if soup.body else ""
 
     try:
@@ -192,7 +205,9 @@ def error_cleanup(msg):
     output_dir = Path(args.output)
     output_dir.mkdir(exist_ok=True)
 
-    with open(output_dir / "accessibility_tree.json", "w", encoding="utf-8") as fout:
+    with open(
+        output_dir / "accessibility_tree.json", "w", encoding="utf-8"
+    ) as fout:
         json.dump(snapshot, fout)
 
     with open(output_dir / "cleaned.html", "w", encoding="utf-8") as fout:
@@ -207,4 +222,10 @@ def error_cleanup(msg):
 
 
 if __name__ == "__main__":
-    main()
+    # fallback to original CLI behavior
+    import sys
+
+    if len(sys.argv) != 3:
+        print("usage: html_crawler.py <url_or_path> <output_dir>")
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
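
With this change the crawler can be driven from Python as well as from the shell. A minimal sketch, assuming the module is importable as html_crawler (the name is taken from the usage string above):

    from html_crawler import main

    # Crawl one page into ./out; readability.js stays enabled by default.
    main("https://example.com/article", "out")

The CLI fallback behaves as before apart from losing the --no-readability-js flag: python html_crawler.py https://example.com/article out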