@@ -1,3 +1,4 @@
+import contextlib
 import fileinput
 import os
 import re
@@ -7,7 +8,7 @@
 import time
 import typing
 import urllib.parse
-from queue import SimpleQueue
+from queue import Queue, Empty

 from github_job_summary import JobSummary
 from subdomains import Subdomains
@@ -22,6 +23,10 @@
 # To avoid 403 responses
 USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"

+CONNECT_TIMEOUT_SEC = 5
+MAX_TIME_SEC = 10
+JOIN_TIMEOUT_SEC = 120
+

 class Curl:
     """
@@ -87,7 +92,6 @@ class Curl:
         ".sonatype.org",
         ".w3.org",
         ".wikipedia.org",
-
         # Regular domains
         "barcode.qa.aspose.cloud",
         "editorconfig.org",
@@ -157,12 +161,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, None]:
         if os.path.splitext(filename)[1] in FILES_TO_IGNORE:
             continue

-        with open(filename, "r", encoding="utf-8") as f:
-            try:
-                yield filename, f.read()
-            except UnicodeDecodeError:
-                print("Cant read '%s'" % filename, file=sys.stderr)
-                raise
+        with contextlib.suppress(IsADirectoryError, FileNotFoundError):
+            with open(filename, "r", encoding="utf-8") as f:
+                try:
+                    yield filename, f.read()
+                except UnicodeDecodeError:
+                    print("Cant read '%s'" % filename, file=sys.stderr)
+                    raise


 class Task:
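The rewrapped reader now tolerates paths that vanish or turn out to be directories between discovery and `open()`. A minimal standalone sketch of the `contextlib.suppress` pattern (the file name is a placeholder):

```python
import contextlib

# Both errors are swallowed by one context manager, so the calling
# loop needs no try/except or explicit `continue` for these cases.
with contextlib.suppress(IsADirectoryError, FileNotFoundError):
    with open("maybe-missing.txt", "r", encoding="utf-8") as f:
        print(f.read())
# Execution resumes here whether or not the file could be opened;
# UnicodeDecodeError is deliberately NOT suppressed and still propagates.
```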
@@ -177,6 +182,10 @@ def __init__(self, url: str):
             "-sSf",
             "--output",
             "-",
+            "--connect-timeout",
+            str(CONNECT_TIMEOUT_SEC),
+            "--max-time",
+            str(MAX_TIME_SEC),
             "--user-agent",
             USER_AGENT,
             self.url,
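The two new flags bound each check: `--connect-timeout` caps connection establishment, while `--max-time` caps the entire transfer. A sketch of roughly the command Task builds, assuming the argument list is eventually handed to a subprocess (the URL is a placeholder; this hunk does not show the execution call itself):

```python
import subprocess

CONNECT_TIMEOUT_SEC = 5
MAX_TIME_SEC = 10

args = [
    "curl", "-sSf", "--output", "-",
    "--connect-timeout", str(CONNECT_TIMEOUT_SEC),  # cap TCP/TLS setup
    "--max-time", str(MAX_TIME_SEC),                # cap the whole request
    "--user-agent", "Googlebot/2.1 (+http://www.google.com/bot.html)",
    "https://example.com/",  # placeholder URL
]
result = subprocess.run(args, capture_output=True, text=True)
print(result.returncode, result.stderr)
```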
@@ -238,7 +247,7 @@ def process_finished_task(task: Task) -> None:
        JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr} Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")


-WORKER_QUEUE: SimpleQueue[str | None] = SimpleQueue()
+WORKER_QUEUE: Queue[str | None] = Queue()


 def url_checker(num_workers: int = 8) -> None:
@@ -260,7 +269,11 @@ def url_checker(num_workers: int = 8) -> None:

         if not queue_is_empty:
             for i in (i for (i, w) in enumerate(workers) if w is None):
-                item = WORKER_QUEUE.get()
+                # Avoid blocking forever if the queue is currently empty
+                try:
+                    item = WORKER_QUEUE.get_nowait()
+                except Empty:
+                    break
                 if item is None:
                     queue_is_empty = True
                     print("--- url queue is over ---")
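Swapping the blocking `get()` for `get_nowait()` keeps the scheduling loop responsive: when no URL is queued yet, `queue.Empty` is raised and the loop simply stops filling idle slots until the next pass. A self-contained sketch of the pattern (worker slots are simplified to strings):

```python
from queue import Empty, Queue

q: Queue[str | None] = Queue()
q.put_nowait("https://example.com/")  # placeholder item

workers: list[str | None] = [None] * 4  # None marks an idle slot
for i in (i for i, w in enumerate(workers) if w is None):
    try:
        item = q.get_nowait()  # raises Empty instead of blocking
    except Empty:
        break  # nothing queued right now; try again next iteration
    workers[i] = item
print(workers)  # first slot filled, the rest still idle
```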
@@ -276,15 +289,21 @@ def url_checker(num_workers: int = 8) -> None:


 def main(files: list[str]) -> int:
-    checker = threading.Thread(target=url_checker)
+    checker = threading.Thread(target=url_checker, daemon=True)
     checker.start()

     for filename, text in text_extractor(files):
         for url in url_extractor(text, filename):
             # print("In:", url)
             WORKER_QUEUE.put_nowait(url)
     WORKER_QUEUE.put_nowait(None)
-    checker.join()
+    checker.join(timeout=JOIN_TIMEOUT_SEC)
+    if checker.is_alive():
+        print(
+            f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.",
+            file=sys.stderr,
+            flush=True,
+        )

     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
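The daemon flag and the bounded join work together: if the checker thread hangs (for example on a stuck curl), `join()` returns after `JOIN_TIMEOUT_SEC` and interpreter exit no longer waits for the thread. A minimal sketch of that shutdown pattern, with a sleep standing in for a wedged worker and a much shorter timeout than the script's 120 s:

```python
import sys
import threading
import time

JOIN_TIMEOUT_SEC = 2  # shortened for the example

def stuck_worker() -> None:
    time.sleep(60)  # simulates a checker that never drains its queue

t = threading.Thread(target=stuck_worker, daemon=True)
t.start()
t.join(timeout=JOIN_TIMEOUT_SEC)  # returns after ~2 s either way
if t.is_alive():
    print("worker still running; exiting anyway", file=sys.stderr)
# Because the thread is a daemon, interpreter shutdown does not block on it.
```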