Commit 48594fe: Fixed hangs in check-urls.py
Parent: d91c63e

2 files changed: +38 -18 lines

.gitignore (7 additions, 6 deletions)

@@ -1,11 +1,12 @@
+_ReSharper.*
+.idea/
+.qodo
+.vs/
+.vscode/
 *.suo
 *.user
-_ReSharper.*
+**/__pycache__/
 bin/
-obj/
-.vs/
 codegen/.generated/
 codegen/debug*.json
-.idea/
-.vscode/
-.qodo
+obj/

scripts/check-urls.py (31 additions, 12 deletions)
@@ -1,3 +1,4 @@
+import contextlib
 import fileinput
 import os
 import re
@@ -7,7 +8,7 @@
 import time
 import typing
 import urllib.parse
-from queue import SimpleQueue
+from queue import Queue, Empty
 
 from github_job_summary import JobSummary
 from subdomains import Subdomains
@@ -22,6 +23,10 @@
 # To avoid 403 responses
 USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
 
+CONNECT_TIMEOUT_SEC = 5
+MAX_TIME_SEC = 10
+JOIN_TIMEOUT_SEC = 120
+
 
 class Curl:
     """
@@ -87,7 +92,6 @@ class Curl:
         ".sonatype.org",
         ".w3.org",
         ".wikipedia.org",
-
         # Regular domains
         "barcode.qa.aspose.cloud",
         "editorconfig.org",
@@ -157,12 +161,13 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None, None]:
         if os.path.splitext(filename)[1] in FILES_TO_IGNORE:
             continue
 
-        with open(filename, "r", encoding="utf-8") as f:
-            try:
-                yield filename, f.read()
-            except UnicodeDecodeError:
-                print("Cant read '%s'" % filename, file=sys.stderr)
-                raise
+        with contextlib.suppress(IsADirectoryError, FileNotFoundError):
+            with open(filename, "r", encoding="utf-8") as f:
+                try:
+                    yield filename, f.read()
+                except UnicodeDecodeError:
+                    print("Cant read '%s'" % filename, file=sys.stderr)
+                    raise
 
 
 class Task:
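
The hunk above wraps the open() in contextlib.suppress so that paths which turn out to be directories, or which vanish between listing and reading, are skipped instead of crashing the extractor. contextlib.suppress(*excs) behaves like a try/except that silently ignores the listed exception types. A minimal standalone sketch, separate from this diff (the filename is a hypothetical placeholder):

    import contextlib

    # Roughly equivalent to:
    #   try:
    #       ...
    #   except (IsADirectoryError, FileNotFoundError):
    #       pass
    with contextlib.suppress(IsADirectoryError, FileNotFoundError):
        # "maybe-missing.txt" is a placeholder path for illustration.
        with open("maybe-missing.txt", "r", encoding="utf-8") as f:
            print(f.read())

    print("reached even if the file was missing")
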
@@ -177,6 +182,10 @@ def __init__(self, url: str):
             "-sSf",
             "--output",
             "-",
+            "--connect-timeout",
+            str(CONNECT_TIMEOUT_SEC),
+            "--max-time",
+            str(MAX_TIME_SEC),
             "--user-agent",
             USER_AGENT,
             self.url,
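
The two new curl flags bound different phases of a request: --connect-timeout caps only connection establishment, while --max-time caps the entire transfer, so a server that accepts the connection but then stalls can no longer hang a worker. A minimal sketch of the resulting invocation, assuming plain subprocess usage and curl on PATH rather than the script's actual Task plumbing:

    import subprocess

    CONNECT_TIMEOUT_SEC = 5   # limit for establishing the TCP/TLS connection
    MAX_TIME_SEC = 10         # limit for the whole request, body included

    # Hypothetical standalone check; the real script builds a similar argv in Task.
    result = subprocess.run(
        [
            "curl",
            "-sSf",
            "--output", "-",
            "--connect-timeout", str(CONNECT_TIMEOUT_SEC),
            "--max-time", str(MAX_TIME_SEC),
            "--user-agent", "Googlebot/2.1 (+http://www.google.com/bot.html)",
            "https://example.com",
        ],
        capture_output=True,
        text=True,
    )
    print("exit code:", result.returncode)  # curl returns 28 on a timeout
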
@@ -238,7 +247,7 @@ def process_finished_task(task: Task) -> None:
         JOB_SUMMARY.add_error(f"Broken URL '{task.url}': {task.stderr}Files: {EXTRACTED_URLS_WITH_FILES[task.url]}")
 
 
-WORKER_QUEUE: SimpleQueue[str | None] = SimpleQueue()
+WORKER_QUEUE: Queue[str | None] = Queue()
 
 
 def url_checker(num_workers: int = 8) -> None:
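
WORKER_QUEUE is typed str | None because None doubles as an end-of-stream sentinel: main() enqueues every URL and then a single None, which url_checker reads as "no more work". A minimal sketch of the sentinel pattern on its own, with a placeholder URL:

    from queue import Queue

    q: Queue[str | None] = Queue()
    q.put_nowait("https://example.com")  # placeholder item
    q.put_nowait(None)  # sentinel: nothing further will arrive

    while (item := q.get()) is not None:
        print("would check:", item)
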
@@ -260,7 +269,11 @@ def url_checker(num_workers: int = 8) -> None:
 
         if not queue_is_empty:
             for i in (i for (i, w) in enumerate(workers) if w is None):
-                item = WORKER_QUEUE.get()
+                # Avoid blocking forever if the queue is currently empty
+                try:
+                    item = WORKER_QUEUE.get_nowait()
+                except Empty:
+                    break
                 if item is None:
                     queue_is_empty = True
                     print("--- url queue is over ---")
@@ -276,15 +289,21 @@ def url_checker(num_workers: int = 8) -> None:
 
 
 def main(files: list[str]) -> int:
-    checker = threading.Thread(target=url_checker)
+    checker = threading.Thread(target=url_checker, daemon=True)
     checker.start()
 
     for filename, text in text_extractor(files):
         for url in url_extractor(text, filename):
             # print("In:", url)
             WORKER_QUEUE.put_nowait(url)
     WORKER_QUEUE.put_nowait(None)
-    checker.join()
+    checker.join(timeout=JOIN_TIMEOUT_SEC)
+    if checker.is_alive():
+        print(
+            f"URL checker did not finish within {JOIN_TIMEOUT_SEC}s; exiting early.",
+            file=sys.stderr,
+            flush=True,
+        )
 
     JOB_SUMMARY.finalize("Checked {total} failed **{failed}**\nGood={success}")
     if JOB_SUMMARY.has_errors:
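
The last two changes work together: join(timeout=...) puts an upper bound on how long main() waits for the checker thread, and daemon=True means a checker still wedged on a slow URL after that timeout cannot keep the interpreter alive at exit. A minimal standalone sketch, with a sleep standing in for a stuck worker and the timeout shortened for demonstration:

    import sys
    import threading
    import time

    JOIN_TIMEOUT_SEC = 2  # shortened here; the script uses 120

    def stuck_worker() -> None:
        time.sleep(60)  # simulates a hung URL check

    checker = threading.Thread(target=stuck_worker, daemon=True)
    checker.start()

    checker.join(timeout=JOIN_TIMEOUT_SEC)  # returns after the timeout even if alive
    if checker.is_alive():
        print(f"worker still running after {JOIN_TIMEOUT_SEC}s; exiting anyway", file=sys.stderr)
    # Because the thread is a daemon, the process exits here without waiting for it.
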
