1010from queue import SimpleQueue
1111
1212from github_job_summary import JobSummary
13+ from subdomains import Subdomains
1314
1415"""
1516Read file names from stdin (fed from `git ls-files`)
1819Check them with curl
1920"""
2021
22+ # Crawler User-Agent passed via --user-agent, to avoid 403 responses
23+ USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
24+
2125
2226class Curl :
2327 """
@@ -49,35 +53,36 @@ class Curl:
4953 ]
5054)
5155
52- IGNORE_DOMAINS : frozenset [ str ] = frozenset (
56+ IGNORE_DOMAINS = Subdomains (
5357 [
54- "central.sonatype.org" ,
55- "curl.se" ,
56- "dart.dev" ,
57- "getcomposer.org" ,
58- "go.dev" ,
59- "maven.apache.org" ,
60- "mvnrepository.com" ,
61- "mvnrepository.com" ,
62- "nodejs.org" ,
63- "packagist.org" ,
64- "pkg.go.dev" ,
65- "pub.dev" ,
66- "pypi.org" ,
67- "pypi.python.org" ,
68- "repo1.maven.org" ,
69- "tools.ietf.org" ,
70- "urllib3.readthedocs.io" ,
71- "www.apache.org" ,
72- "www.dartlang.org" ,
73- "www.gradle.org" ,
74- "www.mojohaus.org" ,
75- "www.npmjs.com" ,
76- "www.nuget.org" ,
77- "www.opensource.org" ,
78- "www.php.net" ,
79- "www.python.org" ,
80- "www.w3.org" ,
58+ ".android.com" ,
59+ ".apache.org" ,
60+ ".curl.se" ,
61+ ".dart.dev" ,
62+ ".dartlang.org" ,
63+ ".getcomposer.org" ,
64+ ".go.dev" ,
65+ ".google.com" ,
66+ ".gradle.org" ,
67+ ".ietf.org" ,
68+ ".maven.org" ,
69+ ".microsoft.com" ,
70+ ".mojohaus.org" ,
71+ ".mvnrepository.com" ,
72+ ".nodejs.org" ,
73+ ".npmjs.com" ,
74+ ".nuget.org" ,
75+ ".opensource.org" ,
76+ ".packagist.org" ,
77+ ".php.net" ,
78+ ".phpunit.de" ,
79+ ".pub.dev" ,
80+ ".pypi.org" ,
81+ ".python.org" ,
82+ ".readthedocs.io" ,
83+ ".sonatype.org" ,
84+ ".w3.org" ,
85+ ".wikipedia.org" ,
8186 ]
8287)
8388
@@ -101,16 +106,7 @@ def valid_url(url: str) -> bool:
101106 if "." not in domain :
102107 # Ignore "localhost" and other domains without .
103108 return False
104- if domain in IGNORE_DOMAINS :
105- return False
106-
107- if (
108- domain .endswith ("android.com" )
109- or domain .endswith (".google.com" )
110- or domain .endswith (".microsoft.com" )
111- or domain .endswith (".wikipedia.org" )
112- ):
113- # Ignore popular domain
109+ if IGNORE_DOMAINS .exists (domain ):
114110 return False
115111
116112 if "{{" in url or "}}" in url :
@@ -159,8 +155,6 @@ def text_extractor(files: list[str]) -> typing.Generator[tuple[str, str], None,
159155class Task :
160156 _proc : subprocess .Popen [bytes ]
161157 _stderr : str | None
162- # To avoid 403 responses
163- USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
164158
165159 def __init__ (self , url : str ):
166160 self .url = url
@@ -171,7 +165,7 @@ def __init__(self, url: str):
171165 "--output" ,
172166 "-" ,
173167 "--user-agent" ,
174- self . USER_AGENT ,
168+ USER_AGENT ,
175169 self .url ,
176170 ],
177171 stdout = open (os .devnull , "w" ),
@@ -256,7 +250,7 @@ def url_checker(num_workers: int = 8) -> None:
256250 item = WORKER_QUEUE .get ()
257251 if item is None :
258252 queue_is_empty = True
259- print ("URL queue is over" )
253+ print ("--- url queue is over --- " )
260254 break
261255 url = item
262256 workers [i ] = create_new_task (url )
0 commit comments