55import sys
66import threading
77import time
8+ import typing
9+ import urllib .parse
810from queue import SimpleQueue
911
1012from github_job_summary import JobSummary
@@ -29,58 +31,99 @@ class Curl:
2931
3032
# Known-bad URLs: URL -> (expected curl exit code, expected HTTP status or None).
# process_finished_task() treats a check as passing when curl fails in exactly
# the way recorded here.
CURL_EXIT_CODES_AND_HTTP_CODES = {
    "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
    "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
    "https://barcode.qa.aspose.cloud/v3.0/barcode/swagger/spec": (Curl.COULDNT_RESOLVE_HOST, None),
    # TODO: Temporary fix
    "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
}
4643
# Exact URLs excluded from checking; also pre-seeded into
# EXTRACTED_URLS_WITH_FILES so they are never reported.
URLS_TO_IGNORE: frozenset[str] = frozenset(
    [
        "https://api.aspose.cloud",
        "https://www.aspose.cloud/404",
    ]
)
50+
# Hostnames whose URLs valid_url() skips entirely: well-known, reliable
# package/doc sites that do not need link checking.
# (Fix: the original listed "mvnrepository.com" twice.)
IGNORE_DOMAINS: frozenset[str] = frozenset(
    [
        "central.sonatype.org",
        "curl.se",
        "dart.dev",
        "getcomposer.org",
        "go.dev",
        "maven.apache.org",
        "mvnrepository.com",
        "nodejs.org",
        "packagist.org",
        "pkg.go.dev",
        "pub.dev",
        "pypi.org",
        "pypi.python.org",
        "repo1.maven.org",
        "tools.ietf.org",
        "urllib3.readthedocs.io",
        "www.apache.org",
        "www.dartlang.org",
        "www.gradle.org",
        "www.mojohaus.org",
        "www.npmjs.com",
        "www.nuget.org",
        "www.opensource.org",
        "www.php.net",
        "www.python.org",
        "www.w3.org",
    ]
)
7282
# Characters that terminate a URL match (regex metachars escaped).
URL_END_CHARS = r",#\)\"'<>\*\s\\"
# Capture an http/https URL up to an optional terminator character.
# (Fix: the original used "https*", which also matched "httpss://" etc.;
# "https?" matches exactly the http and https schemes.)
URL_RE_PATTERN = r"(https?://[^{0}]+)[{0}]?".format(URL_END_CHARS)
URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
7787
# Maps each extracted URL to the list of files it was found in.
# Pre-seeded with the ignored URLs so url_extractor() never yields them.
EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {ignored_url: [] for ignored_url in URLS_TO_IGNORE}
8090
8191
82- def url_extractor (text , filename ):
def valid_url(url: str) -> bool:
    """Return True if *url* is worth checking, False if it should be ignored.

    Ignored: malformed URLs, dotless hosts (e.g. localhost), hosts in
    IGNORE_DOMAINS, a few well-known public domains, and URLs containing
    {{var}} template placeholders.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed URLs (e.g. invalid ports).
        # (Fix: was a bare "except:", which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return False

    domain = parsed.netloc
    if "." not in domain:
        # Ignore "localhost" and other domains without a dot.
        return False
    if domain in IGNORE_DOMAINS:
        return False
    # Ignore popular domains (endswith accepts a tuple of suffixes).
    if domain.endswith(("android.com", ".google.com", ".microsoft.com", ".wikipedia.org")):
        return False
    if "{{" in url or "}}" in url:
        # Ignore templates with {{var}} placeholders.
        return False

    return True
120+
121+
122+ def url_extractor (text : str , filename : str ) -> typing .Generator [str , None , None ]:
83123 for url in URL_REGEX .findall (text ):
124+ if not valid_url (url ):
125+ # print("Ignore:", url)
126+ continue
84127 if url not in EXTRACTED_URLS_WITH_FILES :
85128 EXTRACTED_URLS_WITH_FILES [url ] = [filename ]
86129 yield url
@@ -99,7 +142,7 @@ def url_extractor(text, filename):
99142)
100143
101144
102- def text_extractor (files ) :
145+ def text_extractor (files : list [ str ]) -> typing . Generator [ tuple [ str , str ], None , None ] :
103146 for filename in files :
104147 if os .path .splitext (filename )[1 ] in FILES_TO_IGNORE :
105148 continue
@@ -113,10 +156,12 @@ def text_extractor(files):
113156
114157
115158class Task :
159+ _proc : subprocess .Popen [bytes ]
160+ _stderr : str | None
116161 # To avoid 403 responses
117162 USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
118163
119- def __init__ (self , url ):
164+ def __init__ (self , url : str ):
120165 self .url = url
121166 self ._proc = subprocess .Popen (
122167 [
@@ -155,12 +200,12 @@ def age(self) -> float:
155200 return time .time () - self ._started
156201
157202
def create_new_task(url: str) -> Task:
    """Build and return a Task that begins checking *url*."""
    return Task(url)
161206
162207
163- def process_finished_task (task ) -> None :
208+ def process_finished_task (task : Task ) -> None :
164209 # print("Finish task:", task.url)
165210 expected_ret_code , expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES .get (task .url , (0 , None ))
166211 if task .ret_code == 0 or task .ret_code == expected_ret_code :
@@ -185,7 +230,7 @@ def process_finished_task(task) -> None:
185230 JOB_SUMMARY .add_error (f"Broken URL '{ task .url } ': { task .stderr } Files: { EXTRACTED_URLS_WITH_FILES [task .url ]} " )
186231
187232
188- WORKER_QUEUE : SimpleQueue = SimpleQueue ()
233+ WORKER_QUEUE : SimpleQueue [ str | None ] = SimpleQueue ()
189234
190235
191236def url_checker (num_workers : int = 8 ) -> None :
0 commit comments