55import sys
66import threading
77import time
8+ import typing
9+ import urllib .parse
810from queue import SimpleQueue
911
1012from github_job_summary import JobSummary
@@ -29,58 +31,99 @@ class Curl:
2931
3032
# Known-bad URLs: URL -> (expected curl exit code, expected HTTP status or None).
# process_finished_task() treats a check as passing when curl fails in exactly
# the way recorded here.
CURL_EXIT_CODES_AND_HTTP_CODES = {
    "https://api.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
    "https://api.aspose.cloud/v3.0": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://api.aspose.cloud/v4.0": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://api.aspose.cloud/v4.0/": (Curl.HTTP_RETURNED_ERROR, 404),
    "https://id.aspose.cloud/connect/token": (Curl.HTTP_RETURNED_ERROR, 400),
    "https://barcode.qa.aspose.cloud/v3.0/barcode/swagger/spec": (Curl.COULDNT_RESOLVE_HOST, None),
    # TODO: Temporary fix
    "https://dashboard.aspose.cloud/applications": (Curl.HTTP_RETURNED_ERROR, 404),
}
4643
# Exact URLs excluded from checking; also pre-seeded into
# EXTRACTED_URLS_WITH_FILES so they are never reported.
URLS_TO_IGNORE: frozenset[str] = frozenset(
    [
        "https://api.aspose.cloud",
        "https://www.aspose.cloud/404",
    ]
)
50+
# Hostnames whose URLs valid_url() skips entirely: well-known, reliable
# package/doc sites that do not need link checking.
# (Fix: the original listed "mvnrepository.com" twice.)
IGNORE_DOMAINS: frozenset[str] = frozenset(
    [
        "central.sonatype.org",
        "curl.se",
        "dart.dev",
        "getcomposer.org",
        "go.dev",
        "maven.apache.org",
        "mvnrepository.com",
        "nodejs.org",
        "packagist.org",
        "pkg.go.dev",
        "pub.dev",
        "pypi.org",
        "pypi.python.org",
        "repo1.maven.org",
        "tools.ietf.org",
        "urllib3.readthedocs.io",
        "www.apache.org",
        "www.dartlang.org",
        "www.gradle.org",
        "www.mojohaus.org",
        "www.npmjs.com",
        "www.nuget.org",
        "www.opensource.org",
        "www.php.net",
        "www.python.org",
        "www.w3.org",
    ]
)
7282
# Characters that terminate a URL match (regex metachars escaped).
URL_END_CHARS = r",#\)\"'<>\*\s\\"
# Capture an http/https URL up to an optional terminator character.
# (Fix: the original used "https*", which also matched "httpss://" etc.;
# "https?" matches exactly the http and https schemes.)
URL_RE_PATTERN = r"(https?://[^{0}]+)[{0}]?".format(URL_END_CHARS)
URL_REGEX = re.compile(URL_RE_PATTERN, re.MULTILINE)
7787
# Maps each extracted URL to the list of files it was found in.
# Pre-seeded with the ignored URLs so url_extractor() never yields them.
EXTRACTED_URLS_WITH_FILES: dict[str, list[str]] = {ignored_url: [] for ignored_url in URLS_TO_IGNORE}
8090
8191
82- def url_extractor (text , filename ):
def valid_url(url: str) -> bool:
    """Return True if *url* is worth checking, False if it should be ignored.

    Ignored: malformed URLs, dotless hosts (e.g. localhost), hosts in
    IGNORE_DOMAINS, a few well-known public domains, and URLs containing
    {{var}} template placeholders.
    """
    try:
        parsed = urllib.parse.urlparse(url)
    except ValueError:
        # urlparse raises ValueError for malformed URLs (e.g. invalid ports).
        # (Fix: was a bare "except:", which also swallowed KeyboardInterrupt
        # and SystemExit.)
        return False

    domain = parsed.netloc
    if "." not in domain:
        # Ignore "localhost" and other domains without a dot.
        return False
    if domain in IGNORE_DOMAINS:
        return False
    # Ignore popular domains (endswith accepts a tuple of suffixes).
    if domain.endswith(("android.com", ".google.com", ".microsoft.com", ".wikipedia.org")):
        return False
    if "{{" in url or "}}" in url:
        # Ignore templates with {{var}} placeholders.
        return False

    return True
120+
121+
122+ def url_extractor (text : str , filename : str ) -> typing .Generator [str , None , None ]:
83123 for url in URL_REGEX .findall (text ):
124+ if not valid_url (url ):
125+ # print("Ignore:", url)
126+ continue
84127 if url not in EXTRACTED_URLS_WITH_FILES :
85128 EXTRACTED_URLS_WITH_FILES [url ] = [filename ]
86129 yield url
@@ -99,7 +142,7 @@ def url_extractor(text, filename):
99142)
100143
101144
102- def text_extractor (files ) :
145+ def text_extractor (files : list [ str ]) -> typing . Generator [ tuple [ str , str ], None , None ] :
103146 for filename in files :
104147 if os .path .splitext (filename )[1 ] in FILES_TO_IGNORE :
105148 continue
@@ -113,10 +156,12 @@ def text_extractor(files):
113156
114157
115158class Task :
159+ _proc : subprocess .Popen [bytes ]
160+ _stderr : str | None
116161 # To avoid 403 responses
117162 USER_AGENT = "Googlebot/2.1 (+http://www.google.com/bot.html)"
118163
119- def __init__ (self , url ):
164+ def __init__ (self , url : str ):
120165 self .url = url
121166 self ._proc = subprocess .Popen (
122167 [
@@ -155,12 +200,12 @@ def age(self) -> float:
155200 return time .time () - self ._started
156201
157202
def create_new_task(url: str) -> Task:
    """Build and return a Task that begins checking *url*."""
    return Task(url)
161206
162207
163- def process_finished_task (task ) -> None :
208+ def process_finished_task (task : Task ) -> None :
164209 # print("Finish task:", task.url)
165210 expected_ret_code , expected_http_code = CURL_EXIT_CODES_AND_HTTP_CODES .get (task .url , (0 , None ))
166211 if task .ret_code == 0 or task .ret_code == expected_ret_code :
@@ -185,7 +230,7 @@ def process_finished_task(task) -> None:
185230 JOB_SUMMARY .add_error (f"Broken URL '{ task .url } ': { task .stderr } Files: { EXTRACTED_URLS_WITH_FILES [task .url ]} " )
186231
187232
188- WORKER_QUEUE : SimpleQueue = SimpleQueue ()
233+ WORKER_QUEUE : SimpleQueue [ str | None ] = SimpleQueue ()
189234
190235
191236def url_checker (num_workers : int = 8 ) -> None :
0 commit comments