 from requests_cache import CachedSession
 
 READABILITY_JS_COMMIT = "8e8ec27cd2013940bc6f3cc609de10e35a1d9d86"
-READABILITY_JS_URL = f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
+READABILITY_JS_URL = (
+    f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
+)
 REQUESTS_TIMEOUT = 10
 
 
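Note: pinning the downloads to READABILITY_JS_COMMIT keeps the crawler reproducible. With this commit, the second fetch below resolves to https://raw.githubusercontent.com/mozilla/readability/8e8ec27cd2013940bc6f3cc609de10e35a1d9d86/Readability-readerable.js (the first fetch presumably grabs Readability.js itself).
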
@@ -30,7 +32,9 @@ def get_readability_js():
     res.raise_for_status()
     js_code.append(res.text)
 
-    res = session.get(f"{READABILITY_JS_URL}/Readability-readerable.js", timeout=REQUESTS_TIMEOUT)
+    res = session.get(
+        f"{READABILITY_JS_URL}/Readability-readerable.js", timeout=REQUESTS_TIMEOUT
+    )
     res.raise_for_status()
     js_code.append(res.text)
 
@@ -51,9 +55,15 @@ def url_arg_handler(url):
         return parsed_path.as_uri()
 
     # Handle Google Docs URLs
-    if (parsed_url.hostname == "docs.google.com"
-            and not parsed_url.path.endswith("/pub")
-            and (m := re.match(r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", parsed_url.path))):
+    if (
+        parsed_url.hostname == "docs.google.com"
+        and not parsed_url.path.endswith("/pub")
+        and (
+            m := re.match(
+                r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", parsed_url.path
+            )
+        )
+    ):
         logging.info("Exporting HTML from Google Docs URL...")
 
         export_url = f"https://docs.google.com/feeds/download/documents/export/Export?id={m[1]}&exportFormat=html"
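
For reference, the pattern in the reflowed condition matches the canonical 44-character Google Docs document ID: a leading "1", 42 URL-safe characters, and a restricted final character. A minimal sketch of what the branch extracts, using a made-up document ID:

    import re

    # Hypothetical ID, for illustration only: "1", 42 filler chars, valid final char.
    path = "/document/d/1" + "A" * 42 + "Q" + "/edit"
    m = re.match(r"/document/d/(1[a-zA-Z0-9_-]{42}[AEIMQUYcgkosw048])", path)
    assert m is not None
    export_url = (
        "https://docs.google.com/feeds/download/documents/export/Export"
        f"?id={m[1]}&exportFormat=html"
    )
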
@@ -78,15 +88,12 @@ def url_arg_handler(url):
     return url
 
 
-def main():
-    logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=logging.INFO)
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument("url", help="Input URL or path")
-    parser.add_argument("output", help="Output dir")
-    parser.add_argument("--no-readability-js", action="store_true", help="Disable readability.js")
-    args = parser.parse_args()
+def main(url, output):
+    logging.basicConfig(
+        format="%(asctime)s [%(levelname)s] %(message)s", level=logging.INFO
+    )
 
+    args = argparse.Namespace(url=url, output=output, no_readability_js=False)
     access_url = url_arg_handler(args.url)
 
     if access_url is None:
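
argparse.Namespace is a plain attribute container, so building one by hand keeps the rest of main() untouched: args.url, args.output, and args.no_readability_js resolve exactly as they did when parse_args() produced them. The no_readability_js=False default mirrors the removed store_true flag being absent, and keeps the page.evaluate() call further down from raising AttributeError. A quick check:

    from argparse import Namespace

    args = Namespace(url="https://example.com", output="out", no_readability_js=False)
    assert args.url == "https://example.com" and not args.no_readability_js
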
@@ -129,7 +136,10 @@ def error_cleanup(msg):
     url_status = dict()
     navigated_urls = []
     page.on("response", lambda r: url_status.update({r.url: r.status}))
-    page.on("framenavigated", lambda f: f.parent_frame is None and navigated_urls.append(f.url))
+    page.on(
+        "framenavigated",
+        lambda f: f.parent_frame is None and navigated_urls.append(f.url),
+    )
 
     page.goto(access_url)
 
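
The reflowed "framenavigated" handler keeps the original short-circuit trick: "and" only evaluates the append when f.parent_frame is None, i.e. for the top-level frame. Written as a regular function, the callback would read:

    def on_frame_navigated(frame):
        # Child frames have a parent; only record top-level navigations.
        if frame.parent_frame is None:
            navigated_urls.append(frame.url)
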
@@ -146,7 +156,8 @@ def error_cleanup(msg):
     # Apply readability.js
     page.evaluate("window.stop()")
     page.add_script_tag(content=get_readability_js())
-    readability_info = page.evaluate(r"""(no_readability_js) => {
+    readability_info = page.evaluate(
+        r"""(no_readability_js) => {
         window.stop();
 
         const documentClone = document.cloneNode(true);
@@ -168,11 +179,13 @@ def error_cleanup(msg):
             elem.remove();
 
         return article;
-    }""", [args.no_readability_js])
+    }""",
+        [args.no_readability_js],
+    )
     cleaned_html = page.content()
 
     # Check language
-    soup = bs4.BeautifulSoup(cleaned_html, 'lxml')
+    soup = bs4.BeautifulSoup(cleaned_html, "lxml")
     soup_text = soup.body.text if soup.body else ""
 
     try:
@@ -192,7 +205,9 @@ def error_cleanup(msg):
     output_dir = Path(args.output)
     output_dir.mkdir(exist_ok=True)
 
-    with open(output_dir / "accessibility_tree.json", "w", encoding="utf-8") as fout:
+    with open(
+        output_dir / "accessibility_tree.json", "w", encoding="utf-8"
+    ) as fout:
         json.dump(snapshot, fout)
 
     with open(output_dir / "cleaned.html", "w", encoding="utf-8") as fout:
@@ -207,4 +222,10 @@ def error_cleanup(msg):
 
 
 if __name__ == "__main__":
-    main()
+    # fallback to original CLI behavior
+    import sys
+
+    if len(sys.argv) != 3:
+        print("usage: html_crawler.py <url_or_path> <output_dir>")
+        sys.exit(1)
+    main(sys.argv[1], sys.argv[2])
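
With this change the crawler can be driven from Python as well as from the shell. A minimal sketch, assuming the module is importable as html_crawler (the name is taken from the usage string above):

    from html_crawler import main

    # Crawl one page into ./out; readability.js stays enabled by default.
    main("https://example.com/article", "out")

The CLI fallback behaves as before apart from losing the --no-readability-js flag: python html_crawler.py https://example.com/article out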