Skip to content

Commit a4020ee

Browse files
committed
fix(environment): Update playwright dependency to latest version without specific version
refactor(notebook): Improve markdown formatting and shell command sections in poligrapher notebook
fix(html_crawler): Update readability.js commit hash and improve error handling
fix(pdf_parser): Change browser launch to use Edge and enhance URL handling for PDF detection
1 parent d87c29f commit a4020ee

File tree

4 files changed

+41
-81
lines changed

4 files changed

+41
-81
lines changed

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323
- numpy==1.26.4
2424
- pandas==2.2.2
2525
- pip==25.1
26-
- playwright==1.49.1
26+
- playwright
2727
- pyee==12.0.0
2828
- rapidfuzz==3.6.1
2929
- regex==2024.9.11

notebooks/poligrapher_notebook.ipynb

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,17 @@
5959
"cell_type": "markdown",
6060
"metadata": {},
6161
"source": [
62-
"Make sure you create the conda environment:\n",
62+
"Make sure you create the conda environment:"
63+
]
64+
},
65+
{
66+
"cell_type": "markdown",
67+
"metadata": {
68+
"vscode": {
69+
"languageId": "shellscript"
70+
}
71+
},
72+
"source": [
6373
"```sh\n",
6474
"conda env create -f ./environment.yml\n",
6575
"```"
@@ -233,29 +243,23 @@
233243
]
234244
},
235245
{
236-
"cell_type": "code",
237-
"execution_count": null,
246+
"cell_type": "markdown",
238247
"metadata": {
239248
"colab": {
240249
"base_uri": "https://localhost:8080/"
241250
},
242251
"id": "BttUs_LCrxPQ",
243-
"outputId": "3d5293d8-3904-4d2c-f771-4935d7e58c43"
252+
"outputId": "3d5293d8-3904-4d2c-f771-4935d7e58c43",
253+
"vscode": {
254+
"languageId": "shellscript"
255+
}
244256
},
245-
"outputs": [],
246257
"source": [
247-
"from install_playwright import install\n",
248-
"from playwright.async_api import async_playwright\n",
249-
"\n",
250-
"\n",
251-
"async def main():\n",
252-
" async with async_playwright() as p:\n",
253-
" install(p.firefox)\n",
254-
" install(p.chromium)\n",
255-
"\n",
256-
"\n",
257-
"# Run the async function\n",
258-
"await main()"
258+
"```sh\n",
259+
"playwright install firefox\n",
260+
"playwright install chromium\n",
261+
"playwright install msedge\n",
262+
"```"
259263
]
260264
},
261265
{
@@ -300,7 +304,6 @@
300304
"source": [
301305
"from poligrapher.scripts import (\n",
302306
" build_graph,\n",
303-
" flow_consistency_analysis,\n",
304307
" html_crawler,\n",
305308
" init_document,\n",
306309
" pdf_parser,\n",
@@ -327,6 +330,9 @@
327330
},
328331
"outputs": [],
329332
"source": [
333+
"from requests import RequestException\n",
334+
"\n",
335+
"\n",
330336
"async def generate_graph_from_html(html_path, output_folder):\n",
331337
" \"\"\"\n",
332338
" Generate a graph from an HTML file.\n",
@@ -395,6 +401,9 @@
395401
" try:\n",
396402
" await generate_graph_from_html(policy_url, output_folder)\n",
397403
" print(f\"Graphs for {policy_url} have been generated using webpage parser\")\n",
404+
" except RequestException as ex:\n",
405+
" print(f\"Error generating graphs for {policy_url}\")\n",
406+
" print(ex)\n",
398407
" except BaseException as e:\n",
399408
" try:\n",
400409
" # Fallback to the pdf parser method\n",
@@ -605,7 +614,7 @@
605614
" full_dir_path = os.path.join(root, dir)\n",
606615
" if needs_csv_extract(full_dir_path):\n",
607616
" yml_file = os.path.join(full_dir_path, yml_path)\n",
608-
" if os.path.exists(yml_file):\n",
617+
" if os.path.exists(yml_file) and yml_file not in graph_files:\n",
609618
" graph_files.append(yml_file)\n",
610619
"\n",
611620
"for graph_file in graph_files:\n",

poligrapher/scripts/html_crawler.py

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import requests
2020
from requests_cache import CachedSession
2121

22-
READABILITY_JS_COMMIT = "8e8ec27cd2013940bc6f3cc609de10e35a1d9d86"
22+
READABILITY_JS_COMMIT = "04fd32f72b448c12b02ba6c40928b67e510bac49"
2323
READABILITY_JS_URL = (
2424
f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
2525
)
@@ -78,25 +78,7 @@ def url_arg_handler(url):
7878
req.close()
7979
return base64_url
8080

81-
# Test connection
82-
headers = {
83-
"User-Agent": (
84-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
85-
"AppleWebKit/537.36 (KHTML, like Gecko) "
86-
"Chrome/120.0.0.0 Safari/537.36"
87-
),
88-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
89-
"Accept-Language": "en-US,en;q=0.5",
90-
"Range": "bytes=0-1023", # Only fetch the first 1KB
91-
}
92-
try:
93-
resp = requests.get(url, headers=headers, timeout=REQUESTS_TIMEOUT, stream=True)
94-
resp.raise_for_status()
95-
resp.close()
96-
return url
97-
except Exception as e:
98-
logging.error("Failed to connect to %r: %s", url, e)
99-
return None
81+
return url
10082

10183
async def main(url, output):
10284
logging.basicConfig(
@@ -106,10 +88,6 @@ async def main(url, output):
10688
args = argparse.Namespace(url=url, output=output, no_readability_js=False)
10789
access_url = url_arg_handler(args.url)
10890

109-
if access_url is None:
110-
logging.error("URL failed pre-tests. Exiting...")
111-
sys.exit(-1)
112-
11391
firefox_configs = {
11492
# Bypass CSP so we can always inject scripts
11593
"security.csp.enable": False,
@@ -129,7 +107,9 @@ async def main(url, output):
129107
async with async_playwright() as p:
130108
# Firefox generates simpler accessibility tree than chromium
131109
# Tested on Debian's firefox-esr 91.5.0esr-1~deb11u1
132-
browser = await p.firefox.launch(firefox_user_prefs=firefox_configs)
110+
browser = await p.firefox.launch(
111+
firefox_user_prefs=firefox_configs, headless=True
112+
)
133113
context = await browser.new_context(bypass_csp=True)
134114

135115
async def error_cleanup(msg):
@@ -161,7 +141,7 @@ async def error_cleanup(msg):
161141
# Check HTTP errors
162142
for url in navigated_urls:
163143
if (status_code := url_status.get(url, 0)) >= 400:
164-
error_cleanup(f"Got HTTP error {status_code}")
144+
await error_cleanup(f"Got HTTP error {status_code}")
165145

166146
# Apply readability.js
167147
await page.evaluate("window.stop()")
@@ -207,10 +187,10 @@ async def error_cleanup(msg):
207187
lang = "UNKNOWN"
208188

209189
if not lang.lower().startswith("en"):
210-
error_cleanup(f"Content language {lang} isn't English")
190+
await error_cleanup(f"Content language {lang} isn't English")
211191

212192
if re.search(r"(data|privacy)\s*(?:policy|notice)", soup_text, re.I) is None:
213-
error_cleanup("Not like a privacy policy")
193+
await error_cleanup("Not like a privacy policy")
214194

215195
# obtain the accessibility tree
216196
snapshot = await page.accessibility.snapshot(interesting_only=False)

poligrapher/scripts/pdf_parser.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import requests
88
import markdown
99
import urllib.parse as urlparse
10-
import tempfile
1110
import pymupdf4llm
1211

1312
from pathlib import Path
@@ -25,7 +24,7 @@ async def create_pdf(url, args):
2524
sys.exit(-1)
2625

2726
async with async_playwright() as p:
28-
browser = await p.chromium.launch(headless=True)
27+
browser = await p.chromium.launch(channel="msedge", headless=True)
2928
context = await browser.new_context(bypass_csp=True)
3029

3130
async def error_cleanup(msg):
@@ -58,7 +57,7 @@ async def error_cleanup(msg):
5857
# Check HTTP errors
5958
for url in navigated_urls:
6059
if (status_code := url_status.get(url, 0)) >= 400:
61-
error_cleanup(f"Got HTTP error {status_code}")
60+
await error_cleanup(f"Got HTTP error {status_code}")
6261

6362
output_dir = Path(args.output)
6463
temp_pdf_path = output_dir / "output.pdf"
@@ -80,7 +79,7 @@ def download_pdf(url, args):
8079
return None
8180

8281
filename = Path(urlparse.urlparse(url).path).name
83-
if not filename.endswith(".pdf"):
82+
if not filename.endswith((".pdf", ".PDF")):
8483
filename = "downloaded.pdf"
8584

8685
temp_pdf_path = Path(args.output).joinpath("output.pdf")
@@ -112,29 +111,9 @@ async def url_arg_handler(url, args):
112111
# No scheme: assume HTTPS
113112
parsed_url = parsed_url._replace(scheme="https")
114113
url = parsed_url.geturl()
115-
# Test connection
116-
headers = {
117-
"User-Agent": (
118-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
119-
"AppleWebKit/537.36 (KHTML, like Gecko) "
120-
"Chrome/120.0.0.0 Safari/537.36"
121-
),
122-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
123-
"Accept-Language": "en-US,en;q=0.5",
124-
"Range": "bytes=0-1023", # Only fetch the first 1KB
125-
}
126-
try:
127-
resp = requests.get(
128-
url, headers=headers, timeout=REQUESTS_TIMEOUT, stream=True
129-
)
130-
resp.raise_for_status()
131-
resp.close()
132-
except Exception as e:
133-
logging.error("Failed to connect to %r: %s", url, e)
134-
return None
135114

136115
# Determine if website or PDF link
137-
if url.endswith(".pdf"):
116+
if url.endswith(".pdf") or url.endswith(".PDF"):
138117
logging.info("Interpreting %r as a PDF URL", url)
139118
# Download and return the local file path
140119
downloaded = download_pdf(url, args)
@@ -155,15 +134,7 @@ async def url_arg_handler(url, args):
155134

156135
async def main(url, output):
157136
args = argparse.Namespace(url=url, output=output)
158-
159137
pdf_path = await url_arg_handler(args.url, args)
160-
if pdf_path is None:
161-
logging.error("Invalid input path or URL")
162-
exit(1)
163-
164-
# if not pdf_path.is_file():
165-
# logging.error("File %r not found", pdf_path)
166-
# exit(1)
167138

168139
# Convert the PDF to Markdown and then to HTML
169140
md_text = pymupdf4llm.to_markdown(pdf_path)

0 commit comments

Comments
 (0)