Skip to content

Commit a4020ee

Browse files
committed
fix(environment): Update playwright dependency to latest version without specific version
refactor(notebook): Improve markdown formatting and shell command sections in poligrapher notebook
fix(html_crawler): Update readability.js commit hash and improve error handling
fix(pdf_parser): Change browser launch to use Edge and enhance URL handling for PDF detection
1 parent d87c29f commit a4020ee

File tree

4 files changed

+41
-81
lines changed

4 files changed

+41
-81
lines changed

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ dependencies:
2323
- numpy==1.26.4
2424
- pandas==2.2.2
2525
- pip==25.1
26-
- playwright==1.49.1
26+
- playwright
2727
- pyee==12.0.0
2828
- rapidfuzz==3.6.1
2929
- regex==2024.9.11

notebooks/poligrapher_notebook.ipynb

Lines changed: 28 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,17 @@
5959
"cell_type": "markdown",
6060
"metadata": {},
6161
"source": [
62-
"Make sure you create the conda environment:\n",
62+
"Make sure you create the conda environment:"
63+
]
64+
},
65+
{
66+
"cell_type": "markdown",
67+
"metadata": {
68+
"vscode": {
69+
"languageId": "shellscript"
70+
}
71+
},
72+
"source": [
6373
"```sh\n",
6474
"conda env create -f ./environment.yml\n",
6575
"```"
@@ -233,29 +243,23 @@
233243
]
234244
},
235245
{
236-
"cell_type": "code",
237-
"execution_count": null,
246+
"cell_type": "markdown",
238247
"metadata": {
239248
"colab": {
240249
"base_uri": "https://localhost:8080/"
241250
},
242251
"id": "BttUs_LCrxPQ",
243-
"outputId": "3d5293d8-3904-4d2c-f771-4935d7e58c43"
252+
"outputId": "3d5293d8-3904-4d2c-f771-4935d7e58c43",
253+
"vscode": {
254+
"languageId": "shellscript"
255+
}
244256
},
245-
"outputs": [],
246257
"source": [
247-
"from install_playwright import install\n",
248-
"from playwright.async_api import async_playwright\n",
249-
"\n",
250-
"\n",
251-
"async def main():\n",
252-
" async with async_playwright() as p:\n",
253-
" install(p.firefox)\n",
254-
" install(p.chromium)\n",
255-
"\n",
256-
"\n",
257-
"# Run the async function\n",
258-
"await main()"
258+
"```sh\n",
259+
"playwright install firefox\n",
260+
"playwright install chromium\n",
261+
"playwright install msedge\n",
262+
"```"
259263
]
260264
},
261265
{
@@ -300,7 +304,6 @@
300304
"source": [
301305
"from poligrapher.scripts import (\n",
302306
" build_graph,\n",
303-
" flow_consistency_analysis,\n",
304307
" html_crawler,\n",
305308
" init_document,\n",
306309
" pdf_parser,\n",
@@ -327,6 +330,9 @@
327330
},
328331
"outputs": [],
329332
"source": [
333+
"from requests import RequestException\n",
334+
"\n",
335+
"\n",
330336
"async def generate_graph_from_html(html_path, output_folder):\n",
331337
" \"\"\"\n",
332338
" Generate a graph from an HTML file.\n",
@@ -395,6 +401,9 @@
395401
" try:\n",
396402
" await generate_graph_from_html(policy_url, output_folder)\n",
397403
" print(f\"Graphs for {policy_url} have been generated using webpage parser\")\n",
404+
" except RequestException as ex:\n",
405+
" print(f\"Error generating graphs for {policy_url}\")\n",
406+
" print(ex)\n",
398407
" except BaseException as e:\n",
399408
" try:\n",
400409
" # Fallback to the pdf parser method\n",
@@ -605,7 +614,7 @@
605614
" full_dir_path = os.path.join(root, dir)\n",
606615
" if needs_csv_extract(full_dir_path):\n",
607616
" yml_file = os.path.join(full_dir_path, yml_path)\n",
608-
" if os.path.exists(yml_file):\n",
617+
" if os.path.exists(yml_file) and yml_file not in graph_files:\n",
609618
" graph_files.append(yml_file)\n",
610619
"\n",
611620
"for graph_file in graph_files:\n",

poligrapher/scripts/html_crawler.py

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import requests
2020
from requests_cache import CachedSession
2121

22-
READABILITY_JS_COMMIT = "8e8ec27cd2013940bc6f3cc609de10e35a1d9d86"
22+
READABILITY_JS_COMMIT = "04fd32f72b448c12b02ba6c40928b67e510bac49"
2323
READABILITY_JS_URL = (
2424
f"https://raw.githubusercontent.com/mozilla/readability/{READABILITY_JS_COMMIT}"
2525
)
@@ -78,25 +78,7 @@ def url_arg_handler(url):
7878
req.close()
7979
return base64_url
8080

81-
# Test connection
82-
headers = {
83-
"User-Agent": (
84-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
85-
"AppleWebKit/537.36 (KHTML, like Gecko) "
86-
"Chrome/120.0.0.0 Safari/537.36"
87-
),
88-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
89-
"Accept-Language": "en-US,en;q=0.5",
90-
"Range": "bytes=0-1023", # Only fetch the first 1KB
91-
}
92-
try:
93-
resp = requests.get(url, headers=headers, timeout=REQUESTS_TIMEOUT, stream=True)
94-
resp.raise_for_status()
95-
resp.close()
96-
return url
97-
except Exception as e:
98-
logging.error("Failed to connect to %r: %s", url, e)
99-
return None
81+
return url
10082

10183
async def main(url, output):
10284
logging.basicConfig(
@@ -106,10 +88,6 @@ async def main(url, output):
10688
args = argparse.Namespace(url=url, output=output, no_readability_js=False)
10789
access_url = url_arg_handler(args.url)
10890

109-
if access_url is None:
110-
logging.error("URL failed pre-tests. Exiting...")
111-
sys.exit(-1)
112-
11391
firefox_configs = {
11492
# Bypass CSP so we can always inject scripts
11593
"security.csp.enable": False,
@@ -129,7 +107,9 @@ async def main(url, output):
129107
async with async_playwright() as p:
130108
# Firefox generates simpler accessibility tree than chromium
131109
# Tested on Debian's firefox-esr 91.5.0esr-1~deb11u1
132-
browser = await p.firefox.launch(firefox_user_prefs=firefox_configs)
110+
browser = await p.firefox.launch(
111+
firefox_user_prefs=firefox_configs, headless=True
112+
)
133113
context = await browser.new_context(bypass_csp=True)
134114

135115
async def error_cleanup(msg):
@@ -161,7 +141,7 @@ async def error_cleanup(msg):
161141
# Check HTTP errors
162142
for url in navigated_urls:
163143
if (status_code := url_status.get(url, 0)) >= 400:
164-
error_cleanup(f"Got HTTP error {status_code}")
144+
await error_cleanup(f"Got HTTP error {status_code}")
165145

166146
# Apply readability.js
167147
await page.evaluate("window.stop()")
@@ -207,10 +187,10 @@ async def error_cleanup(msg):
207187
lang = "UNKNOWN"
208188

209189
if not lang.lower().startswith("en"):
210-
error_cleanup(f"Content language {lang} isn't English")
190+
await error_cleanup(f"Content language {lang} isn't English")
211191

212192
if re.search(r"(data|privacy)\s*(?:policy|notice)", soup_text, re.I) is None:
213-
error_cleanup("Not like a privacy policy")
193+
await error_cleanup("Not like a privacy policy")
214194

215195
# obtain the accessibility tree
216196
snapshot = await page.accessibility.snapshot(interesting_only=False)

poligrapher/scripts/pdf_parser.py

Lines changed: 4 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import requests
88
import markdown
99
import urllib.parse as urlparse
10-
import tempfile
1110
import pymupdf4llm
1211

1312
from pathlib import Path
@@ -25,7 +24,7 @@ async def create_pdf(url, args):
2524
sys.exit(-1)
2625

2726
async with async_playwright() as p:
28-
browser = await p.chromium.launch(headless=True)
27+
browser = await p.chromium.launch(channel="msedge", headless=True)
2928
context = await browser.new_context(bypass_csp=True)
3029

3130
async def error_cleanup(msg):
@@ -58,7 +57,7 @@ async def error_cleanup(msg):
5857
# Check HTTP errors
5958
for url in navigated_urls:
6059
if (status_code := url_status.get(url, 0)) >= 400:
61-
error_cleanup(f"Got HTTP error {status_code}")
60+
await error_cleanup(f"Got HTTP error {status_code}")
6261

6362
output_dir = Path(args.output)
6463
temp_pdf_path = output_dir / "output.pdf"
@@ -80,7 +79,7 @@ def download_pdf(url, args):
8079
return None
8180

8281
filename = Path(urlparse.urlparse(url).path).name
83-
if not filename.endswith(".pdf"):
82+
if not filename.endswith((".pdf", ".PDF")):
8483
filename = "downloaded.pdf"
8584

8685
temp_pdf_path = Path(args.output).joinpath("output.pdf")
@@ -112,29 +111,9 @@ async def url_arg_handler(url, args):
112111
# No scheme: assume HTTPS
113112
parsed_url = parsed_url._replace(scheme="https")
114113
url = parsed_url.geturl()
115-
# Test connection
116-
headers = {
117-
"User-Agent": (
118-
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
119-
"AppleWebKit/537.36 (KHTML, like Gecko) "
120-
"Chrome/120.0.0.0 Safari/537.36"
121-
),
122-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
123-
"Accept-Language": "en-US,en;q=0.5",
124-
"Range": "bytes=0-1023", # Only fetch the first 1KB
125-
}
126-
try:
127-
resp = requests.get(
128-
url, headers=headers, timeout=REQUESTS_TIMEOUT, stream=True
129-
)
130-
resp.raise_for_status()
131-
resp.close()
132-
except Exception as e:
133-
logging.error("Failed to connect to %r: %s", url, e)
134-
return None
135114

136115
# Determine if website or PDF link
137-
if url.endswith(".pdf"):
116+
if url.endswith(".pdf") or url.endswith(".PDF"):
138117
logging.info("Interpreting %r as a PDF URL", url)
139118
# Download and return the local file path
140119
downloaded = download_pdf(url, args)
@@ -155,15 +134,7 @@ async def url_arg_handler(url, args):
155134

156135
async def main(url, output):
157136
args = argparse.Namespace(url=url, output=output)
158-
159137
pdf_path = await url_arg_handler(args.url, args)
160-
if pdf_path is None:
161-
logging.error("Invalid input path or URL")
162-
exit(1)
163-
164-
# if not pdf_path.is_file():
165-
# logging.error("File %r not found", pdf_path)
166-
# exit(1)
167138

168139
# Convert the PDF to Markdown and then to HTML
169140
md_text = pymupdf4llm.to_markdown(pdf_path)

0 commit comments

Comments
 (0)