Skip to content

Commit 56d40fa

Browse files
committed
Improve performance for proofread_canonicals()
1 parent a6c458d commit 56d40fa

File tree

1 file changed

+36
-15
lines changed

1 file changed

+36
-15
lines changed

build_docs.py

Lines changed: 36 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
from __future__ import annotations
2424

2525
import argparse
26+
import concurrent.futures
2627
import dataclasses
2728
import datetime as dt
2829
import filecmp
@@ -1262,21 +1263,41 @@ def proofread_canonicals(
12621263
/3/whatsnew/3.11.html, which may not exist yet.
12631264
"""
12641265
logging.info("Checking canonical links...")
1265-
canonical_re = re.compile(
1266-
"""<link rel="canonical" href="https://docs.python.org/([^"]*)" />"""
1267-
)
1268-
for file in www_root.glob("**/*.html"):
1269-
html = file.read_text(encoding="UTF-8", errors="surrogateescape")
1270-
canonical = canonical_re.search(html)
1271-
if not canonical:
1272-
continue
1273-
target = canonical.group(1)
1274-
if not (www_root / target).exists():
1275-
logging.info("Removing broken canonical from %s to %s", file, target)
1276-
html = html.replace(canonical.group(0), "")
1277-
file.write_text(html, encoding="UTF-8", errors="surrogateescape")
1278-
if not skip_cache_invalidation:
1279-
purge(http, str(file).replace("/srv/docs.python.org/", ""))
1266+
worker_count = (os.cpu_count() or 1) + 2
1267+
with concurrent.futures.ThreadPoolExecutor(worker_count) as executor:
1268+
futures = {
1269+
executor.submit(_check_canonical_rel, file, www_root)
1270+
for file in www_root.glob("**/*.html")
1271+
}
1272+
paths_to_purge = {
1273+
res.relative_to(www_root) # strip the leading /srv/docs.python.org
1274+
for fut in concurrent.futures.as_completed(futures)
1275+
if (res := fut.result()) is not None
1276+
}
1277+
if not skip_cache_invalidation:
1278+
purge(http, *paths_to_purge)
1279+
1280+
1281+
# Matches a canonical <link> element whose href points at docs.python.org;
# group 1 captures the path that follows the host name.
# The dots of the host name are escaped so they only match literal dots
# (an unescaped "." would also accept look-alike hosts such as
# "docsXpython.org").
_canonical_re = re.compile(
    r"""<link rel="canonical" href="https://docs\.python\.org/([^"]*)" />"""
)
1284+
1285+
1286+
def _check_canonical_rel(file: Path, www_root: Path):
    """Remove a dangling canonical link from a single HTML file.

    The file is read as UTF-8 (with ``surrogateescape`` so undecodable
    bytes round-trip unchanged) and searched with ``_canonical_re``.
    When a canonical link is found and the path it captures does not
    exist below *www_root*, every occurrence of the matched element is
    stripped and the file is written back in place.

    Returns *file* if it was rewritten, otherwise ``None`` (no canonical
    link found, or its target exists).
    """
    markup = file.read_text(encoding="UTF-8", errors="surrogateescape")
    match = _canonical_re.search(markup)
    if match is not None:
        # Group 0 is the whole <link> element, group 1 the path it points to.
        link_tag, target = match.group(0, 1)
        if not (www_root / target).exists():
            logging.info("Removing broken canonical from %s to %s", file, target)
            file.write_text(
                markup.replace(link_tag, ""),
                encoding="UTF-8",
                errors="surrogateescape",
            )
            return file
    return None
12801301

12811302

12821303
def purge(http: urllib3.PoolManager, *paths: Path | str) -> None:

0 commit comments

Comments
 (0)