@@ -1278,25 +1278,25 @@ def proofread_canonicals(
12781278 purge (http , * paths_to_purge )
12791279
12801280
1281- _canonical_re = re .compile (
1282- """<link rel="canonical" href="https://docs.python.org/([^"]*)" />"""
1283- )
1284-
1285-
def _check_canonical_rel(file: Path, www_root: Path):
    """Remove a broken canonical link element from an HTML file.

    Looks for the first ``<link rel="canonical" href="https://docs.python.org/…" />``
    element in *file*. If the path named by the href (relative to
    *www_root*) does not exist, the element is cut out of the file.

    The file is processed as raw bytes, so every byte outside the removed
    element is written back exactly as it was read.

    Args:
        file: HTML file to inspect, and rewrite in place when needed.
        www_root: Directory against which the canonical target is resolved.

    Returns:
        *file* if it was rewritten, otherwise ``None`` (no canonical link
        was found, or its target exists).
    """
    prefix = b'<link rel="canonical" href="https://docs.python.org/'
    suffix = b'" />'
    html = file.read_bytes()

    start = html.find(prefix)
    while start != -1:
        target_start = start + len(prefix)
        # The href value ends at the very next double quote. Searching for
        # the whole suffix instead could run past the end of this element
        # and swallow unrelated markup up to some later '" />'.
        end = html.find(b'"', target_start)
        if end == -1:
            # No quote at all after this point, so no complete element
            # (the prefix itself contains quotes) can follow either.
            return None
        if html.startswith(suffix, end):
            break
        # This element is not closed the way we expect; try the next one.
        start = html.find(prefix, target_start)
    else:
        return None

    target = html[target_start:end].decode(errors="surrogateescape")
    if (www_root / target).exists():
        return None
    logging.info("Removing broken canonical from %s to %s", file, target)
    file.write_bytes(html[:start] + html[end + len(suffix):])
    return file
13011301
13021302
0 commit comments