Skip to content

Commit fd85730

Browse files
authored
Merge pull request #33134 from tengqm/improve-linkchecker
Improve the linkchecker script
2 parents a580d52 + 95257a2 commit fd85730

File tree

1 file changed

+61
-45
lines changed

1 file changed

+61
-45
lines changed

scripts/linkchecker.py

Lines changed: 61 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
#!/usr/bin/env python3
22
#
33
# This is a link checker for the Kubernetes documentation website.
4-
# - We cover the following cases for the language you provide via `-l`, which
5-
# defaults to 'en'.
6-
# - If the language specified is not English (`en`), we check if you are
7-
# actually using the localized links. For example, if you specify `zh` as
8-
# the language, and for link target `/docs/foo/bar`, we check if the English
9-
# version exists AND if the Chinese version exists as well. A checking record
10-
# is produced if the link can use the localized version.
4+
#
5+
# If the language to check is not English (`en`), we check if you are actually
6+
# using the localized links. For example, if you are checking
7+
# `content/zh/docs/foo/bar`, we check if the English version exists AND if the
8+
# Chinese version exists as well. A checking record is produced if the link
9+
# can use the localized version.
1110
#
1211
# Usage: linkchecker.py -h
1312
#
@@ -64,19 +63,24 @@
6463
C_RED = "\033[31m"
6564
C_GREEN = "\033[32m"
6665
C_YELLOW = "\033[33m"
67-
C_GRAY = "\033[90m"
66+
C_GRAY = "\033[90m"
6867
C_CYAN = "\033[36m"
6968
C_END = "\033[0m"
7069

7170
# Command line arguments shared across functions
7271
ARGS = None
72+
# Command line parser
73+
PARSER = None
74+
# Language as parsed from the file path
75+
LANG = None
7376
# Global result dictionary keyed by page examined
7477
RESULT = {}
7578
# Cached redirect entries
7679
REDIRECTS = {}
7780
# Cached anchors in target pages
7881
ANCHORS = {}
7982

83+
8084
def new_record(level, message, target):
8185
"""Create new checking record.
8286
@@ -89,7 +93,7 @@ def new_record(level, message, target):
8993
global ARGS
9094

9195
# Skip INFO records unless verbose output is enabled
92-
if ARGS.verbose == False and level == "INFO":
96+
if ARGS.verbose is False and level == "INFO":
9397
return None
9498

9599
result = None
@@ -98,9 +102,9 @@ def new_record(level, message, target):
98102
else:
99103
target = C_GRAY + target + C_END
100104
if level == "INFO":
101-
result = target + ": " + C_GREEN + message + C_END
105+
result = target + ": " + C_GREEN + message + C_END
102106
elif level == "WARNING":
103-
result = target + ": " + C_YELLOW+ message + C_END
107+
result = target + ": " + C_YELLOW + message + C_END
104108
else: # default to error
105109
result = target + ": " + C_RED + message + C_END
106110

@@ -286,7 +290,7 @@ def check_target(page, anchor, target):
286290

287291
# link to English or localized page
288292
if (target.startswith("/docs/") or
289-
target.startswith("/" + ARGS.lang + "/docs/")):
293+
target.startswith("/" + LANG + "/docs/")):
290294

291295
# target is a shared reference (kubectl or kubernetes-api)?
292296
if (target.find("/docs/reference/generated/kubectl/") >= 0 or
@@ -305,22 +309,22 @@ def check_target(page, anchor, target):
305309
if ok:
306310
# We do't do additional checks for English site even if it has
307311
# links to a non-English page
308-
if ARGS.lang == "en":
312+
if LANG == "en":
309313
return None
310314

311315
# If we are already checking localized link, fine
312-
if target.startswith("/" + ARGS.lang + "/docs/"):
316+
if target.startswith("/" + LANG + "/docs/"):
313317
return None
314318

315319
# additional check for localization even if English target exists
316-
base = os.path.join(ROOT, "content", ARGS.lang)
320+
base = os.path.join(ROOT, "content", LANG)
317321
found = check_file_exists(base, target)
318322
if not found:
319323
# Still to be translated
320324
return None
321325
msg = ("Localized page detected, please append '/%s' to the target"
322-
% ARGS.lang)
323-
return new_record("ERROR", "Link not using localized page", target)
326+
% LANG)
327+
return new_record("ERROR", msg, target)
324328

325329
# target might be a redirect entry
326330
real_target = get_redirect(target)
@@ -333,15 +337,16 @@ def check_target(page, anchor, target):
333337
msg = "Link may be wrong for the anchor [%s]" % anchor
334338
return new_record("WARNING", msg, target)
335339

336-
def check_anchor(target_page, anchor):
340+
341+
def check_anchor(target, anchor):
337342
"""Check if an anchor is defined in the target page
338343
339-
:param target_page: The target page to check
344+
:param target: The target page to check
340345
:param anchor: Anchor string to find in the target page
341346
"""
342-
if target_page not in ANCHORS:
347+
if target not in ANCHORS:
343348
try:
344-
with open(target_page, "r") as f:
349+
with open(target, "r") as f:
345350
data = f.readlines()
346351
except Exception as ex:
347352
print("[Error] failed in reading markdown file: " + str(ex))
@@ -351,16 +356,18 @@ def check_anchor(target_page, anchor):
351356
regex1 = re.compile(anchor_pattern1)
352357
anchor_pattern2 = r"{#(.*?)}"
353358
regex2 = re.compile(anchor_pattern2)
354-
ANCHORS[target_page] = regex1.findall(content) + regex2.findall(content)
355-
return anchor in ANCHORS[target_page]
359+
ANCHORS[target] = regex1.findall(content) + regex2.findall(content)
360+
return anchor in ANCHORS[target]
361+
356362

357363
def check_apiref_target(target, anchor):
358364
"""Check a link to an API reference page.
359365
360366
:param target: The link target string to check
361367
:param anchor: Anchor string from the content page
362368
"""
363-
base = os.path.join(ROOT, "content", "en", "docs", "reference", "kubernetes-api")
369+
base = os.path.join(ROOT, "content", "en", "docs", "reference",
370+
"kubernetes-api")
364371
ok = check_file_exists(base + "/", target)
365372
if not ok:
366373
return new_record("ERROR", "API reference page not found", target)
@@ -370,7 +377,9 @@ def check_apiref_target(target, anchor):
370377

371378
target_page = os.path.join(base, target)+".md"
372379
if not check_anchor(target_page, anchor):
373-
return new_record("ERROR", "Anchor not found in API reference page", target+"#"+anchor)
380+
return new_record("ERROR", "Anchor not found in API reference page",
381+
target+"#"+anchor)
382+
374383

375384
def validate_links(page):
376385
"""Find and validate links on a content page.
@@ -398,8 +407,8 @@ def validate_links(page):
398407
records.append(r)
399408

400409
# searches for pattern: {{< api-reference page="" anchor=""
401-
apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\""
402-
regex = re.compile(apiref_pattern)
410+
apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\" *anchor=\"(.*?)\""
411+
regex = re.compile(apiref_re)
403412

404413
matches = regex.findall(content)
405414
for m in matches:
@@ -408,8 +417,8 @@ def validate_links(page):
408417
records.append(r)
409418

410419
# searches for pattern: {{< api-reference page=""
411-
apiref_pattern = r"{{ *< *api-reference page=\"([^\"]*?)\""
412-
regex = re.compile(apiref_pattern)
420+
apiref_re = r"{{ *< *api-reference page=\"([^\"]*?)\""
421+
regex = re.compile(apiref_re)
413422

414423
matches = regex.findall(content)
415424
for m in matches:
@@ -426,31 +435,38 @@ def parse_arguments():
426435
427436
Result is returned and saved into global variable ARGS.
428437
"""
429-
parser = argparse.ArgumentParser(description="Links checker for docs.")
430-
parser.add_argument("-l", dest="lang", default="en", metavar="<LANG>",
431-
help=("two letter language code, e.g. 'zh'. "
432-
"(default='en')"))
433-
parser.add_argument("-v", dest="verbose", action="store_true",
438+
global PARSER
439+
440+
PARSER = argparse.ArgumentParser(description="Links checker for docs.")
441+
PARSER.add_argument("-v", dest="verbose", action="store_true",
434442
help="switch on verbose level")
435-
parser.add_argument("-f", dest="filter", default="/docs/**/*.md",
436-
metavar="<FILTER>",
437-
help=("File pattern to scan, e.g. '/docs/foo.md'. "
438-
"(default='/docs/**/*.md')"))
439-
parser.add_argument("-n", "--no-color", action="store_true",
443+
PARSER.add_argument("-n", "--no-color", action="store_true",
440444
help="Suppress colored printing.")
445+
PARSER.add_argument("-f", dest="filter", default="content/en/docs/**/*.md",
446+
metavar="<FILTER>",
447+
help=("File pattern to scan. "
448+
"(default='content/en/docs/**/*.md')"))
441449

442-
return parser.parse_args()
450+
return PARSER.parse_args()
443451

444452

445453
def main():
446454
"""The main entry of the program."""
447-
global ARGS, ROOT, REDIRECTS
455+
global ARGS, ROOT, REDIRECTS, PARSER, LANG
448456

449457
ARGS = parse_arguments()
450-
print("Language: " + ARGS.lang)
451458
ROOT = os.path.join(os.path.dirname(__file__), '..')
452-
content_dir = os.path.join(ROOT, 'content')
453-
lang_dir = os.path.join(content_dir, ARGS.lang)
459+
460+
print(ARGS.filter)
461+
parts = ARGS.filter.split("/", 2)
462+
if len(parts) != 3 or parts[0] != "content":
463+
print("ERROR:\nPlease specify file pattern in the format "
464+
"'content/<lang>/<path-pattern>', for example:\n"
465+
"'content/zh/docs/concepts/**/*.md'\n")
466+
PARSER.print_help()
467+
sys.exit(-1)
468+
469+
LANG = parts[1]
454470

455471
# read redirects data
456472
redirects_fn = os.path.join(ROOT, "static", "_redirects")
@@ -473,7 +489,7 @@ def main():
473489
print("[Error] failed in reading redirects file: " + str(ex))
474490
return
475491

476-
folders = [f for f in glob.glob(lang_dir + ARGS.filter, recursive=True)]
492+
folders = [f for f in glob.glob(ARGS.filter, recursive=True)]
477493
for page in folders:
478494
validate_links(page)
479495

0 commit comments

Comments
 (0)