-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcheck-external-links.sh
More file actions
executable file
·117 lines (105 loc) · 3.55 KB
/
check-external-links.sh
File metadata and controls
executable file
·117 lines (105 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env zsh
#---------------------------------------------------------------------------
# Checks that all external links, those starting with http*, have "targets".
#---------------------------------------------------------------------------
# Default search root used when no paths are given on the command line.
default_path="docs"
# Print the usage/help message to stdout.
# Interpolates $default_path (defined at the top of the file) into the text.
# NOTE(review): the Usage line also interpolates $script, which is never set
# in this file — presumably assigned by a caller or wrapper; TODO confirm.
help() {
cat << EOF
A fairly crude, but mostly effective tool that checks URLs in markdown
files and HTML template files to make sure they all have "targets".
For markdown files, it prints anchor tags ("[name](url)") that don't
have a "{:target=...}" appended to them. For HTML files, it looks for
the equivalent "<a href="..." target="...">...</a>".
It also handles our way of referencing the separate glossary site, where
the markdown URL will be '[Term]({{site.glossaryurl}}/#term)' and not have
an explicit 'http...' leader.
It attempts to correctly ignore image URLs, e.g., '',
by looking at the file extension (jpg|jpeg|png|svg|mp3|mp4). This means
that if a URL that should have a target happens to have one of those strings,
it won't be checked for the target!
It doesn't exit with an error if such links are found, because in some
cases, this might be intentional.
Usage: $script [-h|--help] [-n|--noop] [-v|--verbose] [path1 ...]
Where the arguments are the following:
-h | --help Print this message and exit
-n | --noop Just print the commands but don't make changes.
-v | --verbose Print the paths as they are processed. Mostly useful
when no problems are found and you are paranoid nothing
was checked. ;)
path1 ... Check these paths. Directories will be visited recursively.
Default: All markdown and HTML files under "$default_path",
excluding files under "_site" and "_sass".
NOTES:
1. Skips files found under "temp", "tmp", "_site", and "_sass" directories.
EOF
}
# Report one or more error messages and terminate the script.
# Arguments: each argument is printed as its own "ERROR: ..." line.
# Outputs:   error lines and the usage text, both on stderr (diagnostics
#            should not pollute stdout, which carries the link report).
# Returns:   never — exits the script with status 1.
error() {
  for arg in "$@"
  do
    # printf instead of echo: safe even if a message starts with '-' or
    # contains backslashes.
    printf 'ERROR: %s\n' "$arg" >&2
  done
  help >&2
  exit 1
}
# Collect the paths to scan and interpret the command-line flags.
paths=()
# Make sure VERBOSE exists (empty) even if it wasn't inherited from the
# environment, so later [[ -n "$VERBOSE" ]] tests are well defined.
: "${VERBOSE=}"
while (( $# > 0 ))
do
  arg="$1"
  shift
  case "$arg" in
    -h|--h*)
      # Help requested: print usage and stop successfully.
      help
      exit 0
      ;;
    -n|--n*)
      # Dry-run mode: prefix the grep commands with 'echo' instead of
      # executing them.
      NOOP=echo
      ;;
    -v|--v*)
      # Verbose mode: announce each path as it is processed.
      VERBOSE=echo
      ;;
    -*)
      error "Unrecognized option: $arg"
      ;;
    *)
      paths+=("$arg")
      ;;
  esac
done
# No explicit paths? Fall back to the documented default root.
[[ ${#paths[@]} -gt 0 ]] || paths=("$default_path")
# Resolve the egrep binary once up front. 'command -v' is the POSIX way to
# locate a command ('which' is an external, non-standard tool); fall back to
# the bare name so normal PATH lookup still applies if resolution fails.
eg=$(command -v egrep) || eg=egrep
# Use a somewhat complicated script to find the URLs starting
# with http, print only the matches and then filter out the
# URLs that contain "target". It won't work perfectly, but ...
# Scan markdown files under every requested path for external links that
# lack a "{:target=...}" suffix, and for glossary-style links likewise.
# Matches with 'target=' or image/media extensions are filtered back out.
[[ -n "$VERBOSE" ]] && echo "Checking markdown files:"
for path in "${paths[@]}"
do
  if [[ -n "$VERBOSE" ]]
  then
    # Tag directories so the verbose listing shows what will be recursed.
    dir=$([[ -d "$path" ]] && echo "(directory)")
    echo "$path $dir"
  fi
  # "$path" is quoted so paths containing spaces or glob characters survive
  # (it was previously passed unquoted). $NOOP is deliberately unquoted:
  # when empty it must vanish rather than become an empty argument.
  $NOOP $eg -nHoR '\(https?[^)]+\)(\S*)' \
    --include '*.markdown' --include '*.md' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
  # Second pass: glossary links written as '({{site.glossaryurl}}/#term)'
  # have no literal 'http' prefix, so they need their own pattern.
  $NOOP $eg -nHoR '\(\{\{site.glossaryurl\}\}[^)]*\)(\S*)' \
    --include '*.markdown' --include '*.md' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
done
# Scan HTML template files for anchor tags whose opening '<a href="http...">'
# carries no target= attribute; image/media URLs are filtered back out.
[[ -n "$VERBOSE" ]] && echo "Checking HTML files:"
for path in "${paths[@]}"
do
  if [[ -n "$VERBOSE" ]]
  then
    # Tag directories so the verbose listing shows what will be recursed.
    dir=$([[ -d "$path" ]] && echo "(directory)")
    echo "$path $dir"
  fi
  # "$path" is quoted so paths containing spaces or glob characters survive
  # (it was previously passed unquoted). $NOOP is deliberately unquoted:
  # when empty it must vanish rather than become an empty argument.
  $NOOP $eg -nHoR '<a\s*href="https?[^>]+>' \
    --include '*.html' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
done