-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathcheck-external-links.sh
More file actions
executable file
·117 lines (105 loc) · 3.55 KB
/
check-external-links.sh
File metadata and controls
executable file
·117 lines (105 loc) · 3.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env zsh
#---------------------------------------------------------------------------
# Checks that all external links, those starting with http*, have "targets".
#---------------------------------------------------------------------------
# Default search root used when no paths are given on the command line.
default_path="docs"
# Print the usage/help message to stdout.
# Interpolates $default_path (defined at the top of the file) into the text.
# NOTE(review): the Usage line also interpolates $script, which is never set
# in this file — presumably assigned by a caller or wrapper; TODO confirm.
help() {
cat << EOF
A fairly crude, but mostly effective tool that checks URLs in markdown
files and HTML template files to make sure they all have "targets".
For markdown files, it prints anchor tags ("[name](url)") that don't
have a "{:target=...}" appended to them. For HTML files, it looks for
the equivalent "<a href="..." target="...">...</a>".
It also handles our way of referencing the separate glossary site, where
the markdown URL will be '[Term]({{site.glossaryurl}}/#term)' and not have
an explicit 'http...' leader.
It attempts to correctly ignore image URLs, e.g., '',
by looking at the file extension (jpg|jpeg|png|svg|mp3|mp4). This means
that if a URL that should have a target happens to have one of those strings,
it won't be checked for the target!
It doesn't exit with an error if such links are found, because in some
cases, this might be intentional.
Usage: $script [-h|--help] [-n|--noop] [-v|--verbose] [path1 ...]
Where the arguments are the following:
-h | --help Print this message and exit
-n | --noop Just print the commands but don't make changes.
-v | --verbose Print the paths as they are processed. Mostly useful
when no problems are found and you are paranoid nothing
was checked. ;)
path1 ... Check these paths. Directories will be visited recursively.
Default: All markdown and HTML files under "$default_path",
excluding files under "_site" and "_sass".
NOTES:
1. Skips files found under "temp", "tmp", "_site", and "_sass" directories.
EOF
}
# Report one or more error messages and terminate the script.
# Arguments: each argument is printed as its own "ERROR: ..." line.
# Outputs:   error lines and the usage text, both on stderr (diagnostics
#            should not pollute stdout, which carries the link report).
# Returns:   never — exits the script with status 1.
error() {
  for arg in "$@"
  do
    # printf instead of echo: safe even if a message starts with '-' or
    # contains backslashes.
    printf 'ERROR: %s\n' "$arg" >&2
  done
  help >&2
  exit 1
}
# Collect the paths to scan and interpret the command-line flags.
paths=()
# Make sure VERBOSE exists (empty) even if it wasn't inherited from the
# environment, so later [[ -n "$VERBOSE" ]] tests are well defined.
: "${VERBOSE=}"
while (( $# > 0 ))
do
  arg="$1"
  shift
  case "$arg" in
    -h|--h*)
      # Help requested: print usage and stop successfully.
      help
      exit 0
      ;;
    -n|--n*)
      # Dry-run mode: prefix the grep commands with 'echo' instead of
      # executing them.
      NOOP=echo
      ;;
    -v|--v*)
      # Verbose mode: announce each path as it is processed.
      VERBOSE=echo
      ;;
    -*)
      error "Unrecognized option: $arg"
      ;;
    *)
      paths+=("$arg")
      ;;
  esac
done
# No explicit paths? Fall back to the documented default root.
[[ ${#paths[@]} -gt 0 ]] || paths=("$default_path")
# Resolve the egrep binary once up front. 'command -v' is the POSIX way to
# locate a command ('which' is an external, non-standard tool); fall back to
# the bare name so normal PATH lookup still applies if resolution fails.
eg=$(command -v egrep) || eg=egrep
# Use a somewhat complicated script to find the URLs starting
# with http, print only the matches and then filter out the
# URLs that contain "target". It won't work perfectly, but ...
# Scan markdown files under every requested path for external links that
# lack a "{:target=...}" suffix, and for glossary-style links likewise.
# Matches with 'target=' or image/media extensions are filtered back out.
[[ -n "$VERBOSE" ]] && echo "Checking markdown files:"
for path in "${paths[@]}"
do
  if [[ -n "$VERBOSE" ]]
  then
    # Tag directories so the verbose listing shows what will be recursed.
    dir=$([[ -d "$path" ]] && echo "(directory)")
    echo "$path $dir"
  fi
  # "$path" is quoted so paths containing spaces or glob characters survive
  # (it was previously passed unquoted). $NOOP is deliberately unquoted:
  # when empty it must vanish rather than become an empty argument.
  $NOOP $eg -nHoR '\(https?[^)]+\)(\S*)' \
    --include '*.markdown' --include '*.md' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
  # Second pass: glossary links written as '({{site.glossaryurl}}/#term)'
  # have no literal 'http' prefix, so they need their own pattern.
  $NOOP $eg -nHoR '\(\{\{site.glossaryurl\}\}[^)]*\)(\S*)' \
    --include '*.markdown' --include '*.md' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
done
# Scan HTML template files for anchor tags whose opening '<a href="http...">'
# carries no target= attribute; image/media URLs are filtered back out.
[[ -n "$VERBOSE" ]] && echo "Checking HTML files:"
for path in "${paths[@]}"
do
  if [[ -n "$VERBOSE" ]]
  then
    # Tag directories so the verbose listing shows what will be recursed.
    dir=$([[ -d "$path" ]] && echo "(directory)")
    echo "$path $dir"
  fi
  # "$path" is quoted so paths containing spaces or glob characters survive
  # (it was previously passed unquoted). $NOOP is deliberately unquoted:
  # when empty it must vanish rather than become an empty argument.
  $NOOP $eg -nHoR '<a\s*href="https?[^>]+>' \
    --include '*.html' \
    --exclude-dir 'temp' --exclude-dir 'tmp' \
    --exclude-dir '_site' --exclude-dir '_sass' \
    "$path" | $eg -v 'target=' | $eg -v '\.(jpg|jpeg|png|svg|mp3|mp4)'
done