Skip to content

Commit 645396a

Browse files
authored
Merge pull request docker-library#18629 from infosiftr/improved-toc
Improve `toc.sh` to support more of GitHub's edge cases
2 parents a1029c8 + 190775c commit 645396a

File tree

1 file changed

+62
-51
lines changed

1 file changed

+62
-51
lines changed

toc.sh

Lines changed: 62 additions & 51 deletions
Original file line number | Diff line number | Diff line change
@@ -4,71 +4,82 @@ set -Eeuo pipefail
44
self="$(basename "$0")"
55
usage() {
66
cat <<-EOU
7-
usage: $self path/to/README.md
7+
usage: $self path/to/markdown.md
88
eg: $self README.md
99
10-
WARNING: if README.md has the TOC-replacement comments,
11-
README.md.bak will be clobbered and the TOC will be inserted
10+
WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
1211
EOU
1312
}
1413

1514
markdown="${1:-}"
16-
if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
15+
if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
1716

18-
toc="$(
19-
gawk '
20-
# ignore comments in code blocks, which are not headers but look like them
21-
/^```/ { ignore = !ignore }
17+
# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
2218

23-
/^#/ && !ignore {
24-
level = length($1)
25-
$1 = ""
26-
gsub(/^[[:space:]]|[[:space:]]$/, "")
19+
jq --raw-input --null-input --raw-output '
20+
reduce inputs as $line ({ toc: "" };
21+
if $line | test("^```") then
22+
.ignore |= not
23+
else . end
24+
| if .ignore then . else
25+
(
26+
$line
27+
| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
28+
// null
29+
) as $cap
30+
| if $cap then
31+
($cap.hash | length) as $level
32+
| .levels[$level] += 1
33+
| .levels |= (.[range($level+1; length)] = 0)
34+
| (
35+
$cap.heading
36+
| ascii_downcase
37+
# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
38+
| gsub(" "; "-")
39+
# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
40+
#| gsub("%[abcdef0-9]{2}"; ""; "i")
41+
| gsub("%"; "")
42+
# single chars that are removed
43+
| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
44+
# CJK punctuations that are removed
45+
| gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
46+
# Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
47+
| (split("") | map(select(utf8bytelength == 1)) | join(""))
48+
# TODO Strip embedded markdown formatting
49+
) as $anchor
50+
# handle repetition (same end anchor)
51+
| (
52+
(.seen // []) as $seen
53+
| first(
54+
# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
55+
$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
56+
| select(IN($seen[]) | not)
57+
)
58+
// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
59+
) as $finalAnchor
60+
| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
61+
| .seen += [ $finalAnchor ]
62+
else . end
63+
end
64+
)
65+
| .toc
66+
' "$markdown" > "$markdown.toc"
2767

28-
++levelCounter[level]
29-
for (i in levelCounter) {
30-
if (i > level) {
31-
levelCounter[i] = 0
32-
}
33-
}
34-
prefix = levelCounter[level] ".\t"
35-
for (i = 1; i < level; ++i) {
36-
prefix = "\t" prefix
37-
}
38-
39-
# https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
40-
hash = tolower($0)
41-
gsub(/ /, "-", hash)
42-
gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
43-
gsub(/[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]/, "", hash)
44-
45-
printf "%s[%s](#%s)\n", prefix, $0, hash
46-
}
47-
' "$markdown"
48-
)"
49-
50-
toFile="${markdown}.bak"
51-
gawk -v toFile="$toFile" -v toc="$toc" '
52-
BEGIN { printf "" > toFile }
68+
gawk -v tocFile="$markdown.toc" '
5369
/^<!-- AUTOGENERATED TOC -->$/ {
5470
inToc = !inToc
5571
seenToc = 1
5672
if (inToc) {
57-
print >> toFile
58-
print "" >> toFile
59-
print toc >> toFile
60-
print "" >> toFile
61-
print >> toFile
73+
print
74+
print ""
75+
# Copy the generated TOC into the output by reading it directly in awk.
# The previous system("cat " tocFile) pasted the path unquoted into a
# /bin/sh command line, so a markdown path containing spaces or shell
# metacharacters would break or run arbitrary commands.
# "getline var < file" only sets the named variable, so $0 (the marker
# line printed right after this) is left untouched.
while ((tocStatus = (getline tocLine < tocFile)) > 0) {
print tocLine
}
close(tocFile)
# getline yields -1 on a read/open error; abort rather than silently
# emitting an empty TOC (a non-zero exit stops the script under set -e
# before the original file is replaced).
if (tocStatus < 0) {
print "error: failed to read " tocFile > "/dev/stderr"
exit 1
}
76+
# no need for another newline because tocFile should already end with one
77+
print
6278
}
6379
next
6480
}
65-
!inToc { print >> toFile }
66-
END { if (!seenToc) { close(toFile); printf "" > toFile } }
67-
' "$markdown"
81+
!inToc { print }
82+
' "$markdown" > "$markdown.bak"
6883

69-
if [ -s "$toFile" ]; then
70-
mv "$toFile" "$markdown"
71-
else
72-
rm "$toFile"
73-
echo "$toc"
74-
fi
84+
mv -f "$markdown.bak" "$markdown"
85+
rm -f "$markdown.toc"

0 commit comments

Comments
 (0)