Skip to content

Commit 190775c

Browse files
committed
Improve toc.sh to support more of GitHub's edge cases
Most notably, this allows headers containing emoji to generate the correct anchors. See https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for the test cases I worked through. While reverse engineering GitHub's implementation I found bugs in it (which is fun), as well as quirks in the library I was copying that do not match what GitHub actually does. I've also verified that this causes no changes to the TOC here or over in the docs repo (as expected).
1 parent 45eb2c5 commit 190775c

File tree

1 file changed

+62
-51
lines changed

1 file changed

+62
-51
lines changed

toc.sh

Lines changed: 62 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -4,71 +4,82 @@ set -Eeuo pipefail
44
self="$(basename "$0")"
55
usage() {
66
cat <<-EOU
7-
usage: $self path/to/README.md
7+
usage: $self path/to/markdown.md
88
eg: $self README.md
99
10-
WARNING: if README.md has the TOC-replacement comments,
11-
README.md.bak will be clobbered and the TOC will be inserted
10+
WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
1211
EOU
1312
}
1413

1514
markdown="${1:-}"
16-
if ! shift || [ ! -f "$markdown" ]; then usage >&2; exit 1; fi
15+
if ! shift || [ ! -s "$markdown" ]; then usage >&2; exit 1; fi
1716

18-
toc="$(
19-
gawk '
20-
# ignore comments in code blocks, which are not headers but look like them
21-
/^```/ { ignore = !ignore }
17+
# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)
2218

23-
/^#/ && !ignore {
24-
level = length($1)
25-
$1 = ""
26-
gsub(/^[[:space:]]|[[:space:]]$/, "")
19+
jq --raw-input --null-input --raw-output '
20+
reduce inputs as $line ({ toc: "" };
21+
if $line | test("^```") then
22+
.ignore |= not
23+
else . end
24+
| if .ignore then . else
25+
(
26+
$line
27+
| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
28+
// null
29+
) as $cap
30+
| if $cap then
31+
($cap.hash | length) as $level
32+
| .levels[$level] += 1
33+
| .levels |= (.[range($level+1; length)] = 0)
34+
| (
35+
$cap.heading
36+
| ascii_downcase
37+
# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
38+
| gsub(" "; "-")
39+
# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
40+
#| gsub("%[abcdef0-9]{2}"; ""; "i")
41+
| gsub("%"; "")
42+
# single chars that are removed
43+
| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
44+
# CJK punctuations that are removed
45+
| gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
46+
# Strip emojis (*technically* this is way too aggressive: it drops *every* multi-byte (non-ASCII) character, not just emoji, but 🤷)
47+
| (split("") | map(select(utf8bytelength == 1)) | join(""))
48+
# TODO Strip embedded markdown formatting
49+
) as $anchor
50+
# handle repetition (same end anchor)
51+
| (
52+
(.seen // []) as $seen
53+
| first(
54+
# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
55+
$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
56+
| select(IN($seen[]) | not)
57+
)
58+
// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
59+
) as $finalAnchor
60+
| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
61+
| .seen += [ $finalAnchor ]
62+
else . end
63+
end
64+
)
65+
| .toc
66+
' "$markdown" > "$markdown.toc"
2767

28-
++levelCounter[level]
29-
for (i in levelCounter) {
30-
if (i > level) {
31-
levelCounter[i] = 0
32-
}
33-
}
34-
prefix = levelCounter[level] ".\t"
35-
for (i = 1; i < level; ++i) {
36-
prefix = "\t" prefix
37-
}
38-
39-
# https://github.com/thlorenz/anchor-markdown-header/blob/56f77a232ab1915106ad1746b99333bf83ee32a2/anchor-markdown-header.js#L20-L30
40-
hash = tolower($0)
41-
gsub(/ /, "-", hash)
42-
gsub(/[\/?!:\[\]`.,()*"'"'"';{}+=<>~\$|#@&–—]/, "", hash)
43-
gsub(/[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]/, "", hash)
44-
45-
printf "%s[%s](#%s)\n", prefix, $0, hash
46-
}
47-
' "$markdown"
48-
)"
49-
50-
toFile="${markdown}.bak"
51-
gawk -v toFile="$toFile" -v toc="$toc" '
52-
BEGIN { printf "" > toFile }
68+
gawk -v tocFile="$markdown.toc" '
5369
/^<!-- AUTOGENERATED TOC -->$/ {
5470
inToc = !inToc
5571
seenToc = 1
5672
if (inToc) {
57-
print >> toFile
58-
print "" >> toFile
59-
print toc >> toFile
60-
print "" >> toFile
61-
print >> toFile
73+
print
74+
print ""
75+
# Emit the generated TOC by reading tocFile directly with getline rather than system("cat " tocFile): the path is never re-parsed by a shell, so spaces or metacharacters in the markdown path cannot break or hijack the command.
while ((getline tocLine < tocFile) > 0) print tocLine
# close so the file is read from the start again if another TOC marker pair appears later
close(tocFile)
76+
# no need for another newline because tocFile should already end with one
77+
print
6278
}
6379
next
6480
}
65-
!inToc { print >> toFile }
66-
END { if (!seenToc) { close(toFile); printf "" > toFile } }
67-
' "$markdown"
81+
!inToc { print }
82+
' "$markdown" > "$markdown.bak"
6883

69-
if [ -s "$toFile" ]; then
70-
mv "$toFile" "$markdown"
71-
else
72-
rm "$toFile"
73-
echo "$toc"
74-
fi
84+
mv -f "$markdown.bak" "$markdown"
85+
rm -f "$markdown.toc"

0 commit comments

Comments
 (0)