@@ -4,71 +4,82 @@ set -Eeuo pipefail
# Name this script was invoked as; interpolated into the usage text below.
self="$(basename "$0")"

# usage: print invocation help (including the clobber warning) to stdout.
# The argument check later in the script redirects it to stderr itself.
usage() {
	# "<<-" strips leading tabs (tabs only, not spaces), which is what lets the
	# heredoc body and its terminator be indented with the function body.
	cat <<-EOU
		usage: $self path/to/markdown.md
		   eg: $self README.md

		WARNING: this will *always* clobber any path/to/markdown.md.{toc,bak} while processing; use with caution!
	EOU
}
1413
# First positional argument: the markdown file to process. The ":-" default
# yields an empty string when no argument was given, so that "set -u" does not
# abort before the usage text can be shown.
markdown="${1:-}"

# "shift" fails when there was no argument at all; "-s" rejects paths that do
# not exist as well as zero-length files (an empty file has nothing to index).
if ! shift || [ ! -s "$markdown" ]; then
	usage >&2
	exit 1
fi
1716
# see https://gist.github.com/tianon/75e267d9137b1c2978031b66b3a98987 for an insane test case for this (with several rough edges)

# Generate the table of contents for "$markdown" into "$markdown.toc".
#
# The file is read line by line as raw strings (--raw-input --null-input plus
# "inputs") and folded into one state object:
#   .toc     the TOC text built so far (one numbered list item per heading)
#   .ignore  true while inside a ``` fenced code block
#   .levels  per-heading-depth counters used for the "N." list numbers
#   .seen    anchors already emitted, used to de-duplicate repeated headings
# Each emitted line is (depth-1) tabs, then "N.", a tab and "[heading](#anchor)".
# jq prints the final string followed by a newline, so the output file ends
# with a blank line.
#
# NOTE: the jq program is a single-quoted shell string; the only way to get a
# literal single quote into it is the '"'"' sequence used in the punctuation
# class below, so comments inside the program must not contain apostrophes.
jq --raw-input --null-input --raw-output '
	reduce inputs as $line ({ toc: "" };
		# every code fence toggles "ignore": comments in code blocks look like headings but are not
		if $line | test("^```") then
			.ignore |= not
		else . end
		| if .ignore then . else
			(
				# capture emits nothing for a non-heading line, which the alternative turns into null
				$line
				| capture("^(?<hash>#+)[[:space:]]*(?<heading>.*?)[[:space:]]*$")
				// null
			) as $cap
			| if $cap then
				($cap.hash | length) as $level
				# bump the counter for this depth and reset every deeper counter
				| .levels[$level] += 1
				| .levels |= (.[range($level+1; length)] = 0)
				| (
					$cap.heading
					| ascii_downcase
					# https://github.com/thlorenz/anchor-markdown-header/blob/6b9bc1c902e48942666859fb6f795d91cbfd48e7/anchor-markdown-header.js#L33-L48
					| gsub(" "; "-")
					# escape codes (commented out because this is not something GitHub strips, although it *does* strip % which is not included below, so that is added here)
					#| gsub("%[abcdef0-9]{2}"; ""; "i")
					| gsub("%"; "")
					# single chars that are removed
					| gsub("[\\\\/?!:\\[\\]`.,()*\"'"'"';{}+=<>~$|#@&–—]"; "")
					# CJK punctuations that are removed (these are the fullwidth forms; ASCII brackets or an ASCII hyphen here would change the meaning of the class)
					| gsub("[。?!,、;:“”【】()〔〕[]﹃﹄“ ”‘’﹁﹂—…-~《》〈〉「」]"; "")
					# Strip emojis (*technically* this is way too aggressive and will strip out *all* UTF-8, but 🤷)
					| (split("") | map(select(utf8bytelength == 1)) | join(""))
					# TODO Strip embedded markdown formatting
				) as $anchor
				# handle repetition (same end anchor)
				| (
					(.seen // []) as $seen
					| first(
						# this 1000 limits how many repeated headings we can have, but 1000 of the exact same header text seems pretty generous 🙊
						$anchor + (range(1000) | if . > 0 then "-\(.)" else "" end)
						| select(IN($seen[]) | not)
					)
					// error("repetition level too deep on #\($anchor) (\($line)) at line \(input_line_number)")
				) as $finalAnchor
				# the alternative covers top-level headings, where repeating the tab zero times does not give a usable string
				| .toc += "\("\t" * ($level-1) // "")\(.levels[$level]).\t[\($cap.heading)](#\($finalAnchor))\n"
				| .seen += [ $finalAnchor ]
			else . end
		end
	)
	| .toc
' "$markdown" > "$markdown.toc"
49-
# Write a copy of "$markdown" to "$markdown.bak" in which everything between a
# pair of "<!-- AUTOGENERATED TOC -->" marker lines is replaced by the freshly
# generated "$markdown.toc". Lines outside the markers are copied unchanged; a
# document without markers is copied as-is.
#
# NOTE: the gawk program is a single-quoted shell string, so comments inside
# it must not contain apostrophes.
gawk -v tocFile="$markdown.toc" '
	/^<!-- AUTOGENERATED TOC -->$/ {
		# each marker line flips the state: opening marker enters the TOC, closing marker leaves it
		inToc = !inToc
		if (inToc) {
			print
			print ""
			# read the TOC with getline rather than system("cat " tocFile): the
			# path never passes through a shell, so spaces or shell
			# metacharacters in the file name can neither break the command nor
			# be executed
			while ((getline tocLine < tocFile) > 0) {
				print tocLine
			}
			# rewind so that a second marker pair in the same document gets the TOC too
			close(tocFile)
			# no need for another newline because tocFile should already end with one
			print
		}
		# marker lines themselves are only ever emitted by the branch above (old TOC body and closing marker are dropped)
		next
	}
	!inToc { print }
' "$markdown" > "$markdown.bak"
6883
# Replace the original document with the rewritten copy, then remove the
# scratch TOC file. "-f" keeps both steps quiet and non-interactive.
mv -f "$markdown.bak" "$markdown"
rm -f "$markdown.toc"
0 commit comments