11-- html-cleanup-v2.lua
2- -- HTML → commonmark_x 终极清理版(支持任意嵌套 callout + 完整反转义)
2+ -- HTML → commonmark_x 终极清理版(支持任意嵌套 callout + 完整反转义 + 元数据提取 )
33
44local function unescape_math (s )
55 if not s then return " " end
6- s = pandoc .utils .stringify (s )
6+ if type (s ) ~= " string" then
7+ s = pandoc .utils .stringify (s )
8+ end
79 -- HTML entities 完整解码
810 s = s :gsub (" &" , " &" )
911 :gsub (" <" , " <" )
@@ -36,7 +38,37 @@ local function max_fence_length(str)
3638 return maxn
3739end
3840
39- -- 1. Inline arithmatex: [\$F_1\'\$] → $F_1'$
--- 1. Metadata extraction and wrapper cleanup (document-level filter).
--
-- Scans the top-level raw HTML blocks for `data-title="..."` and
-- `data-url="..."` attributes and copies the captured values into the
-- document metadata as `title` and `url`, so they can be written out as
-- frontmatter. When several blocks carry a value, the last one wins.
-- Raw HTML blocks that mention `mkdocs-fragment` or contain `</article>`
-- are then dropped; every other block is kept in its original order.
--
-- @param doc the pandoc document handed to the filter
-- @return the same document with updated `meta` and `blocks`
function Pandoc(doc)
  local meta = doc.meta
  local kept = pandoc.List()

  for _, el in ipairs(doc.blocks) do
    local drop = false

    if el.t == "RawBlock" and el.format == "html" then
      local text = el.text

      -- Heuristic: the attributes may sit on any raw HTML block, so every
      -- one of them is probed. Only double-quoted, non-empty values match.
      local title = text:match('data%-title="([^"]+)"')
      local url = text:match('data%-url="([^"]+)"')
      if title then meta.title = title end
      if url then meta.url = url end

      -- Plain-text search (4th argument `true`): the markers are literal
      -- strings, so no Lua pattern escaping is needed.
      drop = text:find("mkdocs-fragment", 1, true) ~= nil
        or text:find("</article>", 1, true) ~= nil
    end

    if not drop then
      kept:insert(el)
    end
  end

  doc.blocks = kept
  return doc
end
70+
71+ -- 2. Inline arithmatex: [\$F_1\'\$] → $F_1'$
4072function Span (el )
4173 if el .classes and el .classes :includes (" arithmatex" ) then
4274 local text = pandoc .utils .stringify (el .content )
@@ -46,7 +78,7 @@ function Span(el)
4678 end
4779end
4880
49- -- 2 . Block arithmatex + Admonition + details
81+ -- 3. Block arithmatex + Admonition + details
5082function Div (el )
5183 -- arithmatex display math
5284 if el .classes and el .classes :includes (" arithmatex" ) then
@@ -64,12 +96,12 @@ function Div(el)
6496 break
6597 end
6698 end
67- if not callout_type and el .classes and el .classes :includes (" admonition" ) then
99+ if not callout_type and el .classes and ( el .classes :includes (" admonition" ) or el . classes : includes ( " details " ) ) then
68100 callout_type = " note"
69101 end
70102
71103 if callout_type then
72- -- 提取标题(admonition-title / Header / <summary>)
104+ -- 提取标题
73105 local title = nil
74106 local body = pandoc .List {}
75107 for _ , blk in ipairs (el .content ) do
@@ -92,10 +124,7 @@ function Div(el)
92124 end
93125 end
94126
95- -- 写入内部内容(已经过 filter 处理)
96127 local inner_md = pandoc .write (pandoc .Pandoc (body ), " commonmark_x" )
97-
98- -- 动态 fence 长度:比内部最大多 1,保证嵌套绝对安全
99128 local fence_len = math.max (3 , max_fence_length (inner_md ) + 1 )
100129 local fence = string.rep (" :" , fence_len )
101130
@@ -111,22 +140,8 @@ function Div(el)
111140 return el
112141end
113142
114- -- 3. 清除所有标题的 {#_1} {#_2} 等 id
115- function Header (el )
116- el .identifier = " "
117- return el
118- end
119-
120- -- 4. 处理被 skipped 的 <details>
121- function RawBlock (el )
122- if el .format == " html" and el .text :match (" ^%s*<details" ) then
123- return pandoc .Div (pandoc .List {}, pandoc .Attr (" " , {" details" }))
124- end
125- end
126-
-- Filter passes handed back to pandoc, in list order: the document-level
-- metadata/cleanup pass first, then inline spans, then divs.
local filter_passes = {
  { Pandoc = Pandoc },
  { Span = Span },
  { Div = Div },
}

return filter_passes
0 commit comments