Skip to content

Commit 8704176

Browse files
authored
Use a safer, two-stage transformation in readqmd.lua (#13792)
Use a safer two-stage transformation to send shortcodes through the Markdown parsing layer. This keeps us from clobbering the existing Markdown structure when shortcodes are invoked inside link targets. Pandoc 3.7 and later are stricter about where spaces can appear inside link and image targets, and this transformation should now work in that setting.
1 parent d96d9da commit 8704176

File tree

2 files changed

+126
-28
lines changed

2 files changed

+126
-28
lines changed

src/resources/pandoc/datadir/lpegshortcode.lua

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,12 +313,64 @@ local function wrap_lpeg_match(pattern, txt)
313313
return txt
314314
end
315315

316+
--- Encode a string as uppercase hexadecimal, two digits per byte.
-- @tparam string str text to encode
-- @treturn string the hex encoding; empty when `str` is empty
local function string_to_hex(str)
  local digits = {}
  for i = 1, #str do
    digits[i] = string.format('%02X', str:byte(i))
  end
  return table.concat(digits)
end
322+
323+
-- Sentinel UUID that prefixes every hex-encoded shortcode token.
local md_shortcode_2_uuid = "b58fc729-690b-4000-b19f-365a4093b2ff"
-- Lua pattern matching the sentinel followed by its "-" separator. It is
-- derived from the literal above (every punctuation character is escaped
-- with "%") so the pattern and the literal cannot drift apart.
local md_shortcode_2_uuid_pattern = md_shortcode_2_uuid:gsub("%p", "%%%0") .. "%-"
325+
--- Build the UUID-guarded token for an escaped shortcode.
-- Wraps `s` in the `{{{<` / `>}}}` delimiters, hex-encodes the result and
-- emits it as `<uuid>-<hex>-`.
-- @tparam string s inner text handed over by the shortcode parser
-- @treturn string the guarded token
local function md_escaped_shortcode_2_fun(s)
  local encoded = string_to_hex("{{{<" .. s .. ">}}}")
  return md_shortcode_2_uuid .. "-" .. encoded .. "-"
end
333+
334+
--- Build the UUID-guarded token for a regular shortcode.
-- Reassembles the raw shortcode text from the pieces handed over by the
-- parser, hex-encodes it and emits it as `<uuid>-<hex>-`.
-- @param open  piece placed first in the raw text
-- @param space piece placed right after `open`
-- @param lst   sequence of pieces; each is run through `unshortcode` and the
--              match result is used when there is one, the piece itself otherwise
-- @param close piece placed last in the raw text
-- @treturn string the guarded token
local function md_shortcode_2_fun(open, space, lst, close)
  local middle = {}
  for i = 1, #lst do
    local piece = lst[i]
    middle[i] = unshortcode:match(piece) or piece
  end
  local raw = open .. space .. table.concat(middle) .. close
  return md_shortcode_2_uuid .. "-" .. string_to_hex(raw) .. "-"
end
348+
349+
-- This second transformation, which produces a plain UUID-guarded string,
350+
-- is designed to survive the pandoc markdown reader barrier under Pandoc 3.7 and later.
351+
-- We still need the first shortcode transformation to actually convert
352+
-- to a span when it's safe to do so, but this transformation
353+
-- is safe to use in all contexts (including link and image targets).
354+
local md_shortcode_2
do
  -- Pattern for the attribute block
  --   {.hidden .quarto-markdown-envelope-contents render-id="..."}
  -- which is handed to the parser as its `ignore_pattern`.
  local envelope_open = lpeg.P("{.hidden .quarto-markdown-envelope-contents render-id=\"")
  local envelope_close = lpeg.P("\"}")
  local envelope_attrs = envelope_open * (lpeg.P(1) - envelope_close)^1 * envelope_close

  md_shortcode_2 = make_shortcode_parser({
    escaped = md_escaped_shortcode_2_fun,
    string = md_string_param,
    keyvalue = md_keyvalue_param,
    shortcode = md_shortcode_2_fun,
    ignore_pattern = envelope_attrs
  })
end
361+
316362
return {
317363
lpegs = {
318364
md_shortcode = md_shortcode,
365+
md_shortcode_2 = md_shortcode_2,
366+
md_shortcode_2_uuid = md_shortcode_2_uuid_pattern,
319367
unshortcode = unshortcode -- for undoing shortcodes in non-markdown contexts
320368
},
321369

370+
parse_md_shortcode_2 = function(txt)
371+
return wrap_lpeg_match(md_shortcode_2, txt)
372+
end,
373+
322374
parse_md_shortcode = function(txt)
323375
return wrap_lpeg_match(md_shortcode, txt)
324376
end,

src/resources/pandoc/datadir/readqmd.lua

Lines changed: 74 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -113,23 +113,20 @@ local function unescape_invalid_tags(str, tags)
113113
return str
114114
end
115115

116-
local function urldecode(url)
117-
if url == nil then
118-
return
119-
end
120-
url = url:gsub("+", " ")
121-
url = url:gsub("%%(%x%x)", function(x)
122-
return string.char(tonumber(x, 16))
123-
end)
124-
url = url:gsub('%&quot%;', '"')
125-
return url
116+
--- Decode a hexadecimal string back into the bytes it encodes.
-- Characters are consumed two at a time; when the input has odd length the
-- final unpaired character is copied through unchanged.
-- @tparam string hex hexadecimal text (either letter case)
-- @treturn string the decoded bytes
local function hex_to_string(hex)
  local decoded = {}
  local len = #hex
  for i = 1, len - 1, 2 do
    decoded[#decoded + 1] = string.char(tonumber(hex:sub(i, i + 1), 16))
  end
  if len % 2 == 1 then
    decoded[#decoded + 1] = hex:sub(len)
  end
  return table.concat(decoded)
end
127122

128123
local function readqmd(txt, opts)
124+
local uuid_pattern = "b58fc729%-690b%-4000%-b19f%-365a4093b2ff%-([A-Fa-f0-9]+)%-"
129125
local tags
130126
txt = md_fenced_div.attempt_to_fix_fenced_div(txt)
131127
txt, tags = escape_invalid_tags(txt)
132-
txt = md_shortcode.parse_md_shortcode(txt)
128+
-- Replace shortcodes with UUID-guarded hex tokens before the text is handed
-- to pandoc.read. No debug output here: printing the pre-processed document
-- would write the whole input to stdout on every read.
txt = md_shortcode.parse_md_shortcode_2(txt)
133130
local flavor = {
134131
format = "markdown",
135132
extensions = {},
@@ -151,17 +148,27 @@ local function readqmd(txt, opts)
151148
-- so we need to undo that damage here
152149

153150
local unshortcode_text = function (c)
154-
if c.text:match("data%-is%-shortcode%=%\"1%\"") then
155-
c.text = md_shortcode.unparse_md_shortcode(c.text)
156-
end
151+
c.text = c.text:gsub(uuid_pattern, hex_to_string)
157152
return c
158153
end
159154

160155
local function filter_attrs(el)
161156
for k,v in pairs(el.attributes) do
162-
if type(v) == "string" and v:match("data%-is%-shortcode%=%\"1%\"") then
163-
local new_v = md_shortcode.unparse_md_shortcode(v)
164-
el.attributes[k] = new_v
157+
if type(v) == "string" then
158+
local new_str = v:gsub(uuid_pattern, hex_to_string)
159+
-- We assign only when the value actually changed, to partially work around
160+
-- what appears to be a foundational problem with Pandoc's Lua API
161+
-- while accessing attributes with repeated keys.
162+
-- Quarto is still going to be broken for the case
163+
-- where there are shortcodes inside values of attributes with
164+
-- repeated keys:
165+
--
166+
-- []{k='{{< meta k1 >}}' k='{{< meta k2 >}}'}
167+
--
168+
-- But I don't know how to work around this.
169+
if new_str ~= v then
170+
el.attributes[k] = new_str
171+
end
165172
end
166173
end
167174
return el
@@ -170,9 +177,7 @@ local function readqmd(txt, opts)
170177
local doc = pandoc.read(txt or "", flavor, opts):walk {
171178
CodeBlock = function (cb)
172179
cb.classes = cb.classes:map(restore_invalid_tags)
173-
if cb.text:match("data%-is%-shortcode%=%\"1%\"") then
174-
cb.text = md_shortcode.unparse_md_shortcode(cb.text)
175-
end
180+
cb.text = cb.text:gsub(uuid_pattern, hex_to_string)
176181
cb.text = unescape_invalid_tags(cb.text, tags)
177182
return cb
178183
end,
@@ -184,20 +189,61 @@ local function readqmd(txt, opts)
184189
Div = filter_attrs,
185190
Link = function (l)
186191
l = filter_attrs(l)
187-
if l.target:match("data%-is%-shortcode%=%%221%%22") then
188-
l.target = md_shortcode.unparse_md_shortcode(urldecode(l.target))
189-
return l
190-
end
192+
l.target = l.target:gsub(uuid_pattern, hex_to_string)
191193
return l
192194
end,
193195
Image = function (i)
194196
i = filter_attrs(i)
195-
if i.src:match("data%-is%-shortcode%=%%221%%22") then
196-
i.src = md_shortcode.unparse_md_shortcode(urldecode(i.src))
197-
return i
198-
end
197+
-- Replace UUID-encoded shortcodes in i.src
198+
i.src = i.src:gsub(uuid_pattern, hex_to_string)
199199
return i
200200
end,
201+
-- Expand UUID-guarded shortcode tokens found inside a Str node.
-- Returns nil (node left untouched) when the sentinel UUID is absent;
-- otherwise returns an Inlines list in which each token is replaced by the
-- inlines obtained from parsing its decoded shortcode, with the surrounding
-- text kept as Str nodes.
Str = function(str_node)
  local text = str_node.text
  -- Cheap plain-text probe before doing any pattern matching.
  if text:find("b58fc729-690b-4000-b19f-365a4093b2ff", 1, true) == nil then
    return nil
  end

  local pieces = pandoc.Inlines{}
  local cursor = 1
  local first, last, hex = text:find(uuid_pattern, cursor)

  while first do
    -- Literal text sitting between the previous token and this one.
    if first > cursor then
      table.insert(pieces, pandoc.Str(text:sub(cursor, first - 1)))
    end

    -- Decode the token, turn the shortcode into markdown span syntax and
    -- let pandoc parse that into inlines.
    local span_md = md_shortcode.parse_md_shortcode(hex_to_string(hex)) or ""
    local parsed = pandoc.read(span_md, "markdown")
    local head = parsed.blocks[1]
    local inlines = head and head.content or pandoc.Inlines{}
    for _, inline in ipairs(inlines) do
      table.insert(pieces, inline)
    end

    -- Resume scanning right after the token.
    cursor = last + 1
    first, last, hex = text:find(uuid_pattern, cursor)
  end

  -- Whatever follows the final token.
  if cursor <= #text then
    table.insert(pieces, pandoc.Str(text:sub(cursor)))
  end

  return pieces
end
201247
}
202248
return doc
203249
end

0 commit comments

Comments
 (0)