Skip to content

Commit 5aaa6e0

Browse files
committed
perf - combine another astpipeline filter step
1 parent d9319b8 commit 5aaa6e0

File tree

2 files changed

+337
-19
lines changed

2 files changed

+337
-19
lines changed

src/resources/filters/normalize/astpipeline.lua

Lines changed: 335 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,338 @@
22
-- Copyright (C) 2023 Posit Software, PBC
33

44
function quarto_ast_pipeline()
5-
local function warn_on_stray_triple_colons()
6-
return {
7-
Str = function(el)
8-
if string.match(el.text, ":::(:*)") then
9-
local error_message =
10-
"\nThe following string was found in the document: " .. el.text ..
11-
"\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors."
12-
warn(error_message)
5+
local patterns = require("modules/patterns")
6+
local constants = require("modules/constants")
7+
8+
local function astpipeline_process_tables()
9+
-- Replace every space that sits outside of an HTML tag with `&nbsp;`.
--
-- A tag is considered to start at '<' and to end at the next '>'; spaces
-- between those two characters (e.g. between attributes) are left alone.
--
-- @param text string of HTML
-- @return a new string with the out-of-tag spaces replaced
local function replace_spaces_not_in_tags(text)
  local pieces = {}
  local inside_tag = false
  -- index of the first character not yet copied into `pieces`
  local segment_start = 1
  local pos = 1
  local total = #text
  while pos <= total do
    local ch = text:sub(pos, pos)
    if inside_tag then
      -- only a closing '>' ends the tag
      inside_tag = ch ~= '>'
    elseif ch == '<' then
      inside_tag = true
    elseif ch == ' ' then
      pieces[#pieces + 1] = text:sub(segment_start, pos - 1)
      pieces[#pieces + 1] = '&nbsp;'
      segment_start = pos + 1
    end
    pos = pos + 1
  end
  -- whatever follows the last replaced space
  pieces[#pieces + 1] = text:sub(segment_start)
  return table.concat(pieces, '')
end
32+
33+
local function preprocess_table_text(src)
34+
-- html manipulation with regex is fraught, but these specific
35+
-- changes are safe assuming that no one is using quarto- as
36+
-- a prefix for dataset attributes in the tables.
37+
-- See
38+
-- * https://www.w3.org/html/wg/spec/syntax.html#start-tags
39+
-- * https://www.w3.org/html/wg/spec/syntax.html#end-tags
40+
41+
src = src:gsub("<th([%s>])", "<td data-quarto-table-cell-role=\"th\"%1")
42+
src = src:gsub("</th([%s>])", "</td%1")
43+
src = src:gsub("<table([%s>])", "<table data-quarto-postprocess=\"true\"%1")
44+
45+
return src
46+
end
47+
local function juice(htmltext)
48+
-- return htmltext
49+
return pandoc.system.with_temporary_directory('juice', function(tmpdir)
50+
-- replace any long data uris with uuids
51+
local data_uri_uuid = '273dae7e-3633-4385-9b0c-203d2d7a2d37'
52+
local data_uris = {}
53+
local data_uri_regex = 'data:image/[a-z]+;base64,[a-zA-Z0-9+/]+=*'
54+
htmltext = htmltext:gsub(data_uri_regex, function(data_uri)
55+
-- juice truncates around 15k characters; let's guard any over 2000 characters
56+
if #data_uri > 2000 then
57+
table.insert(data_uris, data_uri)
58+
return data_uri_uuid
59+
else
60+
return data_uri
61+
end
62+
end)
63+
local juice_in = pandoc.path.join({tmpdir, 'juice-in.html'})
64+
local jin = assert(io.open(juice_in, 'w'))
65+
jin:write(htmltext)
66+
jin:flush()
67+
local quarto_path = pandoc.path.join({os.getenv('QUARTO_BIN_PATH'), 'quarto'})
68+
local jout, jerr = io.popen(quarto_path .. ' run ' ..
69+
pandoc.path.join({os.getenv('QUARTO_SHARE_PATH'), 'scripts', 'juice.ts'}) .. ' ' ..
70+
juice_in, 'r')
71+
if not jout then
72+
quarto.log.error('Running juice failed with message: ' .. (jerr or "Unknown error"))
73+
return htmltext
74+
end
75+
local content = jout:read('a')
76+
local success, _, exitCode = jout:close()
77+
-- Check the exit status
78+
if not success then
79+
quarto.log.error("Running juice failed with exit code: " .. (exitCode or "unknown exit code"))
80+
return htmltext
81+
else
82+
local index = 1
83+
content = content:gsub(data_uri_uuid:gsub('-', '%%-'), function(_)
84+
local data_uri = data_uris[index]
85+
index = index + 1
86+
return data_uri
87+
end)
88+
return content
89+
end
90+
end)
91+
end
92+
local function should_handle_raw_html_as_table(el)
93+
if not _quarto.format.isRawHtml(el) then
94+
return nil
95+
end
96+
-- See https://github.com/quarto-dev/quarto-cli/issues/8670
97+
-- and https://quarto.org/docs/authoring/tables.html#library-authors
98+
-- for the motivation for this change.
99+
if string.find(el.text, patterns.html_disable_table_processing_comment) then
100+
return nil
101+
end
102+
-- if we have a raw html table in a format that doesn't handle raw_html
103+
-- then have pandoc parse the table into a proper AST table block
104+
-- we're already at a state of sin here, cf https://stackoverflow.com/a/1732454
105+
-- but this is important enough to do a little more work anyway
106+
local pat = patterns.html_table
107+
local i, j = string.find(el.text, pat)
108+
if i == nil then
109+
return nil
110+
end
111+
return true
112+
end
113+
local function handle_raw_html_as_table(el)
114+
local eltext
115+
if(_quarto.format.isTypstOutput()) then
116+
eltext = juice(el.text)
117+
else
118+
eltext = el.text
119+
end
120+
121+
local blocks = pandoc.Blocks({})
122+
local start = patterns.html_start_tag("table")
123+
local finish = patterns.html_end_tag("table")
124+
125+
126+
local cursor = 1
127+
local len = string.len(eltext)
128+
129+
while cursor < len do
130+
-- find the first table start tag
131+
local i, j = string.find(eltext, start, cursor)
132+
if i == nil then
133+
-- no more tables
134+
break
135+
end
136+
137+
-- find the closest table end tag
138+
-- that produces a valid table parsing from Pandoc
139+
local cursor_2 = j + 1
140+
local nesting = 1
141+
while cursor_2 < len do
142+
local k1, l1 = string.find(eltext, start, cursor_2)
143+
local k2, l2 = string.find(eltext, finish, cursor_2)
144+
if k1 == nil and k2 == nil then
145+
cursor = len
146+
break
147+
end
148+
if k1 and (k2 == nil or k1 < k2) then
149+
nesting = nesting + 1
150+
cursor_2 = l1 + 1
151+
else
152+
-- not k1 or k1 >= k2
153+
nesting = nesting - 1
154+
cursor_2 = l2 + 1
155+
if nesting == 0 then
156+
local tableHtml = string.sub(eltext, i, l2)
157+
-- Pandoc's HTML-table -> AST-table processing does not faithfully respect
158+
-- `th` vs `td` elements. This causes some complex tables to be parsed incorrectly,
159+
-- and changes which elements are `th` and which are `td`.
160+
--
161+
-- For quarto, this change is not acceptable because `td` and `th` have
162+
-- accessibility impacts (see https://github.com/rstudio/gt/issues/678 for a concrete
163+
-- request from a screen-reader user).
164+
--
165+
-- To preserve td and th, we replace `th` elements in the input with
166+
-- `td data-quarto-table-cell-role="th"`.
167+
--
168+
-- Then, in our HTML postprocessor,
169+
-- we replace th elements with td (since pandoc chooses to set some of its table
170+
-- elements as th, even if the original table requested not to), and replace those
171+
-- annotated td elements with th elements.
172+
tableHtml = preprocess_table_text(tableHtml)
173+
local tableDoc = pandoc.read(tableHtml, "html+raw_html")
174+
local found = false
175+
local skip = false
176+
_quarto.traverser(tableDoc, {
177+
Table = function(table)
178+
found = true
179+
if table.attributes[constants.kDisableProcessing] == "true" then
180+
skip = true
181+
end
182+
end,
183+
})
184+
if #tableDoc.blocks ~= 1 then
185+
warn("Unable to parse table from raw html block: skipping.")
186+
skip = true
187+
end
188+
if found and not skip then
189+
flags.has_tables = true
190+
if cursor ~= i then
191+
blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, cursor, i - 1)))
192+
end
193+
blocks:insert(tableDoc.blocks[1])
194+
end
195+
cursor = l2 + 1
196+
break
197+
end
13198
end
199+
end
200+
end
201+
if #blocks == 0 then
202+
return nil
203+
end
204+
if cursor > 1 and cursor <= len then
205+
blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, cursor)))
206+
end
207+
return _quarto.ast.scaffold_element(blocks)
208+
end
209+
-- Decide whether a raw block should be parsed as a `<pre>` tag.
--
-- @param pre_tag the element to inspect
-- @return true when `pre_tag` is raw HTML whose text matches
--         `patterns.html_pre_tag`; nil otherwise
local function should_handle_raw_html_as_pre_tag(pre_tag)
  local matches = _quarto.format.isRawHtml(pre_tag)
    and string.find(pre_tag.text, patterns.html_pre_tag) ~= nil
  if matches then
    return true
  end
  return nil
end
220+
-- Convert the contents of a raw HTML `<pre>` element into parsed blocks
-- wrapped in a Div with a monospace font style.
--
-- Spaces outside of tags become `&nbsp;` and newlines become `<br />` before
-- the content is parsed with pandoc's HTML reader.
--
-- @param pre_tag raw HTML element (its `text` field is read)
-- @return a scaffold element holding the styled Div, or nil when no
--         `<pre>...</pre>` is found or its content parses to no blocks
--         (the caller then leaves the original element untouched)
local function handle_raw_html_as_pre_tag(pre_tag)
  local eltext
  if _quarto.format.isTypstOutput() then
    eltext = juice(pre_tag.text)
  else
    eltext = pre_tag.text
  end

  local preContentHtml = eltext:match('<pre[^>]*>(.*)</pre>')
  if not preContentHtml then
    quarto.log.error('no pre', eltext:sub(1,1700))
    return nil
  end
  preContentHtml = replace_spaces_not_in_tags(preContentHtml)
  preContentHtml = preContentHtml:gsub('\n','<br />')
  local preDoc = pandoc.read(preContentHtml, "html+raw_html")
  if #preDoc.blocks == 0 then
    -- e.g. an empty <pre></pre>: nothing to wrap
    return nil
  end
  -- wrap every parsed block, not just the first, so no content is dropped
  local blocks = pandoc.Blocks({
    pandoc.Div(preDoc.blocks, pandoc.Attr("", {}, {style = 'font-family: Inconsolata, Roboto Mono, Courier New;'}))
  })
  return _quarto.ast.scaffold_element(blocks)
end
242+
243+
-- document-level switches: a value of "none" turns the processing off
local disable_html_table_processing = param(constants.kHtmlTableProcessing) == "none"
local disable_html_pre_tag_processing = param(constants.kHtmlPreTagProcessing) == "none"

local filter = {
  traverse = 'topdown',

  -- Handles the per-div processing attributes. A second return value of
  -- `false` is passed back whenever the div is resolved here.
  Div = function(div)
    local table_processing = div.attributes[constants.kHtmlTableProcessing]
    if table_processing and not disable_html_table_processing then
      -- catch and remove attributes
      div.attributes[constants.kHtmlTableProcessing] = nil
      if table_processing == "none" then
        if div.attr == pandoc.Attr() then
          -- if no other attributes are set on the div, don't keep it
          return div.content, false
        end
        -- when set on a div like div.cell-output-display, we need to keep it
        return div, false
      end
    end

    local pre_tag_processing = div.attributes[constants.kHtmlPreTagProcessing]
    if pre_tag_processing == "parse" and not disable_html_pre_tag_processing then
      local pre_tag = quarto.utils.match('Div/[1]/RawBlock')(div)
      if pre_tag and should_handle_raw_html_as_pre_tag(pre_tag) then
        return handle_raw_html_as_pre_tag(pre_tag), false
      end
    end
  end,

  -- Raw HTML blocks that contain tables are parsed into AST tables unless
  -- table processing is disabled for the document.
  RawBlock = function(el)
    if disable_html_table_processing or not should_handle_raw_html_as_table(el) then
      return nil
    end
    return handle_raw_html_as_table(el)
  end
}
286+
287+
-- table_merge_raw_html from table-rawhtml.lua
288+
if _quarto.format.isHtmlOutput() then
  -- Merge each run of consecutive raw HTML blocks whose text matches
  -- `patterns.html_table_tag_name` into a single RawBlock (texts joined with
  -- newlines). The list is compacted in place and returned.
  filter.Blocks = function(blocks)
    local pending_raw = pandoc.List()
    -- next slot to write in `blocks`; never ahead of the element being read
    local write_idx = 1

    -- emit the accumulated raw texts, if any, as one RawBlock
    local function flush_pending()
      if #pending_raw > 0 then
        blocks[write_idx] = pandoc.RawBlock("html", table.concat(pending_raw, "\n"))
        write_idx = write_idx + 1
        pending_raw = pandoc.List()
      end
    end

    for _, el in ipairs(blocks) do
      local is_table_raw = _quarto.format.isRawHtml(el)
        and el.text:find(patterns.html_table_tag_name)
      if is_table_raw then
        pending_raw:insert(el.text)
      else
        flush_pending()
        blocks[write_idx] = el
        write_idx = write_idx + 1
      end
    end
    flush_pending()

    -- drop the leftover tail now that the list has been compacted
    for i = #blocks, write_idx, -1 do
      blocks[i] = nil
    end
    return blocks
  end
end
318+
319+
return filter
16320
end
321+
17322
return {
18-
{ name = "normalize-table-merge-raw-html",
19-
filter = table_merge_raw_html(),
323+
{ name = "astpipeline-process-tables",
324+
filter = astpipeline_process_tables(),
20325
traverser = 'jog',
21326
},
22-
327+
-- { name = "normalize-table-merge-raw-html",
328+
-- filter = table_merge_raw_html(),
329+
-- traverser = 'jog',
330+
-- },
23331
-- this filter can't be combined with others because it's top-down processing.
24332
-- unfortunate.
25-
{ name = "normalize-html-table-processing",
26-
filter = parse_html_tables(),
27-
traverser = 'jog',
28-
},
333+
-- { name = "normalize-html-table-processing",
334+
-- filter = parse_html_tables(),
335+
-- traverser = 'jog',
336+
-- },
29337

30338
{ name = "normalize-combined-1",
31339
filter = combineFilters({
@@ -34,10 +342,20 @@ function quarto_ast_pipeline()
34342
parse_extended_nodes(),
35343
code_filename(),
36344
normalize_fixup_data_uri_image_extension(),
37-
warn_on_stray_triple_colons(),
345+
{
346+
Str = function(el)
347+
if string.match(el.text, ":::(:*)") then
348+
local error_message =
349+
"\nThe following string was found in the document: " .. el.text ..
350+
"\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors."
351+
warn(error_message)
352+
end
353+
end
354+
},
38355
}),
39356
traverser = 'jog',
40357
},
358+
41359
{
42360
name = "normalize-combine-2",
43361
filter = combineFilters({

src/resources/filters/quarto-pre/parsefiguredivs.lua

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -802,7 +802,7 @@ function forward_cell_subcaps()
802802
if type(subcaps) == "table" then
803803
nsubcaps = #subcaps
804804
end
805-
div.content = _quarto.ast.walk(div.content, {
805+
div.content = _quarto.traverser(div.content, {
806806
Div = function(subdiv)
807807
if type(nsubcaps) == "number" and index > nsubcaps or not subdiv.classes:includes("cell-output-display") then
808808
return nil
@@ -815,7 +815,7 @@ function forward_cell_subcaps()
815815
end
816816
end
817817
-- now we attempt to insert subcaptions where it makes sense for them to be inserted
818-
subdiv.content = _quarto.ast.walk(subdiv.content, {
818+
subdiv.content = _quarto.traverser(subdiv.content, {
819819
Table = function(pandoc_table)
820820
pandoc_table.caption.long = quarto.utils.as_blocks(get_subcap())
821821
pandoc_table.identifier = div.identifier .. "-" .. tostring(index)

0 commit comments

Comments
 (0)