diff --git a/src/resources/filters/normalize/astpipeline.lua b/src/resources/filters/normalize/astpipeline.lua
index 1345086fa07..3b95ac03f8e 100644
--- a/src/resources/filters/normalize/astpipeline.lua
+++ b/src/resources/filters/normalize/astpipeline.lua
@@ -2,31 +2,329 @@
-- Copyright (C) 2023 Posit Software, PBC
function quarto_ast_pipeline()
- local function warn_on_stray_triple_colons()
- return {
- Str = function(el)
- if string.match(el.text, ":::(:*)") then
- local error_message =
- "\nThe following string was found in the document: " .. el.text ..
- "\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors."
- warn(error_message)
+ local patterns = require("modules/patterns")
+ local constants = require("modules/constants")
+
+ local function astpipeline_process_tables()
+ local function replace_spaces_not_in_tags(text)
+ local parts = {}
+ local intag = false
+ local lastchange = 1
+ for i = 1, #text do
+ local char = text:sub(i, i)
+ if not intag then
+ if char == '<' then
+ intag = true
+ elseif char == ' ' then
+ table.insert(parts, text:sub(lastchange, i-1))
+ table.insert(parts, ' ')
+ lastchange = i+1
+ end
+ else
+ if char == '>' then
+ intag = false
+ end
+ end
+ end
+ table.insert(parts, text:sub(lastchange))
+ return table.concat(parts, '')
+ end
+
+ local function preprocess_table_text(src)
+ -- html manipulation with regex is fraught, but these specific
+ -- changes are safe assuming that no one is using quarto- as
+ -- a prefix for dataset attributes in the tables.
+ -- See
+ -- * https://www.w3.org/html/wg/spec/syntax.html#start-tags
+ -- * https://www.w3.org/html/wg/spec/syntax.html#end-tags
+
+ src = src:gsub("
])", " | ])", " | ])", " 2000 then
+ table.insert(data_uris, data_uri)
+ return data_uri_uuid
+ else
+ return data_uri
+ end
+ end)
+ local juice_in = pandoc.path.join({tmpdir, 'juice-in.html'})
+ local jin = assert(io.open(juice_in, 'w'))
+ jin:write(htmltext)
+ jin:flush()
+ local quarto_path = pandoc.path.join({os.getenv('QUARTO_BIN_PATH'), 'quarto'})
+ local jout, jerr = io.popen(quarto_path .. ' run ' ..
+ pandoc.path.join({os.getenv('QUARTO_SHARE_PATH'), 'scripts', 'juice.ts'}) .. ' ' ..
+ juice_in, 'r')
+ if not jout then
+ quarto.log.error('Running juice failed with message: ' .. (jerr or "Unknown error"))
+ return htmltext
+ end
+ local content = jout:read('a')
+ local success, _, exitCode = jout:close()
+ -- Check the exit status
+ if not success then
+ quarto.log.error("Running juice failed with exit code: " .. (exitCode or "unknown exit code"))
+ return htmltext
+ else
+ local index = 1
+ content = content:gsub(data_uri_uuid:gsub('-', '%%-'), function(_)
+ local data_uri = data_uris[index]
+ index = index + 1
+ return data_uri
+ end)
+ return content
+ end
+ end)
+ end
+ local function should_handle_raw_html_as_table(el)
+ if not _quarto.format.isRawHtml(el) then
+ return nil
+ end
+ -- See https://github.com/quarto-dev/quarto-cli/issues/8670
+ -- and https://quarto.org/docs/authoring/tables.html#library-authors
+ -- for the motivation for this change.
+ if string.find(el.text, patterns.html_disable_table_processing_comment) then
+ return nil
+ end
+ -- if we have a raw html table in a format that doesn't handle raw_html
+ -- then have pandoc parse the table into a proper AST table block
+ -- we're already at a state of sin here, cf https://stackoverflow.com/a/1732454
+ -- but this is important enough to do a little more work anyway
+ local pat = patterns.html_table
+ local i, j = string.find(el.text, pat)
+ if i == nil then
+ return nil
+ end
+ return true
+ end
+ local function handle_raw_html_as_table(el)
+ local eltext
+ if(_quarto.format.isTypstOutput()) then
+ eltext = juice(el.text)
+ else
+ eltext = el.text
+ end
+
+ local blocks = pandoc.Blocks({})
+ local start = patterns.html_start_tag("table")
+ local finish = patterns.html_end_tag("table")
+
+
+ local cursor = 1
+ local len = string.len(eltext)
+
+ while cursor < len do
+ -- find the first table start tag
+ local i, j = string.find(eltext, start, cursor)
+ if i == nil then
+ -- no more tables
+ break
+ end
+
+ -- find the closest table end tag
+ -- that produces a valid table parsing from Pandoc
+ local cursor_2 = j + 1
+ local nesting = 1
+ while cursor_2 < len do
+ local k1, l1 = string.find(eltext, start, cursor_2)
+ local k2, l2 = string.find(eltext, finish, cursor_2)
+ if k1 == nil and k2 == nil then
+ cursor = len
+ break
+ end
+ if k1 and (k2 == nil or k1 < k2) then
+ nesting = nesting + 1
+ cursor_2 = l1 + 1
+ else
+ -- not k1 or k1 >= k2
+ nesting = nesting - 1
+ cursor_2 = l2 + 1
+ if nesting == 0 then
+ local tableHtml = string.sub(eltext, i, l2)
+ -- Pandoc's HTML-table -> AST-table processing does not faithfully respect
+ -- `th` vs `td` elements. This causes some complex tables to be parsed incorrectly,
+ -- and changes which elements are `th` and which are `td`.
+ --
+ -- For quarto, this change is not acceptable because `td` and `th` have
+ -- accessibility impacts (see https://github.com/rstudio/gt/issues/678 for a concrete
+ -- request from a screen-reader user).
+ --
+ -- To preserve td and th, we replace `th` elements in the input with
+ -- `td data-quarto-table-cell-role="th"`.
+ --
+ -- Then, in our HTML postprocessor,
+ -- we replace th elements with td (since pandoc chooses to set some of its table
+ -- elements as th, even if the original table requested not to), and replace those
+ -- annotated td elements with th elements.
+ tableHtml = preprocess_table_text(tableHtml)
+ local tableDoc = pandoc.read(tableHtml, "html+raw_html")
+ local found = false
+ local skip = false
+ _quarto.traverser(tableDoc, {
+ Table = function(table)
+ found = true
+ if table.attributes[constants.kDisableProcessing] == "true" then
+ skip = true
+ end
+ end,
+ })
+ if #tableDoc.blocks ~= 1 then
+ warn("Unable to parse table from raw html block: skipping.")
+ skip = true
+ end
+ if found and not skip then
+ flags.has_tables = true
+ if cursor ~= i then
+ blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, cursor, i - 1)))
+ end
+ blocks:insert(tableDoc.blocks[1])
+ end
+ cursor = l2 + 1
+ break
+ end
end
+ end
+ end
+ if #blocks == 0 then
+ return nil
+ end
+ if cursor > 1 and cursor <= len then
+ blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, cursor)))
+ end
+ return _quarto.ast.scaffold_element(blocks)
+ end
+ local function should_handle_raw_html_as_pre_tag(pre_tag)
+ if not _quarto.format.isRawHtml(pre_tag) then
+ return nil
end
- }
+ local pat = patterns.html_pre_tag
+ local i, j = string.find(pre_tag.text, pat)
+ if i == nil then
+ return nil
+ end
+ return true
+ end
+ local function handle_raw_html_as_pre_tag(pre_tag)
+ local eltext
+ if(_quarto.format.isTypstOutput()) then
+ eltext = juice(pre_tag.text)
+ else
+ eltext = pre_tag.text
+ end
+
+ local preContentHtml = eltext:match(']*>(.*)
')
+ if not preContentHtml then
+ quarto.log.error('no pre', eltext:sub(1,1700))
+ return nil
+ end
+ preContentHtml = replace_spaces_not_in_tags(preContentHtml)
+ preContentHtml = preContentHtml:gsub('\n','
')
+ local preDoc = pandoc.read(preContentHtml, "html+raw_html")
+ local block1 = preDoc.blocks[1]
+ local blocks = pandoc.Blocks({
+ pandoc.Div(block1, pandoc.Attr("", {}, {style = 'font-family: Inconsolata, Roboto Mono, Courier New;'}))
+ })
+ return _quarto.ast.scaffold_element(blocks)
+ end
+
+ local disable_html_table_processing = false
+ local disable_html_pre_tag_processing = false
+ if param(constants.kHtmlTableProcessing) == "none" then
+ disable_html_table_processing = true
+ end
+ if param(constants.kHtmlPreTagProcessing) == "none" then
+ disable_html_pre_tag_processing = true
+ end
+
+ local filter = {
+ traverse = 'topdown',
+ Div = function(div)
+ if div.attributes[constants.kHtmlTableProcessing] and not disable_html_table_processing then
+ -- catch and remove attributes
+ local htmlTableProcessing = div.attributes[constants.kHtmlTableProcessing]
+ div.attributes[constants.kHtmlTableProcessing] = nil
+ if htmlTableProcessing == "none" then
+ if div.attr == pandoc.Attr() then
+ -- if no other attributes are set on the div, don't keep it
+ return div.content, false
+ else
+ -- when set on a div like div.cell-output-display, we need to keep it
+ return div, false
+ end
+ end
+ end
+ if div.attributes[constants.kHtmlPreTagProcessing] and not disable_html_pre_tag_processing then
+ local htmlPreTagProcessing = div.attributes[constants.kHtmlPreTagProcessing]
+ if htmlPreTagProcessing == "parse" then
+ local pre_tag = quarto.utils.match('Div/[1]/RawBlock')(div)
+ if pre_tag and should_handle_raw_html_as_pre_tag(pre_tag) then
+ return handle_raw_html_as_pre_tag(pre_tag), false
+ end
+ end
+ end
+ end,
+ RawBlock = function(el)
+ if not should_handle_raw_html_as_table(el) or disable_html_table_processing then
+ return nil
+ end
+ return handle_raw_html_as_table(el)
+ end
+ };
+
+ -- table_merge_raw_html from table-rawhtml.lua
+ if _quarto.format.isHtmlOutput() then
+ filter.Blocks = function(blocks)
+ local pending_raw = pandoc.List()
+ local next_element_idx = 1
+ for _, el in ipairs(blocks) do
+ if _quarto.format.isRawHtml(el) and
+ el.text:find(patterns.html_table_tag_name) then
+ pending_raw:insert(el.text)
+ else
+ if next(pending_raw) then
+ blocks[next_element_idx] =
+ pandoc.RawBlock("html", table.concat(pending_raw, "\n"))
+ pending_raw = pandoc.List()
+ next_element_idx = next_element_idx + 1
+ end
+ blocks[next_element_idx] = el
+ next_element_idx = next_element_idx + 1
+ end
+ end
+ if #pending_raw > 0 then
+ blocks[next_element_idx] =
+ pandoc.RawBlock("html", table.concat(pending_raw, "\n"))
+ next_element_idx = next_element_idx + 1
+ end
+ for i = next_element_idx, #blocks do
+ blocks[i] = nil
+ end
+ return blocks
+ end
+ end
+
+ return filter
end
- return {
- { name = "normalize-table-merge-raw-html",
- filter = table_merge_raw_html(),
- traverser = 'jog',
- },
- -- this filter can't be combined with others because it's top-down processing.
- -- unfortunate.
- { name = "normalize-html-table-processing",
- filter = parse_html_tables(),
+ return {
+ { name = "astpipeline-process-tables",
+ filter = astpipeline_process_tables(),
traverser = 'jog',
},
-
+
{ name = "normalize-combined-1",
filter = combineFilters({
extract_latex_quartomarkdown_commands(),
@@ -34,10 +332,20 @@ function quarto_ast_pipeline()
parse_extended_nodes(),
code_filename(),
normalize_fixup_data_uri_image_extension(),
- warn_on_stray_triple_colons(),
+ {
+ Str = function(el)
+ if string.match(el.text, ":::(:*)") then
+ local error_message =
+ "\nThe following string was found in the document: " .. el.text ..
+ "\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors."
+ warn(error_message)
+ end
+ end
+ },
}),
traverser = 'jog',
},
+
{
name = "normalize-combine-2",
filter = combineFilters({
diff --git a/src/resources/filters/quarto-pre/parsefiguredivs.lua b/src/resources/filters/quarto-pre/parsefiguredivs.lua
index a066abe76af..fc17ccb9042 100644
--- a/src/resources/filters/quarto-pre/parsefiguredivs.lua
+++ b/src/resources/filters/quarto-pre/parsefiguredivs.lua
@@ -802,7 +802,7 @@ function forward_cell_subcaps()
if type(subcaps) == "table" then
nsubcaps = #subcaps
end
- div.content = _quarto.ast.walk(div.content, {
+ div.content = _quarto.traverser(div.content, {
Div = function(subdiv)
if type(nsubcaps) == "number" and index > nsubcaps or not subdiv.classes:includes("cell-output-display") then
return nil
@@ -815,7 +815,7 @@ function forward_cell_subcaps()
end
end
-- now we attempt to insert subcaptions where it makes sense for them to be inserted
- subdiv.content = _quarto.ast.walk(subdiv.content, {
+ subdiv.content = _quarto.traverser(subdiv.content, {
Table = function(pandoc_table)
pandoc_table.caption.long = quarto.utils.as_blocks(get_subcap())
pandoc_table.identifier = div.identifier .. "-" .. tostring(index)