|
2 | 2 | -- Copyright (C) 2023 Posit Software, PBC |
3 | 3 |
|
4 | 4 | function quarto_ast_pipeline() |
5 | | - local function warn_on_stray_triple_colons() |
6 | | - return { |
7 | | - Str = function(el) |
8 | | - if string.match(el.text, ":::(:*)") then |
9 | | - local error_message = |
10 | | - "\nThe following string was found in the document: " .. el.text .. |
11 | | - "\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors." |
12 | | - warn(error_message) |
| 5 | + local patterns = require("modules/patterns") |
| 6 | + local constants = require("modules/constants") |
| 7 | + |
| 8 | + local function astpipeline_process_tables() |
--- Replace every space that lies outside an HTML tag with `&nbsp;`.
-- Characters between a `<` and the next `>` are treated as tag text and
-- copied through unchanged, so attribute lists keep their ordinary spaces.
-- An unterminated `<` leaves the remainder of the string untouched.
-- NOTE(review): the replacement literal is restored to the `&nbsp;` entity;
-- as a plain space the function would be a no-op. Presumably the entity is
-- what keeps the HTML reader from collapsing runs of spaces -- confirm.
-- @tparam string text HTML fragment
-- @treturn string the fragment with out-of-tag spaces replaced
local function replace_spaces_not_in_tags(text)
  local parts = {}
  local intag = false
  -- index of the first character not yet copied into `parts`
  local lastchange = 1
  for i = 1, #text do
    local char = text:sub(i, i)
    if intag then
      if char == '>' then
        intag = false
      end
    elseif char == '<' then
      intag = true
    elseif char == ' ' then
      parts[#parts + 1] = text:sub(lastchange, i - 1)
      parts[#parts + 1] = '&nbsp;'
      lastchange = i + 1
    end
  end
  -- copy whatever follows the last replaced space
  parts[#parts + 1] = text:sub(lastchange)
  return table.concat(parts)
end
| 32 | + |
--- Annotate raw HTML table markup before it is parsed.
-- Rewrites `th` start/end tags to `td` (start tags carry
-- `data-quarto-table-cell-role="th"`) and marks every `table` start tag with
-- `data-quarto-postprocess="true"`.
--
-- HTML manipulation with patterns is fraught, but these specific rewrites
-- are safe assuming that no one is using `quarto-` as a prefix for dataset
-- attributes in the tables. See
--   * https://www.w3.org/html/wg/spec/syntax.html#start-tags
--   * https://www.w3.org/html/wg/spec/syntax.html#end-tags
-- @tparam string src HTML text of a table
-- @treturn string the rewritten HTML
local function preprocess_table_text(src)
  -- { pattern, replacement } pairs, applied in order; the captured
  -- character (whitespace or `>`) is what ends the tag name, so e.g.
  -- `<thead>` is not touched.
  local rewrites = {
    { "<th([%s>])", "<td data-quarto-table-cell-role=\"th\"%1" },
    { "</th([%s>])", "</td%1" },
    { "<table([%s>])", "<table data-quarto-postprocess=\"true\"%1" },
  }

  local result = src
  for _, rule in ipairs(rewrites) do
    -- assignment keeps only the string, dropping gsub's match count
    result = string.gsub(result, rule[1], rule[2])
  end
  return result
end
--- Run the HTML through Quarto's `juice.ts` script and return its output.
-- The text is written to a file in a temporary directory and the script is
-- invoked through `io.popen` using the QUARTO_BIN_PATH and QUARTO_SHARE_PATH
-- environment variables.
--
-- Long `data:image/...;base64,` URIs are swapped for a placeholder UUID
-- before the call and restored, in order, in the script's output.
--
-- On any failure an error is logged and the ORIGINAL, unmodified input is
-- returned (previously the placeholder-substituted text leaked out on the
-- failure paths).
-- @tparam string htmltext HTML to process
-- @treturn string processed HTML, or `htmltext` unchanged on failure
local function juice(htmltext)
  return pandoc.system.with_temporary_directory('juice', function(tmpdir)
    -- replace any long data uris with uuids
    local data_uri_uuid = '273dae7e-3633-4385-9b0c-203d2d7a2d37'
    local data_uris = {}
    local data_uri_regex = 'data:image/[a-z]+;base64,[a-zA-Z0-9+/]+=*'
    -- keep `htmltext` intact so the failure paths can hand it back as-is
    local guarded = htmltext:gsub(data_uri_regex, function(data_uri)
      -- juice truncates around 15k characters; let's guard any over 2000 characters
      if #data_uri > 2000 then
        table.insert(data_uris, data_uri)
        return data_uri_uuid
      else
        return data_uri
      end
    end)

    local juice_in = pandoc.path.join({tmpdir, 'juice-in.html'})
    local jin = assert(io.open(juice_in, 'w'))
    jin:write(guarded)
    -- close (not just flush) so the handle is released before the child
    -- process reads the file
    jin:close()

    local quarto_path = pandoc.path.join({os.getenv('QUARTO_BIN_PATH'), 'quarto'})
    local jout, jerr = io.popen(quarto_path .. ' run ' ..
        pandoc.path.join({os.getenv('QUARTO_SHARE_PATH'), 'scripts', 'juice.ts'}) .. ' ' ..
        juice_in, 'r')
    if not jout then
      quarto.log.error('Running juice failed with message: ' .. (jerr or "Unknown error"))
      return htmltext
    end

    local content = jout:read('a')
    local success, _, exitCode = jout:close()
    -- Check the exit status
    if not success then
      quarto.log.error("Running juice failed with exit code: " .. (exitCode or "unknown exit code"))
      return htmltext
    end

    -- `-` is a pattern magic character, so escape it before searching;
    -- the parentheses drop gsub's second return value (the match count)
    local uuid_pattern = (data_uri_uuid:gsub('%-', '%%-'))
    local index = 1
    local restored = content:gsub(uuid_pattern, function(_)
      local data_uri = data_uris[index]
      index = index + 1
      return data_uri
    end)
    return restored
  end)
end
--- Decide whether a raw element should be re-parsed as an HTML table.
-- Returns `true` only when the element is raw HTML, does not contain the
-- comment that disables table processing, and matches
-- `patterns.html_table`; returns `nil` in every other case.
-- @param el element to inspect
-- @return `true` or `nil`
local function should_handle_raw_html_as_table(el)
  if not _quarto.format.isRawHtml(el) then
    return nil
  end

  -- See https://github.com/quarto-dev/quarto-cli/issues/8670
  -- and https://quarto.org/docs/authoring/tables.html#library-authors
  -- for the motivation for this opt-out.
  if string.find(el.text, patterns.html_disable_table_processing_comment) then
    return nil
  end

  -- if we have a raw html table in a format that doesn't handle raw_html
  -- then have pandoc parse the table into a proper AST table block
  -- we're already at a state of sin here, cf https://stackoverflow.com/a/1732454
  -- but this is important enough to do a little more work anyway
  if string.find(el.text, patterns.html_table) == nil then
    return nil
  end

  return true
end
--- Convert the HTML tables inside a raw HTML element into AST tables.
-- Scans the text for balanced `<table>` ... `</table>` spans (nested tables
-- are counted so the outermost span is taken), parses each span with
-- `pandoc.read`, and builds a block list in which successfully parsed tables
-- are interleaved with RawBlocks holding the text around them.
--
-- A table is left as raw text when it carries
-- `constants.kDisableProcessing == "true"` or when parsing does not yield
-- exactly one block.
--
-- Fixes relative to the previous version:
--   * an unbalanced/unterminated table no longer loops forever; scanning
--     stops and the remaining text is kept as raw HTML;
--   * a skipped table (and the text before it) is no longer dropped when
--     another table in the same element is converted.
-- @param el raw HTML element (reads `el.text` and `el.format`)
-- @return scaffold element wrapping the blocks, or `nil` when no table was
--   converted
local function handle_raw_html_as_table(el)
  local eltext
  if _quarto.format.isTypstOutput() then
    eltext = juice(el.text)
  else
    eltext = el.text
  end

  local blocks = pandoc.Blocks({})
  local start = patterns.html_start_tag("table")
  local finish = patterns.html_end_tag("table")

  local len = string.len(eltext)
  -- position from which the next table start tag is searched
  local cursor = 1
  -- first character of `eltext` that has not been emitted into `blocks` yet
  local raw_start = 1

  while cursor < len do
    -- find the first table start tag
    local i, j = string.find(eltext, start, cursor)
    if i == nil then
      -- no more tables
      break
    end

    -- find the end tag that balances the start tag found above
    local cursor_2 = j + 1
    local nesting = 1
    local table_end = nil
    while cursor_2 < len do
      local k1, l1 = string.find(eltext, start, cursor_2)
      local k2, l2 = string.find(eltext, finish, cursor_2)
      if k1 == nil and k2 == nil then
        break
      end
      if k1 and (k2 == nil or k1 < k2) then
        nesting = nesting + 1
        cursor_2 = l1 + 1
      else
        -- not k1 or k1 >= k2, so k2 is guaranteed to be set here
        nesting = nesting - 1
        cursor_2 = l2 + 1
        if nesting == 0 then
          table_end = l2
          break
        end
      end
    end

    if table_end == nil then
      -- unbalanced table markup: stop scanning; everything from
      -- `raw_start` on is preserved as raw HTML below
      break
    end

    -- Pandoc's HTML-table -> AST-table processing does not faithfully respect
    -- `th` vs `td` elements. This causes some complex tables to be parsed
    -- incorrectly, and changes which elements are `th` and which are `td`.
    --
    -- For quarto, this change is not acceptable because `td` and `th` have
    -- accessibility impacts (see https://github.com/rstudio/gt/issues/678 for
    -- a concrete request from a screen-reader user).
    --
    -- To preserve td and th, we replace `th` elements in the input with
    -- `td data-quarto-table-cell-role="th"`.
    --
    -- Then, in our HTML postprocessor, we replace th elements with td (since
    -- pandoc chooses to set some of its table elements as th, even if the
    -- original table requested not to), and replace those annotated td
    -- elements with th elements.
    local tableHtml = preprocess_table_text(string.sub(eltext, i, table_end))
    local tableDoc = pandoc.read(tableHtml, "html+raw_html")
    local found = false
    local skip = false
    _quarto.traverser(tableDoc, {
      -- parameter named `tbl` to avoid shadowing the `table` library
      Table = function(tbl)
        found = true
        if tbl.attributes[constants.kDisableProcessing] == "true" then
          skip = true
        end
      end,
    })
    if #tableDoc.blocks ~= 1 then
      warn("Unable to parse table from raw html block: skipping.")
      skip = true
    end
    if found and not skip then
      flags.has_tables = true
      if raw_start < i then
        blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, raw_start, i - 1)))
      end
      blocks:insert(tableDoc.blocks[1])
      raw_start = table_end + 1
    end
    -- a skipped table stays part of the pending raw text: only the search
    -- position moves past it
    cursor = table_end + 1
  end

  if #blocks == 0 then
    return nil
  end
  if raw_start <= len then
    blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, raw_start)))
  end
  return _quarto.ast.scaffold_element(blocks)
end
--- Decide whether a raw element should be re-parsed as a `<pre>` block.
-- Returns `true` when the element is raw HTML whose text matches
-- `patterns.html_pre_tag`, and `nil` otherwise.
-- @param pre_tag element to inspect
-- @return `true` or `nil`
local function should_handle_raw_html_as_pre_tag(pre_tag)
  local is_raw_html = _quarto.format.isRawHtml(pre_tag)
  -- the comparison keeps only the first result of string.find
  if is_raw_html and string.find(pre_tag.text, patterns.html_pre_tag) ~= nil then
    return true
  end
  return nil
end
--- Convert the contents of a raw HTML `<pre>` element into AST blocks.
-- The text between the first `<pre ...>` and the last `</pre>` is extracted,
-- its out-of-tag spaces are passed through `replace_spaces_not_in_tags`,
-- newlines become `<br />`, and the result is parsed with `pandoc.read`.
-- The parsed blocks are wrapped in a Div carrying a monospace `font-family`
-- style.
--
-- All parsed blocks are kept (previously only the first one was, and empty
-- `<pre>` content made `pandoc.Div` receive `nil`).
-- @param pre_tag raw HTML element (reads `pre_tag.text`)
-- @return scaffold element wrapping the Div, or `nil` when no `<pre>`
--   element is found in the text
local function handle_raw_html_as_pre_tag(pre_tag)
  local eltext
  if _quarto.format.isTypstOutput() then
    eltext = juice(pre_tag.text)
  else
    eltext = pre_tag.text
  end

  local preContentHtml = eltext:match('<pre[^>]*>(.*)</pre>')
  if not preContentHtml then
    quarto.log.error('no pre', eltext:sub(1,1700))
    return nil
  end
  preContentHtml = replace_spaces_not_in_tags(preContentHtml)
  -- single assignment keeps the string and drops gsub's match count
  preContentHtml = preContentHtml:gsub('\n','<br />')
  local preDoc = pandoc.read(preContentHtml, "html+raw_html")

  -- pass the whole block list: identical to the old result when exactly one
  -- block is parsed, and safe when there are zero or several
  local blocks = pandoc.Blocks({
    pandoc.Div(preDoc.blocks, pandoc.Attr("", {}, {style = 'font-family: Inconsolata, Roboto Mono, Courier New;'}))
  })
  return _quarto.ast.scaffold_element(blocks)
end
| 242 | + |
-- Document-level switches: each is true exactly when the corresponding
-- parameter is set to "none".
local disable_html_table_processing =
  param(constants.kHtmlTableProcessing) == "none"
local disable_html_pre_tag_processing =
  param(constants.kHtmlPreTagProcessing) == "none"
| 251 | + |
-- Top-down filter. Handlers that return `false` as a second value stop the
-- traversal from descending into the returned element.
local filter = {
  traverse = 'topdown',

  -- Honors the per-div table-processing and pre-tag-processing attributes.
  Div = function(div)
    local table_mode = div.attributes[constants.kHtmlTableProcessing]
    if table_mode and not disable_html_table_processing then
      -- catch and remove the attribute
      div.attributes[constants.kHtmlTableProcessing] = nil
      if table_mode == "none" then
        if div.attr == pandoc.Attr() then
          -- if no other attributes are set on the div, don't keep it
          return div.content, false
        end
        -- when set on a div like div.cell-output-display, we need to keep it
        return div, false
      end
    end

    local pre_mode = div.attributes[constants.kHtmlPreTagProcessing]
    if pre_mode == "parse" and not disable_html_pre_tag_processing then
      local pre_tag = quarto.utils.match('Div/[1]/RawBlock')(div)
      if pre_tag and should_handle_raw_html_as_pre_tag(pre_tag) then
        return handle_raw_html_as_pre_tag(pre_tag), false
      end
    end
  end,

  -- Re-parses raw HTML tables unless table processing is disabled.
  RawBlock = function(el)
    if disable_html_table_processing then
      return nil
    end
    if not should_handle_raw_html_as_table(el) then
      return nil
    end
    return handle_raw_html_as_table(el)
  end
}
| 286 | + |
| 287 | + -- table_merge_raw_html from table-rawhtml.lua |
if _quarto.format.isHtmlOutput() then
  -- Merge each run of consecutive raw HTML blocks whose text matches
  -- `patterns.html_table_tag_name` into a single "html" RawBlock, joining
  -- their texts with newlines. The list is compacted in place and returned.
  filter.Blocks = function(blocks)
    local pending_raw = pandoc.List()
    -- number of slots of `blocks` already rewritten; never runs ahead of
    -- the element currently being read
    local written = 0

    local function emit(block)
      written = written + 1
      blocks[written] = block
    end

    local function flush_pending()
      if #pending_raw > 0 then
        emit(pandoc.RawBlock("html", table.concat(pending_raw, "\n")))
        pending_raw = pandoc.List()
      end
    end

    for _, el in ipairs(blocks) do
      local mergeable = _quarto.format.isRawHtml(el) and
        el.text:find(patterns.html_table_tag_name)
      if mergeable then
        pending_raw:insert(el.text)
      else
        flush_pending()
        emit(el)
      end
    end
    flush_pending()

    -- drop the now-unused tail left over from merging
    for i = written + 1, #blocks do
      blocks[i] = nil
    end
    return blocks
  end
end
| 318 | + |
| 319 | + return filter |
16 | 320 | end |
17 | | - return { |
18 | | - { name = "normalize-table-merge-raw-html", |
19 | | - filter = table_merge_raw_html(), |
20 | | - traverser = 'jog', |
21 | | - }, |
22 | 321 |
|
23 | | - -- this filter can't be combined with others because it's top-down processing. |
24 | | - -- unfortunate. |
25 | | - { name = "normalize-html-table-processing", |
26 | | - filter = parse_html_tables(), |
| 322 | + return { |
| 323 | + { name = "astpipeline-process-tables", |
| 324 | + filter = astpipeline_process_tables(), |
27 | 325 | traverser = 'jog', |
28 | 326 | }, |
29 | | - |
| 327 | + |
30 | 328 | { name = "normalize-combined-1", |
31 | 329 | filter = combineFilters({ |
32 | 330 | extract_latex_quartomarkdown_commands(), |
33 | 331 | forward_cell_subcaps(), |
34 | 332 | parse_extended_nodes(), |
35 | 333 | code_filename(), |
36 | 334 | normalize_fixup_data_uri_image_extension(), |
37 | | - warn_on_stray_triple_colons(), |
| 335 | + { |
| 336 | + Str = function(el) |
| 337 | + if string.match(el.text, ":::(:*)") then |
| 338 | + local error_message = |
| 339 | + "\nThe following string was found in the document: " .. el.text .. |
| 340 | + "\n\nThis usually indicates a problem with a fenced div in the document. Please check the document for errors." |
| 341 | + warn(error_message) |
| 342 | + end |
| 343 | + end |
| 344 | + }, |
38 | 345 | }), |
39 | 346 | traverser = 'jog', |
40 | 347 | }, |
| 348 | + |
41 | 349 | { |
42 | 350 | name = "normalize-combine-2", |
43 | 351 | filter = combineFilters({ |
|
0 commit comments