22-- Copyright (C) 2023 Posit Software, PBC
33
44function quarto_ast_pipeline ()
5- local function warn_on_stray_triple_colons ()
6- return {
7- Str = function (el )
8- if string.match (el .text , " :::(:*)" ) then
9- local error_message =
10- " \n The following string was found in the document: " .. el .text ..
11- " \n\n This usually indicates a problem with a fenced div in the document. Please check the document for errors."
12- warn (error_message )
5+ local patterns = require (" modules/patterns" )
6+ local constants = require (" modules/constants" )
7+
8+ local function astpipeline_process_tables ()
-- Walks `text` one byte at a time, tracking whether the current position is
-- inside a `<...>` span. Outside such a span, each byte matching the space
-- literal closes the current segment: the text since `lastchange` is pushed
-- to `parts`, followed by a replacement literal. Bytes between `<` and the
-- next `>` are never split and are copied through as part of a segment.
-- Returns the segments joined with `table.concat`.
--
-- NOTE(review): in this view every string literal shows a space right after
-- its opening quote (e.g. ' <'), and both the replacement literal and the
-- concat separator display as a single space. Presumably these are rendering
-- artifacts (the replacement may have been an HTML entity that got decoded)
-- -- confirm the exact literals against the repository before relying on them.
9+ local function replace_spaces_not_in_tags (text )
10+ local parts = {}
11+ local intag = false
-- index of the first byte not yet copied into `parts`
12+ local lastchange = 1
13+ for i = 1 , # text do
14+ local char = text :sub (i , i )
15+ if not intag then
16+ if char == ' <' then
17+ intag = true
18+ elseif char == ' ' then
-- flush the segment before this byte, then emit the replacement for it
19+ table.insert (parts , text :sub (lastchange , i - 1 ))
20+ table.insert (parts , ' ' )
21+ lastchange = i + 1
22+ end
23+ else
24+ if char == ' >' then
25+ intag = false
26+ end
27+ end
28+ end
-- whatever follows the last replaced byte (possibly the empty string)
29+ table.insert (parts , text :sub (lastchange ))
30+ return table.concat (parts , ' ' )
31+ end
32+
--- Annotate raw table HTML before it is handed to pandoc.
-- Rewrites `th` start/end tags into `td` tags carrying a
-- `data-quarto-table-cell-role="th"` marker, and tags every `table` start
-- tag with `data-quarto-postprocess="true"`.
-- @param src string HTML text of a table
-- @return string the rewritten HTML (a single value; gsub's count is dropped)
local function preprocess_table_text(src)
  -- Pattern-based HTML rewriting is fragile in general, but these rules are
  -- safe as long as nobody uses `quarto-` as a prefix for dataset attributes
  -- in their tables. Tag syntax reference:
  --   * https://www.w3.org/html/wg/spec/syntax.html#start-tags
  --   * https://www.w3.org/html/wg/spec/syntax.html#end-tags
  -- Each pattern requires whitespace or `>` right after the tag name, so
  -- longer names such as `thead` are left alone; the captured character is
  -- put back via %1.
  local rewrites = {
    { "<th([%s>])", "<td data-quarto-table-cell-role=\"th\"%1" },
    { "</th([%s>])", "</td%1" },
    { "<table([%s>])", "<table data-quarto-postprocess=\"true\"%1" },
  }

  local result = src
  for _, rule in ipairs(rewrites) do
    -- plain assignment keeps only gsub's first return value
    result = result:gsub(rule[1], rule[2])
  end
  return result
end
--- Run Quarto's `juice.ts` script over an HTML fragment and return its output.
-- Long base64 image data URIs are swapped for a placeholder before the text
-- is written to disk and swapped back into the script's output afterwards.
--
-- Fixes relative to the previous version:
--   * on failure the caller now gets back the ORIGINAL `htmltext`; previously
--     the variable had been overwritten with the placeholder-substituted
--     text, so long data URIs were lost whenever juice could not run;
--   * the input file handle is closed (not merely flushed) before the
--     subprocess is started.
--
-- @param htmltext string HTML to process
-- @return string the script's output with data URIs restored, or the
--   unmodified `htmltext` if the command could not be started or failed
local function juice(htmltext)
  return pandoc.system.with_temporary_directory('juice', function(tmpdir)
    -- replace any long data uris with uuids
    local data_uri_uuid = '273dae7e-3633-4385-9b0c-203d2d7a2d37'
    local data_uris = {}
    local data_uri_regex = 'data:image/[a-z]+;base64,[a-zA-Z0-9+/]+=*'
    -- `guarded` is what juice sees; `htmltext` stays untouched so that the
    -- failure paths below can hand it back unchanged
    local guarded = htmltext:gsub(data_uri_regex, function(data_uri)
      -- juice truncates around 15k characters; let's guard any over 2000 characters
      if #data_uri > 2000 then
        data_uris[#data_uris + 1] = data_uri
        return data_uri_uuid
      end
      return data_uri
    end)

    local juice_in = pandoc.path.join({tmpdir, 'juice-in.html'})
    local jin = assert(io.open(juice_in, 'w'))
    jin:write(guarded)
    -- close() flushes and releases the handle before the subprocess reads the file
    jin:close()

    local quarto_path = pandoc.path.join({os.getenv('QUARTO_BIN_PATH'), 'quarto'})
    local script_path = pandoc.path.join({os.getenv('QUARTO_SHARE_PATH'), 'scripts', 'juice.ts'})
    -- NOTE(review): the command line is assembled without quoting, exactly as
    -- before; paths containing spaces would be split by the shell.
    local jout, jerr = io.popen(quarto_path .. ' run ' .. script_path .. ' ' .. juice_in, 'r')
    if not jout then
      quarto.log.error('Running juice failed with message: ' .. (jerr or "Unknown error"))
      return htmltext
    end

    local content = jout:read('a')
    local success, _, exitCode = jout:close()
    -- Check the exit status
    if not success then
      quarto.log.error("Running juice failed with exit code: " .. (exitCode or "unknown exit code"))
      return htmltext
    end

    -- put the guarded data URIs back, in the order they were removed
    local index = 1
    -- the inner gsub escapes `-` so the uuid is matched literally; being a
    -- non-final argument, only its first return value is used
    content = content:gsub(data_uri_uuid:gsub('-', '%%-'), function(_)
      local data_uri = data_uris[index]
      index = index + 1
      return data_uri
    end)
    return content
  end)
end
--- Decide whether a raw block should go through HTML table processing.
-- @param el element to inspect
-- @return `true` when `el` is raw HTML, does not contain the
--   disable-table-processing comment, and matches `patterns.html_table`;
--   `nil` in every other case
local function should_handle_raw_html_as_table(el)
  -- The opt-out comment is motivated by
  -- https://github.com/quarto-dev/quarto-cli/issues/8670 and
  -- https://quarto.org/docs/authoring/tables.html#library-authors
  --
  -- If we have a raw html table in a format that doesn't handle raw_html,
  -- pandoc is asked to parse the table into a proper AST table block.
  -- We're already at a state of sin here, cf https://stackoverflow.com/a/1732454
  -- but this is important enough to do a little more work anyway.
  local wanted = _quarto.format.isRawHtml(el)
    and not string.find(el.text, patterns.html_disable_table_processing_comment)
    and string.find(el.text, patterns.html_table) ~= nil
  if wanted then
    return true
  end
  return nil
end
--- Convert the HTML tables inside a raw block into pandoc Table blocks.
-- The text is scanned for `<table ...>` / `</table>` pairs (nested tables are
-- tracked with a depth counter). Each outermost table is preprocessed, parsed
-- with pandoc and, when usable, emitted as an AST block; text around the
-- tables is emitted as RawBlocks in `el.format`. For Typst output the text is
-- first passed through `juice`.
--
-- Fixes relative to the previous version:
--   * a table start tag that is never closed before the scan runs out (e.g.
--     a trailing `<table>`) left `cursor` unchanged and the outer loop spun
--     forever; scanning now stops there;
--   * on an unterminated table the remainder used to collapse to the final
--     character of the text; the whole unprocessed remainder is now kept as
--     raw HTML;
--   * a table that was skipped (disabled or unparseable) used to vanish,
--     together with the text before it, whenever another table in the same
--     block was converted; it is now kept as raw HTML.
--
-- @param el raw block whose `text` contains table HTML
-- @return a scaffold element wrapping the resulting blocks, or `nil` when no
--   table was converted (the caller then keeps `el` as is)
local function handle_raw_html_as_table(el)
  local eltext
  if _quarto.format.isTypstOutput() then
    eltext = juice(el.text)
  else
    eltext = el.text
  end

  local blocks = pandoc.Blocks({})
  local start = patterns.html_start_tag("table")
  local finish = patterns.html_end_tag("table")

  local len = string.len(eltext)
  -- `cursor` is where the next search begins; `pending_from` is the first
  -- character that has not been emitted into `blocks` yet. They differ only
  -- after a table has been skipped.
  local cursor = 1
  local pending_from = 1

  while cursor < len do
    -- find the first table start tag
    local i, j = string.find(eltext, start, cursor)
    if i == nil then
      -- no more tables
      break
    end

    -- find the end tag that closes this (outermost) table
    local cursor_2 = j + 1
    local nesting = 1
    local closed_at = nil
    while cursor_2 < len do
      local k1, l1 = string.find(eltext, start, cursor_2)
      local k2, l2 = string.find(eltext, finish, cursor_2)
      if k1 == nil and k2 == nil then
        break
      end
      if k1 and (k2 == nil or k1 < k2) then
        nesting = nesting + 1
        cursor_2 = l1 + 1
      else
        -- not k1 or k1 >= k2
        nesting = nesting - 1
        cursor_2 = l2 + 1
        if nesting == 0 then
          closed_at = l2
          break
        end
      end
    end

    if closed_at == nil then
      -- unterminated table: nothing more can be converted; everything from
      -- `pending_from` onwards is emitted as raw HTML below
      break
    end

    -- Pandoc's HTML-table -> AST-table processing does not faithfully respect
    -- `th` vs `td` elements. This causes some complex tables to be parsed
    -- incorrectly, and changes which elements are `th` and which are `td`.
    --
    -- For quarto, this change is not acceptable because `td` and `th` have
    -- accessibility impacts (see https://github.com/rstudio/gt/issues/678 for
    -- a concrete request from a screen-reader user).
    --
    -- To preserve td and th, we replace `th` elements in the input with
    -- `td data-quarto-table-cell-role="th"`.
    --
    -- Then, in our HTML postprocessor, we replace th elements with td (since
    -- pandoc chooses to set some of its table elements as th, even if the
    -- original table requested not to), and replace those annotated td
    -- elements with th elements.
    local tableHtml = preprocess_table_text(string.sub(eltext, i, closed_at))
    local tableDoc = pandoc.read(tableHtml, "html+raw_html")
    local found = false
    local skip = false
    _quarto.traverser(tableDoc, {
      -- parameter named `tbl` so the standard `table` library is not shadowed
      Table = function(tbl)
        found = true
        if tbl.attributes[constants.kDisableProcessing] == "true" then
          skip = true
        end
      end,
    })
    if #tableDoc.blocks ~= 1 then
      warn("Unable to parse table from raw html block: skipping.")
      skip = true
    end
    if found and not skip then
      flags.has_tables = true
      if pending_from ~= i then
        blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, pending_from, i - 1)))
      end
      blocks:insert(tableDoc.blocks[1])
      pending_from = closed_at + 1
    end
    -- always search past this table, whether or not it was converted
    cursor = closed_at + 1
  end

  if #blocks == 0 then
    return nil
  end
  if pending_from <= len then
    blocks:insert(pandoc.RawBlock(el.format, string.sub(eltext, pending_from)))
  end
  return _quarto.ast.scaffold_element(blocks)
end
--- Decide whether a raw block should go through `<pre>` tag processing.
-- @param pre_tag element to inspect
-- @return `true` when `pre_tag` is raw HTML whose text matches
--   `patterns.html_pre_tag`; `nil` otherwise
local function should_handle_raw_html_as_pre_tag(pre_tag)
  if _quarto.format.isRawHtml(pre_tag)
      and string.find(pre_tag.text, patterns.html_pre_tag) ~= nil then
    return true
  end
  return nil
end
-- Turns the contents of a raw HTML `<pre>` element into AST content wrapped
-- in a Div that carries a monospace `font-family` style attribute.
-- For Typst output the text is first passed through `juice`.
-- Returns a scaffold element holding that Div, or nil (after logging an
-- error) when no `<pre ...>...</pre>` span is found in the text.
--
-- NOTE(review): string literals in this view show a space after the opening
-- quote and after escape sequences; presumably rendering artifacts -- confirm
-- the exact literals (pattern, newline, Attr identifier) against the repository.
220+ local function handle_raw_html_as_pre_tag (pre_tag )
221+ local eltext
222+ if (_quarto .format .isTypstOutput ()) then
223+ eltext = juice (pre_tag .text )
224+ else
225+ eltext = pre_tag .text
226+ end
227+
-- `(.*)` is greedy, so the capture runs up to the LAST `</pre>` in the text
228+ local preContentHtml = eltext :match (' <pre[^>]*>(.*)</pre>' )
229+ if not preContentHtml then
-- the log call includes at most the first 1700 characters of the text
230+ quarto .log .error (' no pre' , eltext :sub (1 ,1700 ))
231+ return nil
232+ end
-- rewrite spaces outside tags, then turn newlines into explicit line breaks
-- before the text is parsed as HTML
233+ preContentHtml = replace_spaces_not_in_tags (preContentHtml )
-- assignment keeps only gsub's first return value (the string)
234+ preContentHtml = preContentHtml :gsub (' \n ' ,' <br />' )
235+ local preDoc = pandoc .read (preContentHtml , " html+raw_html" )
-- NOTE(review): only the first parsed block is kept; any further blocks are
-- discarded, and `block1` is nil if parsing produced no blocks -- confirm
-- that neither case can occur for the inputs this is called with.
236+ local block1 = preDoc .blocks [1 ]
237+ local blocks = pandoc .Blocks ({
238+ pandoc .Div (block1 , pandoc .Attr (" " , {}, {style = ' font-family: Inconsolata, Roboto Mono, Courier New;' }))
239+ })
240+ return _quarto .ast .scaffold_element (blocks )
241+ end
242+
-- Document-level switches: each is true exactly when the corresponding
-- parameter is set to "none".
local disable_html_table_processing = param(constants.kHtmlTableProcessing) == "none"
local disable_html_pre_tag_processing = param(constants.kHtmlPreTagProcessing) == "none"
251+
-- Top-down filter. A `false` second return value from `Div` is passed along
-- with the replacement so that the returned content is not descended into.
local filter = {
  traversal = 'topdown',

  -- Handles the per-div processing attributes.
  Div = function(div)
    local table_mode = div.attributes[constants.kHtmlTableProcessing]
    if table_mode and not disable_html_table_processing then
      -- catch and remove attributes
      div.attributes[constants.kHtmlTableProcessing] = nil
      if table_mode == "none" then
        if div.attr == pandoc.Attr() then
          -- if no other attributes are set on the div, don't keep it
          return div.content, false
        end
        -- when set on a div like div.cell-output-display, we need to keep it
        return div, false
      end
    end

    if not disable_html_pre_tag_processing
        and div.attributes[constants.kHtmlPreTagProcessing] == "parse" then
      -- only a div whose first child is a raw block qualifies
      local pre_tag = quarto.utils.match('Div/[1]/RawBlock')(div)
      if pre_tag and should_handle_raw_html_as_pre_tag(pre_tag) then
        return handle_raw_html_as_pre_tag(pre_tag), false
      end
    end
  end,

  -- Converts raw HTML tables unless table processing is disabled.
  RawBlock = function(el)
    if should_handle_raw_html_as_table(el) and not disable_html_table_processing then
      return handle_raw_html_as_table(el)
    end
    return nil
  end
};
286+
-- table_merge_raw_html from table-rawhtml.lua
-- For HTML output only: consecutive raw-HTML blocks whose text matches
-- `patterns.html_table_tag_name` are merged into a single "html" RawBlock
-- (texts joined with a newline). The block list is compacted in place.
if _quarto.format.isHtmlOutput() then
  filter.Blocks = function(blocks)
    local buffered = pandoc.List()
    local write_at = 1

    -- store `block` at the next output slot
    local function store(block)
      blocks[write_at] = block
      write_at = write_at + 1
    end

    -- emit the buffered raw texts (if any) as one merged RawBlock
    local function flush_buffered()
      if #buffered > 0 then
        store(pandoc.RawBlock("html", table.concat(buffered, "\n")))
        buffered = pandoc.List()
      end
    end

    -- the write position never overtakes the read position, so rewriting
    -- `blocks` while iterating over it is safe
    for _, el in ipairs(blocks) do
      if _quarto.format.isRawHtml(el) and el.text:find(patterns.html_table_tag_name) then
        buffered:insert(el.text)
      else
        flush_buffered()
        store(el)
      end
    end
    flush_buffered()

    -- drop the leftover tail after compaction
    for i = write_at, #blocks do
      blocks[i] = nil
    end
    return blocks
  end
end
318+
319+ return filter
16320 end
321+
17322 return {
18- { name = " normalize-table-merge-raw-html " ,
19- filter = table_merge_raw_html (),
323+ { name = " astpipeline-process-tables " ,
324+ filter = astpipeline_process_tables (),
20325 traverser = ' jog' ,
21326 },
22-
327+ -- { name = "normalize-table-merge-raw-html",
328+ -- filter = table_merge_raw_html(),
329+ -- traverser = 'jog',
330+ -- },
23331 -- this filter can't be combined with others because it's top-down processing.
24332 -- unfortunate.
25- { name = " normalize-html-table-processing" ,
26- filter = parse_html_tables (),
27- traverser = ' jog' ,
28- },
333+ -- { name = "normalize-html-table-processing",
334+ -- filter = parse_html_tables(),
335+ -- traverser = 'jog',
336+ -- },
29337
30338 { name = " normalize-combined-1" ,
31339 filter = combineFilters ({
@@ -34,10 +342,20 @@ function quarto_ast_pipeline()
34342 parse_extended_nodes (),
35343 code_filename (),
36344 normalize_fixup_data_uri_image_extension (),
37- warn_on_stray_triple_colons (),
345+ {
346+ Str = function (el )
347+ if string.match (el .text , " :::(:*)" ) then
348+ local error_message =
349+ " \n The following string was found in the document: " .. el .text ..
350+ " \n\n This usually indicates a problem with a fenced div in the document. Please check the document for errors."
351+ warn (error_message )
352+ end
353+ end
354+ },
38355 }),
39356 traverser = ' jog' ,
40357 },
358+
41359 {
42360 name = " normalize-combine-2" ,
43361 filter = combineFilters ({
0 commit comments