Skip to content

Commit 2e3bf51

Browse files
cscheid and claude committed
Implement table captions in tree-sitter grammar
Implements support for Pandoc-style table captions that appear after pipe tables, using the tree-sitter grammar for precise parsing.

Table caption syntax:
- Blank line followed by ": caption text"
- Caption must appear immediately after a pipe table
- Only the ': caption' format is supported

Implementation approach:
- Added table_caption rule to tree-sitter grammar as child of pipe_table
- Table captions are parsed at grammar level, not in postprocessor
- Reverted _line rule to disallow paragraphs starting with ':' (restores safety check for detecting accidental fenced div continuation)

Changes:
- Modified tree-sitter-markdown/grammar.js to add table_caption rule
- Updated pipe_table processor to extract caption from parse tree
- Added table_caption handler in treesitter.rs dispatcher
- Removed postprocessor caption detection logic and with_blocks filter
- Removed extract_table_caption function from postprocess.rs

Benefits:
- Table captions only recognized in correct context (after pipe tables)
- Restored paragraph safety check prevents malformed documents
- Cleaner separation between grammar structure and AST transformation
- Better performance by eliminating block scanning in postprocessor

Output matches Pandoc's JSON AST structure with caption attached to table's caption.long field as a Plain block.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent f014d7d commit 2e3bf51

File tree

9 files changed

+19036
-17051
lines changed

9 files changed

+19036
-17051
lines changed

crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,7 @@ fn native_visitor<T: Write>(
712712
}
713713
"pipe_table_delimiter_row" => process_pipe_table_delimiter_row(children, context),
714714
"pipe_table_cell" => process_pipe_table_cell(node, children, context),
715+
"table_caption" => PandocNativeIntermediate::IntermediateInlines(native_inlines(children)),
715716
"pipe_table" => process_pipe_table(node, children, context),
716717
"setext_h1_underline" => PandocNativeIntermediate::IntermediateSetextHeadingLevel(1),
717718
"setext_h2_underline" => PandocNativeIntermediate::IntermediateSetextHeadingLevel(2),

crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/pipe_table.rs

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ pub fn process_pipe_table(
151151
let mut header: Option<Row> = None;
152152
let mut colspec: Vec<ColSpec> = Vec::new();
153153
let mut rows: Vec<Row> = Vec::new();
154+
let mut caption_inlines: Option<Inlines> = None;
154155
for (node, child) in children {
155156
if node == "block_continuation" {
156157
continue; // skip block continuation nodes
@@ -179,6 +180,12 @@ pub fn process_pipe_table(
179180
} else {
180181
panic!("Expected Row in pipe_table_row, got {:?}", child);
181182
}
183+
} else if node == "table_caption" {
184+
if let PandocNativeIntermediate::IntermediateInlines(inlines) = child {
185+
caption_inlines = Some(inlines);
186+
} else {
187+
panic!("Expected Inlines in table_caption, got {:?}", child);
188+
}
182189
} else {
183190
panic!("Unexpected node in pipe_table: {}", node);
184191
}
@@ -204,12 +211,25 @@ pub fn process_pipe_table(
204211
(vec![header.unwrap()], rows)
205212
};
206213

207-
PandocNativeIntermediate::IntermediateBlock(Block::Table(Table {
208-
attr,
209-
caption: Caption {
214+
// Construct caption from caption_inlines if present
215+
let caption = if let Some(inlines) = caption_inlines {
216+
Caption {
217+
short: None,
218+
long: Some(vec![Block::Plain(Plain {
219+
content: inlines,
220+
source_info: node_source_info_with_context(node, context),
221+
})]),
222+
}
223+
} else {
224+
Caption {
210225
short: None,
211226
long: None,
212-
},
227+
}
228+
};
229+
230+
PandocNativeIntermediate::IntermediateBlock(Block::Table(Table {
231+
attr,
232+
caption,
213233
colspec,
214234
head: TableHead {
215235
attr: empty_attr(),

crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,6 +259,7 @@ fn transform_definition_list_div(div: Div) -> Block {
259259
})
260260
}
261261

262+
262263
/// Apply post-processing transformations to the Pandoc AST
263264
pub fn postprocess(doc: Pandoc) -> Result<Pandoc, Vec<String>> {
264265
let mut errors = Vec::new();
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
| Name | Age |
2+
|-------|-----|
3+
| Alice | 30 |
4+
| Bob | 25 |
5+
6+
: Sample table caption
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[ Table ( "" , [] , [] ) (Caption Nothing [ Plain [Str "Sample", Space, Str "table", Space, Str "caption"] ]) [(AlignDefault, ColWidthDefault), (AlignDefault, ColWidthDefault)] (TableHead ( "" , [] , [] ) [Row ( "" , [] , [] ) [Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "Name"]] , Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "Age"]] ] ]) [TableBody ( "" , [] , [] ) (RowHeadColumns 0) [] [Row ( "" , [] , [] ) [Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "Alice"]] , Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "30"]] ] , Row ( "" , [] , [] ) [Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "Bob"]] , Cell ( "" , [] , [] ) AlignDefault (RowSpan 1) (ColSpan 1) [Plain [Str "25"]] ] ]] (TableFoot ( "" , [] , [] ) [] ) ]

crates/tree-sitter-qmd/tree-sitter-markdown/grammar.js

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ module.exports = grammar({
355355
),
356356
// Some symbols get parsed as single tokens so that html blocks get detected properly
357357
_code_line: $ => prec.right(repeat1(choice($._word, $._display_math_state_track_marker, $._inline_math_state_track_marker, $._whitespace, common.punctuation_without($, [])))),
358-
358+
359359
// the gymnastics around `:` in _line exist to make the parser reject paragraphs that start with a colon.
360360
// Those are technically valid in Markdown, but disallowing them here makes it possible to detect an
361361
// accidentally-continued paragraph with a colon that should have been a fenced div marker.
@@ -375,8 +375,24 @@ module.exports = grammar({
375375
$.pipe_table_delimiter_row,
376376
repeat(seq($._pipe_table_newline, optional($.pipe_table_row))),
377377
choice($._newline, $._eof),
378+
optional($.table_caption),
378379
)),
379380

381+
// Table caption: blank line followed by ": caption text"
382+
// This is a Pandoc extension for table captions
383+
table_caption: $ => prec(1, seq(
384+
$._blank_line,
385+
':',
386+
optional(seq(
387+
optional($._whitespace),
388+
alias($._table_caption_line, $.inline)
389+
)),
390+
choice($._newline, $._eof),
391+
)),
392+
393+
// Caption line content - similar to _line but only used in table_caption context
394+
_table_caption_line: $ => prec.right(repeat1(choice($._word, $._display_math_state_track_marker, $._inline_math_state_track_marker, $._whitespace, common.punctuation_without($, [])))),
395+
380396
_pipe_table_newline: $ => seq(
381397
$._pipe_table_line_ending,
382398
optional($.block_continuation)

crates/tree-sitter-qmd/tree-sitter-markdown/src/grammar.json

Lines changed: 254 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4722,10 +4722,264 @@
47224722
"name": "_eof"
47234723
}
47244724
]
4725+
},
4726+
{
4727+
"type": "CHOICE",
4728+
"members": [
4729+
{
4730+
"type": "SYMBOL",
4731+
"name": "table_caption"
4732+
},
4733+
{
4734+
"type": "BLANK"
4735+
}
4736+
]
47254737
}
47264738
]
47274739
}
47284740
},
4741+
"table_caption": {
4742+
"type": "PREC",
4743+
"value": 1,
4744+
"content": {
4745+
"type": "SEQ",
4746+
"members": [
4747+
{
4748+
"type": "SYMBOL",
4749+
"name": "_blank_line"
4750+
},
4751+
{
4752+
"type": "STRING",
4753+
"value": ":"
4754+
},
4755+
{
4756+
"type": "CHOICE",
4757+
"members": [
4758+
{
4759+
"type": "SEQ",
4760+
"members": [
4761+
{
4762+
"type": "CHOICE",
4763+
"members": [
4764+
{
4765+
"type": "SYMBOL",
4766+
"name": "_whitespace"
4767+
},
4768+
{
4769+
"type": "BLANK"
4770+
}
4771+
]
4772+
},
4773+
{
4774+
"type": "ALIAS",
4775+
"content": {
4776+
"type": "SYMBOL",
4777+
"name": "_table_caption_line"
4778+
},
4779+
"named": true,
4780+
"value": "inline"
4781+
}
4782+
]
4783+
},
4784+
{
4785+
"type": "BLANK"
4786+
}
4787+
]
4788+
},
4789+
{
4790+
"type": "CHOICE",
4791+
"members": [
4792+
{
4793+
"type": "SYMBOL",
4794+
"name": "_newline"
4795+
},
4796+
{
4797+
"type": "SYMBOL",
4798+
"name": "_eof"
4799+
}
4800+
]
4801+
}
4802+
]
4803+
}
4804+
},
4805+
"_table_caption_line": {
4806+
"type": "PREC_RIGHT",
4807+
"value": 0,
4808+
"content": {
4809+
"type": "REPEAT1",
4810+
"content": {
4811+
"type": "CHOICE",
4812+
"members": [
4813+
{
4814+
"type": "SYMBOL",
4815+
"name": "_word"
4816+
},
4817+
{
4818+
"type": "SYMBOL",
4819+
"name": "_display_math_state_track_marker"
4820+
},
4821+
{
4822+
"type": "SYMBOL",
4823+
"name": "_inline_math_state_track_marker"
4824+
},
4825+
{
4826+
"type": "SYMBOL",
4827+
"name": "_whitespace"
4828+
},
4829+
{
4830+
"type": "SEQ",
4831+
"members": [
4832+
{
4833+
"type": "CHOICE",
4834+
"members": [
4835+
{
4836+
"type": "STRING",
4837+
"value": "!"
4838+
},
4839+
{
4840+
"type": "STRING",
4841+
"value": "\""
4842+
},
4843+
{
4844+
"type": "STRING",
4845+
"value": "#"
4846+
},
4847+
{
4848+
"type": "STRING",
4849+
"value": "$"
4850+
},
4851+
{
4852+
"type": "STRING",
4853+
"value": "%"
4854+
},
4855+
{
4856+
"type": "STRING",
4857+
"value": "&"
4858+
},
4859+
{
4860+
"type": "STRING",
4861+
"value": "'"
4862+
},
4863+
{
4864+
"type": "STRING",
4865+
"value": "("
4866+
},
4867+
{
4868+
"type": "STRING",
4869+
"value": ")"
4870+
},
4871+
{
4872+
"type": "STRING",
4873+
"value": "*"
4874+
},
4875+
{
4876+
"type": "STRING",
4877+
"value": "+"
4878+
},
4879+
{
4880+
"type": "STRING",
4881+
"value": ","
4882+
},
4883+
{
4884+
"type": "STRING",
4885+
"value": "-"
4886+
},
4887+
{
4888+
"type": "STRING",
4889+
"value": "."
4890+
},
4891+
{
4892+
"type": "STRING",
4893+
"value": "/"
4894+
},
4895+
{
4896+
"type": "STRING",
4897+
"value": ":"
4898+
},
4899+
{
4900+
"type": "STRING",
4901+
"value": ";"
4902+
},
4903+
{
4904+
"type": "STRING",
4905+
"value": "<"
4906+
},
4907+
{
4908+
"type": "STRING",
4909+
"value": "="
4910+
},
4911+
{
4912+
"type": "STRING",
4913+
"value": ">"
4914+
},
4915+
{
4916+
"type": "STRING",
4917+
"value": "?"
4918+
},
4919+
{
4920+
"type": "STRING",
4921+
"value": "@"
4922+
},
4923+
{
4924+
"type": "STRING",
4925+
"value": "["
4926+
},
4927+
{
4928+
"type": "STRING",
4929+
"value": "\\"
4930+
},
4931+
{
4932+
"type": "STRING",
4933+
"value": "]"
4934+
},
4935+
{
4936+
"type": "STRING",
4937+
"value": "^"
4938+
},
4939+
{
4940+
"type": "STRING",
4941+
"value": "_"
4942+
},
4943+
{
4944+
"type": "STRING",
4945+
"value": "`"
4946+
},
4947+
{
4948+
"type": "STRING",
4949+
"value": "{"
4950+
},
4951+
{
4952+
"type": "STRING",
4953+
"value": "|"
4954+
},
4955+
{
4956+
"type": "STRING",
4957+
"value": "}"
4958+
},
4959+
{
4960+
"type": "STRING",
4961+
"value": "~"
4962+
}
4963+
]
4964+
},
4965+
{
4966+
"type": "CHOICE",
4967+
"members": [
4968+
{
4969+
"type": "SYMBOL",
4970+
"name": "_last_token_punctuation"
4971+
},
4972+
{
4973+
"type": "BLANK"
4974+
}
4975+
]
4976+
}
4977+
]
4978+
}
4979+
]
4980+
}
4981+
}
4982+
},
47294983
"_pipe_table_newline": {
47304984
"type": "SEQ",
47314985
"members": [

0 commit comments

Comments
 (0)