From 9a0fc16db583f15dab57ba97e33af9c5bacf04e8 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Thu, 16 Oct 2025 21:12:59 -0500 Subject: [PATCH 01/11] parse yaml markdown and interpret tags --- .../quarto-markdown-pandoc/src/pandoc/meta.rs | 52 +++- .../quarto-markdown-pandoc/src/pandoc/mod.rs | 2 +- .../quarto-markdown-pandoc/tests/test_meta.rs | 185 ++++++++++++- .../tests/yaml-markdown-parse-failure.qmd | 8 + .../tests/yaml-tagged-strings.qmd | 9 + docs/syntax/index.qmd | 1 + docs/syntax/yaml-metadata.qmd | 242 ++++++++++++++++++ 7 files changed, 489 insertions(+), 10 deletions(-) create mode 100644 crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd create mode 100644 crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd create mode 100644 docs/syntax/yaml-metadata.qmd diff --git a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs index 4058dfb..ec3f6bf 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs @@ -4,10 +4,12 @@ */ use crate::pandoc::block::Blocks; -use crate::pandoc::inline::Inlines; +use crate::pandoc::inline::{Inline, Inlines, Span, Str}; +use crate::pandoc::location::empty_source_info; use crate::readers; use crate::{pandoc::RawBlock, utils::output::VerboseOutput}; use hashlink::LinkedHashMap; +use std::collections::HashMap; use std::{io, mem}; use yaml_rust2::parser::{Event, MarkedEventReceiver, Parser}; @@ -82,7 +84,29 @@ impl YamlEventHandler { } } - fn parse_scalar(&self, s: &str) -> MetaValue { + fn parse_scalar(&self, s: &str, tag: Option) -> MetaValue { + // Check if this scalar has a YAML tag (like !path, !glob, !str) + if let Some(t) = tag { + // Tagged strings bypass markdown parsing - wrap in Span immediately + let mut attributes = HashMap::new(); + attributes.insert("tag".to_string(), t.suffix.clone()); + + let span = Span { + attr: ( + String::new(), + vec!["yaml-tagged-string".to_string()], + 
attributes, + ), + content: vec![Inline::Str(Str { + text: s.to_string(), + source_info: empty_source_info(), + })], + source_info: empty_source_info(), + }; + return MetaValue::MetaInlines(vec![Inline::Span(span)]); + } + + // Untagged scalars: parse as booleans or strings (will be parsed as markdown later) if s == "true" { MetaValue::MetaBool(true) } else if s == "false" { @@ -116,12 +140,12 @@ impl MarkedEventReceiver for YamlEventHandler { self.push_value(MetaValue::MetaList(list)); } } - Event::Scalar(s, ..) => match self.stack.last_mut() { + Event::Scalar(s, _style, _anchor, tag) => match self.stack.last_mut() { Some(ContextFrame::Map(_, key_slot @ None)) => { *key_slot = Some(s.to_string()); } Some(ContextFrame::Map(_, Some(_))) | Some(ContextFrame::List(_)) => { - let value = self.parse_scalar(&s); + let value = self.parse_scalar(&s, tag); self.push_value(value); } _ => {} @@ -187,10 +211,22 @@ pub fn parse_metadata_strings(meta: MetaValue, outer_metadata: &mut Meta) -> Met } MetaValue::MetaBlocks(pandoc.blocks) } - _ => panic!( - "(unimplemented syntax error, this is a bug!) 
Failed to parse metadata string as markdown: {}", - s - ), + Err(_) => { + // Markdown parse failed - wrap in Span with class "yaml-markdown-syntax-error" + let span = Span { + attr: ( + String::new(), + vec!["yaml-markdown-syntax-error".to_string()], + HashMap::new(), + ), + content: vec![Inline::Str(Str { + text: s.clone(), + source_info: empty_source_info(), + })], + source_info: empty_source_info(), + }; + MetaValue::MetaInlines(vec![Inline::Span(span)]) + } } } MetaValue::MetaList(list) => { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs index 4117cbd..9d7db3c 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs @@ -37,5 +37,5 @@ pub use crate::pandoc::table::{ }; pub use crate::pandoc::ast_context::ASTContext; -pub use crate::pandoc::meta::{Meta, MetaValue, rawblock_to_meta}; +pub use crate::pandoc::meta::{Meta, MetaValue, parse_metadata_strings, rawblock_to_meta}; pub use crate::pandoc::treesitter::treesitter_to_pandoc; diff --git a/crates/quarto-markdown-pandoc/tests/test_meta.rs b/crates/quarto-markdown-pandoc/tests/test_meta.rs index bd53cb4..3c8039f 100644 --- a/crates/quarto-markdown-pandoc/tests/test_meta.rs +++ b/crates/quarto-markdown-pandoc/tests/test_meta.rs @@ -3,8 +3,11 @@ * Copyright (c) 2025 Posit, PBC */ +use hashlink::LinkedHashMap; use quarto_markdown_pandoc::pandoc::location::{Location, Range, SourceInfo}; -use quarto_markdown_pandoc::pandoc::{MetaValue, RawBlock, rawblock_to_meta}; +use quarto_markdown_pandoc::pandoc::{ + Inline, MetaValue, RawBlock, parse_metadata_strings, rawblock_to_meta, +}; use std::fs; #[test] @@ -54,3 +57,183 @@ fn test_metadata_parsing() { Some(MetaValue::MetaList(_)) )); } + +#[test] +fn test_yaml_tagged_strings() { + // Test that YAML tags (!path, !glob, !str) prevent markdown parsing + let content = fs::read_to_string("tests/yaml-tagged-strings.qmd").unwrap(); + + let block = RawBlock { + 
format: "quarto_minus_metadata".to_string(), + text: content, + source_info: SourceInfo::with_range(Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }), + }; + + let mut meta = rawblock_to_meta(block); + let mut outer_meta = LinkedHashMap::new(); + + // Parse metadata strings + for (k, v) in meta.drain() { + let parsed = parse_metadata_strings(v, &mut outer_meta); + outer_meta.insert(k, parsed); + } + + // Check plain_path - should be MetaInlines with Span wrapper + let plain_path = outer_meta.get("plain_path").expect("plain_path not found"); + if let MetaValue::MetaInlines(inlines) = plain_path { + assert_eq!(inlines.len(), 1, "Expected exactly one inline"); + if let Inline::Span(span) = &inlines[0] { + assert!(span.attr.1.contains(&"yaml-tagged-string".to_string())); + assert_eq!(span.attr.2.get("tag"), Some(&"path".to_string())); + // Extract the string content + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "images/neovim-*.png"); + } else { + panic!("Expected Str inline inside Span"); + } + } else { + panic!("Expected Span inline, got: {:?}", inlines[0]); + } + } else { + panic!("Expected MetaInlines for plain_path"); + } + + // Check glob_pattern + let glob_pattern = outer_meta + .get("glob_pattern") + .expect("glob_pattern not found"); + if let MetaValue::MetaInlines(inlines) = glob_pattern { + if let Inline::Span(span) = &inlines[0] { + assert_eq!(span.attr.2.get("tag"), Some(&"glob".to_string())); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "posts/*/index.qmd"); + } + } + } + + // Check literal_string + let literal_string = outer_meta + .get("literal_string") + .expect("literal_string not found"); + if let MetaValue::MetaInlines(inlines) = literal_string { + if let Inline::Span(span) = &inlines[0] { + assert_eq!(span.attr.2.get("tag"), Some(&"str".to_string())); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, 
"_foo_.py"); + } + } + } + + // Check regular_markdown - should have parsed markdown (Emph element) + let regular_markdown = outer_meta + .get("regular_markdown") + .expect("regular_markdown not found"); + if let MetaValue::MetaInlines(inlines) = regular_markdown { + // Should contain Emph for *emphasis* + let has_emph = inlines + .iter() + .any(|inline| matches!(inline, Inline::Emph(_))); + assert!( + has_emph, + "regular_markdown should have Emph element from *emphasis*" + ); + } else { + panic!("Expected MetaInlines for regular_markdown"); + } +} + +#[test] +fn test_yaml_markdown_parse_failure() { + // Test that untagged strings that fail markdown parsing are gracefully handled + let content = fs::read_to_string("tests/yaml-markdown-parse-failure.qmd").unwrap(); + + let block = RawBlock { + format: "quarto_minus_metadata".to_string(), + text: content, + source_info: SourceInfo::with_range(Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }), + }; + + let mut meta = rawblock_to_meta(block); + let mut outer_meta = LinkedHashMap::new(); + + // Parse metadata strings - this should not panic + for (k, v) in meta.drain() { + let parsed = parse_metadata_strings(v, &mut outer_meta); + outer_meta.insert(k, parsed); + } + + // Check untagged_path - should be wrapped in error span + let untagged_path = outer_meta + .get("untagged_path") + .expect("untagged_path not found"); + if let MetaValue::MetaInlines(inlines) = untagged_path { + if let Inline::Span(span) = &inlines[0] { + assert!( + span.attr + .1 + .contains(&"yaml-markdown-syntax-error".to_string()) + ); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "posts/*/index.qmd"); + } + } else { + panic!("Expected Span inline for failed parse"); + } + } else { + panic!("Expected MetaInlines for untagged_path"); + } + + // Check another_glob - should also be wrapped in error span + let another_glob = outer_meta + 
.get("another_glob") + .expect("another_glob not found"); + if let MetaValue::MetaInlines(inlines) = another_glob { + if let Inline::Span(span) = &inlines[0] { + assert!( + span.attr + .1 + .contains(&"yaml-markdown-syntax-error".to_string()) + ); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "images/*.png"); + } + } + } + + // Check underscore_file - this one should successfully parse as markdown with Emph + let underscore_file = outer_meta + .get("underscore_file") + .expect("underscore_file not found"); + if let MetaValue::MetaInlines(inlines) = underscore_file { + // _foo_ should become Emph element + let has_emph = inlines + .iter() + .any(|inline| matches!(inline, Inline::Emph(_))); + assert!( + has_emph, + "underscore_file should have Emph element from _foo_" + ); + } +} diff --git a/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd b/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd new file mode 100644 index 0000000..1692aac --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd @@ -0,0 +1,8 @@ +--- +title: Test Markdown Parse Failure Fallback +untagged_path: posts/*/index.qmd +another_glob: images/*.png +underscore_file: _foo_.py +--- + +Test document for graceful handling of markdown parse failures in untagged strings. diff --git a/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd b/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd new file mode 100644 index 0000000..5c5709c --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd @@ -0,0 +1,9 @@ +--- +title: Test YAML Tagged Strings +plain_path: !path images/neovim-*.png +glob_pattern: !glob posts/*/index.qmd +literal_string: !str _foo_.py +regular_markdown: This has *emphasis* +--- + +Test document for YAML tag support. 
diff --git a/docs/syntax/index.qmd b/docs/syntax/index.qmd index 3980610..fad9595 100644 --- a/docs/syntax/index.qmd +++ b/docs/syntax/index.qmd @@ -13,3 +13,4 @@ The features documented here are currently under development. The syntax and beh - [Definition Lists](definition-lists.qmd) - Create definition lists using an embedded markdown DSL - [Editorial Marks](editorial-marks.qmd) - Annotate text with highlights, insertions, deletions, and comments - [Footnotes](footnotes.qmd) - Add footnotes with inline or fenced block syntax +- [YAML Metadata](yaml-metadata.qmd) - Control markdown parsing in metadata with YAML tags diff --git a/docs/syntax/yaml-metadata.qmd b/docs/syntax/yaml-metadata.qmd new file mode 100644 index 0000000..0ca322c --- /dev/null +++ b/docs/syntax/yaml-metadata.qmd @@ -0,0 +1,242 @@ +--- +title: "YAML Metadata" +--- + +## Overview + +YAML front matter provides document-level metadata in Quarto documents. In `quarto-markdown`, metadata values are parsed as markdown by default, allowing you to use formatting like `*emphasis*` and `[links](url)`. However, some values—like file paths and glob patterns—should be treated as literal strings. + +## Default Behavior: Markdown Parsing + +By default, string values in YAML metadata are parsed as markdown: + +```yaml +--- +title: This has *emphasis* +description: Visit [our website](https://example.com) for more info +--- +``` + +The `title` will render with italicized "emphasis", and the `description` will include a clickable link. 
+ +## YAML Tags for Literal Strings + +When you need to prevent markdown parsing, use YAML tags to mark values as literal strings: + +### Available Tags + +- `!str` - Plain string, no markdown parsing +- `!path` - File path (same as `!str`, but semantically clearer) +- `!glob` - Glob pattern (same as `!str`, but semantically clearer) + +### Syntax + +Prefix the value with the tag: + +```yaml +--- +title: My Blog +resources: + - !path images/neovim-*.png + - !path _foo_.py +listing: + contents: !glob posts/*/index.qmd +plain: !str "Text with * and _ chars that won't be parsed" +--- +``` + +## Why Use Tags? + +### Problem: Wildcard Characters + +File paths and glob patterns often contain characters that have special meaning in markdown: + +- `*` - Indicates emphasis in markdown +- `_` - Also indicates emphasis in markdown +- `[` and `]` - Indicate links in markdown + +Without tags, these can cause unexpected results: + +```yaml +--- +# Without tag: _foo_.py is parsed as markdown +# The underscore characters create italic emphasis! 
+file: _foo_.py # Parsed as: foo.py + +# With tag: preserved literally +file: !path _foo_.py # Parsed as: "_foo_.py" +--- +``` + +### Examples Where Tags Help + +**Glob patterns with wildcards:** + +```yaml +--- +# ❌ Without tag: asterisk triggers markdown parsing error +listing: posts/*/index.qmd + +# ✅ With tag: preserved as literal glob pattern +listing: !glob posts/*/index.qmd +--- +``` + +**File paths with underscores:** + +```yaml +--- +# ❌ Without tag: underscores create italic emphasis +script: _build_helper.py # Parsed as: buildhelper.py + +# ✅ With tag: preserved as literal file path +script: !path _build_helper.py +--- +``` + +**Relative paths:** + +```yaml +--- +# ❌ Without tag: parsed as markdown, leading dots may confuse parser +redirect: ../_redirect.html + +# ✅ With tag: preserved as literal path +redirect: !path ../_redirect.html +--- +``` + +## Graceful Fallback + +If a string fails to parse as markdown (e.g., because it contains `*` wildcard characters), `quarto-markdown` will gracefully preserve it as a literal string instead of crashing. + +The parser wraps failed parses in a special marker that downstream tools can detect: + +```json +{ + "t": "MetaInlines", + "c": [{ + "t": "Span", + "c": [ + ["", ["yaml-markdown-syntax-error"], []], + [{"t": "Str", "c": "posts/*/index.qmd"}] + ] + }] +} +``` + +While this graceful fallback prevents crashes, using explicit tags is better practice: + +1. **Intent**: Tags clearly communicate that a value should be literal +2. **Reliability**: Tags work even if the string happens to be valid markdown +3. 
**Tooling**: Downstream tools can recognize tagged strings by the `yaml-tagged-string` class + +## When to Use Tags + +Use YAML tags when metadata values contain: + +- **File paths**: Especially those with wildcards, underscores, or special characters +- **Glob patterns**: Any pattern using `*`, `?`, or `[...]` syntax +- **Configuration strings**: Technical strings that shouldn't be formatted + +**Don't need tags** for: + +- **Titles and descriptions**: Where you want markdown formatting +- **Plain text**: Simple strings without special characters +- **Booleans and numbers**: These are never parsed as markdown + +## Complete Example + +```yaml +--- +title: My Data Science Blog +description: Articles about *statistics* and [machine learning](ml.html) + +# File paths with wildcards - use tags +resources: + - !path images/*.png + - !path data/**/*.csv + - !path _utils.py + +# Glob patterns - use tags +listing: + contents: !glob posts/*/index.qmd + +# Configuration strings - use tags +redirect: !path ../index.html +template: !str "{{< special-syntax >}}" + +# Regular strings - no tags needed +author: Jane Doe +date: 2024-01-15 +--- +``` + +## Implementation Details + +### Tagged String Representation + +Tagged strings are converted to Pandoc's `MetaInlines` with a `Span` wrapper: + +```json +{ + "t": "MetaInlines", + "c": [{ + "t": "Span", + "c": [ + ["", ["yaml-tagged-string"], [["tag", "path"]]], + [{"t": "Str", "c": "images/neovim-*.png"}] + ] + }] +} +``` + +The representation includes: + +- **Class**: `yaml-tagged-string` - Identifies this as a tagged value +- **Attribute**: `tag` - Contains the tag name (`str`, `path`, or `glob`) +- **Content**: The literal string value wrapped in a `Str` inline + +### Compatibility with Pandoc + +Tagged strings are compatible with Pandoc's Lua filter API: + +```lua +-- Extract the string value +local value = pandoc.utils.stringify(meta.listing.contents) +-- value == "posts/*/index.qmd" + +-- Check if it's a tagged string +if 
meta.listing.contents[1].classes[1] == "yaml-tagged-string" then + local tag = meta.listing.contents[1].attributes.tag + -- tag == "glob" +end +``` + +The `pandoc.utils.stringify()` function correctly extracts the string content, so most filters will work without modification. + +## Migration Guide + +If you have existing documents that are failing to parse due to file paths or glob patterns: + +1. **Identify problematic values**: Look for metadata with `*`, `_`, or other markdown special characters +2. **Add tags**: Prefix those values with `!path`, `!glob`, or `!str` +3. **Test**: Verify the document parses without errors + +Example migration: + +```yaml +# Before (may crash or parse incorrectly) +--- +listing: posts/*/index.qmd +resources: [images/*.png, _helper.py] +--- + +# After (explicit and reliable) +--- +listing: !glob posts/*/index.qmd +resources: + - !path images/*.png + - !path _helper.py +--- +``` From 03966189fb2273d8dacd062c527fbb3d5c7f4272 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 07:36:44 -0500 Subject: [PATCH 02/11] be more robust in the presence of unexpected AST --- .../src/pandoc/inline.rs | 195 ++++++++++++++++-- 1 file changed, 180 insertions(+), 15 deletions(-) diff --git a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs index fcdbe3a..5052aa8 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs @@ -434,7 +434,7 @@ pub fn make_cite_inline( // first we split the content along semicolons let citations: Vec = content .split(is_semicolon) - .map(|slice| { + .flat_map(|slice| { let inlines = slice.to_vec(); let mut cite: Option = None; let mut prefix: Inlines = vec![]; @@ -455,21 +455,43 @@ pub fn make_cite_inline( let Some(mut c) = cite else { panic!("Cite inline should have at least one citation, found none") }; - if c.citations.len() != 1 { - panic!( - "Cite inline should have exactly one 
citation, found: {:?}", - c.citations - ); - } - let mut citation = c.citations.pop().unwrap(); - if citation.mode == CitationMode::AuthorInText { - // if the mode is AuthorInText, it becomes NormalCitation inside - // a compound cite - citation.mode = CitationMode::NormalCitation; + + // Handle the case where a Cite already has multiple citations + // This can happen when citation syntax appears in contexts like tables + // where the parser creates a Cite with multiple citations + if c.citations.len() == 1 { + // Simple case: one citation, apply prefix and suffix directly + let mut citation = c.citations.pop().unwrap(); + if citation.mode == CitationMode::AuthorInText { + // if the mode is AuthorInText, it becomes NormalCitation inside + // a compound cite + citation.mode = CitationMode::NormalCitation; + } + citation.prefix = prefix; + citation.suffix = suffix; + vec![citation] + } else { + // Complex case: multiple citations already present + // Apply prefix to the first citation and suffix to the last + let num_citations = c.citations.len(); + for (i, citation) in c.citations.iter_mut().enumerate() { + if citation.mode == CitationMode::AuthorInText { + citation.mode = CitationMode::NormalCitation; + } + if i == 0 { + // Prepend prefix to the first citation's prefix + let mut new_prefix = prefix.clone(); + new_prefix.extend(citation.prefix.clone()); + citation.prefix = new_prefix; + } + if i == num_citations - 1 { + // Append suffix to the last citation's suffix + citation.suffix.extend(suffix.clone()); + } + } + // Return all citations from this slice + c.citations } - citation.prefix = prefix; - citation.suffix = suffix; - citation }) .collect(); return Inline::Cite(Cite { @@ -487,3 +509,146 @@ fn make_inline_leftover(node: &tree_sitter::Node, input_bytes: &[u8]) -> Inline source_info: node_source_info(node), }) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::pandoc::location::Location; + + fn dummy_source_info() -> SourceInfo { + SourceInfo { + 
filename_index: None, + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }, + } + } + + fn make_str(text: &str) -> Inline { + Inline::Str(Str { + text: text.to_string(), + source_info: dummy_source_info(), + }) + } + + fn make_space() -> Inline { + Inline::Space(Space { + source_info: dummy_source_info(), + }) + } + + fn make_citation(id: &str, prefix: Inlines, suffix: Inlines) -> Citation { + Citation { + id: id.to_string(), + prefix, + suffix, + mode: CitationMode::NormalCitation, + note_num: 0, + hash: 0, + } + } + + #[test] + fn test_make_cite_inline_with_multiple_citations() { + // Test case: a Cite inline that already contains multiple citations + // This simulates what happens when the parser encounters citation syntax + // in unsupported contexts (e.g., grid tables) + + // Create a Cite with two citations already in it + let multi_cite = Inline::Cite(Cite { + citations: vec![ + make_citation( + "knuth1984", + vec![], + vec![make_str(","), make_space(), make_str("pp. 33-35")], + ), + make_citation( + "wickham2015", + vec![make_space(), make_str("also"), make_space()], + vec![make_str(","), make_space(), make_str("chap. 
1")], + ), + ], + content: vec![], + source_info: dummy_source_info(), + }); + + // Now call make_cite_inline with content that includes this multi-citation Cite + // along with a prefix "see" + let content = vec![make_str("see"), make_space(), multi_cite]; + + let result = make_cite_inline( + ("".to_string(), vec![], std::collections::HashMap::new()), + ("".to_string(), "".to_string()), + content, + dummy_source_info(), + ); + + // Verify the result is a Cite + match result { + Inline::Cite(cite) => { + // Should have 2 citations + assert_eq!(cite.citations.len(), 2); + + // First citation should have the prefix "see " prepended + assert_eq!(cite.citations[0].id, "knuth1984"); + assert_eq!(cite.citations[0].prefix.len(), 2); + match &cite.citations[0].prefix[0] { + Inline::Str(s) => assert_eq!(s.text, "see"), + _ => panic!("Expected Str"), + } + + // Second citation should have its original prefix intact + assert_eq!(cite.citations[1].id, "wickham2015"); + assert_eq!(cite.citations[1].prefix.len(), 3); + } + _ => panic!("Expected Cite inline, got: {:?}", result), + } + } + + #[test] + fn test_make_cite_inline_with_single_citation_still_works() { + // Test that the normal case (single citation) still works + let single_cite = Inline::Cite(Cite { + citations: vec![make_citation("knuth1984", vec![], vec![])], + content: vec![], + source_info: dummy_source_info(), + }); + + let content = vec![ + make_str("see"), + make_space(), + single_cite, + make_str(","), + make_space(), + make_str("pp. 33"), + ]; + + let result = make_cite_inline( + ("".to_string(), vec![], std::collections::HashMap::new()), + ("".to_string(), "".to_string()), + content, + dummy_source_info(), + ); + + match result { + Inline::Cite(cite) => { + assert_eq!(cite.citations.len(), 1); + assert_eq!(cite.citations[0].id, "knuth1984"); + // Prefix should be "see " + assert_eq!(cite.citations[0].prefix.len(), 2); + // Suffix should be ", pp. 
33" + assert_eq!(cite.citations[0].suffix.len(), 3); + } + _ => panic!("Expected Cite inline"), + } + } +} From aa9363c0e5882db56e0955c6a0edbf9428869785 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 08:16:06 -0500 Subject: [PATCH 03/11] fix scanner matching rules inside code blocks --- .../tests/snapshots/native/026.qmd | 13 +++++++++++++ .../tests/snapshots/native/026.qmd.snapshot | 1 + .../tree-sitter-markdown/src/scanner.c | 8 +++++++- 3 files changed, 21 insertions(+), 1 deletion(-) create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd new file mode 100644 index 0000000..c041672 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd @@ -0,0 +1,13 @@ +```powershell +$ENV:QUARTO_PRINT_STACK="true" +``` + +```bash +export FOO=$BAR +echo $HOME +``` + +```r +# Dollar signs in comments +x <- "$variable" +``` diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot new file mode 100644 index 0000000..d89bd35 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot @@ -0,0 +1 @@ +[ CodeBlock ( "" , ["powershell"] , [] ) "$ENV:QUARTO_PRINT_STACK=\"true\"", CodeBlock ( "" , ["bash"] , [] ) "export FOO=$BAR\necho $HOME", CodeBlock ( "" , ["r"] , [] ) "# Dollar signs in comments\nx <- \"$variable\"" ] \ No newline at end of file diff --git a/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c b/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c index 80672ff..386e733 100644 --- a/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c +++ b/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c @@ -1464,7 +1464,13 @@ static 
bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { // and go on. But we can only serialize state if we successfully return an external // token. // - if (!s->simulate && lexer->lookahead == '$' && valid_symbols[DISPLAY_MATH_STATE_TRACK_MARKER]) { + // Don't track math state when inside a fenced code block - dollar signs should be literal + bool inside_fenced_code = s->open_blocks.size > 0 && + s->open_blocks.items[s->open_blocks.size - 1] == FENCED_CODE_BLOCK; + + if (!s->simulate && lexer->lookahead == '$' && + !inside_fenced_code && + valid_symbols[DISPLAY_MATH_STATE_TRACK_MARKER]) { advance(s, lexer); if (lexer->lookahead == '$') { advance(s, lexer); From 9c85a1bd69b22f460f5947269696cc14856798c5 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 10:13:52 -0500 Subject: [PATCH 04/11] new binary: syntax conversion helper --- Cargo.lock | 150 +++++++++++- crates/qmd-syntax-helper/Cargo.toml | 29 +++ crates/qmd-syntax-helper/README.md | 125 ++++++++++ .../filters/grid-table-to-list-table.lua | 189 ++++++++++++++ .../src/conversions/grid_tables.rs | 231 ++++++++++++++++++ .../qmd-syntax-helper/src/conversions/mod.rs | 1 + crates/qmd-syntax-helper/src/lib.rs | 2 + crates/qmd-syntax-helper/src/main.rs | 64 +++++ crates/qmd-syntax-helper/src/utils/file_io.rs | 13 + crates/qmd-syntax-helper/src/utils/mod.rs | 2 + .../qmd-syntax-helper/src/utils/resources.rs | 138 +++++++++++ .../tests/fixtures/simple-grid-table.md | 7 + .../tests/grid_tables_test.rs | 39 +++ 13 files changed, 978 insertions(+), 12 deletions(-) create mode 100644 crates/qmd-syntax-helper/Cargo.toml create mode 100644 crates/qmd-syntax-helper/README.md create mode 100644 crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua create mode 100644 crates/qmd-syntax-helper/src/conversions/grid_tables.rs create mode 100644 crates/qmd-syntax-helper/src/conversions/mod.rs create mode 100644 crates/qmd-syntax-helper/src/lib.rs create mode 100644 
crates/qmd-syntax-helper/src/main.rs create mode 100644 crates/qmd-syntax-helper/src/utils/file_io.rs create mode 100644 crates/qmd-syntax-helper/src/utils/mod.rs create mode 100644 crates/qmd-syntax-helper/src/utils/resources.rs create mode 100644 crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md create mode 100644 crates/qmd-syntax-helper/tests/grid_tables_test.rs diff --git a/Cargo.lock b/Cargo.lock index 998c8f8..07b2087 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -58,9 +58,15 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", ] +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + [[package]] name = "arbitrary" version = "1.4.2" @@ -152,6 +158,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "console_error_panic_hook" version = "0.1.7" @@ -236,6 +252,25 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "include_dir" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd" +dependencies = [ + "include_dir_macros", +] + +[[package]] +name = "include_dir_macros" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = "indexmap" version = "2.11.0" @@ -278,6 +313,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.175" @@ -343,6 +384,18 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "qmd-syntax-helper" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "colored", + "include_dir", + "quarto-markdown-pandoc", + "regex", +] + [[package]] name = "quarto-markdown-pandoc" version = "0.0.0" @@ -684,7 +737,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -693,13 +746,38 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.3", +] + +[[package]] +name = 
"windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -709,58 +787,106 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +[[package]] +name = 
"windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.0" diff --git a/crates/qmd-syntax-helper/Cargo.toml b/crates/qmd-syntax-helper/Cargo.toml new file mode 100644 index 0000000..5de8fc0 --- /dev/null +++ b/crates/qmd-syntax-helper/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "qmd-syntax-helper" +version = "0.1.0" +authors.workspace = true +homepage.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true +repository.workspace = true +edition.workspace = true + +[lib] +name = "qmd_syntax_helper" +path = "src/lib.rs" + +[[bin]] +name = "qmd-syntax-helper" +path = "src/main.rs" + +[dependencies] +clap = { version = "4.5", features = ["derive"] } +anyhow = "1.0" +regex = "1.10" +colored = "2.1" +quarto-markdown-pandoc.workspace = true +include_dir = "0.7" + +[lints] +workspace = true diff --git a/crates/qmd-syntax-helper/README.md b/crates/qmd-syntax-helper/README.md new file mode 100644 index 0000000..738bd13 --- /dev/null +++ b/crates/qmd-syntax-helper/README.md @@ -0,0 +1,125 @@ +# qmd-syntax-helper + +A command-line tool for converting and fixing Quarto Markdown syntax issues. + +## Overview + +`qmd-syntax-helper` helps migrate Quarto Markdown documents between different syntax styles and fix common syntax issues. It's designed to handle bulk conversions across entire projects while preserving document semantics. 
+ +## Features + +### Grid Table Conversion + +Convert Pandoc-style grid tables to Quarto's list-table format: + +```bash +# Convert a single file (output to stdout) +qmd-syntax-helper ungrid-tables input.qmd + +# Convert in-place +qmd-syntax-helper ungrid-tables --in-place input.qmd + +# Check what would change without modifying files +qmd-syntax-helper ungrid-tables --check input.qmd + +# Convert multiple files +qmd-syntax-helper ungrid-tables --in-place docs/**/*.qmd + +# Verbose output +qmd-syntax-helper ungrid-tables --in-place --verbose input.qmd +``` + +**Before (Grid Table):** +```markdown ++-----------+-----------+ +| Header 1 | Header 2 | ++===========+===========+ +| Cell 1 | Cell 2 | ++-----------+-----------+ +``` + +**After (List Table):** +```markdown +::: {.list-table header-rows="1" widths="0.5,0.5"} + +* * Header 1 + * Header 2 + +* * Cell 1 + * Cell 2 + +::: +``` + +## Installation + +From the quarto-markdown repository: + +```bash +cargo build --release --bin qmd-syntax-helper +# Binary will be in target/release/qmd-syntax-helper +``` + +## Requirements + +- Rust 2024 edition +- For grid table conversion: + - `pandoc` must be in PATH + - `quarto-markdown-pandoc` workspace crate (used as library) + +## Future Converters + +Planned conversions include: +- Reference-style links → inline links +- Attribute syntax fixes +- Shortcode migrations +- YAML frontmatter fixes + +## Development + +### Running Tests + +```bash +cargo test --package qmd-syntax-helper +``` + +### Adding New Converters + +1. Create a new module in `src/conversions/` +2. Implement the conversion logic +3. Add a new subcommand in `src/main.rs` +4. 
Add tests in `tests/` + +## Architecture + +``` +src/ + main.rs # CLI entry point + lib.rs # Public API + conversions/ + mod.rs + grid_tables.rs # Grid table converter + utils/ + file_io.rs # File I/O utilities + resources.rs # Embedded resource management +resources/ + filters/ + grid-table-to-list-table.lua # Pandoc Lua filter (embedded at compile time) +``` + +### Conversion Pipeline + +Grid table conversion uses a two-stage pipeline: + +1. **Pandoc with Lua filter**: Converts Markdown with grid tables to Pandoc JSON AST + - Uses embedded Lua filter to transform Table nodes to list-table Div format + - Extracted to temp directory at runtime via ResourceManager + +2. **quarto-markdown-pandoc library**: Converts Pandoc JSON AST back to Markdown + - Uses `quarto_markdown_pandoc::readers::json::read()` to parse JSON + - Uses `quarto_markdown_pandoc::writers::qmd::write()` to generate Markdown + - Pure Rust library calls (no subprocess overhead) + +## License + +MIT diff --git a/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua b/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua new file mode 100644 index 0000000..b6eb372 --- /dev/null +++ b/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua @@ -0,0 +1,189 @@ +-- Lua filter to convert Pandoc grid tables to list-table format +-- This produces output that can be processed by list-table.lua + +if PANDOC_VERSION and PANDOC_VERSION.must_be_at_least then + PANDOC_VERSION:must_be_at_least("2.11") +else + error("pandoc version >=2.11 is required") +end + +-- Convert alignment enum to character code +local function alignment_to_char(align) + local align_str = tostring(align) + if align_str == 'AlignLeft' then return 'l' + elseif align_str == 'AlignRight' then return 'r' + elseif align_str == 'AlignCenter' then return 'c' + else return 'd' end +end + +-- Convert a cell to a list of blocks with optional attribute span prepended +local function cell_to_blocks(cell) 
+ -- Extract cell properties using Lua API + local contents = cell.contents + local align = cell.alignment + local rowspan = cell.row_span + local colspan = cell.col_span + local attr = cell.attr + + -- Clone the blocks to avoid modifying the original + local blocks = pandoc.Blocks({}) + for _, block in ipairs(contents) do + table.insert(blocks, block:clone()) + end + + -- If we have non-default cell attributes, prepend an empty span + local align_str = tostring(align) + if rowspan ~= 1 or colspan ~= 1 or align_str ~= 'AlignDefault' then + local span_attr = pandoc.Attr('', {}, {}) + if colspan ~= 1 then + span_attr.attributes.colspan = tostring(colspan) + end + if rowspan ~= 1 then + span_attr.attributes.rowspan = tostring(rowspan) + end + if align_str ~= 'AlignDefault' then + span_attr.attributes.align = alignment_to_char(align) + end + + local empty_span = pandoc.Span({}, span_attr) + + -- Insert the empty span at the beginning of the first block's content + if #blocks > 0 and blocks[1].content then + table.insert(blocks[1].content, 1, empty_span) + else + -- If there's no content, create a paragraph with just the span + blocks = pandoc.Blocks({pandoc.Para({empty_span})}) + end + end + + -- Ensure we have at least one block + if #blocks == 0 then + blocks = pandoc.Blocks({pandoc.Para({})}) + end + + return blocks +end + +-- Convert a Pandoc Table to a list-table Div +local function table_to_list_table(tbl) + -- Extract table components using Lua API + local attr = tbl.attr + local caption = tbl.caption + local colspecs = tbl.colspecs + local thead = tbl.head + local tbodies = tbl.bodies + local tfoot = tbl.foot + + -- Build div attributes, starting from table attributes + local div_attr = pandoc.Attr(attr.identifier, {'list-table'}, {}) + + -- Copy table classes + for _, class in ipairs(attr.classes) do + table.insert(div_attr.classes, class) + end + + -- Copy table attributes + for k, v in pairs(attr.attributes) do + div_attr.attributes[k] = v + end + + -- Count 
header rows from thead + local thead_rows = thead.rows + local header_row_count = #thead_rows + if header_row_count > 0 then + div_attr.attributes['header-rows'] = tostring(header_row_count) + end + + -- Extract alignments and widths from colspecs + local aligns = {} + local widths = {} + local has_non_default_widths = false + + for i, colspec in ipairs(colspecs) do + -- ColSpec is a pair: [1] = alignment, [2] = width + local align = colspec[1] + local width = colspec[2] + + table.insert(aligns, alignment_to_char(align)) + + -- Width is a number (0.0-1.0) or ColWidthDefault + if type(width) == "number" and width > 0 then + table.insert(widths, tostring(width)) + has_non_default_widths = true + else + -- ColWidthDefault or 0 + table.insert(widths, "1") + end + end + + -- Only add aligns if there are non-default alignments + local has_non_default_aligns = false + for _, a in ipairs(aligns) do + if a ~= 'd' then + has_non_default_aligns = true + break + end + end + + if has_non_default_aligns then + div_attr.attributes.aligns = table.concat(aligns, ',') + end + + if has_non_default_widths then + div_attr.attributes.widths = table.concat(widths, ',') + end + + -- Build div content + local content = {} + + -- Add caption if present + if caption and caption.long and #caption.long > 0 then + for _, block in ipairs(caption.long) do + table.insert(content, block) + end + end + + -- Build list of rows (each row is a list item containing a bullet list of cells) + local row_items = {} + + -- Add header rows + for _, row in ipairs(thead_rows) do + local cells = row.cells + local cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + -- Each row item contains a single bullet list of cells + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + + -- Add body rows from all table bodies + for _, tbody in ipairs(tbodies) do + for _, row in ipairs(tbody.body) do + local cells = row.cells + local 
cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + -- Each row item contains a single bullet list of cells + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + end + + -- Add footer rows if any + for _, row in ipairs(tfoot.rows) do + local cells = row.cells + local cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + + -- Create the outer bullet list (list of rows) + table.insert(content, pandoc.BulletList(row_items)) + + return pandoc.Div(content, div_attr) +end + +return {{Table = table_to_list_table}} diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs new file mode 100644 index 0000000..18adc24 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -0,0 +1,231 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use regex::Regex; +use std::path::Path; +use std::process::{Command, Stdio}; + +use crate::utils::file_io::{read_file, write_file}; +use crate::utils::resources::ResourceManager; +use quarto_markdown_pandoc::readers::json; +use quarto_markdown_pandoc::writers::qmd; + +pub struct GridTableConverter { + grid_start_regex: Regex, + table_line_regex: Regex, + caption_regex: Regex, + resources: ResourceManager, +} + +#[derive(Debug)] +pub struct GridTable { + pub text: String, + pub start_line: usize, + pub end_line: usize, +} + +impl GridTableConverter { + pub fn new() -> Result { + Ok(Self { + // Matches lines that start with + and contain - or = + grid_start_regex: Regex::new(r"^\+[-=+]+\+").unwrap(), + // Matches table content lines (start with + or |) + table_line_regex: Regex::new(r"^[+|]").unwrap(), + // Matches caption lines (start with :) + caption_regex: Regex::new(r"^:").unwrap(), + resources: ResourceManager::new()?, + }) 
+ } + + /// Find all grid tables in the content + pub fn find_grid_tables(&self, content: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + let mut tables = Vec::new(); + let mut i = 0; + + while i < lines.len() { + let line = lines[i]; + + // Check if this line starts a grid table + if self.grid_start_regex.is_match(line) { + let start_idx = i; + let mut table_lines = vec![line]; + i += 1; + + // Collect all lines that are part of the table + while i < lines.len() { + let line = lines[i]; + + // Table content lines start with + or | + if self.table_line_regex.is_match(line) { + table_lines.push(line); + i += 1; + } + // Caption line starts with : and must immediately follow table + else if self.caption_regex.is_match(line) + && i == start_idx + table_lines.len() + { + table_lines.push(line); + i += 1; + break; + } else { + break; + } + } + + // Found a complete table + let table_text = table_lines.join("\n"); + tables.push(GridTable { + text: table_text, + start_line: start_idx, + end_line: i - 1, + }); + } else { + i += 1; + } + } + + tables + } + + /// Convert a single grid table by: + /// 1. Running pandoc with the Lua filter to convert to JSON + /// 2. 
Running quarto-markdown-pandoc to convert JSON to markdown + pub fn convert_table(&self, table_text: &str) -> Result { + use std::io::Write; + + // Get the Lua filter path from resources + let filter_path = self + .resources + .get_resource("filters/grid-table-to-list-table.lua")?; + + // Step 1: pandoc -f markdown -t json -L filter.lua + let mut pandoc = Command::new("pandoc") + .args(&["-f", "markdown", "-t", "json"]) + .arg("-L") + .arg(&filter_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn pandoc")?; + + { + let stdin = pandoc + .stdin + .as_mut() + .context("Failed to get pandoc stdin")?; + stdin.write_all(table_text.as_bytes())?; + } + + let pandoc_output = pandoc.wait_with_output()?; + + if !pandoc_output.status.success() { + anyhow::bail!( + "pandoc failed: {}", + String::from_utf8_lossy(&pandoc_output.stderr) + ); + } + + // Step 2: Use library to convert JSON to markdown + let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); + let (pandoc_ast, _ctx) = json::read(&mut json_reader) + .context("Failed to parse JSON output from pandoc")?; + + let mut output = Vec::new(); + qmd::write(&pandoc_ast, &mut output) + .context("Failed to write markdown output")?; + + let result = String::from_utf8(output) + .context("Failed to parse output as UTF-8")? 
+ .trim_end() + .to_string(); + + Ok(result) + } + + /// Process a single file + pub fn process_file( + &self, + file_path: &Path, + in_place: bool, + check: bool, + verbose: bool, + ) -> Result<()> { + let content = read_file(file_path)?; + let tables = self.find_grid_tables(&content); + + if tables.is_empty() { + if verbose { + println!(" No grid tables found"); + } + return Ok(()); + } + + if verbose || check { + println!( + " Found {} grid table(s)", + tables.len().to_string().yellow() + ); + } + + // Convert each table and build new content + let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; // Track line offset as we modify + + for (idx, table) in tables.iter().enumerate() { + if verbose { + println!(" Converting table {}...", idx + 1); + } + + let converted = self.convert_table(&table.text)?; + + // Calculate actual line positions with offset + let start = (table.start_line as isize + offset) as usize; + let end = (table.end_line as isize + offset) as usize; + + if check { + println!( + " Table {} at lines {}-{}:", + idx + 1, + table.start_line, + table.end_line + ); + println!( + " {} {} lines -> {} {} lines", + "Original:".red(), + table.end_line - table.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + // Replace the table in the lines + let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + // Splice in the new lines + lines.splice(start..=end, converted_lines); + + // Update offset for next table + offset += new_len as isize - old_len as isize; + } + + if check { + println!(" {} No changes written (--check mode)", "✓".green()); + return Ok(()); + } + + let new_content = lines.join("\n") + "\n"; + + if in_place { + write_file(file_path, &new_content)?; + println!(" {} Converted {} table(s)", "✓".green(), tables.len()); + } else { + // Output to stdout + print!("{}", 
new_content); + } + + Ok(()) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/mod.rs b/crates/qmd-syntax-helper/src/conversions/mod.rs new file mode 100644 index 0000000..4cc9258 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/mod.rs @@ -0,0 +1 @@ +pub mod grid_tables; diff --git a/crates/qmd-syntax-helper/src/lib.rs b/crates/qmd-syntax-helper/src/lib.rs new file mode 100644 index 0000000..a9c7828 --- /dev/null +++ b/crates/qmd-syntax-helper/src/lib.rs @@ -0,0 +1,2 @@ +pub mod conversions; +pub mod utils; diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs new file mode 100644 index 0000000..c2fe51e --- /dev/null +++ b/crates/qmd-syntax-helper/src/main.rs @@ -0,0 +1,64 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use std::path::PathBuf; + +mod conversions; +mod utils; + +use conversions::grid_tables::GridTableConverter; + +#[derive(Parser)] +#[command(name = "qmd-syntax-helper")] +#[command(about = "Helper tool for converting and fixing Quarto Markdown syntax")] +#[command(version)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Convert grid tables to list-table format + UngridTables { + /// Input files (can be multiple files or glob patterns) + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + match cli.command { + Commands::UngridTables { + files, + in_place, + check, + verbose, + } => { + let converter = GridTableConverter::new()?; + + for file_path in files { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + } +} diff --git 
a/crates/qmd-syntax-helper/src/utils/file_io.rs b/crates/qmd-syntax-helper/src/utils/file_io.rs new file mode 100644 index 0000000..baa7fc9 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/file_io.rs @@ -0,0 +1,13 @@ +use anyhow::{Context, Result}; +use std::fs; +use std::path::Path; + +/// Read a file to a string +pub fn read_file(path: &Path) -> Result { + fs::read_to_string(path).with_context(|| format!("Failed to read file: {}", path.display())) +} + +/// Write content to a file +pub fn write_file(path: &Path, content: &str) -> Result<()> { + fs::write(path, content).with_context(|| format!("Failed to write file: {}", path.display())) +} diff --git a/crates/qmd-syntax-helper/src/utils/mod.rs b/crates/qmd-syntax-helper/src/utils/mod.rs new file mode 100644 index 0000000..99ac0b5 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/mod.rs @@ -0,0 +1,2 @@ +pub mod file_io; +pub mod resources; diff --git a/crates/qmd-syntax-helper/src/utils/resources.rs b/crates/qmd-syntax-helper/src/utils/resources.rs new file mode 100644 index 0000000..5c0d037 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/resources.rs @@ -0,0 +1,138 @@ +use anyhow::{Context, Result}; +use include_dir::{include_dir, Dir}; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; + +static RESOURCE_MANAGER_COUNTER: AtomicU64 = AtomicU64::new(0); +static RESOURCES_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/resources"); + +/// A resource manager that embeds files at compile time and extracts them +/// to a temporary directory at runtime. Automatically cleans up on drop. 
+pub struct ResourceManager { + temp_dir: PathBuf, +} + +impl ResourceManager { + /// Create a new resource manager with embedded resources + pub fn new() -> Result { + // Use both process ID and a unique counter to avoid conflicts between + // multiple ResourceManager instances in the same process (e.g., parallel tests) + let instance_id = RESOURCE_MANAGER_COUNTER.fetch_add(1, Ordering::SeqCst); + let temp_dir = std::env::temp_dir().join(format!( + "qmd-syntax-helper-{}-{}", + std::process::id(), + instance_id + )); + + fs::create_dir_all(&temp_dir) + .with_context(|| format!("Failed to create temp directory: {}", temp_dir.display()))?; + + Ok(Self { temp_dir }) + } + + /// Get a path to a resource, extracting it to temp dir if needed + pub fn get_resource(&self, path: &str) -> Result { + // Find the file in the embedded directory + let file = RESOURCES_DIR + .get_file(path) + .ok_or_else(|| anyhow::anyhow!("Resource not found: {}", path))?; + + // Determine output path in temp directory + let output_path = self.temp_dir.join(path); + + // Create parent directories if needed + if let Some(parent) = output_path.parent() { + fs::create_dir_all(parent)?; + } + + // Write the resource to the temp directory + fs::write(&output_path, file.contents()) + .with_context(|| format!("Failed to write resource to: {}", output_path.display()))?; + + Ok(output_path) + } + + /// Get the temp directory path + pub fn temp_dir(&self) -> &Path { + &self.temp_dir + } + + /// List all available resources + pub fn list_resources(&self) -> Vec { + let mut resources = Vec::new(); + Self::collect_files(&RESOURCES_DIR, "", &mut resources); + resources + } + + /// Recursively collect all file paths from a directory + fn collect_files(dir: &Dir, prefix: &str, resources: &mut Vec) { + for file in dir.files() { + let name = file.path().file_name().unwrap().to_string_lossy(); + let full_path = if prefix.is_empty() { + name.to_string() + } else { + format!("{}/{}", prefix, name) + }; + 
resources.push(full_path); + } + + for subdir in dir.dirs() { + let name = subdir.path().file_name().unwrap().to_string_lossy(); + let new_prefix = if prefix.is_empty() { + name.to_string() + } else { + format!("{}/{}", prefix, name) + }; + Self::collect_files(subdir, &new_prefix, resources); + } + } +} + +impl Drop for ResourceManager { + fn drop(&mut self) { + // Clean up temp directory + if self.temp_dir.exists() { + // Ignore errors here so it works well under stack unwinding + let _ = fs::remove_dir_all(&self.temp_dir); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resource_manager_creates_temp_dir() { + let rm = ResourceManager::new().unwrap(); + assert!(rm.temp_dir().exists()); + } + + #[test] + fn test_resource_manager_lists_resources() { + let rm = ResourceManager::new().unwrap(); + let resources = rm.list_resources(); + assert!(resources.contains(&"filters/grid-table-to-list-table.lua".to_string())); + } + + #[test] + fn test_resource_manager_extracts_resource() { + let rm = ResourceManager::new().unwrap(); + let path = rm + .get_resource("filters/grid-table-to-list-table.lua") + .unwrap(); + assert!(path.exists()); + assert!(fs::read_to_string(&path).unwrap().contains("Lua filter")); + } + + #[test] + fn test_resource_manager_cleans_up() { + let temp_dir = { + let rm = ResourceManager::new().unwrap(); + rm.temp_dir().to_path_buf() + }; + // After rm is dropped, temp dir should be cleaned up + assert!(!temp_dir.exists()); + } +} diff --git a/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md b/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md new file mode 100644 index 0000000..a68d0b0 --- /dev/null +++ b/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md @@ -0,0 +1,7 @@ ++-----------+-----------+ +| Header 1 | Header 2 | ++===========+===========+ +| Cell 1 | Cell 2 | ++-----------+-----------+ +| Cell 3 | Cell 4 | ++-----------+-----------+ diff --git 
a/crates/qmd-syntax-helper/tests/grid_tables_test.rs b/crates/qmd-syntax-helper/tests/grid_tables_test.rs new file mode 100644 index 0000000..e660c32 --- /dev/null +++ b/crates/qmd-syntax-helper/tests/grid_tables_test.rs @@ -0,0 +1,39 @@ +use qmd_syntax_helper::conversions::grid_tables::GridTableConverter; +use std::fs; +use std::path::Path; + +#[test] +fn test_finds_simple_grid_table() { + let converter = GridTableConverter::new().expect("Failed to create converter"); + let fixture_path = Path::new("tests/fixtures/simple-grid-table.md"); + let content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); + + // The converter should find one grid table + let tables = converter.find_grid_tables(&content); + assert_eq!(tables.len(), 1); + + // The table should span lines 0-5 (6 lines total) + assert_eq!(tables[0].start_line, 0); + assert_eq!(tables[0].end_line, 6); +} + +#[test] +fn test_converts_grid_table() { + let converter = GridTableConverter::new().expect("Failed to create converter"); + let fixture_path = Path::new("tests/fixtures/simple-grid-table.md"); + let content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); + + let tables = converter.find_grid_tables(&content); + assert_eq!(tables.len(), 1); + + // Convert the table + let converted = converter + .convert_table(&tables[0].text) + .expect("Failed to convert table"); + + // The converted output should contain list-table syntax + assert!(converted.contains("::: {.list-table")); + assert!(converted.contains("header-rows=")); + assert!(converted.contains("* * Header 1")); + assert!(converted.contains("* * Cell 1")); +} From a0921b1af8b44269a8f8a6224066142ad1637b24 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 10:51:17 -0500 Subject: [PATCH 05/11] defn list checking, glob expansion --- crates/qmd-syntax-helper/Cargo.toml | 3 + .../filters/definition-list-to-div.lua | 60 ++++ .../src/conversions/definition_lists.rs | 268 ++++++++++++++++++ 
.../qmd-syntax-helper/src/conversions/mod.rs | 1 + .../qmd-syntax-helper/src/diagnostics/mod.rs | 1 + .../src/diagnostics/syntax_check.rs | 123 ++++++++ crates/qmd-syntax-helper/src/lib.rs | 1 + crates/qmd-syntax-helper/src/main.rs | 103 ++++++- .../src/utils/glob_expand.rs | 49 ++++ crates/qmd-syntax-helper/src/utils/mod.rs | 1 + 10 files changed, 607 insertions(+), 3 deletions(-) create mode 100644 crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua create mode 100644 crates/qmd-syntax-helper/src/conversions/definition_lists.rs create mode 100644 crates/qmd-syntax-helper/src/diagnostics/mod.rs create mode 100644 crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs create mode 100644 crates/qmd-syntax-helper/src/utils/glob_expand.rs diff --git a/crates/qmd-syntax-helper/Cargo.toml b/crates/qmd-syntax-helper/Cargo.toml index 5de8fc0..9b4f98c 100644 --- a/crates/qmd-syntax-helper/Cargo.toml +++ b/crates/qmd-syntax-helper/Cargo.toml @@ -24,6 +24,9 @@ regex = "1.10" colored = "2.1" quarto-markdown-pandoc.workspace = true include_dir = "0.7" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +glob = "0.3" [lints] workspace = true diff --git a/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua b/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua new file mode 100644 index 0000000..1f01c12 --- /dev/null +++ b/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua @@ -0,0 +1,60 @@ +-- Lua filter to convert Pandoc DefinitionList AST nodes to div-based definition lists +-- This produces output in the definition-list div syntax used by quarto-markdown + +if PANDOC_VERSION and PANDOC_VERSION.must_be_at_least then + PANDOC_VERSION:must_be_at_least("2.11") +else + error("pandoc version >=2.11 is required") +end + +-- Convert a DefinitionList to a div with .definition-list class +local function definition_list_to_div(def_list) + -- Build div attributes with .definition-list class + 
local div_attr = pandoc.Attr('', {'definition-list'}, {}) + + -- Build the outer bullet list containing all term-definition pairs + local outer_items = {} + + -- Each item in the definition list is a tuple: (term, definitions) + -- term: list of inline elements + -- definitions: list of definition blocks (each definition is a list of blocks) + for _, item in ipairs(def_list.content) do + local term = item[1] -- List of inline elements + local definitions = item[2] -- List of definition blocks + + -- Create the inner bullet list containing the definitions + local def_items = {} + for _, def_blocks in ipairs(definitions) do + -- Each definition is a list of blocks + -- Clone the blocks to avoid modifying the original + local blocks = pandoc.Blocks({}) + for _, block in ipairs(def_blocks) do + table.insert(blocks, block:clone()) + end + + -- Ensure we have at least one block + if #blocks == 0 then + blocks = pandoc.Blocks({pandoc.Para({})}) + end + + table.insert(def_items, blocks) + end + + -- Create a bullet list for the definitions + local def_list_elem = pandoc.BulletList(def_items) + + -- Create the outer list item containing: + -- 1. The term as a paragraph + -- 2. 
The nested bullet list of definitions + local term_para = pandoc.Para(term) + table.insert(outer_items, {term_para, def_list_elem}) + end + + -- Create the outer bullet list (list of term-definition pairs) + local outer_list = pandoc.BulletList(outer_items) + + -- Create the div containing the outer list + return pandoc.Div({outer_list}, div_attr) +end + +return {{DefinitionList = definition_list_to_div}} diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs new file mode 100644 index 0000000..652029d --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -0,0 +1,268 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use regex::Regex; +use std::path::Path; +use std::process::{Command, Stdio}; + +use crate::utils::file_io::{read_file, write_file}; +use crate::utils::resources::ResourceManager; +use quarto_markdown_pandoc::readers::json; +use quarto_markdown_pandoc::writers::qmd; + +pub struct DefinitionListConverter { + def_item_regex: Regex, + resources: ResourceManager, +} + +#[derive(Debug)] +pub struct DefinitionList { + pub text: String, + pub start_line: usize, + pub end_line: usize, +} + +impl DefinitionListConverter { + pub fn new() -> Result { + Ok(Self { + // Matches definition list items that start with `:` followed by spaces + def_item_regex: Regex::new(r"^:\s+").unwrap(), + resources: ResourceManager::new()?, + }) + } + + /// Find all definition lists in the content + pub fn find_definition_lists(&self, content: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + let mut lists = Vec::new(); + let mut i = 0; + + while i < lines.len() { + let line = lines[i]; + + // Look for a definition item (line starting with `: `) + // But not div fences (`::`or `:::`) + if self.def_item_regex.is_match(line) && !line.starts_with("::") { + // Found a definition item, now scan backwards to find the term + let mut start_idx = i; + + // 
Skip back over any blank lines + while start_idx > 0 && lines[start_idx - 1].trim().is_empty() { + start_idx -= 1; + } + + // The line before the blank lines should be the term + if start_idx > 0 { + start_idx -= 1; + } + + // Now scan forward to collect all terms and definitions in this list + let mut end_idx = i; + i += 1; + + loop { + // Continue through continuation lines and blank lines + while i < lines.len() { + let line = lines[i]; + if line.starts_with(" ") || line.trim().is_empty() { + end_idx = i; + i += 1; + } else { + break; + } + } + + // Check if the next item is part of this definition list + // It should be: optional non-blank line (term), then blank lines, then `: ` + if i < lines.len() { + let potential_term = lines[i]; + + // Not a definition line, might be next term + if !self.def_item_regex.is_match(potential_term) + || potential_term.starts_with("::") { + // Look ahead for a definition line + let mut j = i + 1; + while j < lines.len() && lines[j].trim().is_empty() { + j += 1; + } + + if j < lines.len() + && self.def_item_regex.is_match(lines[j]) + && !lines[j].starts_with("::") { + // Found another term-definition pair + end_idx = j; + i = j + 1; + continue; + } else { + // No more definition items + break; + } + } else { + // This IS a definition line (continuation of same term) + end_idx = i; + i += 1; + continue; + } + } else { + break; + } + } + + // Extract the definition list text + let list_lines = &lines[start_idx..=end_idx]; + let list_text = list_lines.join("\n"); + + lists.push(DefinitionList { + text: list_text, + start_line: start_idx, + end_line: end_idx, + }); + } else { + i += 1; + } + } + + lists + } + + /// Convert a single definition list by: + /// 1. Running pandoc with the Lua filter to convert to JSON + /// 2. 
Using quarto-markdown-pandoc library to convert JSON to markdown + pub fn convert_list(&self, list_text: &str) -> Result { + use std::io::Write; + + // Get the Lua filter path from resources + let filter_path = self + .resources + .get_resource("filters/definition-list-to-div.lua")?; + + // Step 1: pandoc -f markdown -t json -L filter.lua + let mut pandoc = Command::new("pandoc") + .args(&["-f", "markdown", "-t", "json"]) + .arg("-L") + .arg(&filter_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn pandoc")?; + + { + let stdin = pandoc + .stdin + .as_mut() + .context("Failed to get pandoc stdin")?; + stdin.write_all(list_text.as_bytes())?; + } + + let pandoc_output = pandoc.wait_with_output()?; + + if !pandoc_output.status.success() { + anyhow::bail!( + "pandoc failed: {}", + String::from_utf8_lossy(&pandoc_output.stderr) + ); + } + + // Step 2: Use library to convert JSON to markdown + let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); + let (pandoc_ast, _ctx) = json::read(&mut json_reader) + .context("Failed to parse JSON output from pandoc")?; + + let mut output = Vec::new(); + qmd::write(&pandoc_ast, &mut output) + .context("Failed to write markdown output")?; + + let result = String::from_utf8(output) + .context("Failed to parse output as UTF-8")? 
+ .trim_end() + .to_string(); + + Ok(result) + } + + /// Process a single file + pub fn process_file( + &self, + file_path: &Path, + in_place: bool, + check: bool, + verbose: bool, + ) -> Result<()> { + let content = read_file(file_path)?; + let lists = self.find_definition_lists(&content); + + if lists.is_empty() { + if verbose { + println!(" No definition lists found"); + } + return Ok(()); + } + + if verbose || check { + println!( + " Found {} definition list(s)", + lists.len().to_string().yellow() + ); + } + + // Convert each list and build new content + let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; // Track line offset as we modify + + for (idx, list) in lists.iter().enumerate() { + if verbose { + println!(" Converting list {}...", idx + 1); + } + + let converted = self.convert_list(&list.text)?; + + // Calculate actual line positions with offset + let start = (list.start_line as isize + offset) as usize; + let end = (list.end_line as isize + offset) as usize; + + if check { + println!( + " List {} at lines {}-{}:", + idx + 1, + list.start_line, + list.end_line + ); + println!( + " {} {} lines -> {} {} lines", + "Original:".red(), + list.end_line - list.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + // Replace the list in the lines + let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + // Splice in the new lines + lines.splice(start..=end, converted_lines); + + // Update offset for next list + offset += new_len as isize - old_len as isize; + } + + if check { + println!(" {} No changes written (--check mode)", "✓".green()); + return Ok(()); + } + + let new_content = lines.join("\n") + "\n"; + + if in_place { + write_file(file_path, &new_content)?; + println!(" {} Converted {} list(s)", "✓".green(), lists.len()); + } else { + // Output to stdout + print!("{}", new_content); + 
} + + Ok(()) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/mod.rs b/crates/qmd-syntax-helper/src/conversions/mod.rs index 4cc9258..4cccac7 100644 --- a/crates/qmd-syntax-helper/src/conversions/mod.rs +++ b/crates/qmd-syntax-helper/src/conversions/mod.rs @@ -1 +1,2 @@ +pub mod definition_lists; pub mod grid_tables; diff --git a/crates/qmd-syntax-helper/src/diagnostics/mod.rs b/crates/qmd-syntax-helper/src/diagnostics/mod.rs new file mode 100644 index 0000000..370c564 --- /dev/null +++ b/crates/qmd-syntax-helper/src/diagnostics/mod.rs @@ -0,0 +1 @@ +pub mod syntax_check; diff --git a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs new file mode 100644 index 0000000..71510ce --- /dev/null +++ b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs @@ -0,0 +1,123 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CheckResult { + pub file: PathBuf, + pub success: bool, + pub error_message: Option, +} + +pub struct SyntaxChecker { + pub results: Vec, +} + +impl SyntaxChecker { + pub fn new() -> Self { + Self { + results: Vec::new(), + } + } + + /// Check a single file by attempting to parse it + pub fn check_file(&mut self, file_path: &Path, verbose: bool) -> Result<()> { + if verbose { + print!("Checking: {} ... 
", file_path.display()); + } + + let result = self.parse_file(file_path); + + match &result { + Ok(_) => { + if verbose { + println!("{}", "✓".green()); + } + self.results.push(CheckResult { + file: file_path.to_path_buf(), + success: true, + error_message: None, + }); + } + Err(e) => { + if verbose { + println!("{}", "✗".red()); + println!(" Error: {}", e); + } + self.results.push(CheckResult { + file: file_path.to_path_buf(), + success: false, + error_message: Some(e.to_string()), + }); + } + } + + Ok(()) + } + + /// Parse a file using quarto-markdown-pandoc + fn parse_file(&self, file_path: &Path) -> Result<()> { + let content = fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + // Use the quarto-markdown-pandoc library to parse + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, // not loose mode + &filename, + &mut sink, + None:: Vec>, // no custom error formatter + ); + + match result { + Ok(_) => Ok(()), + Err(errors) => { + // Join error messages + let error_msg = errors.join("\n"); + Err(anyhow::anyhow!("{}", error_msg)) + } + } + } + + /// Print a summary of the results + pub fn print_summary(&self) { + let total = self.results.len(); + let successes = self.results.iter().filter(|r| r.success).count(); + let failures = total - successes; + + println!("\n{}", "=== Summary ===".bold()); + println!("Total files: {}", total); + println!("Successful: {} {}", successes, "✓".green()); + println!("Failed: {} {}", failures, if failures > 0 { "✗".red() } else { "✓".green() }); + + if failures > 0 { + let success_rate = (successes as f64 / total as f64) * 100.0; + println!("Success rate: {:.1}%", success_rate); + } + } + + /// Get a list of failed files + pub fn failed_files(&self) -> Vec<&CheckResult> { + self.results.iter().filter(|r| !r.success).collect() + } + + /// Export results as 
JSONL + pub fn export_jsonl(&self, output_path: &Path) -> Result<()> { + let mut output = String::new(); + for result in &self.results { + let json = serde_json::to_string(result)?; + output.push_str(&json); + output.push('\n'); + } + + fs::write(output_path, output) + .with_context(|| format!("Failed to write to: {}", output_path.display()))?; + + Ok(()) + } +} diff --git a/crates/qmd-syntax-helper/src/lib.rs b/crates/qmd-syntax-helper/src/lib.rs index a9c7828..b64b3a4 100644 --- a/crates/qmd-syntax-helper/src/lib.rs +++ b/crates/qmd-syntax-helper/src/lib.rs @@ -1,2 +1,3 @@ pub mod conversions; +pub mod diagnostics; pub mod utils; diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs index c2fe51e..43069f0 100644 --- a/crates/qmd-syntax-helper/src/main.rs +++ b/crates/qmd-syntax-helper/src/main.rs @@ -3,9 +3,13 @@ use clap::{Parser, Subcommand}; use std::path::PathBuf; mod conversions; +mod diagnostics; mod utils; +use conversions::definition_lists::DefinitionListConverter; use conversions::grid_tables::GridTableConverter; +use diagnostics::syntax_check::SyntaxChecker; +use utils::glob_expand::expand_globs; #[derive(Parser)] #[command(name = "qmd-syntax-helper")] @@ -20,9 +24,9 @@ struct Cli { enum Commands { /// Convert grid tables to list-table format UngridTables { - /// Input files (can be multiple files or glob patterns) + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") #[arg(required = true)] - files: Vec, + files: Vec, /// Edit files in place #[arg(short, long)] @@ -36,6 +40,44 @@ enum Commands { #[arg(short, long)] verbose: bool, }, + + /// Convert definition lists to div-based format + UndefLists { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + 
+ /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Check syntax of files and report errors + Check { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Show verbose output (each file as processed) + #[arg(short, long)] + verbose: bool, + + /// Output results as JSONL + #[arg(long)] + json: bool, + + /// Save detailed results to file + #[arg(short, long)] + output: Option, + }, } fn main() -> Result<()> { @@ -49,8 +91,9 @@ fn main() -> Result<()> { verbose, } => { let converter = GridTableConverter::new()?; + let file_paths = expand_globs(&files)?; - for file_path in files { + for file_path in file_paths { if verbose { println!("Processing: {}", file_path.display()); } @@ -58,6 +101,60 @@ fn main() -> Result<()> { converter.process_file(&file_path, in_place, check, verbose)?; } + Ok(()) + } + Commands::UndefLists { + files, + in_place, + check, + verbose, + } => { + let converter = DefinitionListConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::Check { + files, + verbose, + json, + output, + } => { + let mut checker = SyntaxChecker::new(); + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + checker.check_file(&file_path, verbose)?; + } + + // Print summary if not JSON mode + if !json { + checker.print_summary(); + } + + // Save to output file if specified + if let Some(output_path) = output { + checker.export_jsonl(&output_path)?; + if !json { + println!("\nDetailed results written to: {}", output_path.display()); + } + } + + // Print JSON to stdout if requested + if json { + for result in &checker.results { + println!("{}", serde_json::to_string(result)?); + } + } + Ok(()) } } diff --git 
a/crates/qmd-syntax-helper/src/utils/glob_expand.rs b/crates/qmd-syntax-helper/src/utils/glob_expand.rs new file mode 100644 index 0000000..e87b7d5 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/glob_expand.rs @@ -0,0 +1,49 @@ +use anyhow::{Context, Result}; +use std::path::PathBuf; + +/// Expand glob patterns into a list of file paths +/// +/// If a pattern doesn't contain glob characters (*, ?, [, ]), +/// treat it as a literal path. +pub fn expand_globs(patterns: &[String]) -> Result> { + let mut files = Vec::new(); + + for pattern in patterns { + // Check if pattern contains glob characters + if pattern.contains('*') || pattern.contains('?') || pattern.contains('[') { + // It's a glob pattern - expand it + let paths = glob::glob(pattern) + .with_context(|| format!("Invalid glob pattern: {}", pattern))?; + + for path in paths { + let path = path.with_context(|| format!("Failed to read glob match for: {}", pattern))?; + files.push(path); + } + } else { + // It's a literal path - use as-is + files.push(PathBuf::from(pattern)); + } + } + + Ok(files) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_literal_path() { + let patterns = vec!["test.qmd".to_string()]; + let result = expand_globs(&patterns).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0], PathBuf::from("test.qmd")); + } + + #[test] + fn test_multiple_literals() { + let patterns = vec!["a.qmd".to_string(), "b.qmd".to_string()]; + let result = expand_globs(&patterns).unwrap(); + assert_eq!(result.len(), 2); + } +} diff --git a/crates/qmd-syntax-helper/src/utils/mod.rs b/crates/qmd-syntax-helper/src/utils/mod.rs index 99ac0b5..2054ac3 100644 --- a/crates/qmd-syntax-helper/src/utils/mod.rs +++ b/crates/qmd-syntax-helper/src/utils/mod.rs @@ -1,2 +1,3 @@ pub mod file_io; +pub mod glob_expand; pub mod resources; From 9d9a546216e606185f39434d2be8ea9f73b03000 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 11:10:36 -0500 Subject: [PATCH 06/11] div 
whitespace --- Cargo.lock | 3 + .../src/conversions/definition_lists.rs | 13 +- .../src/conversions/div_whitespace.rs | 210 ++++++++++++++++++ .../src/conversions/grid_tables.rs | 7 +- .../qmd-syntax-helper/src/conversions/mod.rs | 1 + .../src/diagnostics/syntax_check.rs | 18 +- crates/qmd-syntax-helper/src/main.rs | 39 ++++ .../src/utils/glob_expand.rs | 3 +- .../qmd-syntax-helper/src/utils/resources.rs | 2 +- .../tests/div_whitespace_test.rs | 145 ++++++++++++ 10 files changed, 427 insertions(+), 14 deletions(-) create mode 100644 crates/qmd-syntax-helper/src/conversions/div_whitespace.rs create mode 100644 crates/qmd-syntax-helper/tests/div_whitespace_test.rs diff --git a/Cargo.lock b/Cargo.lock index 07b2087..db17e8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -391,9 +391,12 @@ dependencies = [ "anyhow", "clap", "colored", + "glob", "include_dir", "quarto-markdown-pandoc", "regex", + "serde", + "serde_json", ] [[package]] diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs index 652029d..aa48aeb 100644 --- a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -78,7 +78,8 @@ impl DefinitionListConverter { // Not a definition line, might be next term if !self.def_item_regex.is_match(potential_term) - || potential_term.starts_with("::") { + || potential_term.starts_with("::") + { // Look ahead for a definition line let mut j = i + 1; while j < lines.len() && lines[j].trim().is_empty() { @@ -87,7 +88,8 @@ impl DefinitionListConverter { if j < lines.len() && self.def_item_regex.is_match(lines[j]) - && !lines[j].starts_with("::") { + && !lines[j].starts_with("::") + { // Found another term-definition pair end_idx = j; i = j + 1; @@ -165,12 +167,11 @@ impl DefinitionListConverter { // Step 2: Use library to convert JSON to markdown let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); - let 
(pandoc_ast, _ctx) = json::read(&mut json_reader) - .context("Failed to parse JSON output from pandoc")?; + let (pandoc_ast, _ctx) = + json::read(&mut json_reader).context("Failed to parse JSON output from pandoc")?; let mut output = Vec::new(); - qmd::write(&pandoc_ast, &mut output) - .context("Failed to write markdown output")?; + qmd::write(&pandoc_ast, &mut output).context("Failed to write markdown output")?; let result = String::from_utf8(output) .context("Failed to parse output as UTF-8")? diff --git a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs new file mode 100644 index 0000000..6b45263 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs @@ -0,0 +1,210 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::Path; + +use crate::utils::file_io::{read_file, write_file}; + +#[derive(Debug, Serialize, Deserialize)] +struct ErrorLocation { + row: usize, + column: usize, + byte_offset: usize, + size: usize, +} + +#[derive(Debug, Serialize, Deserialize)] +struct ParseError { + filename: String, + title: String, + message: String, + location: ErrorLocation, +} + +pub struct DivWhitespaceConverter {} + +impl DivWhitespaceConverter { + pub fn new() -> Result { + Ok(Self {}) + } + + /// Parse a file and get error locations as JSON + fn get_parse_errors(&self, file_path: &Path) -> Result> { + let content = fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + // Use the quarto-markdown-pandoc library to parse with JSON error formatter + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, // not loose mode + &filename, + &mut sink, + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as 
fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + ), + ); + + match result { + Ok(_) => Ok(Vec::new()), // No errors + Err(error_messages) => { + // Parse the JSON error output + // The error messages come as a single JSON array string + if error_messages.is_empty() { + return Ok(Vec::new()); + } + + let json_str = error_messages.join(""); + let errors: Vec = + serde_json::from_str(&json_str).context("Failed to parse JSON error output")?; + + Ok(errors) + } + } + } + + /// Find div fence errors that need whitespace fixes + fn find_div_whitespace_errors(&self, content: &str, errors: &[ParseError]) -> Vec { + let mut fix_positions = Vec::new(); + let lines: Vec<&str> = content.lines().collect(); + + for error in errors { + // Skip errors that are not about div fences + // We're looking for "Missing Space After Div Fence" or errors on lines with ::: + let is_div_error = error.title.contains("Div Fence") || error.title == "Parse error"; + + if !is_div_error { + continue; + } + + // The error might be on the line itself or the line before (for div fences) + // Check both the current line and the previous line + let lines_to_check = if error.location.row > 0 { + vec![error.location.row - 1, error.location.row] + } else { + vec![error.location.row] + }; + + for &line_idx in &lines_to_check { + if line_idx >= lines.len() { + continue; + } + + let line = lines[line_idx]; + + // Check if this line starts with ::: followed immediately by { + let trimmed = line.trim_start(); + if let Some(after_colon) = trimmed.strip_prefix(":::") { + if after_colon.starts_with('{') { + // Calculate the position right after ::: + // We need byte offset, not char offset + let line_start = content + .lines() + .take(line_idx) + .map(|l| l.len() + 1) // +1 for newline + .sum::(); + + let indent_bytes = line.len() - trimmed.len(); + let fix_pos = line_start + indent_bytes + 3; // +3 for ":::" + + fix_positions.push(fix_pos); + 
break; // Found it, no need to check other lines for this error + } + } + } + } + + // Remove duplicates and sort + fix_positions.sort_unstable(); + fix_positions.dedup(); + + fix_positions + } + + /// Apply fixes to content by inserting spaces at specified positions + fn apply_fixes(&self, content: &str, fix_positions: &[usize]) -> String { + let mut result = String::with_capacity(content.len() + fix_positions.len()); + let mut last_pos = 0; + + for &pos in fix_positions { + // Copy content up to this position + result.push_str(&content[last_pos..pos]); + // Insert a space + result.push(' '); + last_pos = pos; + } + + // Copy remaining content + result.push_str(&content[last_pos..]); + + result + } + + /// Process a single file + pub fn process_file( + &self, + file_path: &Path, + in_place: bool, + check: bool, + verbose: bool, + ) -> Result<()> { + let content = read_file(file_path)?; + + // Get parse errors + let errors = self.get_parse_errors(file_path)?; + + if errors.is_empty() { + if verbose { + println!(" No div whitespace issues found"); + } + return Ok(()); + } + + // Find positions that need fixes + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if fix_positions.is_empty() { + if verbose { + println!(" No div whitespace issues found"); + } + return Ok(()); + } + + if verbose || check { + println!( + " Found {} div fence(s) needing whitespace fixes", + fix_positions.len().to_string().yellow() + ); + } + + if check { + println!(" {} No changes written (--check mode)", "✓".green()); + return Ok(()); + } + + // Apply fixes + let new_content = self.apply_fixes(&content, &fix_positions); + + if in_place { + write_file(file_path, &new_content)?; + println!( + " {} Fixed {} div fence(s)", + "✓".green(), + fix_positions.len() + ); + } else { + // Output to stdout + print!("{}", new_content); + } + + Ok(()) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs 
b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs index 18adc24..1f303cf 100644 --- a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -128,12 +128,11 @@ impl GridTableConverter { // Step 2: Use library to convert JSON to markdown let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); - let (pandoc_ast, _ctx) = json::read(&mut json_reader) - .context("Failed to parse JSON output from pandoc")?; + let (pandoc_ast, _ctx) = + json::read(&mut json_reader).context("Failed to parse JSON output from pandoc")?; let mut output = Vec::new(); - qmd::write(&pandoc_ast, &mut output) - .context("Failed to write markdown output")?; + qmd::write(&pandoc_ast, &mut output).context("Failed to write markdown output")?; let result = String::from_utf8(output) .context("Failed to parse output as UTF-8")? diff --git a/crates/qmd-syntax-helper/src/conversions/mod.rs b/crates/qmd-syntax-helper/src/conversions/mod.rs index 4cccac7..9c282df 100644 --- a/crates/qmd-syntax-helper/src/conversions/mod.rs +++ b/crates/qmd-syntax-helper/src/conversions/mod.rs @@ -1,2 +1,3 @@ pub mod definition_lists; +pub mod div_whitespace; pub mod grid_tables; diff --git a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs index 71510ce..79f072d 100644 --- a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs +++ b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs @@ -71,7 +71,13 @@ impl SyntaxChecker { false, // not loose mode &filename, &mut sink, - None:: Vec>, // no custom error formatter + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, // no custom error formatter ); match result { @@ -93,7 +99,15 @@ impl SyntaxChecker { println!("\n{}", "=== Summary ===".bold()); println!("Total files: {}", total); println!("Successful: {} {}", successes, "✓".green()); - 
println!("Failed: {} {}", failures, if failures > 0 { "✗".red() } else { "✓".green() }); + println!( + "Failed: {} {}", + failures, + if failures > 0 { + "✗".red() + } else { + "✓".green() + } + ); if failures > 0 { let success_rate = (successes as f64 / total as f64) * 100.0; diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs index 43069f0..7db7b1b 100644 --- a/crates/qmd-syntax-helper/src/main.rs +++ b/crates/qmd-syntax-helper/src/main.rs @@ -7,6 +7,7 @@ mod diagnostics; mod utils; use conversions::definition_lists::DefinitionListConverter; +use conversions::div_whitespace::DivWhitespaceConverter; use conversions::grid_tables::GridTableConverter; use diagnostics::syntax_check::SyntaxChecker; use utils::glob_expand::expand_globs; @@ -60,6 +61,25 @@ enum Commands { verbose: bool, }, + /// Fix div fences missing whitespace (:::{ -> ::: {) + FixDivWhitespace { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + /// Check syntax of files and report errors Check { /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") @@ -122,6 +142,25 @@ fn main() -> Result<()> { Ok(()) } + Commands::FixDivWhitespace { + files, + in_place, + check, + verbose, + } => { + let converter = DivWhitespaceConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } Commands::Check { files, verbose, diff --git a/crates/qmd-syntax-helper/src/utils/glob_expand.rs b/crates/qmd-syntax-helper/src/utils/glob_expand.rs index e87b7d5..09762f4 
100644 --- a/crates/qmd-syntax-helper/src/utils/glob_expand.rs +++ b/crates/qmd-syntax-helper/src/utils/glob_expand.rs @@ -16,7 +16,8 @@ pub fn expand_globs(patterns: &[String]) -> Result> { .with_context(|| format!("Invalid glob pattern: {}", pattern))?; for path in paths { - let path = path.with_context(|| format!("Failed to read glob match for: {}", pattern))?; + let path = + path.with_context(|| format!("Failed to read glob match for: {}", pattern))?; files.push(path); } } else { diff --git a/crates/qmd-syntax-helper/src/utils/resources.rs b/crates/qmd-syntax-helper/src/utils/resources.rs index 5c0d037..f3d0286 100644 --- a/crates/qmd-syntax-helper/src/utils/resources.rs +++ b/crates/qmd-syntax-helper/src/utils/resources.rs @@ -1,5 +1,5 @@ use anyhow::{Context, Result}; -use include_dir::{include_dir, Dir}; +use include_dir::{Dir, include_dir}; use std::fs; use std::path::{Path, PathBuf}; use std::sync::atomic::{AtomicU64, Ordering}; diff --git a/crates/qmd-syntax-helper/tests/div_whitespace_test.rs b/crates/qmd-syntax-helper/tests/div_whitespace_test.rs new file mode 100644 index 0000000..215300d --- /dev/null +++ b/crates/qmd-syntax-helper/tests/div_whitespace_test.rs @@ -0,0 +1,145 @@ +use qmd_syntax_helper::conversions::div_whitespace::DivWhitespaceConverter; +use std::fs; + +#[test] +fn test_div_whitespace_conversion() { + let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id())); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + // Create test content with div fences missing whitespace + let input_content = r#"# Test file + +:::{.class} +Content with class +::: + +:::{#id} +Content with id +::: + +:::{} +Content with empty attrs +::: + +::: {.already-good} +Already has space +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + + let converter = DivWhitespaceConverter::new().unwrap(); + + // Process the file in-place + converter + .process_file(&test_file, true, false, 
false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // Verify all div fences now have spaces + assert!(result.contains("::: {.class}"), "Should fix :::{{.class}}"); + assert!(result.contains("::: {#id}"), "Should fix :::{{#id}}"); + assert!(result.contains("::: {}"), "Should fix :::{{}}"); + assert!( + result.contains("::: {.already-good}"), + "Should preserve already-good format" + ); + + // Clean up + std::fs::remove_dir_all(&temp_dir).ok(); + + // Verify content is preserved + assert!(result.contains("Content with class")); + assert!(result.contains("Content with id")); + assert!(result.contains("Content with empty attrs")); + assert!(result.contains("Already has space")); +} + +#[test] +fn test_div_whitespace_in_code_blocks_untouched() { + let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id() + 1)); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + // Content with div fence patterns in code blocks should not be modified + let input_content = r#"# Test file + +Here's an example in a code block: + +``` +:::{.class} +This is in a code block +::: +``` + +This one should be fixed: + +:::{.real-div} +Real div content +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + + let converter = DivWhitespaceConverter::new().unwrap(); + converter + .process_file(&test_file, true, false, false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // The one in the code block should remain unchanged (parser won't report it as an error) + // The real div should be fixed + assert!( + result.contains("::: {.real-div}"), + "Should fix real div fence" + ); + + // Code block content should be preserved exactly + assert!( + result.contains("```\n:::{.class}\nThis is in a code block\n:::\n```"), + "Code block should be unchanged" + ); + + // Clean up + std::fs::remove_dir_all(&temp_dir).ok(); +} + +#[test] +fn test_no_changes_when_all_correct() { + 
let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id() + 2)); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + let input_content = r#"# Test file + +::: {.class} +Content +::: + +::: {} +Content +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + let original = fs::read_to_string(&test_file).unwrap(); + + let converter = DivWhitespaceConverter::new().unwrap(); + converter + .process_file(&test_file, true, false, false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // Content should be identical + assert_eq!( + original, result, + "Should not modify already-correct content" + ); + + // Clean up + std::fs::remove_dir_all(&temp_dir).ok(); +} From be02b2741f8dc5c7deaf4b6f68d2d5932d015fe8 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 11:40:14 -0500 Subject: [PATCH 07/11] restructure syntax helping tool --- .../src/conversions/definition_lists.rs | 112 +++++++++ .../src/conversions/div_whitespace.rs | 83 +++++++ .../src/conversions/grid_tables.rs | 112 +++++++++ crates/qmd-syntax-helper/src/lib.rs | 1 + crates/qmd-syntax-helper/src/main.rs | 216 +++++++++--------- crates/qmd-syntax-helper/src/main_old.rs | 201 ++++++++++++++++ crates/qmd-syntax-helper/src/rule.rs | 99 ++++++++ 7 files changed, 718 insertions(+), 106 deletions(-) create mode 100644 crates/qmd-syntax-helper/src/main_old.rs create mode 100644 crates/qmd-syntax-helper/src/rule.rs diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs index aa48aeb..0f3fe4c 100644 --- a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -4,6 +4,7 @@ use regex::Regex; use std::path::Path; use std::process::{Command, Stdio}; +use crate::rule::{CheckResult, ConvertResult, Rule}; use crate::utils::file_io::{read_file, write_file}; 
use crate::utils::resources::ResourceManager; use quarto_markdown_pandoc::readers::json; @@ -267,3 +268,114 @@ impl DefinitionListConverter { Ok(()) } } + +impl Rule for DefinitionListConverter { + fn name(&self) -> &str { + "definition-lists" + } + + fn description(&self) -> &str { + "Convert definition lists to div-based format" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result { + let content = read_file(file_path)?; + let lists = self.find_definition_lists(&content); + + if verbose { + if lists.is_empty() { + println!(" No definition lists found"); + } else { + println!(" Found {} definition list(s)", lists.len()); + } + } + + Ok(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: !lists.is_empty(), + issue_count: lists.len(), + message: if lists.is_empty() { + None + } else { + Some(format!("Found {} definition list(s)", lists.len())) + }, + }) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result { + let content = read_file(file_path)?; + let lists = self.find_definition_lists(&content); + + if lists.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + // Convert each list and build new content + let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; + + for (idx, list) in lists.iter().enumerate() { + if verbose { + println!(" Converting list {}...", idx + 1); + } + + let converted = self.convert_list(&list.text)?; + let start = (list.start_line as isize + offset) as usize; + let end = (list.end_line as isize + offset) as usize; + + if check_mode && verbose { + println!( + " List {} at lines {}-{}:", + idx + 1, + list.start_line, + list.end_line + ); + println!( + " {} {} lines -> {} {} lines", + "Original:".red(), + list.end_line - 
list.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + lines.splice(start..=end, converted_lines); + offset += new_len as isize - old_len as isize; + } + + let new_content = lines.join("\n") + "\n"; + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: lists.len(), + message: if in_place { + Some(format!("Converted {} list(s)", lists.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs index 6b45263..c79b249 100644 --- a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs +++ b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs @@ -4,6 +4,7 @@ use serde::{Deserialize, Serialize}; use std::fs; use std::path::Path; +use crate::rule::{CheckResult, ConvertResult, Rule}; use crate::utils::file_io::{read_file, write_file}; #[derive(Debug, Serialize, Deserialize)] @@ -208,3 +209,85 @@ impl DivWhitespaceConverter { Ok(()) } } + +impl Rule for DivWhitespaceConverter { + fn name(&self) -> &str { + "div-whitespace" + } + + fn description(&self) -> &str { + "Fix div fences missing whitespace (:::{ -> ::: {)" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result { + let content = read_file(file_path)?; + let errors = self.get_parse_errors(file_path)?; + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if verbose { + if fix_positions.is_empty() { + println!(" No div whitespace issues found"); + } else { + println!( + " Found {} div fence(s) needing whitespace fixes", + fix_positions.len() + ); + } + } + + Ok(CheckResult { + rule_name: 
self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: !fix_positions.is_empty(), + issue_count: fix_positions.len(), + message: if fix_positions.is_empty() { + None + } else { + Some(format!( + "Found {} div fence(s) needing whitespace fixes", + fix_positions.len() + )) + }, + }) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result { + let content = read_file(file_path)?; + let errors = self.get_parse_errors(file_path)?; + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if fix_positions.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + let new_content = self.apply_fixes(&content, &fix_positions); + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: fix_positions.len(), + message: if in_place { + Some(format!("Fixed {} div fence(s)", fix_positions.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs index 1f303cf..5949c6f 100644 --- a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -4,6 +4,7 @@ use regex::Regex; use std::path::Path; use std::process::{Command, Stdio}; +use crate::rule::{CheckResult, ConvertResult, Rule}; use crate::utils::file_io::{read_file, write_file}; use crate::utils::resources::ResourceManager; use quarto_markdown_pandoc::readers::json; @@ -228,3 +229,114 @@ impl GridTableConverter { Ok(()) } } + +impl Rule for GridTableConverter { + fn name(&self) -> &str { + "grid-tables" + } + + fn description(&self) -> &str { + "Convert 
grid tables to list-table format" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result { + let content = read_file(file_path)?; + let tables = self.find_grid_tables(&content); + + if verbose { + if tables.is_empty() { + println!(" No grid tables found"); + } else { + println!(" Found {} grid table(s)", tables.len()); + } + } + + Ok(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: !tables.is_empty(), + issue_count: tables.len(), + message: if tables.is_empty() { + None + } else { + Some(format!("Found {} grid table(s)", tables.len())) + }, + }) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result { + let content = read_file(file_path)?; + let tables = self.find_grid_tables(&content); + + if tables.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + // Convert each table and build new content + let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; + + for (idx, table) in tables.iter().enumerate() { + if verbose { + println!(" Converting table {}...", idx + 1); + } + + let converted = self.convert_table(&table.text)?; + let start = (table.start_line as isize + offset) as usize; + let end = (table.end_line as isize + offset) as usize; + + if check_mode && verbose { + println!( + " Table {} at lines {}-{}:", + idx + 1, + table.start_line, + table.end_line + ); + println!( + " {} {} lines -> {} {} lines", + "Original:".red(), + table.end_line - table.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + lines.splice(start..=end, converted_lines); + offset += new_len as 
isize - old_len as isize; + } + + let new_content = lines.join("\n") + "\n"; + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: tables.len(), + message: if in_place { + Some(format!("Converted {} table(s)", tables.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/lib.rs b/crates/qmd-syntax-helper/src/lib.rs index b64b3a4..2a009f2 100644 --- a/crates/qmd-syntax-helper/src/lib.rs +++ b/crates/qmd-syntax-helper/src/lib.rs @@ -1,3 +1,4 @@ pub mod conversions; pub mod diagnostics; +pub mod rule; pub mod utils; diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs index 7db7b1b..de5c86a 100644 --- a/crates/qmd-syntax-helper/src/main.rs +++ b/crates/qmd-syntax-helper/src/main.rs @@ -1,15 +1,14 @@ use anyhow::Result; use clap::{Parser, Subcommand}; +use colored::Colorize; use std::path::PathBuf; mod conversions; mod diagnostics; +mod rule; mod utils; -use conversions::definition_lists::DefinitionListConverter; -use conversions::div_whitespace::DivWhitespaceConverter; -use conversions::grid_tables::GridTableConverter; -use diagnostics::syntax_check::SyntaxChecker; +use rule::{Rule, RuleRegistry}; use utils::glob_expand::expand_globs; #[derive(Parser)] @@ -23,50 +22,39 @@ struct Cli { #[derive(Subcommand)] enum Commands { - /// Convert grid tables to list-table format - UngridTables { + /// Check files for known problems + Check { /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") #[arg(required = true)] files: Vec, - /// Edit files in place - #[arg(short, long)] - in_place: bool, - - /// Check mode: show what would be changed without modifying files - #[arg(short, long)] - check: bool, + /// Rules to check (defaults to "all") + #[arg(short = 'r', long = "rule", default_values_t = 
vec!["all".to_string()])] + rule: Vec, /// Show verbose output #[arg(short, long)] verbose: bool, - }, - - /// Convert definition lists to div-based format - UndefLists { - /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") - #[arg(required = true)] - files: Vec, - - /// Edit files in place - #[arg(short, long)] - in_place: bool, - /// Check mode: show what would be changed without modifying files - #[arg(short, long)] - check: bool, + /// Output results as JSONL + #[arg(long)] + json: bool, - /// Show verbose output + /// Save detailed results to file #[arg(short, long)] - verbose: bool, + output: Option, }, - /// Fix div fences missing whitespace (:::{ -> ::: {) - FixDivWhitespace { + /// Convert/fix problems in files + Convert { /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") #[arg(required = true)] files: Vec, + /// Rules to apply (defaults to "all") + #[arg(short = 'r', long = "rule", default_values_t = vec!["all".to_string()])] + rule: Vec, + /// Edit files in place #[arg(short, long)] in_place: bool, @@ -80,121 +68,137 @@ enum Commands { verbose: bool, }, - /// Check syntax of files and report errors - Check { - /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") - #[arg(required = true)] - files: Vec, - - /// Show verbose output (each file as processed) - #[arg(short, long)] - verbose: bool, - - /// Output results as JSONL - #[arg(long)] - json: bool, - - /// Save detailed results to file - #[arg(short, long)] - output: Option, - }, + /// List all available rules + ListRules, } fn main() -> Result<()> { let cli = Cli::parse(); + let registry = RuleRegistry::new()?; match cli.command { - Commands::UngridTables { + Commands::Check { files, - in_place, - check, + rule: rule_names, verbose, + json, + output, } => { - let converter = GridTableConverter::new()?; let file_paths = expand_globs(&files)?; + let rules = resolve_rules(®istry, &rule_names)?; + + let mut all_results = 
Vec::new(); for file_path in file_paths { - if verbose { - println!("Processing: {}", file_path.display()); + if verbose && !json { + println!("Checking: {}", file_path.display()); } - converter.process_file(&file_path, in_place, check, verbose)?; + for rule in &rules { + match rule.check(&file_path, verbose && !json) { + Ok(result) => { + all_results.push(result.clone()); + if !json && result.has_issue { + println!(" {} {}", "✗".red(), result.message.unwrap_or_default()); + } + } + Err(e) => { + if !json { + eprintln!(" {} Error checking {}: {}", "✗".red(), rule.name(), e); + } + } + } + } } - Ok(()) - } - Commands::UndefLists { - files, - in_place, - check, - verbose, - } => { - let converter = DefinitionListConverter::new()?; - let file_paths = expand_globs(&files)?; - - for file_path in file_paths { - if verbose { - println!("Processing: {}", file_path.display()); + // Output handling + if json { + for result in &all_results { + println!("{}", serde_json::to_string(result)?); } + } - converter.process_file(&file_path, in_place, check, verbose)?; + if let Some(output_path) = output { + let mut output_str = String::new(); + for result in &all_results { + output_str.push_str(&serde_json::to_string(result)?); + output_str.push('\n'); + } + std::fs::write(output_path, output_str)?; } Ok(()) } - Commands::FixDivWhitespace { + + Commands::Convert { files, + rule: rule_names, in_place, - check, + check: check_mode, verbose, } => { - let converter = DivWhitespaceConverter::new()?; let file_paths = expand_globs(&files)?; + let rules = resolve_rules(®istry, &rule_names)?; for file_path in file_paths { if verbose { println!("Processing: {}", file_path.display()); } - converter.process_file(&file_path, in_place, check, verbose)?; + // Apply fixes sequentially, reparsing between each rule + for rule in &rules { + match rule.convert(&file_path, in_place, check_mode, verbose) { + Ok(result) => { + if result.fixes_applied > 0 { + if verbose || check_mode { + println!( + " {} {} 
- {}", + if check_mode { "Would fix" } else { "Fixed" }, + rule.name(), + result.message.clone().unwrap_or_default() + ); + } + + if !in_place && !check_mode && result.message.is_some() { + // Output to stdout if not in-place + print!("{}", result.message.unwrap()); + } + } + } + Err(e) => { + eprintln!(" {} Error converting {}: {}", "✗".red(), rule.name(), e); + // Stop on first error (transactional) + return Err(e); + } + } + } } Ok(()) } - Commands::Check { - files, - verbose, - json, - output, - } => { - let mut checker = SyntaxChecker::new(); - let file_paths = expand_globs(&files)?; - for file_path in file_paths { - checker.check_file(&file_path, verbose)?; - } - - // Print summary if not JSON mode - if !json { - checker.print_summary(); + Commands::ListRules => { + println!("{}", "Available rules:".bold()); + for name in registry.list_names() { + let rule = registry.get(&name)?; + println!(" {} - {}", name.cyan(), rule.description()); } - - // Save to output file if specified - if let Some(output_path) = output { - checker.export_jsonl(&output_path)?; - if !json { - println!("\nDetailed results written to: {}", output_path.display()); - } - } - - // Print JSON to stdout if requested - if json { - for result in &checker.results { - println!("{}", serde_json::to_string(result)?); - } - } - Ok(()) } } } + +fn resolve_rules( + registry: &RuleRegistry, + names: &[String], +) -> Result>> { + if names.len() == 1 && names[0] == "all" { + Ok(registry.all()) + } else { + let mut rules = Vec::new(); + for name in names { + rules.push(registry.get(name)?); + } + Ok(rules) + } +} diff --git a/crates/qmd-syntax-helper/src/main_old.rs b/crates/qmd-syntax-helper/src/main_old.rs new file mode 100644 index 0000000..b0b59f2 --- /dev/null +++ b/crates/qmd-syntax-helper/src/main_old.rs @@ -0,0 +1,201 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use std::path::PathBuf; + +mod conversions; +mod diagnostics; +mod problem; +mod utils; + +use 
conversions::definition_lists::DefinitionListConverter; +use conversions::div_whitespace::DivWhitespaceConverter; +use conversions::grid_tables::GridTableConverter; +use diagnostics::syntax_check::SyntaxChecker; +use utils::glob_expand::expand_globs; + +#[derive(Parser)] +#[command(name = "qmd-syntax-helper")] +#[command(about = "Helper tool for converting and fixing Quarto Markdown syntax")] +#[command(version)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Convert grid tables to list-table format + UngridTables { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Convert definition lists to div-based format + UndefLists { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Fix div fences missing whitespace (:::{ -> ::: {) + FixDivWhitespace { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Check syntax of files and report errors + Check { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = 
true)] + files: Vec, + + /// Show verbose output (each file as processed) + #[arg(short, long)] + verbose: bool, + + /// Output results as JSONL + #[arg(long)] + json: bool, + + /// Save detailed results to file + #[arg(short, long)] + output: Option, + }, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + match cli.command { + Commands::UngridTables { + files, + in_place, + check, + verbose, + } => { + let converter = GridTableConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::UndefLists { + files, + in_place, + check, + verbose, + } => { + let converter = DefinitionListConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::FixDivWhitespace { + files, + in_place, + check, + verbose, + } => { + let converter = DivWhitespaceConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::Check { + files, + verbose, + json, + output, + } => { + let mut checker = SyntaxChecker::new(); + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + checker.check_file(&file_path, verbose)?; + } + + // Print summary if not JSON mode + if !json { + checker.print_summary(); + } + + // Save to output file if specified + if let Some(output_path) = output { + checker.export_jsonl(&output_path)?; + if !json { + println!("\nDetailed results written to: {}", output_path.display()); + } + } + + // Print JSON to stdout if requested + if json { + for 
result in &checker.results { + println!("{}", serde_json::to_string(result)?); + } + } + + Ok(()) + } + } +} diff --git a/crates/qmd-syntax-helper/src/rule.rs b/crates/qmd-syntax-helper/src/rule.rs new file mode 100644 index 0000000..e3bcf6a --- /dev/null +++ b/crates/qmd-syntax-helper/src/rule.rs @@ -0,0 +1,99 @@ +use anyhow::{Result, anyhow}; +use std::collections::HashMap; +use std::path::Path; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +/// Result of checking a file for a specific rule +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CheckResult { + pub rule_name: String, + pub file_path: String, + pub has_issue: bool, + pub issue_count: usize, + pub message: Option, +} + +/// Result of converting/fixing a file +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConvertResult { + pub rule_name: String, + pub file_path: String, + pub fixes_applied: usize, + pub message: Option, +} + +/// A rule that can check for and fix issues in Quarto Markdown files +pub trait Rule { + /// The name of this rule (e.g., "grid-tables", "div-whitespace") + fn name(&self) -> &str; + + /// A short description of what this rule checks/fixes + fn description(&self) -> &str; + + /// Check if a file violates this rule + fn check(&self, file_path: &Path, verbose: bool) -> Result; + + /// Convert/fix rule violations in a file + /// If in_place is false, returns the converted content as a string in the message field + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result; +} + +/// Registry of all available rules +pub struct RuleRegistry { + rules: HashMap>, +} + +impl RuleRegistry { + /// Create a new registry and register all known rules + pub fn new() -> Result { + let mut registry = Self { + rules: HashMap::new(), + }; + + // Register all rules + registry.register(Arc::new( + crate::conversions::grid_tables::GridTableConverter::new()?, + )); + registry.register(Arc::new( + 
crate::conversions::div_whitespace::DivWhitespaceConverter::new()?, + )); + registry.register(Arc::new( + crate::conversions::definition_lists::DefinitionListConverter::new()?, + )); + + Ok(registry) + } + + /// Register a rule + fn register(&mut self, rule: Arc) { + self.rules.insert(rule.name().to_string(), rule); + } + + /// Get a rule by name, or return an error if not found + pub fn get(&self, name: &str) -> Result> { + self.rules + .get(name) + .cloned() + .ok_or_else(|| anyhow!("Unknown rule: {}", name)) + } + + /// Get all registered rules + pub fn all(&self) -> Vec> { + self.rules.values().cloned().collect() + } + + /// List all rule names + pub fn list_names(&self) -> Vec { + let mut names: Vec = self.rules.keys().cloned().collect(); + names.sort(); + names + } +} From 3bc9345e9b83d906dcde43cb1daacfbca2d8109a Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 11:45:39 -0500 Subject: [PATCH 08/11] clean up warnings --- .../src/conversions/definition_lists.rs | 86 ------------------- .../src/conversions/div_whitespace.rs | 3 +- .../src/conversions/grid_tables.rs | 86 ------------------- .../qmd-syntax-helper/src/diagnostics/mod.rs | 2 +- .../qmd-syntax-helper/src/utils/resources.rs | 3 + 5 files changed, 6 insertions(+), 174 deletions(-) diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs index 0f3fe4c..636cff7 100644 --- a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -181,92 +181,6 @@ impl DefinitionListConverter { Ok(result) } - - /// Process a single file - pub fn process_file( - &self, - file_path: &Path, - in_place: bool, - check: bool, - verbose: bool, - ) -> Result<()> { - let content = read_file(file_path)?; - let lists = self.find_definition_lists(&content); - - if lists.is_empty() { - if verbose { - println!(" No definition lists found"); - } - return 
Ok(()); - } - - if verbose || check { - println!( - " Found {} definition list(s)", - lists.len().to_string().yellow() - ); - } - - // Convert each list and build new content - let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); - let mut offset: isize = 0; // Track line offset as we modify - - for (idx, list) in lists.iter().enumerate() { - if verbose { - println!(" Converting list {}...", idx + 1); - } - - let converted = self.convert_list(&list.text)?; - - // Calculate actual line positions with offset - let start = (list.start_line as isize + offset) as usize; - let end = (list.end_line as isize + offset) as usize; - - if check { - println!( - " List {} at lines {}-{}:", - idx + 1, - list.start_line, - list.end_line - ); - println!( - " {} {} lines -> {} {} lines", - "Original:".red(), - list.end_line - list.start_line + 1, - "Converted:".green(), - converted.lines().count() - ); - } - - // Replace the list in the lines - let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); - let new_len = converted_lines.len(); - let old_len = end - start + 1; - - // Splice in the new lines - lines.splice(start..=end, converted_lines); - - // Update offset for next list - offset += new_len as isize - old_len as isize; - } - - if check { - println!(" {} No changes written (--check mode)", "✓".green()); - return Ok(()); - } - - let new_content = lines.join("\n") + "\n"; - - if in_place { - write_file(file_path, &new_content)?; - println!(" {} Converted {} list(s)", "✓".green(), lists.len()); - } else { - // Output to stdout - print!("{}", new_content); - } - - Ok(()) - } } impl Rule for DefinitionListConverter { diff --git a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs index c79b249..3f62354 100644 --- a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs +++ b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs @@ -150,6 +150,7 @@ impl 
DivWhitespaceConverter { } /// Process a single file + #[allow(dead_code)] pub fn process_file( &self, file_path: &Path, @@ -256,7 +257,7 @@ impl Rule for DivWhitespaceConverter { file_path: &Path, in_place: bool, check_mode: bool, - verbose: bool, + _verbose: bool, ) -> Result { let content = read_file(file_path)?; let errors = self.get_parse_errors(file_path)?; diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs index 5949c6f..ae74a1b 100644 --- a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -142,92 +142,6 @@ impl GridTableConverter { Ok(result) } - - /// Process a single file - pub fn process_file( - &self, - file_path: &Path, - in_place: bool, - check: bool, - verbose: bool, - ) -> Result<()> { - let content = read_file(file_path)?; - let tables = self.find_grid_tables(&content); - - if tables.is_empty() { - if verbose { - println!(" No grid tables found"); - } - return Ok(()); - } - - if verbose || check { - println!( - " Found {} grid table(s)", - tables.len().to_string().yellow() - ); - } - - // Convert each table and build new content - let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); - let mut offset: isize = 0; // Track line offset as we modify - - for (idx, table) in tables.iter().enumerate() { - if verbose { - println!(" Converting table {}...", idx + 1); - } - - let converted = self.convert_table(&table.text)?; - - // Calculate actual line positions with offset - let start = (table.start_line as isize + offset) as usize; - let end = (table.end_line as isize + offset) as usize; - - if check { - println!( - " Table {} at lines {}-{}:", - idx + 1, - table.start_line, - table.end_line - ); - println!( - " {} {} lines -> {} {} lines", - "Original:".red(), - table.end_line - table.start_line + 1, - "Converted:".green(), - converted.lines().count() - ); - } - - // Replace the table in the 
lines - let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); - let new_len = converted_lines.len(); - let old_len = end - start + 1; - - // Splice in the new lines - lines.splice(start..=end, converted_lines); - - // Update offset for next table - offset += new_len as isize - old_len as isize; - } - - if check { - println!(" {} No changes written (--check mode)", "✓".green()); - return Ok(()); - } - - let new_content = lines.join("\n") + "\n"; - - if in_place { - write_file(file_path, &new_content)?; - println!(" {} Converted {} table(s)", "✓".green(), tables.len()); - } else { - // Output to stdout - print!("{}", new_content); - } - - Ok(()) - } } impl Rule for GridTableConverter { diff --git a/crates/qmd-syntax-helper/src/diagnostics/mod.rs b/crates/qmd-syntax-helper/src/diagnostics/mod.rs index 370c564..6daff44 100644 --- a/crates/qmd-syntax-helper/src/diagnostics/mod.rs +++ b/crates/qmd-syntax-helper/src/diagnostics/mod.rs @@ -1 +1 @@ -pub mod syntax_check; +// pub mod syntax_check; // Unused - kept for reference only diff --git a/crates/qmd-syntax-helper/src/utils/resources.rs b/crates/qmd-syntax-helper/src/utils/resources.rs index f3d0286..81965e1 100644 --- a/crates/qmd-syntax-helper/src/utils/resources.rs +++ b/crates/qmd-syntax-helper/src/utils/resources.rs @@ -54,11 +54,13 @@ impl ResourceManager { } /// Get the temp directory path + #[allow(dead_code)] pub fn temp_dir(&self) -> &Path { &self.temp_dir } /// List all available resources + #[allow(dead_code)] pub fn list_resources(&self) -> Vec { let mut resources = Vec::new(); Self::collect_files(&RESOURCES_DIR, "", &mut resources); @@ -66,6 +68,7 @@ impl ResourceManager { } /// Recursively collect all file paths from a directory + #[allow(dead_code)] fn collect_files(dir: &Dir, prefix: &str, resources: &mut Vec) { for file in dir.files() { let name = file.path().file_name().unwrap().to_string_lossy(); From 8cccaea6e884c19c0e5fb1ec7efeaa61d2b30c3d Mon Sep 17 00:00:00 2001 From: 
Carlos Scheidegger Date: Fri, 17 Oct 2025 14:04:32 -0500 Subject: [PATCH 09/11] --json output --- .../src/conversions/definition_lists.rs | 29 +- .../src/conversions/div_whitespace.rs | 66 +++- .../src/conversions/grid_tables.rs | 29 +- .../qmd-syntax-helper/src/diagnostics/mod.rs | 1 + .../src/diagnostics/parse_check.rs | 82 +++++ .../src/diagnostics/syntax_check.rs | 15 +- crates/qmd-syntax-helper/src/main.rs | 85 ++++- crates/qmd-syntax-helper/src/rule.rs | 22 +- .../src/pandoc/treesitter.rs | 11 +- .../pandoc/treesitter_utils/postprocess.rs | 49 ++- .../quarto-markdown-pandoc/src/readers/qmd.rs | 26 +- .../src/utils/error_collector.rs | 341 ++++++++++++++++++ .../quarto-markdown-pandoc/src/utils/mod.rs | 1 + crates/quarto-markdown-pandoc/tests/test.rs | 23 +- .../tests/test_inline_locations.rs | 34 +- 15 files changed, 726 insertions(+), 88 deletions(-) create mode 100644 crates/qmd-syntax-helper/src/diagnostics/parse_check.rs create mode 100644 crates/quarto-markdown-pandoc/src/utils/error_collector.rs diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs index 636cff7..9124fd6 100644 --- a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -192,7 +192,7 @@ impl Rule for DefinitionListConverter { "Convert definition lists to div-based format" } - fn check(&self, file_path: &Path, verbose: bool) -> Result { + fn check(&self, file_path: &Path, verbose: bool) -> Result> { let content = read_file(file_path)?; let lists = self.find_definition_lists(&content); @@ -204,17 +204,22 @@ impl Rule for DefinitionListConverter { } } - Ok(CheckResult { - rule_name: self.name().to_string(), - file_path: file_path.to_string_lossy().to_string(), - has_issue: !lists.is_empty(), - issue_count: lists.len(), - message: if lists.is_empty() { - None - } else { - Some(format!("Found {} definition list(s)", lists.len())) - }, 
- }) + let mut results = Vec::new(); + for list in lists { + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Definition list found".to_string()), + location: Some(crate::rule::SourceLocation { + row: list.start_line + 1, // Convert 0-indexed to 1-indexed + column: 1, + }), + }); + } + + Ok(results) } fn convert( diff --git a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs index 3f62354..db18d91 100644 --- a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs +++ b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs @@ -64,10 +64,17 @@ impl DivWhitespaceConverter { } let json_str = error_messages.join(""); - let errors: Vec = - serde_json::from_str(&json_str).context("Failed to parse JSON error output")?; - Ok(errors) + // Try to parse as JSON array + match serde_json::from_str::>(&json_str) { + Ok(errors) => Ok(errors), + Err(_) => { + // If parsing fails, the messages are likely plain text warnings/debug messages + // rather than actual syntax errors. These don't indicate div whitespace issues, + // so we can safely ignore them for this specific rule. 
+ Ok(Vec::new()) + } + } } } } @@ -130,6 +137,29 @@ impl DivWhitespaceConverter { fix_positions } + /// Convert byte offset to row/column (1-indexed) + fn byte_offset_to_location(&self, content: &str, byte_offset: usize) -> crate::rule::SourceLocation { + let mut row = 1; + let mut column = 1; + let mut current_offset = 0; + + for ch in content.chars() { + if current_offset >= byte_offset { + break; + } + current_offset += ch.len_utf8(); + + if ch == '\n' { + row += 1; + column = 1; + } else { + column += 1; + } + } + + crate::rule::SourceLocation { row, column } + } + /// Apply fixes to content by inserting spaces at specified positions fn apply_fixes(&self, content: &str, fix_positions: &[usize]) -> String { let mut result = String::with_capacity(content.len() + fix_positions.len()); @@ -220,7 +250,7 @@ impl Rule for DivWhitespaceConverter { "Fix div fences missing whitespace (:::{ -> ::: {)" } - fn check(&self, file_path: &Path, verbose: bool) -> Result { + fn check(&self, file_path: &Path, verbose: bool) -> Result> { let content = read_file(file_path)?; let errors = self.get_parse_errors(file_path)?; let fix_positions = self.find_div_whitespace_errors(&content, &errors); @@ -236,20 +266,20 @@ impl Rule for DivWhitespaceConverter { } } - Ok(CheckResult { - rule_name: self.name().to_string(), - file_path: file_path.to_string_lossy().to_string(), - has_issue: !fix_positions.is_empty(), - issue_count: fix_positions.len(), - message: if fix_positions.is_empty() { - None - } else { - Some(format!( - "Found {} div fence(s) needing whitespace fixes", - fix_positions.len() - )) - }, - }) + let mut results = Vec::new(); + for &pos in &fix_positions { + let location = self.byte_offset_to_location(&content, pos); + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Div fence missing whitespace (:::{ should be ::: {)".to_string()), + location: 
Some(location), + }); + } + + Ok(results) } fn convert( diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs index ae74a1b..fc9fc25 100644 --- a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -153,7 +153,7 @@ impl Rule for GridTableConverter { "Convert grid tables to list-table format" } - fn check(&self, file_path: &Path, verbose: bool) -> Result { + fn check(&self, file_path: &Path, verbose: bool) -> Result> { let content = read_file(file_path)?; let tables = self.find_grid_tables(&content); @@ -165,17 +165,22 @@ impl Rule for GridTableConverter { } } - Ok(CheckResult { - rule_name: self.name().to_string(), - file_path: file_path.to_string_lossy().to_string(), - has_issue: !tables.is_empty(), - issue_count: tables.len(), - message: if tables.is_empty() { - None - } else { - Some(format!("Found {} grid table(s)", tables.len())) - }, - }) + let mut results = Vec::new(); + for table in tables { + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Grid table found".to_string()), + location: Some(crate::rule::SourceLocation { + row: table.start_line + 1, // Convert 0-indexed to 1-indexed + column: 1, + }), + }); + } + + Ok(results) } fn convert( diff --git a/crates/qmd-syntax-helper/src/diagnostics/mod.rs b/crates/qmd-syntax-helper/src/diagnostics/mod.rs index 6daff44..06d57d6 100644 --- a/crates/qmd-syntax-helper/src/diagnostics/mod.rs +++ b/crates/qmd-syntax-helper/src/diagnostics/mod.rs @@ -1 +1,2 @@ +pub mod parse_check; // pub mod syntax_check; // Unused - kept for reference only diff --git a/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs b/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs new file mode 100644 index 0000000..9dc25a8 --- /dev/null +++ 
b/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs @@ -0,0 +1,82 @@ +use anyhow::{Context, Result}; +use std::fs; +use std::path::Path; + +use crate::rule::{CheckResult, ConvertResult, Rule}; + +pub struct ParseChecker {} + +impl ParseChecker { + pub fn new() -> Result { + Ok(Self {}) + } + + /// Check if a file parses successfully + fn check_parse(&self, file_path: &Path) -> Result { + let content = fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, + &filename, + &mut sink, + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + ), + ); + + Ok(result.is_ok()) + } +} + +impl Rule for ParseChecker { + fn name(&self) -> &str { + "parse" + } + + fn description(&self) -> &str { + "Check if file parses successfully" + } + + fn check(&self, file_path: &Path, _verbose: bool) -> Result> { + let parses = self.check_parse(file_path)?; + + if parses { + Ok(vec![]) + } else { + Ok(vec![CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("File failed to parse".to_string()), + location: None, // Parse errors don't have specific locations + }]) + } + } + + fn convert( + &self, + file_path: &Path, + _in_place: bool, + _check_mode: bool, + _verbose: bool, + ) -> Result { + // Parse errors can't be auto-fixed + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: Some("Parse errors cannot be automatically fixed".to_string()), + }) + } +} diff --git 
a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs index 79f072d..08b4ffa 100644 --- a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs +++ b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs @@ -71,13 +71,14 @@ impl SyntaxChecker { false, // not loose mode &filename, &mut sink, - None::< - fn( - &[u8], - &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, - &str, - ) -> Vec, - >, // no custom error formatter + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + ), // Use JSON error formatter for machine-readable errors ); match result { diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs index de5c86a..21d6d24 100644 --- a/crates/qmd-syntax-helper/src/main.rs +++ b/crates/qmd-syntax-helper/src/main.rs @@ -96,10 +96,12 @@ fn main() -> Result<()> { for rule in &rules { match rule.check(&file_path, verbose && !json) { - Ok(result) => { - all_results.push(result.clone()); - if !json && result.has_issue { - println!(" {} {}", "✗".red(), result.message.unwrap_or_default()); + Ok(results) => { + for result in results { + all_results.push(result.clone()); + if !json && result.has_issue { + println!(" {} {}", "✗".red(), result.message.unwrap_or_default()); + } } } Err(e) => { @@ -111,6 +113,11 @@ fn main() -> Result<()> { } } + // Print summary if not in JSON mode + if !json && !all_results.is_empty() { + print_check_summary(&all_results); + } + // Output handling if json { for result in &all_results { @@ -202,3 +209,73 @@ fn resolve_rules( Ok(rules) } } + +fn print_check_summary(results: &[rule::CheckResult]) { + use std::collections::{HashMap, HashSet}; + + // Get unique files checked + let unique_files: HashSet<&str> = results.iter().map(|r| 
r.file_path.as_str()).collect(); + let total_files = unique_files.len(); + + // Count files with issues (at least one result with has_issue=true) + let mut files_with_issues = HashSet::new(); + let mut total_issues = 0; + + // Track issues by rule type + let mut issues_by_rule: HashMap = HashMap::new(); + let mut files_by_rule: HashMap> = HashMap::new(); + + for result in results { + if result.has_issue { + files_with_issues.insert(&result.file_path); + total_issues += result.issue_count; + + // Track by rule + *issues_by_rule.entry(result.rule_name.clone()).or_insert(0) += result.issue_count; + files_by_rule + .entry(result.rule_name.clone()) + .or_insert_with(HashSet::new) + .insert(result.file_path.clone()); + } + } + + let files_with_issues_count = files_with_issues.len(); + let files_clean = total_files - files_with_issues_count; + + println!("\n{}", "=== Summary ===".bold()); + println!("Total files: {}", total_files); + println!( + "Files with issues: {} {}", + files_with_issues_count, + if files_with_issues_count > 0 { + "✗".red() + } else { + "✓".green() + } + ); + println!("Clean files: {} {}", files_clean, "✓".green()); + + if !issues_by_rule.is_empty() { + println!("\n{}", "Issues by rule:".bold()); + let mut rule_names: Vec<_> = issues_by_rule.keys().collect(); + rule_names.sort(); + + for rule_name in rule_names { + let count = issues_by_rule[rule_name]; + let file_count = files_by_rule[rule_name].len(); + println!( + " {}: {} issue(s) in {} file(s)", + rule_name.cyan(), + count, + file_count + ); + } + } + + println!("\nTotal issues found: {}", total_issues); + + if total_files > 0 { + let success_rate = (files_clean as f64 / total_files as f64) * 100.0; + println!("Success rate: {:.1}%", success_rate); + } +} diff --git a/crates/qmd-syntax-helper/src/rule.rs b/crates/qmd-syntax-helper/src/rule.rs index e3bcf6a..e7e46d4 100644 --- a/crates/qmd-syntax-helper/src/rule.rs +++ b/crates/qmd-syntax-helper/src/rule.rs @@ -5,14 +5,24 @@ use std::sync::Arc; 
use serde::{Deserialize, Serialize}; +/// Location information for a violation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SourceLocation { + pub row: usize, + pub column: usize, +} + /// Result of checking a file for a specific rule +/// Each CheckResult represents a single violation #[derive(Debug, Clone, Serialize, Deserialize)] pub struct CheckResult { pub rule_name: String, pub file_path: String, pub has_issue: bool, - pub issue_count: usize, + pub issue_count: usize, // Kept for backwards compatibility, always 1 when has_issue=true pub message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub location: Option, } /// Result of converting/fixing a file @@ -33,7 +43,8 @@ pub trait Rule { fn description(&self) -> &str; /// Check if a file violates this rule - fn check(&self, file_path: &Path, verbose: bool) -> Result; + /// Returns a vector of CheckResults, one per violation found + fn check(&self, file_path: &Path, verbose: bool) -> Result>; /// Convert/fix rule violations in a file /// If in_place is false, returns the converted content as a string in the message field @@ -58,7 +69,12 @@ impl RuleRegistry { rules: HashMap::new(), }; - // Register all rules + // Register diagnostic rules first (parse check should run before conversion rules) + registry.register(Arc::new( + crate::diagnostics::parse_check::ParseChecker::new()?, + )); + + // Register conversion rules registry.register(Arc::new( crate::conversions::grid_tables::GridTableConverter::new()?, )); diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs index 12e4f94..020df52 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs @@ -732,11 +732,12 @@ fn native_visitor( result } -pub fn treesitter_to_pandoc( +pub fn treesitter_to_pandoc( buf: &mut T, tree: &tree_sitter_qmd::MarkdownTree, input_bytes: &[u8], context: &ASTContext, + 
error_collector: &mut E, ) -> Result> { let result = bottomup_traverse_concrete_tree( &mut tree.walk(), @@ -749,7 +750,13 @@ pub fn treesitter_to_pandoc( let (_, PandocNativeIntermediate::IntermediatePandoc(pandoc)) = result else { panic!("Expected Pandoc, got {:?}", result) }; - let result = postprocess(pandoc)?; + let result = match postprocess(pandoc, error_collector) { + Ok(doc) => doc, + Err(()) => { + // Postprocess found errors, return the error messages from the collector + return Err(error_collector.messages()); + } + }; let result = merge_strs(result); Ok(result) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs index 224f1e0..ce15fc6 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs @@ -14,6 +14,8 @@ use crate::pandoc::location::{Range, SourceInfo, empty_range, empty_source_info} use crate::pandoc::pandoc::Pandoc; use crate::pandoc::shortcode::shortcode_to_span; use crate::utils::autoid; +use crate::utils::error_collector::ErrorCollector; +use std::cell::RefCell; use std::collections::HashMap; /// Trim leading and trailing spaces from inlines @@ -260,9 +262,11 @@ fn transform_definition_list_div(div: Div) -> Block { } /// Apply post-processing transformations to the Pandoc AST -pub fn postprocess(doc: Pandoc) -> Result> { - let mut errors = Vec::new(); +pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> Result { let result = { + // Wrap error_collector in RefCell for interior mutability across multiple closures + let error_collector_ref = RefCell::new(error_collector); + // Track seen header IDs to avoid duplicates let mut seen_ids: HashMap = HashMap::new(); // Track citation count for numbering @@ -606,11 +610,14 @@ pub fn postprocess(doc: Pandoc) -> Result> { FilterResult(result, true) }) .with_attr(|attr| { - // TODO 
in order to do good error messages here, attr will need source mapping - errors.push(format!( - "Found attr in postprocess: {:?} - this should have been removed", - attr - )); + // TODO: Add source location when attr has it + error_collector_ref.borrow_mut().error( + format!( + "Found attr in postprocess: {:?} - this should have been removed", + attr + ), + None, + ); FilterResult(vec![], false) }) .with_blocks(|blocks| { @@ -632,12 +639,13 @@ pub fn postprocess(doc: Pandoc) -> Result> { }; // Don't add the CaptionBlock to the result (it's now attached) } else { - // TODO: Issue a warning/error when proper error infrastructure is ready - // For now, print a warning to stderr - eprintln!( - "Warning: Caption found without a preceding table at {}:{}", - caption_block.source_info.range.start.row + 1, - caption_block.source_info.range.start.column + 1 + // Issue a warning when caption has no preceding table + error_collector_ref.borrow_mut().warn( + "Caption found without a preceding table".to_string(), + Some(&crate::utils::error_collector::SourceInfo::new( + caption_block.source_info.range.start.row + 1, + caption_block.source_info.range.start.column + 1, + )), ); // Remove the caption from the output (don't add to result) } @@ -649,13 +657,16 @@ pub fn postprocess(doc: Pandoc) -> Result> { FilterResult(result, true) }); - topdown_traverse(doc, &mut filter) + let pandoc_result = topdown_traverse(doc, &mut filter); + + // Check if any errors were collected (before moving out of RefCell) + let has_errors = error_collector_ref.borrow().has_errors(); + + (pandoc_result, has_errors) }; - if !errors.is_empty() { - Err(errors) - } else { - Ok(result) - } + + // Return based on whether errors were found + if result.1 { Err(()) } else { Ok(result.0) } } /// Convert smart typography strings diff --git a/crates/quarto-markdown-pandoc/src/readers/qmd.rs b/crates/quarto-markdown-pandoc/src/readers/qmd.rs index 72b5b54..c799dad 100644 --- 
a/crates/quarto-markdown-pandoc/src/readers/qmd.rs +++ b/crates/quarto-markdown-pandoc/src/readers/qmd.rs @@ -16,6 +16,7 @@ use crate::pandoc::{self, Block, Meta}; use crate::pandoc::{MetaValue, rawblock_to_meta}; use crate::readers::qmd_error_messages::{produce_error_message, produce_error_message_json}; use crate::traversals; +use crate::utils::error_collector::{JsonErrorCollector, TextErrorCollector}; use std::io::Write; use tree_sitter::LogType; use tree_sitter_qmd::MarkdownParser; @@ -137,8 +138,29 @@ where } let context = ASTContext::with_filename(filename.to_string()); - let mut result = - pandoc::treesitter_to_pandoc(&mut output_stream, &tree, &input_bytes, &context)?; + + // Create appropriate error collector based on whether JSON errors are requested + let mut result = if error_formatter.is_some() { + // JSON error format requested + let mut error_collector = JsonErrorCollector::new(); + pandoc::treesitter_to_pandoc( + &mut output_stream, + &tree, + &input_bytes, + &context, + &mut error_collector, + )? + } else { + // Text error format (default) + let mut error_collector = TextErrorCollector::new(); + pandoc::treesitter_to_pandoc( + &mut output_stream, + &tree, + &input_bytes, + &context, + &mut error_collector, + )? 
+ }; let mut meta_from_parses = Meta::default(); result = { diff --git a/crates/quarto-markdown-pandoc/src/utils/error_collector.rs b/crates/quarto-markdown-pandoc/src/utils/error_collector.rs new file mode 100644 index 0000000..be7476b --- /dev/null +++ b/crates/quarto-markdown-pandoc/src/utils/error_collector.rs @@ -0,0 +1,341 @@ +/// Source location information for errors +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceInfo { + pub row: usize, + pub column: usize, +} + +impl SourceInfo { + pub fn new(row: usize, column: usize) -> Self { + Self { row, column } + } +} + +/// Trait for collecting errors and warnings during parsing/processing +pub trait ErrorCollector { + /// Add a warning message (non-fatal) + fn warn(&mut self, message: String, location: Option<&SourceInfo>); + + /// Add an error message (fatal) + fn error(&mut self, message: String, location: Option<&SourceInfo>); + + /// Check if any errors were collected + fn has_errors(&self) -> bool; + + /// Get a copy of collected messages (without consuming the collector) + fn messages(&self) -> Vec; + + /// Convert collected errors into final format (consumes the collector) + fn into_messages(self) -> Vec; +} + +/// Text-based error collector that produces human-readable messages +#[derive(Debug, Default)] +pub struct TextErrorCollector { + messages: Vec, + has_errors: bool, +} + +impl TextErrorCollector { + pub fn new() -> Self { + Self { + messages: Vec::new(), + has_errors: false, + } + } +} + +impl ErrorCollector for TextErrorCollector { + fn warn(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = if let Some(loc) = location { + format!("Warning: {} at {}:{}", message, loc.row, loc.column) + } else { + format!("Warning: {}", message) + }; + self.messages.push(formatted); + } + + fn error(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = if let Some(loc) = location { + format!("Error: {} at {}:{}", message, loc.row, loc.column) + } else 
{ + format!("Error: {}", message) + }; + self.messages.push(formatted); + self.has_errors = true; + } + + fn has_errors(&self) -> bool { + self.has_errors + } + + fn messages(&self) -> Vec { + self.messages.clone() + } + + fn into_messages(self) -> Vec { + self.messages + } +} + +/// JSON-based error collector that produces structured JSON messages +#[derive(Debug, Default)] +pub struct JsonErrorCollector { + messages: Vec, + has_errors: bool, +} + +impl JsonErrorCollector { + pub fn new() -> Self { + Self { + messages: Vec::new(), + has_errors: false, + } + } + + fn format_json_message(title: &str, message: String, location: Option<&SourceInfo>) -> String { + use serde_json::json; + + let json_obj = if let Some(loc) = location { + json!({ + "title": title, + "message": message, + "location": { + "row": loc.row, + "column": loc.column + } + }) + } else { + json!({ + "title": title, + "message": message + }) + }; + + json_obj.to_string() + } +} + +impl ErrorCollector for JsonErrorCollector { + fn warn(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = Self::format_json_message("Warning", message, location); + self.messages.push(formatted); + } + + fn error(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = Self::format_json_message("Error", message, location); + self.messages.push(formatted); + self.has_errors = true; + } + + fn has_errors(&self) -> bool { + self.has_errors + } + + fn messages(&self) -> Vec { + self.messages.clone() + } + + fn into_messages(self) -> Vec { + self.messages + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_text_collector_warning_without_location() { + let mut collector = TextErrorCollector::new(); + collector.warn("This is a warning".to_string(), None); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Warning: This is a warning"); + } + + #[test] + fn 
test_text_collector_warning_with_location() { + let mut collector = TextErrorCollector::new(); + let location = SourceInfo::new(35, 1); + collector.warn( + "Caption found without a preceding table".to_string(), + Some(&location), + ); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!( + messages[0], + "Warning: Caption found without a preceding table at 35:1" + ); + } + + #[test] + fn test_text_collector_error_without_location() { + let mut collector = TextErrorCollector::new(); + collector.error("This is an error".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Error: This is an error"); + } + + #[test] + fn test_text_collector_error_with_location() { + let mut collector = TextErrorCollector::new(); + let location = SourceInfo::new(42, 10); + collector.error("Found attr in postprocess".to_string(), Some(&location)); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Error: Found attr in postprocess at 42:10"); + } + + #[test] + fn test_text_collector_multiple_messages() { + let mut collector = TextErrorCollector::new(); + let loc1 = SourceInfo::new(10, 5); + let loc2 = SourceInfo::new(20, 15); + + collector.warn("First warning".to_string(), Some(&loc1)); + collector.error("First error".to_string(), Some(&loc2)); + collector.warn("Second warning".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 3); + assert_eq!(messages[0], "Warning: First warning at 10:5"); + assert_eq!(messages[1], "Error: First error at 20:15"); + assert_eq!(messages[2], "Warning: Second warning"); + } + + #[test] + fn test_json_collector_warning_without_location() { + let mut collector = JsonErrorCollector::new(); + collector.warn("This is 
a warning".to_string(), None); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Warning"); + assert_eq!(parsed["message"], "This is a warning"); + assert!(parsed.get("location").is_none()); + } + + #[test] + fn test_json_collector_warning_with_location() { + let mut collector = JsonErrorCollector::new(); + let location = SourceInfo::new(35, 1); + collector.warn( + "Caption found without a preceding table".to_string(), + Some(&location), + ); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Warning"); + assert_eq!(parsed["message"], "Caption found without a preceding table"); + assert_eq!(parsed["location"]["row"], 35); + assert_eq!(parsed["location"]["column"], 1); + } + + #[test] + fn test_json_collector_error_without_location() { + let mut collector = JsonErrorCollector::new(); + collector.error("This is an error".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Error"); + assert_eq!(parsed["message"], "This is an error"); + assert!(parsed.get("location").is_none()); + } + + #[test] + fn test_json_collector_error_with_location() { + let mut collector = JsonErrorCollector::new(); + let location = SourceInfo::new(42, 10); + collector.error("Found attr in postprocess".to_string(), Some(&location)); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + 
// Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Error"); + assert_eq!(parsed["message"], "Found attr in postprocess"); + assert_eq!(parsed["location"]["row"], 42); + assert_eq!(parsed["location"]["column"], 10); + } + + #[test] + fn test_json_collector_multiple_messages() { + let mut collector = JsonErrorCollector::new(); + let loc1 = SourceInfo::new(10, 5); + let loc2 = SourceInfo::new(20, 15); + + collector.warn("First warning".to_string(), Some(&loc1)); + collector.error("First error".to_string(), Some(&loc2)); + collector.warn("Second warning".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 3); + + // Verify each message is valid JSON + let parsed1: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed1["title"], "Warning"); + assert_eq!(parsed1["message"], "First warning"); + assert_eq!(parsed1["location"]["row"], 10); + + let parsed2: serde_json::Value = serde_json::from_str(&messages[1]).unwrap(); + assert_eq!(parsed2["title"], "Error"); + assert_eq!(parsed2["message"], "First error"); + assert_eq!(parsed2["location"]["row"], 20); + + let parsed3: serde_json::Value = serde_json::from_str(&messages[2]).unwrap(); + assert_eq!(parsed3["title"], "Warning"); + assert_eq!(parsed3["message"], "Second warning"); + assert!(parsed3.get("location").is_none()); + } + + #[test] + fn test_empty_collector_has_no_errors() { + let collector = TextErrorCollector::new(); + assert!(!collector.has_errors()); + + let collector = JsonErrorCollector::new(); + assert!(!collector.has_errors()); + } + + #[test] + fn test_collector_with_only_warnings_has_no_errors() { + let mut collector = TextErrorCollector::new(); + collector.warn("Warning 1".to_string(), None); + collector.warn("Warning 2".to_string(), None); + assert!(!collector.has_errors()); + + let mut collector = 
JsonErrorCollector::new(); + collector.warn("Warning 1".to_string(), None); + collector.warn("Warning 2".to_string(), None); + assert!(!collector.has_errors()); + } +} diff --git a/crates/quarto-markdown-pandoc/src/utils/mod.rs b/crates/quarto-markdown-pandoc/src/utils/mod.rs index fd42ced..9014e30 100644 --- a/crates/quarto-markdown-pandoc/src/utils/mod.rs +++ b/crates/quarto-markdown-pandoc/src/utils/mod.rs @@ -5,6 +5,7 @@ pub mod autoid; pub mod concrete_tree_depth; +pub mod error_collector; pub mod output; pub mod string_write_adapter; pub mod text; diff --git a/crates/quarto-markdown-pandoc/tests/test.rs b/crates/quarto-markdown-pandoc/tests/test.rs index cc3f998..2a489d0 100644 --- a/crates/quarto-markdown-pandoc/tests/test.rs +++ b/crates/quarto-markdown-pandoc/tests/test.rs @@ -6,6 +6,7 @@ use glob::glob; use quarto_markdown_pandoc::errors::parse_is_good; use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; +use quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; use quarto_markdown_pandoc::utils::output::VerboseOutput; use quarto_markdown_pandoc::{readers, writers}; use std::io::{self, Write}; @@ -22,12 +23,14 @@ fn unit_test_simple_qmd_parses() { .parse(input_bytes, None) .expect("Failed to parse input"); let mut buf = Vec::new(); + let mut error_collector = TextErrorCollector::new(); writers::native::write( &treesitter_to_pandoc( &mut std::io::sink(), &tree, &input_bytes, &ASTContext::anonymous(), + &mut error_collector, ) .unwrap(), &mut buf, @@ -126,6 +129,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { } let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); + let mut error_collector1 = TextErrorCollector::new(); writers::native::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -134,6 +138,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(), input.as_bytes(), &ASTContext::anonymous(), + &mut error_collector1, ) .unwrap(), &mut buf1, @@ -141,6 +146,7 @@ fn 
matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(); let native_output = String::from_utf8(buf1).expect("Invalid UTF-8 in output"); let context_for_json = ASTContext::anonymous(); + let mut error_collector2 = TextErrorCollector::new(); writers::json::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -149,6 +155,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(), input.as_bytes(), &context_for_json, + &mut error_collector2, ) .unwrap(), &context_for_json, @@ -353,9 +360,15 @@ fn test_json_writer() { .parse(input_bytes, None) .expect("Failed to parse input"); let test_context = ASTContext::anonymous(); - let pandoc = - treesitter_to_pandoc(&mut std::io::sink(), &tree, input_bytes, &test_context) - .unwrap(); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + input_bytes, + &test_context, + &mut error_collector, + ) + .unwrap(); let mut buf = Vec::new(); writers::json::write(&pandoc, &test_context, &mut buf).unwrap(); let our_json = String::from_utf8(buf).expect("Invalid UTF-8 in our JSON output"); @@ -435,11 +448,13 @@ fn test_html_writer() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); + let mut error_collector = TextErrorCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, input_bytes, &ASTContext::anonymous(), + &mut error_collector, ) .unwrap(); let mut buf = Vec::new(); @@ -541,11 +556,13 @@ fn test_do_not_smoke() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); + let mut error_collector = TextErrorCollector::new(); let _ = treesitter_to_pandoc( &mut std::io::sink(), &tree, input_bytes, &ASTContext::anonymous(), + &mut error_collector, ); file_count += 1; } diff --git a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs index a6b7671..6d5a08a 100644 --- 
a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs +++ b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs @@ -4,6 +4,7 @@ */ use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; +use quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; use quarto_markdown_pandoc::writers; use tree_sitter_qmd::MarkdownParser; @@ -17,8 +18,15 @@ fn test_inline_source_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); @@ -94,8 +102,15 @@ fn test_merged_strings_preserve_location() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); @@ -149,8 +164,15 @@ fn test_separate_strings_keep_separate_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + 
.expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); From cdcf7fefddf2ddd13cfa0cb3fe27029bafe96f6f Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 16:04:32 -0500 Subject: [PATCH 10/11] desugar and table attributes --- .../pandoc/treesitter_utils/postprocess.rs | 74 ++++++- .../tests/snapshots/json/math-with-attr.qmd | 9 + .../json/math-with-attr.qmd.snapshot | 1 + .../snapshots/json/table-caption-attr.qmd | 5 + .../json/table-caption-attr.qmd.snapshot | 1 + docs/syntax/desugaring/index.qmd | 31 +++ docs/syntax/desugaring/math-attributes.qmd | 193 ++++++++++++++++++ docs/syntax/index.qmd | 4 + 8 files changed, 315 insertions(+), 3 deletions(-) create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd create mode 100644 crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot create mode 100644 docs/syntax/desugaring/index.qmd create mode 100644 docs/syntax/desugaring/math-attributes.qmd diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs index ce15fc6..41db36e 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs @@ -450,6 +450,45 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R ) }) .with_inlines(|inlines| { + // Combined filter: Handle Math + Attr pattern, then citation suffix pattern + + // Step 1: Handle Math nodes followed by Attr + // Pattern: Math, Space (optional), Attr -> Span with "quarto-math-with-attribute" class + let mut math_processed = vec![]; + let mut i = 0; + + while i < 
inlines.len() { + if let Inline::Math(math) = &inlines[i] { + // Check if followed by Space then Attr, or just Attr + let has_space = i + 1 < inlines.len() && matches!(inlines[i + 1], Inline::Space(_)); + let attr_idx = if has_space { i + 2 } else { i + 1 }; + + if attr_idx < inlines.len() { + if let Inline::Attr(attr) = &inlines[attr_idx] { + // Found Math + (Space?) + Attr pattern + // Wrap Math in a Span with the attribute + let mut classes = vec!["quarto-math-with-attribute".to_string()]; + classes.extend(attr.1.clone()); + + math_processed.push(Inline::Span(Span { + attr: (attr.0.clone(), classes, attr.2.clone()), + content: vec![Inline::Math(math.clone())], + source_info: empty_source_info(), + })); + + // Skip the Math, optional Space, and Attr + i = attr_idx + 1; + continue; + } + } + } + + // Not a Math + Attr pattern, add as is + math_processed.push(inlines[i].clone()); + i += 1; + } + + // Step 2: Handle citation suffix pattern on the math-processed result let mut result = vec![]; // states in this state machine: // 0. 
normal state, where we just collect inlines @@ -461,7 +500,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R let mut state = 0; let mut pending_cite: Option = None; - for inline in inlines { + for inline in math_processed { match state { 0 => { // Normal state - check if we see a valid cite @@ -629,11 +668,40 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R if let Block::CaptionBlock(caption_block) = block { // Look for a preceding Table if let Some(Block::Table(table)) = result.last_mut() { - // Attach caption to the table + // Extract any trailing Inline::Attr from caption content + let mut caption_content = caption_block.content.clone(); + let mut caption_attr: Option = None; + + if let Some(Inline::Attr(attr)) = caption_content.last() { + caption_attr = Some(attr.clone()); + caption_content.pop(); // Remove the Attr from caption content + } + + // If we found attributes in the caption, merge them with the table's attr + if let Some(caption_attr_value) = caption_attr { + // Merge: caption attributes override table attributes + // table.attr is (id, classes, key_values) + // Merge key-value pairs from caption into table + for (key, value) in caption_attr_value.2 { + table.attr.2.insert(key, value); + } + // Merge classes from caption into table + for class in caption_attr_value.1 { + if !table.attr.1.contains(&class) { + table.attr.1.push(class); + } + } + // Use caption id if table doesn't have one + if table.attr.0.is_empty() && !caption_attr_value.0.is_empty() { + table.attr.0 = caption_attr_value.0; + } + } + + // Attach caption to the table (with Attr removed from content) table.caption = Caption { short: None, long: Some(vec![Block::Plain(Plain { - content: caption_block.content.clone(), + content: caption_content, source_info: caption_block.source_info.clone(), })]), }; diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd 
new file mode 100644 index 0000000..7864f94 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd @@ -0,0 +1,9 @@ +Inline math with attribute: $E = mc^2$ {#eq-einstein} + +Display math with attribute: + +$$ +\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2} +$$ {#eq-gaussian} + +Another inline example: $a^2 + b^2 = c^2$ {#eq-pythagorean} diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot new file mode 100644 index 0000000..5c8df94 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot @@ -0,0 +1 @@ +{"astContext":{"filenames":["tests/snapshots/json/math-with-attr.qmd"]},"blocks":[{"c":[{"c":"Inline","l":{"end":{"column":6,"offset":6,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"},{"l":{"end":{"column":7,"offset":7,"row":0},"filenameIndex":0,"start":{"column":6,"offset":6,"row":0}},"t":"Space"},{"c":"math","l":{"end":{"column":11,"offset":11,"row":0},"filenameIndex":0,"start":{"column":7,"offset":7,"row":0}},"t":"Str"},{"l":{"end":{"column":12,"offset":12,"row":0},"filenameIndex":0,"start":{"column":11,"offset":11,"row":0}},"t":"Space"},{"c":"with","l":{"end":{"column":16,"offset":16,"row":0},"filenameIndex":0,"start":{"column":12,"offset":12,"row":0}},"t":"Str"},{"l":{"end":{"column":17,"offset":17,"row":0},"filenameIndex":0,"start":{"column":16,"offset":16,"row":0}},"t":"Space"},{"c":"attribute:","l":{"end":{"column":27,"offset":27,"row":0},"filenameIndex":0,"start":{"column":17,"offset":17,"row":0}},"t":"Str"},{"l":{"end":{"column":28,"offset":28,"row":0},"filenameIndex":0,"start":{"column":27,"offset":27,"row":0}},"t":"Space"},{"c":[["eq-einstein",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"E = 
mc^2"],"l":{"end":{"column":38,"offset":38,"row":0},"filenameIndex":0,"start":{"column":28,"offset":28,"row":0}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":54,"row":1},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Para"},{"c":[{"c":"Display","l":{"end":{"column":7,"offset":62,"row":2},"filenameIndex":0,"start":{"column":0,"offset":55,"row":2}},"t":"Str"},{"l":{"end":{"column":8,"offset":63,"row":2},"filenameIndex":0,"start":{"column":7,"offset":62,"row":2}},"t":"Space"},{"c":"math","l":{"end":{"column":12,"offset":67,"row":2},"filenameIndex":0,"start":{"column":8,"offset":63,"row":2}},"t":"Str"},{"l":{"end":{"column":13,"offset":68,"row":2},"filenameIndex":0,"start":{"column":12,"offset":67,"row":2}},"t":"Space"},{"c":"with","l":{"end":{"column":17,"offset":72,"row":2},"filenameIndex":0,"start":{"column":13,"offset":68,"row":2}},"t":"Str"},{"l":{"end":{"column":18,"offset":73,"row":2},"filenameIndex":0,"start":{"column":17,"offset":72,"row":2}},"t":"Space"},{"c":"attribute:","l":{"end":{"column":28,"offset":83,"row":2},"filenameIndex":0,"start":{"column":18,"offset":73,"row":2}},"t":"Str"}],"l":{"end":{"column":0,"offset":84,"row":3},"filenameIndex":0,"start":{"column":0,"offset":55,"row":2}},"t":"Para"},{"c":[{"c":[["eq-gaussian",["quarto-math-with-attribute"],[]],[{"c":[{"t":"DisplayMath"},"\n\\int_0^\\infty e^{-x^2} dx = 
\\frac{\\sqrt{\\pi}}{2}\n"],"l":{"end":{"column":2,"offset":139,"row":6},"filenameIndex":0,"start":{"column":0,"offset":85,"row":4}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":155,"row":7},"filenameIndex":0,"start":{"column":0,"offset":85,"row":4}},"t":"Para"},{"c":[{"c":"Another","l":{"end":{"column":7,"offset":163,"row":8},"filenameIndex":0,"start":{"column":0,"offset":156,"row":8}},"t":"Str"},{"l":{"end":{"column":8,"offset":164,"row":8},"filenameIndex":0,"start":{"column":7,"offset":163,"row":8}},"t":"Space"},{"c":"inline","l":{"end":{"column":14,"offset":170,"row":8},"filenameIndex":0,"start":{"column":8,"offset":164,"row":8}},"t":"Str"},{"l":{"end":{"column":15,"offset":171,"row":8},"filenameIndex":0,"start":{"column":14,"offset":170,"row":8}},"t":"Space"},{"c":"example:","l":{"end":{"column":23,"offset":179,"row":8},"filenameIndex":0,"start":{"column":15,"offset":171,"row":8}},"t":"Str"},{"l":{"end":{"column":24,"offset":180,"row":8},"filenameIndex":0,"start":{"column":23,"offset":179,"row":8}},"t":"Space"},{"c":[["eq-pythagorean",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"a^2 + b^2 = c^2"],"l":{"end":{"column":41,"offset":197,"row":8},"filenameIndex":0,"start":{"column":24,"offset":180,"row":8}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":216,"row":9},"filenameIndex":0,"start":{"column":0,"offset":156,"row":8}},"t":"Para"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd new file mode 100644 index 0000000..7917b49 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd @@ -0,0 +1,5 @@ 
+| Column 1 | Column 2 | +|----------|----------| +| Data 1 | Data 2 | + +: Table caption {tbl-colwidths="[30,70]"} diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot new file mode 100644 index 0000000..d451688 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot @@ -0,0 +1 @@ +{"astContext":{"filenames":["tests/snapshots/json/table-caption-attr.qmd"]},"blocks":[{"c":[["",[],[["tbl-colwidths","[30,70]"]]],[null,[{"c":[{"c":"Table","l":{"end":{"column":7,"offset":80,"row":4},"filenameIndex":0,"start":{"column":2,"offset":75,"row":4}},"t":"Str"},{"l":{"end":{"column":8,"offset":81,"row":4},"filenameIndex":0,"start":{"column":7,"offset":80,"row":4}},"t":"Space"},{"c":"caption","l":{"end":{"column":15,"offset":88,"row":4},"filenameIndex":0,"start":{"column":8,"offset":81,"row":4}},"t":"Str"},{"l":{"end":{"column":16,"offset":89,"row":4},"filenameIndex":0,"start":{"column":15,"offset":88,"row":4}},"t":"Space"}],"l":{"end":{"column":0,"offset":115,"row":5},"filenameIndex":0,"start":{"column":0,"offset":72,"row":3}},"t":"Plain"}]],[[{"t":"AlignDefault"},{"t":"ColWidthDefault"}],[{"t":"AlignDefault"},{"t":"ColWidthDefault"}]],[["",[],[]],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","l":{"end":{"column":8,"offset":8,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Str"},{"l":{"end":{"column":9,"offset":9,"row":0},"filenameIndex":0,"start":{"column":8,"offset":8,"row":0}},"t":"Space"},{"c":"1","l":{"end":{"column":10,"offset":10,"row":0},"filenameIndex":0,"start":{"column":9,"offset":9,"row":0}},"t":"Str"}],"l":{"end":{"column":11,"offset":11,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","l":{"end":{"column":19,"offset":19,"row":0},"filenameIndex":0,"start
":{"column":13,"offset":13,"row":0}},"t":"Str"},{"l":{"end":{"column":20,"offset":20,"row":0},"filenameIndex":0,"start":{"column":19,"offset":19,"row":0}},"t":"Space"},{"c":"2","l":{"end":{"column":21,"offset":21,"row":0},"filenameIndex":0,"start":{"column":20,"offset":20,"row":0}},"t":"Str"}],"l":{"end":{"column":22,"offset":22,"row":0},"filenameIndex":0,"start":{"column":13,"offset":13,"row":0}},"t":"Plain"}]]]]]],[[["",[],[]],0,[],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","l":{"end":{"column":6,"offset":54,"row":2},"filenameIndex":0,"start":{"column":2,"offset":50,"row":2}},"t":"Str"},{"l":{"end":{"column":7,"offset":55,"row":2},"filenameIndex":0,"start":{"column":6,"offset":54,"row":2}},"t":"Space"},{"c":"1","l":{"end":{"column":8,"offset":56,"row":2},"filenameIndex":0,"start":{"column":7,"offset":55,"row":2}},"t":"Str"}],"l":{"end":{"column":11,"offset":59,"row":2},"filenameIndex":0,"start":{"column":2,"offset":50,"row":2}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","l":{"end":{"column":17,"offset":65,"row":2},"filenameIndex":0,"start":{"column":13,"offset":61,"row":2}},"t":"Str"},{"l":{"end":{"column":18,"offset":66,"row":2},"filenameIndex":0,"start":{"column":17,"offset":65,"row":2}},"t":"Space"},{"c":"2","l":{"end":{"column":19,"offset":67,"row":2},"filenameIndex":0,"start":{"column":18,"offset":66,"row":2}},"t":"Str"}],"l":{"end":{"column":22,"offset":70,"row":2},"filenameIndex":0,"start":{"column":13,"offset":61,"row":2}},"t":"Plain"}]]]]]]],[["",[],[]],[]]],"l":{"end":{"column":0,"offset":72,"row":3},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Table"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/docs/syntax/desugaring/index.qmd b/docs/syntax/desugaring/index.qmd new file mode 100644 index 0000000..38a31cc --- /dev/null +++ b/docs/syntax/desugaring/index.qmd @@ -0,0 +1,31 @@ +--- +title: "AST Desugaring" +--- + +Desugaring is the process of 
transforming extended syntax constructs into simpler, equivalent representations in the AST. Quarto Markdown includes several syntax features that don't have direct equivalents in Pandoc's standard AST. During the parsing and post-processing phases, `quarto-markdown-pandoc` transforms these extended constructs into standard Pandoc AST nodes with special attributes or classes, allowing downstream tools to recognize and process them appropriately. + +## Desugaring Transformations + +The following transformations are applied during AST post-processing: + +- [**Math with Attributes**](math-attributes.qmd) - Math expressions followed by attributes are wrapped in Span nodes with a special class +- **Editorial Marks** - Insert, Delete, Highlight, and EditComment nodes are converted to Span nodes with identifying classes +- **Table Caption Attributes** - Attributes in table captions are extracted and merged with the table's attribute field +- **Definition Lists** - Divs with `definition-list` class are transformed into DefinitionList blocks +- **Figures** - Single-image paragraphs are automatically promoted to Figure blocks with captions +- **Note References** - NoteReference nodes are converted to Span nodes with reference metadata +- **Shortcodes** - Shortcode nodes are transformed into Span nodes +- **Citation Suffixes** - Citation followed by space and span are merged into citation with suffix + +## Implementation + +All desugaring transformations are implemented in `src/pandoc/treesitter_utils/postprocess.rs`. The transformations are applied using a filter-based traversal system that walks the AST and applies pattern-matching transformations. + +## Why Desugar? + +Desugaring allows us to: + +1. **Maintain compatibility** - Output can be consumed by standard Pandoc tools and filters +2. **Preserve semantics** - Special classes and attributes allow downstream tools to distinguish desugared constructs +3. 
**Simplify the AST** - Fewer node types make the AST easier to work with and reason about +4. **Enable incremental processing** - Tools can process desugared output without needing to understand Quarto-specific syntax diff --git a/docs/syntax/desugaring/math-attributes.qmd b/docs/syntax/desugaring/math-attributes.qmd new file mode 100644 index 0000000..9d9fa1d --- /dev/null +++ b/docs/syntax/desugaring/math-attributes.qmd @@ -0,0 +1,193 @@ +--- +title: "Math with Attributes" +--- + +## Overview + +In Quarto Markdown, mathematical expressions (both inline and display) can be followed by attribute blocks to assign identifiers, classes, and key-value pairs. Since Pandoc's `Math` inline node doesn't support attributes directly, `quarto-markdown-pandoc` desugars these constructs into `Span` nodes containing the math expression, with a special class to indicate the transformation. + +## Syntax + +Math expressions with attributes follow this pattern: + +``` markdown +Inline math: $E = mc^2$ {#eq-einstein} + +Display math: +$$ +\int_0^\infty e^{-x^2} dx = \frac{\sqrt{\pi}}{2} +$$ {#eq-gaussian} +``` + +The attribute block may include: + +- **Identifiers**: `{#eq-name}` - typically used for cross-referencing equations +- **Classes**: `{.myclass}` - additional styling or semantic classes +- **Key-value pairs**: `{key="value"}` - arbitrary metadata + +## Transformation + +### Pattern Matching + +The desugaring process looks for this pattern in the inline node sequence: + +1. A `Math` node (inline or display) +2. Optionally, a `Space` node +3. An `Attr` node containing the attributes + +### Output Structure + +When this pattern is found, it is transformed into: + +```default +Span { + attr: { + id: <id from Attr>, + classes: ["quarto-math-with-attribute", ...additional classes from Attr], + attributes: <key-value pairs from Attr> + }, + content: [ + Math { <original math content> } + ] +} +``` + +### Example + +**Input QMD:** + +``` markdown +The famous equation $E = mc^2$ {#eq-einstein} shows the relationship. 
+``` + +**Output AST (JSON):** + +``` json +{ + "t": "Para", + "c": [ + {"t": "Str", "c": "The"}, + {"t": "Space"}, + {"t": "Str", "c": "famous"}, + {"t": "Space"}, + {"t": "Str", "c": "equation"}, + {"t": "Space"}, + { + "t": "Span", + "c": [ + ["eq-einstein", ["quarto-math-with-attribute"], []], + [ + { + "t": "Math", + "c": [ + {"t": "InlineMath"}, + "E = mc^2" + ] + } + ] + ] + }, + {"t": "Space"}, + {"t": "Str", "c": "shows"}, + {"t": "Space"}, + {"t": "Str", "c": "the"}, + {"t": "Space"}, + {"t": "Str", "c": "relationship."} + ] +} +``` + +## Implementation Details + +### Location in Code + +The transformation is implemented in `src/pandoc/treesitter_utils/postprocess.rs` as part of the combined `.with_inlines()` filter (lines ~452-489). + +### Processing Steps + +1. **First Pass - Math + Attr Detection**: The filter iterates through inline nodes looking for `Math` nodes +2. **Lookahead**: When a `Math` node is found, checks if it's followed by an optional `Space` and then an `Attr` +3. **Wrapping**: If the pattern matches, creates a new `Span` with: + - The `quarto-math-with-attribute` class prepended to any existing classes + - The ID and key-value attributes from the `Attr` node + - The original `Math` node as its sole content +4. **Skipping**: Advances the iterator past the `Math`, optional `Space`, and `Attr` nodes + +### Filter Chaining + +This transformation is part of a combined filter that also handles citation suffix patterns. The two transformations are applied sequentially: + +1. Math + Attr patterns are processed first +2. The result is then passed to the citation suffix processor + +This ensures both patterns can coexist in the same document without interference. + +## Use Cases + +### Cross-referencing Equations + +The primary use case is cross-referencing equations in Quarto documents: + +``` markdown +As shown in @eq-einstein, energy and mass are equivalent. 
+ +Einstein's famous equation: $E = mc^2$ {#eq-einstein} +``` + +The `#eq-` prefix is significant - it signals to Quarto's rendering pipeline that this is an equation reference. + +### Styling Math Blocks + +Additional classes can be used for custom styling: + +``` markdown +$\sum_{i=1}^n i = \frac{n(n+1)}{2}$ {#eq-sum .theorem} +``` + +### Metadata Attachment + +Key-value attributes can attach metadata to equations: + +``` markdown +$$ +F = ma +$$ {#eq-newton author="Newton" year="1687"} +``` + +## Recognition in Downstream Tools + +Downstream tools (Lua filters, Quarto extensions, etc.) can recognize desugared math expressions by checking for the `quarto-math-with-attribute` class on `Span` nodes: + +``` lua +if span.classes:includes("quarto-math-with-attribute") then + -- This span contains a math expression that originally had attributes + local math = span.content[1] -- The Math node + local id = span.identifier -- The equation ID + -- Process accordingly... +end +``` + +## Design Rationale + +### Why Use a Span? + +1. **Pandoc Compatibility**: `Span` is a standard Pandoc node that all tools understand +2. **Attribute Support**: Spans naturally support the id/classes/attributes triple +3. **Transparency**: Most renderers will pass through the span and render the contained math normally +4. **Extensibility**: The special class allows tools to opt-in to special handling + +### Why Not Extend Math? + +We could have extended Pandoc's `Math` node to include attributes, but this would: + +- Break compatibility with standard Pandoc +- Require changes to all downstream tools +- Make the AST diverge significantly from Pandoc's design + +### The Special Class Marker + +The `quarto-math-with-attribute` class serves as a marker that distinguishes desugared math from regular spans that happen to contain math. 
This allows tools to: + +- Distinguish intentional math-with-attributes from coincidental nesting +- Reconstruct the original syntax when round-tripping through QMD +- Apply equation-specific processing (numbering, cross-references, etc.) diff --git a/docs/syntax/index.qmd b/docs/syntax/index.qmd index fad9595..bdd64f3 100644 --- a/docs/syntax/index.qmd +++ b/docs/syntax/index.qmd @@ -14,3 +14,7 @@ The features documented here are currently under development. The syntax and beh - [Editorial Marks](editorial-marks.qmd) - Annotate text with highlights, insertions, deletions, and comments - [Footnotes](footnotes.qmd) - Add footnotes with inline or fenced block syntax - [YAML Metadata](yaml-metadata.qmd) - Control markdown parsing in metadata with YAML tags + +## Advanced Topics + +- [AST Desugaring](desugaring/index.qmd) - How extended syntax constructs are transformed into standard Pandoc AST nodes From 3ab7f1fbf49d1a732e8ab743d3346023779a638b Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Fri, 17 Oct 2025 17:26:17 -0500 Subject: [PATCH 11/11] docs on desugaring --- docs/syntax/desugaring/definition-lists.qmd | 79 ++++++++++++++ docs/syntax/desugaring/editorial-marks.qmd | 72 +++++++++++++ docs/syntax/desugaring/index.qmd | 17 +-- docs/syntax/desugaring/math-attributes.qmd | 112 -------------------- docs/syntax/desugaring/note-references.qmd | 73 +++++++++++++ docs/syntax/desugaring/table-captions.qmd | 70 ++++++++++++ 6 files changed, 298 insertions(+), 125 deletions(-) create mode 100644 docs/syntax/desugaring/definition-lists.qmd create mode 100644 docs/syntax/desugaring/editorial-marks.qmd create mode 100644 docs/syntax/desugaring/note-references.qmd create mode 100644 docs/syntax/desugaring/table-captions.qmd diff --git a/docs/syntax/desugaring/definition-lists.qmd b/docs/syntax/desugaring/definition-lists.qmd new file mode 100644 index 0000000..5116cf3 --- /dev/null +++ b/docs/syntax/desugaring/definition-lists.qmd @@ -0,0 +1,79 @@ +--- +title: 
"Definition Lists" +--- + +## Overview + +Quarto Markdown supports definition lists through a special div syntax with the `definition-list` class. During post-processing, divs meeting the structural requirements are transformed into Pandoc's native `DefinitionList` blocks. + +## Transformation + +A div with class `definition-list` containing a specific bullet list structure is converted to a `DefinitionList` block. + +### Required Structure + +```markdown +::: {.definition-list} +- Term 1 + - Definition 1a + - Definition 1b +- Term 2 + - Definition 2 +::: +``` + +The structure must follow these rules: + +1. Div must have `definition-list` class +2. Div contains exactly one bullet list +3. Each list item has exactly two blocks: + - First: Plain or Paragraph (the term) + - Second: BulletList (the definitions) + +## Example + +### Input QMD + +```markdown +::: {.definition-list} +- **Markdown** + - A lightweight markup language + - Easy to read and write +- **Pandoc** + - A universal document converter +::: +``` + +### Output Structure + +Transforms to a `DefinitionList` block: + +```json +{ + "t": "DefinitionList", + "c": [ + [ + [{"t": "Strong", "c": [{"t": "Str", "c": "Markdown"}]}], + [ + [[{"t": "Plain", "c": [{"t": "Str", "c": "A lightweight markup language"}]}]], + [[{"t": "Plain", "c": [{"t": "Str", "c": "Easy to read and write"}]}]] + ] + ], + [ + [{"t": "Strong", "c": [{"t": "Str", "c": "Pandoc"}]}], + [ + [[{"t": "Plain", "c": [{"t": "Str", "c": "A universal document converter"}]}]] + ] + ] + ] +} +``` + +## Validation + +Invalid structures are left as regular divs. 
Common validation failures: + +- Div contains more than one bullet list +- List items don't have exactly two blocks +- First block is not Plain or Paragraph +- Second block is not a BulletList diff --git a/docs/syntax/desugaring/editorial-marks.qmd b/docs/syntax/desugaring/editorial-marks.qmd new file mode 100644 index 0000000..81b09ec --- /dev/null +++ b/docs/syntax/desugaring/editorial-marks.qmd @@ -0,0 +1,72 @@ +--- +title: "Editorial Marks" +--- + +## Overview + +Quarto Markdown's editorial marks (`[!! highlight]`, `[++ insert]`, `[-- delete]`, `[>> comment]`) are custom inline node types that don't exist in Pandoc's AST. During post-processing, these nodes are desugared into standard `Span` nodes with special classes. + +## Transformation + +All four editorial mark types follow the same desugaring pattern: + +| Original Node | Special Class | Example | +|---------------|---------------|---------| +| `Insert` | `quarto-insert` | `[++ text]` | +| `Delete` | `quarto-delete` | `[-- text]` | +| `Highlight` | `quarto-highlight` | `[!! text]` | +| `EditComment` | `quarto-edit-comment` | `[>> text]` | + +The content is trimmed (leading/trailing spaces removed) before being placed in the Span. + +## Example + +### Input QMD + +```markdown +This has [++ added text]{#my-add .important} and [!! highlighted]{.warn}. 
+``` + +### Output AST (simplified) + +```json +[ + {"t": "Str", "c": "This"}, + {"t": "Space"}, + {"t": "Str", "c": "has"}, + {"t": "Space"}, + { + "t": "Span", + "c": [ + ["my-add", ["quarto-insert", "important"], []], + [{"t": "Str", "c": "added"}, {"t": "Space"}, {"t": "Str", "c": "text"}] + ] + }, + {"t": "Space"}, + {"t": "Str", "c": "and"}, + {"t": "Space"}, + { + "t": "Span", + "c": [ + ["", ["quarto-highlight", "warn"], []], + [{"t": "Str", "c": "highlighted"}] + ] + } +] +``` + +## Recognition + +Downstream tools can identify desugared editorial marks by checking for the special classes: + +```lua +if span.classes:includes("quarto-insert") then + -- Handle insertion suggestion +elseif span.classes:includes("quarto-delete") then + -- Handle deletion suggestion +elseif span.classes:includes("quarto-highlight") then + -- Handle highlight +elseif span.classes:includes("quarto-edit-comment") then + -- Handle editorial comment +end +``` diff --git a/docs/syntax/desugaring/index.qmd b/docs/syntax/desugaring/index.qmd index 38a31cc..eedc223 100644 --- a/docs/syntax/desugaring/index.qmd +++ b/docs/syntax/desugaring/index.qmd @@ -9,23 +9,14 @@ Desugaring is the process of transforming extended syntax constructs into simple The following transformations are applied during AST post-processing: - [**Math with Attributes**](math-attributes.qmd) - Math expressions followed by attributes are wrapped in Span nodes with a special class -- **Editorial Marks** - Insert, Delete, Highlight, and EditComment nodes are converted to Span nodes with identifying classes -- **Table Caption Attributes** - Attributes in table captions are extracted and merged with the table's attribute field -- **Definition Lists** - Divs with `definition-list` class are transformed into DefinitionList blocks +- [**Editorial Marks**](editorial-marks.qmd) - Insert, Delete, Highlight, and EditComment nodes are converted to Span nodes with identifying classes +- [**Table Caption 
Attributes**](table-captions.qmd) - Attributes in table captions are extracted and merged with the table's attribute field +- [**Definition Lists**](definition-lists.qmd) - Divs with `definition-list` class are transformed into DefinitionList blocks +- [**Note References**](note-references.qmd) - NoteReference nodes are converted to Span nodes with reference metadata - **Figures** - Single-image paragraphs are automatically promoted to Figure blocks with captions -- **Note References** - NoteReference nodes are converted to Span nodes with reference metadata - **Shortcodes** - Shortcode nodes are transformed into Span nodes - **Citation Suffixes** - Citation followed by space and span are merged into citation with suffix ## Implementation All desugaring transformations are implemented in `src/pandoc/treesitter_utils/postprocess.rs`. The transformations are applied using a filter-based traversal system that walks the AST and applies pattern-matching transformations. - -## Why Desugar? - -Desugaring allows us to: - -1. **Maintain compatibility** - Output can be consumed by standard Pandoc tools and filters -2. **Preserve semantics** - Special classes and attributes allow downstream tools to distinguish desugared constructs -3. **Simplify the AST** - Fewer node types make the AST easier to work with and reason about -4. **Enable incremental processing** - Tools can process desugared output without needing to understand Quarto-specific syntax diff --git a/docs/syntax/desugaring/math-attributes.qmd b/docs/syntax/desugaring/math-attributes.qmd index 9d9fa1d..947cb8f 100644 --- a/docs/syntax/desugaring/math-attributes.qmd +++ b/docs/syntax/desugaring/math-attributes.qmd @@ -35,23 +35,6 @@ The desugaring process looks for this pattern in the inline node sequence: 2. Optionally, a `Space` node 3. 
An `Attr` node containing the attributes -### Output Structure - -When this pattern is found, it is transformed into: - -```default -Span { - attr: { - id: , - classes: ["quarto-math-with-attribute", ...additional classes from Attr], - attributes: - }, - content: [ - Math { } - ] -} -``` - ### Example **Input QMD:** @@ -96,98 +79,3 @@ The famous equation $E = mc^2$ {#eq-einstein} shows the relationship. ] } ``` - -## Implementation Details - -### Location in Code - -The transformation is implemented in `src/pandoc/treesitter_utils/postprocess.rs` as part of the combined `.with_inlines()` filter (lines ~452-489). - -### Processing Steps - -1. **First Pass - Math + Attr Detection**: The filter iterates through inline nodes looking for `Math` nodes -2. **Lookahead**: When a `Math` node is found, checks if it's followed by an optional `Space` and then an `Attr` -3. **Wrapping**: If the pattern matches, creates a new `Span` with: - - The `quarto-math-with-attribute` class prepended to any existing classes - - The ID and key-value attributes from the `Attr` node - - The original `Math` node as its sole content -4. **Skipping**: Advances the iterator past the `Math`, optional `Space`, and `Attr` nodes - -### Filter Chaining - -This transformation is part of a combined filter that also handles citation suffix patterns. The two transformations are applied sequentially: - -1. Math + Attr patterns are processed first -2. The result is then passed to the citation suffix processor - -This ensures both patterns can coexist in the same document without interference. - -## Use Cases - -### Cross-referencing Equations - -The primary use case is cross-referencing equations in Quarto documents: - -``` markdown -As shown in @eq-einstein, energy and mass are equivalent. - -Einstein's famous equation: $E = mc^2$ {#eq-einstein} -``` - -The `#eq-` prefix is significant - it signals to Quarto's rendering pipeline that this is an equation reference. 
- -### Styling Math Blocks - -Additional classes can be used for custom styling: - -``` markdown -$\sum_{i=1}^n i = \frac{n(n+1)}{2}$ {#eq-sum .theorem} -``` - -### Metadata Attachment - -Key-value attributes can attach metadata to equations: - -``` markdown -$$ -F = ma -$$ {#eq-newton author="Newton" year="1687"} -``` - -## Recognition in Downstream Tools - -Downstream tools (Lua filters, Quarto extensions, etc.) can recognize desugared math expressions by checking for the `quarto-math-with-attribute` class on `Span` nodes: - -``` lua -if span.classes:includes("quarto-math-with-attribute") then - -- This span contains a math expression that originally had attributes - local math = span.content[1] -- The Math node - local id = span.identifier -- The equation ID - -- Process accordingly... -end -``` - -## Design Rationale - -### Why Use a Span? - -1. **Pandoc Compatibility**: `Span` is a standard Pandoc node that all tools understand -2. **Attribute Support**: Spans naturally support the id/classes/attributes triple -3. **Transparency**: Most renderers will pass through the span and render the contained math normally -4. **Extensibility**: The special class allows tools to opt-in to special handling - -### Why Not Extend Math? - -We could have extended Pandoc's `Math` node to include attributes, but this would: - -- Break compatibility with standard Pandoc -- Require changes to all downstream tools -- Make the AST diverge significantly from Pandoc's design - -### The Special Class Marker - -The `quarto-math-with-attribute` class serves as a marker that distinguishes desugared math from regular spans that happen to contain math. This allows tools to: - -- Distinguish intentional math-with-attributes from coincidental nesting -- Reconstruct the original syntax when round-tripping through QMD -- Apply equation-specific processing (numbering, cross-references, etc.) 
diff --git a/docs/syntax/desugaring/note-references.qmd b/docs/syntax/desugaring/note-references.qmd new file mode 100644 index 0000000..457bb57 --- /dev/null +++ b/docs/syntax/desugaring/note-references.qmd @@ -0,0 +1,73 @@ +--- +title: "Note References" +--- + +## Overview + +Note references in Quarto Markdown use a custom `NoteReference` inline node type. During post-processing, these are desugared into standard `Span` nodes with a special class and metadata storing the reference ID. + +## Transformation + +Each `NoteReference` node is transformed to: + +```default +Span { + attr: { + id: "", + classes: ["quarto-note-reference"], + attributes: {"reference-id": <note-id>} + }, + content: [] +} +``` + +The resulting span has empty content - the reference ID is stored in the attributes. + +## Example + +### Input QMD + +```markdown +Here is some text[^note-1] with a footnote. + +[^note-1]: This is the footnote content. +``` + +### Output Structure + +The `[^note-1]` becomes: + +```json +{ + "t": "Span", + "c": [ + ["", ["quarto-note-reference"], [["reference-id", "note-1"]]], + [] + ] +} +``` + +## Recognition + +Downstream tools can identify note references by checking for the special class and extracting the ID: + +```lua +if span.classes:includes("quarto-note-reference") then + local note_id = span.attributes["reference-id"] + -- Process the note reference... +end +``` + +## Use Cases + +### Footnotes + +Standard footnote references that will be rendered as superscript numbers linking to the footnote content. + +### Endnotes + +References to notes collected at the end of the document or section. + +### Cross-references + +References to note definitions elsewhere in the document for documentation or citation purposes. 
diff --git a/docs/syntax/desugaring/table-captions.qmd b/docs/syntax/desugaring/table-captions.qmd new file mode 100644 index 0000000..27ca65e --- /dev/null +++ b/docs/syntax/desugaring/table-captions.qmd @@ -0,0 +1,70 @@ +--- +title: "Table Caption Attributes" +--- + +## Overview + +Table captions in Quarto Markdown can include attributes after the caption text. Since Pandoc's `Table` node stores caption content separately from table attributes, these caption attributes are extracted and merged into the table's attribute field during post-processing. + +## Transformation + +When a table caption ends with an `Attr` node: + +1. Extract the `Attr` from the end of caption content +2. Merge its key-value pairs into the table's attributes +3. Merge its classes into the table's class list +4. Use its ID if the table doesn't already have one + +## Example + +### Input QMD + +```markdown +| Col 1 | Col 2 | +|-------|-------| +| A | B | + +: Table caption {#tbl-mytable .special tbl-colwidths="[30,70]"} +``` + +### Output Structure + +The table's `attr` field becomes: + +```json +{ + "id": "tbl-mytable", + "classes": ["special"], + "attributes": { + "tbl-colwidths": "[30,70]" + } +} +``` + +The caption content no longer includes the `Attr` node - only the text "Table caption" remains. + +## Use Cases + +### Cross-referencing Tables + +```markdown +See @tbl-results for the data. + +: Results {#tbl-results} +``` + +### Column Width Specifications + +The `tbl-colwidths` attribute is commonly used to control column widths: + +```markdown +: Wide second column {tbl-colwidths="[30,70]"} +``` + +### Custom Styling + +Classes can be used for custom table styling: + +```markdown +: Summary table {.striped .bordered} +```