diff --git a/Cargo.lock b/Cargo.lock index 998c8f8..db17e8a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -47,7 +47,7 @@ version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -58,9 +58,15 @@ checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys", + "windows-sys 0.60.2", ] +[[package]] +name = "anyhow" +version = "1.0.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" + [[package]] name = "arbitrary" version = "1.4.2" @@ -152,6 +158,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "colored" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117725a109d387c937a1533ce01b450cbde6b88abceea8473c4d7a85853cda3c" +dependencies = [ + "lazy_static", + "windows-sys 0.59.0", +] + [[package]] name = "console_error_panic_hook" version = "0.1.7" @@ -236,6 +252,25 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "include_dir" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "923d117408f1e49d914f1a379a309cffe4f18c05cf4e3d12e613a15fc81bd0dd" +dependencies = [ + "include_dir_macros", +] + +[[package]] +name = "include_dir_macros" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cab85a7ed0bd5f0e76d93846e0147172bed2e2d3f859bcc33a8d9699cad1a75" +dependencies = [ + "proc-macro2", + "quote", +] + [[package]] name = 
"indexmap" version = "2.11.0" @@ -278,6 +313,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + [[package]] name = "libc" version = "0.2.175" @@ -343,6 +384,21 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "qmd-syntax-helper" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "colored", + "glob", + "include_dir", + "quarto-markdown-pandoc", + "regex", + "serde", + "serde_json", +] + [[package]] name = "quarto-markdown-pandoc" version = "0.0.0" @@ -684,7 +740,7 @@ version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" dependencies = [ - "windows-sys", + "windows-sys 0.60.2", ] [[package]] @@ -693,13 +749,38 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" dependencies = [ - "windows-targets", + "windows-targets 0.53.3", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", 
+ "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -709,58 +790,106 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ "windows-link", - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_gnullvm", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", ] +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_aarch64_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + [[package]] name = "windows_i686_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_i686_msvc" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnu" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_gnullvm" version = "0.53.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + [[package]] name = "windows_x86_64_msvc" version = "0.53.0" diff 
--git a/crates/qmd-syntax-helper/Cargo.toml b/crates/qmd-syntax-helper/Cargo.toml new file mode 100644 index 0000000..9b4f98c --- /dev/null +++ b/crates/qmd-syntax-helper/Cargo.toml @@ -0,0 +1,32 @@ +[package] +name = "qmd-syntax-helper" +version = "0.1.0" +authors.workspace = true +homepage.workspace = true +keywords.workspace = true +categories.workspace = true +license.workspace = true +repository.workspace = true +edition.workspace = true + +[lib] +name = "qmd_syntax_helper" +path = "src/lib.rs" + +[[bin]] +name = "qmd-syntax-helper" +path = "src/main.rs" + +[dependencies] +clap = { version = "4.5", features = ["derive"] } +anyhow = "1.0" +regex = "1.10" +colored = "2.1" +quarto-markdown-pandoc.workspace = true +include_dir = "0.7" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +glob = "0.3" + +[lints] +workspace = true diff --git a/crates/qmd-syntax-helper/README.md b/crates/qmd-syntax-helper/README.md new file mode 100644 index 0000000..738bd13 --- /dev/null +++ b/crates/qmd-syntax-helper/README.md @@ -0,0 +1,125 @@ +# qmd-syntax-helper + +A command-line tool for converting and fixing Quarto Markdown syntax issues. + +## Overview + +`qmd-syntax-helper` helps migrate Quarto Markdown documents between different syntax styles and fix common syntax issues. It's designed to handle bulk conversions across entire projects while preserving document semantics. 
+ +## Features + +### Grid Table Conversion + +Convert Pandoc-style grid tables to Quarto's list-table format: + +```bash +# Convert a single file (output to stdout) +qmd-syntax-helper ungrid-tables input.qmd + +# Convert in-place +qmd-syntax-helper ungrid-tables --in-place input.qmd + +# Check what would change without modifying files +qmd-syntax-helper ungrid-tables --check input.qmd + +# Convert multiple files +qmd-syntax-helper ungrid-tables --in-place docs/**/*.qmd + +# Verbose output +qmd-syntax-helper ungrid-tables --in-place --verbose input.qmd +``` + +**Before (Grid Table):** +```markdown ++-----------+-----------+ +| Header 1 | Header 2 | ++===========+===========+ +| Cell 1 | Cell 2 | ++-----------+-----------+ +``` + +**After (List Table):** +```markdown +::: {.list-table header-rows="1" widths="0.5,0.5"} + +* * Header 1 + * Header 2 + +* * Cell 1 + * Cell 2 + +::: +``` + +## Installation + +From the quarto-markdown repository: + +```bash +cargo build --release --bin qmd-syntax-helper +# Binary will be in target/release/qmd-syntax-helper +``` + +## Requirements + +- Rust 2024 edition +- For grid table conversion: + - `pandoc` must be in PATH + - `quarto-markdown-pandoc` workspace crate (used as library) + +## Future Converters + +Planned conversions include: +- Reference-style links → inline links +- Attribute syntax fixes +- Shortcode migrations +- YAML frontmatter fixes + +## Development + +### Running Tests + +```bash +cargo test --package qmd-syntax-helper +``` + +### Adding New Converters + +1. Create a new module in `src/conversions/` +2. Implement the conversion logic +3. Add a new subcommand in `src/main.rs` +4. 
Add tests in `tests/` + +## Architecture + +``` +src/ + main.rs # CLI entry point + lib.rs # Public API + conversions/ + mod.rs + grid_tables.rs # Grid table converter + utils/ + file_io.rs # File I/O utilities + resources.rs # Embedded resource management +resources/ + filters/ + grid-table-to-list-table.lua # Pandoc Lua filter (embedded at compile time) +``` + +### Conversion Pipeline + +Grid table conversion uses a two-stage pipeline: + +1. **Pandoc with Lua filter**: Converts Markdown with grid tables to Pandoc JSON AST + - Uses embedded Lua filter to transform Table nodes to list-table Div format + - Extracted to temp directory at runtime via ResourceManager + +2. **quarto-markdown-pandoc library**: Converts Pandoc JSON AST back to Markdown + - Uses `quarto_markdown_pandoc::readers::json::read()` to parse JSON + - Uses `quarto_markdown_pandoc::writers::qmd::write()` to generate Markdown + - Pure Rust library calls (no subprocess overhead) + +## License + +MIT diff --git a/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua b/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua new file mode 100644 index 0000000..1f01c12 --- /dev/null +++ b/crates/qmd-syntax-helper/resources/filters/definition-list-to-div.lua @@ -0,0 +1,60 @@ +-- Lua filter to convert Pandoc DefinitionList AST nodes to div-based definition lists +-- This produces output in the definition-list div syntax used by quarto-markdown + +if PANDOC_VERSION and PANDOC_VERSION.must_be_at_least then + PANDOC_VERSION:must_be_at_least("2.11") +else + error("pandoc version >=2.11 is required") +end + +-- Convert a DefinitionList to a div with .definition-list class +local function definition_list_to_div(def_list) + -- Build div attributes with .definition-list class + local div_attr = pandoc.Attr('', {'definition-list'}, {}) + + -- Build the outer bullet list containing all term-definition pairs + local outer_items = {} + + -- Each item in the definition list is a tuple: 
(term, definitions) + -- term: list of inline elements + -- definitions: list of definition blocks (each definition is a list of blocks) + for _, item in ipairs(def_list.content) do + local term = item[1] -- List of inline elements + local definitions = item[2] -- List of definition blocks + + -- Create the inner bullet list containing the definitions + local def_items = {} + for _, def_blocks in ipairs(definitions) do + -- Each definition is a list of blocks + -- Clone the blocks to avoid modifying the original + local blocks = pandoc.Blocks({}) + for _, block in ipairs(def_blocks) do + table.insert(blocks, block:clone()) + end + + -- Ensure we have at least one block + if #blocks == 0 then + blocks = pandoc.Blocks({pandoc.Para({})}) + end + + table.insert(def_items, blocks) + end + + -- Create a bullet list for the definitions + local def_list_elem = pandoc.BulletList(def_items) + + -- Create the outer list item containing: + -- 1. The term as a paragraph + -- 2. The nested bullet list of definitions + local term_para = pandoc.Para(term) + table.insert(outer_items, {term_para, def_list_elem}) + end + + -- Create the outer bullet list (list of term-definition pairs) + local outer_list = pandoc.BulletList(outer_items) + + -- Create the div containing the outer list + return pandoc.Div({outer_list}, div_attr) +end + +return {{DefinitionList = definition_list_to_div}} diff --git a/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua b/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua new file mode 100644 index 0000000..b6eb372 --- /dev/null +++ b/crates/qmd-syntax-helper/resources/filters/grid-table-to-list-table.lua @@ -0,0 +1,189 @@ +-- Lua filter to convert Pandoc grid tables to list-table format +-- This produces output that can be processed by list-table.lua + +if PANDOC_VERSION and PANDOC_VERSION.must_be_at_least then + PANDOC_VERSION:must_be_at_least("2.11") +else + error("pandoc version >=2.11 is required") +end + +-- 
Convert alignment enum to character code +local function alignment_to_char(align) + local align_str = tostring(align) + if align_str == 'AlignLeft' then return 'l' + elseif align_str == 'AlignRight' then return 'r' + elseif align_str == 'AlignCenter' then return 'c' + else return 'd' end +end + +-- Convert a cell to a list of blocks with optional attribute span prepended +local function cell_to_blocks(cell) + -- Extract cell properties using Lua API + local contents = cell.contents + local align = cell.alignment + local rowspan = cell.row_span + local colspan = cell.col_span + local attr = cell.attr + + -- Clone the blocks to avoid modifying the original + local blocks = pandoc.Blocks({}) + for _, block in ipairs(contents) do + table.insert(blocks, block:clone()) + end + + -- If we have non-default cell attributes, prepend an empty span + local align_str = tostring(align) + if rowspan ~= 1 or colspan ~= 1 or align_str ~= 'AlignDefault' then + local span_attr = pandoc.Attr('', {}, {}) + if colspan ~= 1 then + span_attr.attributes.colspan = tostring(colspan) + end + if rowspan ~= 1 then + span_attr.attributes.rowspan = tostring(rowspan) + end + if align_str ~= 'AlignDefault' then + span_attr.attributes.align = alignment_to_char(align) + end + + local empty_span = pandoc.Span({}, span_attr) + + -- Insert the empty span at the beginning of the first block's content + if #blocks > 0 and blocks[1].content then + table.insert(blocks[1].content, 1, empty_span) + else + -- If there's no content, create a paragraph with just the span + blocks = pandoc.Blocks({pandoc.Para({empty_span})}) + end + end + + -- Ensure we have at least one block + if #blocks == 0 then + blocks = pandoc.Blocks({pandoc.Para({})}) + end + + return blocks +end + +-- Convert a Pandoc Table to a list-table Div +local function table_to_list_table(tbl) + -- Extract table components using Lua API + local attr = tbl.attr + local caption = tbl.caption + local colspecs = tbl.colspecs + local thead = tbl.head + 
local tbodies = tbl.bodies + local tfoot = tbl.foot + + -- Build div attributes, starting from table attributes + local div_attr = pandoc.Attr(attr.identifier, {'list-table'}, {}) + + -- Copy table classes + for _, class in ipairs(attr.classes) do + table.insert(div_attr.classes, class) + end + + -- Copy table attributes + for k, v in pairs(attr.attributes) do + div_attr.attributes[k] = v + end + + -- Count header rows from thead + local thead_rows = thead.rows + local header_row_count = #thead_rows + if header_row_count > 0 then + div_attr.attributes['header-rows'] = tostring(header_row_count) + end + + -- Extract alignments and widths from colspecs + local aligns = {} + local widths = {} + local has_non_default_widths = false + + for i, colspec in ipairs(colspecs) do + -- ColSpec is a pair: [1] = alignment, [2] = width + local align = colspec[1] + local width = colspec[2] + + table.insert(aligns, alignment_to_char(align)) + + -- Width is a number (0.0-1.0) or ColWidthDefault + if type(width) == "number" and width > 0 then + table.insert(widths, tostring(width)) + has_non_default_widths = true + else + -- ColWidthDefault or 0 + table.insert(widths, "1") + end + end + + -- Only add aligns if there are non-default alignments + local has_non_default_aligns = false + for _, a in ipairs(aligns) do + if a ~= 'd' then + has_non_default_aligns = true + break + end + end + + if has_non_default_aligns then + div_attr.attributes.aligns = table.concat(aligns, ',') + end + + if has_non_default_widths then + div_attr.attributes.widths = table.concat(widths, ',') + end + + -- Build div content + local content = {} + + -- Add caption if present + if caption and caption.long and #caption.long > 0 then + for _, block in ipairs(caption.long) do + table.insert(content, block) + end + end + + -- Build list of rows (each row is a list item containing a bullet list of cells) + local row_items = {} + + -- Add header rows + for _, row in ipairs(thead_rows) do + local cells = row.cells + 
local cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + -- Each row item contains a single bullet list of cells + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + + -- Add body rows from all table bodies + for _, tbody in ipairs(tbodies) do + for _, row in ipairs(tbody.body) do + local cells = row.cells + local cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + -- Each row item contains a single bullet list of cells + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + end + + -- Add footer rows if any + for _, row in ipairs(tfoot.rows) do + local cells = row.cells + local cell_blocks_list = {} + for _, cell in ipairs(cells) do + table.insert(cell_blocks_list, cell_to_blocks(cell)) + end + table.insert(row_items, {pandoc.BulletList(cell_blocks_list)}) + end + + -- Create the outer bullet list (list of rows) + table.insert(content, pandoc.BulletList(row_items)) + + return pandoc.Div(content, div_attr) +end + +return {{Table = table_to_list_table}} diff --git a/crates/qmd-syntax-helper/src/conversions/definition_lists.rs b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs new file mode 100644 index 0000000..9124fd6 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/definition_lists.rs @@ -0,0 +1,300 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use regex::Regex; +use std::path::Path; +use std::process::{Command, Stdio}; + +use crate::rule::{CheckResult, ConvertResult, Rule}; +use crate::utils::file_io::{read_file, write_file}; +use crate::utils::resources::ResourceManager; +use quarto_markdown_pandoc::readers::json; +use quarto_markdown_pandoc::writers::qmd; + +pub struct DefinitionListConverter { + def_item_regex: Regex, + resources: ResourceManager, +} + +#[derive(Debug)] +pub struct DefinitionList { + pub text: String, + pub start_line: usize, + pub 
end_line: usize, +} + +impl DefinitionListConverter { + pub fn new() -> Result<Self> { + Ok(Self { + // Matches definition list items that start with `:` followed by spaces + def_item_regex: Regex::new(r"^:\s+").unwrap(), + resources: ResourceManager::new()?, + }) + } + + /// Find all definition lists in the content + pub fn find_definition_lists(&self, content: &str) -> Vec<DefinitionList> { + let lines: Vec<&str> = content.lines().collect(); + let mut lists = Vec::new(); + let mut i = 0; + + while i < lines.len() { + let line = lines[i]; + + // Look for a definition item (line starting with `: `) + // But not div fences (`::` or `:::`) + if self.def_item_regex.is_match(line) && !line.starts_with("::") { + // Found a definition item, now scan backwards to find the term + let mut start_idx = i; + + // Skip back over any blank lines + while start_idx > 0 && lines[start_idx - 1].trim().is_empty() { + start_idx -= 1; + } + + // The line before the blank lines should be the term + if start_idx > 0 { + start_idx -= 1; + } + + // Now scan forward to collect all terms and definitions in this list + let mut end_idx = i; + i += 1; + + loop { + // Continue through continuation lines and blank lines + while i < lines.len() { + let line = lines[i]; + if line.starts_with(" ") || line.trim().is_empty() { + end_idx = i; + i += 1; + } else { + break; + } + } + + // Check if the next item is part of this definition list + // It should be: optional non-blank line (term), then blank lines, then `: ` + if i < lines.len() { + let potential_term = lines[i]; + + // Not a definition line, might be next term + if !self.def_item_regex.is_match(potential_term) + || potential_term.starts_with("::") + { + // Look ahead for a definition line + let mut j = i + 1; + while j < lines.len() && lines[j].trim().is_empty() { + j += 1; + } + + if j < lines.len() + && self.def_item_regex.is_match(lines[j]) + && !lines[j].starts_with("::") + { + // Found another term-definition pair + end_idx = j; + i = j + 1; + continue; + } 
else { + // No more definition items + break; + } + } else { + // This IS a definition line (continuation of same term) + end_idx = i; + i += 1; + continue; + } + } else { + break; + } + } + + // Extract the definition list text + let list_lines = &lines[start_idx..=end_idx]; + let list_text = list_lines.join("\n"); + + lists.push(DefinitionList { + text: list_text, + start_line: start_idx, + end_line: end_idx, + }); + } else { + i += 1; + } + } + + lists + } + + /// Convert a single definition list by: + /// 1. Running pandoc with the Lua filter to convert to JSON + /// 2. Using quarto-markdown-pandoc library to convert JSON to markdown + pub fn convert_list(&self, list_text: &str) -> Result<String> { + use std::io::Write; + + // Get the Lua filter path from resources + let filter_path = self + .resources + .get_resource("filters/definition-list-to-div.lua")?; + + // Step 1: pandoc -f markdown -t json -L filter.lua + let mut pandoc = Command::new("pandoc") + .args(&["-f", "markdown", "-t", "json"]) + .arg("-L") + .arg(&filter_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn pandoc")?; + + { + let stdin = pandoc + .stdin + .as_mut() + .context("Failed to get pandoc stdin")?; + stdin.write_all(list_text.as_bytes())?; + } + + let pandoc_output = pandoc.wait_with_output()?; + + if !pandoc_output.status.success() { + anyhow::bail!( + "pandoc failed: {}", + String::from_utf8_lossy(&pandoc_output.stderr) + ); + } + + // Step 2: Use library to convert JSON to markdown + let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); + let (pandoc_ast, _ctx) = + json::read(&mut json_reader).context("Failed to parse JSON output from pandoc")?; + + let mut output = Vec::new(); + qmd::write(&pandoc_ast, &mut output).context("Failed to write markdown output")?; + + let result = String::from_utf8(output) + .context("Failed to parse output as UTF-8")? 
+ .trim_end() + .to_string(); + + Ok(result) + } +} + +impl Rule for DefinitionListConverter { + fn name(&self) -> &str { + "definition-lists" + } + + fn description(&self) -> &str { + "Convert definition lists to div-based format" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result<Vec<CheckResult>> { + let content = read_file(file_path)?; + let lists = self.find_definition_lists(&content); + + if verbose { + if lists.is_empty() { + println!(" No definition lists found"); + } else { + println!(" Found {} definition list(s)", lists.len()); + } + } + + let mut results = Vec::new(); + for list in lists { + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Definition list found".to_string()), + location: Some(crate::rule::SourceLocation { + row: list.start_line + 1, // Convert 0-indexed to 1-indexed + column: 1, + }), + }); + } + + Ok(results) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result<ConvertResult> { + let content = read_file(file_path)?; + let lists = self.find_definition_lists(&content); + + if lists.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + // Convert each list and build new content + let mut lines: Vec<String> = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; + + for (idx, list) in lists.iter().enumerate() { + if verbose { + println!(" Converting list {}...", idx + 1); + } + + let converted = self.convert_list(&list.text)?; + let start = (list.start_line as isize + offset) as usize; + let end = (list.end_line as isize + offset) as usize; + + if check_mode && verbose { + println!( + " List {} at lines {}-{}:", + idx + 1, + list.start_line, + list.end_line + ); + println!( + " {} {} lines -> {} {} lines", + 
"Original:".red(), + list.end_line - list.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + let converted_lines: Vec<String> = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + lines.splice(start..=end, converted_lines); + offset += new_len as isize - old_len as isize; + } + + let new_content = lines.join("\n") + "\n"; + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: lists.len(), + message: if in_place { + Some(format!("Converted {} list(s)", lists.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs new file mode 100644 index 0000000..db18d91 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/div_whitespace.rs @@ -0,0 +1,324 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::Path; + +use crate::rule::{CheckResult, ConvertResult, Rule}; +use crate::utils::file_io::{read_file, write_file}; + +#[derive(Debug, Serialize, Deserialize)] +struct ErrorLocation { + row: usize, + column: usize, + byte_offset: usize, + size: usize, +} + +#[derive(Debug, Serialize, Deserialize)] +struct ParseError { + filename: String, + title: String, + message: String, + location: ErrorLocation, +} + +pub struct DivWhitespaceConverter {} + +impl DivWhitespaceConverter { + pub fn new() -> Result<Self> { + Ok(Self {}) + } + + /// Parse a file and get error locations as JSON + fn get_parse_errors(&self, file_path: &Path) -> Result<Vec<ParseError>> { + let content = fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + // Use the quarto-markdown-pandoc library to parse with 
JSON error formatter + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, // not loose mode + &filename, + &mut sink, + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec<String>, + ), + ); + + match result { + Ok(_) => Ok(Vec::new()), // No errors + Err(error_messages) => { + // Parse the JSON error output + // The error messages come as a single JSON array string + if error_messages.is_empty() { + return Ok(Vec::new()); + } + + let json_str = error_messages.join(""); + + // Try to parse as JSON array + match serde_json::from_str::<Vec<ParseError>>(&json_str) { + Ok(errors) => Ok(errors), + Err(_) => { + // If parsing fails, the messages are likely plain text warnings/debug messages + // rather than actual syntax errors. These don't indicate div whitespace issues, + // so we can safely ignore them for this specific rule. 
+ Ok(Vec::new()) + } + } + } + } + } + + /// Find div fence errors that need whitespace fixes + fn find_div_whitespace_errors(&self, content: &str, errors: &[ParseError]) -> Vec<usize> { + let mut fix_positions = Vec::new(); + let lines: Vec<&str> = content.lines().collect(); + + for error in errors { + // Skip errors that are not about div fences + // We're looking for "Missing Space After Div Fence" or errors on lines with ::: + let is_div_error = error.title.contains("Div Fence") || error.title == "Parse error"; + + if !is_div_error { + continue; + } + + // The error might be on the line itself or the line before (for div fences) + // Check both the current line and the previous line + let lines_to_check = if error.location.row > 0 { + vec![error.location.row - 1, error.location.row] + } else { + vec![error.location.row] + }; + + for &line_idx in &lines_to_check { + if line_idx >= lines.len() { + continue; + } + + let line = lines[line_idx]; + + // Check if this line starts with ::: followed immediately by { + let trimmed = line.trim_start(); + if let Some(after_colon) = trimmed.strip_prefix(":::") { + if after_colon.starts_with('{') { + // Calculate the position right after ::: + // We need byte offset, not char offset + let line_start = content + .lines() + .take(line_idx) + .map(|l| l.len() + 1) // +1 for newline + .sum::<usize>(); + + let indent_bytes = line.len() - trimmed.len(); + let fix_pos = line_start + indent_bytes + 3; // +3 for ":::" + + fix_positions.push(fix_pos); + break; // Found it, no need to check other lines for this error + } + } + } + } + + // Remove duplicates and sort + fix_positions.sort_unstable(); + fix_positions.dedup(); + + fix_positions + } + + /// Convert byte offset to row/column (1-indexed) + fn byte_offset_to_location(&self, content: &str, byte_offset: usize) -> crate::rule::SourceLocation { + let mut row = 1; + let mut column = 1; + let mut current_offset = 0; + + for ch in content.chars() { + if current_offset >= byte_offset { + break; + } 
+ current_offset += ch.len_utf8(); + + if ch == '\n' { + row += 1; + column = 1; + } else { + column += 1; + } + } + + crate::rule::SourceLocation { row, column } + } + + /// Apply fixes to content by inserting spaces at specified positions + fn apply_fixes(&self, content: &str, fix_positions: &[usize]) -> String { + let mut result = String::with_capacity(content.len() + fix_positions.len()); + let mut last_pos = 0; + + for &pos in fix_positions { + // Copy content up to this position + result.push_str(&content[last_pos..pos]); + // Insert a space + result.push(' '); + last_pos = pos; + } + + // Copy remaining content + result.push_str(&content[last_pos..]); + + result + } + + /// Process a single file + #[allow(dead_code)] + pub fn process_file( + &self, + file_path: &Path, + in_place: bool, + check: bool, + verbose: bool, + ) -> Result<()> { + let content = read_file(file_path)?; + + // Get parse errors + let errors = self.get_parse_errors(file_path)?; + + if errors.is_empty() { + if verbose { + println!(" No div whitespace issues found"); + } + return Ok(()); + } + + // Find positions that need fixes + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if fix_positions.is_empty() { + if verbose { + println!(" No div whitespace issues found"); + } + return Ok(()); + } + + if verbose || check { + println!( + " Found {} div fence(s) needing whitespace fixes", + fix_positions.len().to_string().yellow() + ); + } + + if check { + println!(" {} No changes written (--check mode)", "✓".green()); + return Ok(()); + } + + // Apply fixes + let new_content = self.apply_fixes(&content, &fix_positions); + + if in_place { + write_file(file_path, &new_content)?; + println!( + " {} Fixed {} div fence(s)", + "✓".green(), + fix_positions.len() + ); + } else { + // Output to stdout + print!("{}", new_content); + } + + Ok(()) + } +} + +impl Rule for DivWhitespaceConverter { + fn name(&self) -> &str { + "div-whitespace" + } + + fn description(&self) -> &str { 
+ "Fix div fences missing whitespace (:::{ -> ::: {)" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result> { + let content = read_file(file_path)?; + let errors = self.get_parse_errors(file_path)?; + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if verbose { + if fix_positions.is_empty() { + println!(" No div whitespace issues found"); + } else { + println!( + " Found {} div fence(s) needing whitespace fixes", + fix_positions.len() + ); + } + } + + let mut results = Vec::new(); + for &pos in &fix_positions { + let location = self.byte_offset_to_location(&content, pos); + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Div fence missing whitespace (:::{ should be ::: {)".to_string()), + location: Some(location), + }); + } + + Ok(results) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + _verbose: bool, + ) -> Result { + let content = read_file(file_path)?; + let errors = self.get_parse_errors(file_path)?; + let fix_positions = self.find_div_whitespace_errors(&content, &errors); + + if fix_positions.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + let new_content = self.apply_fixes(&content, &fix_positions); + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: fix_positions.len(), + message: if in_place { + Some(format!("Fixed {} div fence(s)", fix_positions.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/grid_tables.rs b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs new file mode 100644 index 
0000000..fc9fc25 --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/grid_tables.rs @@ -0,0 +1,261 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use regex::Regex; +use std::path::Path; +use std::process::{Command, Stdio}; + +use crate::rule::{CheckResult, ConvertResult, Rule}; +use crate::utils::file_io::{read_file, write_file}; +use crate::utils::resources::ResourceManager; +use quarto_markdown_pandoc::readers::json; +use quarto_markdown_pandoc::writers::qmd; + +pub struct GridTableConverter { + grid_start_regex: Regex, + table_line_regex: Regex, + caption_regex: Regex, + resources: ResourceManager, +} + +#[derive(Debug)] +pub struct GridTable { + pub text: String, + pub start_line: usize, + pub end_line: usize, +} + +impl GridTableConverter { + pub fn new() -> Result { + Ok(Self { + // Matches lines that start with + and contain - or = + grid_start_regex: Regex::new(r"^\+[-=+]+\+").unwrap(), + // Matches table content lines (start with + or |) + table_line_regex: Regex::new(r"^[+|]").unwrap(), + // Matches caption lines (start with :) + caption_regex: Regex::new(r"^:").unwrap(), + resources: ResourceManager::new()?, + }) + } + + /// Find all grid tables in the content + pub fn find_grid_tables(&self, content: &str) -> Vec { + let lines: Vec<&str> = content.lines().collect(); + let mut tables = Vec::new(); + let mut i = 0; + + while i < lines.len() { + let line = lines[i]; + + // Check if this line starts a grid table + if self.grid_start_regex.is_match(line) { + let start_idx = i; + let mut table_lines = vec![line]; + i += 1; + + // Collect all lines that are part of the table + while i < lines.len() { + let line = lines[i]; + + // Table content lines start with + or | + if self.table_line_regex.is_match(line) { + table_lines.push(line); + i += 1; + } + // Caption line starts with : and must immediately follow table + else if self.caption_regex.is_match(line) + && i == start_idx + table_lines.len() + { + table_lines.push(line); + i += 1; + 
break; + } else { + break; + } + } + + // Found a complete table + let table_text = table_lines.join("\n"); + tables.push(GridTable { + text: table_text, + start_line: start_idx, + end_line: i - 1, + }); + } else { + i += 1; + } + } + + tables + } + + /// Convert a single grid table by: + /// 1. Running pandoc with the Lua filter to convert to JSON + /// 2. Running quarto-markdown-pandoc to convert JSON to markdown + pub fn convert_table(&self, table_text: &str) -> Result { + use std::io::Write; + + // Get the Lua filter path from resources + let filter_path = self + .resources + .get_resource("filters/grid-table-to-list-table.lua")?; + + // Step 1: pandoc -f markdown -t json -L filter.lua + let mut pandoc = Command::new("pandoc") + .args(&["-f", "markdown", "-t", "json"]) + .arg("-L") + .arg(&filter_path) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn pandoc")?; + + { + let stdin = pandoc + .stdin + .as_mut() + .context("Failed to get pandoc stdin")?; + stdin.write_all(table_text.as_bytes())?; + } + + let pandoc_output = pandoc.wait_with_output()?; + + if !pandoc_output.status.success() { + anyhow::bail!( + "pandoc failed: {}", + String::from_utf8_lossy(&pandoc_output.stderr) + ); + } + + // Step 2: Use library to convert JSON to markdown + let mut json_reader = std::io::Cursor::new(&pandoc_output.stdout); + let (pandoc_ast, _ctx) = + json::read(&mut json_reader).context("Failed to parse JSON output from pandoc")?; + + let mut output = Vec::new(); + qmd::write(&pandoc_ast, &mut output).context("Failed to write markdown output")?; + + let result = String::from_utf8(output) + .context("Failed to parse output as UTF-8")? 
+ .trim_end() + .to_string(); + + Ok(result) + } +} + +impl Rule for GridTableConverter { + fn name(&self) -> &str { + "grid-tables" + } + + fn description(&self) -> &str { + "Convert grid tables to list-table format" + } + + fn check(&self, file_path: &Path, verbose: bool) -> Result> { + let content = read_file(file_path)?; + let tables = self.find_grid_tables(&content); + + if verbose { + if tables.is_empty() { + println!(" No grid tables found"); + } else { + println!(" Found {} grid table(s)", tables.len()); + } + } + + let mut results = Vec::new(); + for table in tables { + results.push(CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("Grid table found".to_string()), + location: Some(crate::rule::SourceLocation { + row: table.start_line + 1, // Convert 0-indexed to 1-indexed + column: 1, + }), + }); + } + + Ok(results) + } + + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result { + let content = read_file(file_path)?; + let tables = self.find_grid_tables(&content); + + if tables.is_empty() { + return Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: None, + }); + } + + // Convert each table and build new content + let mut lines: Vec = content.lines().map(|s| s.to_string()).collect(); + let mut offset: isize = 0; + + for (idx, table) in tables.iter().enumerate() { + if verbose { + println!(" Converting table {}...", idx + 1); + } + + let converted = self.convert_table(&table.text)?; + let start = (table.start_line as isize + offset) as usize; + let end = (table.end_line as isize + offset) as usize; + + if check_mode && verbose { + println!( + " Table {} at lines {}-{}:", + idx + 1, + table.start_line, + table.end_line + ); + println!( + " {} {} lines -> {} {} lines", + "Original:".red(), + table.end_line - 
table.start_line + 1, + "Converted:".green(), + converted.lines().count() + ); + } + + let converted_lines: Vec = converted.lines().map(|s| s.to_string()).collect(); + let new_len = converted_lines.len(); + let old_len = end - start + 1; + + lines.splice(start..=end, converted_lines); + offset += new_len as isize - old_len as isize; + } + + let new_content = lines.join("\n") + "\n"; + + if !check_mode { + if in_place { + write_file(file_path, &new_content)?; + } + } + + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: tables.len(), + message: if in_place { + Some(format!("Converted {} table(s)", tables.len())) + } else { + Some(new_content) + }, + }) + } +} diff --git a/crates/qmd-syntax-helper/src/conversions/mod.rs b/crates/qmd-syntax-helper/src/conversions/mod.rs new file mode 100644 index 0000000..9c282df --- /dev/null +++ b/crates/qmd-syntax-helper/src/conversions/mod.rs @@ -0,0 +1,3 @@ +pub mod definition_lists; +pub mod div_whitespace; +pub mod grid_tables; diff --git a/crates/qmd-syntax-helper/src/diagnostics/mod.rs b/crates/qmd-syntax-helper/src/diagnostics/mod.rs new file mode 100644 index 0000000..06d57d6 --- /dev/null +++ b/crates/qmd-syntax-helper/src/diagnostics/mod.rs @@ -0,0 +1,2 @@ +pub mod parse_check; +// pub mod syntax_check; // Unused - kept for reference only diff --git a/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs b/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs new file mode 100644 index 0000000..9dc25a8 --- /dev/null +++ b/crates/qmd-syntax-helper/src/diagnostics/parse_check.rs @@ -0,0 +1,82 @@ +use anyhow::{Context, Result}; +use std::fs; +use std::path::Path; + +use crate::rule::{CheckResult, ConvertResult, Rule}; + +pub struct ParseChecker {} + +impl ParseChecker { + pub fn new() -> Result { + Ok(Self {}) + } + + /// Check if a file parses successfully + fn check_parse(&self, file_path: &Path) -> Result { + let content = 
fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, + &filename, + &mut sink, + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + ), + ); + + Ok(result.is_ok()) + } +} + +impl Rule for ParseChecker { + fn name(&self) -> &str { + "parse" + } + + fn description(&self) -> &str { + "Check if file parses successfully" + } + + fn check(&self, file_path: &Path, _verbose: bool) -> Result> { + let parses = self.check_parse(file_path)?; + + if parses { + Ok(vec![]) + } else { + Ok(vec![CheckResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + has_issue: true, + issue_count: 1, + message: Some("File failed to parse".to_string()), + location: None, // Parse errors don't have specific locations + }]) + } + } + + fn convert( + &self, + file_path: &Path, + _in_place: bool, + _check_mode: bool, + _verbose: bool, + ) -> Result { + // Parse errors can't be auto-fixed + Ok(ConvertResult { + rule_name: self.name().to_string(), + file_path: file_path.to_string_lossy().to_string(), + fixes_applied: 0, + message: Some("Parse errors cannot be automatically fixed".to_string()), + }) + } +} diff --git a/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs new file mode 100644 index 0000000..08b4ffa --- /dev/null +++ b/crates/qmd-syntax-helper/src/diagnostics/syntax_check.rs @@ -0,0 +1,138 @@ +use anyhow::{Context, Result}; +use colored::Colorize; +use serde::{Deserialize, Serialize}; +use std::fs; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct 
CheckResult { + pub file: PathBuf, + pub success: bool, + pub error_message: Option, +} + +pub struct SyntaxChecker { + pub results: Vec, +} + +impl SyntaxChecker { + pub fn new() -> Self { + Self { + results: Vec::new(), + } + } + + /// Check a single file by attempting to parse it + pub fn check_file(&mut self, file_path: &Path, verbose: bool) -> Result<()> { + if verbose { + print!("Checking: {} ... ", file_path.display()); + } + + let result = self.parse_file(file_path); + + match &result { + Ok(_) => { + if verbose { + println!("{}", "✓".green()); + } + self.results.push(CheckResult { + file: file_path.to_path_buf(), + success: true, + error_message: None, + }); + } + Err(e) => { + if verbose { + println!("{}", "✗".red()); + println!(" Error: {}", e); + } + self.results.push(CheckResult { + file: file_path.to_path_buf(), + success: false, + error_message: Some(e.to_string()), + }); + } + } + + Ok(()) + } + + /// Parse a file using quarto-markdown-pandoc + fn parse_file(&self, file_path: &Path) -> Result<()> { + let content = fs::read_to_string(file_path) + .with_context(|| format!("Failed to read file: {}", file_path.display()))?; + + // Use the quarto-markdown-pandoc library to parse + let mut sink = std::io::sink(); + let filename = file_path.to_string_lossy(); + + let result = quarto_markdown_pandoc::readers::qmd::read( + content.as_bytes(), + false, // not loose mode + &filename, + &mut sink, + Some( + quarto_markdown_pandoc::readers::qmd_error_messages::produce_json_error_messages + as fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + ), // Use JSON error formatter for machine-readable errors + ); + + match result { + Ok(_) => Ok(()), + Err(errors) => { + // Join error messages + let error_msg = errors.join("\n"); + Err(anyhow::anyhow!("{}", error_msg)) + } + } + } + + /// Print a summary of the results + pub fn print_summary(&self) { + let total = self.results.len(); + let successes = 
self.results.iter().filter(|r| r.success).count(); + let failures = total - successes; + + println!("\n{}", "=== Summary ===".bold()); + println!("Total files: {}", total); + println!("Successful: {} {}", successes, "✓".green()); + println!( + "Failed: {} {}", + failures, + if failures > 0 { + "✗".red() + } else { + "✓".green() + } + ); + + if failures > 0 { + let success_rate = (successes as f64 / total as f64) * 100.0; + println!("Success rate: {:.1}%", success_rate); + } + } + + /// Get a list of failed files + pub fn failed_files(&self) -> Vec<&CheckResult> { + self.results.iter().filter(|r| !r.success).collect() + } + + /// Export results as JSONL + pub fn export_jsonl(&self, output_path: &Path) -> Result<()> { + let mut output = String::new(); + for result in &self.results { + let json = serde_json::to_string(result)?; + output.push_str(&json); + output.push('\n'); + } + + fs::write(output_path, output) + .with_context(|| format!("Failed to write to: {}", output_path.display()))?; + + Ok(()) + } +} diff --git a/crates/qmd-syntax-helper/src/lib.rs b/crates/qmd-syntax-helper/src/lib.rs new file mode 100644 index 0000000..2a009f2 --- /dev/null +++ b/crates/qmd-syntax-helper/src/lib.rs @@ -0,0 +1,4 @@ +pub mod conversions; +pub mod diagnostics; +pub mod rule; +pub mod utils; diff --git a/crates/qmd-syntax-helper/src/main.rs b/crates/qmd-syntax-helper/src/main.rs new file mode 100644 index 0000000..21d6d24 --- /dev/null +++ b/crates/qmd-syntax-helper/src/main.rs @@ -0,0 +1,281 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use colored::Colorize; +use std::path::PathBuf; + +mod conversions; +mod diagnostics; +mod rule; +mod utils; + +use rule::{Rule, RuleRegistry}; +use utils::glob_expand::expand_globs; + +#[derive(Parser)] +#[command(name = "qmd-syntax-helper")] +#[command(about = "Helper tool for converting and fixing Quarto Markdown syntax")] +#[command(version)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] 
+enum Commands { + /// Check files for known problems + Check { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Rules to check (defaults to "all") + #[arg(short = 'r', long = "rule", default_values_t = vec!["all".to_string()])] + rule: Vec, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + + /// Output results as JSONL + #[arg(long)] + json: bool, + + /// Save detailed results to file + #[arg(short, long)] + output: Option, + }, + + /// Convert/fix problems in files + Convert { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Rules to apply (defaults to "all") + #[arg(short = 'r', long = "rule", default_values_t = vec!["all".to_string()])] + rule: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// List all available rules + ListRules, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + let registry = RuleRegistry::new()?; + + match cli.command { + Commands::Check { + files, + rule: rule_names, + verbose, + json, + output, + } => { + let file_paths = expand_globs(&files)?; + let rules = resolve_rules(®istry, &rule_names)?; + + let mut all_results = Vec::new(); + + for file_path in file_paths { + if verbose && !json { + println!("Checking: {}", file_path.display()); + } + + for rule in &rules { + match rule.check(&file_path, verbose && !json) { + Ok(results) => { + for result in results { + all_results.push(result.clone()); + if !json && result.has_issue { + println!(" {} {}", "✗".red(), result.message.unwrap_or_default()); + } + } + } + Err(e) => { + if !json { + eprintln!(" {} Error checking {}: {}", "✗".red(), rule.name(), e); + } + } + } + } + } + + // Print summary if not 
in JSON mode + if !json && !all_results.is_empty() { + print_check_summary(&all_results); + } + + // Output handling + if json { + for result in &all_results { + println!("{}", serde_json::to_string(result)?); + } + } + + if let Some(output_path) = output { + let mut output_str = String::new(); + for result in &all_results { + output_str.push_str(&serde_json::to_string(result)?); + output_str.push('\n'); + } + std::fs::write(output_path, output_str)?; + } + + Ok(()) + } + + Commands::Convert { + files, + rule: rule_names, + in_place, + check: check_mode, + verbose, + } => { + let file_paths = expand_globs(&files)?; + let rules = resolve_rules(®istry, &rule_names)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + // Apply fixes sequentially, reparsing between each rule + for rule in &rules { + match rule.convert(&file_path, in_place, check_mode, verbose) { + Ok(result) => { + if result.fixes_applied > 0 { + if verbose || check_mode { + println!( + " {} {} - {}", + if check_mode { "Would fix" } else { "Fixed" }, + rule.name(), + result.message.clone().unwrap_or_default() + ); + } + + if !in_place && !check_mode && result.message.is_some() { + // Output to stdout if not in-place + print!("{}", result.message.unwrap()); + } + } + } + Err(e) => { + eprintln!(" {} Error converting {}: {}", "✗".red(), rule.name(), e); + // Stop on first error (transactional) + return Err(e); + } + } + } + } + + Ok(()) + } + + Commands::ListRules => { + println!("{}", "Available rules:".bold()); + for name in registry.list_names() { + let rule = registry.get(&name)?; + println!(" {} - {}", name.cyan(), rule.description()); + } + Ok(()) + } + } +} + +fn resolve_rules( + registry: &RuleRegistry, + names: &[String], +) -> Result>> { + if names.len() == 1 && names[0] == "all" { + Ok(registry.all()) + } else { + let mut rules = Vec::new(); + for name in names { + rules.push(registry.get(name)?); + } + Ok(rules) + } +} + +fn 
print_check_summary(results: &[rule::CheckResult]) { + use std::collections::{HashMap, HashSet}; + + // Get unique files checked + let unique_files: HashSet<&str> = results.iter().map(|r| r.file_path.as_str()).collect(); + let total_files = unique_files.len(); + + // Count files with issues (at least one result with has_issue=true) + let mut files_with_issues = HashSet::new(); + let mut total_issues = 0; + + // Track issues by rule type + let mut issues_by_rule: HashMap = HashMap::new(); + let mut files_by_rule: HashMap> = HashMap::new(); + + for result in results { + if result.has_issue { + files_with_issues.insert(&result.file_path); + total_issues += result.issue_count; + + // Track by rule + *issues_by_rule.entry(result.rule_name.clone()).or_insert(0) += result.issue_count; + files_by_rule + .entry(result.rule_name.clone()) + .or_insert_with(HashSet::new) + .insert(result.file_path.clone()); + } + } + + let files_with_issues_count = files_with_issues.len(); + let files_clean = total_files - files_with_issues_count; + + println!("\n{}", "=== Summary ===".bold()); + println!("Total files: {}", total_files); + println!( + "Files with issues: {} {}", + files_with_issues_count, + if files_with_issues_count > 0 { + "✗".red() + } else { + "✓".green() + } + ); + println!("Clean files: {} {}", files_clean, "✓".green()); + + if !issues_by_rule.is_empty() { + println!("\n{}", "Issues by rule:".bold()); + let mut rule_names: Vec<_> = issues_by_rule.keys().collect(); + rule_names.sort(); + + for rule_name in rule_names { + let count = issues_by_rule[rule_name]; + let file_count = files_by_rule[rule_name].len(); + println!( + " {}: {} issue(s) in {} file(s)", + rule_name.cyan(), + count, + file_count + ); + } + } + + println!("\nTotal issues found: {}", total_issues); + + if total_files > 0 { + let success_rate = (files_clean as f64 / total_files as f64) * 100.0; + println!("Success rate: {:.1}%", success_rate); + } +} diff --git a/crates/qmd-syntax-helper/src/main_old.rs 
b/crates/qmd-syntax-helper/src/main_old.rs new file mode 100644 index 0000000..b0b59f2 --- /dev/null +++ b/crates/qmd-syntax-helper/src/main_old.rs @@ -0,0 +1,201 @@ +use anyhow::Result; +use clap::{Parser, Subcommand}; +use std::path::PathBuf; + +mod conversions; +mod diagnostics; +mod problem; +mod utils; + +use conversions::definition_lists::DefinitionListConverter; +use conversions::div_whitespace::DivWhitespaceConverter; +use conversions::grid_tables::GridTableConverter; +use diagnostics::syntax_check::SyntaxChecker; +use utils::glob_expand::expand_globs; + +#[derive(Parser)] +#[command(name = "qmd-syntax-helper")] +#[command(about = "Helper tool for converting and fixing Quarto Markdown syntax")] +#[command(version)] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Subcommand)] +enum Commands { + /// Convert grid tables to list-table format + UngridTables { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Convert definition lists to div-based format + UndefLists { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Fix div fences missing whitespace (:::{ -> ::: {) + FixDivWhitespace { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Edit files in place + #[arg(short, long)] + in_place: bool, + + /// Check mode: show 
what would be changed without modifying files + #[arg(short, long)] + check: bool, + + /// Show verbose output + #[arg(short, long)] + verbose: bool, + }, + + /// Check syntax of files and report errors + Check { + /// Input files (can be multiple files or glob patterns like "docs/**/*.qmd") + #[arg(required = true)] + files: Vec, + + /// Show verbose output (each file as processed) + #[arg(short, long)] + verbose: bool, + + /// Output results as JSONL + #[arg(long)] + json: bool, + + /// Save detailed results to file + #[arg(short, long)] + output: Option, + }, +} + +fn main() -> Result<()> { + let cli = Cli::parse(); + + match cli.command { + Commands::UngridTables { + files, + in_place, + check, + verbose, + } => { + let converter = GridTableConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::UndefLists { + files, + in_place, + check, + verbose, + } => { + let converter = DefinitionListConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::FixDivWhitespace { + files, + in_place, + check, + verbose, + } => { + let converter = DivWhitespaceConverter::new()?; + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + if verbose { + println!("Processing: {}", file_path.display()); + } + + converter.process_file(&file_path, in_place, check, verbose)?; + } + + Ok(()) + } + Commands::Check { + files, + verbose, + json, + output, + } => { + let mut checker = SyntaxChecker::new(); + let file_paths = expand_globs(&files)?; + + for file_path in file_paths { + checker.check_file(&file_path, verbose)?; + } + + // Print summary if not JSON mode + 
if !json { + checker.print_summary(); + } + + // Save to output file if specified + if let Some(output_path) = output { + checker.export_jsonl(&output_path)?; + if !json { + println!("\nDetailed results written to: {}", output_path.display()); + } + } + + // Print JSON to stdout if requested + if json { + for result in &checker.results { + println!("{}", serde_json::to_string(result)?); + } + } + + Ok(()) + } + } +} diff --git a/crates/qmd-syntax-helper/src/rule.rs b/crates/qmd-syntax-helper/src/rule.rs new file mode 100644 index 0000000..e7e46d4 --- /dev/null +++ b/crates/qmd-syntax-helper/src/rule.rs @@ -0,0 +1,115 @@ +use anyhow::{Result, anyhow}; +use std::collections::HashMap; +use std::path::Path; +use std::sync::Arc; + +use serde::{Deserialize, Serialize}; + +/// Location information for a violation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SourceLocation { + pub row: usize, + pub column: usize, +} + +/// Result of checking a file for a specific rule +/// Each CheckResult represents a single violation +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CheckResult { + pub rule_name: String, + pub file_path: String, + pub has_issue: bool, + pub issue_count: usize, // Kept for backwards compatibility, always 1 when has_issue=true + pub message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub location: Option, +} + +/// Result of converting/fixing a file +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ConvertResult { + pub rule_name: String, + pub file_path: String, + pub fixes_applied: usize, + pub message: Option, +} + +/// A rule that can check for and fix issues in Quarto Markdown files +pub trait Rule { + /// The name of this rule (e.g., "grid-tables", "div-whitespace") + fn name(&self) -> &str; + + /// A short description of what this rule checks/fixes + fn description(&self) -> &str; + + /// Check if a file violates this rule + /// Returns a vector of CheckResults, one per violation found + fn 
check(&self, file_path: &Path, verbose: bool) -> Result>; + + /// Convert/fix rule violations in a file + /// If in_place is false, returns the converted content as a string in the message field + fn convert( + &self, + file_path: &Path, + in_place: bool, + check_mode: bool, + verbose: bool, + ) -> Result; +} + +/// Registry of all available rules +pub struct RuleRegistry { + rules: HashMap>, +} + +impl RuleRegistry { + /// Create a new registry and register all known rules + pub fn new() -> Result { + let mut registry = Self { + rules: HashMap::new(), + }; + + // Register diagnostic rules first (parse check should run before conversion rules) + registry.register(Arc::new( + crate::diagnostics::parse_check::ParseChecker::new()?, + )); + + // Register conversion rules + registry.register(Arc::new( + crate::conversions::grid_tables::GridTableConverter::new()?, + )); + registry.register(Arc::new( + crate::conversions::div_whitespace::DivWhitespaceConverter::new()?, + )); + registry.register(Arc::new( + crate::conversions::definition_lists::DefinitionListConverter::new()?, + )); + + Ok(registry) + } + + /// Register a rule + fn register(&mut self, rule: Arc) { + self.rules.insert(rule.name().to_string(), rule); + } + + /// Get a rule by name, or return an error if not found + pub fn get(&self, name: &str) -> Result> { + self.rules + .get(name) + .cloned() + .ok_or_else(|| anyhow!("Unknown rule: {}", name)) + } + + /// Get all registered rules + pub fn all(&self) -> Vec> { + self.rules.values().cloned().collect() + } + + /// List all rule names + pub fn list_names(&self) -> Vec { + let mut names: Vec = self.rules.keys().cloned().collect(); + names.sort(); + names + } +} diff --git a/crates/qmd-syntax-helper/src/utils/file_io.rs b/crates/qmd-syntax-helper/src/utils/file_io.rs new file mode 100644 index 0000000..baa7fc9 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/file_io.rs @@ -0,0 +1,13 @@ +use anyhow::{Context, Result}; +use std::fs; +use std::path::Path; + 
+/// Read a file to a string +pub fn read_file(path: &Path) -> Result { + fs::read_to_string(path).with_context(|| format!("Failed to read file: {}", path.display())) +} + +/// Write content to a file +pub fn write_file(path: &Path, content: &str) -> Result<()> { + fs::write(path, content).with_context(|| format!("Failed to write file: {}", path.display())) +} diff --git a/crates/qmd-syntax-helper/src/utils/glob_expand.rs b/crates/qmd-syntax-helper/src/utils/glob_expand.rs new file mode 100644 index 0000000..09762f4 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/glob_expand.rs @@ -0,0 +1,50 @@ +use anyhow::{Context, Result}; +use std::path::PathBuf; + +/// Expand glob patterns into a list of file paths +/// +/// If a pattern doesn't contain glob characters (*, ?, [, ]), +/// treat it as a literal path. +pub fn expand_globs(patterns: &[String]) -> Result> { + let mut files = Vec::new(); + + for pattern in patterns { + // Check if pattern contains glob characters + if pattern.contains('*') || pattern.contains('?') || pattern.contains('[') { + // It's a glob pattern - expand it + let paths = glob::glob(pattern) + .with_context(|| format!("Invalid glob pattern: {}", pattern))?; + + for path in paths { + let path = + path.with_context(|| format!("Failed to read glob match for: {}", pattern))?; + files.push(path); + } + } else { + // It's a literal path - use as-is + files.push(PathBuf::from(pattern)); + } + } + + Ok(files) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_literal_path() { + let patterns = vec!["test.qmd".to_string()]; + let result = expand_globs(&patterns).unwrap(); + assert_eq!(result.len(), 1); + assert_eq!(result[0], PathBuf::from("test.qmd")); + } + + #[test] + fn test_multiple_literals() { + let patterns = vec!["a.qmd".to_string(), "b.qmd".to_string()]; + let result = expand_globs(&patterns).unwrap(); + assert_eq!(result.len(), 2); + } +} diff --git a/crates/qmd-syntax-helper/src/utils/mod.rs 
b/crates/qmd-syntax-helper/src/utils/mod.rs new file mode 100644 index 0000000..2054ac3 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/mod.rs @@ -0,0 +1,3 @@ +pub mod file_io; +pub mod glob_expand; +pub mod resources; diff --git a/crates/qmd-syntax-helper/src/utils/resources.rs b/crates/qmd-syntax-helper/src/utils/resources.rs new file mode 100644 index 0000000..81965e1 --- /dev/null +++ b/crates/qmd-syntax-helper/src/utils/resources.rs @@ -0,0 +1,141 @@ +use anyhow::{Context, Result}; +use include_dir::{Dir, include_dir}; +use std::fs; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; + +static RESOURCE_MANAGER_COUNTER: AtomicU64 = AtomicU64::new(0); +static RESOURCES_DIR: Dir = include_dir!("$CARGO_MANIFEST_DIR/resources"); + +/// A resource manager that embeds files at compile time and extracts them +/// to a temporary directory at runtime. Automatically cleans up on drop. +pub struct ResourceManager { + temp_dir: PathBuf, +} + +impl ResourceManager { + /// Create a new resource manager with embedded resources + pub fn new() -> Result { + // Use both process ID and a unique counter to avoid conflicts between + // multiple ResourceManager instances in the same process (e.g., parallel tests) + let instance_id = RESOURCE_MANAGER_COUNTER.fetch_add(1, Ordering::SeqCst); + let temp_dir = std::env::temp_dir().join(format!( + "qmd-syntax-helper-{}-{}", + std::process::id(), + instance_id + )); + + fs::create_dir_all(&temp_dir) + .with_context(|| format!("Failed to create temp directory: {}", temp_dir.display()))?; + + Ok(Self { temp_dir }) + } + + /// Get a path to a resource, extracting it to temp dir if needed + pub fn get_resource(&self, path: &str) -> Result { + // Find the file in the embedded directory + let file = RESOURCES_DIR + .get_file(path) + .ok_or_else(|| anyhow::anyhow!("Resource not found: {}", path))?; + + // Determine output path in temp directory + let output_path = self.temp_dir.join(path); + + // Create parent 
directories if needed + if let Some(parent) = output_path.parent() { + fs::create_dir_all(parent)?; + } + + // Write the resource to the temp directory + fs::write(&output_path, file.contents()) + .with_context(|| format!("Failed to write resource to: {}", output_path.display()))?; + + Ok(output_path) + } + + /// Get the temp directory path + #[allow(dead_code)] + pub fn temp_dir(&self) -> &Path { + &self.temp_dir + } + + /// List all available resources + #[allow(dead_code)] + pub fn list_resources(&self) -> Vec { + let mut resources = Vec::new(); + Self::collect_files(&RESOURCES_DIR, "", &mut resources); + resources + } + + /// Recursively collect all file paths from a directory + #[allow(dead_code)] + fn collect_files(dir: &Dir, prefix: &str, resources: &mut Vec) { + for file in dir.files() { + let name = file.path().file_name().unwrap().to_string_lossy(); + let full_path = if prefix.is_empty() { + name.to_string() + } else { + format!("{}/{}", prefix, name) + }; + resources.push(full_path); + } + + for subdir in dir.dirs() { + let name = subdir.path().file_name().unwrap().to_string_lossy(); + let new_prefix = if prefix.is_empty() { + name.to_string() + } else { + format!("{}/{}", prefix, name) + }; + Self::collect_files(subdir, &new_prefix, resources); + } + } +} + +impl Drop for ResourceManager { + fn drop(&mut self) { + // Clean up temp directory + if self.temp_dir.exists() { + // Ignore errors here so it works well under stack unwinding + let _ = fs::remove_dir_all(&self.temp_dir); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_resource_manager_creates_temp_dir() { + let rm = ResourceManager::new().unwrap(); + assert!(rm.temp_dir().exists()); + } + + #[test] + fn test_resource_manager_lists_resources() { + let rm = ResourceManager::new().unwrap(); + let resources = rm.list_resources(); + assert!(resources.contains(&"filters/grid-table-to-list-table.lua".to_string())); + } + + #[test] + fn 
test_resource_manager_extracts_resource() { + let rm = ResourceManager::new().unwrap(); + let path = rm + .get_resource("filters/grid-table-to-list-table.lua") + .unwrap(); + assert!(path.exists()); + assert!(fs::read_to_string(&path).unwrap().contains("Lua filter")); + } + + #[test] + fn test_resource_manager_cleans_up() { + let temp_dir = { + let rm = ResourceManager::new().unwrap(); + rm.temp_dir().to_path_buf() + }; + // After rm is dropped, temp dir should be cleaned up + assert!(!temp_dir.exists()); + } +} diff --git a/crates/qmd-syntax-helper/tests/div_whitespace_test.rs b/crates/qmd-syntax-helper/tests/div_whitespace_test.rs new file mode 100644 index 0000000..215300d --- /dev/null +++ b/crates/qmd-syntax-helper/tests/div_whitespace_test.rs @@ -0,0 +1,145 @@ +use qmd_syntax_helper::conversions::div_whitespace::DivWhitespaceConverter; +use std::fs; + +#[test] +fn test_div_whitespace_conversion() { + let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id())); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + // Create test content with div fences missing whitespace + let input_content = r#"# Test file + +:::{.class} +Content with class +::: + +:::{#id} +Content with id +::: + +:::{} +Content with empty attrs +::: + +::: {.already-good} +Already has space +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + + let converter = DivWhitespaceConverter::new().unwrap(); + + // Process the file in-place + converter + .process_file(&test_file, true, false, false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // Verify all div fences now have spaces + assert!(result.contains("::: {.class}"), "Should fix :::{{.class}}"); + assert!(result.contains("::: {#id}"), "Should fix :::{{#id}}"); + assert!(result.contains("::: {}"), "Should fix :::{{}}"); + assert!( + result.contains("::: {.already-good}"), + "Should preserve already-good format" + ); + + // Clean up + 
std::fs::remove_dir_all(&temp_dir).ok(); + + // Verify content is preserved + assert!(result.contains("Content with class")); + assert!(result.contains("Content with id")); + assert!(result.contains("Content with empty attrs")); + assert!(result.contains("Already has space")); +} + +#[test] +fn test_div_whitespace_in_code_blocks_untouched() { + let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id() + 1)); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + // Content with div fence patterns in code blocks should not be modified + let input_content = r#"# Test file + +Here's an example in a code block: + +``` +:::{.class} +This is in a code block +::: +``` + +This one should be fixed: + +:::{.real-div} +Real div content +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + + let converter = DivWhitespaceConverter::new().unwrap(); + converter + .process_file(&test_file, true, false, false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // The one in the code block should remain unchanged (parser won't report it as an error) + // The real div should be fixed + assert!( + result.contains("::: {.real-div}"), + "Should fix real div fence" + ); + + // Code block content should be preserved exactly + assert!( + result.contains("```\n:::{.class}\nThis is in a code block\n:::\n```"), + "Code block should be unchanged" + ); + + // Clean up + std::fs::remove_dir_all(&temp_dir).ok(); +} + +#[test] +fn test_no_changes_when_all_correct() { + let temp_dir = std::env::temp_dir().join(format!("qmd-test-{}", std::process::id() + 2)); + std::fs::create_dir_all(&temp_dir).unwrap(); + let test_file = temp_dir.join("test.qmd"); + + let input_content = r#"# Test file + +::: {.class} +Content +::: + +::: {} +Content +::: +"#; + + fs::write(&test_file, input_content).unwrap(); + let original = fs::read_to_string(&test_file).unwrap(); + + let converter = 
DivWhitespaceConverter::new().unwrap(); + converter + .process_file(&test_file, true, false, false) + .unwrap(); + + let result = fs::read_to_string(&test_file).unwrap(); + + // Content should be identical + assert_eq!( + original, result, + "Should not modify already-correct content" + ); + + // Clean up + std::fs::remove_dir_all(&temp_dir).ok(); +} diff --git a/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md b/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md new file mode 100644 index 0000000..a68d0b0 --- /dev/null +++ b/crates/qmd-syntax-helper/tests/fixtures/simple-grid-table.md @@ -0,0 +1,7 @@ ++-----------+-----------+ +| Header 1 | Header 2 | ++===========+===========+ +| Cell 1 | Cell 2 | ++-----------+-----------+ +| Cell 3 | Cell 4 | ++-----------+-----------+ diff --git a/crates/qmd-syntax-helper/tests/grid_tables_test.rs b/crates/qmd-syntax-helper/tests/grid_tables_test.rs new file mode 100644 index 0000000..e660c32 --- /dev/null +++ b/crates/qmd-syntax-helper/tests/grid_tables_test.rs @@ -0,0 +1,39 @@ +use qmd_syntax_helper::conversions::grid_tables::GridTableConverter; +use std::fs; +use std::path::Path; + +#[test] +fn test_finds_simple_grid_table() { + let converter = GridTableConverter::new().expect("Failed to create converter"); + let fixture_path = Path::new("tests/fixtures/simple-grid-table.md"); + let content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); + + // The converter should find one grid table + let tables = converter.find_grid_tables(&content); + assert_eq!(tables.len(), 1); + + // The table should span lines 0-5 (6 lines total) + assert_eq!(tables[0].start_line, 0); + assert_eq!(tables[0].end_line, 6); +} + +#[test] +fn test_converts_grid_table() { + let converter = GridTableConverter::new().expect("Failed to create converter"); + let fixture_path = Path::new("tests/fixtures/simple-grid-table.md"); + let content = fs::read_to_string(fixture_path).expect("Failed to read fixture"); + + let 
tables = converter.find_grid_tables(&content); + assert_eq!(tables.len(), 1); + + // Convert the table + let converted = converter + .convert_table(&tables[0].text) + .expect("Failed to convert table"); + + // The converted output should contain list-table syntax + assert!(converted.contains("::: {.list-table")); + assert!(converted.contains("header-rows=")); + assert!(converted.contains("* * Header 1")); + assert!(converted.contains("* * Cell 1")); +} diff --git a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs index fcdbe3a..5052aa8 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs @@ -434,7 +434,7 @@ pub fn make_cite_inline( // first we split the content along semicolons let citations: Vec = content .split(is_semicolon) - .map(|slice| { + .flat_map(|slice| { let inlines = slice.to_vec(); let mut cite: Option = None; let mut prefix: Inlines = vec![]; @@ -455,21 +455,43 @@ pub fn make_cite_inline( let Some(mut c) = cite else { panic!("Cite inline should have at least one citation, found none") }; - if c.citations.len() != 1 { - panic!( - "Cite inline should have exactly one citation, found: {:?}", - c.citations - ); - } - let mut citation = c.citations.pop().unwrap(); - if citation.mode == CitationMode::AuthorInText { - // if the mode is AuthorInText, it becomes NormalCitation inside - // a compound cite - citation.mode = CitationMode::NormalCitation; + + // Handle the case where a Cite already has multiple citations + // This can happen when citation syntax appears in contexts like tables + // where the parser creates a Cite with multiple citations + if c.citations.len() == 1 { + // Simple case: one citation, apply prefix and suffix directly + let mut citation = c.citations.pop().unwrap(); + if citation.mode == CitationMode::AuthorInText { + // if the mode is AuthorInText, it becomes NormalCitation inside + // a compound cite + 
citation.mode = CitationMode::NormalCitation; + } + citation.prefix = prefix; + citation.suffix = suffix; + vec![citation] + } else { + // Complex case: multiple citations already present + // Apply prefix to the first citation and suffix to the last + let num_citations = c.citations.len(); + for (i, citation) in c.citations.iter_mut().enumerate() { + if citation.mode == CitationMode::AuthorInText { + citation.mode = CitationMode::NormalCitation; + } + if i == 0 { + // Prepend prefix to the first citation's prefix + let mut new_prefix = prefix.clone(); + new_prefix.extend(citation.prefix.clone()); + citation.prefix = new_prefix; + } + if i == num_citations - 1 { + // Append suffix to the last citation's suffix + citation.suffix.extend(suffix.clone()); + } + } + // Return all citations from this slice + c.citations } - citation.prefix = prefix; - citation.suffix = suffix; - citation }) .collect(); return Inline::Cite(Cite { @@ -487,3 +509,146 @@ fn make_inline_leftover(node: &tree_sitter::Node, input_bytes: &[u8]) -> Inline source_info: node_source_info(node), }) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::pandoc::location::Location; + + fn dummy_source_info() -> SourceInfo { + SourceInfo { + filename_index: None, + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }, + } + } + + fn make_str(text: &str) -> Inline { + Inline::Str(Str { + text: text.to_string(), + source_info: dummy_source_info(), + }) + } + + fn make_space() -> Inline { + Inline::Space(Space { + source_info: dummy_source_info(), + }) + } + + fn make_citation(id: &str, prefix: Inlines, suffix: Inlines) -> Citation { + Citation { + id: id.to_string(), + prefix, + suffix, + mode: CitationMode::NormalCitation, + note_num: 0, + hash: 0, + } + } + + #[test] + fn test_make_cite_inline_with_multiple_citations() { + // Test case: a Cite inline that already contains multiple citations + // This simulates what 
happens when the parser encounters citation syntax + // in unsupported contexts (e.g., grid tables) + + // Create a Cite with two citations already in it + let multi_cite = Inline::Cite(Cite { + citations: vec![ + make_citation( + "knuth1984", + vec![], + vec![make_str(","), make_space(), make_str("pp. 33-35")], + ), + make_citation( + "wickham2015", + vec![make_space(), make_str("also"), make_space()], + vec![make_str(","), make_space(), make_str("chap. 1")], + ), + ], + content: vec![], + source_info: dummy_source_info(), + }); + + // Now call make_cite_inline with content that includes this multi-citation Cite + // along with a prefix "see" + let content = vec![make_str("see"), make_space(), multi_cite]; + + let result = make_cite_inline( + ("".to_string(), vec![], std::collections::HashMap::new()), + ("".to_string(), "".to_string()), + content, + dummy_source_info(), + ); + + // Verify the result is a Cite + match result { + Inline::Cite(cite) => { + // Should have 2 citations + assert_eq!(cite.citations.len(), 2); + + // First citation should have the prefix "see " prepended + assert_eq!(cite.citations[0].id, "knuth1984"); + assert_eq!(cite.citations[0].prefix.len(), 2); + match &cite.citations[0].prefix[0] { + Inline::Str(s) => assert_eq!(s.text, "see"), + _ => panic!("Expected Str"), + } + + // Second citation should have its original prefix intact + assert_eq!(cite.citations[1].id, "wickham2015"); + assert_eq!(cite.citations[1].prefix.len(), 3); + } + _ => panic!("Expected Cite inline, got: {:?}", result), + } + } + + #[test] + fn test_make_cite_inline_with_single_citation_still_works() { + // Test that the normal case (single citation) still works + let single_cite = Inline::Cite(Cite { + citations: vec![make_citation("knuth1984", vec![], vec![])], + content: vec![], + source_info: dummy_source_info(), + }); + + let content = vec![ + make_str("see"), + make_space(), + single_cite, + make_str(","), + make_space(), + make_str("pp. 
33"), + ]; + + let result = make_cite_inline( + ("".to_string(), vec![], std::collections::HashMap::new()), + ("".to_string(), "".to_string()), + content, + dummy_source_info(), + ); + + match result { + Inline::Cite(cite) => { + assert_eq!(cite.citations.len(), 1); + assert_eq!(cite.citations[0].id, "knuth1984"); + // Prefix should be "see " + assert_eq!(cite.citations[0].prefix.len(), 2); + // Suffix should be ", pp. 33" + assert_eq!(cite.citations[0].suffix.len(), 3); + } + _ => panic!("Expected Cite inline"), + } + } +} diff --git a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs index 4058dfb..ec3f6bf 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs @@ -4,10 +4,12 @@ */ use crate::pandoc::block::Blocks; -use crate::pandoc::inline::Inlines; +use crate::pandoc::inline::{Inline, Inlines, Span, Str}; +use crate::pandoc::location::empty_source_info; use crate::readers; use crate::{pandoc::RawBlock, utils::output::VerboseOutput}; use hashlink::LinkedHashMap; +use std::collections::HashMap; use std::{io, mem}; use yaml_rust2::parser::{Event, MarkedEventReceiver, Parser}; @@ -82,7 +84,29 @@ impl YamlEventHandler { } } - fn parse_scalar(&self, s: &str) -> MetaValue { + fn parse_scalar(&self, s: &str, tag: Option) -> MetaValue { + // Check if this scalar has a YAML tag (like !path, !glob, !str) + if let Some(t) = tag { + // Tagged strings bypass markdown parsing - wrap in Span immediately + let mut attributes = HashMap::new(); + attributes.insert("tag".to_string(), t.suffix.clone()); + + let span = Span { + attr: ( + String::new(), + vec!["yaml-tagged-string".to_string()], + attributes, + ), + content: vec![Inline::Str(Str { + text: s.to_string(), + source_info: empty_source_info(), + })], + source_info: empty_source_info(), + }; + return MetaValue::MetaInlines(vec![Inline::Span(span)]); + } + + // Untagged scalars: parse as booleans or strings (will 
be parsed as markdown later) if s == "true" { MetaValue::MetaBool(true) } else if s == "false" { @@ -116,12 +140,12 @@ impl MarkedEventReceiver for YamlEventHandler { self.push_value(MetaValue::MetaList(list)); } } - Event::Scalar(s, ..) => match self.stack.last_mut() { + Event::Scalar(s, _style, _anchor, tag) => match self.stack.last_mut() { Some(ContextFrame::Map(_, key_slot @ None)) => { *key_slot = Some(s.to_string()); } Some(ContextFrame::Map(_, Some(_))) | Some(ContextFrame::List(_)) => { - let value = self.parse_scalar(&s); + let value = self.parse_scalar(&s, tag); self.push_value(value); } _ => {} @@ -187,10 +211,22 @@ pub fn parse_metadata_strings(meta: MetaValue, outer_metadata: &mut Meta) -> Met } MetaValue::MetaBlocks(pandoc.blocks) } - _ => panic!( - "(unimplemented syntax error, this is a bug!) Failed to parse metadata string as markdown: {}", - s - ), + Err(_) => { + // Markdown parse failed - wrap in Span with class "yaml-markdown-syntax-error" + let span = Span { + attr: ( + String::new(), + vec!["yaml-markdown-syntax-error".to_string()], + HashMap::new(), + ), + content: vec![Inline::Str(Str { + text: s.clone(), + source_info: empty_source_info(), + })], + source_info: empty_source_info(), + }; + MetaValue::MetaInlines(vec![Inline::Span(span)]) + } } } MetaValue::MetaList(list) => { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs index 4117cbd..9d7db3c 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs @@ -37,5 +37,5 @@ pub use crate::pandoc::table::{ }; pub use crate::pandoc::ast_context::ASTContext; -pub use crate::pandoc::meta::{Meta, MetaValue, rawblock_to_meta}; +pub use crate::pandoc::meta::{Meta, MetaValue, parse_metadata_strings, rawblock_to_meta}; pub use crate::pandoc::treesitter::treesitter_to_pandoc; diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs 
b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs index 12e4f94..020df52 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs @@ -732,11 +732,12 @@ fn native_visitor( result } -pub fn treesitter_to_pandoc( +pub fn treesitter_to_pandoc( buf: &mut T, tree: &tree_sitter_qmd::MarkdownTree, input_bytes: &[u8], context: &ASTContext, + error_collector: &mut E, ) -> Result> { let result = bottomup_traverse_concrete_tree( &mut tree.walk(), @@ -749,7 +750,13 @@ pub fn treesitter_to_pandoc( let (_, PandocNativeIntermediate::IntermediatePandoc(pandoc)) = result else { panic!("Expected Pandoc, got {:?}", result) }; - let result = postprocess(pandoc)?; + let result = match postprocess(pandoc, error_collector) { + Ok(doc) => doc, + Err(()) => { + // Postprocess found errors, return the error messages from the collector + return Err(error_collector.messages()); + } + }; let result = merge_strs(result); Ok(result) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs index 224f1e0..ce15fc6 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs @@ -14,6 +14,8 @@ use crate::pandoc::location::{Range, SourceInfo, empty_range, empty_source_info} use crate::pandoc::pandoc::Pandoc; use crate::pandoc::shortcode::shortcode_to_span; use crate::utils::autoid; +use crate::utils::error_collector::ErrorCollector; +use std::cell::RefCell; use std::collections::HashMap; /// Trim leading and trailing spaces from inlines @@ -260,9 +262,11 @@ fn transform_definition_list_div(div: Div) -> Block { } /// Apply post-processing transformations to the Pandoc AST -pub fn postprocess(doc: Pandoc) -> Result> { - let mut errors = Vec::new(); +pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> Result { let 
result = { + // Wrap error_collector in RefCell for interior mutability across multiple closures + let error_collector_ref = RefCell::new(error_collector); + // Track seen header IDs to avoid duplicates let mut seen_ids: HashMap = HashMap::new(); // Track citation count for numbering @@ -606,11 +610,14 @@ pub fn postprocess(doc: Pandoc) -> Result> { FilterResult(result, true) }) .with_attr(|attr| { - // TODO in order to do good error messages here, attr will need source mapping - errors.push(format!( - "Found attr in postprocess: {:?} - this should have been removed", - attr - )); + // TODO: Add source location when attr has it + error_collector_ref.borrow_mut().error( + format!( + "Found attr in postprocess: {:?} - this should have been removed", + attr + ), + None, + ); FilterResult(vec![], false) }) .with_blocks(|blocks| { @@ -632,12 +639,13 @@ pub fn postprocess(doc: Pandoc) -> Result> { }; // Don't add the CaptionBlock to the result (it's now attached) } else { - // TODO: Issue a warning/error when proper error infrastructure is ready - // For now, print a warning to stderr - eprintln!( - "Warning: Caption found without a preceding table at {}:{}", - caption_block.source_info.range.start.row + 1, - caption_block.source_info.range.start.column + 1 + // Issue a warning when caption has no preceding table + error_collector_ref.borrow_mut().warn( + "Caption found without a preceding table".to_string(), + Some(&crate::utils::error_collector::SourceInfo::new( + caption_block.source_info.range.start.row + 1, + caption_block.source_info.range.start.column + 1, + )), ); // Remove the caption from the output (don't add to result) } @@ -649,13 +657,16 @@ pub fn postprocess(doc: Pandoc) -> Result> { FilterResult(result, true) }); - topdown_traverse(doc, &mut filter) + let pandoc_result = topdown_traverse(doc, &mut filter); + + // Check if any errors were collected (before moving out of RefCell) + let has_errors = error_collector_ref.borrow().has_errors(); + + 
(pandoc_result, has_errors) }; - if !errors.is_empty() { - Err(errors) - } else { - Ok(result) - } + + // Return based on whether errors were found + if result.1 { Err(()) } else { Ok(result.0) } } /// Convert smart typography strings diff --git a/crates/quarto-markdown-pandoc/src/readers/qmd.rs b/crates/quarto-markdown-pandoc/src/readers/qmd.rs index 72b5b54..c799dad 100644 --- a/crates/quarto-markdown-pandoc/src/readers/qmd.rs +++ b/crates/quarto-markdown-pandoc/src/readers/qmd.rs @@ -16,6 +16,7 @@ use crate::pandoc::{self, Block, Meta}; use crate::pandoc::{MetaValue, rawblock_to_meta}; use crate::readers::qmd_error_messages::{produce_error_message, produce_error_message_json}; use crate::traversals; +use crate::utils::error_collector::{JsonErrorCollector, TextErrorCollector}; use std::io::Write; use tree_sitter::LogType; use tree_sitter_qmd::MarkdownParser; @@ -137,8 +138,29 @@ where } let context = ASTContext::with_filename(filename.to_string()); - let mut result = - pandoc::treesitter_to_pandoc(&mut output_stream, &tree, &input_bytes, &context)?; + + // Create appropriate error collector based on whether JSON errors are requested + let mut result = if error_formatter.is_some() { + // JSON error format requested + let mut error_collector = JsonErrorCollector::new(); + pandoc::treesitter_to_pandoc( + &mut output_stream, + &tree, + &input_bytes, + &context, + &mut error_collector, + )? + } else { + // Text error format (default) + let mut error_collector = TextErrorCollector::new(); + pandoc::treesitter_to_pandoc( + &mut output_stream, + &tree, + &input_bytes, + &context, + &mut error_collector, + )? 
+ }; let mut meta_from_parses = Meta::default(); result = { diff --git a/crates/quarto-markdown-pandoc/src/utils/error_collector.rs b/crates/quarto-markdown-pandoc/src/utils/error_collector.rs new file mode 100644 index 0000000..be7476b --- /dev/null +++ b/crates/quarto-markdown-pandoc/src/utils/error_collector.rs @@ -0,0 +1,341 @@ +/// Source location information for errors +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceInfo { + pub row: usize, + pub column: usize, +} + +impl SourceInfo { + pub fn new(row: usize, column: usize) -> Self { + Self { row, column } + } +} + +/// Trait for collecting errors and warnings during parsing/processing +pub trait ErrorCollector { + /// Add a warning message (non-fatal) + fn warn(&mut self, message: String, location: Option<&SourceInfo>); + + /// Add an error message (fatal) + fn error(&mut self, message: String, location: Option<&SourceInfo>); + + /// Check if any errors were collected + fn has_errors(&self) -> bool; + + /// Get a copy of collected messages (without consuming the collector) + fn messages(&self) -> Vec; + + /// Convert collected errors into final format (consumes the collector) + fn into_messages(self) -> Vec; +} + +/// Text-based error collector that produces human-readable messages +#[derive(Debug, Default)] +pub struct TextErrorCollector { + messages: Vec, + has_errors: bool, +} + +impl TextErrorCollector { + pub fn new() -> Self { + Self { + messages: Vec::new(), + has_errors: false, + } + } +} + +impl ErrorCollector for TextErrorCollector { + fn warn(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = if let Some(loc) = location { + format!("Warning: {} at {}:{}", message, loc.row, loc.column) + } else { + format!("Warning: {}", message) + }; + self.messages.push(formatted); + } + + fn error(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = if let Some(loc) = location { + format!("Error: {} at {}:{}", message, loc.row, loc.column) + } else 
{ + format!("Error: {}", message) + }; + self.messages.push(formatted); + self.has_errors = true; + } + + fn has_errors(&self) -> bool { + self.has_errors + } + + fn messages(&self) -> Vec { + self.messages.clone() + } + + fn into_messages(self) -> Vec { + self.messages + } +} + +/// JSON-based error collector that produces structured JSON messages +#[derive(Debug, Default)] +pub struct JsonErrorCollector { + messages: Vec, + has_errors: bool, +} + +impl JsonErrorCollector { + pub fn new() -> Self { + Self { + messages: Vec::new(), + has_errors: false, + } + } + + fn format_json_message(title: &str, message: String, location: Option<&SourceInfo>) -> String { + use serde_json::json; + + let json_obj = if let Some(loc) = location { + json!({ + "title": title, + "message": message, + "location": { + "row": loc.row, + "column": loc.column + } + }) + } else { + json!({ + "title": title, + "message": message + }) + }; + + json_obj.to_string() + } +} + +impl ErrorCollector for JsonErrorCollector { + fn warn(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = Self::format_json_message("Warning", message, location); + self.messages.push(formatted); + } + + fn error(&mut self, message: String, location: Option<&SourceInfo>) { + let formatted = Self::format_json_message("Error", message, location); + self.messages.push(formatted); + self.has_errors = true; + } + + fn has_errors(&self) -> bool { + self.has_errors + } + + fn messages(&self) -> Vec { + self.messages.clone() + } + + fn into_messages(self) -> Vec { + self.messages + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_text_collector_warning_without_location() { + let mut collector = TextErrorCollector::new(); + collector.warn("This is a warning".to_string(), None); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Warning: This is a warning"); + } + + #[test] + fn 
test_text_collector_warning_with_location() { + let mut collector = TextErrorCollector::new(); + let location = SourceInfo::new(35, 1); + collector.warn( + "Caption found without a preceding table".to_string(), + Some(&location), + ); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!( + messages[0], + "Warning: Caption found without a preceding table at 35:1" + ); + } + + #[test] + fn test_text_collector_error_without_location() { + let mut collector = TextErrorCollector::new(); + collector.error("This is an error".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Error: This is an error"); + } + + #[test] + fn test_text_collector_error_with_location() { + let mut collector = TextErrorCollector::new(); + let location = SourceInfo::new(42, 10); + collector.error("Found attr in postprocess".to_string(), Some(&location)); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + assert_eq!(messages[0], "Error: Found attr in postprocess at 42:10"); + } + + #[test] + fn test_text_collector_multiple_messages() { + let mut collector = TextErrorCollector::new(); + let loc1 = SourceInfo::new(10, 5); + let loc2 = SourceInfo::new(20, 15); + + collector.warn("First warning".to_string(), Some(&loc1)); + collector.error("First error".to_string(), Some(&loc2)); + collector.warn("Second warning".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 3); + assert_eq!(messages[0], "Warning: First warning at 10:5"); + assert_eq!(messages[1], "Error: First error at 20:15"); + assert_eq!(messages[2], "Warning: Second warning"); + } + + #[test] + fn test_json_collector_warning_without_location() { + let mut collector = JsonErrorCollector::new(); + collector.warn("This is 
a warning".to_string(), None); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Warning"); + assert_eq!(parsed["message"], "This is a warning"); + assert!(parsed.get("location").is_none()); + } + + #[test] + fn test_json_collector_warning_with_location() { + let mut collector = JsonErrorCollector::new(); + let location = SourceInfo::new(35, 1); + collector.warn( + "Caption found without a preceding table".to_string(), + Some(&location), + ); + + assert!(!collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Warning"); + assert_eq!(parsed["message"], "Caption found without a preceding table"); + assert_eq!(parsed["location"]["row"], 35); + assert_eq!(parsed["location"]["column"], 1); + } + + #[test] + fn test_json_collector_error_without_location() { + let mut collector = JsonErrorCollector::new(); + collector.error("This is an error".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + // Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Error"); + assert_eq!(parsed["message"], "This is an error"); + assert!(parsed.get("location").is_none()); + } + + #[test] + fn test_json_collector_error_with_location() { + let mut collector = JsonErrorCollector::new(); + let location = SourceInfo::new(42, 10); + collector.error("Found attr in postprocess".to_string(), Some(&location)); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 1); + + 
// Parse and verify JSON structure + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["title"], "Error"); + assert_eq!(parsed["message"], "Found attr in postprocess"); + assert_eq!(parsed["location"]["row"], 42); + assert_eq!(parsed["location"]["column"], 10); + } + + #[test] + fn test_json_collector_multiple_messages() { + let mut collector = JsonErrorCollector::new(); + let loc1 = SourceInfo::new(10, 5); + let loc2 = SourceInfo::new(20, 15); + + collector.warn("First warning".to_string(), Some(&loc1)); + collector.error("First error".to_string(), Some(&loc2)); + collector.warn("Second warning".to_string(), None); + + assert!(collector.has_errors()); + let messages = collector.into_messages(); + assert_eq!(messages.len(), 3); + + // Verify each message is valid JSON + let parsed1: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed1["title"], "Warning"); + assert_eq!(parsed1["message"], "First warning"); + assert_eq!(parsed1["location"]["row"], 10); + + let parsed2: serde_json::Value = serde_json::from_str(&messages[1]).unwrap(); + assert_eq!(parsed2["title"], "Error"); + assert_eq!(parsed2["message"], "First error"); + assert_eq!(parsed2["location"]["row"], 20); + + let parsed3: serde_json::Value = serde_json::from_str(&messages[2]).unwrap(); + assert_eq!(parsed3["title"], "Warning"); + assert_eq!(parsed3["message"], "Second warning"); + assert!(parsed3.get("location").is_none()); + } + + #[test] + fn test_empty_collector_has_no_errors() { + let collector = TextErrorCollector::new(); + assert!(!collector.has_errors()); + + let collector = JsonErrorCollector::new(); + assert!(!collector.has_errors()); + } + + #[test] + fn test_collector_with_only_warnings_has_no_errors() { + let mut collector = TextErrorCollector::new(); + collector.warn("Warning 1".to_string(), None); + collector.warn("Warning 2".to_string(), None); + assert!(!collector.has_errors()); + + let mut collector = 
JsonErrorCollector::new(); + collector.warn("Warning 1".to_string(), None); + collector.warn("Warning 2".to_string(), None); + assert!(!collector.has_errors()); + } +} diff --git a/crates/quarto-markdown-pandoc/src/utils/mod.rs b/crates/quarto-markdown-pandoc/src/utils/mod.rs index fd42ced..9014e30 100644 --- a/crates/quarto-markdown-pandoc/src/utils/mod.rs +++ b/crates/quarto-markdown-pandoc/src/utils/mod.rs @@ -5,6 +5,7 @@ pub mod autoid; pub mod concrete_tree_depth; +pub mod error_collector; pub mod output; pub mod string_write_adapter; pub mod text; diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd new file mode 100644 index 0000000..c041672 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd @@ -0,0 +1,13 @@ +```powershell +$ENV:QUARTO_PRINT_STACK="true" +``` + +```bash +export FOO=$BAR +echo $HOME +``` + +```r +# Dollar signs in comments +x <- "$variable" +``` diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot new file mode 100644 index 0000000..d89bd35 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/snapshots/native/026.qmd.snapshot @@ -0,0 +1 @@ +[ CodeBlock ( "" , ["powershell"] , [] ) "$ENV:QUARTO_PRINT_STACK=\"true\"", CodeBlock ( "" , ["bash"] , [] ) "export FOO=$BAR\necho $HOME", CodeBlock ( "" , ["r"] , [] ) "# Dollar signs in comments\nx <- \"$variable\"" ] \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/test.rs b/crates/quarto-markdown-pandoc/tests/test.rs index cc3f998..2a489d0 100644 --- a/crates/quarto-markdown-pandoc/tests/test.rs +++ b/crates/quarto-markdown-pandoc/tests/test.rs @@ -6,6 +6,7 @@ use glob::glob; use quarto_markdown_pandoc::errors::parse_is_good; use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; +use 
quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; use quarto_markdown_pandoc::utils::output::VerboseOutput; use quarto_markdown_pandoc::{readers, writers}; use std::io::{self, Write}; @@ -22,12 +23,14 @@ fn unit_test_simple_qmd_parses() { .parse(input_bytes, None) .expect("Failed to parse input"); let mut buf = Vec::new(); + let mut error_collector = TextErrorCollector::new(); writers::native::write( &treesitter_to_pandoc( &mut std::io::sink(), &tree, &input_bytes, &ASTContext::anonymous(), + &mut error_collector, ) .unwrap(), &mut buf, @@ -126,6 +129,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { } let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); + let mut error_collector1 = TextErrorCollector::new(); writers::native::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -134,6 +138,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(), input.as_bytes(), &ASTContext::anonymous(), + &mut error_collector1, ) .unwrap(), &mut buf1, @@ -141,6 +146,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(); let native_output = String::from_utf8(buf1).expect("Invalid UTF-8 in output"); let context_for_json = ASTContext::anonymous(); + let mut error_collector2 = TextErrorCollector::new(); writers::json::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -149,6 +155,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(), input.as_bytes(), &context_for_json, + &mut error_collector2, ) .unwrap(), &context_for_json, @@ -353,9 +360,15 @@ fn test_json_writer() { .parse(input_bytes, None) .expect("Failed to parse input"); let test_context = ASTContext::anonymous(); - let pandoc = - treesitter_to_pandoc(&mut std::io::sink(), &tree, input_bytes, &test_context) - .unwrap(); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + input_bytes, + &test_context, + &mut error_collector, + ) + .unwrap(); let mut buf = 
Vec::new(); writers::json::write(&pandoc, &test_context, &mut buf).unwrap(); let our_json = String::from_utf8(buf).expect("Invalid UTF-8 in our JSON output"); @@ -435,11 +448,13 @@ fn test_html_writer() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); + let mut error_collector = TextErrorCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, input_bytes, &ASTContext::anonymous(), + &mut error_collector, ) .unwrap(); let mut buf = Vec::new(); @@ -541,11 +556,13 @@ fn test_do_not_smoke() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); + let mut error_collector = TextErrorCollector::new(); let _ = treesitter_to_pandoc( &mut std::io::sink(), &tree, input_bytes, &ASTContext::anonymous(), + &mut error_collector, ); file_count += 1; } diff --git a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs index a6b7671..6d5a08a 100644 --- a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs +++ b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs @@ -4,6 +4,7 @@ */ use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; +use quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; use quarto_markdown_pandoc::writers; use tree_sitter_qmd::MarkdownParser; @@ -17,8 +18,15 @@ fn test_inline_source_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); @@ -94,8 +102,15 @@ fn 
test_merged_strings_preserve_location() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); @@ -149,8 +164,15 @@ fn test_separate_strings_keep_separate_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let pandoc = treesitter_to_pandoc(&mut std::io::sink(), &tree, &input_bytes, &context) - .expect("Failed to convert to Pandoc AST"); + let mut error_collector = TextErrorCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); let mut buf = Vec::new(); writers::json::write(&pandoc, &context, &mut buf).unwrap(); diff --git a/crates/quarto-markdown-pandoc/tests/test_meta.rs b/crates/quarto-markdown-pandoc/tests/test_meta.rs index bd53cb4..3c8039f 100644 --- a/crates/quarto-markdown-pandoc/tests/test_meta.rs +++ b/crates/quarto-markdown-pandoc/tests/test_meta.rs @@ -3,8 +3,11 @@ * Copyright (c) 2025 Posit, PBC */ +use hashlink::LinkedHashMap; use quarto_markdown_pandoc::pandoc::location::{Location, Range, SourceInfo}; -use quarto_markdown_pandoc::pandoc::{MetaValue, RawBlock, rawblock_to_meta}; +use quarto_markdown_pandoc::pandoc::{ + Inline, MetaValue, RawBlock, parse_metadata_strings, rawblock_to_meta, +}; use std::fs; #[test] @@ -54,3 +57,183 @@ fn test_metadata_parsing() { Some(MetaValue::MetaList(_)) )); } + +#[test] +fn test_yaml_tagged_strings() { + // Test that YAML tags (!path, !glob, !str) prevent markdown parsing + let content = 
fs::read_to_string("tests/yaml-tagged-strings.qmd").unwrap(); + + let block = RawBlock { + format: "quarto_minus_metadata".to_string(), + text: content, + source_info: SourceInfo::with_range(Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }), + }; + + let mut meta = rawblock_to_meta(block); + let mut outer_meta = LinkedHashMap::new(); + + // Parse metadata strings + for (k, v) in meta.drain() { + let parsed = parse_metadata_strings(v, &mut outer_meta); + outer_meta.insert(k, parsed); + } + + // Check plain_path - should be MetaInlines with Span wrapper + let plain_path = outer_meta.get("plain_path").expect("plain_path not found"); + if let MetaValue::MetaInlines(inlines) = plain_path { + assert_eq!(inlines.len(), 1, "Expected exactly one inline"); + if let Inline::Span(span) = &inlines[0] { + assert!(span.attr.1.contains(&"yaml-tagged-string".to_string())); + assert_eq!(span.attr.2.get("tag"), Some(&"path".to_string())); + // Extract the string content + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "images/neovim-*.png"); + } else { + panic!("Expected Str inline inside Span"); + } + } else { + panic!("Expected Span inline, got: {:?}", inlines[0]); + } + } else { + panic!("Expected MetaInlines for plain_path"); + } + + // Check glob_pattern + let glob_pattern = outer_meta + .get("glob_pattern") + .expect("glob_pattern not found"); + if let MetaValue::MetaInlines(inlines) = glob_pattern { + if let Inline::Span(span) = &inlines[0] { + assert_eq!(span.attr.2.get("tag"), Some(&"glob".to_string())); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "posts/*/index.qmd"); + } + } + } + + // Check literal_string + let literal_string = outer_meta + .get("literal_string") + .expect("literal_string not found"); + if let MetaValue::MetaInlines(inlines) = literal_string { + if let Inline::Span(span) = &inlines[0] { + assert_eq!(span.attr.2.get("tag"), 
Some(&"str".to_string())); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "_foo_.py"); + } + } + } + + // Check regular_markdown - should have parsed markdown (Emph element) + let regular_markdown = outer_meta + .get("regular_markdown") + .expect("regular_markdown not found"); + if let MetaValue::MetaInlines(inlines) = regular_markdown { + // Should contain Emph for *emphasis* + let has_emph = inlines + .iter() + .any(|inline| matches!(inline, Inline::Emph(_))); + assert!( + has_emph, + "regular_markdown should have Emph element from *emphasis*" + ); + } else { + panic!("Expected MetaInlines for regular_markdown"); + } +} + +#[test] +fn test_yaml_markdown_parse_failure() { + // Test that untagged strings that fail markdown parsing are gracefully handled + let content = fs::read_to_string("tests/yaml-markdown-parse-failure.qmd").unwrap(); + + let block = RawBlock { + format: "quarto_minus_metadata".to_string(), + text: content, + source_info: SourceInfo::with_range(Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }), + }; + + let mut meta = rawblock_to_meta(block); + let mut outer_meta = LinkedHashMap::new(); + + // Parse metadata strings - this should not panic + for (k, v) in meta.drain() { + let parsed = parse_metadata_strings(v, &mut outer_meta); + outer_meta.insert(k, parsed); + } + + // Check untagged_path - should be wrapped in error span + let untagged_path = outer_meta + .get("untagged_path") + .expect("untagged_path not found"); + if let MetaValue::MetaInlines(inlines) = untagged_path { + if let Inline::Span(span) = &inlines[0] { + assert!( + span.attr + .1 + .contains(&"yaml-markdown-syntax-error".to_string()) + ); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "posts/*/index.qmd"); + } + } else { + panic!("Expected Span inline for failed parse"); + } + } else { + panic!("Expected MetaInlines for untagged_path"); + } + + // Check 
another_glob - should also be wrapped in error span + let another_glob = outer_meta + .get("another_glob") + .expect("another_glob not found"); + if let MetaValue::MetaInlines(inlines) = another_glob { + if let Inline::Span(span) = &inlines[0] { + assert!( + span.attr + .1 + .contains(&"yaml-markdown-syntax-error".to_string()) + ); + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "images/*.png"); + } + } + } + + // Check underscore_file - this one should successfully parse as markdown with Emph + let underscore_file = outer_meta + .get("underscore_file") + .expect("underscore_file not found"); + if let MetaValue::MetaInlines(inlines) = underscore_file { + // _foo_ should become Emph element + let has_emph = inlines + .iter() + .any(|inline| matches!(inline, Inline::Emph(_))); + assert!( + has_emph, + "underscore_file should have Emph element from _foo_" + ); + } +} diff --git a/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd b/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd new file mode 100644 index 0000000..1692aac --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/yaml-markdown-parse-failure.qmd @@ -0,0 +1,8 @@ +--- +title: Test Markdown Parse Failure Fallback +untagged_path: posts/*/index.qmd +another_glob: images/*.png +underscore_file: _foo_.py +--- + +Test document for graceful handling of markdown parse failures in untagged strings. diff --git a/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd b/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd new file mode 100644 index 0000000..5c5709c --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/yaml-tagged-strings.qmd @@ -0,0 +1,9 @@ +--- +title: Test YAML Tagged Strings +plain_path: !path images/neovim-*.png +glob_pattern: !glob posts/*/index.qmd +literal_string: !str _foo_.py +regular_markdown: This has *emphasis* +--- + +Test document for YAML tag support. 
diff --git a/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c b/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c index 80672ff..386e733 100644 --- a/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c +++ b/crates/tree-sitter-qmd/tree-sitter-markdown/src/scanner.c @@ -1464,7 +1464,13 @@ static bool scan(Scanner *s, TSLexer *lexer, const bool *valid_symbols) { // and go on. But we can only serialize state if we successfully return an external // token. // - if (!s->simulate && lexer->lookahead == '$' && valid_symbols[DISPLAY_MATH_STATE_TRACK_MARKER]) { + // Don't track math state when inside a fenced code block - dollar signs should be literal + bool inside_fenced_code = s->open_blocks.size > 0 && + s->open_blocks.items[s->open_blocks.size - 1] == FENCED_CODE_BLOCK; + + if (!s->simulate && lexer->lookahead == '$' && + !inside_fenced_code && + valid_symbols[DISPLAY_MATH_STATE_TRACK_MARKER]) { advance(s, lexer); if (lexer->lookahead == '$') { advance(s, lexer); diff --git a/docs/syntax/index.qmd b/docs/syntax/index.qmd index 3980610..fad9595 100644 --- a/docs/syntax/index.qmd +++ b/docs/syntax/index.qmd @@ -13,3 +13,4 @@ The features documented here are currently under development. The syntax and beh - [Definition Lists](definition-lists.qmd) - Create definition lists using an embedded markdown DSL - [Editorial Marks](editorial-marks.qmd) - Annotate text with highlights, insertions, deletions, and comments - [Footnotes](footnotes.qmd) - Add footnotes with inline or fenced block syntax +- [YAML Metadata](yaml-metadata.qmd) - Control markdown parsing in metadata with YAML tags diff --git a/docs/syntax/yaml-metadata.qmd b/docs/syntax/yaml-metadata.qmd new file mode 100644 index 0000000..0ca322c --- /dev/null +++ b/docs/syntax/yaml-metadata.qmd @@ -0,0 +1,242 @@ +--- +title: "YAML Metadata" +--- + +## Overview + +YAML front matter provides document-level metadata in Quarto documents. 
In `quarto-markdown`, metadata values are parsed as markdown by default, allowing you to use formatting like `*emphasis*` and `[links](url)`. However, some values—like file paths and glob patterns—should be treated as literal strings. + +## Default Behavior: Markdown Parsing + +By default, string values in YAML metadata are parsed as markdown: + +```yaml +--- +title: This has *emphasis* +description: Visit [our website](https://example.com) for more info +--- +``` + +The `title` will render with italicized "emphasis", and the `description` will include a clickable link. + +## YAML Tags for Literal Strings + +When you need to prevent markdown parsing, use YAML tags to mark values as literal strings: + +### Available Tags + +- `!str` - Plain string, no markdown parsing +- `!path` - File path (same as `!str`, but semantically clearer) +- `!glob` - Glob pattern (same as `!str`, but semantically clearer) + +### Syntax + +Prefix the value with the tag: + +```yaml +--- +title: My Blog +resources: + - !path images/neovim-*.png + - !path _foo_.py +listing: + contents: !glob posts/*/index.qmd +plain: !str "Text with * and _ chars that won't be parsed" +--- +``` + +## Why Use Tags? + +### Problem: Wildcard Characters + +File paths and glob patterns often contain characters that have special meaning in markdown: + +- `*` - Indicates emphasis in markdown +- `_` - Also indicates emphasis in markdown +- `[` and `]` - Indicate links in markdown + +Without tags, these can cause unexpected results: + +```yaml +--- +# Without tag: _foo_.py is parsed as markdown +# The underscore characters create italic emphasis! 
+file: _foo_.py # Parsed as: foo.py + +# With tag: preserved literally +file: !path _foo_.py # Parsed as: "_foo_.py" +--- +``` + +### Examples Where Tags Help + +**Glob patterns with wildcards:** + +```yaml +--- +# ❌ Without tag: asterisk triggers markdown parsing error +listing: posts/*/index.qmd + +# ✅ With tag: preserved as literal glob pattern +listing: !glob posts/*/index.qmd +--- +``` + +**File paths with underscores:** + +```yaml +--- +# ❌ Without tag: underscores create italic emphasis +script: _build_helper.py # Parsed as: buildhelper.py + +# ✅ With tag: preserved as literal file path +script: !path _build_helper.py +--- +``` + +**Relative paths:** + +```yaml +--- +# ❌ Without tag: parsed as markdown, leading dots may confuse parser +redirect: ../_redirect.html + +# ✅ With tag: preserved as literal path +redirect: !path ../_redirect.html +--- +``` + +## Graceful Fallback + +If a string fails to parse as markdown (e.g., because it contains `*` wildcard characters), `quarto-markdown` will gracefully preserve it as a literal string instead of crashing. + +The parser wraps failed parses in a special marker that downstream tools can detect: + +```json +{ + "t": "MetaInlines", + "c": [{ + "t": "Span", + "c": [ + ["", ["yaml-markdown-syntax-error"], []], + [{"t": "Str", "c": "posts/*/index.qmd"}] + ] + }] +} +``` + +While this graceful fallback prevents crashes, using explicit tags is better practice: + +1. **Intent**: Tags clearly communicate that a value should be literal +2. **Reliability**: Tags work even if the string happens to be valid markdown +3. 
**Tooling**: Downstream tools can recognize tagged strings by the `yaml-tagged-string` class + +## When to Use Tags + +Use YAML tags when metadata values contain: + +- **File paths**: Especially those with wildcards, underscores, or special characters +- **Glob patterns**: Any pattern using `*`, `?`, or `[...]` syntax +- **Configuration strings**: Technical strings that shouldn't be formatted + +**Don't need tags** for: + +- **Titles and descriptions**: Where you want markdown formatting +- **Plain text**: Simple strings without special characters +- **Booleans and numbers**: These are never parsed as markdown + +## Complete Example + +```yaml +--- +title: My Data Science Blog +description: Articles about *statistics* and [machine learning](ml.html) + +# File paths with wildcards - use tags +resources: + - !path images/*.png + - !path data/**/*.csv + - !path _utils.py + +# Glob patterns - use tags +listing: + contents: !glob posts/*/index.qmd + +# Configuration strings - use tags +redirect: !path ../index.html +template: !str {{< special-syntax >}} + +# Regular strings - no tags needed +author: Jane Doe +date: 2024-01-15 +--- +``` + +## Implementation Details + +### Tagged String Representation + +Tagged strings are converted to Pandoc's `MetaInlines` with a `Span` wrapper: + +```json +{ + "t": "MetaInlines", + "c": [{ + "t": "Span", + "c": [ + ["", ["yaml-tagged-string"], [["tag", "path"]]], + [{"t": "Str", "c": "images/neovim-*.png"}] + ] + }] +} +``` + +The representation includes: + +- **Class**: `yaml-tagged-string` - Identifies this as a tagged value +- **Attribute**: `tag` - Contains the tag name (`str`, `path`, or `glob`) +- **Content**: The literal string value wrapped in a `Str` inline + +### Compatibility with Pandoc + +Tagged strings are compatible with Pandoc's Lua filter API: + +```lua +-- Extract the string value +local value = pandoc.utils.stringify(meta.listing.contents) +-- value == "posts/*/index.qmd" + +-- Check if it's a tagged string +if 
meta.listing.contents[1].classes[1] == "yaml-tagged-string" then + local tag = meta.listing.contents[1].attributes.tag + -- tag == "glob" +end +``` + +The `pandoc.utils.stringify()` function correctly extracts the string content, so most filters will work without modification. + +## Migration Guide + +If you have existing documents that are failing to parse due to file paths or glob patterns: + +1. **Identify problematic values**: Look for metadata with `*`, `_`, or other markdown special characters +2. **Add tags**: Prefix those values with `!path`, `!glob`, or `!str` +3. **Test**: Verify the document parses without errors + +Example migration: + +```yaml +# Before (may crash or parse incorrectly) +--- +listing: posts/*/index.qmd +resources: [images/*.png, _helper.py] +--- + +# After (explicit and reliable) +--- +listing: !glob posts/*/index.qmd +resources: + - !path images/*.png + - !path _helper.py +--- +```