From 7de726e8f69e8a753ec14e09036ab8a35bbd9078 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Mon, 20 Oct 2025 17:10:56 -0500 Subject: [PATCH 1/2] update --- CLAUDE.md | 29 +- Cargo.lock | 56 + Cargo.toml | 25 +- crates/quarto-error-reporting/Cargo.toml | 24 + crates/quarto-error-reporting/README.md | 181 +++ .../quarto-error-reporting/error_catalog.json | 86 ++ crates/quarto-error-reporting/src/builder.rs | 516 ++++++++ crates/quarto-error-reporting/src/catalog.rs | 138 +++ .../quarto-error-reporting/src/diagnostic.rs | 716 +++++++++++ crates/quarto-error-reporting/src/lib.rs | 67 ++ crates/quarto-error-reporting/src/macros.rs | 45 + crates/quarto-markdown-pandoc/CLAUDE.md | 2 +- crates/quarto-markdown-pandoc/Cargo.toml | 8 +- crates/quarto-markdown-pandoc/src/filters.rs | 98 +- .../src/pandoc/ast_context.rs | 45 +- .../src/pandoc/block.rs | 107 +- .../src/pandoc/caption.rs | 3 +- .../src/pandoc/inline.rs | 178 ++- .../quarto-markdown-pandoc/src/pandoc/list.rs | 5 +- .../src/pandoc/location.rs | 137 ++- .../quarto-markdown-pandoc/src/pandoc/meta.rs | 460 +++++++- .../quarto-markdown-pandoc/src/pandoc/mod.rs | 6 +- .../src/pandoc/pandoc.rs | 4 +- .../src/pandoc/shortcode.rs | 5 +- .../src/pandoc/source_map_compat.rs | 113 ++ .../src/pandoc/table.rs | 20 +- .../src/pandoc/treesitter.rs | 64 +- .../pandoc/treesitter_utils/block_quote.rs | 7 +- .../src/pandoc/treesitter_utils/citation.rs | 6 +- .../src/pandoc/treesitter_utils/code_span.rs | 4 +- .../src/pandoc/treesitter_utils/document.rs | 5 +- .../treesitter_utils/editorial_marks.rs | 20 +- .../treesitter_utils/fenced_div_block.rs | 7 +- .../pandoc/treesitter_utils/inline_link.rs | 5 +- .../pandoc/treesitter_utils/note_reference.rs | 2 +- .../pandocnativeintermediate.rs | 2 +- .../pandoc/treesitter_utils/postprocess.rs | 74 +- .../pandoc/treesitter_utils/text_helpers.rs | 12 +- .../pandoc/treesitter_utils/thematic_break.rs | 4 +- .../pandoc/treesitter_utils/uri_autolink.rs | 6 +- .../src/readers/json.rs | 706 ++++++++--- .../quarto-markdown-pandoc/src/readers/qmd.rs | 165 +-- .../src/utils/diagnostic_collector.rs | 210 ++++ .../quarto-markdown-pandoc/src/utils/mod.rs | 3 +- .../src/writers/json.rs | 554 +++++++-- .../quarto-markdown-pandoc/src/writers/qmd.rs | 134 ++- .../tests/snapshots/json/001.qmd.snapshot | 2 +- .../tests/snapshots/json/002.qmd.snapshot | 2 +- .../tests/snapshots/json/003.qmd.snapshot | 2 +- .../json/math-with-attr.qmd.snapshot | 2 +- .../json/table-alignment.qmd.snapshot | 2 +- .../json/table-caption-attr.qmd.snapshot | 2 +- crates/quarto-markdown-pandoc/tests/test.rs | 69 +- .../tests/test_inline_locations.rs | 336 +++++- .../tests/test_json_roundtrip.rs | 139 +-- .../quarto-markdown-pandoc/tests/test_meta.rs | 14 +- .../tests/test_metadata_source_tracking.rs | 253 ++++ .../tests/test_nested_yaml_serialization.rs | 273 +++++ .../tests/test_yaml_tag_regression.rs | 114 ++ crates/quarto-source-map/Cargo.toml | 13 + crates/quarto-source-map/src/context.rs | 174 +++ crates/quarto-source-map/src/file_info.rs | 254 ++++ crates/quarto-source-map/src/lib.rs | 48 + crates/quarto-source-map/src/mapping.rs | 284 +++++ crates/quarto-source-map/src/source_info.rs | 868 ++++++++++++++ crates/quarto-source-map/src/types.rs | 169 +++ crates/quarto-source-map/src/utils.rs | 211 ++++ crates/quarto-yaml/Cargo.toml | 24 + crates/quarto-yaml/README.md | 154 +++ crates/quarto-yaml/YAML-1.2-REQUIREMENT.md | 113 ++ crates/quarto-yaml/benches/memory_overhead.rs | 267 +++++ .../quarto-yaml/benches/scaling_overhead.rs | 305 +++++ 
.../claude-notes/implementation-plan.md | 160 +++ .../claude-notes/implementation-status.md | 206 ++++ .../claude-notes/memory-overhead-analysis.md | 221 ++++ .../claude-notes/scaling-analysis.md | 238 ++++ crates/quarto-yaml/src/error.rs | 81 ++ crates/quarto-yaml/src/lib.rs | 42 + crates/quarto-yaml/src/parser.rs | 1051 +++++++++++++++++ .../quarto-yaml/src/yaml_with_source_info.rs | 310 +++++ crates/wasm-qmd-parser/src/utils.rs | 1 + docs/writers/json.qmd | 177 +++ 82 files changed, 10692 insertions(+), 953 deletions(-) create mode 100644 crates/quarto-error-reporting/Cargo.toml create mode 100644 crates/quarto-error-reporting/README.md create mode 100644 crates/quarto-error-reporting/error_catalog.json create mode 100644 crates/quarto-error-reporting/src/builder.rs create mode 100644 crates/quarto-error-reporting/src/catalog.rs create mode 100644 crates/quarto-error-reporting/src/diagnostic.rs create mode 100644 crates/quarto-error-reporting/src/lib.rs create mode 100644 crates/quarto-error-reporting/src/macros.rs create mode 100644 crates/quarto-markdown-pandoc/src/pandoc/source_map_compat.rs create mode 100644 crates/quarto-markdown-pandoc/src/utils/diagnostic_collector.rs create mode 100644 crates/quarto-markdown-pandoc/tests/test_metadata_source_tracking.rs create mode 100644 crates/quarto-markdown-pandoc/tests/test_nested_yaml_serialization.rs create mode 100644 crates/quarto-markdown-pandoc/tests/test_yaml_tag_regression.rs create mode 100644 crates/quarto-source-map/Cargo.toml create mode 100644 crates/quarto-source-map/src/context.rs create mode 100644 crates/quarto-source-map/src/file_info.rs create mode 100644 crates/quarto-source-map/src/lib.rs create mode 100644 crates/quarto-source-map/src/mapping.rs create mode 100644 crates/quarto-source-map/src/source_info.rs create mode 100644 crates/quarto-source-map/src/types.rs create mode 100644 crates/quarto-source-map/src/utils.rs create mode 100644 crates/quarto-yaml/Cargo.toml create mode 100644 crates/quarto-yaml/README.md create mode 100644 crates/quarto-yaml/YAML-1.2-REQUIREMENT.md create mode 100644 crates/quarto-yaml/benches/memory_overhead.rs create mode 100644 crates/quarto-yaml/benches/scaling_overhead.rs create mode 100644 crates/quarto-yaml/claude-notes/implementation-plan.md create mode 100644 crates/quarto-yaml/claude-notes/implementation-status.md create mode 100644 crates/quarto-yaml/claude-notes/memory-overhead-analysis.md create mode 100644 crates/quarto-yaml/claude-notes/scaling-analysis.md create mode 100644 crates/quarto-yaml/src/error.rs create mode 100644 crates/quarto-yaml/src/lib.rs create mode 100644 crates/quarto-yaml/src/parser.rs create mode 100644 crates/quarto-yaml/src/yaml_with_source_info.rs create mode 100644 docs/writers/json.qmd diff --git a/CLAUDE.md b/CLAUDE.md index 85de3c9..6928df1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,11 +1,9 @@ -# Quarto Markdown - -The main documentation for this repository is located at: -[crates/quarto-markdown-pandoc/CLAUDE.md](crates/quarto-markdown-pandoc/CLAUDE.md) +# Quarto Rust monorepo ## **WORK TRACKING** We use bd (beads) for issue tracking instead of Markdown TODOs or external tools. +We use plans for additional context and bookkeeping. Write plans to `claude-notes/plans/YYYY-MM-DD-.md`, and reference the plan file in the issues. ### Quick Reference @@ -101,7 +99,26 @@ When fixing ANY bug: 3. **THIRD**: Implement the fix 4. **FOURTH**: Run the test and verify it passes -**This is non-negotiable. Never implement a fix before verifying the test fails. 
Stop and ask the user if you cannot think of a way to mechanically test the bad behavior.** +**This is non-negotiable. Never implement a fix before verifying the test fails. Stop and ask the user if you cannot think of a way to mechanically test the bad behavior. Only deviate if writing new features.** + +## Workspace structure + +### `crates` - corresponds to the crates in the public quarto-markdown repo + +- `crates/qmd-syntax-helper`: a binary to help users convert qmd files to the new syntax +- `crates/quarto-error-reporting`: a library to help create uniform, helpful, beautiful error messages +- `crates/quarto-markdown-pandoc`: a binary to parse qmd text and produce Pandoc AST and other formats +- `crates/quarto-source-map`: a library to help maintain information about the source location of data structures in text files +- `crates/quarto-yaml`: a YAML parser that produces YAML objects and accurate fine-grained source location of elements +- `crates/tree-sitter-qmd`: tree-sitter grammars for block and inline parsers +- `crates/wasm-qmd-parser`: A WASM module with some entry points from `crates/quarto-markdown-pandoc` + +### `private-crates` - private crates we are not going to release yet + +- `private-crates/quarto-yaml-validation`: A library to validate YAML objects using schemas +- `private-crates/validate-yaml`: A binary to exercise `quarto-yaml-validation` +- `private-crates/quarto`: The future main entry point for the `quarto` command line binary. +- `private-crates/quarto-core`: supporting library for `quarto` ## General Instructions @@ -118,6 +135,6 @@ When fixing ANY bug: - Always create a plan. Always work on the plan one item at a time. - In the tree-sitter-markdown and tree-sitter-markdown-inline directories, you rebuild the parsers using "tree-sitter generate; tree-sitter build". Make sure the shell is in the correct directory before running those. Every time you change the tree-sitter parsers, rebuild them and run "tree-sitter test". If the tests fail, fix the code. Only change tree-sitter tests you've just added; do not touch any other tests. If you end up getting stuck there, stop and ask for my help. - When attempting to find binary differences between files, always use `xxd` instead of other tools. -- .c only works in JSON formats. Inside Lua filters, you need to use Pandoc's Lua API. Study https://raw.githubusercontent.com/jgm/pandoc/refs/heads/main/doc/lua-filters.md and make notes to yourself as necessary (use docs/for-claude in this directory) +- .c only works in JSON formats. Inside Lua filters, you need to use Pandoc's Lua API. Study https://raw.githubusercontent.com/jgm/pandoc/refs/heads/main/doc/lua-filters.md and make notes to yourself as necessary (use claude-notes in this directory) - Sometimes you get confused by macOS's weird renaming of /tmp. Prefer to use temporary directories local to the project you're working on (which you can later clean) - The documentation in docs/ is a user-facing Quarto website. There, you should document usage and not technical details. 
\ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index db17e8a..e6207fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -244,6 +244,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7382cf6263419f2d8df38c55d7da83da5c18aef87fc7a7fc1fb1e344edfe14c1" dependencies = [ "hashbrown", + "serde", ] [[package]] @@ -399,6 +400,18 @@ dependencies = [ "serde_json", ] +[[package]] +name = "quarto-error-reporting" +version = "0.1.0" +dependencies = [ + "ariadne", + "once_cell", + "quarto-source-map", + "serde", + "serde_json", + "thiserror", +] + [[package]] name = "quarto-markdown-pandoc" version = "0.0.0" @@ -410,7 +423,11 @@ dependencies = [ "hashlink", "once_cell", "paste", + "quarto-error-reporting", + "quarto-source-map", + "quarto-yaml", "regex", + "serde", "serde_json", "tree-sitter", "tree-sitter-qmd", @@ -426,6 +443,25 @@ dependencies = [ "yaml-rust2", ] +[[package]] +name = "quarto-source-map" +version = "0.1.0" +dependencies = [ + "serde", + "serde_json", +] + +[[package]] +name = "quarto-yaml" +version = "0.1.0" +dependencies = [ + "quarto-source-map", + "regex", + "serde", + "thiserror", + "yaml-rust2", +] + [[package]] name = "quote" version = "1.0.40" @@ -553,6 +589,26 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tree-sitter" version = "0.25.8" diff --git a/Cargo.toml b/Cargo.toml index 084ae1d..616fe9b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ resolver = "2" [workspace.package] +version = "0.1.0" authors = ["Posit Software, PBC"] homepage = "https://github.com/posit-dev/quarto-markdown-syntax" keywords = ["parser"] @@ -16,21 +17,25 @@ edition = "2024" [workspace.dependencies] anyhow = "1.0.89" +ariadne = "0.4" +clap = { version = "4.5", features = ["derive", "cargo"] } insta = "1.40.0" memchr = "2.7.4" +once_cell = "1.19" proc-macro2 = "1.0.94" schemars = "0.8.21" -serde = "1.0.215" +serde = { version = "1.0.215", features = ["derive"] } serde_json = "1.0.132" +serde_yaml = "0.9" +thiserror = "1.0" toml = "0.8.19" +tracing-subscriber = { version = "0.3", features = ["env-filter"] } +yaml-rust2 = "0.10" [workspace.dependencies.proc-macro-error2] version = "2.0.1" default-features = false -[workspace.dependencies.tests_macros] -path = "./crates/tests_macros" - [workspace.dependencies.tracing] version = "0.1.40" features = ["std"] @@ -42,15 +47,21 @@ version = "0.25.8" [workspace.dependencies.tree-sitter-qmd] path = "./crates/tree-sitter-qmd" -[workspace.dependencies.tree-sitter-sexpr] -path = "./crates/tree-sitter-sexpr" - [workspace.dependencies.wasm-qmd-parser] path = "./crates/wasm-qmd-parser" [workspace.dependencies.quarto-markdown-pandoc] path = "./crates/quarto-markdown-pandoc" +[workspace.dependencies.quarto-yaml] +path = "./crates/quarto-yaml" + +[workspace.dependencies.quarto-error-reporting] +path = "./crates/quarto-error-reporting" + +[workspace.dependencies.quarto-source-map] +path = "./crates/quarto-source-map" + [workspace.lints.clippy] assigning_clones = "warn" diff --git 
a/crates/quarto-error-reporting/Cargo.toml b/crates/quarto-error-reporting/Cargo.toml
new file mode 100644
index 0000000..ceee815
--- /dev/null
+++ b/crates/quarto-error-reporting/Cargo.toml
@@ -0,0 +1,24 @@
+[package]
+name = "quarto-error-reporting"
+version.workspace = true
+edition.workspace = true
+authors.workspace = true
+license.workspace = true
+repository.workspace = true
+description = "Error reporting and diagnostic messages for Quarto"
+
+[dependencies]
+# Source location tracking
+quarto-source-map = { path = "../quarto-source-map" }
+
+# Error reporting
+ariadne = { workspace = true }
+thiserror = { workspace = true }
+once_cell = { workspace = true }
+
+# Serialization
+serde = { workspace = true }
+serde_json = { workspace = true }
+
+[dev-dependencies]
+# No dev dependencies yet
diff --git a/crates/quarto-error-reporting/README.md b/crates/quarto-error-reporting/README.md
new file mode 100644
index 0000000..e8262b7
--- /dev/null
+++ b/crates/quarto-error-reporting/README.md
@@ -0,0 +1,181 @@
+# quarto-error-reporting
+
+Error reporting and diagnostic messages for Quarto, providing structured, user-friendly error messages following tidyverse best practices.
+
+## Overview
+
+This crate provides a comprehensive error reporting system inspired by:
+
+- **[ariadne](https://docs.rs/ariadne/)**: Visual compiler-quality error messages with source code context
+- **[R cli package](https://cli.r-lib.org/)**: Semantic, structured text output
+- **[Tidyverse style guide](https://style.tidyverse.org/errors.html)**: Best practices for error message content
+
+## Current Status
+
+**Phase 1: Core Types** ✅ **COMPLETE**
+
+The crate provides complete types for representing diagnostic messages:
+
+- `DiagnosticMessage`: Main error message structure with optional error codes
+- `MessageContent`: Content representation (Plain, Markdown)
+- `DetailItem`: Individual detail bullets with error/info/note kinds
+- `DiagnosticKind`: Error, Warning, Info, Note
+- `ErrorCodeInfo`: Metadata for error codes
+- Error catalog system (JSON-based, compile-time loaded)
+
+### Error Code System
+
+Quarto now supports TypeScript-style error codes for better searchability and documentation:
+
+**Format**: `Q-<subsystem>-<number>` (e.g., `Q-1-1`, `Q-2-301`)
+
+**Example**:
+```rust
+use quarto_error_reporting::DiagnosticMessage;
+
+let error = DiagnosticMessage::error("YAML Syntax Error")
+    .with_code("Q-1-1");
+
+// Get docs URL automatically from catalog
+if let Some(url) = error.docs_url() {
+    println!("See {} for more information", url);
+}
+```
+
+**Benefits**:
+- Users can Google "Q-1-1" instead of error text
+- Error codes are stable across versions
+- Each code maps to detailed documentation
+- Optional but encouraged
+
+**Subsystem Numbers**:
+- 0: Internal/System Errors
+- 1: YAML and Configuration
+- 2: Markdown and Parsing
+- 3: Engines and Execution
+- 4: Rendering and Formats
+- 5: Projects and Structure
+- 6: Extensions and Plugins
+- 7: CLI and Tools
+- 8: Publishing and Deployment
+- 9+: Reserved for future use
+
+See `error_catalog.json` for the complete catalog and `/claude-notes/error-id-system-design.md` for full design documentation.
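+
+Error codes can also be resolved directly against the embedded catalog. A brief sketch using this crate's `catalog` module (the `Q-0-1` entry ships in `error_catalog.json`):
+
+```rust
+use quarto_error_reporting::catalog::{get_error_info, get_subsystem};
+
+// Look up the metadata recorded for a known code.
+if let Some(info) = get_error_info("Q-0-1") {
+    println!("{}: {}", info.title, info.message_template);
+}
+
+// Subsystem lookup; unknown codes simply return None.
+assert_eq!(get_subsystem("Q-0-1"), Some("internal"));
+assert!(get_error_info("Q-999-999").is_none());
+```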
+
+### Builder API Usage
+
+The builder API encodes tidyverse guidelines directly in the API design:
+
+```rust
+use quarto_error_reporting::DiagnosticMessageBuilder;
+
+let error = DiagnosticMessageBuilder::error("Incompatible types")
+    .with_code("Q-1-2")
+    .problem("Cannot combine date and datetime types")
+    .add_detail("`x` has type `date`")
+    .add_detail("`y` has type `datetime`")
+    .add_hint("Convert both to the same type?")
+    .build();
+```
+
+**Builder methods**:
+- `.error()`, `.warning()`, `.info()` - Create diagnostic with specified kind
+- `.with_code()` - Set error code (Q-<subsystem>-<number>)
+- `.problem()` - Set problem statement (the "what" - use "must" or "can't")
+- `.add_detail()` - Add error detail (✖ bullet)
+- `.add_info()` - Add info detail (i bullet)
+- `.add_note()` - Add note detail (plain bullet)
+- `.add_hint()` - Add hint (ends with ?)
+- `.build()` - Construct the message
+- `.build_with_validation()` - Build with tidyverse validation warnings
+
+## Planned Phases
+
+### Phase 2: Rendering Integration (Planned)
+
+- Integration with ariadne for visual terminal output
+- JSON serialization for machine-readable errors
+- Source span tracking for code locations
+
+### Phase 3: Console Output Helpers (Planned)
+
+**⚠️ Requires Design Discussion**
+
+Before implementing this phase, we need to discuss:
+
+1. **Missing Pandoc AST → ANSI Writer**: We don't yet have a writer that converts Pandoc AST to ANSI terminal output
+2. **Relationship with ariadne**: How should the AST-to-ANSI writer relate to ariadne's visual error reports?
+   - Should they be separate systems?
+   - Should ariadne handle errors with source context, while the AST writer handles console messages without source context?
+   - How do we avoid duplication?
+
+### Phase 4: Builder API (Planned)
+
+Tidyverse-style builder methods that make it easy to construct well-structured error messages:
+
+```rust
+let error = DiagnosticMessage::builder()
+    .error("Unclosed code block")
+    .problem("Code block started but never closed")
+    .add_detail("The code block starting with `` ```{python} `` was never closed")
+    .at_location(opening_span)
+    .add_hint("Did you forget the closing `` ``` ``?")
+    .build()?;
+```
+
+## Design Principles
+
+### Tidyverse Four-Part Structure
+
+Following tidyverse guidelines, diagnostic messages have:
+
+1. **Title**: Brief error message
+2. **Problem**: What went wrong (using "must" or "can't")
+3. **Details**: Specific information (max 5 bullets)
+4. **Hints**: Optional guidance (ends with ?)
+
+### Semantic Markup
+
+Use Pandoc span syntax for semantic inline markup:
+
+```markdown
+Could not find file `config.yaml`{.file} in directory `/home/user/.config`{.path}
+```
+
+Semantic classes (to be defined):
+- `.file` - filenames and paths
+- `.engine` - engine names (jupyter, knitr)
+- `.format` - output formats (html, pdf)
+- `.option` - YAML option names
+- `.code` - generic code
+
+### Multiple Output Formats
+
+The same diagnostic message can be rendered to:
+
+- **ANSI terminal**: Colorful, formatted output for TTY
+- **HTML**: Themeable output for web contexts
+- **JSON**: Machine-readable for programmatic use
+
+## Implementation Notes
+
+This crate follows the design outlined in `/claude-notes/error-reporting-design-research.md`.
+ +Key decisions: +- ✅ Markdown strings → Pandoc AST internally (defer compile-time macros) +- ✅ Rust-only (WASM for cross-language if needed) +- ✅ Builder API encoding tidyverse guidelines +- ⚠️ Pandoc AST → ANSI writer needs design discussion +- ⚠️ Relationship with ariadne needs clarification + +## Development + +Run tests: + +```bash +cargo test -p quarto-error-reporting +``` + +## License + +MIT diff --git a/crates/quarto-error-reporting/error_catalog.json b/crates/quarto-error-reporting/error_catalog.json new file mode 100644 index 0000000..e286b3f --- /dev/null +++ b/crates/quarto-error-reporting/error_catalog.json @@ -0,0 +1,86 @@ +{ + "Q-0-1": { + "subsystem": "internal", + "title": "Internal Error", + "message_template": "An internal error occurred. This is a bug in Quarto.", + "docs_url": "https://quarto.org/docs/errors/Q-0-1", + "since_version": "99.9.9" + }, + "Q-1-10": { + "subsystem": "yaml", + "title": "Missing Required Property", + "message_template": "A required property is missing from the YAML document.", + "docs_url": "https://quarto.org/docs/errors/Q-1-10", + "since_version": "99.9.9" + }, + "Q-1-11": { + "subsystem": "yaml", + "title": "Type Mismatch", + "message_template": "The value has an incorrect type (expected one type, got another).", + "docs_url": "https://quarto.org/docs/errors/Q-1-11", + "since_version": "99.9.9" + }, + "Q-1-12": { + "subsystem": "yaml", + "title": "Invalid Enum Value", + "message_template": "The value is not one of the allowed enumeration values.", + "docs_url": "https://quarto.org/docs/errors/Q-1-12", + "since_version": "99.9.9" + }, + "Q-1-13": { + "subsystem": "yaml", + "title": "Array Length Constraint Violation", + "message_template": "The array length does not meet the minimum or maximum item constraints.", + "docs_url": "https://quarto.org/docs/errors/Q-1-13", + "since_version": "99.9.9" + }, + "Q-1-14": { + "subsystem": "yaml", + "title": "String Pattern Mismatch", + "message_template": "The string value does not match the required pattern.", + "docs_url": "https://quarto.org/docs/errors/Q-1-14", + "since_version": "99.9.9" + }, + "Q-1-15": { + "subsystem": "yaml", + "title": "Number Range Violation", + "message_template": "The numeric value is outside the allowed range or not a valid multiple.", + "docs_url": "https://quarto.org/docs/errors/Q-1-15", + "since_version": "99.9.9" + }, + "Q-1-16": { + "subsystem": "yaml", + "title": "Object Property Count Violation", + "message_template": "The object has too many or too few properties.", + "docs_url": "https://quarto.org/docs/errors/Q-1-16", + "since_version": "99.9.9" + }, + "Q-1-17": { + "subsystem": "yaml", + "title": "Unresolved Schema Reference", + "message_template": "A $ref reference in the schema could not be resolved.", + "docs_url": "https://quarto.org/docs/errors/Q-1-17", + "since_version": "99.9.9" + }, + "Q-1-18": { + "subsystem": "yaml", + "title": "Unknown Property", + "message_template": "An unknown property was found in a closed object schema.", + "docs_url": "https://quarto.org/docs/errors/Q-1-18", + "since_version": "99.9.9" + }, + "Q-1-19": { + "subsystem": "yaml", + "title": "Array Uniqueness Violation", + "message_template": "Array items must be unique but duplicates were found.", + "docs_url": "https://quarto.org/docs/errors/Q-1-19", + "since_version": "99.9.9" + }, + "Q-1-99": { + "subsystem": "yaml", + "title": "Generic Validation Error", + "message_template": "A validation error occurred.", + "docs_url": "https://quarto.org/docs/errors/Q-1-99", + 
"since_version": "99.9.9" + } +} diff --git a/crates/quarto-error-reporting/src/builder.rs b/crates/quarto-error-reporting/src/builder.rs new file mode 100644 index 0000000..59d7240 --- /dev/null +++ b/crates/quarto-error-reporting/src/builder.rs @@ -0,0 +1,516 @@ +//! Builder API for diagnostic messages. +//! +//! This module provides a builder pattern that encodes tidyverse-style error message +//! guidelines directly in the API, making it easy to construct well-structured error messages. + +use crate::diagnostic::{ + DetailItem, DetailKind, DiagnosticKind, DiagnosticMessage, MessageContent, +}; + +/// Builder for creating diagnostic messages following tidyverse guidelines. +/// +/// The builder API naturally encourages the tidyverse four-part error structure: +/// 1. **Title**: Brief error message (via `.error()`, `.warning()`, etc.) +/// 2. **Problem**: What went wrong - the "must" or "can't" statement (via `.problem()`) +/// 3. **Details**: Specific information - max 5 bulleted items (via `.add_detail()`, `.add_info()`) +/// 4. **Hints**: Optional guidance (via `.add_hint()`) +/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::DiagnosticMessageBuilder; +/// +/// let error = DiagnosticMessageBuilder::error("Incompatible types") +/// .with_code("Q-1-2") +/// .problem("Cannot combine date and datetime types") +/// .add_detail("`x`{.arg} has type `date`{.type}") +/// .add_detail("`y`{.arg} has type `datetime`{.type}") +/// .add_hint("Convert both to the same type?") +/// .build(); +/// +/// assert_eq!(error.title, "Incompatible types"); +/// assert_eq!(error.code, Some("Q-1-2".to_string())); +/// assert!(error.problem.is_some()); +/// assert_eq!(error.details.len(), 2); +/// assert_eq!(error.hints.len(), 1); +/// ``` +#[derive(Debug, Clone)] +pub struct DiagnosticMessageBuilder { + /// The kind of diagnostic (Error, Warning, Info) + kind: DiagnosticKind, + + /// Brief title for the error + title: String, + + /// Optional error code (e.g., "Q-1-1") + code: Option, + + /// The problem statement (the "what") + problem: Option, + + /// Specific error details (the "where/why") + details: Vec, + + /// Optional hints for fixing + hints: Vec, + + /// Source location for this diagnostic + location: Option, +} + +impl DiagnosticMessageBuilder { + /// Create a new builder with the specified kind and title. + /// + /// Most code should use the convenience methods `.error()`, `.warning()`, or `.info()` + /// instead of calling this directly. + pub fn new(kind: DiagnosticKind, title: impl Into) -> Self { + Self { + kind, + title: title.into(), + code: None, + problem: None, + details: Vec::new(), + hints: Vec::new(), + location: None, + } + } + + /// Create an error diagnostic builder. + /// + /// # Example + /// + /// ``` + /// use quarto_error_reporting::DiagnosticMessageBuilder; + /// + /// let error = DiagnosticMessageBuilder::error("YAML Syntax Error") + /// .build(); + /// ``` + pub fn error(title: impl Into) -> Self { + Self::new(DiagnosticKind::Error, title) + } + + /// Create a generic error for migration purposes. + /// + /// This is a convenience method for the migration from ErrorCollector to DiagnosticMessage. + /// It creates an error with code Q-0-99 and includes file/line information for tracking + /// where the error originated in the code. 
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::generic_error(
+    ///     "Found unexpected attribute",
+    ///     file!(),
+    ///     line!()
+    /// );
+    /// assert_eq!(error.code, Some("Q-0-99".to_string()));
+    /// assert!(error.title.contains("Found unexpected attribute"));
+    /// ```
+    pub fn generic_error(message: impl Into<String>, file: &str, line: u32) -> DiagnosticMessage {
+        let title = format!("{} (at {}:{})", message.into(), file, line);
+        Self::error(title).with_code("Q-0-99").build()
+    }
+
+    /// Create a generic warning for migration purposes.
+    ///
+    /// Similar to `generic_error()` but for warnings.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let warning = DiagnosticMessageBuilder::generic_warning(
+    ///     "Caption found without table",
+    ///     file!(),
+    ///     line!()
+    /// );
+    /// assert_eq!(warning.code, Some("Q-0-99".to_string()));
+    /// ```
+    pub fn generic_warning(message: impl Into<String>, file: &str, line: u32) -> DiagnosticMessage {
+        let title = format!("{} (at {}:{})", message.into(), file, line);
+        Self::warning(title).with_code("Q-0-99").build()
+    }
+
+    /// Create a warning diagnostic builder.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let warning = DiagnosticMessageBuilder::warning("Deprecated feature")
+    ///     .build();
+    /// ```
+    pub fn warning(title: impl Into<String>) -> Self {
+        Self::new(DiagnosticKind::Warning, title)
+    }
+
+    /// Create an info diagnostic builder.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let info = DiagnosticMessageBuilder::info("Processing complete")
+    ///     .build();
+    /// ```
+    pub fn info(title: impl Into<String>) -> Self {
+        Self::new(DiagnosticKind::Info, title)
+    }
+
+    /// Set the error code.
+    ///
+    /// Error codes follow the format `Q-<subsystem>-<number>` (e.g., "Q-1-1").
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("YAML Syntax Error")
+    ///     .with_code("Q-1-1")
+    ///     .build();
+    ///
+    /// assert_eq!(error.code, Some("Q-1-1".to_string()));
+    /// ```
+    pub fn with_code(mut self, code: impl Into<String>) -> Self {
+        self.code = Some(code.into());
+        self
+    }
+
+    /// Attach a source location to this diagnostic.
+    ///
+    /// The location identifies where in the source code the issue occurred.
+    /// The location may track transformation history, allowing the error to be
+    /// mapped back through multiple processing steps to the original source file.
+    ///
+    /// # Example
+    ///
+    /// ```ignore
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    /// use quarto_source_map::{SourceInfo, SourceContext, FileId, Range, Location};
+    ///
+    /// let mut ctx = SourceContext::new();
+    /// let file_id = ctx.add_file("test.qmd".into(), Some("content".into()));
+    /// let range = Range {
+    ///     start: Location { offset: 0, row: 0, column: 0 },
+    ///     end: Location { offset: 7, row: 0, column: 7 },
+    /// };
+    /// let source_info = SourceInfo::original(file_id, range);
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Parse error")
+    ///     .with_location(source_info)
+    ///     .build();
+    /// ```
+    pub fn with_location(mut self, location: quarto_source_map::SourceInfo) -> Self {
+        self.location = Some(location);
+        self
+    }
+
+    /// Set the problem statement.
+    ///
+    /// Following tidyverse guidelines, the problem statement should:
+    /// - Start with a general, concise statement
+    /// - Use "must" for requirements or "can't" for impossibilities
+    /// - Be specific about types/expectations
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Invalid input")
+    ///     .problem("`n` must be a numeric vector, not a character vector")
+    ///     .build();
+    /// ```
+    pub fn problem(mut self, stmt: impl Into<MessageContent>) -> Self {
+        self.problem = Some(stmt.into());
+        self
+    }
+
+    /// Add an error detail (displayed with error/cross bullet).
+    ///
+    /// Error details provide specific information about what went wrong.
+    /// Following tidyverse guidelines:
+    /// - Keep sentences short and specific
+    /// - Reveal location, name, or content of problematic input
+    /// - Limit to 5 total details (error + info) to avoid overwhelming users
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Incompatible lengths")
+    ///     .add_detail("`x` has length 3")
+    ///     .add_detail("`y` has length 5")
+    ///     .build();
+    ///
+    /// assert_eq!(error.details.len(), 2);
+    /// ```
+    pub fn add_detail(mut self, detail: impl Into<MessageContent>) -> Self {
+        self.details.push(DetailItem {
+            kind: DetailKind::Error,
+            content: detail.into(),
+        });
+        self
+    }
+
+    /// Add an info detail (displayed with info bullet).
+    ///
+    /// Info details provide additional context or explanatory information.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Missing file")
+    ///     .add_detail("Could not find `config.yaml`")
+    ///     .add_info("Default configuration will be used")
+    ///     .build();
+    /// ```
+    pub fn add_info(mut self, info: impl Into<MessageContent>) -> Self {
+        self.details.push(DetailItem {
+            kind: DetailKind::Info,
+            content: info.into(),
+        });
+        self
+    }
+
+    /// Add a note detail (displayed with plain bullet).
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Parse error")
+    ///     .add_note("This is an experimental feature")
+    ///     .build();
+    /// ```
+    pub fn add_note(mut self, note: impl Into<MessageContent>) -> Self {
+        self.details.push(DetailItem {
+            kind: DetailKind::Note,
+            content: note.into(),
+        });
+        self
+    }
+
+    /// Add a hint for fixing the error.
+    ///
+    /// Following tidyverse guidelines, hints should:
+    /// - Only be included when the problem source is clear and common
+    /// - Provide straightforward fix suggestions
+    /// - End with a question mark if suggesting action
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Function not found")
+    ///     .problem("Could not find function `summarise()`")
+    ///     .add_hint("Did you mean `summarize()`?")
+    ///     .build();
+    ///
+    /// assert_eq!(error.hints.len(), 1);
+    /// ```
+    pub fn add_hint(mut self, hint: impl Into<MessageContent>) -> Self {
+        self.hints.push(hint.into());
+        self
+    }
+
+    /// Build the diagnostic message.
+    ///
+    /// This consumes the builder and returns the constructed `DiagnosticMessage`.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Parse error")
+    ///     .problem("Invalid syntax")
+    ///     .build();
+    ///
+    /// assert_eq!(error.title, "Parse error");
+    /// ```
+    pub fn build(self) -> DiagnosticMessage {
+        DiagnosticMessage {
+            code: self.code,
+            title: self.title,
+            kind: self.kind,
+            problem: self.problem,
+            details: self.details,
+            hints: self.hints,
+            location: self.location,
+        }
+    }
+
+    /// Build with validation.
+    ///
+    /// This validates the message structure according to tidyverse guidelines:
+    /// - Warns if there's no problem statement (recommended but not required)
+    /// - Warns if there are more than 5 details (overwhelming for users)
+    /// - Future: Could check that hints end with '?'
+    ///
+    /// Returns warnings as a Vec of strings. An empty Vec means validation passed.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessageBuilder;
+    ///
+    /// let (error, warnings) = DiagnosticMessageBuilder::error("Test error")
+    ///     .build_with_validation();
+    ///
+    /// // Warns because there's no problem statement
+    /// assert!(!warnings.is_empty());
+    /// ```
+    pub fn build_with_validation(self) -> (DiagnosticMessage, Vec<String>) {
+        let mut warnings = Vec::new();
+
+        // Check for problem statement
+        if self.problem.is_none() {
+            warnings.push(
+                "Error message missing problem statement. \
+                 Consider adding .problem() to explain what went wrong."
+                    .to_string(),
+            );
+        }
+
+        // Check detail count (tidyverse recommends max 5)
+        if self.details.len() > 5 {
+            warnings.push(format!(
+                "Error message has {} details. Tidyverse guidelines recommend max 5 to avoid \
+                 overwhelming users.",
+                self.details.len()
+            ));
+        }
+
+        (self.build(), warnings)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_builder_error() {
+        let msg = DiagnosticMessageBuilder::error("Test error").build();
+        assert_eq!(msg.title, "Test error");
+        assert_eq!(msg.kind, DiagnosticKind::Error);
+    }
+
+    #[test]
+    fn test_builder_warning() {
+        let msg = DiagnosticMessageBuilder::warning("Test warning").build();
+        assert_eq!(msg.kind, DiagnosticKind::Warning);
+    }
+
+    #[test]
+    fn test_builder_info() {
+        let msg = DiagnosticMessageBuilder::info("Test info").build();
+        assert_eq!(msg.kind, DiagnosticKind::Info);
+    }
+
+    #[test]
+    fn test_builder_with_code() {
+        let msg = DiagnosticMessageBuilder::error("Test")
+            .with_code("Q-1-1")
+            .build();
+        assert_eq!(msg.code, Some("Q-1-1".to_string()));
+    }
+
+    #[test]
+    fn test_builder_problem() {
+        let msg = DiagnosticMessageBuilder::error("Test")
+            .problem("Something went wrong")
+            .build();
+        assert!(msg.problem.is_some());
+        assert_eq!(msg.problem.unwrap().as_str(), "Something went wrong");
+    }
+
+    #[test]
+    fn test_builder_details() {
+        let msg = DiagnosticMessageBuilder::error("Test")
+            .add_detail("Detail 1")
+            .add_info("Info 1")
+            .add_note("Note 1")
+            .build();
+
+        assert_eq!(msg.details.len(), 3);
+        assert_eq!(msg.details[0].kind, DetailKind::Error);
+        assert_eq!(msg.details[1].kind, DetailKind::Info);
+        assert_eq!(msg.details[2].kind, DetailKind::Note);
+    }
+
+    #[test]
+    fn test_builder_hints() {
+        let msg = DiagnosticMessageBuilder::error("Test")
+            .add_hint("Did you mean X?")
+            .add_hint("Try Y instead")
+            .build();
+
+        assert_eq!(msg.hints.len(), 2);
+    }
+
+    #[test]
+    fn test_builder_complete_message() {
+        let msg = DiagnosticMessageBuilder::error("Incompatible types")
+            .with_code("Q-1-2")
.problem("Cannot combine date and datetime types") + .add_detail("`x` has type `date`") + .add_detail("`y` has type `datetime`") + .add_hint("Convert both to the same type?") + .build(); + + assert_eq!(msg.title, "Incompatible types"); + assert_eq!(msg.code, Some("Q-1-2".to_string())); + assert!(msg.problem.is_some()); + assert_eq!(msg.details.len(), 2); + assert_eq!(msg.hints.len(), 1); + } + + #[test] + fn test_builder_validation_no_problem() { + let (msg, warnings) = DiagnosticMessageBuilder::error("Test").build_with_validation(); + + assert_eq!(msg.title, "Test"); + assert!(!warnings.is_empty()); + assert!(warnings[0].contains("missing problem statement")); + } + + #[test] + fn test_builder_validation_too_many_details() { + let (_msg, warnings) = DiagnosticMessageBuilder::error("Test") + .problem("Something wrong") + .add_detail("1") + .add_detail("2") + .add_detail("3") + .add_detail("4") + .add_detail("5") + .add_detail("6") + .build_with_validation(); + + assert!(!warnings.is_empty()); + assert!(warnings[0].contains("6 details")); + assert!(warnings[0].contains("max 5")); + } + + #[test] + fn test_builder_validation_passes() { + let (_msg, warnings) = DiagnosticMessageBuilder::error("Test") + .problem("Something wrong") + .add_detail("Detail") + .build_with_validation(); + + assert!(warnings.is_empty()); + } +} diff --git a/crates/quarto-error-reporting/src/catalog.rs b/crates/quarto-error-reporting/src/catalog.rs new file mode 100644 index 0000000..82e17e7 --- /dev/null +++ b/crates/quarto-error-reporting/src/catalog.rs @@ -0,0 +1,138 @@ +//! Error code catalog and lookup. +//! +//! This module provides access to the centralized error catalog, which maps +//! error codes (like "Q-1-1") to their metadata (title, message template, docs URL, etc.). + +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +/// Metadata for an error code. +/// +/// Each entry in the error catalog describes a specific error code, +/// including its subsystem, title, default message, and documentation URL. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct ErrorCodeInfo { + /// Subsystem name (e.g., "yaml", "markdown", "engine") + pub subsystem: String, + + /// Short title for the error + pub title: String, + + /// Default message template (may include placeholders) + pub message_template: String, + + /// URL to documentation (optional) + #[serde(skip_serializing_if = "Option::is_none")] + pub docs_url: Option, + + /// When this error was introduced (version) + pub since_version: String, +} + +/// Global error catalog, loaded lazily from JSON at compile time. +/// +/// The catalog is loaded from `error_catalog.json` using `include_str!()`, +/// which embeds the JSON at compile time. This means no runtime file I/O. +/// +/// # Panics +/// +/// Panics if the embedded JSON is invalid. This should only happen during +/// development if someone manually edits the catalog incorrectly. +pub static ERROR_CATALOG: Lazy> = Lazy::new(|| { + let json_data = include_str!("../error_catalog.json"); + serde_json::from_str(json_data).expect("Invalid error catalog JSON - this is a bug in Quarto") +}); + +/// Look up error code information. +/// +/// Returns `None` if the error code is not found in the catalog. 
+/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::catalog::get_error_info; +/// +/// if let Some(info) = get_error_info("Q-0-1") { +/// println!("Error: {} - {}", info.title, info.message_template); +/// } +/// ``` +pub fn get_error_info(code: &str) -> Option<&ErrorCodeInfo> { + ERROR_CATALOG.get(code) +} + +/// Get documentation URL for an error code. +/// +/// Returns `None` if the error code is not found or has no documentation URL. +/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::catalog::get_docs_url; +/// +/// if let Some(url) = get_docs_url("Q-0-1") { +/// println!("See {} for more information", url); +/// } +/// ``` +pub fn get_docs_url(code: &str) -> Option<&str> { + ERROR_CATALOG + .get(code) + .and_then(|info| info.docs_url.as_deref()) +} + +/// Get the subsystem name for an error code. +/// +/// Returns `None` if the error code is not found. +/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::catalog::get_subsystem; +/// +/// assert_eq!(get_subsystem("Q-0-1"), Some("internal")); +/// ``` +pub fn get_subsystem(code: &str) -> Option<&str> { + ERROR_CATALOG.get(code).map(|info| info.subsystem.as_str()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_catalog_loads() { + // Just accessing ERROR_CATALOG will trigger loading + // If the JSON is invalid, this will panic + assert!(!ERROR_CATALOG.is_empty()); + } + + #[test] + fn test_internal_error_exists() { + let info = get_error_info("Q-0-1"); + assert!(info.is_some()); + + let info = info.unwrap(); + assert_eq!(info.subsystem, "internal"); + assert_eq!(info.title, "Internal Error"); + assert!(info.docs_url.is_some()); + } + + #[test] + fn test_get_docs_url() { + let url = get_docs_url("Q-0-1"); + assert!(url.is_some()); + assert!(url.unwrap().starts_with("https://quarto.org/docs/errors/")); + } + + #[test] + fn test_get_subsystem() { + assert_eq!(get_subsystem("Q-0-1"), Some("internal")); + assert_eq!(get_subsystem("Q-999-999"), None); + } + + #[test] + fn test_nonexistent_code() { + assert!(get_error_info("Q-999-999").is_none()); + assert!(get_docs_url("Q-999-999").is_none()); + } +} diff --git a/crates/quarto-error-reporting/src/diagnostic.rs b/crates/quarto-error-reporting/src/diagnostic.rs new file mode 100644 index 0000000..08cf24f --- /dev/null +++ b/crates/quarto-error-reporting/src/diagnostic.rs @@ -0,0 +1,716 @@ +//! Core diagnostic message types. +//! +//! This module defines the fundamental structures for representing diagnostic messages +//! (errors, warnings, info) following tidyverse-style guidelines. + +use serde::{Deserialize, Serialize}; + +/// The kind of diagnostic message. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum DiagnosticKind { + /// An error that prevents completion + Error, + /// A warning that doesn't prevent completion but indicates a problem + Warning, + /// Informational message + Info, + /// A note providing additional context + Note, +} + +/// How detail items should be presented (tidyverse x/i bullet style). +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum DetailKind { + /// Error detail (✖ bullet in tidyverse style) + Error, + /// Info detail (i bullet in tidyverse style) + Info, + /// Note detail (plain bullet) + Note, +} + +/// The content of a message or detail item. +/// +/// This will eventually support Pandoc AST for rich formatting, but starts +/// with simpler string-based content. 
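+///
+/// A short doc sketch of the conversions defined below (both `From` impls
+/// produce the `Markdown` variant):
+///
+/// ```
+/// use quarto_error_reporting::MessageContent;
+///
+/// let content: MessageContent = "a *markdown* string".into();
+/// assert_eq!(content.as_str(), "a *markdown* string");
+/// ```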
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub enum MessageContent {
+    /// Plain text content
+    Plain(String),
+    /// Markdown content (will be parsed to Pandoc AST in later phases)
+    Markdown(String),
+    // Future: PandocAst(Box)
+}
+
+impl MessageContent {
+    /// Get the raw string content for display
+    pub fn as_str(&self) -> &str {
+        match self {
+            MessageContent::Plain(s) => s,
+            MessageContent::Markdown(s) => s,
+        }
+    }
+
+    /// Convert to JSON value with type information
+    pub fn to_json(&self) -> serde_json::Value {
+        use serde_json::json;
+        match self {
+            MessageContent::Plain(s) => json!({
+                "type": "plain",
+                "content": s
+            }),
+            MessageContent::Markdown(s) => json!({
+                "type": "markdown",
+                "content": s
+            }),
+        }
+    }
+}
+
+impl From<String> for MessageContent {
+    fn from(s: String) -> Self {
+        MessageContent::Markdown(s)
+    }
+}
+
+impl From<&str> for MessageContent {
+    fn from(s: &str) -> Self {
+        MessageContent::Markdown(s.to_string())
+    }
+}
+
+/// A detail item in a diagnostic message.
+///
+/// Following tidyverse guidelines, details provide specific information about
+/// the error (what went wrong, where, with what values).
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+pub struct DetailItem {
+    /// The kind of detail (error, info, note)
+    pub kind: DetailKind,
+    /// The content of the detail
+    pub content: MessageContent,
+    // Future: Optional source span for details that point to specific code locations
+    // pub span: Option,
+}
+
+/// A diagnostic message following tidyverse-style structure.
+///
+/// Structure:
+/// 1. **Code**: Optional error code (e.g., "Q-1-1") for searchability
+/// 2. **Title**: Brief error message
+/// 3. **Kind**: Error, Warning, Info
+/// 4. **Problem**: What went wrong (the "must" or "can't" statement)
+/// 5. **Details**: Specific information (bulleted, max 5 per tidyverse)
+/// 6. **Hints**: Optional guidance for fixing (ends with ?)
+///
+/// # Example
+///
+/// ```ignore
+/// let msg = DiagnosticMessage {
+///     code: Some("Q-1-2".to_string()),
+///     title: "Incompatible types".to_string(),
+///     kind: DiagnosticKind::Error,
+///     problem: Some("Cannot combine date and datetime types".into()),
+///     details: vec![
+///         DetailItem {
+///             kind: DetailKind::Error,
+///             content: "`x`{.arg} has type `date`{.type}".into(),
+///         },
+///         DetailItem {
+///             kind: DetailKind::Error,
+///             content: "`y`{.arg} has type `datetime`{.type}".into(),
+///         },
+///     ],
+///     hints: vec!["Convert both to the same type?".into()],
+///     location: None,
+/// };
+/// ```
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+pub struct DiagnosticMessage {
+    /// Optional error code (e.g., "Q-1-1")
+    ///
+    /// Error codes are optional but encouraged. They provide:
+    /// - Searchability (users can Google "Q-1-1")
+    /// - Stability (codes don't change even if message wording improves)
+    /// - Documentation (each code maps to a detailed explanation)
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub code: Option<String>,
+
+    /// Brief title for the error
+    pub title: String,
+
+    /// The kind of diagnostic (Error, Warning, Info)
+    pub kind: DiagnosticKind,
+
+    /// The problem statement (the "what" - using "must" or "can't")
+    pub problem: Option<MessageContent>,
+
+    /// Specific error details (the "where/why" - max 5 per tidyverse)
+    pub details: Vec<DetailItem>,
+
+    /// Optional hints for fixing (ends with ?)
+    pub hints: Vec<MessageContent>,
+
+    /// Source location for this diagnostic
+    ///
+    /// When present, this identifies where in the source code the issue occurred.
+    /// The location may track transformation history, allowing the error to be
+    /// mapped back through multiple processing steps to the original source file.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub location: Option<quarto_source_map::SourceInfo>,
+}
+
+impl DiagnosticMessage {
+    /// Access the diagnostic message builder API.
+    ///
+    /// This is the recommended way to create diagnostic messages, as the builder API
+    /// encodes tidyverse-style guidelines and makes it easy to construct well-structured
+    /// error messages.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::{DiagnosticMessage, DiagnosticMessageBuilder};
+    ///
+    /// let error = DiagnosticMessageBuilder::error("Incompatible types")
+    ///     .with_code("Q-1-2")
+    ///     .problem("Cannot combine date and datetime types")
+    ///     .add_detail("`x` has type `date`")
+    ///     .add_detail("`y` has type `datetime`")
+    ///     .add_hint("Convert both to the same type?")
+    ///     .build();
+    /// ```
+    pub fn builder() -> crate::builder::DiagnosticMessageBuilder {
+        // This is just a convenience for accessing the builder type
+        // Users should call DiagnosticMessageBuilder::error() etc directly
+        crate::builder::DiagnosticMessageBuilder::error("")
+    }
+
+    /// Create a new diagnostic message with just a title and kind.
+    ///
+    /// Note: Consider using `DiagnosticMessage::builder()` instead for better structure.
+    pub fn new(kind: DiagnosticKind, title: impl Into<String>) -> Self {
+        Self {
+            code: None,
+            title: title.into(),
+            kind,
+            problem: None,
+            details: Vec::new(),
+            hints: Vec::new(),
+            location: None,
+        }
+    }
+
+    /// Create an error diagnostic.
+    ///
+    /// Note: Consider using `DiagnosticMessage::builder().error()` instead for better structure.
+    pub fn error(title: impl Into<String>) -> Self {
+        Self::new(DiagnosticKind::Error, title)
+    }
+
+    /// Create a warning diagnostic.
+    ///
+    /// Note: Consider using `DiagnosticMessage::builder().warning()` instead for better structure.
+    pub fn warning(title: impl Into<String>) -> Self {
+        Self::new(DiagnosticKind::Warning, title)
+    }
+
+    /// Create an info diagnostic.
+    ///
+    /// Note: Consider using `DiagnosticMessage::builder().info()` instead for better structure.
+    pub fn info(title: impl Into<String>) -> Self {
+        Self::new(DiagnosticKind::Info, title)
+    }
+
+    /// Set the error code.
+    ///
+    /// Error codes follow the format `Q-<subsystem>-<number>` (e.g., "Q-1-1").
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessage;
+    ///
+    /// let msg = DiagnosticMessage::error("YAML Syntax Error")
+    ///     .with_code("Q-1-1");
+    /// ```
+    pub fn with_code(mut self, code: impl Into<String>) -> Self {
+        self.code = Some(code.into());
+        self
+    }
+
+    /// Get the documentation URL for this error, if it has an error code.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use quarto_error_reporting::DiagnosticMessage;
+    ///
+    /// let msg = DiagnosticMessage::error("Internal Error")
+    ///     .with_code("Q-0-1");
+    ///
+    /// assert!(msg.docs_url().is_some());
+    /// ```
+    pub fn docs_url(&self) -> Option<&str> {
+        self.code
+            .as_ref()
+            .and_then(|code| crate::catalog::get_docs_url(code))
+    }
+
+    /// Render this diagnostic message as text following tidyverse style.
+    ///
+    /// Format:
+    /// ```text
+    /// Error: title
+    /// Problem statement here
+    /// ✖ Error detail 1
+    /// ✖ Error detail 2
+    /// ℹ Info detail
+    /// • Note detail
+    /// ? Hint 1
+    /// ?
Hint 2 + /// ``` + /// + /// # Example + /// + /// ``` + /// use quarto_error_reporting::DiagnosticMessageBuilder; + /// + /// let msg = DiagnosticMessageBuilder::error("Invalid input") + /// .problem("Values must be numeric") + /// .add_detail("Found text in column 3") + /// .add_hint("Convert to numbers first?") + /// .build(); + /// let text = msg.to_text(None); + /// assert!(text.contains("Error: Invalid input")); + /// assert!(text.contains("Values must be numeric")); + /// ``` + pub fn to_text(&self, ctx: Option<&quarto_source_map::SourceContext>) -> String { + use std::fmt::Write; + + let mut result = String::new(); + + // Title line with kind + let kind_str = match self.kind { + DiagnosticKind::Error => "Error", + DiagnosticKind::Warning => "Warning", + DiagnosticKind::Info => "Info", + DiagnosticKind::Note => "Note", + }; + + if let Some(code) = &self.code { + write!(result, "{} [{}]: {}", kind_str, code, self.title).unwrap(); + } else { + write!(result, "{}: {}", kind_str, self.title).unwrap(); + } + + // Add location if present + if let Some(loc) = &self.location { + if let Some(ctx) = ctx { + // Try to map to original source + if let Some(mapped) = loc.map_offset(loc.range.start.offset, ctx) { + if let Some(file) = ctx.get_file(mapped.file_id) { + write!( + result, + " at {}:{}:{}", + file.path, + mapped.location.row + 1, // Display as 1-based + mapped.location.column + 1 + ) + .unwrap(); + } + } + } else { + // No context, show immediate location + write!( + result, + " at {}:{}", + loc.range.start.row + 1, + loc.range.start.column + 1 + ) + .unwrap(); + } + } + + // Problem statement + if let Some(problem) = &self.problem { + write!(result, "\n{}", problem.as_str()).unwrap(); + } + + // Details with appropriate bullets + for detail in &self.details { + let bullet = match detail.kind { + DetailKind::Error => "✖", + DetailKind::Info => "ℹ", + DetailKind::Note => "•", + }; + write!(result, "\n{} {}", bullet, detail.content.as_str()).unwrap(); + } + + // Hints + for hint in &self.hints { + write!(result, "\n? {}", hint.as_str()).unwrap(); + } + + result + } + + /// Render this diagnostic message as a JSON value. 
+ /// + /// Returns a structured JSON object with all fields: + /// ```json + /// { + /// "kind": "error", + /// "title": "Invalid input", + /// "code": "Q-1-2", + /// "problem": "Values must be numeric", + /// "details": [{"kind": "error", "content": "Found text in column 3"}], + /// "hints": ["Convert to numbers first?"] + /// } + /// ``` + /// + /// # Example + /// + /// ``` + /// use quarto_error_reporting::DiagnosticMessage; + /// + /// let msg = DiagnosticMessage::error("Something went wrong"); + /// let json = msg.to_json(); + /// assert_eq!(json["kind"], "error"); + /// assert_eq!(json["title"], "Something went wrong"); + /// ``` + pub fn to_json(&self) -> serde_json::Value { + use serde_json::json; + + let kind_str = match self.kind { + DiagnosticKind::Error => "error", + DiagnosticKind::Warning => "warning", + DiagnosticKind::Info => "info", + DiagnosticKind::Note => "note", + }; + + let mut obj = json!({ + "kind": kind_str, + "title": self.title, + }); + + // Add optional fields + if let Some(code) = &self.code { + obj["code"] = json!(code); + } + + if let Some(problem) = &self.problem { + obj["problem"] = problem.to_json(); + } + + if !self.details.is_empty() { + let details: Vec<_> = self + .details + .iter() + .map(|d| { + let detail_kind = match d.kind { + DetailKind::Error => "error", + DetailKind::Info => "info", + DetailKind::Note => "note", + }; + json!({ + "kind": detail_kind, + "content": d.content.to_json() + }) + }) + .collect(); + obj["details"] = json!(details); + } + + if !self.hints.is_empty() { + let hints: Vec<_> = self.hints.iter().map(|h| h.to_json()).collect(); + obj["hints"] = json!(hints); + } + + if let Some(location) = &self.location { + obj["location"] = json!(location); // quarto-source-map::SourceInfo is Serialize + } + + obj + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_diagnostic_kind() { + assert_eq!(DiagnosticKind::Error, DiagnosticKind::Error); + assert_ne!(DiagnosticKind::Error, DiagnosticKind::Warning); + } + + #[test] + fn test_message_content_from_str() { + let content: MessageContent = "test".into(); + assert_eq!(content.as_str(), "test"); + } + + #[test] + fn test_diagnostic_message_new() { + let msg = DiagnosticMessage::new(DiagnosticKind::Error, "Test error"); + assert_eq!(msg.title, "Test error"); + assert_eq!(msg.kind, DiagnosticKind::Error); + assert!(msg.code.is_none()); + assert!(msg.problem.is_none()); + assert!(msg.details.is_empty()); + assert!(msg.hints.is_empty()); + } + + #[test] + fn test_diagnostic_message_constructors() { + let error = DiagnosticMessage::error("Error"); + assert_eq!(error.kind, DiagnosticKind::Error); + assert!(error.code.is_none()); + + let warning = DiagnosticMessage::warning("Warning"); + assert_eq!(warning.kind, DiagnosticKind::Warning); + + let info = DiagnosticMessage::info("Info"); + assert_eq!(info.kind, DiagnosticKind::Info); + } + + #[test] + fn test_with_code() { + let msg = DiagnosticMessage::error("Test error").with_code("Q-1-1"); + assert_eq!(msg.code, Some("Q-1-1".to_string())); + } + + #[test] + fn test_docs_url() { + let msg = DiagnosticMessage::error("Internal Error").with_code("Q-0-1"); + assert!(msg.docs_url().is_some()); + assert!(msg.docs_url().unwrap().contains("Q-0-1")); + } + + #[test] + fn test_docs_url_without_code() { + let msg = DiagnosticMessage::error("Test error"); + assert!(msg.docs_url().is_none()); + } + + #[test] + fn test_docs_url_invalid_code() { + let msg = DiagnosticMessage::error("Test error").with_code("Q-999-999"); + 
assert!(msg.docs_url().is_none()); + } + + #[test] + fn test_to_text_simple_error() { + let msg = DiagnosticMessage::error("Something went wrong"); + assert_eq!(msg.to_text(None), "Error: Something went wrong"); + } + + #[test] + fn test_to_text_with_code() { + let msg = DiagnosticMessage::error("Something went wrong").with_code("Q-1-1"); + assert_eq!(msg.to_text(None), "Error [Q-1-1]: Something went wrong"); + } + + #[test] + fn test_to_text_full_message() { + use crate::builder::DiagnosticMessageBuilder; + + let msg = DiagnosticMessageBuilder::error("Invalid input") + .problem("Values must be numeric") + .add_detail("Found text in column 3") + .add_info("Columns should contain only numbers") + .add_hint("Convert to numbers first?") + .build(); + + let text = msg.to_text(None); + assert!(text.contains("Error: Invalid input")); + assert!(text.contains("Values must be numeric")); + assert!(text.contains("✖ Found text in column 3")); + assert!(text.contains("ℹ Columns should contain only numbers")); + assert!(text.contains("? Convert to numbers first?")); + } + + #[test] + fn test_to_json_simple() { + let msg = DiagnosticMessage::error("Something went wrong"); + let json = msg.to_json(); + + assert_eq!(json["kind"], "error"); + assert_eq!(json["title"], "Something went wrong"); + assert!(json.get("code").is_none()); + assert!(json.get("problem").is_none()); + } + + #[test] + fn test_to_json_with_code() { + let msg = DiagnosticMessage::error("Something went wrong").with_code("Q-1-1"); + let json = msg.to_json(); + + assert_eq!(json["kind"], "error"); + assert_eq!(json["title"], "Something went wrong"); + assert_eq!(json["code"], "Q-1-1"); + } + + #[test] + fn test_to_json_full_message() { + use crate::builder::DiagnosticMessageBuilder; + + let msg = DiagnosticMessageBuilder::error("Invalid input") + .with_code("Q-1-2") + .problem("Values must be numeric") + .add_detail("Found text in column 3") + .add_info("Expected numbers") + .add_hint("Convert to numbers first?") + .build(); + + let json = msg.to_json(); + assert_eq!(json["kind"], "error"); + assert_eq!(json["title"], "Invalid input"); + assert_eq!(json["code"], "Q-1-2"); + assert_eq!(json["problem"]["type"], "markdown"); + assert_eq!(json["problem"]["content"], "Values must be numeric"); + assert_eq!(json["details"][0]["kind"], "error"); + assert_eq!(json["details"][0]["content"]["type"], "markdown"); + assert_eq!( + json["details"][0]["content"]["content"], + "Found text in column 3" + ); + assert_eq!(json["details"][1]["kind"], "info"); + assert_eq!(json["details"][1]["content"]["type"], "markdown"); + assert_eq!(json["details"][1]["content"]["content"], "Expected numbers"); + assert_eq!(json["hints"][0]["type"], "markdown"); + assert_eq!(json["hints"][0]["content"], "Convert to numbers first?"); + } + + #[test] + fn test_to_json_warning() { + let msg = DiagnosticMessage::warning("Be careful"); + let json = msg.to_json(); + + assert_eq!(json["kind"], "warning"); + assert_eq!(json["title"], "Be careful"); + } + + #[test] + fn test_location_in_to_text_without_context() { + use crate::builder::DiagnosticMessageBuilder; + + // Create a location at row 10, column 5 + let location = quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + quarto_source_map::Range { + start: quarto_source_map::Location { + offset: 100, + row: 10, + column: 5, + }, + end: quarto_source_map::Location { + offset: 110, + row: 10, + column: 15, + }, + }, + ); + + let msg = DiagnosticMessageBuilder::error("Invalid syntax") + 
.with_location(location) + .build(); + + let text = msg.to_text(None); + + // Without context, should show immediate location (1-indexed) + assert!(text.contains("Invalid syntax")); + assert!(text.contains("at 11:6")); // row 10 + 1, column 5 + 1 + } + + #[test] + fn test_location_in_to_text_with_context() { + use crate::builder::DiagnosticMessageBuilder; + + // Create a source context with a file + let mut ctx = quarto_source_map::SourceContext::new(); + let file_id = ctx.add_file( + "test.qmd".to_string(), + Some("line 1\nline 2\nline 3\nline 4".to_string()), + ); + + // Create a location in that file + let location = quarto_source_map::SourceInfo::original( + file_id, + quarto_source_map::Range { + start: quarto_source_map::Location { + offset: 7, // Start of "line 2" + row: 1, + column: 0, + }, + end: quarto_source_map::Location { + offset: 13, + row: 1, + column: 6, + }, + }, + ); + + let msg = DiagnosticMessageBuilder::error("Invalid syntax") + .with_location(location) + .build(); + + let text = msg.to_text(Some(&ctx)); + + // With context, should show file path and 1-indexed location + assert!(text.contains("Invalid syntax")); + assert!(text.contains("test.qmd")); + assert!(text.contains("2:1")); // row 1 + 1, column 0 + 1 + } + + #[test] + fn test_location_in_to_json() { + use crate::builder::DiagnosticMessageBuilder; + + let location = quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + quarto_source_map::Range { + start: quarto_source_map::Location { + offset: 100, + row: 10, + column: 5, + }, + end: quarto_source_map::Location { + offset: 110, + row: 10, + column: 15, + }, + }, + ); + + let msg = DiagnosticMessageBuilder::error("Invalid syntax") + .with_location(location) + .build(); + + let json = msg.to_json(); + + // Should have location field with range info + assert!(json.get("location").is_some()); + let loc = &json["location"]; + assert!(loc.get("range").is_some()); + + // Verify the range is serialized correctly + let range = &loc["range"]; + assert_eq!(range["start"]["row"], 10); + assert_eq!(range["start"]["column"], 5); + assert_eq!(range["start"]["offset"], 100); + assert_eq!(range["end"]["row"], 10); + assert_eq!(range["end"]["column"], 15); + assert_eq!(range["end"]["offset"], 110); + } + + #[test] + fn test_location_optional_in_to_json() { + let msg = DiagnosticMessage::error("No location"); + let json = msg.to_json(); + + // Should not have location field when not provided + assert!(json.get("location").is_none()); + } +} diff --git a/crates/quarto-error-reporting/src/lib.rs b/crates/quarto-error-reporting/src/lib.rs new file mode 100644 index 0000000..9be921e --- /dev/null +++ b/crates/quarto-error-reporting/src/lib.rs @@ -0,0 +1,67 @@ +//! Error reporting and diagnostic messages for Quarto. +//! +//! This crate provides a structured approach to error reporting, inspired by: +//! - **ariadne**: Visual compiler-quality error messages with source context +//! - **R cli package**: Semantic, structured text output +//! - **Tidyverse style guide**: Best practices for error message content +//! +//! # Architecture +//! +//! The crate is organized into several phases: +//! +//! ## Phase 1: Core Types (Current) +//! - [`DiagnosticMessage`]: The main error message structure +//! - [`MessageContent`]: Content representation (Plain, Markdown, or Pandoc AST) +//! - [`DetailItem`]: Individual detail bullets with error/info/note kinds +//! - [`DiagnosticKind`]: Error, Warning, Info, etc. +//! +//! ## Phase 2: Rendering (Planned) +//! 
- Integration with ariadne for visual terminal output +//! - JSON serialization for machine-readable output +//! +//! ## Phase 3: Console Helpers (Planned) +//! - High-level console output primitives +//! - ANSI writer for Pandoc AST (requires discussion) +//! +//! ## Phase 4: Builder API (Planned) +//! - Tidyverse-style builder methods (`.problem()`, `.add_detail()`, `.add_hint()`) +//! +//! # Design Decisions +//! +//! - **Markdown-first**: Messages use Markdown strings, converted to Pandoc AST internally +//! - **Semantic markup**: Use Pandoc span syntax for semantic classes: `` `text`{.class} `` +//! - **Multiple outputs**: ANSI terminal, HTML, and JSON formats +//! - **Rust-idiomatic**: Designed for Rust ergonomics (WASM for cross-language if needed) +//! +//! # Example Usage (Future) +//! +//! ```ignore +//! use quarto_error_reporting::DiagnosticMessage; +//! +//! let error = DiagnosticMessage::builder() +//! .error("Unclosed code block") +//! .problem("Code block started but never closed") +//! .add_detail("The code block starting with `` ```{python} `` was never closed") +//! .at_location(opening_span) +//! .add_hint("Did you forget the closing `` ``` ``?") +//! .build()?; +//! +//! console.error(&error); +//! ``` + +// Phase 1: Core error types +pub mod diagnostic; + +// Error code catalog +pub mod catalog; + +// Phase 4: Builder API +pub mod builder; + +// Macros for convenient error creation +pub mod macros; + +// Re-export main types for convenience +pub use builder::DiagnosticMessageBuilder; +pub use catalog::{ERROR_CATALOG, ErrorCodeInfo, get_docs_url, get_error_info, get_subsystem}; +pub use diagnostic::{DetailItem, DetailKind, DiagnosticKind, DiagnosticMessage, MessageContent}; diff --git a/crates/quarto-error-reporting/src/macros.rs b/crates/quarto-error-reporting/src/macros.rs new file mode 100644 index 0000000..2971abb --- /dev/null +++ b/crates/quarto-error-reporting/src/macros.rs @@ -0,0 +1,45 @@ +//! Macros for creating diagnostic messages. + +/// Create a generic error with automatic file and line information. +/// +/// This macro is for migration purposes - it creates an error with code Q-0-99 +/// and automatically includes the file and line number where the error was created. +/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::generic_error; +/// +/// let error = generic_error!("Found unexpected attribute"); +/// assert_eq!(error.code, Some("Q-0-99".to_string())); +/// assert!(error.title.contains("Found unexpected attribute")); +/// assert!(error.title.contains(file!())); +/// ``` +#[macro_export] +macro_rules! generic_error { + ($message:expr) => { + $crate::DiagnosticMessageBuilder::generic_error($message, file!(), line!()) + }; +} + +/// Create a generic warning with automatic file and line information. +/// +/// This macro is for migration purposes - it creates a warning with code Q-0-99 +/// and automatically includes the file and line number where the warning was created. +/// +/// # Example +/// +/// ``` +/// use quarto_error_reporting::generic_warning; +/// +/// let warning = generic_warning!("Caption found without table"); +/// assert_eq!(warning.code, Some("Q-0-99".to_string())); +/// assert!(warning.title.contains("Caption found without table")); +/// assert!(warning.title.contains(file!())); +/// ``` +#[macro_export] +macro_rules! 
generic_warning {
+    ($message:expr) => {
+        $crate::DiagnosticMessageBuilder::generic_warning($message, file!(), line!())
+    };
+}
diff --git a/crates/quarto-markdown-pandoc/CLAUDE.md b/crates/quarto-markdown-pandoc/CLAUDE.md
index 6bc6705..e959b37 100644
--- a/crates/quarto-markdown-pandoc/CLAUDE.md
+++ b/crates/quarto-markdown-pandoc/CLAUDE.md
@@ -87,5 +87,5 @@ The `quarto-markdown-pandoc` binary accepts the following options:
 - **When fixing roundtripping bugs**: FIRST add the failing test to `tests/roundtrip_tests/qmd-json-qmd`, run it to verify it fails with the expected output, THEN implement the fix, THEN verify the test passes.
 - When I say "@doit", I mean "create a plan, and work on it item by item."
 - When you're done editing a Rust file, run `cargo fmt` on it.
-- If I ask you to write notes to yourself, do it in markdown and write the output in the `docs/for-claude` directory.
+- If I ask you to write notes to yourself, do it in markdown and write the output in the `claude-notes` directory.
 - If you need more information on the syntax differences, you are allowed to read the [syntax notes](../../docs/syntax-notes.md) file.
\ No newline at end of file
diff --git a/crates/quarto-markdown-pandoc/Cargo.toml b/crates/quarto-markdown-pandoc/Cargo.toml
index 39680a6..90c01d4 100644
--- a/crates/quarto-markdown-pandoc/Cargo.toml
+++ b/crates/quarto-markdown-pandoc/Cargo.toml
@@ -16,14 +16,18 @@ cargo-fuzz = true
 [dependencies]
 tree-sitter = { workspace = true }
 tree-sitter-qmd = { workspace = true }
+quarto-error-reporting = { path = "../quarto-error-reporting" }
+quarto-source-map = { path = "../quarto-source-map" }
+quarto-yaml = { path = "../quarto-yaml" }
 regex = { version = "1.10.0", features = ["unicode"] }
 clap = { version = "4.0", features = ["derive"] }
+serde = { workspace = true, features = ["derive"] }
 serde_json = "1.0"
 glob = "0.3"
 paste = "1.0.15"
 once_cell = "1.21.3"
-yaml-rust2 = "0.10.3"
-hashlink = "0.10.0"
+yaml-rust2 = { workspace = true }
+hashlink = { version = "0.10.0", features = ["serde_impl"] }
 error-message-macros = { path = "./error-message-macros" }
 ariadne = "0.4"
diff --git a/crates/quarto-markdown-pandoc/src/filters.rs b/crates/quarto-markdown-pandoc/src/filters.rs
index 3d9e0d6..fd81048 100644
--- a/crates/quarto-markdown-pandoc/src/filters.rs
+++ b/crates/quarto-markdown-pandoc/src/filters.rs
@@ -3,10 +3,10 @@
  * Copyright (c) 2025 Posit, PBC
  */
 
-use crate::pandoc::MetaValue;
+use crate::pandoc::MetaValueWithSourceInfo;
 use crate::pandoc::block::MetaBlock;
 use crate::pandoc::inline::AsInline;
-use crate::pandoc::meta::Meta;
+use crate::pandoc::meta::MetaMapEntry;
 use crate::pandoc::{self, Block, Blocks, Inline, Inlines};
 
 // filters are destructive and take ownership of the input
@@ -18,7 +18,12 @@ pub enum FilterReturn {
 
 type InlineFilterFn<'a, T> = Box FilterReturn + 'a>;
 type BlockFilterFn<'a, T> = Box FilterReturn + 'a>;
-type MetaFilterFn<'a> = Box FilterReturn + 'a>;
+type MetaFilterFn<'a> = Box<
+    dyn FnMut(
+            MetaValueWithSourceInfo,
+        ) -> FilterReturn
+        + 'a,
+>;
 type InlineFilterField<'a, T> = Option<InlineFilterFn<'a, T>>;
 type BlockFilterField<'a, T> = Option<BlockFilterFn<'a, T>>;
 type MetaFilterField<'a> = Option<MetaFilterFn<'a>>;
@@ -158,7 +163,10 @@ impl<'a> Filter<'a> {
 
     pub fn with_meta<F>(mut self, f: F) -> Filter<'a>
     where
-        F: FnMut(Meta) -> FilterReturn + 'a,
+        F: FnMut(
+                MetaValueWithSourceInfo,
+            ) -> FilterReturn
+            + 'a,
     {
         self.meta = Some(Box::new(f));
         self
@@ -701,18 +709,18 @@ pub fn topdown_traverse_block(block: Block, filter: &mut Filter) -> Blocks {
         return match f(meta.meta) { 
FilterReturn::Unchanged(m) => vec![Block::BlockMetadata(MetaBlock { meta: m, - source_info: meta.source_info.clone(), + source_info: meta.source_info, })], FilterReturn::FilterResult(new_meta, recurse) => { if !recurse { vec![Block::BlockMetadata(MetaBlock { meta: new_meta, - source_info: meta.source_info.clone(), + source_info: meta.source_info, })] } else { vec![Block::BlockMetadata(MetaBlock { meta: topdown_traverse_meta(new_meta, filter), - source_info: meta.source_info.clone(), + source_info: meta.source_info, })] } } @@ -1022,25 +1030,60 @@ pub fn topdown_traverse_blocks(vec: Blocks, filter: &mut Filter) -> Blocks { } } -pub fn topdown_traverse_meta_value(value: MetaValue, filter: &mut Filter) -> MetaValue { +pub fn topdown_traverse_meta_value_with_source_info( + value: MetaValueWithSourceInfo, + filter: &mut Filter, +) -> MetaValueWithSourceInfo { match value { - MetaValue::MetaMap(m) => MetaValue::MetaMap( - m.into_iter() - .map(|(k, v)| (k, topdown_traverse_meta_value(v, filter))) - .collect(), - ), - MetaValue::MetaList(l) => MetaValue::MetaList( - l.into_iter() - .map(|mv| topdown_traverse_meta_value(mv, filter)) - .collect(), - ), - MetaValue::MetaBlocks(b) => MetaValue::MetaBlocks(topdown_traverse_blocks(b, filter)), - MetaValue::MetaInlines(i) => MetaValue::MetaInlines(topdown_traverse_inlines(i, filter)), + MetaValueWithSourceInfo::MetaMap { + entries, + source_info, + } => { + let new_entries = entries + .into_iter() + .map(|entry| MetaMapEntry { + key: entry.key, + key_source: entry.key_source, + value: topdown_traverse_meta_value_with_source_info(entry.value, filter), + }) + .collect(); + MetaValueWithSourceInfo::MetaMap { + entries: new_entries, + source_info, + } + } + MetaValueWithSourceInfo::MetaList { items, source_info } => { + let new_items = items + .into_iter() + .map(|item| topdown_traverse_meta_value_with_source_info(item, filter)) + .collect(); + MetaValueWithSourceInfo::MetaList { + items: new_items, + source_info, + } + } + MetaValueWithSourceInfo::MetaBlocks { + content, + source_info, + } => MetaValueWithSourceInfo::MetaBlocks { + content: topdown_traverse_blocks(content, filter), + source_info, + }, + MetaValueWithSourceInfo::MetaInlines { + content, + source_info, + } => MetaValueWithSourceInfo::MetaInlines { + content: topdown_traverse_inlines(content, filter), + source_info, + }, value => value, } } -pub fn topdown_traverse_meta(meta: Meta, filter: &mut Filter) -> Meta { +pub fn topdown_traverse_meta( + meta: MetaValueWithSourceInfo, + filter: &mut Filter, +) -> MetaValueWithSourceInfo { if let Some(f) = &mut filter.meta { return match f(meta) { FilterReturn::FilterResult(new_meta, recurse) => { @@ -1049,19 +1092,10 @@ pub fn topdown_traverse_meta(meta: Meta, filter: &mut Filter) -> Meta { } topdown_traverse_meta(new_meta, filter) } - FilterReturn::Unchanged(m) => { - let meta_value = MetaValue::MetaMap(m); - match topdown_traverse_meta_value(meta_value, filter) { - MetaValue::MetaMap(m) => m, - _ => panic!("Expected MetaMap after filtering meta"), - } - } + FilterReturn::Unchanged(m) => topdown_traverse_meta_value_with_source_info(m, filter), }; } else { - return meta - .into_iter() - .map(|(k, v)| (k, topdown_traverse_meta_value(v, filter))) - .collect(); + return topdown_traverse_meta_value_with_source_info(meta, filter); } } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/ast_context.rs b/crates/quarto-markdown-pandoc/src/pandoc/ast_context.rs index 564fe38..b793ccc 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/ast_context.rs +++ 
b/crates/quarto-markdown-pandoc/src/pandoc/ast_context.rs
@@ -3,6 +3,7 @@
  * Copyright (c) 2025 Posit, PBC
  */
 
+use quarto_source_map::{FileId, SourceContext};
 use std::cell::Cell;
 
 /// Context passed through the parsing pipeline to provide information
@@ -15,27 +16,45 @@ pub struct ASTContext {
     /// Counter for example list numbering across the document
     /// Example lists continue numbering even when interrupted by other content
     pub example_list_counter: Cell<usize>,
+    /// Source context for tracking files and their content
+    pub source_context: SourceContext,
 }
 
 impl ASTContext {
     pub fn new() -> Self {
+        let mut source_context = SourceContext::new();
+        // Always add an anonymous file so FileId(0) is valid
+        source_context.add_file("".to_string(), None);
+
         ASTContext {
-            filenames: Vec::new(),
+            filenames: vec!["".to_string()],
             example_list_counter: Cell::new(1),
+            source_context,
         }
     }
 
     pub fn with_filename(filename: impl Into<String>) -> Self {
+        let filename_str = filename.into();
+        let mut source_context = SourceContext::new();
+        // Add the file without content for now (content can be added later if needed)
+        source_context.add_file(filename_str.clone(), None);
+
         ASTContext {
-            filenames: vec![filename.into()],
+            filenames: vec![filename_str],
             example_list_counter: Cell::new(1),
+            source_context,
         }
     }
 
     pub fn anonymous() -> Self {
+        let mut source_context = SourceContext::new();
+        // Always add an anonymous file so FileId(0) is valid
+        source_context.add_file("".to_string(), None);
+
         ASTContext {
-            filenames: Vec::new(),
+            filenames: vec!["".to_string()],
             example_list_counter: Cell::new(1),
+            source_context,
         }
     }
 
@@ -49,6 +68,26 @@ impl ASTContext {
     pub fn primary_filename(&self) -> Option<&String> {
         self.filenames.first()
     }
+
+    /// Get the primary file ID (FileId(0)), if any file exists in the source context
+    pub fn primary_file_id(&self) -> Option<FileId> {
+        if self.source_context.get_file(FileId(0)).is_some() {
+            Some(FileId(0))
+        } else {
+            None
+        }
+    }
+
+    /// Get the FileId to use for new SourceInfo instances.
+    /// Since ASTContext constructors now ensure FileId(0) always exists,
+    /// this always returns FileId(0).
+    ///
+    /// This method exists for:
+    /// 1. Code clarity - makes it obvious we're getting a file ID from context
+    /// 2. Future flexibility - if we need to track the current file differently
+    pub fn current_file_id(&self) -> FileId {
+        FileId(0)
+    }
 }
 
 impl Default for ASTContext {
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/block.rs b/crates/quarto-markdown-pandoc/src/pandoc/block.rs
index 1558fe6..b3aab46 100644
--- a/crates/quarto-markdown-pandoc/src/pandoc/block.rs
+++ b/crates/quarto-markdown-pandoc/src/pandoc/block.rs
@@ -3,19 +3,15 @@
  * Copyright (c) 2025 Posit, PBC
  */
 
-use crate::impl_source_location;
-use crate::pandoc::Meta;
+use crate::pandoc::MetaValueWithSourceInfo;
 use crate::pandoc::attr::Attr;
 use crate::pandoc::caption::Caption;
 use crate::pandoc::inline::Inlines;
 use crate::pandoc::list::ListAttributes;
-use crate::pandoc::location::Range;
-use crate::pandoc::location::SourceInfo;
-use crate::pandoc::location::SourceLocation;
-use crate::pandoc::location::node_source_info;
 use crate::pandoc::table::Table;
+use serde::{Deserialize, Serialize};
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum Block {
     Plain(Plain),
     Paragraph(Paragraph),
@@ -40,145 +36,122 @@ pub enum Block {
 
 pub type Blocks = Vec<Block>;
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Plain {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Paragraph {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct LineBlock {
     pub content: Vec<Inlines>,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct CodeBlock {
     pub attr: Attr,
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct RawBlock {
     pub format: String,
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BlockQuote {
     pub content: Blocks,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct OrderedList {
     pub attr: ListAttributes,
     pub content: Vec<Blocks>,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct BulletList {
     pub content: Vec<Blocks>,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct DefinitionList {
     pub content: Vec<(Inlines, Vec<Blocks>)>,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Header {
     pub level: usize,
     pub attr: Attr,
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct HorizontalRule {
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Figure {
     pub attr: Attr,
     pub caption: Caption,
     pub content: Blocks,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Div {
     pub attr: Attr,
     pub content: Blocks,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct MetaBlock {
-    pub meta: Meta,
-    pub source_info: SourceInfo,
+    pub meta: MetaValueWithSourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct NoteDefinitionPara {
     pub id: String,
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct NoteDefinitionFencedBlock {
     pub id: String,
     pub content: Blocks,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct CaptionBlock {
     pub content: Inlines,
-    pub source_info: SourceInfo,
-}
-
-impl_source_location!(
-    // blocks
-    Plain,
-    Paragraph,
-    LineBlock,
-    CodeBlock,
-    RawBlock,
-    BlockQuote,
-    OrderedList,
-    BulletList,
-    DefinitionList,
-    Header,
-    HorizontalRule,
-    Table,
-    Figure,
-    Div,
-    // quarto extensions
-    MetaBlock,
-    NoteDefinitionPara,
-    NoteDefinitionFencedBlock,
-    CaptionBlock
-);
+    pub source_info: quarto_source_map::SourceInfo,
+}
 
 fn make_block_leftover(node: &tree_sitter::Node, input_bytes: &[u8]) -> Block {
     let text = node.utf8_text(input_bytes).unwrap().to_string();
     Block::RawBlock(RawBlock {
         format: "quarto-internal-leftover".to_string(),
         text,
-        source_info: node_source_info(node),
+        source_info: crate::pandoc::location::node_source_info(node),
     })
 }
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/caption.rs b/crates/quarto-markdown-pandoc/src/pandoc/caption.rs
index a686406..c731bb9 100644
--- a/crates/quarto-markdown-pandoc/src/pandoc/caption.rs
+++ b/crates/quarto-markdown-pandoc/src/pandoc/caption.rs
@@ -5,8 +5,9 @@
 
 use crate::pandoc::block::Blocks;
 use crate::pandoc::inline::Inlines;
+use serde::{Deserialize, Serialize};
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Caption {
     pub short: Option<Inlines>,
     pub long: Option<Blocks>,
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs
index 5052aa8..3fbfd90 100644
--- a/crates/quarto-markdown-pandoc/src/pandoc/inline.rs
+++ b/crates/quarto-markdown-pandoc/src/pandoc/inline.rs
@@ -3,16 +3,12 @@
  * Copyright (c) 2025 Posit, PBC
  */
 
-use crate::impl_source_location;
 use crate::pandoc::attr::{Attr, is_empty_attr};
 use crate::pandoc::block::Blocks;
-use crate::pandoc::location::Range;
-use crate::pandoc::location::SourceInfo;
-use crate::pandoc::location::SourceLocation;
-use crate::pandoc::location::node_source_info;
 use crate::pandoc::shortcode::Shortcode;
+use serde::{Deserialize, Serialize};
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub enum Inline {
     Str(Str),
     Emph(Emph),
@@ -52,7 +48,7 @@ pub enum Inline {
 
 pub type Inlines = Vec<Inline>;
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
 pub enum QuoteType {
     SingleQuote,
     DoubleQuote,
@@ -60,146 +56,146 @@ pub enum QuoteType {
 
 pub type Target = (String, String);
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
 pub enum MathType {
     InlineMath,
     DisplayMath,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Str {
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Emph {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Underline {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Strong {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Strikeout {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Superscript {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Subscript {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct SmallCaps {
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Quoted {
     pub quote_type: QuoteType,
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Cite {
     pub citations: Vec<Citation>,
     pub content: Inlines,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Code {
     pub attr: Attr,
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct Math {
     pub math_type: MathType,
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: quarto_source_map::SourceInfo,
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct RawInline {
     pub format: String,
     pub text: String,
-    pub source_info: SourceInfo,
+    pub source_info: 
quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Link { pub attr: Attr, pub content: Inlines, pub target: Target, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Image { pub attr: Attr, pub content: Inlines, pub target: Target, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Note { pub content: Blocks, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Span { pub attr: Attr, pub content: Inlines, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Space { - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct LineBreak { - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct SoftBreak { - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct NoteReference { pub id: String, - pub range: Range, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Citation { pub id: String, pub prefix: Inlines, @@ -209,68 +205,41 @@ pub struct Citation { pub hash: usize, } -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] pub enum CitationMode { AuthorInText, SuppressAuthor, NormalCitation, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Insert { pub attr: Attr, pub content: Inlines, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Delete { pub attr: Attr, pub content: Inlines, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Highlight { pub attr: Attr, pub content: Inlines, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct EditComment { pub attr: Attr, pub content: Inlines, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } -impl_source_location!( - Str, - Emph, - Underline, - Strong, - Strikeout, - Superscript, - Subscript, - SmallCaps, - Quoted, - Cite, - Code, - Math, - RawInline, - Link, - Image, - Note, - Span, - Space, - LineBreak, - SoftBreak, - Insert, - Delete, - Highlight, - EditComment -); - pub trait AsInline { fn as_inline(self) -> Inline; } @@ -331,7 +300,7 @@ pub fn make_span_inline( 
attr: Attr, target: Target, content: Inlines, - source_info: SourceInfo, + source_info: quarto_source_map::SourceInfo, ) -> Inline { // non-empty targets are never Underline or SmallCaps if !is_empty_target(&target) { @@ -339,7 +308,7 @@ pub fn make_span_inline( attr, content, target, - source_info: source_info.clone(), + source_info, }); } if attr.1.contains(&"smallcaps".to_string()) { @@ -352,13 +321,13 @@ pub fn make_span_inline( if is_empty_attr(&new_attr) { return Inline::SmallCaps(SmallCaps { content, - source_info: source_info.clone(), + source_info, }); } let inner_inline = make_span_inline(new_attr, target, content, source_info.clone()); return Inline::SmallCaps(SmallCaps { content: vec![inner_inline], - source_info: source_info.clone(), + source_info, }); } else if attr.1.contains(&"ul".to_string()) { let mut new_attr = attr.clone(); @@ -366,13 +335,13 @@ pub fn make_span_inline( if is_empty_attr(&new_attr) { return Inline::Underline(Underline { content, - source_info: source_info.clone(), + source_info, }); } let inner_inline = make_span_inline(new_attr, target, content, source_info.clone()); return Inline::Underline(Underline { content: vec![inner_inline], - source_info: source_info.clone(), + source_info, }); } else if attr.1.contains(&"underline".to_string()) { let mut new_attr = attr.clone(); @@ -384,13 +353,13 @@ pub fn make_span_inline( if is_empty_attr(&new_attr) { return Inline::Underline(Underline { content, - source_info: source_info.clone(), + source_info, }); } let inner_inline = make_span_inline(new_attr, target, content, source_info.clone()); return Inline::Underline(Underline { content: vec![inner_inline], - source_info: source_info.clone(), + source_info, }); } @@ -405,7 +374,7 @@ pub fn make_cite_inline( attr: Attr, target: Target, content: Inlines, - source_info: SourceInfo, + source_info: quarto_source_map::SourceInfo, ) -> Inline { // the traversal here is slightly inefficient because we need // to non-destructively check for the goodness of the content @@ -506,31 +475,30 @@ fn make_inline_leftover(node: &tree_sitter::Node, input_bytes: &[u8]) -> Inline Inline::RawInline(RawInline { format: "quarto-internal-leftover".to_string(), text, - source_info: node_source_info(node), + source_info: crate::pandoc::location::node_source_info(node), }) } #[cfg(test)] mod tests { use super::*; - use crate::pandoc::location::Location; - fn dummy_source_info() -> SourceInfo { - SourceInfo { - filename_index: None, - range: Range { - start: Location { + fn dummy_source_info() -> quarto_source_map::SourceInfo { + quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + quarto_source_map::Range { + start: quarto_source_map::Location { offset: 0, row: 0, column: 0, }, - end: Location { + end: quarto_source_map::Location { offset: 0, row: 0, column: 0, }, }, - } + ) } fn make_str(text: &str) -> Inline { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/list.rs b/crates/quarto-markdown-pandoc/src/pandoc/list.rs index e22edce..11b0c7e 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/list.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/list.rs @@ -3,7 +3,8 @@ * Copyright (c) 2025 Posit, PBC */ -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +use serde::{Deserialize, Serialize}; +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] pub enum ListNumberStyle { Default, Example, @@ -14,7 +15,7 @@ pub enum ListNumberStyle { UpperAlpha, } -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, 
PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
 pub enum ListNumberDelim {
     Default,
     Period,
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/location.rs b/crates/quarto-markdown-pandoc/src/pandoc/location.rs
index bdd0b71..18c4cf5 100644
--- a/crates/quarto-markdown-pandoc/src/pandoc/location.rs
+++ b/crates/quarto-markdown-pandoc/src/pandoc/location.rs
@@ -4,18 +4,19 @@
  */
 
 use crate::pandoc::ast_context::ASTContext;
+use serde::{Deserialize, Serialize};
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Source location tracking
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
 pub struct Location {
     pub offset: usize,
     pub row: usize,
     pub column: usize,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
 pub struct Range {
     pub start: Location,
     pub end: Location,
@@ -23,7 +24,7 @@ pub struct Range {
 
 /// Encapsulates source location information for AST nodes
 /// The filename field now holds an index into the ASTContext.filenames vector
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
 pub struct SourceInfo {
     pub filename_index: Option<usize>,
     pub range: Range,
@@ -61,29 +62,68 @@ impl SourceInfo {
             },
         }
     }
-}
 
-pub trait SourceLocation {
-    fn filename_index(&self) -> Option<usize>;
-    fn range(&self) -> Range;
+    /// Convert to quarto-source-map::SourceInfo (temporary conversion helper)
+    ///
+    /// This helper bridges between pandoc::location types and quarto-source-map types.
+    /// Long-term, code should use quarto-source-map types directly.
+    ///
+    /// Creates an Original mapping with a dummy FileId(0).
+    /// For proper filename support, use to_source_map_info_with_mapping with a real FileId.
+    pub fn to_source_map_info(&self) -> quarto_source_map::SourceInfo {
+        quarto_source_map::SourceInfo::original(
+            quarto_source_map::FileId(0),
+            quarto_source_map::Range {
+                start: quarto_source_map::Location {
+                    offset: self.range.start.offset,
+                    row: self.range.start.row,
+                    column: self.range.start.column,
+                },
+                end: quarto_source_map::Location {
+                    offset: self.range.end.offset,
+                    row: self.range.end.row,
+                    column: self.range.end.column,
+                },
+            },
+        )
+    }
 
-    /// Resolve the filename from the ASTContext using the stored index
-    fn filename<'a>(&self, context: &'a ASTContext) -> Option<&'a String> {
-        self.filename_index()
-            .and_then(|idx| context.filenames.get(idx))
+    /// Convert to quarto-source-map::SourceInfo with proper FileId (temporary conversion helper)
+    ///
+    /// This helper bridges between pandoc::location types and quarto-source-map types.
+    /// Use this when you have a proper FileId mapping from your context.
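+    ///
+    /// # Example
+    ///
+    /// A minimal sketch of the intended call pattern, assuming `old_info` is a
+    /// legacy `pandoc::location::SourceInfo` and a `SourceContext` as used in
+    /// the tests in this patch (`add_file` returns the `FileId` passed through
+    /// here); the file name is a placeholder:
+    ///
+    /// ```ignore
+    /// let mut ctx = quarto_source_map::SourceContext::new();
+    /// let file_id = ctx.add_file("doc.qmd".to_string(), None);
+    /// let new_info = old_info.to_source_map_info_with_mapping(file_id);
+    /// ```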
+    pub fn to_source_map_info_with_mapping(
+        &self,
+        file_id: quarto_source_map::FileId,
+    ) -> quarto_source_map::SourceInfo {
+        quarto_source_map::SourceInfo::original(
+            file_id,
+            quarto_source_map::Range {
+                start: quarto_source_map::Location {
+                    offset: self.range.start.offset,
+                    row: self.range.start.row,
+                    column: self.range.start.column,
+                },
+                end: quarto_source_map::Location {
+                    offset: self.range.end.offset,
+                    row: self.range.end.row,
+                    column: self.range.end.column,
+                },
+            },
+        )
+    }
 }
 
-pub fn node_location(node: &tree_sitter::Node) -> Range {
+pub fn node_location(node: &tree_sitter::Node) -> quarto_source_map::Range {
     let start = node.start_position();
     let end = node.end_position();
-    Range {
-        start: Location {
+    quarto_source_map::Range {
+        start: quarto_source_map::Location {
             offset: node.start_byte(),
             row: start.row,
             column: start.column,
         },
-        end: Location {
+        end: quarto_source_map::Location {
             offset: node.end_byte(),
             row: end.row,
             column: end.column,
@@ -91,18 +131,15 @@ pub fn node_location(node: &tree_sitter::Node) -> Range {
     }
 }
 
-pub fn node_source_info(node: &tree_sitter::Node) -> SourceInfo {
-    SourceInfo::with_range(node_location(node))
+pub fn node_source_info(node: &tree_sitter::Node) -> quarto_source_map::SourceInfo {
+    quarto_source_map::SourceInfo::original(quarto_source_map::FileId(0), node_location(node))
 }
 
-pub fn node_source_info_with_context(node: &tree_sitter::Node, context: &ASTContext) -> SourceInfo {
-    // If the context has at least one filename, use index 0
-    let filename_index = if context.filenames.is_empty() {
-        None
-    } else {
-        Some(0)
-    };
-    SourceInfo::new(filename_index, node_location(node))
+pub fn node_source_info_with_context(
+    node: &tree_sitter::Node,
+    context: &ASTContext,
+) -> quarto_source_map::SourceInfo {
+    quarto_source_map::SourceInfo::original(context.current_file_id(), node_location(node))
 }
 
 pub fn empty_range() -> Range {
@@ -120,23 +157,39 @@ pub fn empty_range() -> Range {
     }
 }
 
-pub fn empty_source_info() -> SourceInfo {
-    SourceInfo::with_range(empty_range())
+pub fn empty_source_info() -> quarto_source_map::SourceInfo {
+    quarto_source_map::SourceInfo::original(
+        quarto_source_map::FileId(0),
+        quarto_source_map::Range {
+            start: quarto_source_map::Location {
+                offset: 0,
+                row: 0,
+                column: 0,
+            },
+            end: quarto_source_map::Location {
+                offset: 0,
+                row: 0,
+                column: 0,
+            },
+        },
+    )
 }
 
-#[macro_export]
-macro_rules! impl_source_location {
-    ($($type:ty),*) => {
-        $(
-            impl SourceLocation for $type {
-                fn filename_index(&self) -> Option<usize> {
-                    self.source_info.filename_index
-                }
-
-                fn range(&self) -> Range {
-                    self.source_info.range.clone()
-                }
-            }
-        )*
-    };
+/// Extract filename index from quarto_source_map::SourceInfo by walking to Original mapping
+pub fn extract_filename_index(info: &quarto_source_map::SourceInfo) -> Option<usize> {
+    match &info.mapping {
+        quarto_source_map::SourceMapping::Original { file_id } => Some(file_id.0),
+        quarto_source_map::SourceMapping::Substring { parent, .. } => {
+            extract_filename_index(parent)
+        }
+        quarto_source_map::SourceMapping::Transformed { parent, .. } => {
+            extract_filename_index(parent)
+        }
+        quarto_source_map::SourceMapping::Concat { pieces } => {
+            // Return first non-None filename_index from pieces
+            pieces
+                .iter()
+                .find_map(|p| extract_filename_index(&p.source_info))
+        }
+    }
+}
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs
index ec3f6bf..68f8d59 100644
--- a/crates/quarto-markdown-pandoc/src/pandoc/meta.rs
+++ b/crates/quarto-markdown-pandoc/src/pandoc/meta.rs
@@ -15,7 +15,7 @@ use yaml_rust2::parser::{Event, MarkedEventReceiver, Parser};
 
 // Pandoc's MetaValue notably does not support numbers or nulls, so we don't either
 // https://pandoc.org/lua-filters.html#type-metavalue
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
 pub enum MetaValue {
     MetaString(String),
     MetaBool(bool),
@@ -33,6 +33,311 @@ impl Default for MetaValue {
 
 pub type Meta = LinkedHashMap<String, MetaValue>;
 
+// Phase 4: MetaValueWithSourceInfo - Meta with full source tracking
+// This replaces Meta for use in PandocAST, preserving source info through
+// the YAML->Meta transformation where strings are parsed as Markdown.
+#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
+pub enum MetaValueWithSourceInfo {
+    MetaString {
+        value: String,
+        source_info: quarto_source_map::SourceInfo,
+    },
+    MetaBool {
+        value: bool,
+        source_info: quarto_source_map::SourceInfo,
+    },
+    MetaInlines {
+        content: Inlines,
+        source_info: quarto_source_map::SourceInfo,
+    },
+    MetaBlocks {
+        content: Blocks,
+        source_info: quarto_source_map::SourceInfo,
+    },
+    MetaList {
+        items: Vec<MetaValueWithSourceInfo>,
+        source_info: quarto_source_map::SourceInfo,
+    },
+    MetaMap {
+        entries: Vec<MetaMapEntry>,
+        source_info: quarto_source_map::SourceInfo,
+    },
+}
+
+#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
+pub struct MetaMapEntry {
+    pub key: String,
+    pub key_source: quarto_source_map::SourceInfo,
+    pub value: MetaValueWithSourceInfo,
+}
+
+impl Default for MetaValueWithSourceInfo {
+    fn default() -> Self {
+        MetaValueWithSourceInfo::MetaMap {
+            entries: Vec::new(),
+            source_info: quarto_source_map::SourceInfo::default(),
+        }
+    }
+}
+
+impl MetaValueWithSourceInfo {
+    /// Get a value by key if this is a MetaMap
+    pub fn get(&self, key: &str) -> Option<&MetaValueWithSourceInfo> {
+        match self {
+            MetaValueWithSourceInfo::MetaMap { entries, .. } => {
+                entries.iter().find(|e| e.key == key).map(|e| &e.value)
+            }
+            _ => None,
+        }
+    }
+
+    /// Check if a key exists if this is a MetaMap
+    pub fn contains_key(&self, key: &str) -> bool {
+        self.get(key).is_some()
+    }
+
+    /// Check if this MetaMap is empty
+    pub fn is_empty(&self) -> bool {
+        match self {
+            MetaValueWithSourceInfo::MetaMap { entries, .. } => entries.is_empty(),
+            _ => false,
+        }
+    }
+
+    /// Convert to old Meta format (loses source info)
+    pub fn to_meta_value(&self) -> MetaValue {
+        match self {
+            MetaValueWithSourceInfo::MetaString { value, .. } => {
+                MetaValue::MetaString(value.clone())
+            }
+            MetaValueWithSourceInfo::MetaBool { value, .. } => MetaValue::MetaBool(*value),
+            MetaValueWithSourceInfo::MetaInlines { content, .. } => {
+                MetaValue::MetaInlines(content.clone())
+            }
+            MetaValueWithSourceInfo::MetaBlocks { content, .. } => {
+                MetaValue::MetaBlocks(content.clone())
+            }
+            MetaValueWithSourceInfo::MetaList { items, .. } => {
+                MetaValue::MetaList(items.iter().map(|item| item.to_meta_value()).collect())
+            }
+            MetaValueWithSourceInfo::MetaMap { entries, .. 
} => { + let mut map = LinkedHashMap::new(); + for entry in entries { + map.insert(entry.key.clone(), entry.value.to_meta_value()); + } + MetaValue::MetaMap(map) + } + } + } + + /// Convert to old Meta format when self is a MetaMap (loses source info) + /// Panics if self is not a MetaMap + pub fn to_meta(&self) -> Meta { + match self { + MetaValueWithSourceInfo::MetaMap { entries, .. } => { + let mut map = LinkedHashMap::new(); + for entry in entries { + map.insert(entry.key.clone(), entry.value.to_meta_value()); + } + map + } + _ => panic!("to_meta() called on non-MetaMap variant"), + } + } +} + +/// Convert old Meta to new format (with dummy source info) +pub fn meta_from_legacy(meta: Meta) -> MetaValueWithSourceInfo { + let entries = meta + .into_iter() + .map(|(k, v)| MetaMapEntry { + key: k, + key_source: quarto_source_map::SourceInfo::default(), + value: meta_value_from_legacy(v), + }) + .collect(); + + MetaValueWithSourceInfo::MetaMap { + entries, + source_info: quarto_source_map::SourceInfo::default(), + } +} + +/// Convert old MetaValue to new format (with dummy source info) +pub fn meta_value_from_legacy(value: MetaValue) -> MetaValueWithSourceInfo { + match value { + MetaValue::MetaString(s) => MetaValueWithSourceInfo::MetaString { + value: s, + source_info: quarto_source_map::SourceInfo::default(), + }, + MetaValue::MetaBool(b) => MetaValueWithSourceInfo::MetaBool { + value: b, + source_info: quarto_source_map::SourceInfo::default(), + }, + MetaValue::MetaInlines(inlines) => MetaValueWithSourceInfo::MetaInlines { + content: inlines, + source_info: quarto_source_map::SourceInfo::default(), + }, + MetaValue::MetaBlocks(blocks) => MetaValueWithSourceInfo::MetaBlocks { + content: blocks, + source_info: quarto_source_map::SourceInfo::default(), + }, + MetaValue::MetaList(list) => MetaValueWithSourceInfo::MetaList { + items: list.into_iter().map(meta_value_from_legacy).collect(), + source_info: quarto_source_map::SourceInfo::default(), + }, + MetaValue::MetaMap(map) => { + let entries = map + .into_iter() + .map(|(k, v)| MetaMapEntry { + key: k, + key_source: quarto_source_map::SourceInfo::default(), + value: meta_value_from_legacy(v), + }) + .collect(); + MetaValueWithSourceInfo::MetaMap { + entries, + source_info: quarto_source_map::SourceInfo::default(), + } + } + } +} + +/// Transform YamlWithSourceInfo to MetaValueWithSourceInfo +/// +/// This is the core transformation that: +/// 1. Parses YAML strings as Markdown (creating Substring SourceInfos) +/// 2. Preserves source tracking through nested structures +/// 3. Handles special YAML tags (bypassing markdown parsing) +/// 4. Converts YAML types to Pandoc Meta types +/// +/// Takes ownership of the YamlWithSourceInfo to avoid unnecessary clones. 
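+///
+/// # Example
+///
+/// A minimal sketch, assuming the `parse_with_parent` entry point used by
+/// `rawblock_to_meta_with_source_info` below; the YAML text and key are
+/// placeholders:
+///
+/// ```ignore
+/// let ctx = crate::pandoc::ast_context::ASTContext::new();
+/// let parent = quarto_source_map::SourceInfo::default();
+/// let yaml = quarto_yaml::parse_with_parent("title: Hello", parent).unwrap();
+/// let meta = yaml_to_meta_with_source_info(yaml, &ctx);
+/// assert!(meta.get("title").is_some());
+/// ```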
+pub fn yaml_to_meta_with_source_info( + yaml: quarto_yaml::YamlWithSourceInfo, + _context: &crate::pandoc::ast_context::ASTContext, +) -> MetaValueWithSourceInfo { + use yaml_rust2::Yaml; + + // Check if this is an array or hash first, since we need to consume + // the value before matching on yaml.yaml + if yaml.is_array() { + let (items, source_info) = yaml.into_array().unwrap(); + let meta_items = items + .into_iter() + .map(|item| yaml_to_meta_with_source_info(item, _context)) + .collect(); + + return MetaValueWithSourceInfo::MetaList { + items: meta_items, + source_info, + }; + } + + if yaml.is_hash() { + let (entries, source_info) = yaml.into_hash().unwrap(); + let meta_entries = entries + .into_iter() + .filter_map(|entry| { + // Keys must be strings in Pandoc metadata + entry.key.yaml.as_str().map(|key_str| MetaMapEntry { + key: key_str.to_string(), + key_source: entry.key_span, + value: yaml_to_meta_with_source_info(entry.value, _context), + }) + }) + .collect(); + + return MetaValueWithSourceInfo::MetaMap { + entries: meta_entries, + source_info, + }; + } + + // For scalars, destructure to get owned values + let quarto_yaml::YamlWithSourceInfo { + yaml: yaml_value, + source_info, + tag, + .. + } = yaml; + + match yaml_value { + Yaml::String(s) => { + // Check for YAML tags (e.g., !path, !glob, !str) + if let Some((tag_suffix, _tag_source_info)) = tag { + // Tagged string - bypass markdown parsing + // Wrap in Span with class "yaml-tagged-string" and tag attribute + let mut attributes = HashMap::new(); + attributes.insert("tag".to_string(), tag_suffix.clone()); + + let span = Span { + attr: ( + String::new(), + vec!["yaml-tagged-string".to_string()], + attributes, + ), + content: vec![Inline::Str(Str { + text: s.clone(), + source_info: source_info.clone(), + })], + source_info: quarto_source_map::SourceInfo::default(), + }; + MetaValueWithSourceInfo::MetaInlines { + content: vec![Inline::Span(span)], + source_info, // Overall node source + } + } else { + // Untagged string - return as MetaString for later markdown parsing + MetaValueWithSourceInfo::MetaString { + value: s, + source_info, + } + } + } + + Yaml::Boolean(b) => MetaValueWithSourceInfo::MetaBool { + value: b, + source_info, + }, + + // Pandoc doesn't support null, numbers, etc. 
in metadata + // For now, convert them to strings + Yaml::Null => MetaValueWithSourceInfo::MetaString { + value: String::new(), + source_info, + }, + + Yaml::Integer(i) => MetaValueWithSourceInfo::MetaString { + value: i.to_string(), + source_info, + }, + + Yaml::Real(r) => MetaValueWithSourceInfo::MetaString { + value: r, + source_info, + }, + + Yaml::BadValue => MetaValueWithSourceInfo::MetaString { + value: String::new(), + source_info, + }, + + Yaml::Alias(_) => { + // YAML aliases are resolved by yaml-rust2, so this shouldn't happen + // But if it does, treat as empty string + MetaValueWithSourceInfo::MetaString { + value: String::new(), + source_info, + } + } + + // Array and Hash should have been handled above + Yaml::Array(_) | Yaml::Hash(_) => { + unreachable!("Array/Hash should be handled by into_array/into_hash") + } + } +} + fn extract_between_delimiters(input: &str) -> Option<&str> { let parts: Vec<&str> = input.split("---").collect(); if parts.len() >= 3 { @@ -99,9 +404,9 @@ impl YamlEventHandler { ), content: vec![Inline::Str(Str { text: s.to_string(), - source_info: empty_source_info(), + source_info: quarto_source_map::SourceInfo::default(), })], - source_info: empty_source_info(), + source_info: quarto_source_map::SourceInfo::default(), }; return MetaValue::MetaInlines(vec![Inline::Span(span)]); } @@ -156,6 +461,53 @@ impl MarkedEventReceiver for YamlEventHandler { } } +/// Convert RawBlock to MetaValueWithSourceInfo using quarto-yaml (Phase 4) +/// +/// This is the new implementation that preserves source location information +/// throughout the YAML -> Meta transformation. +pub fn rawblock_to_meta_with_source_info( + block: &RawBlock, + context: &crate::pandoc::ast_context::ASTContext, +) -> MetaValueWithSourceInfo { + if block.format != "quarto_minus_metadata" { + panic!( + "Expected RawBlock with format 'quarto_minus_metadata', got {}", + block.format + ); + } + + // Extract YAML content between --- delimiters + let content = extract_between_delimiters(&block.text).unwrap(); + + // Calculate offsets within RawBlock.text + // The text is "---\n\n---", so content starts at index 4 + let yaml_start = block.text.find("---\n").unwrap() + 4; + + // block.source_info is already quarto_source_map::SourceInfo + let parent = block.source_info.clone(); + + // Create Substring SourceInfo for the YAML content within the RawBlock + let yaml_parent = + quarto_source_map::SourceInfo::substring(parent, yaml_start, yaml_start + content.len()); + + // Parse YAML with source tracking + let yaml = match quarto_yaml::parse_with_parent(content, yaml_parent) { + Ok(y) => y, + Err(e) => panic!( + "(unimplemented syntax error - this is a bug!) Failed to parse metadata block as YAML: {}", + e + ), + }; + + // Transform YamlWithSourceInfo to MetaValueWithSourceInfo + // Pass by value since yaml is no longer needed + yaml_to_meta_with_source_info(yaml, context) +} + +/// Legacy version: Convert RawBlock to Meta (old implementation) +/// +/// This version uses yaml-rust2 directly and doesn't preserve source information. +/// Kept for backward compatibility during Phase 4. 
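+///
+/// # Example
+///
+/// A minimal sketch of the legacy call; the `RawBlock` fields follow the
+/// format check at the top of the function, and the YAML payload is a
+/// placeholder:
+///
+/// ```ignore
+/// let block = RawBlock {
+///     format: "quarto_minus_metadata".to_string(),
+///     text: "---\ntitle: Hello\n---".to_string(),
+///     source_info: quarto_source_map::SourceInfo::default(),
+/// };
+/// let meta = rawblock_to_meta(block);
+/// assert!(meta.contains_key("title"));
+/// ```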
 pub fn rawblock_to_meta(block: RawBlock) -> Meta {
     if block.format != "quarto_minus_metadata" {
         panic!(
@@ -176,6 +528,101 @@ pub fn rawblock_to_meta(block: RawBlock) -> Meta {
     handler.result.unwrap()
 }
 
+/// Parse metadata strings as markdown, preserving source information
+pub fn parse_metadata_strings_with_source_info(
+    meta: MetaValueWithSourceInfo,
+    outer_metadata: &mut Vec<MetaMapEntry>,
+) -> MetaValueWithSourceInfo {
+    match meta {
+        MetaValueWithSourceInfo::MetaString { value, source_info } => {
+            let mut output_stream = VerboseOutput::Sink(io::sink());
+            let result = readers::qmd::read(
+                value.as_bytes(),
+                false,
+                "",
+                &mut output_stream,
+                None::<
+                    fn(
+                        &[u8],
+                        &crate::utils::tree_sitter_log_observer::TreeSitterLogObserver,
+                        &str,
+                    ) -> Vec,
+                >,
+            );
+            match result {
+                Ok((mut pandoc, _context)) => {
+                    // Merge parsed metadata, preserving full MetaMapEntry with key_source
+                    if let MetaValueWithSourceInfo::MetaMap { entries, .. } = pandoc.meta {
+                        for entry in entries {
+                            outer_metadata.push(entry);
+                        }
+                    }
+                    // Check if it's a single paragraph - if so, return MetaInlines with original source_info
+                    if pandoc.blocks.len() == 1 {
+                        if let crate::pandoc::Block::Paragraph(p) = &mut pandoc.blocks[0] {
+                            return MetaValueWithSourceInfo::MetaInlines {
+                                content: mem::take(&mut p.content),
+                                source_info, // Preserve the original source_info from YAML
+                            };
+                        }
+                    }
+                    MetaValueWithSourceInfo::MetaBlocks {
+                        content: pandoc.blocks,
+                        source_info,
+                    }
+                }
+                Err(_) => {
+                    // Markdown parse failed - wrap in Span with class "yaml-markdown-syntax-error"
+                    let span = Span {
+                        attr: (
+                            String::new(),
+                            vec!["yaml-markdown-syntax-error".to_string()],
+                            HashMap::new(),
+                        ),
+                        content: vec![Inline::Str(Str {
+                            text: value.clone(),
+                            source_info: quarto_source_map::SourceInfo::default(),
+                        })],
+                        source_info: quarto_source_map::SourceInfo::default(),
+                    };
+                    MetaValueWithSourceInfo::MetaInlines {
+                        content: vec![Inline::Span(span)],
+                        source_info,
+                    }
+                }
+            }
+        }
+        MetaValueWithSourceInfo::MetaList { items, source_info } => {
+            let parsed_items = items
+                .into_iter()
+                .map(|item| parse_metadata_strings_with_source_info(item, outer_metadata))
+                .collect();
+            MetaValueWithSourceInfo::MetaList {
+                items: parsed_items,
+                source_info,
+            }
+        }
+        MetaValueWithSourceInfo::MetaMap {
+            entries,
+            source_info,
+        } => {
+            let parsed_entries = entries
+                .into_iter()
+                .map(|entry| MetaMapEntry {
+                    key: entry.key,
+                    key_source: entry.key_source,
+                    value: parse_metadata_strings_with_source_info(entry.value, outer_metadata),
+                })
+                .collect();
+            MetaValueWithSourceInfo::MetaMap {
+                entries: parsed_entries,
+                source_info,
+            }
+        }
+        other => other,
+    }
+}
+
 pub fn parse_metadata_strings(meta: MetaValue, outer_metadata: &mut Meta) -> MetaValue {
     match meta {
         MetaValue::MetaString(s) => {
@@ -195,8 +642,11 @@ pub fn parse_metadata_strings(meta: MetaValue, outer_metadata: &mut Meta) -> Met
             );
             match result {
                 Ok((mut pandoc, _context)) => {
-                    for (k, v) in pandoc.meta.into_iter() {
-                        outer_metadata.insert(k, v);
+                    // pandoc.meta is now MetaValueWithSourceInfo, convert it to Meta
+                    if let MetaValueWithSourceInfo::MetaMap { entries, .. 
} = pandoc.meta { + for entry in entries { + outer_metadata.insert(entry.key, entry.value.to_meta_value()); + } } // we need to examine pandoc.blocks to see if it's a single paragraph or multiple blocks // if it's a single paragraph, we can return MetaInlines diff --git a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs index 9d7db3c..fd82e10 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/mod.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/mod.rs @@ -13,6 +13,7 @@ pub mod location; pub mod meta; pub mod pandoc; pub mod shortcode; +pub mod source_map_compat; pub mod table; pub mod treesitter; pub mod treesitter_utils; @@ -37,5 +38,8 @@ pub use crate::pandoc::table::{ }; pub use crate::pandoc::ast_context::ASTContext; -pub use crate::pandoc::meta::{Meta, MetaValue, parse_metadata_strings, rawblock_to_meta}; + +pub use crate::pandoc::meta::{MetaValueWithSourceInfo, rawblock_to_meta_with_source_info}; +#[allow(unused_imports)] +pub use crate::pandoc::meta::{parse_metadata_strings, parse_metadata_strings_with_source_info}; pub use crate::pandoc::treesitter::treesitter_to_pandoc; diff --git a/crates/quarto-markdown-pandoc/src/pandoc/pandoc.rs b/crates/quarto-markdown-pandoc/src/pandoc/pandoc.rs index feb83ee..d35f17a 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/pandoc.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/pandoc.rs @@ -4,7 +4,7 @@ */ pub use crate::pandoc::block::Blocks; -pub use crate::pandoc::meta::Meta; +pub use crate::pandoc::meta::MetaValueWithSourceInfo; /* * A data structure that mimics Pandoc's `data Pandoc` type. * This is used to represent the parsed structure of a Quarto Markdown document. @@ -12,6 +12,6 @@ pub use crate::pandoc::meta::Meta; #[derive(Debug, Clone, PartialEq, Default)] pub struct Pandoc { - pub meta: Meta, + pub meta: MetaValueWithSourceInfo, pub blocks: Blocks, } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/shortcode.rs b/crates/quarto-markdown-pandoc/src/pandoc/shortcode.rs index f7c8bc1..d931568 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/shortcode.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/shortcode.rs @@ -5,9 +5,10 @@ use crate::pandoc::inline::{Inline, Inlines, Span}; use crate::pandoc::location::empty_source_info; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub enum ShortcodeArg { String(String), Number(f64), @@ -16,7 +17,7 @@ pub enum ShortcodeArg { KeyValue(HashMap), } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Shortcode { pub is_escaped: bool, pub name: String, diff --git a/crates/quarto-markdown-pandoc/src/pandoc/source_map_compat.rs b/crates/quarto-markdown-pandoc/src/pandoc/source_map_compat.rs new file mode 100644 index 0000000..4104c7f --- /dev/null +++ b/crates/quarto-markdown-pandoc/src/pandoc/source_map_compat.rs @@ -0,0 +1,113 @@ +/* + * source_map_compat.rs + * Copyright (c) 2025 Posit, PBC + */ + +//! Compatibility helpers for converting tree-sitter Nodes to quarto-source-map types. +//! +//! This module provides bridge functions to convert from tree-sitter's Node type +//! to quarto-source-map's SourceInfo, enabling gradual migration from the old +//! pandoc::location types. 
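+//!
+//! # Example
+//!
+//! A minimal sketch of the bridge in use, assuming a parsed tree-sitter
+//! `node` is already in hand; the file name is a placeholder:
+//!
+//! ```ignore
+//! let ctx = ASTContext::with_filename("doc.qmd");
+//! let info = node_to_source_info_with_context(&node, &ctx);
+//! ```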
+ +use quarto_source_map::{FileId, Location, Range, SourceInfo}; +use tree_sitter::Node; + +use crate::pandoc::ast_context::ASTContext; + +/// Convert a tree-sitter Node to a SourceInfo with an explicit FileId. +/// +/// This is the low-level conversion function that directly translates tree-sitter +/// positions to quarto-source-map coordinates. +/// +/// # Arguments +/// * `node` - The tree-sitter Node to convert +/// * `file_id` - The FileId of the source file this node comes from +/// +/// # Returns +/// A SourceInfo with Original mapping to the specified file +pub fn node_to_source_info(node: &Node, file_id: FileId) -> SourceInfo { + let start_pos = node.start_position(); + let end_pos = node.end_position(); + + SourceInfo::original( + file_id, + Range { + start: Location { + offset: node.start_byte(), + row: start_pos.row, + column: start_pos.column, + }, + end: Location { + offset: node.end_byte(), + row: end_pos.row, + column: end_pos.column, + }, + }, + ) +} + +/// Convert a tree-sitter Node to a SourceInfo using the primary file from ASTContext. +/// +/// This is the high-level conversion function that uses the context's primary file. +/// Most parsing code should use this variant. +/// +/// # Arguments +/// * `node` - The tree-sitter Node to convert +/// * `ctx` - The ASTContext containing the source context +/// +/// # Returns +/// A SourceInfo with Original mapping to the context's primary file. +/// If the context has no primary file, uses FileId(0) as a fallback. +pub fn node_to_source_info_with_context(node: &Node, ctx: &ASTContext) -> SourceInfo { + let file_id = ctx.primary_file_id().unwrap_or(FileId(0)); + node_to_source_info(node, file_id) +} + +/// Convert old pandoc::location::SourceInfo to new quarto-source-map::SourceInfo. +/// +/// This is a bridge function for gradual migration. It converts the old SourceInfo +/// (with filename_index) to the new SourceInfo (with FileId) using ASTContext. +/// +/// # Arguments +/// * `old_info` - The old SourceInfo from pandoc::location +/// * `ctx` - The ASTContext to resolve filename_index to FileId +/// +/// # Returns +/// A new SourceInfo with Original mapping to the appropriate file +pub fn old_to_new_source_info( + old_info: &crate::pandoc::location::SourceInfo, + ctx: &ASTContext, +) -> SourceInfo { + // Resolve filename_index to a FileId: prefer the context's primary file, + // falling back to reusing the raw index; when the old info carries no + // index at all, fall back to the primary file or FileId(0). + // TODO: In Phase 3, map filename_index to FileId properly. + let file_id = if let Some(filename_idx) = old_info.filename_index { + ctx.primary_file_id().unwrap_or(FileId(filename_idx)) + } else { + ctx.primary_file_id().unwrap_or(FileId(0)) + }; + + // Convert the Range (both use the same Location structure) + SourceInfo::original( + file_id, + Range { + start: Location { + offset: old_info.range.start.offset, + row: old_info.range.start.row, + column: old_info.range.start.column, + }, + end: Location { + offset: old_info.range.end.offset, + row: old_info.range.end.row, + column: old_info.range.end.column, + }, + }, + ) +} + +// Note: These functions are exercised through integration tests once they are +// used in the actual parsing modules; the tree-sitter-qmd parser setup is too +// complex to mock in unit tests here.
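[Editor's note] A minimal usage sketch of the bridge above, not part of the patch. The `collect_str` helper and its caller are hypothetical; `node_to_source_info`, `FileId`, and `SourceInfo` are the items introduced in source_map_compat.rs:

```rust
use quarto_source_map::{FileId, SourceInfo};
use tree_sitter::Node;

use crate::pandoc::source_map_compat::node_to_source_info;

// Hypothetical visitor step: capture where a text node came from so that
// later diagnostics can point back into the original .qmd file.
fn collect_str(node: &Node, text: String) -> (String, SourceInfo) {
    // Byte offsets and row/column positions are recorded as an `Original`
    // mapping into the primary file (FileId(0) in single-file parses).
    (text, node_to_source_info(node, FileId(0)))
}
```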
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/table.rs b/crates/quarto-markdown-pandoc/src/pandoc/table.rs index 2f77056..5fbb463 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/table.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/table.rs @@ -6,9 +6,9 @@ use crate::pandoc::attr::Attr; use crate::pandoc::block::Blocks; use crate::pandoc::caption::Caption; -use crate::pandoc::location::SourceInfo; +use serde::{Deserialize, Serialize}; -#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)] +#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] pub enum Alignment { Left, Center, @@ -16,7 +16,7 @@ pub enum Alignment { Default, } -#[derive(Debug, Clone, PartialEq, PartialOrd)] +#[derive(Debug, Clone, PartialEq, PartialOrd, Serialize, Deserialize)] pub enum ColWidth { Default, Percentage(f64), @@ -24,19 +24,19 @@ pub enum ColWidth { pub type ColSpec = (Alignment, ColWidth); -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Row { pub attr: Attr, pub cells: Vec, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct TableHead { pub attr: Attr, pub rows: Vec, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct TableBody { pub attr: Attr, pub rowhead_columns: usize, @@ -44,13 +44,13 @@ pub struct TableBody { pub body: Vec, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct TableFoot { pub attr: Attr, pub rows: Vec, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Cell { pub attr: Attr, pub alignment: Alignment, @@ -59,7 +59,7 @@ pub struct Cell { pub content: Blocks, } -#[derive(Debug, Clone, PartialEq)] +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct Table { pub attr: Attr, pub caption: Caption, @@ -67,5 +67,5 @@ pub struct Table { pub head: TableHead, pub bodies: Vec, pub foot: TableFoot, - pub source_info: SourceInfo, + pub source_info: quarto_source_map::SourceInfo, } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs index e2a53ed..b2f926e 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs @@ -56,10 +56,7 @@ use crate::pandoc::inline::{ Emph, Inline, Note, RawInline, Space, Str, Strikeout, Strong, Subscript, Superscript, }; use crate::pandoc::list::{ListAttributes, ListNumberDelim, ListNumberStyle}; -use crate::pandoc::location::{ - Range, SourceInfo, empty_source_info, node_location, node_source_info, - node_source_info_with_context, -}; +use crate::pandoc::location::{node_location, node_source_info, node_source_info_with_context}; use crate::pandoc::pandoc::Pandoc; use core::panic; use once_cell::sync::Lazy; @@ -70,7 +67,7 @@ use crate::traversals::bottomup_traverse_concrete_tree; use treesitter_utils::pandocnativeintermediate::PandocNativeIntermediate; -fn get_block_source_info(block: &Block) -> &SourceInfo { +fn get_block_source_info(block: &Block) -> &quarto_source_map::SourceInfo { match block { Block::Plain(b) => &b.source_info, Block::Paragraph(b) => &b.source_info, @@ -106,7 +103,7 @@ fn process_list( // but the next item might not itself be a paragraph. 
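// Illustration (editor's note, hypothetical input): a blank line between
// items makes a list loose, so each item body is parsed as a Paragraph;
// without the blank line the items stay tight and become Plain blocks:
//
//   - first item
//
//   - second item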
let mut has_loose_item = false; - let mut last_para_range: Option = None; + let mut last_para_range: Option = None; let mut last_item_end_row: Option = None; let mut list_items: Vec = Vec::new(); let mut is_ordered_list: Option = None; @@ -356,6 +353,7 @@ fn process_native_inline( whitespace_re: &Regex, inline_buf: &mut T, node_text_fn: impl Fn() -> String, + node_source_info_fn: impl Fn() -> quarto_source_map::SourceInfo, context: &ASTContext, ) -> Inline { match child { @@ -363,24 +361,16 @@ fn process_native_inline( PandocNativeIntermediate::IntermediateBaseText(text, range) => { if let Some(_) = whitespace_re.find(&text) { Inline::Space(Space { - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, + source_info: quarto_source_map::SourceInfo::original( + context.current_file_id(), range, ), }) } else { Inline::Str(Str { text: apply_smart_quotes(text), - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, + source_info: quarto_source_map::SourceInfo::original( + context.current_file_id(), range, ), }) @@ -405,7 +395,7 @@ fn process_native_inline( Inline::RawInline(RawInline { format: "quarto-internal-leftover".to_string(), text: node_text_fn(), - source_info: empty_source_info(), + source_info: node_source_info_fn(), }) } other => { @@ -418,7 +408,7 @@ fn process_native_inline( Inline::RawInline(RawInline { format: "quarto-internal-leftover".to_string(), text: node_text_fn(), - source_info: empty_source_info(), + source_info: node_source_info_fn(), }) } } @@ -441,24 +431,16 @@ fn process_native_inlines( PandocNativeIntermediate::IntermediateBaseText(text, range) => { if let Some(_) = whitespace_re.find(&text) { inlines.push(Inline::Space(Space { - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, + source_info: quarto_source_map::SourceInfo::original( + context.current_file_id(), range, ), })) } else { inlines.push(Inline::Str(Str { text: apply_smart_quotes(text), - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, + source_info: quarto_source_map::SourceInfo::original( + context.current_file_id(), range, ), })) @@ -501,6 +483,7 @@ fn native_visitor( let value = node_text(); PandocNativeIntermediate::IntermediateBaseText(extract_quoted_text(&value), location) }; + let node_source_info_fn = || node_source_info_with_context(node, context); let native_inline = |(node_name, child)| { process_native_inline( node_name, @@ -508,6 +491,7 @@ fn native_visitor( &whitespace_re, &mut inline_buf, &node_text, + &node_source_info_fn, context, ) }; @@ -647,7 +631,7 @@ fn native_visitor( Inline::Note(Note { content: vec![Block::Paragraph(Paragraph { content: inlines, - source_info: SourceInfo::with_range(node_location(node)), + source_info: node_source_info(node), })], source_info: node_source_info(node), }) @@ -732,13 +716,13 @@ fn native_visitor( result } -pub fn treesitter_to_pandoc( +pub fn treesitter_to_pandoc( buf: &mut T, tree: &tree_sitter_qmd::MarkdownTree, input_bytes: &[u8], context: &ASTContext, - error_collector: &mut E, -) -> Result> { + error_collector: &mut crate::utils::diagnostic_collector::DiagnosticCollector, +) -> Result> { let result = bottomup_traverse_concrete_tree( &mut tree.walk(), &mut |node, children, input_bytes, context| { @@ -753,8 +737,12 @@ pub fn treesitter_to_pandoc doc, Err(()) => { - // Postprocess found errors, return the error messages from the collector - return 
Err(error_collector.messages()); + // Postprocessing found errors. The collector is only borrowed here, so + // clone the accumulated diagnostics out of it and return them. + let diagnostics = error_collector.diagnostics().to_vec(); + return Err(diagnostics); } }; let result = merge_strs(result); diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/block_quote.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/block_quote.rs index 752f95d..c427e90 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/block_quote.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/block_quote.rs @@ -8,7 +8,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::block::{Block, BlockQuote, Blocks, RawBlock}; -use crate::pandoc::location::{SourceInfo, node_source_info_with_context}; +use crate::pandoc::location::node_source_info_with_context; use std::io::Write; use super::pandocnativeintermediate::PandocNativeIntermediate; @@ -46,7 +46,10 @@ pub fn process_block_quote( content.push(Block::RawBlock(RawBlock { format: "quarto_minus_metadata".to_string(), text, - source_info: SourceInfo::with_range(range), + source_info: quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + range, + ), })); } _ => { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/citation.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/citation.rs index d50df07..48962ff 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/citation.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/citation.rs @@ -8,7 +8,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::inline::{Citation, CitationMode, Cite, Inline, Str}; -use crate::pandoc::location::node_source_info_with_context; +use crate::pandoc::source_map_compat; use super::pandocnativeintermediate::PandocNativeIntermediate; @@ -57,8 +57,8 @@ where }], content: vec![Inline::Str(Str { text: node_text(), - source_info: node_source_info_with_context(node, context), + source_info: source_map_compat::node_to_source_info_with_context(node, context), })], - source_info: node_source_info_with_context(node, context), + source_info: source_map_compat::node_to_source_info_with_context(node, context), })) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/code_span.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/code_span.rs index d3d79dc..13839e0 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/code_span.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/code_span.rs @@ -33,7 +33,7 @@ pub fn process_code_span( // IntermediateUnknown here "consumes" the node ( node_name, - PandocNativeIntermediate::IntermediateUnknown(range.range), + PandocNativeIntermediate::IntermediateUnknown(range.range.clone()), ) } PandocNativeIntermediate::IntermediateRawFormat(raw, _) => { @@ -41,7 +41,7 @@ pub fn process_code_span( // IntermediateUnknown here "consumes" the node ( node_name, - PandocNativeIntermediate::IntermediateUnknown(range.range), + PandocNativeIntermediate::IntermediateUnknown(range.range.clone()), ) } PandocNativeIntermediate::IntermediateBaseText(text, range) => { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/document.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/document.rs index b1b9a0c..e03808c
100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/document.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/document.rs @@ -9,7 +9,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::block::{Block, RawBlock}; use crate::pandoc::location::node_source_info_with_context; -use crate::pandoc::pandoc::{Meta, Pandoc}; +use crate::pandoc::pandoc::{MetaValueWithSourceInfo, Pandoc}; use super::pandocnativeintermediate::PandocNativeIntermediate; @@ -37,7 +37,8 @@ pub fn process_document( } }); PandocNativeIntermediate::IntermediatePandoc(Pandoc { - meta: Meta::default(), + // Legitimate default: Initial document creation - metadata populated later from YAML + meta: MetaValueWithSourceInfo::default(), blocks, }) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/editorial_marks.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/editorial_marks.rs index 7126ffd..bbc75f3 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/editorial_marks.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/editorial_marks.rs @@ -8,7 +8,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::inline::{Delete, EditComment, Highlight, Inline, Inlines, Insert, Space, Str}; -use crate::pandoc::location::{SourceInfo, node_source_info_with_context}; +use crate::pandoc::location::node_source_info_with_context; use once_cell::sync::Lazy; use regex::Regex; use std::collections::HashMap; @@ -44,26 +44,12 @@ macro_rules! process_editorial_mark { PandocNativeIntermediate::IntermediateBaseText(text, range) => { if let Some(_) = whitespace_re.find(&text) { content.push(Inline::Space(Space { - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, - range, - ), + source_info: quarto_source_map::SourceInfo::original(context.current_file_id(), range), })) } else { content.push(Inline::Str(Str { text: apply_smart_quotes(text), - source_info: SourceInfo::new( - if context.filenames.is_empty() { - None - } else { - Some(0) - }, - range, - ), + source_info: quarto_source_map::SourceInfo::original(context.current_file_id(), range), })) } } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/fenced_div_block.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/fenced_div_block.rs index 33168be..a7c746c 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/fenced_div_block.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/fenced_div_block.rs @@ -9,7 +9,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::attr::Attr; use crate::pandoc::block::{Block, Div, RawBlock}; -use crate::pandoc::location::{SourceInfo, node_source_info_with_context}; +use crate::pandoc::location::node_source_info_with_context; use std::collections::HashMap; use std::io::Write; @@ -67,7 +67,10 @@ pub fn process_fenced_div_block( content.push(Block::RawBlock(RawBlock { format: "quarto_minus_metadata".to_string(), text, - source_info: SourceInfo::with_range(range), + source_info: quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + range, + ), })); } _ => { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/inline_link.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/inline_link.rs index cf56184..d9d7551 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/inline_link.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/inline_link.rs @@ 
-9,7 +9,6 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::attr::{Attr, is_empty_attr}; use crate::pandoc::inline::{Inline, is_empty_target, make_cite_inline, make_span_inline}; -use crate::pandoc::location::node_source_info_with_context; use std::collections::HashMap; use std::io::Write; @@ -75,14 +74,14 @@ where attr, target, content, - node_source_info_with_context(node, context), + crate::pandoc::source_map_compat::node_to_source_info_with_context(node, context), ) } else { make_span_inline( attr, target, content, - node_source_info_with_context(node, context), + crate::pandoc::source_map_compat::node_to_source_info_with_context(node, context), ) }) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/note_reference.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/note_reference.rs index 1d149c8..a3355e2 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/note_reference.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/note_reference.rs @@ -33,6 +33,6 @@ pub fn process_note_reference( } PandocNativeIntermediate::IntermediateInline(Inline::NoteReference(NoteReference { id, - range: node_source_info_with_context(node, context).range, + source_info: node_source_info_with_context(node, context), })) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/pandocnativeintermediate.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/pandocnativeintermediate.rs index 54a3198..1a1a253 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/pandocnativeintermediate.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/pandocnativeintermediate.rs @@ -8,10 +8,10 @@ use crate::pandoc::block::{Block, Blocks}; use crate::pandoc::inline::{Inline, Inlines}; use crate::pandoc::list::ListAttributes; -use crate::pandoc::location::Range; use crate::pandoc::pandoc::Pandoc; use crate::pandoc::shortcode::ShortcodeArg; use crate::pandoc::table::{Alignment, Cell, Row}; +use quarto_source_map::Range; use std::collections::HashMap; #[derive(Debug, Clone, PartialEq)] diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs index d6fe038..ae8f0df 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs @@ -10,11 +10,11 @@ use crate::pandoc::attr::{Attr, is_empty_attr}; use crate::pandoc::block::{Block, Blocks, DefinitionList, Div, Figure, Plain}; use crate::pandoc::caption::Caption; use crate::pandoc::inline::{Inline, Inlines, Space, Span, Str, Superscript}; -use crate::pandoc::location::{Range, SourceInfo, empty_range, empty_source_info}; +use crate::pandoc::location::empty_source_info; use crate::pandoc::pandoc::Pandoc; use crate::pandoc::shortcode::shortcode_to_span; use crate::utils::autoid; -use crate::utils::error_collector::ErrorCollector; +use crate::utils::diagnostic_collector::DiagnosticCollector; use std::cell::RefCell; use std::collections::HashMap; @@ -145,11 +145,8 @@ pub fn coalesce_abbreviations(inlines: Vec) -> (Vec, bool) { } // Create the Str node (possibly coalesced) - let source_info = if j > i + 1 { - SourceInfo::with_range(Range { - start: start_info.range.start.clone(), - end: end_info.range.end.clone(), - }) + let source_info = if did_coalesce { + start_info.combine(&end_info) } else { start_info }; @@ -262,7 +259,7 @@ fn 
transform_definition_list_div(div: Div) -> Block { } /// Apply post-processing transformations to the Pandoc AST -pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> Result { +pub fn postprocess(doc: Pandoc, error_collector: &mut DiagnosticCollector) -> Result { let result = { // Wrap error_collector in RefCell for interior mutability across multiple closures let error_collector_ref = RefCell::new(error_collector); @@ -351,6 +348,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R let mut new_image = image.clone(); new_image.attr = image_attr; // FIXME all source location is broken here + // TODO: Should propagate from image.source_info and para.source_info FilterResult( vec![Block::Figure(Figure { attr: figure_attr, @@ -358,14 +356,17 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R short: None, long: Some(vec![Block::Plain(Plain { content: image.content.clone(), - source_info: SourceInfo::with_range(empty_range()), + // TODO: Should derive from image.content inlines + source_info: quarto_source_map::SourceInfo::default(), })]), }, content: vec![Block::Plain(Plain { content: vec![Inline::Image(new_image)], - source_info: SourceInfo::with_range(empty_range()), + // TODO: Should use image.source_info + source_info: quarto_source_map::SourceInfo::default(), })], - source_info: SourceInfo::with_range(empty_range()), + // TODO: Should use para.source_info + source_info: quarto_source_map::SourceInfo::default(), })], true, ) @@ -383,7 +384,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R }) .with_note_reference(|note_ref| { let mut kv = HashMap::new(); - kv.insert("reference-id".to_string(), note_ref.id); + kv.insert("reference-id".to_string(), note_ref.id.clone()); FilterResult( vec![Inline::Span(Span { attr: ( @@ -392,7 +393,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R kv, ), content: vec![], - source_info: empty_source_info(), + source_info: note_ref.source_info, })], false, ) @@ -405,7 +406,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R vec![Inline::Span(Span { attr: (insert.attr.0, classes, insert.attr.2), content, - source_info: empty_source_info(), + source_info: insert.source_info, })], true, ) @@ -418,7 +419,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R vec![Inline::Span(Span { attr: (delete.attr.0, classes, delete.attr.2), content, - source_info: empty_source_info(), + source_info: delete.source_info, })], true, ) @@ -431,7 +432,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R vec![Inline::Span(Span { attr: (highlight.attr.0, classes, highlight.attr.2), content, - source_info: empty_source_info(), + source_info: highlight.source_info, })], true, ) @@ -444,7 +445,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R vec![Inline::Span(Span { attr: (edit_comment.attr.0, classes, edit_comment.attr.2), content, - source_info: empty_source_info(), + source_info: edit_comment.source_info, })], true, ) @@ -474,7 +475,8 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R math_processed.push(Inline::Span(Span { attr: (attr.0.clone(), classes, attr.2.clone()), content: vec![Inline::Math(math.clone())], - source_info: empty_source_info(), + // TODO: Should combine() source info from math and attr (see k-82) + source_info: quarto_source_map::SourceInfo::default(), })); // Skip the Math, optional Space, and Attr @@ -555,7 +557,8 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R // bracket attached to the first word and 
closing bracket to the last word // e.g., "@knuth [p. 33]" becomes: Str("@knuth"), Space, Str("[p."), Space, Str("33]") cite.content.push(Inline::Space(Space { - source_info: SourceInfo::with_range(empty_range()), + // Synthetic Space: inserted to separate citation from suffix + source_info: quarto_source_map::SourceInfo::default(), })); // The span content may have been merged into a single string, so we need to @@ -569,9 +572,7 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R if i > 0 { bracketed_content.push(Inline::Space( Space { - source_info: SourceInfo::with_range( - empty_range(), - ), + source_info: empty_source_info(), }, )); } @@ -616,7 +617,8 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R result.push(Inline::Cite(cite)); } result.push(Inline::Space(Space { - source_info: SourceInfo::with_range(empty_range()), + // Synthetic Space: restore space between cite and invalid span + source_info: quarto_source_map::SourceInfo::default(), })); result.push(inline); state = 0; @@ -627,7 +629,8 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R result.push(Inline::Cite(cite)); } result.push(Inline::Space(Space { - source_info: SourceInfo::with_range(empty_range()), + // Synthetic Space: restore space between cite and non-span element + source_info: quarto_source_map::SourceInfo::default(), })); result.push(inline); state = 0; @@ -642,7 +645,8 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R result.push(Inline::Cite(cite)); if state == 2 { result.push(Inline::Space(Space { - source_info: SourceInfo::with_range(empty_range()), + // Synthetic Space: restore trailing space after incomplete citation pattern + source_info: quarto_source_map::SourceInfo::default(), })); } } @@ -651,13 +655,10 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R }) .with_attr(|attr| { // TODO: Add source location when attr has it - error_collector_ref.borrow_mut().error( - format!( - "Found attr in postprocess: {:?} - this should have been removed", - attr - ), - None, - ); + error_collector_ref.borrow_mut().error(format!( + "Found attr in postprocess: {:?} - this should have been removed", + attr + )); FilterResult(vec![], false) }) .with_blocks(|blocks| { @@ -709,12 +710,9 @@ pub fn postprocess(doc: Pandoc, error_collector: &mut E) -> R // Don't add the CaptionBlock to the result (it's now attached) } else { // Issue a warning when caption has no preceding table - error_collector_ref.borrow_mut().warn( + error_collector_ref.borrow_mut().warn_at( "Caption found without a preceding table".to_string(), - Some(&crate::utils::error_collector::SourceInfo::new( - caption_block.source_info.range.start.row + 1, - caption_block.source_info.range.start.column + 1, - )), + caption_block.source_info.clone(), ); // Remove the caption from the output (don't add to result) } @@ -757,7 +755,7 @@ pub fn merge_strs(pandoc: Pandoc) -> Pandoc { pandoc, &mut Filter::new().with_inlines(|inlines| { let mut current_str: Option = None; - let mut current_source_info: Option = None; + let mut current_source_info: Option = None; let mut result: Inlines = Vec::new(); let mut did_merge = false; for inline in inlines { diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/text_helpers.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/text_helpers.rs index b167b7e..94d5a4d 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/text_helpers.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/text_helpers.rs @@ 
-4,7 +4,7 @@ */ use crate::pandoc::inline::{Inline, LineBreak, SoftBreak}; -use crate::pandoc::location::{SourceInfo, node_location}; +use crate::pandoc::location::node_location; use crate::pandoc::treesitter_utils::pandocnativeintermediate::PandocNativeIntermediate; use once_cell::sync::Lazy; use regex::Regex; @@ -119,11 +119,17 @@ pub fn create_line_break_inline( let range = node_location(node); let inline = if is_hard { Inline::LineBreak(LineBreak { - source_info: SourceInfo::with_range(range), + source_info: quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + range, + ), }) } else { Inline::SoftBreak(SoftBreak { - source_info: SourceInfo::with_range(range), + source_info: quarto_source_map::SourceInfo::original( + quarto_source_map::FileId(0), + range, + ), }) }; PandocNativeIntermediate::IntermediateInline(inline) diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/thematic_break.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/thematic_break.rs index 209794e..b88db9d 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/thematic_break.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/thematic_break.rs @@ -5,7 +5,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::block::{Block, HorizontalRule}; -use crate::pandoc::location::node_source_info_with_context; +use crate::pandoc::source_map_compat; use crate::pandoc::treesitter_utils::pandocnativeintermediate::PandocNativeIntermediate; /// Process a thematic break (horizontal rule) @@ -14,6 +14,6 @@ pub fn process_thematic_break( context: &ASTContext, ) -> PandocNativeIntermediate { PandocNativeIntermediate::IntermediateBlock(Block::HorizontalRule(HorizontalRule { - source_info: node_source_info_with_context(node, context), + source_info: source_map_compat::node_to_source_info_with_context(node, context), })) } diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/uri_autolink.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/uri_autolink.rs index 3b1cc25..bec32c4 100644 --- a/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/uri_autolink.rs +++ b/crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/uri_autolink.rs @@ -8,7 +8,7 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::inline::{Inline, Link, Str}; -use crate::pandoc::location::node_source_info_with_context; +use crate::pandoc::source_map_compat; use std::collections::HashMap; use super::pandocnativeintermediate::PandocNativeIntermediate; @@ -31,10 +31,10 @@ pub fn process_uri_autolink( PandocNativeIntermediate::IntermediateInline(Inline::Link(Link { content: vec![Inline::Str(Str { text: content.to_string(), - source_info: node_source_info_with_context(node, context), + source_info: source_map_compat::node_to_source_info_with_context(node, context), })], attr, target: (content.to_string(), "".to_string()), - source_info: node_source_info_with_context(node, context), + source_info: source_map_compat::node_to_source_info_with_context(node, context), })) } diff --git a/crates/quarto-markdown-pandoc/src/readers/json.rs b/crates/quarto-markdown-pandoc/src/readers/json.rs index 81f9c12..5300f8e 100644 --- a/crates/quarto-markdown-pandoc/src/readers/json.rs +++ b/crates/quarto-markdown-pandoc/src/readers/json.rs @@ -5,19 +5,22 @@ use crate::pandoc::ast_context::ASTContext; use crate::pandoc::block::MetaBlock; -use crate::pandoc::location::{Location, Range, SourceInfo}; +use crate::pandoc::location::{Location, Range}; +use 
crate::pandoc::meta::MetaMapEntry; use crate::pandoc::table::{ Alignment, Cell, ColSpec, ColWidth, Row, Table, TableBody, TableFoot, TableHead, }; use crate::pandoc::{ Attr, Block, BlockQuote, BulletList, Caption, Citation, CitationMode, Cite, Code, CodeBlock, DefinitionList, Div, Emph, Figure, Header, HorizontalRule, Image, Inline, Inlines, LineBlock, - Link, ListAttributes, ListNumberDelim, ListNumberStyle, Math, MathType, Meta, MetaValue, Note, - OrderedList, Pandoc, Paragraph, Plain, QuoteType, Quoted, RawBlock, RawInline, SmallCaps, - SoftBreak, Space, Span, Str, Strikeout, Strong, Subscript, Superscript, Underline, + Link, ListAttributes, ListNumberDelim, ListNumberStyle, Math, MathType, + MetaValueWithSourceInfo, Note, OrderedList, Pandoc, Paragraph, Plain, QuoteType, Quoted, + RawBlock, RawInline, SmallCaps, SoftBreak, Space, Span, Str, Strikeout, Strong, Subscript, + Superscript, Underline, }; -use hashlink::LinkedHashMap; +use quarto_source_map::{FileId, RangeMapping, SourceMapping}; use serde_json::Value; +use std::rc::Rc; #[derive(Debug)] pub enum JsonReadError { @@ -25,6 +28,9 @@ pub enum JsonReadError { MissingField(String), InvalidType(String), UnsupportedVariant(String), + InvalidSourceInfoRef(usize), + ExpectedSourceInfoRef, + MalformedSourceInfoPool, } impl std::fmt::Display for JsonReadError { @@ -36,6 +42,15 @@ impl std::fmt::Display for JsonReadError { JsonReadError::UnsupportedVariant(variant) => { write!(f, "Unsupported variant: {}", variant) } + JsonReadError::InvalidSourceInfoRef(id) => { + write!(f, "Invalid SourceInfo reference ID: {}", id) + } + JsonReadError::ExpectedSourceInfoRef => { + write!(f, "Expected SourceInfo $ref, got inline SourceInfo") + } + JsonReadError::MalformedSourceInfoPool => { + write!(f, "Malformed sourceInfoPool in astContext") + } } } } @@ -44,6 +59,270 @@ impl std::error::Error for JsonReadError {} type Result = std::result::Result; +/// Deserializer that reconstructs SourceInfo objects from a pool. +/// +/// During JSON deserialization, the sourceInfoPool from astContext is parsed +/// into a Vec. References in the AST ({"$ref": id}) are resolved +/// by looking up the ID in this pool. +/// +/// The pool must be built in topological order (parents before children) so +/// that when we reconstruct a SourceInfo with a parent_id, the parent already +/// exists in the pool. 
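+///
+/// Illustrative pool (editor's sketch of the compact format parsed below;
+/// ids are implicit array indices, so entry 1 can refer back to entry 0):
+///
+/// [
+///   {"r": [0, 0, 0, 10, 0, 10], "t": 0, "d": 3},
+///   {"r": [2, 0, 2, 6, 0, 6], "t": 1, "d": [0, 2]}
+/// ]
+///
+/// Entry 0 is an Original range in FileId(3); entry 1 is a Substring of
+/// entry 0 starting at offset 2.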
+struct SourceInfoDeserializer { + pool: Vec, +} + +impl SourceInfoDeserializer { + /// Create a new empty deserializer (for documents without SourceInfo) + fn empty() -> Self { + SourceInfoDeserializer { pool: Vec::new() } + } + + /// Build the pool from the sourceInfoPool JSON array (compact format) + /// + /// Compact format: {"r": [start_off, start_row, start_col, end_off, end_row, end_col], "t": type_code, "d": data} + /// ID is implicit from array index + fn new(pool_json: &Value) -> Result { + let pool_array = pool_json + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + + let mut pool: Vec = Vec::with_capacity(pool_array.len()); + + // Build pool in order - parents must come before children + for item in pool_array { + // Parse range from "r" array: [start_offset, start_row, start_col, end_offset, end_row, end_col] + let range_array = item + .get("r") + .and_then(|v| v.as_array()) + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + + if range_array.len() != 6 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + + let range = quarto_source_map::Range { + start: quarto_source_map::Location { + offset: range_array[0] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + row: range_array[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + column: range_array[2] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + }, + end: quarto_source_map::Location { + offset: range_array[3] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + row: range_array[4] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + column: range_array[5] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + }, + }; + + // Parse type code from "t" + let type_code = + item.get("t") + .and_then(|v| v.as_u64()) + .ok_or(JsonReadError::MalformedSourceInfoPool)? as usize; + + // Parse data from "d" + let data = item + .get("d") + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + + let mapping = match type_code { + 0 => { + // Original: data is file_id (number) + let file_id = data + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + SourceMapping::Original { + file_id: FileId(file_id), + } + } + 1 => { + // Substring: data is [parent_id, offset] + let data_array = data + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if data_array.len() != 2 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + let parent_id = data_array[0] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + let offset = data_array[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + + let parent = pool + .get(parent_id) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + .clone(); + + SourceMapping::Substring { + parent: Rc::new(parent), + offset, + } + } + 2 => { + // Concat: data is [[source_info_id, offset_in_concat, length], ...] + let pieces_array = data + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + + let pieces: Result> = pieces_array + .iter() + .map(|piece_array| { + let piece = piece_array + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if piece.len() != 3 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + let source_info_id = piece[0] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + let offset_in_concat = piece[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? 
+ as usize; + let length = piece[2] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + + let source_info = pool + .get(source_info_id) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + .clone(); + + Ok(quarto_source_map::SourcePiece { + source_info, + offset_in_concat, + length, + }) + }) + .collect(); + + SourceMapping::Concat { pieces: pieces? } + } + 3 => { + // Transformed: data is [parent_id, [[from_start, from_end, to_start, to_end], ...]] + let data_array = data + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if data_array.len() != 2 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + let parent_id = data_array[0] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize; + let mapping_array = data_array[1] + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + + let range_mappings: Result> = mapping_array + .iter() + .map(|rm_array| { + let rm = rm_array + .as_array() + .ok_or(JsonReadError::MalformedSourceInfoPool)?; + if rm.len() != 4 { + return Err(JsonReadError::MalformedSourceInfoPool); + } + Ok(RangeMapping { + from_start: rm[0] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + from_end: rm[1] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + to_start: rm[2] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + to_end: rm[3] + .as_u64() + .ok_or(JsonReadError::MalformedSourceInfoPool)? + as usize, + }) + }) + .collect(); + + let parent = pool + .get(parent_id) + .ok_or(JsonReadError::MalformedSourceInfoPool)? + .clone(); + + SourceMapping::Transformed { + parent: Rc::new(parent), + mapping: range_mappings?, + } + } + _ => { + return Err(JsonReadError::MalformedSourceInfoPool); + } + }; + + pool.push(quarto_source_map::SourceInfo { range, mapping }); + } + + Ok(SourceInfoDeserializer { pool }) + } + + /// Resolve a numeric reference to a SourceInfo + fn from_json_ref(&self, value: &Value) -> Result { + if let Some(ref_id) = value.as_u64() { + let id = ref_id as usize; + self.pool + .get(id) + .cloned() + .ok_or(JsonReadError::InvalidSourceInfoRef(id)) + } else { + Err(JsonReadError::ExpectedSourceInfoRef) + } + } +} + +/// Convert from old JSON format (filename_index, range) to new SourceInfo +fn make_source_info(filename_index: Option, range: Range) -> quarto_source_map::SourceInfo { + let file_id = FileId(filename_index.unwrap_or(0)); + let qsm_range = quarto_source_map::Range { + start: quarto_source_map::Location { + offset: range.start.offset, + row: range.start.row, + column: range.start.column, + }, + end: quarto_source_map::Location { + offset: range.end.offset, + row: range.end.row, + column: range.end.column, + }, + }; + quarto_source_map::SourceInfo::original(file_id, qsm_range) +} + fn empty_range() -> Range { Range { start: Location { @@ -160,7 +439,7 @@ fn read_citation_mode(value: &Value) -> Result { } } -fn read_inline(value: &Value) -> Result { +fn read_inline(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let obj = value .as_object() .ok_or_else(|| JsonReadError::InvalidType("Expected object for Inline".to_string()))?; @@ -182,7 +461,7 @@ fn read_inline(value: &Value) -> Result { .to_string(); Ok(Inline::Str(Str { text, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Space" => { @@ -191,7 +470,7 @@ fn read_inline(value: &Value) -> Result { .and_then(read_location) .unwrap_or_else(|| (None, empty_range())); 
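// Legacy location format (editor's note): the "l" field holds
// (filename_index, range); nodes without a recorded location fall
// back to an empty range.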
Ok(Inline::Space(Space { - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "LineBreak" => { @@ -200,7 +479,7 @@ fn read_inline(value: &Value) -> Result { .and_then(read_location) .unwrap_or_else(|| (None, empty_range())); Ok(Inline::LineBreak(crate::pandoc::inline::LineBreak { - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "SoftBreak" => { @@ -209,27 +488,27 @@ fn read_inline(value: &Value) -> Result { .and_then(read_location) .unwrap_or_else(|| (None, empty_range())); Ok(Inline::SoftBreak(SoftBreak { - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "Emph" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Emph(Emph { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Strong" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Strong(Strong { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Code" => { @@ -252,7 +531,7 @@ fn read_inline(value: &Value) -> Result { Ok(Inline::Code(Code { attr, text, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Math" => { @@ -293,57 +572,57 @@ fn read_inline(value: &Value) -> Result { Ok(Inline::Math(Math { math_type, text, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Underline" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Underline(Underline { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Strikeout" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Strikeout(Strikeout { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Superscript" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Superscript(Superscript { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Subscript" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::Subscript(Subscript { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "SmallCaps" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Inline::SmallCaps(SmallCaps { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, 
empty_range()), })) } "Quoted" => { @@ -377,11 +656,11 @@ fn read_inline(value: &Value) -> Result { } }; - let content = read_inlines(&arr[1])?; + let content = read_inlines(&arr[1], deserializer)?; Ok(Inline::Quoted(Quoted { quote_type, content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Link" => { @@ -398,7 +677,7 @@ fn read_inline(value: &Value) -> Result { } let attr = read_attr(&arr[0])?; - let content = read_inlines(&arr[1])?; + let content = read_inlines(&arr[1], deserializer)?; let target_arr = arr[2].as_array().ok_or_else(|| { JsonReadError::InvalidType("Link target must be array".to_string()) @@ -422,7 +701,7 @@ fn read_inline(value: &Value) -> Result { attr, content, target, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "RawInline" => { @@ -452,7 +731,7 @@ fn read_inline(value: &Value) -> Result { Ok(Inline::RawInline(RawInline { format, text, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Image" => { @@ -469,7 +748,7 @@ fn read_inline(value: &Value) -> Result { } let attr = read_attr(&arr[0])?; - let content = read_inlines(&arr[1])?; + let content = read_inlines(&arr[1], deserializer)?; let target_arr = arr[2].as_array().ok_or_else(|| { JsonReadError::InvalidType("Image target must be array".to_string()) @@ -495,7 +774,7 @@ fn read_inline(value: &Value) -> Result { attr, content, target, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Span" => { @@ -512,21 +791,21 @@ fn read_inline(value: &Value) -> Result { } let attr = read_attr(&arr[0])?; - let content = read_inlines(&arr[1])?; + let content = read_inlines(&arr[1], deserializer)?; Ok(Inline::Span(Span { attr, content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Note" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_blocks(c)?; + let content = read_blocks(c, deserializer)?; Ok(Inline::Note(Note { content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } "Cite" => { @@ -561,14 +840,18 @@ fn read_inline(value: &Value) -> Result { .ok_or_else(|| JsonReadError::MissingField("citationId".to_string()))? 
.to_string(); - let prefix = - read_inlines(citation_obj.get("citationPrefix").ok_or_else(|| { + let prefix = read_inlines( + citation_obj.get("citationPrefix").ok_or_else(|| { JsonReadError::MissingField("citationPrefix".to_string()) - })?)?; - let suffix = - read_inlines(citation_obj.get("citationSuffix").ok_or_else(|| { + })?, + deserializer, + )?; + let suffix = read_inlines( + citation_obj.get("citationSuffix").ok_or_else(|| { JsonReadError::MissingField("citationSuffix".to_string()) - })?)?; + })?, + deserializer, + )?; let mode = read_citation_mode(citation_obj.get("citationMode").ok_or_else(|| { @@ -599,23 +882,23 @@ fn read_inline(value: &Value) -> Result { .collect::>>()?; // Second element is the content inlines - let content = read_inlines(&c_arr[1])?; + let content = read_inlines(&c_arr[1], deserializer)?; Ok(Inline::Cite(Cite { citations, content, - source_info: SourceInfo::new(None, empty_range()), + source_info: make_source_info(None, empty_range()), })) } _ => Err(JsonReadError::UnsupportedVariant(format!("Inline: {}", t))), } } -fn read_inlines(value: &Value) -> Result { +fn read_inlines(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for Inlines".to_string()))?; - arr.iter().map(read_inline).collect() + arr.iter().map(|v| read_inline(v, deserializer)).collect() } fn read_ast_context(value: &Value) -> Result { @@ -643,6 +926,7 @@ fn read_ast_context(value: &Value) -> Result { Ok(ASTContext { filenames, example_list_counter: std::cell::Cell::new(1), + source_context: quarto_source_map::SourceContext::new(), }) } @@ -663,31 +947,65 @@ fn read_pandoc(value: &Value) -> Result<(Pandoc, ASTContext)> { // We could validate the API version here if needed // let _api_version = obj.get("pandoc-api-version"); - let meta = read_meta( + // Read astContext first (we need it for key sources and source info pool) + let context = if let Some(ast_context_val) = obj.get("astContext") { + read_ast_context(ast_context_val)? + } else { + // If no astContext is present, create an empty one for backward compatibility + ASTContext::new() + }; + + // Extract sourceInfoPool and create deserializer + let deserializer = if let Some(ast_context_val) = obj.get("astContext") { + if let Some(ast_context_obj) = ast_context_val.as_object() { + if let Some(pool_json) = ast_context_obj.get("sourceInfoPool") { + SourceInfoDeserializer::new(pool_json)? + } else { + SourceInfoDeserializer::empty() + } + } else { + SourceInfoDeserializer::empty() + } + } else { + SourceInfoDeserializer::empty() + }; + + // Extract metaTopLevelKeySources if present + let key_sources = if let Some(ast_context_val) = obj.get("astContext") { + if let Some(ast_context_obj) = ast_context_val.as_object() { + if let Some(key_sources_val) = ast_context_obj.get("metaTopLevelKeySources") { + Some(key_sources_val) + } else { + None + } + } else { + None + } + } else { + None + }; + + let meta = read_meta_with_key_sources( obj.get("meta") .ok_or_else(|| JsonReadError::MissingField("meta".to_string()))?, + key_sources, + &deserializer, )?; let blocks = read_blocks( obj.get("blocks") .ok_or_else(|| JsonReadError::MissingField("blocks".to_string()))?, + &deserializer, )?; - let context = if let Some(ast_context_val) = obj.get("astContext") { - read_ast_context(ast_context_val)? 
- } else { - // If no astContext is present, create an empty one for backward compatibility - ASTContext::new() - }; - Ok((Pandoc { meta, blocks }, context)) } -fn read_blockss(value: &Value) -> Result>> { +fn read_blockss(value: &Value, deserializer: &SourceInfoDeserializer) -> Result>> { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for blockss".to_string()))?; arr.iter() - .map(|blocks_val| read_blocks(blocks_val)) + .map(|blocks_val| read_blocks(blocks_val, deserializer)) .collect() } @@ -751,7 +1069,7 @@ fn read_list_attributes(value: &Value) -> Result { Ok((start_num, number_style, number_delimiter)) } -fn read_caption(value: &Value) -> Result { +fn read_caption(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for Caption".to_string()))?; @@ -765,23 +1083,23 @@ fn read_caption(value: &Value) -> Result { let short = if arr[0].is_null() { None } else { - Some(read_inlines(&arr[0])?) + Some(read_inlines(&arr[0], deserializer)?) }; let long = if arr[1].is_null() { None } else { - Some(read_blocks(&arr[1])?) + Some(read_blocks(&arr[1], deserializer)?) }; Ok(Caption { short, long }) } -fn read_blocks(value: &Value) -> Result> { +fn read_blocks(value: &Value, deserializer: &SourceInfoDeserializer) -> Result> { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for blocks".to_string()))?; - arr.iter().map(read_block).collect() + arr.iter().map(|v| read_block(v, deserializer)).collect() } fn read_alignment(value: &Value) -> Result { @@ -848,7 +1166,7 @@ fn read_colspec(value: &Value) -> Result { Ok((alignment, colwidth)) } -fn read_cell(value: &Value) -> Result { +fn read_cell(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for Cell".to_string()))?; @@ -869,7 +1187,7 @@ fn read_cell(value: &Value) -> Result { .as_u64() .ok_or_else(|| JsonReadError::InvalidType("Cell col_span must be number".to_string()))? 
as usize; - let content = read_blocks(&arr[4])?; + let content = read_blocks(&arr[4], deserializer)?; Ok(Cell { attr, @@ -880,7 +1198,7 @@ fn read_cell(value: &Value) -> Result { }) } -fn read_row(value: &Value) -> Result { +fn read_row(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for Row".to_string()))?; @@ -897,13 +1215,13 @@ fn read_row(value: &Value) -> Result { .ok_or_else(|| JsonReadError::InvalidType("Row cells must be array".to_string()))?; let cells = cells_arr .iter() - .map(read_cell) + .map(|v| read_cell(v, deserializer)) .collect::>>()?; Ok(Row { attr, cells }) } -fn read_table_head(value: &Value) -> Result { +fn read_table_head(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for TableHead".to_string()))?; @@ -918,12 +1236,15 @@ fn read_table_head(value: &Value) -> Result { let rows_arr = arr[1] .as_array() .ok_or_else(|| JsonReadError::InvalidType("TableHead rows must be array".to_string()))?; - let rows = rows_arr.iter().map(read_row).collect::>>()?; + let rows = rows_arr + .iter() + .map(|v| read_row(v, deserializer)) + .collect::>>()?; Ok(TableHead { attr, rows }) } -fn read_table_body(value: &Value) -> Result { +fn read_table_body(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for TableBody".to_string()))?; @@ -941,11 +1262,17 @@ fn read_table_body(value: &Value) -> Result { let head_arr = arr[2] .as_array() .ok_or_else(|| JsonReadError::InvalidType("TableBody head must be array".to_string()))?; - let head = head_arr.iter().map(read_row).collect::>>()?; + let head = head_arr + .iter() + .map(|v| read_row(v, deserializer)) + .collect::>>()?; let body_arr = arr[3] .as_array() .ok_or_else(|| JsonReadError::InvalidType("TableBody body must be array".to_string()))?; - let body = body_arr.iter().map(read_row).collect::>>()?; + let body = body_arr + .iter() + .map(|v| read_row(v, deserializer)) + .collect::>>()?; Ok(TableBody { attr, @@ -955,7 +1282,7 @@ fn read_table_body(value: &Value) -> Result { }) } -fn read_table_foot(value: &Value) -> Result { +fn read_table_foot(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let arr = value .as_array() .ok_or_else(|| JsonReadError::InvalidType("Expected array for TableFoot".to_string()))?; @@ -970,12 +1297,15 @@ fn read_table_foot(value: &Value) -> Result { let rows_arr = arr[1] .as_array() .ok_or_else(|| JsonReadError::InvalidType("TableFoot rows must be array".to_string()))?; - let rows = rows_arr.iter().map(read_row).collect::>>()?; + let rows = rows_arr + .iter() + .map(|v| read_row(v, deserializer)) + .collect::>>()?; Ok(TableFoot { attr, rows }) } -fn read_block(value: &Value) -> Result { +fn read_block(value: &Value, deserializer: &SourceInfoDeserializer) -> Result { let obj = value .as_object() .ok_or_else(|| JsonReadError::InvalidType("Expected object for Block".to_string()))?; @@ -995,20 +1325,20 @@ fn read_block(value: &Value) -> Result { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Block::Paragraph(Paragraph { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "Plain" => { let c = obj .get("c") 
.ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_inlines(c)?; + let content = read_inlines(c, deserializer)?; Ok(Block::Plain(Plain { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "LineBlock" => { @@ -1018,10 +1348,13 @@ fn read_block(value: &Value) -> Result { let arr = c.as_array().ok_or_else(|| { JsonReadError::InvalidType("LineBlock content must be array".to_string()) })?; - let content = arr.iter().map(read_inlines).collect::>>()?; + let content = arr + .iter() + .map(|v| read_inlines(v, deserializer)) + .collect::>>()?; Ok(Block::LineBlock(LineBlock { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "CodeBlock" => { @@ -1046,7 +1379,7 @@ fn read_block(value: &Value) -> Result { Ok(Block::CodeBlock(CodeBlock { attr, text, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "RawBlock" => { @@ -1076,17 +1409,17 @@ fn read_block(value: &Value) -> Result { Ok(Block::RawBlock(RawBlock { format, text, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "BlockQuote" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_blocks(c)?; + let content = read_blocks(c, deserializer)?; Ok(Block::BlockQuote(BlockQuote { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "OrderedList" => { @@ -1102,21 +1435,21 @@ fn read_block(value: &Value) -> Result { )); } let attr = read_list_attributes(&arr[0])?; - let content = read_blockss(&arr[1])?; + let content = read_blockss(&arr[1], deserializer)?; Ok(Block::OrderedList(OrderedList { attr, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "BulletList" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let content = read_blockss(c)?; + let content = read_blockss(c, deserializer)?; Ok(Block::BulletList(BulletList { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "DefinitionList" => { @@ -1137,14 +1470,14 @@ fn read_block(value: &Value) -> Result { "DefinitionList item must have 2 elements".to_string(), )); } - let term = read_inlines(&item_arr[0])?; - let definition = read_blockss(&item_arr[1])?; + let term = read_inlines(&item_arr[0], deserializer)?; + let definition = read_blockss(&item_arr[1], deserializer)?; Ok((term, definition)) }) .collect::>>()?; Ok(Block::DefinitionList(DefinitionList { content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "Header" => { @@ -1163,16 +1496,16 @@ fn read_block(value: &Value) -> Result { JsonReadError::InvalidType("Header level must be number".to_string()) })? 
as usize; let attr = read_attr(&arr[1])?; - let content = read_inlines(&arr[2])?; + let content = read_inlines(&arr[2], deserializer)?; Ok(Block::Header(Header { level, attr, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "HorizontalRule" => Ok(Block::HorizontalRule(HorizontalRule { - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })), "Figure" => { let c = obj @@ -1187,13 +1520,13 @@ fn read_block(value: &Value) -> Result { )); } let attr = read_attr(&arr[0])?; - let caption = read_caption(&arr[1])?; - let content = read_blocks(&arr[2])?; + let caption = read_caption(&arr[1], deserializer)?; + let content = read_blocks(&arr[2], deserializer)?; Ok(Block::Figure(Figure { attr, caption, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "Table" => { @@ -1209,7 +1542,7 @@ fn read_block(value: &Value) -> Result { )); } let attr = read_attr(&arr[0])?; - let caption = read_caption(&arr[1])?; + let caption = read_caption(&arr[1], deserializer)?; let colspec_arr = arr[2].as_array().ok_or_else(|| { JsonReadError::InvalidType("Table colspec must be array".to_string()) })?; @@ -1217,15 +1550,15 @@ fn read_block(value: &Value) -> Result { .iter() .map(read_colspec) .collect::>>()?; - let head = read_table_head(&arr[3])?; + let head = read_table_head(&arr[3], deserializer)?; let bodies_arr = arr[4].as_array().ok_or_else(|| { JsonReadError::InvalidType("Table bodies must be array".to_string()) })?; let bodies = bodies_arr .iter() - .map(read_table_body) + .map(|v| read_table_body(v, deserializer)) .collect::>>()?; - let foot = read_table_foot(&arr[5])?; + let foot = read_table_foot(&arr[5], deserializer)?; Ok(Block::Table(Table { attr, caption, @@ -1233,7 +1566,7 @@ fn read_block(value: &Value) -> Result { head, bodies, foot, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "Div" => { @@ -1249,21 +1582,22 @@ fn read_block(value: &Value) -> Result { )); } let attr = read_attr(&arr[0])?; - let content = read_blocks(&arr[1])?; + let content = read_blocks(&arr[1], deserializer)?; Ok(Block::Div(Div { attr, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "BlockMetadata" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let meta = read_meta(c)?; + // BlockMetadata uses MetaValueWithSourceInfo format (not top-level meta) + let meta = read_meta_value_with_source_info(c, deserializer)?; Ok(Block::BlockMetadata(MetaBlock { meta, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), })) } "NoteDefinitionPara" => { @@ -1284,12 +1618,12 @@ fn read_block(value: &Value) -> Result { JsonReadError::InvalidType("NoteDefinitionPara id must be string".to_string()) })? .to_string(); - let content = read_inlines(&arr[1])?; + let content = read_inlines(&arr[1], deserializer)?; Ok(Block::NoteDefinitionPara( crate::pandoc::block::NoteDefinitionPara { id, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), }, )) } @@ -1315,12 +1649,12 @@ fn read_block(value: &Value) -> Result { ) })? 
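`make_source_info` here replaces the old `SourceInfo::new(filename_index, range)` calls; it is defined in the new `source_map_compat.rs`, outside this hunk. A hedged reconstruction of what such a shim plausibly does, using only `quarto-source-map` items visible elsewhere in this patch; the `Option<usize>` parameter and the struct-literal construction are assumptions:

```rust
use quarto_source_map::{FileId, Range, SourceInfo, SourceMapping};

// Hypothetical sketch, not the shim's actual definition: wrap a legacy
// (filename_index, range) pair in the new SourceInfo type, mapping it back
// to the original file when one is known.
fn make_source_info(filename_index: Option<usize>, range: Range) -> SourceInfo {
    match filename_index {
        Some(idx) => SourceInfo {
            range,
            mapping: SourceMapping::Original { file_id: FileId(idx) },
        },
        None => SourceInfo::default(), // e.g. synthesized nodes with no file
    }
}
```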
.to_string(); - let content = read_blocks(&arr[1])?; + let content = read_blocks(&arr[1], deserializer)?; Ok(Block::NoteDefinitionFencedBlock( crate::pandoc::block::NoteDefinitionFencedBlock { id, content, - source_info: SourceInfo::new(filename_index, range), + source_info: make_source_info(filename_index, range), }, )) } @@ -1328,20 +1662,54 @@ fn read_block(value: &Value) -> Result { } } -fn read_meta(value: &Value) -> Result { +fn read_meta_with_key_sources( + value: &Value, + key_sources: Option<&Value>, + deserializer: &SourceInfoDeserializer, +) -> Result { + // meta is an object with key-value pairs (Pandoc-compatible format) let obj = value .as_object() .ok_or_else(|| JsonReadError::InvalidType("Expected object for Meta".to_string()))?; - let mut meta = LinkedHashMap::new(); + let mut entries = Vec::new(); for (key, val) in obj { - meta.insert(key.clone(), read_meta_value(val)?); + // Look up key_source from the provided map using deserializer + let key_source = if let Some(sources) = key_sources { + if let Some(sources_obj) = sources.as_object() { + if let Some(source_val) = sources_obj.get(key) { + deserializer.from_json_ref(source_val)? + } else { + // Legitimate default: JSON doesn't have source info for this key (backward compat) + quarto_source_map::SourceInfo::default() + } + } else { + // Legitimate default: JSON key_sources is not an object + quarto_source_map::SourceInfo::default() + } + } else { + // Legitimate default: No key_sources in JSON (backward compatibility) + quarto_source_map::SourceInfo::default() + }; + + entries.push(MetaMapEntry { + key: key.clone(), + key_source, + value: read_meta_value_with_source_info(val, deserializer)?, + }); } - Ok(meta) + Ok(MetaValueWithSourceInfo::MetaMap { + entries, + // Legitimate default: MetaMap itself doesn't have source tracking in JSON (only entries do) + source_info: quarto_source_map::SourceInfo::default(), + }) } -fn read_meta_value(value: &Value) -> Result { +fn read_meta_value_with_source_info( + value: &Value, + deserializer: &SourceInfoDeserializer, +) -> Result { let obj = value .as_object() .ok_or_else(|| JsonReadError::InvalidType("Expected object for MetaValue".to_string()))?; @@ -1350,32 +1718,52 @@ fn read_meta_value(value: &Value) -> Result { .and_then(|v| v.as_str()) .ok_or_else(|| JsonReadError::MissingField("t".to_string()))?; + // Read source_info using deserializer (new format), or use default (old format for backwards compatibility) + let source_info = if let Some(s) = obj.get("s") { + deserializer.from_json_ref(s)? 
+ } else { + // Legitimate default: Old JSON format doesn't have "s" field (backward compatibility) + quarto_source_map::SourceInfo::default() + }; + match t { "MetaString" => { let c = obj.get("c").and_then(|v| v.as_str()).ok_or_else(|| { JsonReadError::InvalidType("MetaString content must be string".to_string()) })?; - Ok(MetaValue::MetaString(c.to_string())) + Ok(MetaValueWithSourceInfo::MetaString { + value: c.to_string(), + source_info, + }) } "MetaInlines" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let inlines = read_inlines(c)?; - Ok(MetaValue::MetaInlines(inlines)) + let inlines = read_inlines(c, deserializer)?; + Ok(MetaValueWithSourceInfo::MetaInlines { + content: inlines, + source_info, + }) } "MetaBlocks" => { let c = obj .get("c") .ok_or_else(|| JsonReadError::MissingField("c".to_string()))?; - let blocks = read_blocks(c)?; - Ok(MetaValue::MetaBlocks(blocks)) + let blocks = read_blocks(c, deserializer)?; + Ok(MetaValueWithSourceInfo::MetaBlocks { + content: blocks, + source_info, + }) } "MetaBool" => { let c = obj.get("c").and_then(|v| v.as_bool()).ok_or_else(|| { JsonReadError::InvalidType("MetaBool content must be boolean".to_string()) })?; - Ok(MetaValue::MetaBool(c)) + Ok(MetaValueWithSourceInfo::MetaBool { + value: c, + source_info, + }) } "MetaList" => { let c = obj @@ -1386,9 +1774,12 @@ fn read_meta_value(value: &Value) -> Result { })?; let list = arr .iter() - .map(read_meta_value) + .map(|v| read_meta_value_with_source_info(v, deserializer)) .collect::>>()?; - Ok(MetaValue::MetaList(list)) + Ok(MetaValueWithSourceInfo::MetaList { + items: list, + source_info, + }) } "MetaMap" => { let c = obj @@ -1397,26 +1788,63 @@ fn read_meta_value(value: &Value) -> Result { let arr = c.as_array().ok_or_else(|| { JsonReadError::InvalidType("MetaMap content must be array".to_string()) })?; - let mut map = LinkedHashMap::new(); + let mut entries = Vec::new(); for item in arr { - let kv_arr = item.as_array().ok_or_else(|| { - JsonReadError::InvalidType("MetaMap item must be array".to_string()) - })?; - if kv_arr.len() != 2 { + // Handle both old format (array) and new format (object) + let (key, key_source, value) = if let Some(obj) = item.as_object() { + // New format: {"key": "...", "key_source": {...}, "value": {...}} + let key = obj + .get("key") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + JsonReadError::MissingField("MetaMap entry missing 'key'".to_string()) + })? + .to_string(); + let key_source = if let Some(ks) = obj.get("key_source") { + deserializer.from_json_ref(ks)? + } else { + // Legitimate default: JSON entry doesn't have key_source (backward compat) + quarto_source_map::SourceInfo::default() + }; + let value = read_meta_value_with_source_info( + obj.get("value").ok_or_else(|| { + JsonReadError::MissingField("MetaMap entry missing 'value'".to_string()) + })?, + deserializer, + )?; + (key, key_source, value) + } else if let Some(kv_arr) = item.as_array() { + // Old format: ["key", {...}] + if kv_arr.len() != 2 { + return Err(JsonReadError::InvalidType( + "MetaMap item must have 2 elements".to_string(), + )); + } + let key = kv_arr[0] + .as_str() + .ok_or_else(|| { + JsonReadError::InvalidType("MetaMap key must be string".to_string()) + })? 
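The `"s"` lookup above is the reader-side half of the pool built in `writers/json.rs` further down: new-format JSON stores an integer index into `astContext.sourceInfoPool`, and old-format JSON omits the field and falls back to `SourceInfo::default()`. A minimal sketch of the resolution step, assuming the real `SourceInfoDeserializer` (defined earlier in this file, outside the hunk) materializes the pool up front:

```rust
use serde_json::Value;
use quarto_source_map::SourceInfo;

// Hypothetical sketch; the actual SourceInfoDeserializer may validate or
// cache differently.
struct PoolDeserializer {
    pool: Vec<SourceInfo>, // rebuilt from astContext.sourceInfoPool
}

impl PoolDeserializer {
    fn from_json_ref(&self, v: &Value) -> Result<SourceInfo, String> {
        let id = v.as_u64().ok_or("source ref must be a pool index")? as usize;
        self.pool
            .get(id)
            .cloned()
            .ok_or_else(|| format!("source ref {id} out of bounds"))
    }
}
```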
+ .to_string(); + let value = read_meta_value_with_source_info(&kv_arr[1], deserializer)?; + // Legitimate default: Old JSON format [key, value] doesn't have key_source + (key, quarto_source_map::SourceInfo::default(), value) + } else { return Err(JsonReadError::InvalidType( - "MetaMap item must have 2 elements".to_string(), + "MetaMap item must be array or object".to_string(), )); - } - let key = kv_arr[0] - .as_str() - .ok_or_else(|| { - JsonReadError::InvalidType("MetaMap key must be string".to_string()) - })? - .to_string(); - let value = read_meta_value(&kv_arr[1])?; - map.insert(key, value); + }; + + entries.push(MetaMapEntry { + key, + key_source, + value, + }); } - Ok(MetaValue::MetaMap(map)) + Ok(MetaValueWithSourceInfo::MetaMap { + entries, + source_info, + }) } _ => Err(JsonReadError::UnsupportedVariant(format!( "MetaValue: {}", diff --git a/crates/quarto-markdown-pandoc/src/readers/qmd.rs b/crates/quarto-markdown-pandoc/src/readers/qmd.rs index 73e7950..42ab1fb 100644 --- a/crates/quarto-markdown-pandoc/src/readers/qmd.rs +++ b/crates/quarto-markdown-pandoc/src/readers/qmd.rs @@ -10,13 +10,12 @@ use crate::filters::topdown_traverse; use crate::filters::{Filter, FilterReturn}; use crate::pandoc::ast_context::ASTContext; use crate::pandoc::block::MetaBlock; -use crate::pandoc::location::SourceInfo; -use crate::pandoc::meta::parse_metadata_strings; -use crate::pandoc::{self, Block, Meta}; -use crate::pandoc::{MetaValue, rawblock_to_meta}; +use crate::pandoc::meta::parse_metadata_strings_with_source_info; +use crate::pandoc::rawblock_to_meta_with_source_info; +use crate::pandoc::{self, Block, MetaValueWithSourceInfo}; use crate::readers::qmd_error_messages::{produce_error_message, produce_error_message_json}; use crate::traversals; -use crate::utils::error_collector::{ErrorCollector, JsonErrorCollector, TextErrorCollector}; +use crate::utils::diagnostic_collector::DiagnosticCollector; use std::io::Write; use tree_sitter::LogType; use tree_sitter_qmd::MarkdownParser; @@ -139,96 +138,130 @@ where let context = ASTContext::with_filename(filename.to_string()); - // Create appropriate error collector based on whether JSON errors are requested - // and collect warnings after conversion - let mut result = if error_formatter.is_some() { - // JSON error format requested - let mut error_collector = JsonErrorCollector::new(); - let pandoc_result = pandoc::treesitter_to_pandoc( - &mut output_stream, - &tree, - &input_bytes, - &context, - &mut error_collector, - )?; - - // Output warnings to stderr as JSON - let warnings = error_collector.messages(); + // Create diagnostic collector and convert to Pandoc AST + let mut error_collector = DiagnosticCollector::new(); + let mut result = match pandoc::treesitter_to_pandoc( + &mut output_stream, + &tree, + &input_bytes, + &context, + &mut error_collector, + ) { + Ok(pandoc) => pandoc, + Err(diagnostics) => { + // Convert diagnostics to strings based on format + if error_formatter.is_some() { + return Err(diagnostics + .iter() + .map(|d| d.to_json().to_string()) + .collect()); + } else { + return Err(diagnostics.iter().map(|d| d.to_text(None)).collect()); + } + } + }; + + // Output warnings to stderr in appropriate format + if error_formatter.is_some() { + // JSON format + let warnings = error_collector.to_json(); for warning in warnings { eprintln!("{}", warning); } - - pandoc_result } else { - // Text error format (default) - let mut error_collector = TextErrorCollector::new(); - let pandoc_result = pandoc::treesitter_to_pandoc( - &mut output_stream, 
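To make the MetaMap compatibility branch above concrete, these are the two entry shapes the reader now accepts (field values illustrative):

```rust
use serde_json::json;

fn main() {
    // New format: an object that also records where the *key* came from,
    // as a pool reference.
    let new_entry = json!({
        "key": "title",
        "key_source": 7,
        "value": {"t": "MetaString", "c": "Hi", "s": 8}
    });
    // Old format: a bare [key, value] pair; key_source falls back to
    // SourceInfo::default() on read.
    let old_entry = json!(["title", {"t": "MetaString", "c": "Hi"}]);
    assert!(new_entry.is_object());
    assert!(old_entry.is_array());
}
```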
- &tree, - &input_bytes, - &context, - &mut error_collector, - )?; - - // Output warnings to stderr as formatted text - let warnings = error_collector.messages(); + // Text format (default) + let warnings = error_collector.to_text(); for warning in warnings { eprintln!("{}", warning); } - - pandoc_result - }; - let mut meta_from_parses = Meta::default(); + } + // Store complete MetaMapEntry objects to preserve key_source information + let mut meta_from_parses: Vec = Vec::new(); result = { let mut filter = Filter::new().with_raw_block(|rb| { if rb.format != "quarto_minus_metadata" { return Unchanged(rb); } - let filename_index = rb.source_info.filename_index; - let range = rb.source_info.range.clone(); - let result = rawblock_to_meta(rb); - let is_lexical = { - let val = result.get("_scope"); - matches!(val, Some(MetaValue::MetaString(s)) if s == "lexical") - }; + // Use new rawblock_to_meta_with_source_info - preserves source info! + let meta_with_source = rawblock_to_meta_with_source_info(&rb, &context); + + // Check if this is lexical metadata + let is_lexical = + if let MetaValueWithSourceInfo::MetaMap { ref entries, .. } = meta_with_source { + entries.iter().any(|e| { + e.key == "_scope" + && matches!( + &e.value, + MetaValueWithSourceInfo::MetaString { value, .. } if value == "lexical" + ) + }) + } else { + false + }; if is_lexical { - let mut inner_meta_from_parses = Meta::default(); - let mut meta_map = match parse_metadata_strings( - MetaValue::MetaMap(result), + // Lexical metadata - parse strings and return as BlockMetadata + let mut inner_meta_from_parses = Vec::new(); + let parsed_meta = parse_metadata_strings_with_source_info( + meta_with_source, &mut inner_meta_from_parses, - ) { - MetaValue::MetaMap(m) => m, - _ => panic!("Expected MetaMap from parse_metadata_strings"), + ); + + // Merge inner metadata if needed + let final_meta = if let MetaValueWithSourceInfo::MetaMap { + mut entries, + source_info, + } = parsed_meta + { + // Now inner_meta_from_parses preserves full MetaMapEntry with key_source + for entry in inner_meta_from_parses { + entries.push(entry); + } + MetaValueWithSourceInfo::MetaMap { + entries, + source_info, + } + } else { + parsed_meta }; - for (k, v) in inner_meta_from_parses { - meta_map.insert(k, v); - } + return FilterReturn::FilterResult( vec![Block::BlockMetadata(MetaBlock { - meta: meta_map, - source_info: SourceInfo::new(filename_index, range), + meta: final_meta, + source_info: rb.source_info.clone(), })], false, ); } else { - let meta_map = - match parse_metadata_strings(MetaValue::MetaMap(result), &mut meta_from_parses) - { - MetaValue::MetaMap(m) => m, - _ => panic!("Expected MetaMap from parse_metadata_strings"), - }; - for (k, v) in meta_map { - meta_from_parses.insert(k, v); + // Document-level metadata - parse strings and merge into meta_from_parses + let mut inner_meta = Vec::new(); + let parsed_meta = + parse_metadata_strings_with_source_info(meta_with_source, &mut inner_meta); + + // Extract MetaMapEntry objects (preserving key_source) and store them + if let MetaValueWithSourceInfo::MetaMap { entries, .. 
} = parsed_meta { + for entry in entries { + meta_from_parses.push(entry); + } + } + // Also add any inner metadata entries (now preserves key_source) + for entry in inner_meta { + meta_from_parses.push(entry); + } return FilterReturn::FilterResult(vec![], false); } }); topdown_traverse(result, &mut filter) }; - for (k, v) in meta_from_parses.into_iter() { - result.meta.insert(k, v); + + // Merge meta_from_parses into result.meta + // result.meta is MetaValueWithSourceInfo::MetaMap, so we need to append entries + // Now meta_from_parses contains complete MetaMapEntry objects with key_source preserved + if let MetaValueWithSourceInfo::MetaMap { entries, .. } = &mut result.meta { + for entry in meta_from_parses.into_iter() { + entries.push(entry); + } } Ok((result, context)) } diff --git a/crates/quarto-markdown-pandoc/src/utils/diagnostic_collector.rs b/crates/quarto-markdown-pandoc/src/utils/diagnostic_collector.rs new file mode 100644 index 0000000..bdcda01 --- /dev/null +++ b/crates/quarto-markdown-pandoc/src/utils/diagnostic_collector.rs @@ -0,0 +1,210 @@ +//! DiagnosticCollector - collects DiagnosticMessage objects and renders them to text or JSON +use quarto_error_reporting::{DiagnosticKind, DiagnosticMessage}; + +/// Collector for diagnostic messages +#[derive(Debug)] +pub struct DiagnosticCollector { + diagnostics: Vec<DiagnosticMessage>, +} + +impl DiagnosticCollector { + /// Create a new diagnostic collector + pub fn new() -> Self { + Self { + diagnostics: Vec::new(), + } + } + + /// Add a diagnostic message + pub fn add(&mut self, diagnostic: DiagnosticMessage) { + self.diagnostics.push(diagnostic); + } + + /// Helper: Add an error message (uses generic_error! macro for file/line tracking) + /// + /// For migration from ErrorCollector. Creates a DiagnosticMessage with code Q-0-99. + pub fn error(&mut self, message: impl Into<String>) { + self.add(quarto_error_reporting::generic_error!(message.into())); + } + + /// Helper: Add a warning message (uses generic_warning! macro for file/line tracking) + /// + /// For migration from ErrorCollector. Creates a DiagnosticMessage with code Q-0-99. + pub fn warn(&mut self, message: impl Into<String>) { + self.add(quarto_error_reporting::generic_warning!(message.into())); + } + + /// Add an error message with source location + /// + /// Use this when you have source location information available. + pub fn error_at( + &mut self, + message: impl Into<String>, + location: quarto_source_map::SourceInfo, + ) { + let mut diagnostic = quarto_error_reporting::generic_error!(message.into()); + diagnostic.location = Some(location); + self.add(diagnostic); + } + + /// Add a warning message with source location + /// + /// Use this when you have source location information available.
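A typical call site for the collector, sketched with illustrative plumbing: accumulate as you go, then choose the rendering at the end. Deferring that choice is the point of replacing the two format-specific `ErrorCollector`s:

```rust
// Sketch: one collector, rendering deferred to the end.
fn convert(collector: &mut DiagnosticCollector, want_json: bool) -> Result<(), Vec<String>> {
    collector.warn("soft break handling is approximate");
    if collector.has_errors() {
        return Err(if want_json {
            collector.to_json()
        } else {
            collector.to_text()
        });
    }
    Ok(())
}
```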
+ pub fn warn_at(&mut self, message: impl Into<String>, location: quarto_source_map::SourceInfo) { + let mut diagnostic = quarto_error_reporting::generic_warning!(message.into()); + diagnostic.location = Some(location); + self.add(diagnostic); + } + + /// Check if any errors were collected (warnings don't count) + pub fn has_errors(&self) -> bool { + self.diagnostics + .iter() + .any(|d| d.kind == DiagnosticKind::Error) + } + + /// Get a reference to the collected diagnostics + pub fn diagnostics(&self) -> &[DiagnosticMessage] { + &self.diagnostics + } + + /// Render all diagnostics to text strings + pub fn to_text(&self) -> Vec<String> { + self.diagnostics.iter().map(|d| d.to_text(None)).collect() + } + + /// Render all diagnostics to JSON strings + pub fn to_json(&self) -> Vec<String> { + self.diagnostics + .iter() + .map(|d| d.to_json().to_string()) + .collect() + } + + /// Consume the collector and return the diagnostics + pub fn into_diagnostics(self) -> Vec<DiagnosticMessage> { + self.diagnostics + } +} + +impl Default for DiagnosticCollector { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use quarto_error_reporting::DiagnosticMessageBuilder; + + #[test] + fn test_new_collector() { + let collector = DiagnosticCollector::new(); + assert!(collector.diagnostics.is_empty()); + assert!(!collector.has_errors()); + } + + #[test] + fn test_add_diagnostic() { + let mut collector = DiagnosticCollector::new(); + let diag = DiagnosticMessageBuilder::error("Test error").build(); + collector.add(diag); + + assert_eq!(collector.diagnostics.len(), 1); + assert!(collector.has_errors()); + } + + #[test] + fn test_error_helper() { + let mut collector = DiagnosticCollector::new(); + collector.error("Something went wrong"); + + assert_eq!(collector.diagnostics.len(), 1); + assert!(collector.has_errors()); + assert_eq!(collector.diagnostics[0].code, Some("Q-0-99".to_string())); + } + + #[test] + fn test_warn_helper() { + let mut collector = DiagnosticCollector::new(); + collector.warn("Be careful"); + + assert_eq!(collector.diagnostics.len(), 1); + assert!(!collector.has_errors()); // Warnings don't count as errors + assert_eq!(collector.diagnostics[0].code, Some("Q-0-99".to_string())); + } + + #[test] + fn test_to_text() { + let mut collector = DiagnosticCollector::new(); + collector.error("Test error"); + collector.warn("Test warning"); + + let messages = collector.to_text(); + assert_eq!(messages.len(), 2); + assert!(messages[0].contains("Error")); + assert!(messages[0].contains("Test error")); + assert!(messages[1].contains("Warning")); + assert!(messages[1].contains("Test warning")); + } + + #[test] + fn test_to_json() { + let mut collector = DiagnosticCollector::new(); + collector.error("Test error"); + + let messages = collector.to_json(); + assert_eq!(messages.len(), 1); + + // Verify it's valid JSON + let parsed: serde_json::Value = serde_json::from_str(&messages[0]).unwrap(); + assert_eq!(parsed["kind"], "error"); + assert!(parsed["title"].as_str().unwrap().contains("Test error")); + } + + #[test] + fn test_can_render_both_formats() { + let mut collector = DiagnosticCollector::new(); + collector.error("Test error"); + + // Can render as both text and JSON without needing to decide at construction + let text = collector.to_text(); + let json = collector.to_json(); + + assert_eq!(text.len(), 1); + assert_eq!(json.len(), 1); + assert!(text[0].contains("Error")); + assert!(json[0].contains("\"kind\"")); + } + + #[test] + fn test_into_diagnostics() { + let mut collector =
DiagnosticCollector::new(); + collector.error("Test error"); + collector.warn("Test warning"); + + let diagnostics = collector.into_diagnostics(); + assert_eq!(diagnostics.len(), 2); + assert_eq!(diagnostics[0].kind, DiagnosticKind::Error); + assert_eq!(diagnostics[1].kind, DiagnosticKind::Warning); + } + + #[test] + fn test_has_errors_with_only_warnings() { + let mut collector = DiagnosticCollector::new(); + collector.warn("Warning 1"); + collector.warn("Warning 2"); + + assert!(!collector.has_errors()); + } + + #[test] + fn test_has_errors_with_errors() { + let mut collector = DiagnosticCollector::new(); + collector.warn("Warning"); + collector.error("Error"); + + assert!(collector.has_errors()); + } +} diff --git a/crates/quarto-markdown-pandoc/src/utils/mod.rs b/crates/quarto-markdown-pandoc/src/utils/mod.rs index 9014e30..5f4e674 100644 --- a/crates/quarto-markdown-pandoc/src/utils/mod.rs +++ b/crates/quarto-markdown-pandoc/src/utils/mod.rs @@ -5,8 +5,7 @@ pub mod autoid; pub mod concrete_tree_depth; -pub mod error_collector; +pub mod diagnostic_collector; pub mod output; -pub mod string_write_adapter; pub mod text; pub mod tree_sitter_log_observer; diff --git a/crates/quarto-markdown-pandoc/src/writers/json.rs b/crates/quarto-markdown-pandoc/src/writers/json.rs index 58eb5ea..429d57f 100644 --- a/crates/quarto-markdown-pandoc/src/writers/json.rs +++ b/crates/quarto-markdown-pandoc/src/writers/json.rs @@ -6,22 +6,210 @@ use crate::pandoc::{ ASTContext, Attr, Block, Caption, CitationMode, Inline, Inlines, ListAttributes, Pandoc, }; +use quarto_source_map::{FileId, Range, RangeMapping, SourceInfo, SourceMapping}; +use serde::Serialize; use serde_json::{Value, json}; +use std::collections::HashMap; + +/// Serializable version of SourceInfo that uses ID references instead of Rc pointers. +/// +/// This structure is used during JSON serialization to avoid duplicating parent chains. +/// Each unique SourceInfo is assigned an ID and stored in a pool. References to parent +/// SourceInfo objects are replaced with parent_id integers. +/// +/// Serializes in compact format: {"r": [6 range values], "t": type_code, "d": type_data} +/// The ID is implicit from the array index in the pool. 
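Concretely, pool entries under this encoding look like the following; the values are copied from the regenerated snapshots at the end of this patch:

```rust
use serde_json::json;

fn main() {
    // t=0 Original: "d" is the file id; this one covers bytes 0..4 of file 0.
    let original = json!({"r": [0, 0, 0, 4, 0, 4], "t": 0, "d": 0});
    // t=1 Substring: "d" is [parent_id, offset].
    let substring = json!({"r": [0, 0, 0, 16, 0, 0], "t": 1, "d": [3, 4]});
    // t=2 Concat: "d" is a list of [source_info_id, offset_in_concat, length].
    let concat = json!({"r": [0, 0, 0, 5, 0, 0], "t": 2, "d": [[9, 0, 4], [10, 4, 1]]});
    for entry in [original, substring, concat] {
        assert_eq!(entry["r"].as_array().unwrap().len(), 6);
    }
}
```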
+struct SerializableSourceInfo { + id: usize, + range: Range, + mapping: SerializableSourceMapping, +} + +impl Serialize for SerializableSourceInfo { + fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error> + where + S: serde::Serializer, + { + use serde::ser::SerializeMap; + let mut map = serializer.serialize_map(Some(3))?; + + // Serialize range as array [start_offset, start_row, start_col, end_offset, end_row, end_col] + let range_array = [ + self.range.start.offset, + self.range.start.row, + self.range.start.column, + self.range.end.offset, + self.range.end.row, + self.range.end.column, + ]; + map.serialize_entry("r", &range_array)?; + + // Serialize type code and data based on mapping variant + match &self.mapping { + SerializableSourceMapping::Original { file_id } => { + map.serialize_entry("t", &0)?; + map.serialize_entry("d", &file_id.0)?; + } + SerializableSourceMapping::Substring { parent_id, offset } => { + map.serialize_entry("t", &1)?; + map.serialize_entry("d", &[parent_id, offset])?; + } + SerializableSourceMapping::Concat { pieces } => { + map.serialize_entry("t", &2)?; + let piece_arrays: Vec<[usize; 3]> = pieces + .iter() + .map(|p| [p.source_info_id, p.offset_in_concat, p.length]) + .collect(); + map.serialize_entry("d", &piece_arrays)?; + } + SerializableSourceMapping::Transformed { parent_id, mapping } => { + map.serialize_entry("t", &3)?; + let mapping_arrays: Vec<[usize; 4]> = mapping + .iter() + .map(|m| [m.from_start, m.from_end, m.to_start, m.to_end]) + .collect(); + map.serialize_entry("d", &[json!(*parent_id), json!(mapping_arrays)])?; + } + } + + map.end() + } +} + +/// Serializable version of SourceMapping that uses parent_id instead of Rc. +enum SerializableSourceMapping { + Original { + file_id: FileId, + }, + Substring { + parent_id: usize, + offset: usize, + }, + Concat { + pieces: Vec<SerializableSourcePiece>, + }, + Transformed { + parent_id: usize, + mapping: Vec<RangeMapping>, + }, +} + +/// Serializable version of SourcePiece that uses source_info_id instead of SourceInfo. +struct SerializableSourcePiece { + source_info_id: usize, + offset_in_concat: usize, + length: usize, +} + +/// Serializer that builds a pool of unique SourceInfo objects and assigns IDs. +/// +/// During AST traversal, each SourceInfo is interned into the pool. Rc-shared +/// SourceInfo objects get the same ID (using pointer equality). Parent references +/// are serialized as parent_id integers instead of full nested objects. +/// +/// This approach reduces JSON size by ~93% for documents with many nodes sharing +/// the same parent chains (e.g., YAML metadata with siblings). +struct SourceInfoSerializer { + pool: Vec<SerializableSourceInfo>, + id_map: HashMap<*const SourceInfo, usize>, +} + +impl SourceInfoSerializer { + fn new() -> Self { + SourceInfoSerializer { + pool: Vec::new(), + id_map: HashMap::new(), + } + } + + /// Intern a SourceInfo into the pool, returning its ID. + /// + /// If this SourceInfo (or an Rc-equivalent) has already been interned, + /// returns the existing ID. Otherwise, recursively interns parents and + /// adds this SourceInfo to the pool with a new ID. + fn intern(&mut self, source_info: &SourceInfo) -> usize { + // For Rc-shared SourceInfo objects, we need to detect if they point to the same + // underlying data. We use the data pointer address for this.
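Note the caveat baked into this choice: the address only deduplicates when two `&SourceInfo` references point into the same `Rc` allocation; references to distinct copies intern twice, which wastes pool entries but stays correct. A self-contained illustration of the mechanism:

```rust
use std::collections::HashMap;
use std::rc::Rc;

fn main() {
    // Two Rc clones share one allocation, so the address of the pointed-to
    // value is a stable interning key.
    let a: Rc<String> = Rc::new("shared".to_string());
    let b = Rc::clone(&a);
    let mut ids: HashMap<*const String, usize> = HashMap::new();
    let next = ids.len();
    ids.entry(&*a as *const String).or_insert(next);
    let next = ids.len();
    ids.entry(&*b as *const String).or_insert(next);
    assert_eq!(ids.len(), 1); // same allocation, same id
}
```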
+ let ptr = source_info as *const SourceInfo; + + // Check if already interned + if let Some(&id) = self.id_map.get(&ptr) { + return id; + } + + // Recursively intern parents and build the serializable mapping + let mapping = match &source_info.mapping { + SourceMapping::Original { file_id } => { + SerializableSourceMapping::Original { file_id: *file_id } + } + SourceMapping::Substring { parent, offset } => { + let parent_id = self.intern(parent); + SerializableSourceMapping::Substring { + parent_id, + offset: *offset, + } + } + SourceMapping::Transformed { parent, mapping } => { + let parent_id = self.intern(parent); + SerializableSourceMapping::Transformed { + parent_id, + mapping: mapping.clone(), + } + } + SourceMapping::Concat { pieces } => { + let serializable_pieces = pieces + .iter() + .map(|piece| SerializableSourcePiece { + source_info_id: self.intern(&piece.source_info), + offset_in_concat: piece.offset_in_concat, + length: piece.length, + }) + .collect(); + SerializableSourceMapping::Concat { + pieces: serializable_pieces, + } + } + }; + + // Calculate ID after recursion completes + let id = self.pool.len(); + + // Add to pool + self.pool.push(SerializableSourceInfo { + id, + range: source_info.range.clone(), + mapping, + }); + + // Record this pointer's ID for future lookups + self.id_map.insert(ptr, id); + + id + } + + /// Serialize a SourceInfo as a JSON reference: just the id number + fn to_json_ref(&mut self, source_info: &SourceInfo) -> Value { + let id = self.intern(source_info); + json!(id) + } +} + +fn write_location(source_info: &quarto_source_map::SourceInfo) -> Value { + // Extract filename index by walking to the Original mapping + let filename_index = crate::pandoc::location::extract_filename_index(source_info); -fn write_location(item: &T) -> Value { - let range = item.range(); json!({ "start": { - "offset": range.start.offset, - "row": range.start.row, - "column": range.start.column, + "offset": source_info.range.start.offset, + "row": source_info.range.start.row, + "column": source_info.range.start.column, }, "end": { - "offset": range.end.offset, - "row": range.end.row, - "column": range.end.column, + "offset": source_info.range.end.offset, + "row": source_info.range.end.row, + "column": source_info.range.end.column, }, - "filenameIndex": item.filename_index(), + "filenameIndex": filename_index, }) } @@ -44,39 +232,39 @@ fn write_citation_mode(mode: &CitationMode) -> Value { } } -fn write_inline(inline: &Inline) -> Value { +fn write_inline(inline: &Inline, serializer: &mut SourceInfoSerializer) -> Value { match inline { Inline::Str(s) => json!({ "t": "Str", "c": s.text, - "l": write_location(s) + "s": serializer.to_json_ref(&s.source_info) }), Inline::Space(space) => json!({ "t": "Space", - "l": write_location(space) + "s": serializer.to_json_ref(&space.source_info) }), Inline::LineBreak(lb) => json!({ "t": "LineBreak", - "l": write_location(lb) + "s": serializer.to_json_ref(&lb.source_info) }), Inline::SoftBreak(sb) => json!({ "t": "SoftBreak", - "l": write_location(sb) + "s": serializer.to_json_ref(&sb.source_info) }), Inline::Emph(e) => json!({ "t": "Emph", - "c": write_inlines(&e.content), - "l": write_location(e) + "c": write_inlines(&e.content, serializer), + "s": serializer.to_json_ref(&e.source_info) }), Inline::Strong(s) => json!({ "t": "Strong", - "c": write_inlines(&s.content), - "l": write_location(s) + "c": write_inlines(&s.content, serializer), + "s": serializer.to_json_ref(&s.source_info) }), Inline::Code(c) => json!({ "t": "Code", "c": 
[write_attr(&c.attr), c.text], - "l": write_location(c) + "s": serializer.to_json_ref(&c.source_info) }), Inline::Math(m) => { let math_type = match m.math_type { @@ -86,33 +274,33 @@ fn write_inline(inline: &Inline) -> Value { json!({ "t": "Math", "c": [math_type, m.text], - "l": write_location(m) + "s": serializer.to_json_ref(&m.source_info) }) } Inline::Underline(u) => json!({ "t": "Underline", - "c": write_inlines(&u.content), - "l": write_location(u) + "c": write_inlines(&u.content, serializer), + "s": serializer.to_json_ref(&u.source_info) }), Inline::Strikeout(s) => json!({ "t": "Strikeout", - "c": write_inlines(&s.content), - "l": write_location(s) + "c": write_inlines(&s.content, serializer), + "s": serializer.to_json_ref(&s.source_info) }), Inline::Superscript(s) => json!({ "t": "Superscript", - "c": write_inlines(&s.content), - "l": write_location(s) + "c": write_inlines(&s.content, serializer), + "s": serializer.to_json_ref(&s.source_info) }), Inline::Subscript(s) => json!({ "t": "Subscript", - "c": write_inlines(&s.content), - "l": write_location(s) + "c": write_inlines(&s.content, serializer), + "s": serializer.to_json_ref(&s.source_info) }), Inline::SmallCaps(s) => json!({ "t": "SmallCaps", - "c": write_inlines(&s.content), - "l": write_location(s) + "c": write_inlines(&s.content, serializer), + "s": serializer.to_json_ref(&s.source_info) }), Inline::Quoted(q) => { let quote_type = match q.quote_type { @@ -121,34 +309,34 @@ fn write_inline(inline: &Inline) -> Value { }; json!({ "t": "Quoted", - "c": [quote_type, write_inlines(&q.content)], - "l": write_location(q) + "c": [quote_type, write_inlines(&q.content, serializer)], + "s": serializer.to_json_ref(&q.source_info) }) } Inline::Link(link) => json!({ "t": "Link", - "c": [write_attr(&link.attr), write_inlines(&link.content), [link.target.0, link.target.1]], - "l": write_location(link) + "c": [write_attr(&link.attr), write_inlines(&link.content, serializer), [link.target.0, link.target.1]], + "s": serializer.to_json_ref(&link.source_info) }), Inline::RawInline(raw) => json!({ "t": "RawInline", "c": [raw.format.clone(), raw.text.clone()], - "l": write_location(raw) + "s": serializer.to_json_ref(&raw.source_info) }), Inline::Image(image) => json!({ "t": "Image", - "c": [write_attr(&image.attr), write_inlines(&image.content), [image.target.0, image.target.1]], - "l": write_location(image) + "c": [write_attr(&image.attr), write_inlines(&image.content, serializer), [image.target.0, image.target.1]], + "s": serializer.to_json_ref(&image.source_info) }), Inline::Span(span) => json!({ "t": "Span", - "c": [write_attr(&span.attr), write_inlines(&span.content)], - "l": write_location(span) + "c": [write_attr(&span.attr), write_inlines(&span.content, serializer)], + "s": serializer.to_json_ref(&span.source_info) }), Inline::Note(note) => json!({ "t": "Note", - "c": write_blocks(¬e.content), - "l": write_location(note) + "c": write_blocks(¬e.content, serializer), + "s": serializer.to_json_ref(¬e.source_info) }), // we can't test this just yet because // our citationNoteNum counter doesn't match Pandoc's @@ -158,16 +346,16 @@ fn write_inline(inline: &Inline) -> Value { cite.citations.iter().map(|citation| { json!({ "citationId": citation.id.clone(), - "citationPrefix": write_inlines(&citation.prefix), - "citationSuffix": write_inlines(&citation.suffix), + "citationPrefix": write_inlines(&citation.prefix, serializer), + "citationSuffix": write_inlines(&citation.suffix, serializer), "citationMode": write_citation_mode(&citation.mode), 
"citationHash": citation.hash, "citationNoteNum": citation.note_num }) }).collect::>(), - write_inlines(&cite.content) + write_inlines(&cite.content, serializer) ], - "l": write_location(cite) + "s": serializer.to_json_ref(&cite.source_info) }), Inline::Shortcode(_) | Inline::NoteReference(_) @@ -181,8 +369,13 @@ fn write_inline(inline: &Inline) -> Value { } } -fn write_inlines(inlines: &Inlines) -> Value { - json!(inlines.iter().map(write_inline).collect::>()) +fn write_inlines(inlines: &Inlines, serializer: &mut SourceInfoSerializer) -> Value { + json!( + inlines + .iter() + .map(|inline| write_inline(inline, serializer)) + .collect::>() + ) } fn write_list_attributes(attr: &ListAttributes) -> Value { @@ -204,22 +397,28 @@ fn write_list_attributes(attr: &ListAttributes) -> Value { json!([attr.0, number_style, number_delimiter]) } -fn write_blockss(blockss: &[Vec]) -> Value { +fn write_blockss(blockss: &[Vec], serializer: &mut SourceInfoSerializer) -> Value { json!( blockss .iter() - .map(|blocks| blocks.iter().map(write_block).collect::>()) + .map(|blocks| blocks + .iter() + .map(|block| write_block(block, serializer)) + .collect::>()) .collect::>() ) } -fn write_caption(caption: &Caption) -> Value { +fn write_caption(caption: &Caption, serializer: &mut SourceInfoSerializer) -> Value { json!([ - &caption.short.as_ref().map(|s| write_inlines(&s)), + &caption + .short + .as_ref() + .map(|s| write_inlines(&s, serializer)), &caption .long .as_ref() - .map(|l| write_blocks(&l)) + .map(|l| write_blocks(&l, serializer)) .unwrap_or_else(|| json!([])), ]) } @@ -244,56 +443,80 @@ fn write_colspec(colspec: &crate::pandoc::table::ColSpec) -> Value { json!([write_alignment(&colspec.0), write_colwidth(&colspec.1)]) } -fn write_cell(cell: &crate::pandoc::table::Cell) -> Value { +fn write_cell(cell: &crate::pandoc::table::Cell, serializer: &mut SourceInfoSerializer) -> Value { json!([ write_attr(&cell.attr), write_alignment(&cell.alignment), cell.row_span, cell.col_span, - write_blocks(&cell.content) + write_blocks(&cell.content, serializer) ]) } -fn write_row(row: &crate::pandoc::table::Row) -> Value { +fn write_row(row: &crate::pandoc::table::Row, serializer: &mut SourceInfoSerializer) -> Value { json!([ write_attr(&row.attr), - row.cells.iter().map(write_cell).collect::>() + row.cells + .iter() + .map(|cell| write_cell(cell, serializer)) + .collect::>() ]) } -fn write_table_head(head: &crate::pandoc::table::TableHead) -> Value { +fn write_table_head( + head: &crate::pandoc::table::TableHead, + serializer: &mut SourceInfoSerializer, +) -> Value { json!([ write_attr(&head.attr), - head.rows.iter().map(write_row).collect::>() + head.rows + .iter() + .map(|row| write_row(row, serializer)) + .collect::>() ]) } -fn write_table_body(body: &crate::pandoc::table::TableBody) -> Value { +fn write_table_body( + body: &crate::pandoc::table::TableBody, + serializer: &mut SourceInfoSerializer, +) -> Value { json!([ write_attr(&body.attr), body.rowhead_columns, - body.head.iter().map(write_row).collect::>(), - body.body.iter().map(write_row).collect::>() + body.head + .iter() + .map(|row| write_row(row, serializer)) + .collect::>(), + body.body + .iter() + .map(|row| write_row(row, serializer)) + .collect::>() ]) } -fn write_table_foot(foot: &crate::pandoc::table::TableFoot) -> Value { +fn write_table_foot( + foot: &crate::pandoc::table::TableFoot, + serializer: &mut SourceInfoSerializer, +) -> Value { json!([ write_attr(&foot.attr), - foot.rows.iter().map(write_row).collect::>() + foot.rows + .iter() + .map(|row| 
write_row(row, serializer)) + .collect::>() ]) } -fn write_block(block: &Block) -> Value { +fn write_block(block: &Block, serializer: &mut SourceInfoSerializer) -> Value { match block { Block::Figure(figure) => json!({ "t": "Figure", "c": [ write_attr(&figure.attr), - write_caption(&figure.caption), - write_blocks(&figure.content) + write_caption(&figure.caption, serializer), + write_blocks(&figure.content, serializer) ], - "l": write_location(figure) + "s": serializer.to_json_ref(&figure.source_info) }), Block::DefinitionList(deflist) => json!({ "t": "DefinitionList", @@ -301,99 +524,99 @@ fn write_block(block: &Block) -> Value { .iter() .map(|(term, definition)| { json!([ - write_inlines(term), - write_blockss(&definition), + write_inlines(term, serializer), + write_blockss(&definition, serializer), ]) }) .collect::>(), - "l": write_location(deflist), + "s": serializer.to_json_ref(&deflist.source_info), }), Block::OrderedList(orderedlist) => json!({ "t": "OrderedList", "c": [ write_list_attributes(&orderedlist.attr), - write_blockss(&orderedlist.content), + write_blockss(&orderedlist.content, serializer), ], - "l": write_location(orderedlist), + "s": serializer.to_json_ref(&orderedlist.source_info), }), Block::RawBlock(raw) => json!({ "t": "RawBlock", "c": [raw.format.clone(), raw.text.clone()], - "l": write_location(raw), + "s": serializer.to_json_ref(&raw.source_info), }), Block::HorizontalRule(block) => json!({ "t": "HorizontalRule", - "l": write_location(block), + "s": serializer.to_json_ref(&block.source_info), }), Block::Table(table) => json!({ "t": "Table", "c": [ write_attr(&table.attr), - write_caption(&table.caption), + write_caption(&table.caption, serializer), table.colspec.iter().map(write_colspec).collect::>(), - write_table_head(&table.head), - table.bodies.iter().map(write_table_body).collect::>(), - write_table_foot(&table.foot) + write_table_head(&table.head, serializer), + table.bodies.iter().map(|body| write_table_body(body, serializer)).collect::>(), + write_table_foot(&table.foot, serializer) ], - "l": write_location(table), + "s": serializer.to_json_ref(&table.source_info), }), Block::Div(div) => json!({ "t": "Div", - "c": [write_attr(&div.attr), write_blocks(&div.content)], - "l": write_location(div), + "c": [write_attr(&div.attr), write_blocks(&div.content, serializer)], + "s": serializer.to_json_ref(&div.source_info), }), Block::BlockQuote(quote) => json!({ "t": "BlockQuote", - "c": write_blocks("e.content), - "l": write_location(quote), + "c": write_blocks("e.content, serializer), + "s": serializer.to_json_ref("e.source_info), }), Block::LineBlock(lineblock) => json!({ "t": "LineBlock", - "c": lineblock.content.iter().map(write_inlines).collect::>(), - "l": write_location(lineblock), + "c": lineblock.content.iter().map(|inlines| write_inlines(inlines, serializer)).collect::>(), + "s": serializer.to_json_ref(&lineblock.source_info), }), Block::Paragraph(para) => json!({ "t": "Para", - "c": write_inlines(¶.content), - "l": write_location(para), + "c": write_inlines(¶.content, serializer), + "s": serializer.to_json_ref(¶.source_info), }), Block::Header(header) => { json!({ "t": "Header", - "c": [header.level, write_attr(&header.attr), write_inlines(&header.content)], - "l": write_location(header), + "c": [header.level, write_attr(&header.attr), write_inlines(&header.content, serializer)], + "s": serializer.to_json_ref(&header.source_info), }) } Block::CodeBlock(codeblock) => json!({ "t": "CodeBlock", "c": [write_attr(&codeblock.attr), codeblock.text], - "l": 
write_location(codeblock), + "s": serializer.to_json_ref(&codeblock.source_info), }), Block::Plain(plain) => json!({ "t": "Plain", - "c": write_inlines(&plain.content), - "l": write_location(plain), + "c": write_inlines(&plain.content, serializer), + "s": serializer.to_json_ref(&plain.source_info), }), Block::BulletList(bulletlist) => json!({ "t": "BulletList", - "c": bulletlist.content.iter().map(|blocks| blocks.iter().map(write_block).collect::>()).collect::>(), - "l": write_location(bulletlist), + "c": bulletlist.content.iter().map(|blocks| blocks.iter().map(|block| write_block(block, serializer)).collect::>()).collect::>(), + "s": serializer.to_json_ref(&bulletlist.source_info), }), Block::BlockMetadata(meta) => json!({ "t": "BlockMetadata", - "c": write_meta(&meta.meta), - "l": write_location(meta), + "c": write_meta_value_with_source_info(&meta.meta, serializer), + "s": serializer.to_json_ref(&meta.source_info), }), Block::NoteDefinitionPara(refdef) => json!({ "t": "NoteDefinitionPara", - "c": [refdef.id, write_inlines(&refdef.content)], - "l": write_location(refdef), + "c": [refdef.id, write_inlines(&refdef.content, serializer)], + "s": serializer.to_json_ref(&refdef.source_info), }), Block::NoteDefinitionFencedBlock(refdef) => json!({ "t": "NoteDefinitionFencedBlock", - "c": [refdef.id, write_blocks(&refdef.content)], - "l": write_location(refdef), + "c": [refdef.id, write_blocks(&refdef.content, serializer)], + "s": serializer.to_json_ref(&refdef.source_info), }), Block::CaptionBlock(_) => { panic!( @@ -403,55 +626,130 @@ fn write_block(block: &Block) -> Value { } } -fn write_meta_value(value: &crate::pandoc::MetaValue) -> Value { +fn write_meta_value_with_source_info( + value: &crate::pandoc::MetaValueWithSourceInfo, + serializer: &mut SourceInfoSerializer, +) -> Value { match value { - crate::pandoc::MetaValue::MetaString(s) => json!({ + crate::pandoc::MetaValueWithSourceInfo::MetaString { value, source_info } => json!({ "t": "MetaString", - "c": s + "c": value, + "s": serializer.to_json_ref(source_info) }), - crate::pandoc::MetaValue::MetaInlines(inlines) => json!({ + crate::pandoc::MetaValueWithSourceInfo::MetaBool { value, source_info } => json!({ + "t": "MetaBool", + "c": value, + "s": serializer.to_json_ref(source_info) + }), + crate::pandoc::MetaValueWithSourceInfo::MetaInlines { + content, + source_info, + } => json!({ "t": "MetaInlines", - "c": write_inlines(inlines) + "c": write_inlines(content, serializer), + "s": serializer.to_json_ref(source_info) }), - crate::pandoc::MetaValue::MetaBlocks(blocks) => json!({ + crate::pandoc::MetaValueWithSourceInfo::MetaBlocks { + content, + source_info, + } => json!({ "t": "MetaBlocks", - "c": write_blocks(blocks) + "c": write_blocks(content, serializer), + "s": serializer.to_json_ref(source_info) }), - crate::pandoc::MetaValue::MetaList(list) => json!({ + crate::pandoc::MetaValueWithSourceInfo::MetaList { items, source_info } => json!({ "t": "MetaList", - "c": list.iter().map(write_meta_value).collect::>() + "c": items.iter().map(|item| write_meta_value_with_source_info(item, serializer)).collect::>(), + "s": serializer.to_json_ref(source_info) }), - crate::pandoc::MetaValue::MetaMap(map) => json!({ + crate::pandoc::MetaValueWithSourceInfo::MetaMap { + entries, + source_info, + } => json!({ "t": "MetaMap", - "c": map.iter().map(|(k, v)| json!([k, write_meta_value(v)])).collect::>() - }), - crate::pandoc::MetaValue::MetaBool(b) => json!({ - "t": "MetaBool", - "c": b + "c": entries.iter().map(|entry| json!({ + "key": entry.key, + 
"key_source": serializer.to_json_ref(&entry.key_source), + "value": write_meta_value_with_source_info(&entry.value, serializer) + })).collect::>(), + "s": serializer.to_json_ref(source_info) }), } } -fn write_meta(meta: &crate::pandoc::Meta) -> Value { - let map: serde_json::Map = meta - .iter() - .map(|(k, v)| (k.clone(), write_meta_value(v))) - .collect(); - Value::Object(map) +fn write_meta( + meta: &crate::pandoc::MetaValueWithSourceInfo, + serializer: &mut SourceInfoSerializer, +) -> Value { + // meta should be a MetaMap variant + // Write as Pandoc-compatible object format + match meta { + crate::pandoc::MetaValueWithSourceInfo::MetaMap { entries, .. } => { + let map: serde_json::Map = entries + .iter() + .map(|entry| { + ( + entry.key.clone(), + write_meta_value_with_source_info(&entry.value, serializer), + ) + }) + .collect(); + Value::Object(map) + } + _ => panic!("Expected MetaMap for Pandoc.meta"), + } } -fn write_blocks(blocks: &[Block]) -> Value { - json!(blocks.iter().map(write_block).collect::>()) +fn write_blocks(blocks: &[Block], serializer: &mut SourceInfoSerializer) -> Value { + json!( + blocks + .iter() + .map(|block| write_block(block, serializer)) + .collect::>() + ) } fn write_pandoc(pandoc: &Pandoc, context: &ASTContext) -> Value { + // Create the SourceInfo serializer + let mut serializer = SourceInfoSerializer::new(); + + // Serialize AST, which will build the pool + let meta_json = write_meta(&pandoc.meta, &mut serializer); + let blocks_json = write_blocks(&pandoc.blocks, &mut serializer); + + // Extract top-level key sources from metadata using the serializer + let meta_top_level_key_sources: serde_json::Map = + if let crate::pandoc::MetaValueWithSourceInfo::MetaMap { entries, .. } = &pandoc.meta { + entries + .iter() + .map(|entry| (entry.key.clone(), serializer.to_json_ref(&entry.key_source))) + .collect() + } else { + serde_json::Map::new() + }; + + // Build astContext with pool and metaTopLevelKeySources + let mut ast_context_obj = serde_json::Map::new(); + ast_context_obj.insert("filenames".to_string(), json!(context.filenames)); + + // Only include sourceInfoPool if non-empty + if !serializer.pool.is_empty() { + ast_context_obj.insert("sourceInfoPool".to_string(), json!(serializer.pool)); + } + + // Only include metaTopLevelKeySources if non-empty + if !meta_top_level_key_sources.is_empty() { + ast_context_obj.insert( + "metaTopLevelKeySources".to_string(), + Value::Object(meta_top_level_key_sources), + ); + } + json!({ "pandoc-api-version": [1, 23, 1], - "meta": write_meta(&pandoc.meta), - "blocks": write_blocks(&pandoc.blocks), - "astContext": { - "filenames": context.filenames, - }, + "meta": meta_json, + "blocks": blocks_json, + "astContext": ast_context_obj, }) } diff --git a/crates/quarto-markdown-pandoc/src/writers/qmd.rs b/crates/quarto-markdown-pandoc/src/writers/qmd.rs index 094e314..965ad07 100644 --- a/crates/quarto-markdown-pandoc/src/writers/qmd.rs +++ b/crates/quarto-markdown-pandoc/src/writers/qmd.rs @@ -6,13 +6,11 @@ use crate::pandoc::attr::is_empty_attr; use crate::pandoc::block::MetaBlock; use crate::pandoc::list::{ListNumberDelim, ListNumberStyle}; -use crate::pandoc::meta::MetaValue; use crate::pandoc::table::{Alignment, Cell, Table}; use crate::pandoc::{ Block, BlockQuote, BulletList, CodeBlock, DefinitionList, Figure, Header, HorizontalRule, - LineBlock, Meta, OrderedList, Pandoc, Paragraph, Plain, RawBlock, Str, + LineBlock, OrderedList, Pandoc, Paragraph, Plain, RawBlock, Str, }; -use 
crate::utils::string_write_adapter::StringWriteAdapter; use hashlink::LinkedHashMap; use std::io::{self, Write}; use yaml_rust2::{Yaml, YamlEmitter}; @@ -173,84 +171,105 @@ impl<'a, W: Write + ?Sized> Write for OrderedListContext<'a, W> { } } -/// Convert a MetaValue to a yaml_rust2::Yaml value +/// Convert a MetaValueWithSourceInfo to a yaml_rust2::Yaml value /// MetaInlines and MetaBlocks are rendered using the qmd writer -fn meta_value_to_yaml(value: &MetaValue) -> std::io::Result { +fn meta_value_with_source_info_to_yaml( + value: &crate::pandoc::MetaValueWithSourceInfo, +) -> std::io::Result { match value { - MetaValue::MetaString(s) => Ok(Yaml::String(s.clone())), - MetaValue::MetaBool(b) => Ok(Yaml::Boolean(*b)), - MetaValue::MetaInlines(inlines) => { + crate::pandoc::MetaValueWithSourceInfo::MetaString { value, .. } => { + Ok(Yaml::String(value.clone())) + } + crate::pandoc::MetaValueWithSourceInfo::MetaBool { value, .. } => Ok(Yaml::Boolean(*value)), + crate::pandoc::MetaValueWithSourceInfo::MetaInlines { content, .. } => { // Render inlines using the qmd writer - let mut buffer = String::new(); - let mut adapter = StringWriteAdapter::new(&mut buffer); - for inline in inlines { - write_inline(inline, &mut adapter)?; + let mut buffer = Vec::::new(); + for inline in content { + write_inline(inline, &mut buffer)?; } - Ok(Yaml::String(buffer)) + let result = String::from_utf8(buffer) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; + Ok(Yaml::String(result)) } - MetaValue::MetaBlocks(blocks) => { + crate::pandoc::MetaValueWithSourceInfo::MetaBlocks { content, .. } => { // Render blocks using the qmd writer - let mut buffer = String::new(); - let mut adapter = StringWriteAdapter::new(&mut buffer); - for (i, block) in blocks.iter().enumerate() { + let mut buffer = Vec::::new(); + for (i, block) in content.iter().enumerate() { if i > 0 { - writeln!(&mut adapter)?; + writeln!(&mut buffer)?; } - write_block(block, &mut adapter)?; + write_block(block, &mut buffer)?; } + let result = String::from_utf8(buffer) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; // Trim trailing newline to avoid extra spacing in YAML - let trimmed = buffer.trim_end(); + let trimmed = result.trim_end(); Ok(Yaml::String(trimmed.to_string())) } - MetaValue::MetaList(list) => { + crate::pandoc::MetaValueWithSourceInfo::MetaList { items, .. } => { let mut yaml_list = Vec::new(); - for item in list { - yaml_list.push(meta_value_to_yaml(item)?); + for item in items { + yaml_list.push(meta_value_with_source_info_to_yaml(item)?); } Ok(Yaml::Array(yaml_list)) } - MetaValue::MetaMap(map) => { + crate::pandoc::MetaValueWithSourceInfo::MetaMap { entries, .. 
} => { // LinkedHashMap preserves insertion order let mut yaml_map = LinkedHashMap::new(); - for (key, val) in map { - yaml_map.insert(Yaml::String(key.clone()), meta_value_to_yaml(val)?); + for entry in entries { + yaml_map.insert( + Yaml::String(entry.key.clone()), + meta_value_with_source_info_to_yaml(&entry.value)?, + ); } Ok(Yaml::Hash(yaml_map)) } } } -fn write_meta(meta: &Meta, buf: &mut T) -> std::io::Result { - if meta.is_empty() { - Ok(false) - } else { - // Convert Meta to YAML - // LinkedHashMap preserves insertion order - let mut yaml_map = LinkedHashMap::new(); - for (key, value) in meta { - yaml_map.insert(Yaml::String(key.clone()), meta_value_to_yaml(value)?); - } - let yaml = Yaml::Hash(yaml_map); +fn write_meta( + meta: &crate::pandoc::MetaValueWithSourceInfo, + buf: &mut T, +) -> std::io::Result { + // meta should be a MetaMap variant + match meta { + crate::pandoc::MetaValueWithSourceInfo::MetaMap { entries, .. } => { + if entries.is_empty() { + Ok(false) + } else { + // Convert Meta to YAML + // LinkedHashMap preserves insertion order + let mut yaml_map = LinkedHashMap::new(); + for entry in entries { + yaml_map.insert( + Yaml::String(entry.key.clone()), + meta_value_with_source_info_to_yaml(&entry.value)?, + ); + } + let yaml = Yaml::Hash(yaml_map); + + // Emit YAML to string + let mut yaml_str = String::new(); + let mut emitter = YamlEmitter::new(&mut yaml_str); + emitter + .dump(&yaml) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + + // The YamlEmitter adds "---\n" at the start and includes the content + // We need to add the closing "---\n" + // First, ensure yaml_str ends with a newline + if !yaml_str.ends_with('\n') { + yaml_str.push('\n'); + } - // Emit YAML to string - let mut yaml_str = String::new(); - let mut emitter = YamlEmitter::new(&mut yaml_str); - emitter - .dump(&yaml) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?; + // Write the YAML metadata block + write!(buf, "{}", yaml_str)?; + writeln!(buf, "---")?; - // The YamlEmitter adds "---\n" at the start and includes the content - // We need to add the closing "---\n" - // First, ensure yaml_str ends with a newline - if !yaml_str.ends_with('\n') { - yaml_str.push('\n'); + Ok(true) + } } - - // Write the YAML metadata block - write!(buf, "{}", yaml_str)?; - writeln!(buf, "---")?; - - Ok(true) + _ => panic!("Expected MetaMap for metadata"), } } @@ -624,9 +643,10 @@ fn write_table(table: &Table, buf: &mut dyn std::io::Write) -> std::io::Result<( for row in &all_rows { let mut cell_strings = Vec::new(); for (i, cell) in row.cells.iter().take(num_cols).enumerate() { - let mut content = String::new(); - let mut adapter = StringWriteAdapter::new(&mut content); - write_cell_content(cell, &mut adapter)?; + let mut buffer = Vec::::new(); + write_cell_content(cell, &mut buffer)?; + let content = String::from_utf8(buffer) + .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; let content = content.trim().to_string(); if content.len() > max_widths[i] { diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/001.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/001.qmd.snapshot index b6e5806..2fb967c 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/001.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/001.qmd.snapshot @@ -1 +1 @@ 
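The regenerated snapshots below show the payoff: every per-node `"l"` location object collapses to an integer `"s"` reference into `astContext.sourceInfoPool`. For a single node, abridged from the 001 snapshot:

```rust
use serde_json::json;

fn main() {
    // The same Str node, before and after the change (abridged from 001.qmd).
    let old = json!({"t": "Str", "c": "This",
        "l": {"start": {"offset": 0, "row": 0, "column": 0},
              "end": {"offset": 4, "row": 0, "column": 4},
              "filenameIndex": 0}});
    let new = json!({"t": "Str", "c": "This", "s": 0}); // 0 indexes the pool
    assert!(old.to_string().len() > new.to_string().len());
}
```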
-{"astContext":{"filenames":["tests/snapshots/json/001.qmd"]},"blocks":[{"c":[{"c":"This","l":{"end":{"column":4,"offset":4,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"},{"l":{"end":{"column":5,"offset":5,"row":0},"filenameIndex":0,"start":{"column":4,"offset":4,"row":0}},"t":"Space"},{"c":"is","l":{"end":{"column":7,"offset":7,"row":0},"filenameIndex":0,"start":{"column":5,"offset":5,"row":0}},"t":"Str"},{"l":{"end":{"column":8,"offset":8,"row":0},"filenameIndex":0,"start":{"column":7,"offset":7,"row":0}},"t":"Space"},{"c":"a","l":{"end":{"column":9,"offset":9,"row":0},"filenameIndex":0,"start":{"column":8,"offset":8,"row":0}},"t":"Str"},{"l":{"end":{"column":10,"offset":10,"row":0},"filenameIndex":0,"start":{"column":9,"offset":9,"row":0}},"t":"Space"},{"c":[{"c":"bold","l":{"end":{"column":16,"offset":16,"row":0},"filenameIndex":0,"start":{"column":12,"offset":12,"row":0}},"t":"Str"}],"l":{"end":{"column":18,"offset":18,"row":0},"filenameIndex":0,"start":{"column":10,"offset":10,"row":0}},"t":"Strong"},{"l":{"end":{"column":19,"offset":19,"row":0},"filenameIndex":0,"start":{"column":18,"offset":18,"row":0}},"t":"Space"},{"c":"test.","l":{"end":{"column":24,"offset":24,"row":0},"filenameIndex":0,"start":{"column":19,"offset":19,"row":0}},"t":"Str"}],"l":{"end":{"column":0,"offset":25,"row":1},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Para"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file +{"astContext":{"filenames":["tests/snapshots/json/001.qmd"],"sourceInfoPool":[{"d":0,"r":[0,0,0,4,0,4],"t":0},{"d":0,"r":[4,0,4,5,0,5],"t":0},{"d":0,"r":[5,0,5,7,0,7],"t":0},{"d":0,"r":[7,0,7,8,0,8],"t":0},{"d":0,"r":[8,0,8,9,0,9],"t":0},{"d":0,"r":[9,0,9,10,0,10],"t":0},{"d":0,"r":[12,0,12,16,0,16],"t":0},{"d":0,"r":[10,0,10,18,0,18],"t":0},{"d":0,"r":[18,0,18,19,0,19],"t":0},{"d":0,"r":[19,0,19,23,0,23],"t":0},{"d":0,"r":[23,0,23,24,0,24],"t":0},{"d":[[9,0,4],[10,4,1]],"r":[0,0,0,5,0,0],"t":2},{"d":0,"r":[0,0,0,25,1,0],"t":0}]},"blocks":[{"c":[{"c":"This","s":0,"t":"Str"},{"s":1,"t":"Space"},{"c":"is","s":2,"t":"Str"},{"s":3,"t":"Space"},{"c":"a","s":4,"t":"Str"},{"s":5,"t":"Space"},{"c":[{"c":"bold","s":6,"t":"Str"}],"s":7,"t":"Strong"},{"s":8,"t":"Space"},{"c":"test.","s":11,"t":"Str"}],"s":12,"t":"Para"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/002.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/002.qmd.snapshot index e0fa98b..80f92d9 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/002.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/002.qmd.snapshot @@ -1 +1 @@ -{"astContext":{"filenames":["tests/snapshots/json/002.qmd"]},"blocks":[{"c":[["",["hello"],[]],[]],"l":{"end":{"column":0,"offset":63,"row":11},"filenameIndex":0,"start":{"column":0,"offset":26,"row":4}},"t":"Div"}],"meta":{"nested":{"c":[{"c":"meta","l":{"end":{"column":4,"offset":4,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"}],"t":"MetaInlines"},"title":{"c":[{"c":"metadata1","l":{"end":{"column":9,"offset":9,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"}],"t":"MetaInlines"}},"pandoc-api-version":[1,23,1]} \ No newline at end of file 
+{"astContext":{"filenames":["tests/snapshots/json/002.qmd"],"metaTopLevelKeySources":{"nested":14,"title":12},"sourceInfoPool":[{"d":0,"r":[0,0,0,8,0,8],"t":0},{"d":0,"r":[8,0,8,9,0,9],"t":0},{"d":[[0,0,8],[1,8,1]],"r":[0,0,0,9,0,0],"t":2},{"d":0,"r":[0,0,0,63,11,0],"t":0},{"d":[3,4],"r":[0,0,0,16,0,0],"t":1},{"d":[4,7],"r":[0,0,0,9,0,0],"t":1},{"d":0,"r":[0,0,0,4,0,4],"t":0},{"d":0,"r":[37,6,0,58,9,0],"t":0},{"d":[7,4],"r":[0,0,0,12,0,0],"t":1},{"d":[8,8],"r":[0,0,0,4,0,0],"t":1},{"d":0,"r":[26,4,0,63,11,0],"t":0},{"d":[3,4],"r":[0,0,0,16,0,0],"t":1},{"d":[11,0],"r":[0,0,0,5,0,0],"t":1},{"d":[7,4],"r":[0,0,0,12,0,0],"t":1},{"d":[13,0],"r":[0,0,0,6,0,0],"t":1}]},"blocks":[{"c":[["",["hello"],[]],[]],"s":10,"t":"Div"}],"meta":{"nested":{"c":[{"c":"meta","s":6,"t":"Str"}],"s":9,"t":"MetaInlines"},"title":{"c":[{"c":"metadata1","s":2,"t":"Str"}],"s":5,"t":"MetaInlines"}},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/003.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/003.qmd.snapshot index b0572a9..26dc690 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/003.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/003.qmd.snapshot @@ -1 +1 @@ -{"astContext":{"filenames":["tests/snapshots/json/003.qmd"]},"blocks":[{"c":[["",["hello"],[]],[{"c":{"_scope":{"c":[{"c":"lexical","l":{"end":{"column":7,"offset":7,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"}],"t":"MetaInlines"},"nested":{"c":[{"c":"meta","l":{"end":{"column":4,"offset":4,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"}],"t":"MetaInlines"}},"l":{"end":{"column":0,"offset":74,"row":10},"filenameIndex":null,"start":{"column":0,"offset":37,"row":6}},"t":"BlockMetadata"}]],"l":{"end":{"column":0,"offset":79,"row":12},"filenameIndex":0,"start":{"column":0,"offset":26,"row":4}},"t":"Div"}],"meta":{"title":{"c":[{"c":"metadata1","l":{"end":{"column":9,"offset":9,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"}],"t":"MetaInlines"}},"pandoc-api-version":[1,23,1]} \ No newline at end of file 
+{"astContext":{"filenames":["tests/snapshots/json/003.qmd"],"metaTopLevelKeySources":{"title":22},"sourceInfoPool":[{"d":0,"r":[0,0,0,8,0,8],"t":0},{"d":0,"r":[8,0,8,9,0,9],"t":0},{"d":[[0,0,8],[1,8,1]],"r":[0,0,0,9,0,0],"t":2},{"d":0,"r":[0,0,0,79,12,0],"t":0},{"d":[3,4],"r":[0,0,0,16,0,0],"t":1},{"d":[4,7],"r":[0,0,0,9,0,0],"t":1},{"d":0,"r":[37,6,0,74,10,0],"t":0},{"d":[6,4],"r":[0,0,0,28,0,0],"t":1},{"d":[7,0],"r":[0,0,0,6,0,0],"t":1},{"d":0,"r":[0,0,0,7,0,7],"t":0},{"d":[6,4],"r":[0,0,0,28,0,0],"t":1},{"d":[10,8],"r":[0,0,0,7,0,0],"t":1},{"d":[6,4],"r":[0,0,0,28,0,0],"t":1},{"d":[12,16],"r":[0,0,0,6,0,0],"t":1},{"d":0,"r":[0,0,0,4,0,4],"t":0},{"d":[6,4],"r":[0,0,0,28,0,0],"t":1},{"d":[15,24],"r":[0,0,0,4,0,0],"t":1},{"d":[6,4],"r":[0,0,0,28,0,0],"t":1},{"d":[17,6],"r":[0,0,0,22,0,0],"t":1},{"d":0,"r":[37,6,0,74,10,0],"t":0},{"d":0,"r":[26,4,0,79,12,0],"t":0},{"d":[3,4],"r":[0,0,0,16,0,0],"t":1},{"d":[21,0],"r":[0,0,0,5,0,0],"t":1}]},"blocks":[{"c":[["",["hello"],[]],[{"c":{"c":[{"key":"_scope","key_source":8,"value":{"c":[{"c":"lexical","s":9,"t":"Str"}],"s":11,"t":"MetaInlines"}},{"key":"nested","key_source":13,"value":{"c":[{"c":"meta","s":14,"t":"Str"}],"s":16,"t":"MetaInlines"}}],"s":18,"t":"MetaMap"},"s":19,"t":"BlockMetadata"}]],"s":20,"t":"Div"}],"meta":{"title":{"c":[{"c":"metadata1","s":2,"t":"Str"}],"s":5,"t":"MetaInlines"}},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot index 5c8df94..b38b9bf 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/math-with-attr.qmd.snapshot @@ -1 +1 @@ -{"astContext":{"filenames":["tests/snapshots/json/math-with-attr.qmd"]},"blocks":[{"c":[{"c":"Inline","l":{"end":{"column":6,"offset":6,"row":0},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Str"},{"l":{"end":{"column":7,"offset":7,"row":0},"filenameIndex":0,"start":{"column":6,"offset":6,"row":0}},"t":"Space"},{"c":"math","l":{"end":{"column":11,"offset":11,"row":0},"filenameIndex":0,"start":{"column":7,"offset":7,"row":0}},"t":"Str"},{"l":{"end":{"column":12,"offset":12,"row":0},"filenameIndex":0,"start":{"column":11,"offset":11,"row":0}},"t":"Space"},{"c":"with","l":{"end":{"column":16,"offset":16,"row":0},"filenameIndex":0,"start":{"column":12,"offset":12,"row":0}},"t":"Str"},{"l":{"end":{"column":17,"offset":17,"row":0},"filenameIndex":0,"start":{"column":16,"offset":16,"row":0}},"t":"Space"},{"c":"attribute:","l":{"end":{"column":27,"offset":27,"row":0},"filenameIndex":0,"start":{"column":17,"offset":17,"row":0}},"t":"Str"},{"l":{"end":{"column":28,"offset":28,"row":0},"filenameIndex":0,"start":{"column":27,"offset":27,"row":0}},"t":"Space"},{"c":[["eq-einstein",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"E = 
mc^2"],"l":{"end":{"column":38,"offset":38,"row":0},"filenameIndex":0,"start":{"column":28,"offset":28,"row":0}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":54,"row":1},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Para"},{"c":[{"c":"Display","l":{"end":{"column":7,"offset":62,"row":2},"filenameIndex":0,"start":{"column":0,"offset":55,"row":2}},"t":"Str"},{"l":{"end":{"column":8,"offset":63,"row":2},"filenameIndex":0,"start":{"column":7,"offset":62,"row":2}},"t":"Space"},{"c":"math","l":{"end":{"column":12,"offset":67,"row":2},"filenameIndex":0,"start":{"column":8,"offset":63,"row":2}},"t":"Str"},{"l":{"end":{"column":13,"offset":68,"row":2},"filenameIndex":0,"start":{"column":12,"offset":67,"row":2}},"t":"Space"},{"c":"with","l":{"end":{"column":17,"offset":72,"row":2},"filenameIndex":0,"start":{"column":13,"offset":68,"row":2}},"t":"Str"},{"l":{"end":{"column":18,"offset":73,"row":2},"filenameIndex":0,"start":{"column":17,"offset":72,"row":2}},"t":"Space"},{"c":"attribute:","l":{"end":{"column":28,"offset":83,"row":2},"filenameIndex":0,"start":{"column":18,"offset":73,"row":2}},"t":"Str"}],"l":{"end":{"column":0,"offset":84,"row":3},"filenameIndex":0,"start":{"column":0,"offset":55,"row":2}},"t":"Para"},{"c":[{"c":[["eq-gaussian",["quarto-math-with-attribute"],[]],[{"c":[{"t":"DisplayMath"},"\n\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}\n"],"l":{"end":{"column":2,"offset":139,"row":6},"filenameIndex":0,"start":{"column":0,"offset":85,"row":4}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":155,"row":7},"filenameIndex":0,"start":{"column":0,"offset":85,"row":4}},"t":"Para"},{"c":[{"c":"Another","l":{"end":{"column":7,"offset":163,"row":8},"filenameIndex":0,"start":{"column":0,"offset":156,"row":8}},"t":"Str"},{"l":{"end":{"column":8,"offset":164,"row":8},"filenameIndex":0,"start":{"column":7,"offset":163,"row":8}},"t":"Space"},{"c":"inline","l":{"end":{"column":14,"offset":170,"row":8},"filenameIndex":0,"start":{"column":8,"offset":164,"row":8}},"t":"Str"},{"l":{"end":{"column":15,"offset":171,"row":8},"filenameIndex":0,"start":{"column":14,"offset":170,"row":8}},"t":"Space"},{"c":"example:","l":{"end":{"column":23,"offset":179,"row":8},"filenameIndex":0,"start":{"column":15,"offset":171,"row":8}},"t":"Str"},{"l":{"end":{"column":24,"offset":180,"row":8},"filenameIndex":0,"start":{"column":23,"offset":179,"row":8}},"t":"Space"},{"c":[["eq-pythagorean",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"a^2 + b^2 = c^2"],"l":{"end":{"column":41,"offset":197,"row":8},"filenameIndex":0,"start":{"column":24,"offset":180,"row":8}},"t":"Math"}]],"l":{"end":{"column":0,"offset":0,"row":0},"filenameIndex":null,"start":{"column":0,"offset":0,"row":0}},"t":"Span"}],"l":{"end":{"column":0,"offset":216,"row":9},"filenameIndex":0,"start":{"column":0,"offset":156,"row":8}},"t":"Para"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file 
+{"astContext":{"filenames":["tests/snapshots/json/math-with-attr.qmd"],"sourceInfoPool":[{"d":0,"r":[0,0,0,6,0,6],"t":0},{"d":0,"r":[6,0,6,7,0,7],"t":0},{"d":0,"r":[7,0,7,11,0,11],"t":0},{"d":0,"r":[11,0,11,12,0,12],"t":0},{"d":0,"r":[12,0,12,16,0,16],"t":0},{"d":0,"r":[16,0,16,17,0,17],"t":0},{"d":0,"r":[17,0,17,26,0,26],"t":0},{"d":0,"r":[26,0,26,27,0,27],"t":0},{"d":[[6,0,9],[7,9,1]],"r":[0,0,0,10,0,0],"t":2},{"d":0,"r":[27,0,27,28,0,28],"t":0},{"d":0,"r":[28,0,28,38,0,38],"t":0},{"d":0,"r":[0,0,0,0,0,0],"t":0},{"d":0,"r":[0,0,0,54,1,0],"t":0},{"d":0,"r":[55,2,0,62,2,7],"t":0},{"d":0,"r":[62,2,7,63,2,8],"t":0},{"d":0,"r":[63,2,8,67,2,12],"t":0},{"d":0,"r":[67,2,12,68,2,13],"t":0},{"d":0,"r":[68,2,13,72,2,17],"t":0},{"d":0,"r":[72,2,17,73,2,18],"t":0},{"d":0,"r":[73,2,18,82,2,27],"t":0},{"d":0,"r":[82,2,27,83,2,28],"t":0},{"d":[[19,0,9],[20,9,1]],"r":[0,0,0,10,0,0],"t":2},{"d":0,"r":[55,2,0,84,3,0],"t":0},{"d":0,"r":[85,4,0,139,6,2],"t":0},{"d":0,"r":[0,0,0,0,0,0],"t":0},{"d":0,"r":[85,4,0,155,7,0],"t":0},{"d":0,"r":[156,8,0,163,8,7],"t":0},{"d":0,"r":[163,8,7,164,8,8],"t":0},{"d":0,"r":[164,8,8,170,8,14],"t":0},{"d":0,"r":[170,8,14,171,8,15],"t":0},{"d":0,"r":[171,8,15,178,8,22],"t":0},{"d":0,"r":[178,8,22,179,8,23],"t":0},{"d":[[30,0,7],[31,7,1]],"r":[0,0,0,8,0,0],"t":2},{"d":0,"r":[179,8,23,180,8,24],"t":0},{"d":0,"r":[180,8,24,197,8,41],"t":0},{"d":0,"r":[0,0,0,0,0,0],"t":0},{"d":0,"r":[156,8,0,216,9,0],"t":0}]},"blocks":[{"c":[{"c":"Inline","s":0,"t":"Str"},{"s":1,"t":"Space"},{"c":"math","s":2,"t":"Str"},{"s":3,"t":"Space"},{"c":"with","s":4,"t":"Str"},{"s":5,"t":"Space"},{"c":"attribute:","s":8,"t":"Str"},{"s":9,"t":"Space"},{"c":[["eq-einstein",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"E = mc^2"],"s":10,"t":"Math"}]],"s":11,"t":"Span"}],"s":12,"t":"Para"},{"c":[{"c":"Display","s":13,"t":"Str"},{"s":14,"t":"Space"},{"c":"math","s":15,"t":"Str"},{"s":16,"t":"Space"},{"c":"with","s":17,"t":"Str"},{"s":18,"t":"Space"},{"c":"attribute:","s":21,"t":"Str"}],"s":22,"t":"Para"},{"c":[{"c":[["eq-gaussian",["quarto-math-with-attribute"],[]],[{"c":[{"t":"DisplayMath"},"\n\\int_0^\\infty e^{-x^2} dx = \\frac{\\sqrt{\\pi}}{2}\n"],"s":23,"t":"Math"}]],"s":24,"t":"Span"}],"s":25,"t":"Para"},{"c":[{"c":"Another","s":26,"t":"Str"},{"s":27,"t":"Space"},{"c":"inline","s":28,"t":"Str"},{"s":29,"t":"Space"},{"c":"example:","s":32,"t":"Str"},{"s":33,"t":"Space"},{"c":[["eq-pythagorean",["quarto-math-with-attribute"],[]],[{"c":[{"t":"InlineMath"},"a^2 + b^2 = c^2"],"s":34,"t":"Math"}]],"s":35,"t":"Span"}],"s":36,"t":"Para"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-alignment.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-alignment.qmd.snapshot index de4984b..e69e8dc 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-alignment.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-alignment.qmd.snapshot @@ -1 +1 @@ 
-{"astContext":{"filenames":["tests/snapshots/json/table-alignment.qmd"]},"blocks":[{"c":[["",[],[]],[null,[]],[[{"t":"AlignRight"},{"t":"ColWidthDefault"}],[{"t":"AlignLeft"},{"t":"ColWidthDefault"}],[{"t":"AlignCenter"},{"t":"ColWidthDefault"}],[{"t":"AlignDefault"},{"t":"ColWidthDefault"}]],[["",[],[]],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Right","l":{"end":{"column":7,"offset":7,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Str"}],"l":{"end":{"column":8,"offset":8,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Left","l":{"end":{"column":14,"offset":14,"row":0},"filenameIndex":0,"start":{"column":10,"offset":10,"row":0}},"t":"Str"}],"l":{"end":{"column":15,"offset":15,"row":0},"filenameIndex":0,"start":{"column":10,"offset":10,"row":0}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Center","l":{"end":{"column":23,"offset":23,"row":0},"filenameIndex":0,"start":{"column":17,"offset":17,"row":0}},"t":"Str"}],"l":{"end":{"column":24,"offset":24,"row":0},"filenameIndex":0,"start":{"column":17,"offset":17,"row":0}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Default","l":{"end":{"column":33,"offset":33,"row":0},"filenameIndex":0,"start":{"column":26,"offset":26,"row":0}},"t":"Str"}],"l":{"end":{"column":34,"offset":34,"row":0},"filenameIndex":0,"start":{"column":26,"offset":26,"row":0}},"t":"Plain"}]]]]]],[[["",[],[]],0,[],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"R1","l":{"end":{"column":4,"offset":76,"row":2},"filenameIndex":0,"start":{"column":2,"offset":74,"row":2}},"t":"Str"}],"l":{"end":{"column":8,"offset":80,"row":2},"filenameIndex":0,"start":{"column":2,"offset":74,"row":2}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"L1","l":{"end":{"column":12,"offset":84,"row":2},"filenameIndex":0,"start":{"column":10,"offset":82,"row":2}},"t":"Str"}],"l":{"end":{"column":15,"offset":87,"row":2},"filenameIndex":0,"start":{"column":10,"offset":82,"row":2}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"C1","l":{"end":{"column":19,"offset":91,"row":2},"filenameIndex":0,"start":{"column":17,"offset":89,"row":2}},"t":"Str"}],"l":{"end":{"column":24,"offset":96,"row":2},"filenameIndex":0,"start":{"column":17,"offset":89,"row":2}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"D1","l":{"end":{"column":28,"offset":100,"row":2},"filenameIndex":0,"start":{"column":26,"offset":98,"row":2}},"t":"Str"}],"l":{"end":{"column":34,"offset":106,"row":2},"filenameIndex":0,"start":{"column":26,"offset":98,"row":2}},"t":"Plain"}]]]],[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"R2","l":{"end":{"column":4,"offset":112,"row":3},"filenameIndex":0,"start":{"column":2,"offset":110,"row":3}},"t":"Str"}],"l":{"end":{"column":8,"offset":116,"row":3},"filenameIndex":0,"start":{"column":2,"offset":110,"row":3}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"L2","l":{"end":{"column":12,"offset":120,"row":3},"filenameIndex":0,"start":{"column":10,"offset":118,"row":3}},"t":"Str"}],"l":{"end":{"column":15,"offset":123,"row":3},"filenameIndex":0,"start":{"column":10,"offset":118,"row":3}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"C2","l":{"end":{"column":19,"offset":127,"row":3},"filenameIndex":0,"start":{"column":17,"offset":125,"row":3}},"t":"Str"}],"l":{"end":{"column":24,"offset":132,"row":3},"filenameIndex":0,"start"
:{"column":17,"offset":125,"row":3}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"D2","l":{"end":{"column":28,"offset":136,"row":3},"filenameIndex":0,"start":{"column":26,"offset":134,"row":3}},"t":"Str"}],"l":{"end":{"column":34,"offset":142,"row":3},"filenameIndex":0,"start":{"column":26,"offset":134,"row":3}},"t":"Plain"}]]]]]]],[["",[],[]],[]]],"l":{"end":{"column":0,"offset":144,"row":4},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Table"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file +{"astContext":{"filenames":["tests/snapshots/json/table-alignment.qmd"],"sourceInfoPool":[{"d":0,"r":[2,0,2,7,0,7],"t":0},{"d":0,"r":[2,0,2,8,0,8],"t":0},{"d":0,"r":[10,0,10,14,0,14],"t":0},{"d":0,"r":[10,0,10,15,0,15],"t":0},{"d":0,"r":[17,0,17,23,0,23],"t":0},{"d":0,"r":[17,0,17,24,0,24],"t":0},{"d":0,"r":[26,0,26,33,0,33],"t":0},{"d":0,"r":[26,0,26,34,0,34],"t":0},{"d":0,"r":[74,2,2,75,2,3],"t":0},{"d":0,"r":[75,2,3,76,2,4],"t":0},{"d":[[8,0,1],[9,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[74,2,2,80,2,8],"t":0},{"d":0,"r":[82,2,10,83,2,11],"t":0},{"d":0,"r":[83,2,11,84,2,12],"t":0},{"d":[[12,0,1],[13,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[82,2,10,87,2,15],"t":0},{"d":0,"r":[89,2,17,90,2,18],"t":0},{"d":0,"r":[90,2,18,91,2,19],"t":0},{"d":[[16,0,1],[17,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[89,2,17,96,2,24],"t":0},{"d":0,"r":[98,2,26,99,2,27],"t":0},{"d":0,"r":[99,2,27,100,2,28],"t":0},{"d":[[20,0,1],[21,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[98,2,26,106,2,34],"t":0},{"d":0,"r":[110,3,2,111,3,3],"t":0},{"d":0,"r":[111,3,3,112,3,4],"t":0},{"d":[[24,0,1],[25,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[110,3,2,116,3,8],"t":0},{"d":0,"r":[118,3,10,119,3,11],"t":0},{"d":0,"r":[119,3,11,120,3,12],"t":0},{"d":[[28,0,1],[29,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[118,3,10,123,3,15],"t":0},{"d":0,"r":[125,3,17,126,3,18],"t":0},{"d":0,"r":[126,3,18,127,3,19],"t":0},{"d":[[32,0,1],[33,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[125,3,17,132,3,24],"t":0},{"d":0,"r":[134,3,26,135,3,27],"t":0},{"d":0,"r":[135,3,27,136,3,28],"t":0},{"d":[[36,0,1],[37,1,1]],"r":[0,0,0,2,0,0],"t":2},{"d":0,"r":[134,3,26,142,3,34],"t":0},{"d":0,"r":[0,0,0,144,4,0],"t":0}]},"blocks":[{"c":[["",[],[]],[null,[]],[[{"t":"AlignRight"},{"t":"ColWidthDefault"}],[{"t":"AlignLeft"},{"t":"ColWidthDefault"}],[{"t":"AlignCenter"},{"t":"ColWidthDefault"}],[{"t":"AlignDefault"},{"t":"ColWidthDefault"}]],[["",[],[]],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Right","s":0,"t":"Str"}],"s":1,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Left","s":2,"t":"Str"}],"s":3,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Center","s":4,"t":"Str"}],"s":5,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Default","s":6,"t":"Str"}],"s":7,"t":"Plain"}]]]]]],[[["",[],[]],0,[],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"R1","s":10,"t":"Str"}],"s":11,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"L1","s":14,"t":"Str"}],"s":15,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"C1","s":18,"t":"Str"}],"s":19,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"D1","s":22,"t":"Str"}],"s":23,"t":"Plain"}]]]],[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"R2","s":26,"t":"Str"}],"s":27,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"L2","s":30,"t":"Str"}],"s":31,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"C2","s":34
,"t":"Str"}],"s":35,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"D2","s":38,"t":"Str"}],"s":39,"t":"Plain"}]]]]]]],[["",[],[]],[]]],"s":40,"t":"Table"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot index d451688..88c1c5b 100644 --- a/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot +++ b/crates/quarto-markdown-pandoc/tests/snapshots/json/table-caption-attr.qmd.snapshot @@ -1 +1 @@ -{"astContext":{"filenames":["tests/snapshots/json/table-caption-attr.qmd"]},"blocks":[{"c":[["",[],[["tbl-colwidths","[30,70]"]]],[null,[{"c":[{"c":"Table","l":{"end":{"column":7,"offset":80,"row":4},"filenameIndex":0,"start":{"column":2,"offset":75,"row":4}},"t":"Str"},{"l":{"end":{"column":8,"offset":81,"row":4},"filenameIndex":0,"start":{"column":7,"offset":80,"row":4}},"t":"Space"},{"c":"caption","l":{"end":{"column":15,"offset":88,"row":4},"filenameIndex":0,"start":{"column":8,"offset":81,"row":4}},"t":"Str"},{"l":{"end":{"column":16,"offset":89,"row":4},"filenameIndex":0,"start":{"column":15,"offset":88,"row":4}},"t":"Space"}],"l":{"end":{"column":0,"offset":115,"row":5},"filenameIndex":0,"start":{"column":0,"offset":72,"row":3}},"t":"Plain"}]],[[{"t":"AlignDefault"},{"t":"ColWidthDefault"}],[{"t":"AlignDefault"},{"t":"ColWidthDefault"}]],[["",[],[]],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","l":{"end":{"column":8,"offset":8,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Str"},{"l":{"end":{"column":9,"offset":9,"row":0},"filenameIndex":0,"start":{"column":8,"offset":8,"row":0}},"t":"Space"},{"c":"1","l":{"end":{"column":10,"offset":10,"row":0},"filenameIndex":0,"start":{"column":9,"offset":9,"row":0}},"t":"Str"}],"l":{"end":{"column":11,"offset":11,"row":0},"filenameIndex":0,"start":{"column":2,"offset":2,"row":0}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","l":{"end":{"column":19,"offset":19,"row":0},"filenameIndex":0,"start":{"column":13,"offset":13,"row":0}},"t":"Str"},{"l":{"end":{"column":20,"offset":20,"row":0},"filenameIndex":0,"start":{"column":19,"offset":19,"row":0}},"t":"Space"},{"c":"2","l":{"end":{"column":21,"offset":21,"row":0},"filenameIndex":0,"start":{"column":20,"offset":20,"row":0}},"t":"Str"}],"l":{"end":{"column":22,"offset":22,"row":0},"filenameIndex":0,"start":{"column":13,"offset":13,"row":0}},"t":"Plain"}]]]]]],[[["",[],[]],0,[],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","l":{"end":{"column":6,"offset":54,"row":2},"filenameIndex":0,"start":{"column":2,"offset":50,"row":2}},"t":"Str"},{"l":{"end":{"column":7,"offset":55,"row":2},"filenameIndex":0,"start":{"column":6,"offset":54,"row":2}},"t":"Space"},{"c":"1","l":{"end":{"column":8,"offset":56,"row":2},"filenameIndex":0,"start":{"column":7,"offset":55,"row":2}},"t":"Str"}],"l":{"end":{"column":11,"offset":59,"row":2},"filenameIndex":0,"start":{"column":2,"offset":50,"row":2}},"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","l":{"end":{"column":17,"offset":65,"row":2},"filenameIndex":0,"start":{"column":13,"offset":61,"row":2}},"t":"Str"},{"l":{"end":{"column":18,"offset":66,"row":2},"filenameIndex":0,"start":{"column":17,"offset":65,"row":2}},"t":"Space"},{"c":"2","l":{"end":{"column":19,"offset":67,"row":2},"filenameIndex":0,"start":{"column":1
8,"offset":66,"row":2}},"t":"Str"}],"l":{"end":{"column":22,"offset":70,"row":2},"filenameIndex":0,"start":{"column":13,"offset":61,"row":2}},"t":"Plain"}]]]]]]],[["",[],[]],[]]],"l":{"end":{"column":0,"offset":72,"row":3},"filenameIndex":0,"start":{"column":0,"offset":0,"row":0}},"t":"Table"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file +{"astContext":{"filenames":["tests/snapshots/json/table-caption-attr.qmd"],"sourceInfoPool":[{"d":0,"r":[75,4,2,80,4,7],"t":0},{"d":0,"r":[80,4,7,81,4,8],"t":0},{"d":0,"r":[81,4,8,88,4,15],"t":0},{"d":0,"r":[88,4,15,89,4,16],"t":0},{"d":0,"r":[72,3,0,115,5,0],"t":0},{"d":0,"r":[2,0,2,8,0,8],"t":0},{"d":0,"r":[8,0,8,9,0,9],"t":0},{"d":0,"r":[9,0,9,10,0,10],"t":0},{"d":0,"r":[2,0,2,11,0,11],"t":0},{"d":0,"r":[13,0,13,19,0,19],"t":0},{"d":0,"r":[19,0,19,20,0,20],"t":0},{"d":0,"r":[20,0,20,21,0,21],"t":0},{"d":0,"r":[13,0,13,22,0,22],"t":0},{"d":0,"r":[50,2,2,54,2,6],"t":0},{"d":0,"r":[54,2,6,55,2,7],"t":0},{"d":0,"r":[55,2,7,56,2,8],"t":0},{"d":0,"r":[50,2,2,59,2,11],"t":0},{"d":0,"r":[61,2,13,65,2,17],"t":0},{"d":0,"r":[65,2,17,66,2,18],"t":0},{"d":0,"r":[66,2,18,67,2,19],"t":0},{"d":0,"r":[61,2,13,70,2,22],"t":0},{"d":0,"r":[0,0,0,72,3,0],"t":0}]},"blocks":[{"c":[["",[],[["tbl-colwidths","[30,70]"]]],[null,[{"c":[{"c":"Table","s":0,"t":"Str"},{"s":1,"t":"Space"},{"c":"caption","s":2,"t":"Str"},{"s":3,"t":"Space"}],"s":4,"t":"Plain"}]],[[{"t":"AlignDefault"},{"t":"ColWidthDefault"}],[{"t":"AlignDefault"},{"t":"ColWidthDefault"}]],[["",[],[]],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","s":5,"t":"Str"},{"s":6,"t":"Space"},{"c":"1","s":7,"t":"Str"}],"s":8,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Column","s":9,"t":"Str"},{"s":10,"t":"Space"},{"c":"2","s":11,"t":"Str"}],"s":12,"t":"Plain"}]]]]]],[[["",[],[]],0,[],[[["",[],[]],[[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","s":13,"t":"Str"},{"s":14,"t":"Space"},{"c":"1","s":15,"t":"Str"}],"s":16,"t":"Plain"}]],[["",[],[]],{"t":"AlignDefault"},1,1,[{"c":[{"c":"Data","s":17,"t":"Str"},{"s":18,"t":"Space"},{"c":"2","s":19,"t":"Str"}],"s":20,"t":"Plain"}]]]]]]],[["",[],[]],[]]],"s":21,"t":"Table"}],"meta":{},"pandoc-api-version":[1,23,1]} \ No newline at end of file diff --git a/crates/quarto-markdown-pandoc/tests/test.rs b/crates/quarto-markdown-pandoc/tests/test.rs index 2a489d0..52f63fe 100644 --- a/crates/quarto-markdown-pandoc/tests/test.rs +++ b/crates/quarto-markdown-pandoc/tests/test.rs @@ -6,7 +6,7 @@ use glob::glob; use quarto_markdown_pandoc::errors::parse_is_good; use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; -use quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; +use quarto_markdown_pandoc::utils::diagnostic_collector::DiagnosticCollector; use quarto_markdown_pandoc::utils::output::VerboseOutput; use quarto_markdown_pandoc::{readers, writers}; use std::io::{self, Write}; @@ -23,7 +23,7 @@ fn unit_test_simple_qmd_parses() { .parse(input_bytes, None) .expect("Failed to parse input"); let mut buf = Vec::new(); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); writers::native::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -129,7 +129,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { } let mut buf1 = Vec::new(); let mut buf2 = Vec::new(); - let mut error_collector1 = TextErrorCollector::new(); + let mut error_collector1 = DiagnosticCollector::new(); writers::native::write( &treesitter_to_pandoc( 
&mut std::io::sink(), @@ -146,7 +146,7 @@ fn matches_pandoc_commonmark_reader(input: &str) -> bool { .unwrap(); let native_output = String::from_utf8(buf1).expect("Invalid UTF-8 in output"); let context_for_json = ASTContext::anonymous(); - let mut error_collector2 = TextErrorCollector::new(); + let mut error_collector2 = DiagnosticCollector::new(); writers::json::write( &treesitter_to_pandoc( &mut std::io::sink(), @@ -268,6 +268,12 @@ where let pattern = format!("tests/snapshots/{}/*.qmd", format); let mut file_count = 0; let mut failures = Vec::new(); + let mut updated_count = 0; + + // Check if we should update snapshots instead of comparing + let update_snapshots = std::env::var("UPDATE_SNAPSHOTS") + .map(|v| v == "1" || v.to_lowercase() == "true") + .unwrap_or(false); for entry in glob(&pattern).expect("Failed to read glob pattern") { match entry { @@ -290,19 +296,30 @@ where writer(&pandoc, &context, &mut buffer).unwrap(); let output = String::from_utf8(buffer).expect("Invalid UTF-8 in output"); - let snapshot = std::fs::read_to_string(&snapshot_path).unwrap_or_else(|_| { - panic!( - "Snapshot file {} does not exist, please create it", - snapshot_path.display() - ) - }); - - if output.trim() != snapshot.trim() { - failures.push(format!( - "Snapshot mismatch for file: {}\n Snapshot path: {}", - path.display(), - snapshot_path.display() - )); + + if update_snapshots { + // Update mode: write the output to the snapshot file + std::fs::write(&snapshot_path, &output).unwrap_or_else(|_| { + panic!("Failed to write snapshot file {}", snapshot_path.display()) + }); + eprintln!(" Updated snapshot: {}", snapshot_path.display()); + updated_count += 1; + } else { + // Normal mode: compare output with snapshot + let snapshot = std::fs::read_to_string(&snapshot_path).unwrap_or_else(|_| { + panic!( + "Snapshot file {} does not exist, please create it", + snapshot_path.display() + ) + }); + + if output.trim() != snapshot.trim() { + failures.push(format!( + "Snapshot mismatch for file: {}\n Snapshot path: {}", + path.display(), + snapshot_path.display() + )); + } } file_count += 1; } @@ -316,7 +333,12 @@ where format ); - if !failures.is_empty() { + if update_snapshots { + eprintln!( + "\n✓ Updated {} snapshot(s) for format '{}'", + updated_count, format + ); + } else if !failures.is_empty() { panic!( "\n\n{} snapshot(s) failed for format '{}':\n\n{}\n", failures.len(), @@ -328,8 +350,9 @@ where fn remove_location_fields(json: &mut serde_json::Value) { if let Some(obj) = json.as_object_mut() { - obj.remove("l"); // Remove the "l" field - obj.remove("astContext"); // Remove the astContext field + obj.remove("l"); // Remove the "l" field (old SourceInfo) + obj.remove("s"); // Remove the "s" field (new quarto_source_map::SourceInfo) + obj.remove("astContext"); // Remove the astContext field (includes metaTopLevelKeySources) for value in obj.values_mut() { remove_location_fields(value); } @@ -360,7 +383,7 @@ fn test_json_writer() { .parse(input_bytes, None) .expect("Failed to parse input"); let test_context = ASTContext::anonymous(); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, @@ -448,7 +471,7 @@ fn test_html_writer() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, @@ 
-556,7 +579,7 @@ fn test_do_not_smoke() { let tree = parser .parse(input_bytes, None) .expect("Failed to parse input"); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let _ = treesitter_to_pandoc( &mut std::io::sink(), &tree, diff --git a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs index 6d5a08a..51acb18 100644 --- a/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs +++ b/crates/quarto-markdown-pandoc/tests/test_inline_locations.rs @@ -4,10 +4,39 @@ */ use quarto_markdown_pandoc::pandoc::{ASTContext, treesitter_to_pandoc}; -use quarto_markdown_pandoc::utils::error_collector::TextErrorCollector; +use quarto_markdown_pandoc::utils::diagnostic_collector::DiagnosticCollector; use quarto_markdown_pandoc::writers; use tree_sitter_qmd::MarkdownParser; +/// Helper to resolve a source info reference from the pool (compact format) +/// Returns (start_offset, start_row, start_col, end_offset, end_row, end_col, type_code) +fn resolve_source_ref( + source_ref: &serde_json::Value, + pool: &[serde_json::Value], +) -> (usize, usize, usize, usize, usize, usize, usize) { + let ref_id = source_ref + .as_u64() + .expect("Expected source ref to be a number"); + let source_info = &pool[ref_id as usize]; + + let r = source_info["r"] + .as_array() + .expect("Expected r to be an array"); + let t = source_info["t"] + .as_u64() + .expect("Expected t to be a number") as usize; + + ( + r[0].as_u64().unwrap() as usize, // start_offset + r[1].as_u64().unwrap() as usize, // start_row + r[2].as_u64().unwrap() as usize, // start_col + r[3].as_u64().unwrap() as usize, // end_offset + r[4].as_u64().unwrap() as usize, // end_row + r[5].as_u64().unwrap() as usize, // end_col + t, + ) +} + #[test] fn test_inline_source_locations() { let input = "hello _world_."; @@ -18,7 +47,7 @@ fn test_inline_source_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, @@ -35,6 +64,11 @@ fn test_inline_source_locations() { let json_value: serde_json::Value = serde_json::from_str(&json_output).expect("Failed to parse JSON output"); + // Get the source info pool + let pool = json_value["astContext"]["sourceInfoPool"] + .as_array() + .expect("Expected sourceInfoPool to be an array"); + // Check that the source locations are correct for the inline nodes let blocks = json_value["blocks"].as_array().unwrap(); let para = &blocks[0]; @@ -44,50 +78,55 @@ fn test_inline_source_locations() { let hello_str = &inlines[0]; assert_eq!(hello_str["t"], "Str"); assert_eq!(hello_str["c"], "hello"); - let hello_loc = &hello_str["l"]; - assert_eq!(hello_loc["start"]["column"], 0); - assert_eq!(hello_loc["start"]["offset"], 0); - assert_eq!(hello_loc["end"]["column"], 5); - assert_eq!(hello_loc["end"]["offset"], 5); + let (start_off, start_row, start_col, end_off, end_row, end_col, _type) = + resolve_source_ref(&hello_str["s"], pool); + assert_eq!(start_col, 0); + assert_eq!(start_off, 0); + assert_eq!(end_col, 5); + assert_eq!(end_off, 5); // Second inline should be a Space let space = &inlines[1]; assert_eq!(space["t"], "Space"); - let space_loc = &space["l"]; - assert_eq!(space_loc["start"]["column"], 5); - assert_eq!(space_loc["start"]["offset"], 5); - assert_eq!(space_loc["end"]["column"], 6); - 
assert_eq!(space_loc["end"]["offset"], 6); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&space["s"], pool); + assert_eq!(start_col, 5); + assert_eq!(start_off, 5); + assert_eq!(end_col, 6); + assert_eq!(end_off, 6); // Third inline should be Emph containing "world" let emph = &inlines[2]; assert_eq!(emph["t"], "Emph"); - let emph_loc = &emph["l"]; - assert_eq!(emph_loc["start"]["column"], 6); - assert_eq!(emph_loc["start"]["offset"], 6); - assert_eq!(emph_loc["end"]["column"], 13); - assert_eq!(emph_loc["end"]["offset"], 13); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&emph["s"], pool); + assert_eq!(start_col, 6); + assert_eq!(start_off, 6); + assert_eq!(end_col, 13); + assert_eq!(end_off, 13); // Check the "world" string inside Emph let emph_content = emph["c"].as_array().unwrap(); let world_str = &emph_content[0]; assert_eq!(world_str["t"], "Str"); assert_eq!(world_str["c"], "world"); - let world_loc = &world_str["l"]; - assert_eq!(world_loc["start"]["column"], 7); - assert_eq!(world_loc["start"]["offset"], 7); - assert_eq!(world_loc["end"]["column"], 12); - assert_eq!(world_loc["end"]["offset"], 12); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&world_str["s"], pool); + assert_eq!(start_col, 7); + assert_eq!(start_off, 7); + assert_eq!(end_col, 12); + assert_eq!(end_off, 12); // Fourth inline should be "." let period = &inlines[3]; assert_eq!(period["t"], "Str"); assert_eq!(period["c"], "."); - let period_loc = &period["l"]; - assert_eq!(period_loc["start"]["column"], 13); - assert_eq!(period_loc["start"]["offset"], 13); - assert_eq!(period_loc["end"]["column"], 14); - assert_eq!(period_loc["end"]["offset"], 14); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&period["s"], pool); + assert_eq!(start_col, 13); + assert_eq!(start_off, 13); + assert_eq!(end_col, 14); + assert_eq!(end_off, 14); } #[test] @@ -102,7 +141,7 @@ fn test_merged_strings_preserve_location() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, @@ -119,6 +158,11 @@ fn test_merged_strings_preserve_location() { let json_value: serde_json::Value = serde_json::from_str(&json_output).expect("Failed to parse JSON output"); + // Get the source info pool + let pool = json_value["astContext"]["sourceInfoPool"] + .as_array() + .expect("Expected sourceInfoPool to be an array"); + let blocks = json_value["blocks"].as_array().unwrap(); let para = &blocks[0]; let inlines = para["c"].as_array().unwrap(); @@ -131,11 +175,12 @@ fn test_merged_strings_preserve_location() { let hello = &inlines[0]; assert_eq!(hello["t"], "Str"); assert_eq!(hello["c"], "hello"); - let hello_loc = &hello["l"]; - assert_eq!(hello_loc["start"]["column"], 0); - assert_eq!(hello_loc["start"]["offset"], 0); - assert_eq!(hello_loc["end"]["column"], 5); - assert_eq!(hello_loc["end"]["offset"], 5); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&hello["s"], pool); + assert_eq!(start_col, 0); + assert_eq!(start_off, 0); + assert_eq!(end_col, 5); + assert_eq!(end_off, 5); // Second should be Space let space = &inlines[1]; @@ -145,11 +190,12 @@ fn test_merged_strings_preserve_location() { let world = &inlines[2]; 
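// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): a hedged companion to the
// `resolve_source_ref` helper above, decoding the kind tag for debugging.
// The t-code meanings are inferred from the snapshot data; the function name
// and the sample pool index are illustrative only.
fn describe_pool_entry(pool: &[serde_json::Value], idx: usize) -> String {
    let entry = &pool[idx];
    match entry["t"].as_u64() {
        Some(0) => format!("Original, bytes {}..{}", entry["r"][0], entry["r"][3]),
        Some(1) => format!("Substring of entry {} at +{}", entry["d"][0], entry["d"][1]),
        Some(2) => format!(
            "Concat of {} pieces",
            entry["d"].as_array().map_or(0, |a| a.len())
        ),
        other => format!("unrecognized kind {:?}", other),
    }
}
// With 001.qmd.snapshot's pool, describe_pool_entry(pool, 11) would report a
// Concat of 2 pieces: entries 9 ("test") and 10 (".") merged into "test.".
// ---------------------------------------------------------------------------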
assert_eq!(world["t"], "Str"); assert_eq!(world["c"], "world"); - let world_loc = &world["l"]; - assert_eq!(world_loc["start"]["column"], 6); - assert_eq!(world_loc["start"]["offset"], 6); - assert_eq!(world_loc["end"]["column"], 11); - assert_eq!(world_loc["end"]["offset"], 11); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&world["s"], pool); + assert_eq!(start_col, 6); + assert_eq!(start_off, 6); + assert_eq!(end_col, 11); + assert_eq!(end_off, 11); } #[test] @@ -164,7 +210,7 @@ fn test_separate_strings_keep_separate_locations() { .expect("Failed to parse input"); let context = ASTContext::anonymous(); - let mut error_collector = TextErrorCollector::new(); + let mut error_collector = DiagnosticCollector::new(); let pandoc = treesitter_to_pandoc( &mut std::io::sink(), &tree, @@ -181,6 +227,11 @@ fn test_separate_strings_keep_separate_locations() { let json_value: serde_json::Value = serde_json::from_str(&json_output).expect("Failed to parse JSON output"); + // Get the source info pool + let pool = json_value["astContext"]["sourceInfoPool"] + .as_array() + .expect("Expected sourceInfoPool to be an array"); + let blocks = json_value["blocks"].as_array().unwrap(); let para = &blocks[0]; let inlines = para["c"].as_array().unwrap(); @@ -192,28 +243,205 @@ fn test_separate_strings_keep_separate_locations() { let a_str = &inlines[0]; assert_eq!(a_str["t"], "Str"); assert_eq!(a_str["c"], "a"); - let a_loc = &a_str["l"]; - assert_eq!(a_loc["start"]["column"], 0); - assert_eq!(a_loc["start"]["offset"], 0); - assert_eq!(a_loc["end"]["column"], 1); - assert_eq!(a_loc["end"]["offset"], 1); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&a_str["s"], pool); + assert_eq!(start_col, 0); + assert_eq!(start_off, 0); + assert_eq!(end_col, 1); + assert_eq!(end_off, 1); // Second inline should be Strong containing "b" let strong = &inlines[1]; assert_eq!(strong["t"], "Strong"); - let strong_loc = &strong["l"]; - assert_eq!(strong_loc["start"]["column"], 1); - assert_eq!(strong_loc["start"]["offset"], 1); - assert_eq!(strong_loc["end"]["column"], 6); - assert_eq!(strong_loc["end"]["offset"], 6); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&strong["s"], pool); + assert_eq!(start_col, 1); + assert_eq!(start_off, 1); + assert_eq!(end_col, 6); + assert_eq!(end_off, 6); // Third inline should be "c" let c_str = &inlines[2]; assert_eq!(c_str["t"], "Str"); assert_eq!(c_str["c"], "c"); - let c_loc = &c_str["l"]; - assert_eq!(c_loc["start"]["column"], 6); - assert_eq!(c_loc["start"]["offset"], 6); - assert_eq!(c_loc["end"]["column"], 7); - assert_eq!(c_loc["end"]["offset"], 7); + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&c_str["s"], pool); + assert_eq!(start_col, 6); + assert_eq!(start_off, 6); + assert_eq!(end_col, 7); + assert_eq!(end_off, 7); +} + +#[test] +fn test_note_source_location() { + // Test that inline notes have proper source location tracking + // including the synthetic Paragraph wrapper inside the Note + let input = "text^[note content]more"; + let mut parser = MarkdownParser::default(); + let input_bytes = input.as_bytes(); + let tree = parser + .parse(input_bytes, None) + .expect("Failed to parse input"); + + let context = ASTContext::anonymous(); + let mut error_collector = DiagnosticCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + 
&mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); + + let mut buf = Vec::new(); + writers::json::write(&pandoc, &context, &mut buf).unwrap(); + let json_output = String::from_utf8(buf).expect("Invalid UTF-8 in output"); + + let json_value: serde_json::Value = + serde_json::from_str(&json_output).expect("Failed to parse JSON output"); + + // Get the source info pool + let pool = json_value["astContext"]["sourceInfoPool"] + .as_array() + .expect("Expected sourceInfoPool to be an array"); + + let blocks = json_value["blocks"].as_array().unwrap(); + let para = &blocks[0]; + let inlines = para["c"].as_array().unwrap(); + + // Should have three elements: "text", Note, "more" + assert_eq!(inlines.len(), 3); + + // First inline should be "text" + let text_str = &inlines[0]; + assert_eq!(text_str["t"], "Str"); + assert_eq!(text_str["c"], "text"); + + // Second inline should be Note with proper source location + let note = &inlines[1]; + assert_eq!(note["t"], "Note"); + + // Check Note's source location spans the entire ^[note content] + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(¬e["s"], pool); + assert_eq!(start_col, 4); + assert_eq!(start_off, 4); + assert_eq!(end_col, 19); + assert_eq!(end_off, 19); + + // Check Note content - should be a single Block::Paragraph + let note_blocks = note["c"].as_array().unwrap(); + assert_eq!(note_blocks.len(), 1); + + let note_para = ¬e_blocks[0]; + assert_eq!(note_para["t"], "Para"); + + // CRITICAL: The Paragraph wrapper should have proper source location + // not SourceInfo::default() which would be FileId(0) with offset 0 + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(¬e_para["s"], pool); + + // The paragraph wrapper should have the same source location as the Note itself + // since it's a synthetic wrapper for the note's content + assert_eq!(start_col, 4); + assert_eq!(start_off, 4); + assert_eq!(end_col, 19); + assert_eq!(end_off, 19); + + // Check the content inside the paragraph + // The parser splits "note content" into three inlines: "note", Space, "content" + let note_para_inlines = note_para["c"].as_array().unwrap(); + assert_eq!(note_para_inlines.len(), 3); + + let note_str = ¬e_para_inlines[0]; + assert_eq!(note_str["t"], "Str"); + assert_eq!(note_str["c"], "note"); + + let space = ¬e_para_inlines[1]; + assert_eq!(space["t"], "Space"); + + let content_str = ¬e_para_inlines[2]; + assert_eq!(content_str["t"], "Str"); + assert_eq!(content_str["c"], "content"); + + // Third inline should be "more" + let more_str = &inlines[2]; + assert_eq!(more_str["t"], "Str"); + assert_eq!(more_str["c"], "more"); +} + +#[test] +fn test_note_reference_source_location() { + // Test that NoteReference nodes have proper source location tracking + // This is verified through the Span it gets converted to in postprocess + let input = r#"Some text [^note1]. 
+ +[^note1]: Note content here."#; + let mut parser = MarkdownParser::default(); + let input_bytes = input.as_bytes(); + let tree = parser + .parse(input_bytes, None) + .expect("Failed to parse input"); + + let context = ASTContext::anonymous(); + let mut error_collector = DiagnosticCollector::new(); + let pandoc = treesitter_to_pandoc( + &mut std::io::sink(), + &tree, + &input_bytes, + &context, + &mut error_collector, + ) + .expect("Failed to convert to Pandoc AST"); + + let mut buf = Vec::new(); + writers::json::write(&pandoc, &context, &mut buf).unwrap(); + let json_output = String::from_utf8(buf).expect("Invalid UTF-8 in output"); + + let json_value: serde_json::Value = + serde_json::from_str(&json_output).expect("Failed to parse JSON output"); + + // Get the source info pool + let pool = json_value["astContext"]["sourceInfoPool"] + .as_array() + .expect("Expected sourceInfoPool to be an array"); + + let blocks = json_value["blocks"].as_array().unwrap(); + let para = &blocks[0]; + let inlines = para["c"].as_array().unwrap(); + + // Should have six elements: "Some", Space, "text", Space, Span (converted from NoteReference), "." + assert_eq!(inlines.len(), 6); + + // The Span (converted from NoteReference) should be the 5th element (index 4) + let span = &inlines[4]; + assert_eq!(span["t"], "Span"); + + // Check that it has the quarto-note-reference class + let attr = &span["c"][0]; + let classes = attr[1].as_array().unwrap(); + assert!(classes.iter().any(|c| c == "quarto-note-reference")); + + // Check that the reference-id is correct + let kvs = attr[2].as_array().unwrap(); + assert_eq!(kvs.len(), 1); + assert_eq!(kvs[0][0], "reference-id"); + assert_eq!(kvs[0][1], "note1"); + + // CRITICAL: The Span should have proper source location from the NoteReference + // not SourceInfo::default() which would be FileId(0) with offset 0 + let (start_off, _start_row, start_col, end_off, _end_row, end_col, _t) = + resolve_source_ref(&span["s"], pool); + + // The [^note1] spans from column 10 to 18 (0-indexed) + assert_eq!(start_col, 10); + assert_eq!(start_off, 10); + assert_eq!(end_col, 18); + assert_eq!(end_off, 18); + + // Last inline should be "." 
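// ---------------------------------------------------------------------------
// Editor's sketch (not part of the patch): the JSON shape the assertions
// above probe, written out as a serde_json literal. The "s" pool index and
// the elided inline content are placeholders, not values from the real test.
fn expected_note_reference_span_shape() -> serde_json::Value {
    serde_json::json!({
        "t": "Span",
        "s": 0, // writer-assigned pool index; 0 is illustrative
        "c": [
            // Attr triple: [id, classes, key-value pairs]
            ["", ["quarto-note-reference"], [["reference-id", "note1"]]],
            [] // wrapped inline content, elided here
        ]
    })
}
// ---------------------------------------------------------------------------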
+ let period = &inlines[5]; + assert_eq!(period["t"], "Str"); + assert_eq!(period["c"], "."); } diff --git a/crates/quarto-markdown-pandoc/tests/test_json_roundtrip.rs b/crates/quarto-markdown-pandoc/tests/test_json_roundtrip.rs index 4aa960b..be55280 100644 --- a/crates/quarto-markdown-pandoc/tests/test_json_roundtrip.rs +++ b/crates/quarto-markdown-pandoc/tests/test_json_roundtrip.rs @@ -5,29 +5,29 @@ use hashlink::LinkedHashMap; use quarto_markdown_pandoc::pandoc::ast_context::ASTContext; -use quarto_markdown_pandoc::pandoc::location::SourceInfo; use quarto_markdown_pandoc::pandoc::{Block, Inline, Pandoc, Paragraph, Str}; use quarto_markdown_pandoc::readers; use quarto_markdown_pandoc::writers::json; +use quarto_source_map::{FileId, Location, Range, SourceInfo}; use std::collections::HashMap; #[test] fn test_json_roundtrip_simple_paragraph() { // Create a simple Pandoc document let original = Pandoc { - meta: LinkedHashMap::new(), + meta: quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo::default(), blocks: vec![Block::Paragraph(Paragraph { content: vec![Inline::Str(Str { text: "Hello, world!".to_string(), - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 13, row: 0, column: 13, @@ -35,15 +35,15 @@ fn test_json_roundtrip_simple_paragraph() { }, ), })], - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 13, row: 0, column: 13, @@ -87,28 +87,31 @@ fn test_json_roundtrip_simple_paragraph() { fn test_json_roundtrip_complex_document() { // Create a more complex document with multiple block types let original = Pandoc { - meta: { - let mut meta = LinkedHashMap::new(); - meta.insert( - "title".to_string(), - quarto_markdown_pandoc::pandoc::MetaValue::MetaString("Test Document".to_string()), - ); - meta + meta: quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo::MetaMap { + entries: vec![quarto_markdown_pandoc::pandoc::meta::MetaMapEntry { + key: "title".to_string(), + key_source: quarto_source_map::SourceInfo::default(), + value: quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo::MetaString { + value: "Test Document".to_string(), + source_info: quarto_source_map::SourceInfo::default(), + }, + }], + source_info: quarto_source_map::SourceInfo::default(), }, blocks: vec![ Block::Paragraph(Paragraph { content: vec![ Inline::Str(Str { text: "This is ".to_string(), - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 8, row: 0, column: 8, @@ -119,15 +122,15 @@ fn test_json_roundtrip_complex_document() { Inline::Strong(quarto_markdown_pandoc::pandoc::Strong { content: vec![Inline::Str(Str { text: "bold text".to_string(), - source_info: SourceInfo::new( - None, - 
quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 8, row: 0, column: 8, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 17, row: 0, column: 17, @@ -135,15 +138,15 @@ fn test_json_roundtrip_complex_document() { }, ), })], - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 8, row: 0, column: 8, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 17, row: 0, column: 17, @@ -153,15 +156,15 @@ fn test_json_roundtrip_complex_document() { }), Inline::Str(Str { text: ".".to_string(), - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 17, row: 0, column: 17, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 18, row: 0, column: 18, @@ -170,15 +173,15 @@ fn test_json_roundtrip_complex_document() { ), }), ], - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 20, row: 0, column: 20, @@ -189,15 +192,15 @@ fn test_json_roundtrip_complex_document() { Block::CodeBlock(quarto_markdown_pandoc::pandoc::CodeBlock { attr: ("".to_string(), vec![], HashMap::new()), text: "print('Hello, world!')".to_string(), - source_info: SourceInfo::new( - None, - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 21, row: 1, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 43, row: 1, column: 22, @@ -223,8 +226,10 @@ fn test_json_roundtrip_complex_document() { assert!(parsed.meta.contains_key("title")); match parsed.meta.get("title") { - Some(quarto_markdown_pandoc::pandoc::MetaValue::MetaString(title)) => { - assert_eq!(title, "Test Document"); + Some(quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo::MetaString { + value, .. 
+ }) => { + assert_eq!(value, "Test Document"); } _ => panic!("Expected MetaString for title"), } @@ -252,20 +257,20 @@ fn test_json_write_then_read_matches_original_structure() { // with the same basic structure, even if exact equality is not possible let original = Pandoc { - meta: LinkedHashMap::new(), + meta: quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo::default(), blocks: vec![ Block::Plain(quarto_markdown_pandoc::pandoc::Plain { content: vec![Inline::Str(Str { text: "Plain text".to_string(), - source_info: SourceInfo::new( - Some(0), // Index 0 will point to "test.md" in the context - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 10, row: 0, column: 10, @@ -273,15 +278,15 @@ fn test_json_write_then_read_matches_original_structure() { }, ), })], - source_info: SourceInfo::new( - Some(0), - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 0, row: 0, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 10, row: 0, column: 10, @@ -292,15 +297,15 @@ fn test_json_write_then_read_matches_original_structure() { Block::RawBlock(quarto_markdown_pandoc::pandoc::RawBlock { format: "html".to_string(), text: "
<div>Raw HTML</div>
".to_string(), - source_info: SourceInfo::new( - Some(0), - quarto_markdown_pandoc::pandoc::location::Range { - start: quarto_markdown_pandoc::pandoc::location::Location { + source_info: SourceInfo::original( + FileId(0), + Range { + start: Location { offset: 11, row: 1, column: 0, }, - end: quarto_markdown_pandoc::pandoc::location::Location { + end: Location { offset: 30, row: 1, column: 19, diff --git a/crates/quarto-markdown-pandoc/tests/test_meta.rs b/crates/quarto-markdown-pandoc/tests/test_meta.rs index 3c8039f..dfc22c5 100644 --- a/crates/quarto-markdown-pandoc/tests/test_meta.rs +++ b/crates/quarto-markdown-pandoc/tests/test_meta.rs @@ -5,9 +5,8 @@ use hashlink::LinkedHashMap; use quarto_markdown_pandoc::pandoc::location::{Location, Range, SourceInfo}; -use quarto_markdown_pandoc::pandoc::{ - Inline, MetaValue, RawBlock, parse_metadata_strings, rawblock_to_meta, -}; +use quarto_markdown_pandoc::pandoc::meta::{MetaValue, rawblock_to_meta}; +use quarto_markdown_pandoc::pandoc::{Inline, RawBlock, parse_metadata_strings}; use std::fs; #[test] @@ -28,7 +27,8 @@ fn test_metadata_parsing() { row: 0, column: 0, }, - }), + }) + .to_source_map_info(), }; let meta = rawblock_to_meta(block); @@ -77,7 +77,8 @@ fn test_yaml_tagged_strings() { row: 0, column: 0, }, - }), + }) + .to_source_map_info(), }; let mut meta = rawblock_to_meta(block); @@ -172,7 +173,8 @@ fn test_yaml_markdown_parse_failure() { row: 0, column: 0, }, - }), + }) + .to_source_map_info(), }; let mut meta = rawblock_to_meta(block); diff --git a/crates/quarto-markdown-pandoc/tests/test_metadata_source_tracking.rs b/crates/quarto-markdown-pandoc/tests/test_metadata_source_tracking.rs new file mode 100644 index 0000000..dcf680b --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/test_metadata_source_tracking.rs @@ -0,0 +1,253 @@ +/* + * test_metadata_source_tracking.rs + * Test that metadata source tracking is correct in PandocAST + */ + +use quarto_markdown_pandoc::pandoc::MetaValueWithSourceInfo; +use quarto_markdown_pandoc::readers; +use quarto_markdown_pandoc::writers; + +/// Helper to resolve a SourceInfo chain to absolute file offset +fn resolve_source_offset(source: &quarto_source_map::SourceInfo) -> usize { + match &source.mapping { + quarto_source_map::SourceMapping::Original { .. } => source.range.start.offset, + quarto_source_map::SourceMapping::Substring { offset, parent } => { + offset + resolve_source_offset(parent) + } + quarto_source_map::SourceMapping::Concat { .. } => { + // For concat, just use the start offset + source.range.start.offset + } + quarto_source_map::SourceMapping::Transformed { .. 
} => { + // For transformed, just use the start offset + source.range.start.offset + } + } +} + +#[test] +fn test_metadata_source_tracking_002_qmd() { + /* + * File: tests/snapshots/json/002.qmd + * Content: + * --- + * title: metadata1 + * --- + * + * ::: hello + * + * --- + * nested: meta + * --- + * + * ::: + * + * Byte offsets: + * - Line 0 (0-3): "---" + * - Line 1 (4-20): "title: metadata1" + * - "title" at offset 4-9 + * - ": " at offset 9-11 + * - "metadata1" at offset 11-20 + * - Line 2 (21-24): "---" + * - Line 7 (41-53): "nested: meta" + * - "nested" at offset 41-47 + * - ": " at offset 47-49 + * - "meta" at offset 49-53 + */ + + let test_file = "tests/snapshots/json/002.qmd"; + let content = std::fs::read_to_string(test_file).expect("Failed to read test file"); + + // Step 1: Read QMD to PandocAST + let mut output_stream = + quarto_markdown_pandoc::utils::output::VerboseOutput::Sink(std::io::sink()); + let (pandoc, context) = readers::qmd::read( + content.as_bytes(), + false, + test_file, + &mut output_stream, + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse QMD"); + + // Verify document-level metadata: title: metadata1 + if let MetaValueWithSourceInfo::MetaMap { ref entries, .. } = pandoc.meta { + let title_entry = entries + .iter() + .find(|e| e.key == "title") + .expect("Should have 'title' in metadata"); + + // Verify key source: "title" + let key_offset = resolve_source_offset(&title_entry.key_source); + // "title" starts at position 0 in the YAML string "title: metadata1\n" + // Absolute offset should be 4 (start of YAML frontmatter content) + assert_eq!(key_offset, 4, "Key 'title' should start at file offset 4"); + + // Verify value source: "metadata1" + match &title_entry.value { + MetaValueWithSourceInfo::MetaInlines { source_info, .. } => { + let value_offset = resolve_source_offset(source_info); + // "metadata1" starts at position 7 in the YAML string "title: metadata1\n" + // Absolute offset should be 4 + 7 = 11 + assert_eq!( + value_offset, 11, + "Value 'metadata1' should start at file offset 11" + ); + } + other => panic!("Expected MetaInlines for title value, got {:?}", other), + } + } else { + panic!("Expected MetaMap for pandoc.meta"); + } + + // NOTE: Lexical metadata (nested: meta) test skipped for now + // The lexical metadata in ::: blocks appears to be processed differently + // and might not produce BlockMetadata in the final AST. + // This would require further investigation of the filter chain. + + // Step 2: Write to JSON + let mut json_output = Vec::new(); + writers::json::write(&pandoc, &context, &mut json_output).expect("Failed to write JSON"); + + // Step 3: Read JSON back to PandocAST + let mut json_reader = std::io::Cursor::new(json_output); + let (pandoc_from_json, _context_from_json) = + readers::json::read(&mut json_reader).expect("Failed to read JSON"); + + // Step 4: Verify source info is preserved through JSON roundtrip + // Check document-level metadata + if let MetaValueWithSourceInfo::MetaMap { ref entries, .. 
} = pandoc_from_json.meta { + let title_entry = entries + .iter() + .find(|e| e.key == "title") + .expect("Should have 'title' in metadata after JSON roundtrip"); + + let key_offset = resolve_source_offset(&title_entry.key_source); + // Key tracking through JSON roundtrip + assert_eq!( + key_offset, 4, + "After JSON roundtrip: Key 'title' should still start at file offset 4" + ); + + if let MetaValueWithSourceInfo::MetaInlines { source_info, .. } = &title_entry.value { + let value_offset = resolve_source_offset(source_info); + assert_eq!( + value_offset, 11, + "After JSON roundtrip: Value 'metadata1' should still start at file offset 11" + ); + } + } + + // NOTE: Lexical metadata roundtrip test also skipped (see above) + + eprintln!("\n✅ SUCCESS!"); + eprintln!("✓ Document-level metadata source tracking verified:"); + eprintln!(" - Value 'metadata1' correctly tracked to file offset 11"); + eprintln!("✓ Source info preserved through JSON roundtrip:"); + eprintln!(" - Value source still points to offset 11 after round-trip"); +} + +#[test] +fn test_nested_metadata_key_source_preservation() { + // Test that when metadata values contain markdown that itself has YAML, + // the key_source information is preserved (not lost via LinkedHashMap) + // + // This test verifies the fix for the LinkedHashMap limitation where + // outer_metadata was using HashMap and losing key_source + + let input = r#"--- +title: Simple title +description: This is a description +---"#; + + let (pandoc, _context) = readers::qmd::read( + input.as_bytes(), + false, + "test.qmd", + &mut std::io::sink(), + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse"); + + // Extract metadata + let MetaValueWithSourceInfo::MetaMap { entries, .. 
} = pandoc.meta else { + panic!("Expected MetaMap"); + }; + + // Verify both entries have proper key_source tracking + let title_entry = entries + .iter() + .find(|e| e.key == "title") + .expect("Should have 'title' entry"); + + let desc_entry = entries + .iter() + .find(|e| e.key == "description") + .expect("Should have 'description' entry"); + + // CRITICAL: Verify keys have non-default source info + // Before the fix, when outer_metadata was LinkedHashMap, + // the key_source would be lost and default to offset 0 + + // Resolve the source info chain to get absolute file offsets + let title_offset = resolve_source_offset(&title_entry.key_source); + let desc_offset = resolve_source_offset(&desc_entry.key_source); + + eprintln!("\nTitle key resolved offset: {}", title_offset); + eprintln!("Description key resolved offset: {}", desc_offset); + + assert_ne!( + title_offset, 0, + "Title key should have non-zero offset (not SourceInfo::default())" + ); + + assert_ne!( + desc_offset, 0, + "Description key should have non-zero offset (not SourceInfo::default())" + ); + + // Verify keys are at EXACT expected locations in the YAML + // Input: "---\ntitle: Simple title\ndescription: This is a description\n---" + // 01234567890123456789012345678901234567890123456789012345678901234 + // 0 1 2 3 4 5 6 + // + // "---\n" = 4 bytes + // "title" starts at offset 4 + // "title: Simple title\n" = 20 bytes + // "description" starts at offset 24 + + assert_eq!( + title_offset, 4, + "Title key should be at exact offset 4, got {}", + title_offset + ); + + assert_eq!( + desc_offset, 24, + "Description key should be at exact offset 24, got {}", + desc_offset + ); + + eprintln!("\n✅ Metadata key_source preservation test passed!"); + eprintln!( + "✓ Title key has proper source tracking (offset {})", + title_offset + ); + eprintln!( + "✓ Description key has proper source tracking (offset {})", + desc_offset + ); + eprintln!("✓ LinkedHashMap fix working - key source information preserved!"); +} diff --git a/crates/quarto-markdown-pandoc/tests/test_nested_yaml_serialization.rs b/crates/quarto-markdown-pandoc/tests/test_nested_yaml_serialization.rs new file mode 100644 index 0000000..8834e94 --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/test_nested_yaml_serialization.rs @@ -0,0 +1,273 @@ +/* + * test_nested_yaml_serialization.rs + * Test to measure SourceInfo serialization size with deeply nested YAML + */ + +use quarto_markdown_pandoc::readers; +use quarto_markdown_pandoc::writers; + +/// Generate a .qmd file with nested YAML metadata of specified depth +fn generate_nested_yaml(depth: usize) -> String { + let mut yaml = String::from("---\n"); + + // Create nested structure: level1 -> level2 -> level3 -> ... 
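+ // For example (illustrative), generate_nested_yaml(2) produces the string: + // "---\nlevel1: \n level2: \n  value: \"deep\"\n---\n\nSome content.\n" + // (one space of indent per level, with a trailing space after each key's colon)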
+ for i in 0..depth { + yaml.push_str(&format!("{}level{}: \n", " ".repeat(i), i + 1)); + } + + // Add a value at the deepest level + yaml.push_str(&format!("{}value: \"deep\"\n", " ".repeat(depth))); + + yaml.push_str("---\n\nSome content.\n"); + yaml +} + +#[test] +fn test_yaml_serialization_size_scaling() { + println!("\n=== YAML Serialization Size Analysis ===\n"); + println!( + "{:<10} {:<15} {:<15} {:<10}", + "Depth", "QMD Size", "JSON Size", "Ratio" + ); + println!("{:-<50}", ""); + + for depth in [1, 2, 3, 5, 10, 15, 20] { + let qmd_content = generate_nested_yaml(depth); + let qmd_size = qmd_content.len(); + + // Parse QMD to PandocAST + let mut output_stream = + quarto_markdown_pandoc::utils::output::VerboseOutput::Sink(std::io::sink()); + let (pandoc, context) = readers::qmd::read( + qmd_content.as_bytes(), + false, + "test.qmd", + &mut output_stream, + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse QMD"); + + // Serialize to JSON + let mut json_output = Vec::new(); + writers::json::write(&pandoc, &context, &mut json_output).expect("Failed to write JSON"); + + let json_size = json_output.len(); + let ratio = json_size as f64 / qmd_size as f64; + + println!( + "{:<10} {:<15} {:<15} {:<10.2}x", + depth, qmd_size, json_size, ratio + ); + + // Verify roundtrip works + let mut json_reader = std::io::Cursor::new(json_output); + let (_pandoc_from_json, _context_from_json) = + readers::json::read(&mut json_reader).expect("Failed to read JSON"); + } + + println!("\n"); +} + +#[test] +fn test_yaml_serialization_with_siblings() { + println!("\n=== YAML Serialization with Sibling Nodes ===\n"); + println!( + "{:<10} {:<15} {:<15} {:<10}", + "Siblings", "QMD Size", "JSON Size", "Ratio" + ); + println!("{:-<50}", ""); + + for num_siblings in [1, 5, 10, 20, 50, 100] { + // Create YAML with many sibling nodes at depth 3 + let mut yaml = String::from("---\n"); + yaml.push_str("level1:\n"); + yaml.push_str(" level2:\n"); + + // Add multiple siblings at level 3 + for i in 0..num_siblings { + yaml.push_str(&format!(" item{}: \"value\"\n", i)); + } + + yaml.push_str("---\n\nSome content.\n"); + + let qmd_size = yaml.len(); + + // Parse and serialize + let mut output_stream = + quarto_markdown_pandoc::utils::output::VerboseOutput::Sink(std::io::sink()); + let (pandoc, context) = readers::qmd::read( + yaml.as_bytes(), + false, + "test.qmd", + &mut output_stream, + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse QMD"); + + let mut json_output = Vec::new(); + writers::json::write(&pandoc, &context, &mut json_output).expect("Failed to write JSON"); + + let json_size = json_output.len(); + let ratio = json_size as f64 / qmd_size as f64; + + println!( + "{:<10} {:<15} {:<15} {:<10.2}x", + num_siblings, qmd_size, json_size, ratio + ); + } + + println!("\n"); +} + +#[test] +fn test_analyze_json_structure() { + // Create a moderately nested structure to analyze + let yaml = r#"--- +level1: + level2: + level3: + item1: "value1" + item2: "value2" + item3: "value3" +--- + +Some content. 
+"#; + + let mut output_stream = + quarto_markdown_pandoc::utils::output::VerboseOutput::Sink(std::io::sink()); + let (pandoc, context) = readers::qmd::read( + yaml.as_bytes(), + false, + "test.qmd", + &mut output_stream, + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse QMD"); + + let mut json_output = Vec::new(); + writers::json::write(&pandoc, &context, &mut json_output).expect("Failed to write JSON"); + + let json_str = String::from_utf8(json_output.clone()).unwrap(); + + println!("\n=== JSON Structure Analysis ===\n"); + println!("Total JSON size: {} bytes", json_output.len()); + println!("QMD size: {} bytes", yaml.len()); + println!( + "Ratio: {:.2}x", + json_output.len() as f64 / yaml.len() as f64 + ); + + // Count occurrences of "Substring" (parent chain duplication indicator) + let substring_count = json_str.matches("\"Substring\"").count(); + println!("\nSubstring nodes in JSON: {}", substring_count); + + // Count occurrences of "Original" + let original_count = json_str.matches("\"Original\"").count(); + println!("Original nodes in JSON: {}", original_count); + + // Estimate duplication by counting "file_id" (appears in every Original node in chain) + let file_id_count = json_str.matches("\"file_id\"").count(); + println!( + "file_id occurrences: {} (indicates parent chain duplication)", + file_id_count + ); + + println!("\n"); +} + +/// Generate a complete binary tree of YAML metadata at specified depth +fn generate_binary_tree_yaml(depth: usize) -> String { + fn generate_tree(current_depth: usize, max_depth: usize, indent: usize) -> String { + if current_depth >= max_depth { + // Leaf node + return format!("{}leaf\n", " ".repeat(indent)); + } + + // Internal node with left and right children + let mut result = String::new(); + result.push_str(&format!("{}\n", " ".repeat(indent))); + result.push_str(&format!("{}left: ", " ".repeat(indent))); + result.push_str(&generate_tree(current_depth + 1, max_depth, indent + 1)); + result.push_str(&format!("{}right: ", " ".repeat(indent))); + result.push_str(&generate_tree(current_depth + 1, max_depth, indent + 1)); + result + } + + let mut yaml = String::from("---\n"); + yaml.push_str("data: "); + yaml.push_str(&generate_tree(0, depth, 1)); + yaml.push_str("---\n\nSome content.\n"); + yaml +} + +#[test] +fn test_binary_tree_serialization() { + println!("\n=== Binary Tree YAML Serialization ===\n"); + println!( + "{:<10} {:<12} {:<15} {:<15} {:<10}", + "Depth", "Nodes", "QMD Size", "JSON Size", "Ratio" + ); + println!("{:-<62}", ""); + + for depth in 1..=6 { + let qmd_content = generate_binary_tree_yaml(depth); + let qmd_size = qmd_content.len(); + let num_nodes = (1 << depth) - 1; // 2^depth - 1 + + // Parse QMD to PandocAST + let mut output_stream = + quarto_markdown_pandoc::utils::output::VerboseOutput::Sink(std::io::sink()); + let (pandoc, context) = readers::qmd::read( + qmd_content.as_bytes(), + false, + "test.qmd", + &mut output_stream, + None::< + fn( + &[u8], + &quarto_markdown_pandoc::utils::tree_sitter_log_observer::TreeSitterLogObserver, + &str, + ) -> Vec, + >, + ) + .expect("Failed to parse QMD"); + + // Serialize to JSON + let mut json_output = Vec::new(); + writers::json::write(&pandoc, &context, &mut json_output).expect("Failed to write JSON"); + + let json_size = json_output.len(); + let ratio = json_size as f64 / qmd_size as f64; + + println!( + "{:<10} {:<12} {:<15} {:<15} {:<10.2}x", + depth, num_nodes, 
qmd_size, json_size, ratio + ); + + // Verify roundtrip works + let mut json_reader = std::io::Cursor::new(json_output); + let (_pandoc_from_json, _context_from_json) = + readers::json::read(&mut json_reader).expect("Failed to read JSON"); + } + + println!("\n"); +} diff --git a/crates/quarto-markdown-pandoc/tests/test_yaml_tag_regression.rs b/crates/quarto-markdown-pandoc/tests/test_yaml_tag_regression.rs new file mode 100644 index 0000000..06d501a --- /dev/null +++ b/crates/quarto-markdown-pandoc/tests/test_yaml_tag_regression.rs @@ -0,0 +1,114 @@ +/* + * test_yaml_tag_regression.rs + * Copyright (c) 2025 Posit, PBC + * + * Tests for k-62: YAML tag information lost in new API + */ + +use quarto_markdown_pandoc::pandoc::ast_context::ASTContext; +use quarto_markdown_pandoc::pandoc::location::{Location, Range, SourceInfo}; +use quarto_markdown_pandoc::pandoc::meta::{ + MetaValueWithSourceInfo, parse_metadata_strings_with_source_info, + rawblock_to_meta_with_source_info, +}; +use quarto_markdown_pandoc::pandoc::{Inline, RawBlock}; + +#[test] +fn test_yaml_tags_preserved_in_new_api() { + // Test YAML with tagged strings + let yaml_content = r#"--- +tagged_path: !path images/*.png +tagged_glob: !glob posts/*/index.qmd +tagged_str: !str _foo_.py +regular: This has *emphasis* +---"#; + + let block = RawBlock { + format: "quarto_minus_metadata".to_string(), + text: yaml_content.to_string(), + source_info: SourceInfo::with_range(Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }) + .to_source_map_info(), + }; + + let context = ASTContext::default(); + let meta = rawblock_to_meta_with_source_info(&block, &context); + + let mut outer_meta = Vec::new(); + let parsed_meta = parse_metadata_strings_with_source_info(meta, &mut outer_meta); + + // Extract entries + let entries = if let MetaValueWithSourceInfo::MetaMap { entries, .. } = parsed_meta { + entries + } else { + panic!("Expected MetaMap"); + }; + + // Check tagged_path - should be MetaInlines with Span wrapper + let tagged_path_entry = entries + .iter() + .find(|e| e.key == "tagged_path") + .expect("tagged_path not found"); + + if let MetaValueWithSourceInfo::MetaInlines { + content: inlines, .. + } = &tagged_path_entry.value + { + assert_eq!(inlines.len(), 1, "Expected exactly one inline"); + if let Inline::Span(span) = &inlines[0] { + // Should have yaml-tagged-string class + assert!( + span.attr.1.contains(&"yaml-tagged-string".to_string()), + "Expected yaml-tagged-string class, found: {:?}", + span.attr.1 + ); + // Should have tag attribute + assert_eq!( + span.attr.2.get("tag"), + Some(&"path".to_string()), + "Expected tag=path attribute" + ); + // Extract the string content + if let Inline::Str(s) = &span.content[0] { + assert_eq!(s.text, "images/*.png"); + } else { + panic!("Expected Str inline inside Span"); + } + } else { + panic!("Expected Span inline, got: {:?}", inlines[0]); + } + } else { + panic!( + "Expected MetaInlines for tagged_path, got: {:?}", + tagged_path_entry.value + ); + } + + // Check regular - should parse markdown normally (Emph element) + let regular_entry = entries + .iter() + .find(|e| e.key == "regular") + .expect("regular not found"); + + if let MetaValueWithSourceInfo::MetaInlines { + content: inlines, .. 
+ } = &regular_entry.value + { + let has_emph = inlines + .iter() + .any(|inline| matches!(inline, Inline::Emph(_))); + assert!(has_emph, "regular should have Emph element from *emphasis*"); + } else { + panic!("Expected MetaInlines for regular"); + } +} diff --git a/crates/quarto-source-map/Cargo.toml b/crates/quarto-source-map/Cargo.toml new file mode 100644 index 0000000..fc91fc6 --- /dev/null +++ b/crates/quarto-source-map/Cargo.toml @@ -0,0 +1,13 @@ +[package] +name = "quarto-source-map" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true + +[dependencies] +serde = { workspace = true, features = ["derive", "rc"] } + +[dev-dependencies] +serde_json.workspace = true diff --git a/crates/quarto-source-map/src/context.rs b/crates/quarto-source-map/src/context.rs new file mode 100644 index 0000000..5d0ded6 --- /dev/null +++ b/crates/quarto-source-map/src/context.rs @@ -0,0 +1,174 @@ +//! Source context for managing files + +use crate::file_info::FileInformation; +use crate::types::FileId; +use serde::{Deserialize, Serialize}; + +/// Context for managing source files +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SourceContext { + files: Vec<SourceFile>, +} + +/// A source file with content and metadata +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SourceFile { + /// File path or identifier + pub path: String, + /// File information for efficient location lookups (optional for serialization) + #[serde(skip_serializing_if = "Option::is_none")] + pub file_info: Option<FileInformation>, + /// File metadata + pub metadata: FileMetadata, +} + +/// Metadata about a source file +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct FileMetadata { + /// File type (qmd, yaml, md, etc.)
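+ /// For example (illustrative): `Some("qmd".to_string())`; note that `add_file` currently initializes this to `None`.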
+ pub file_type: Option<String>, +} + +impl SourceContext { + /// Create a new empty source context + pub fn new() -> Self { + SourceContext { files: Vec::new() } + } + + /// Add a file to the context and return its ID + pub fn add_file(&mut self, path: String, content: Option<String>) -> FileId { + let id = FileId(self.files.len()); + let file_info = content.as_ref().map(|c| FileInformation::new(c)); + self.files.push(SourceFile { + path, + file_info, + metadata: FileMetadata { file_type: None }, + }); + id + } + + /// Get a file by ID + pub fn get_file(&self, id: FileId) -> Option<&SourceFile> { + self.files.get(id.0) + } + + /// Create a copy without file information (for serialization) + pub fn without_content(&self) -> Self { + SourceContext { + files: self + .files + .iter() + .map(|f| SourceFile { + path: f.path.clone(), + file_info: None, + metadata: f.metadata.clone(), + }) + .collect(), + } + } +} + +impl Default for SourceContext { + fn default() -> Self { + Self::new() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty_context() { + let ctx = SourceContext::new(); + assert!(ctx.get_file(FileId(0)).is_none()); + } + + #[test] + fn test_add_and_get_file() { + let mut ctx = SourceContext::new(); + let id = ctx.add_file("test.qmd".to_string(), Some("# Hello".to_string())); + + assert_eq!(id, FileId(0)); + let file = ctx.get_file(id).unwrap(); + assert_eq!(file.path, "test.qmd"); + assert!(file.file_info.is_some()); + + // Verify the file info was built correctly + let info = file.file_info.as_ref().unwrap(); + assert_eq!(info.total_length(), 7); + } + + #[test] + fn test_multiple_files() { + let mut ctx = SourceContext::new(); + let id1 = ctx.add_file("first.qmd".to_string(), Some("First".to_string())); + let id2 = ctx.add_file("second.qmd".to_string(), Some("Second".to_string())); + + assert_eq!(id1, FileId(0)); + assert_eq!(id2, FileId(1)); + + let file1 = ctx.get_file(id1).unwrap(); + let file2 = ctx.get_file(id2).unwrap(); + + assert_eq!(file1.path, "first.qmd"); + assert_eq!(file2.path, "second.qmd"); + assert!(file1.file_info.is_some()); + assert!(file2.file_info.is_some()); + assert_eq!(file1.file_info.as_ref().unwrap().total_length(), 5); + assert_eq!(file2.file_info.as_ref().unwrap().total_length(), 6); + } + + #[test] + fn test_file_without_content() { + let mut ctx = SourceContext::new(); + let id = ctx.add_file("no-content.qmd".to_string(), None); + + let file = ctx.get_file(id).unwrap(); + assert_eq!(file.path, "no-content.qmd"); + assert!(file.file_info.is_none()); + } + + #[test] + fn test_without_content() { + let mut ctx = SourceContext::new(); + ctx.add_file("test1.qmd".to_string(), Some("Content 1".to_string())); + ctx.add_file("test2.qmd".to_string(), Some("Content 2".to_string())); + + let ctx_no_content = ctx.without_content(); + + let file1 = ctx_no_content.get_file(FileId(0)).unwrap(); + let file2 = ctx_no_content.get_file(FileId(1)).unwrap(); + + assert_eq!(file1.path, "test1.qmd"); + assert_eq!(file2.path, "test2.qmd"); + assert!(file1.file_info.is_none()); + assert!(file2.file_info.is_none()); + } + + #[test] + fn test_serialization() { + let mut ctx = SourceContext::new(); + ctx.add_file("test.qmd".to_string(), Some("# Test".to_string())); + + let json = serde_json::to_string(&ctx).unwrap(); + let deserialized: SourceContext = serde_json::from_str(&json).unwrap(); + + let file = deserialized.get_file(FileId(0)).unwrap(); + assert_eq!(file.path, "test.qmd"); + assert!(file.file_info.is_some()); +
assert_eq!(file.file_info.as_ref().unwrap().total_length(), 6); + } + + #[test] + fn test_serialization_without_content() { + let mut ctx = SourceContext::new(); + ctx.add_file("test.qmd".to_string(), Some("# Test".to_string())); + + let ctx_no_content = ctx.without_content(); + let json = serde_json::to_string(&ctx_no_content).unwrap(); + + // Verify that None file_info is skipped in serialization + assert!(!json.contains("\"file_info\"")); + } +} diff --git a/crates/quarto-source-map/src/file_info.rs b/crates/quarto-source-map/src/file_info.rs new file mode 100644 index 0000000..e890ce2 --- /dev/null +++ b/crates/quarto-source-map/src/file_info.rs @@ -0,0 +1,254 @@ +//! Efficient file information for location lookups + +use crate::types::Location; +use serde::{Deserialize, Serialize}; + +/// Efficient file content analysis for location lookups +/// +/// This struct stores metadata about a file that enables fast conversion +/// from byte offsets to (row, column) positions without storing the full +/// file content. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct FileInformation { + /// Byte offsets of each newline character in the file + line_breaks: Vec<usize>, + + /// Total length of the file in bytes + total_length: usize, +} + +impl FileInformation { + /// Create file information by analyzing content + /// + /// Scans the content once to build an index of line break positions. + /// This enables O(log n) offset-to-location lookups via binary search. + /// + /// # Example + /// + /// ``` + /// use quarto_source_map::FileInformation; + /// + /// let info = FileInformation::new("line 1\nline 2\nline 3"); + /// ``` + pub fn new(content: &str) -> Self { + let line_breaks: Vec<usize> = content + .char_indices() + .filter_map(|(idx, ch)| if ch == '\n' { Some(idx) } else { None }) + .collect(); + + FileInformation { + line_breaks, + total_length: content.len(), + } + } + + /// Convert a byte offset to a Location with row and column + /// + /// Uses binary search to find which line contains the offset. + /// Runs in O(log n) time where n is the number of lines. + /// + /// Returns None if the offset is out of bounds. + /// + /// # Example + /// + /// ``` + /// use quarto_source_map::FileInformation; + /// + /// let info = FileInformation::new("hello\nworld"); + /// let loc = info.offset_to_location(6).unwrap(); + /// assert_eq!(loc.row, 1); + /// assert_eq!(loc.column, 0); + /// ``` + pub fn offset_to_location(&self, offset: usize) -> Option<Location> { + if offset > self.total_length { + return None; + } + + // Binary search to find which line the offset is on + // line_breaks[i] is the position of the i-th newline (0-indexed) + // So line 0 contains [0, line_breaks[0]) + // Line 1 contains [line_breaks[0]+1, line_breaks[1]) + // etc.
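+ // Worked example (illustrative): for "ab\ncd\ne", line_breaks = [2, 5]. + // Looking up offset 4 ('d'): binary_search(&4) returns Err(1), so row = 1, + // line_start = line_breaks[0] + 1 = 3, and column = 4 - 3 = 1.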
+ + let row = match self.line_breaks.binary_search(&offset) { + // Offset is exactly at a newline character + // That newline belongs to the line it terminates, not the next line + Ok(idx) => idx, + // Offset is between line breaks (or before the first, or after the last) + Err(idx) => idx, + }; + + // Column is distance from the start of this line + let line_start = if row == 0 { + 0 + } else { + self.line_breaks[row - 1] + 1 // +1 to skip past the '\n' + }; + + let column = offset - line_start; + + Some(Location { + offset, + row, + column, + }) + } + + /// Get the total length of the file in bytes + pub fn total_length(&self) -> usize { + self.total_length + } + + /// Get the number of lines in the file + pub fn line_count(&self) -> usize { + // If there are no newlines, there's 1 line + // If there are n newlines, there are n+1 lines + self.line_breaks.len() + 1 + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_empty_file() { + let info = FileInformation::new(""); + assert_eq!(info.total_length(), 0); + assert_eq!(info.line_count(), 1); + + let loc = info.offset_to_location(0).unwrap(); + assert_eq!(loc.offset, 0); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 0); + } + + #[test] + fn test_single_line() { + let info = FileInformation::new("hello world"); + assert_eq!(info.total_length(), 11); + assert_eq!(info.line_count(), 1); + + // Start of line + let loc = info.offset_to_location(0).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 0); + + // Middle of line + let loc = info.offset_to_location(6).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 6); + + // End of line + let loc = info.offset_to_location(11).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 11); + } + + #[test] + fn test_multiple_lines() { + let content = "line 1\nline 2\nline 3"; + let info = FileInformation::new(content); + assert_eq!(info.line_count(), 3); + + // First line + let loc = info.offset_to_location(0).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 0); + + // At first newline (offset 6 is '\n') + let loc = info.offset_to_location(6).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 6); + + // Start of second line (offset 7 is 'l' in "line 2") + let loc = info.offset_to_location(7).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 0); + + // At second newline (offset 13 is '\n') + let loc = info.offset_to_location(13).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 6); + + // Start of third line (offset 14 is 'l' in "line 3") + let loc = info.offset_to_location(14).unwrap(); + assert_eq!(loc.row, 2); + assert_eq!(loc.column, 0); + + // End of file + let loc = info.offset_to_location(20).unwrap(); + assert_eq!(loc.row, 2); + assert_eq!(loc.column, 6); + } + + #[test] + fn test_out_of_bounds() { + let info = FileInformation::new("hello"); + assert!(info.offset_to_location(100).is_none()); + } + + #[test] + fn test_unicode_content() { + // "café" - 'é' is 2 bytes in UTF-8 + let content = "café\nwörld"; // 4 chars + 1 newline + 5 chars = 10 chars, but 12 bytes in UTF-8 + let info = FileInformation::new(content); + + // Verify we're working with byte offsets, not character offsets + // "café" is 5 bytes: c(1) a(1) f(1) é(2) + // newline is 1 byte + // So second line starts at byte offset 6 + let loc = info.offset_to_location(6).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 0); + } + + #[test] + fn test_file_ending_with_newline() { + let content = "line 1\nline 2\n"; + let info =
FileInformation::new(content); + assert_eq!(info.line_count(), 3); // Empty third line + + // The final newline + let loc = info.offset_to_location(13).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 6); + + // After the final newline (empty line 3) + let loc = info.offset_to_location(14).unwrap(); + assert_eq!(loc.row, 2); + assert_eq!(loc.column, 0); + } + + #[test] + fn test_consecutive_newlines() { + let content = "a\n\n\nb"; + let info = FileInformation::new(content); + assert_eq!(info.line_count(), 4); + + // First line + let loc = info.offset_to_location(0).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 0); + + // First newline (offset 1) + let loc = info.offset_to_location(1).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 1); + + // Empty second line (offset 2) + let loc = info.offset_to_location(2).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 0); + + // Empty third line (offset 3) + let loc = info.offset_to_location(3).unwrap(); + assert_eq!(loc.row, 2); + assert_eq!(loc.column, 0); + + // Fourth line 'b' (offset 4) + let loc = info.offset_to_location(4).unwrap(); + assert_eq!(loc.row, 3); + assert_eq!(loc.column, 0); + } +} diff --git a/crates/quarto-source-map/src/lib.rs b/crates/quarto-source-map/src/lib.rs new file mode 100644 index 0000000..ec1aee6 --- /dev/null +++ b/crates/quarto-source-map/src/lib.rs @@ -0,0 +1,48 @@ +//! Source mapping for Quarto +//! +//! This crate provides unified source location tracking with support for +//! transformations (extraction, concatenation, normalization). It enables +//! precise error reporting and mapping positions back through transformation +//! chains to original source files. +//! +//! # Overview +//! +//! The core types are: +//! - [`SourceInfo`]: Tracks a location with its transformation history +//! - [`SourceMapping`]: Enum describing how content was transformed +//! - [`SourceContext`]: Manages files and provides content for mapping +//! +//! # Example +//! +//! ```rust +//! use quarto_source_map::*; +//! +//! // Create a context and register a file +//! let mut ctx = SourceContext::new(); +//! let file_id = ctx.add_file("main.qmd".into(), Some("# Hello\nWorld".into())); +//! +//! // Create a source location +//! let range = Range { +//! start: Location { offset: 0, row: 0, column: 0 }, +//! end: Location { offset: 7, row: 0, column: 7 }, +//! }; +//! let info = SourceInfo::original(file_id, range.clone()); +//! +//! // Verify the source info was created correctly +//! assert_eq!(info.range, range); +//! ``` + +pub mod context; +pub mod file_info; +pub mod mapping; +pub mod source_info; +pub mod types; +pub mod utils; + +// Re-export main types +pub use context::{FileMetadata, SourceContext, SourceFile}; +pub use file_info::FileInformation; +pub use mapping::MappedLocation; +pub use source_info::{RangeMapping, SourceInfo, SourceMapping, SourcePiece}; +pub use types::{FileId, Location, Range}; +pub use utils::{line_col_to_offset, offset_to_location, range_from_offsets}; diff --git a/crates/quarto-source-map/src/mapping.rs b/crates/quarto-source-map/src/mapping.rs new file mode 100644 index 0000000..a8a9376 --- /dev/null +++ b/crates/quarto-source-map/src/mapping.rs @@ -0,0 +1,284 @@ +//! 
Position mapping through transformation chains + +use crate::types::{FileId, Location}; +use crate::{SourceContext, SourceInfo}; + +/// Result of mapping a position back to an original file +#[derive(Debug, Clone, PartialEq)] +pub struct MappedLocation { + /// The original file + pub file_id: FileId, + /// Location in the original file + pub location: Location, +} + +impl SourceInfo { + /// Map an offset in the current text back to original source + pub fn map_offset(&self, offset: usize, ctx: &SourceContext) -> Option<MappedLocation> { + use crate::source_info::SourceMapping; + + match &self.mapping { + SourceMapping::Original { file_id } => { + // Direct mapping to original file + let file = ctx.get_file(*file_id)?; + let file_info = file.file_info.as_ref()?; + + // Convert offset to Location with row/column using efficient binary search + let location = file_info.offset_to_location(offset)?; + + Some(MappedLocation { + file_id: *file_id, + location, + }) + } + SourceMapping::Substring { + parent, + offset: parent_offset, + } => { + // Map to parent coordinates and recurse + let parent_offset_mapped = parent_offset + offset; + parent.map_offset(parent_offset_mapped, ctx) + } + SourceMapping::Concat { pieces } => { + // Find which piece contains this offset + for piece in pieces { + let piece_start = piece.offset_in_concat; + let piece_end = piece_start + piece.length; + + if offset >= piece_start && offset < piece_end { + // Offset is within this piece + let offset_in_piece = offset - piece_start; + return piece.source_info.map_offset(offset_in_piece, ctx); + } + } + None // Offset not found in any piece + } + SourceMapping::Transformed { parent, mapping } => { + // Find the mapping that contains this offset + for range_mapping in mapping { + if offset >= range_mapping.from_start && offset < range_mapping.from_end { + // Map to parent coordinates + let offset_in_range = offset - range_mapping.from_start; + let parent_offset = range_mapping.to_start + offset_in_range; + return parent.map_offset(parent_offset, ctx); + } + } + None // Offset not found in any mapping + } + } + } + + /// Map a range in the current text back to original source + pub fn map_range( + &self, + start: usize, + end: usize, + ctx: &SourceContext, + ) -> Option<(MappedLocation, MappedLocation)> { + let start_mapped = self.map_offset(start, ctx)?; + let end_mapped = self.map_offset(end, ctx)?; + Some((start_mapped, end_mapped)) + } +} + +#[cfg(test)] +mod tests { + use crate::types::{Location, Range}; + use crate::{SourceContext, SourceInfo}; + + #[test] + fn test_map_offset_original() { + let mut ctx = SourceContext::new(); + let file_id = ctx.add_file("test.qmd".to_string(), Some("hello\nworld".to_string())); + + let info = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 11, + row: 1, + column: 5, + }, + }, + ); + + // Test mapping offset 0 (start of first line) + let mapped = info.map_offset(0, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 0); + assert_eq!(mapped.location.row, 0); + assert_eq!(mapped.location.column, 0); + + // Test mapping offset 6 (start of second line) + let mapped = info.map_offset(6, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 6); + assert_eq!(mapped.location.row, 1); + assert_eq!(mapped.location.column, 0); + } + + #[test] + fn test_map_offset_substring() { + let mut ctx = SourceContext::new(); + let file_id =
ctx.add_file("test.qmd".to_string(), Some("0123456789".to_string())); + + let original = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + ); + + // Extract substring from offset 3 to 7 ("3456") + let substring = SourceInfo::substring(original, 3, 7); + + // Map offset 0 in substring (should be '3' at offset 3 in original) + let mapped = substring.map_offset(0, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 3); + + // Map offset 2 in substring (should be '5' at offset 5 in original) + let mapped = substring.map_offset(2, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 5); + } + + #[test] + fn test_map_offset_concat() { + let mut ctx = SourceContext::new(); + let file_id1 = ctx.add_file("first.qmd".to_string(), Some("AAA".to_string())); + let file_id2 = ctx.add_file("second.qmd".to_string(), Some("BBB".to_string())); + + let info1 = SourceInfo::original( + file_id1, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 3, + row: 0, + column: 3, + }, + }, + ); + + let info2 = SourceInfo::original( + file_id2, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 3, + row: 0, + column: 3, + }, + }, + ); + + // Concatenate: "AAABBB" + let concat = SourceInfo::concat(vec![(info1, 3), (info2, 3)]); + + // Map offset 1 (should be in first piece, second 'A') + let mapped = concat.map_offset(1, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id1); + assert_eq!(mapped.location.offset, 1); + + // Map offset 4 (should be in second piece, second 'B') + let mapped = concat.map_offset(4, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id2); + assert_eq!(mapped.location.offset, 1); + } + + #[test] + fn test_map_offset_transformed() { + let mut ctx = SourceContext::new(); + let file_id = ctx.add_file("test.qmd".to_string(), Some("0123456789".to_string())); + + let original = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + ); + + // Transform: map [0,3) to [5,8), skip everything else + use crate::source_info::RangeMapping; + let transformed = SourceInfo::transformed( + original, + vec![RangeMapping { + from_start: 0, + from_end: 3, + to_start: 5, + to_end: 8, + }], + ); + + // Map offset 0 (should map to original offset 5, which is '5') + let mapped = transformed.map_offset(0, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 5); + + // Map offset 2 (should map to original offset 7, which is '7') + let mapped = transformed.map_offset(2, &ctx).unwrap(); + assert_eq!(mapped.file_id, file_id); + assert_eq!(mapped.location.offset, 7); + } + + #[test] + fn test_map_range() { + let mut ctx = SourceContext::new(); + let file_id = ctx.add_file("test.qmd".to_string(), Some("hello\nworld".to_string())); + + let info = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 11, + row: 1, + column: 5, + }, + }, + ); + + // Map range [0, 5) which is "hello" + let (start, end) = info.map_range(0, 5, &ctx).unwrap(); + assert_eq!(start.file_id, file_id); + assert_eq!(start.location.offset, 0); + assert_eq!(end.file_id, file_id); + assert_eq!(end.location.offset, 5); + } +} diff --git 
a/crates/quarto-source-map/src/source_info.rs b/crates/quarto-source-map/src/source_info.rs new file mode 100644 index 0000000..2ff33bc --- /dev/null +++ b/crates/quarto-source-map/src/source_info.rs @@ -0,0 +1,868 @@ +//! Source information with transformation tracking + +use crate::types::{FileId, Location, Range}; +use serde::{Deserialize, Serialize}; +use std::rc::Rc; + +/// Source information tracking a location and its transformation history +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SourceInfo { + /// The range in the immediate/current text + pub range: Range, + /// How this range maps to its source + pub mapping: SourceMapping, +} + +/// Describes how source content was transformed +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum SourceMapping { + /// Direct position in an original file + Original { file_id: FileId }, + /// Substring extraction from a parent source + Substring { + parent: Rc<SourceInfo>, + offset: usize, + }, + /// Concatenation of multiple sources + Concat { pieces: Vec<SourcePiece> }, + /// Transformed text with piecewise mapping + Transformed { + parent: Rc<SourceInfo>, + mapping: Vec<RangeMapping>, + }, +} + +/// A piece of a concatenated source +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SourcePiece { + /// Source information for this piece + pub source_info: SourceInfo, + /// Where this piece starts in the concatenated string + pub offset_in_concat: usize, + /// Length of this piece + pub length: usize, +} + +/// Maps a range in transformed text to parent text +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct RangeMapping { + /// Start offset in transformed text + pub from_start: usize, + /// End offset in transformed text + pub from_end: usize, + /// Start offset in parent text + pub to_start: usize, + /// End offset in parent text + pub to_end: usize, +} + +impl Default for SourceInfo { + fn default() -> Self { + SourceInfo::original( + FileId(0), + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 0, + row: 0, + column: 0, + }, + }, + ) + } +} + +impl SourceInfo { + /// Create source info for a position in an original file + pub fn original(file_id: FileId, range: Range) -> Self { + SourceInfo { + range, + mapping: SourceMapping::Original { file_id }, + } + } + + /// Create source info for a substring extraction + pub fn substring(parent: SourceInfo, start: usize, end: usize) -> Self { + let length = end - start; + SourceInfo { + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: length, + row: 0, + column: 0, + }, + }, + mapping: SourceMapping::Substring { + parent: Rc::new(parent), + offset: start, + }, + } + } + + /// Create source info for concatenated sources + pub fn concat(pieces: Vec<(SourceInfo, usize)>) -> Self { + let source_pieces: Vec<SourcePiece> = pieces + .into_iter() + .map(|(source_info, length)| SourcePiece { + source_info, + offset_in_concat: 0, // Will be calculated based on cumulative lengths + length, + }) + .collect(); + + // Calculate cumulative offsets + let mut cumulative_offset = 0; + let pieces_with_offsets: Vec<SourcePiece> = source_pieces + .into_iter() + .map(|mut piece| { + piece.offset_in_concat = cumulative_offset; + cumulative_offset += piece.length; + piece + }) + .collect(); + + let total_length = cumulative_offset; + + SourceInfo { + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: total_length, + row: 0, + column: 0, + }, + }, + mapping:
SourceMapping::Concat { + pieces: pieces_with_offsets, + }, + } + } + + /// Create source info for transformed text + pub fn transformed(parent: SourceInfo, mapping: Vec<RangeMapping>) -> Self { + // Find the max end offset in the transformed text + let total_length = mapping.iter().map(|m| m.from_end).max().unwrap_or(0); + + SourceInfo { + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: total_length, + row: 0, + column: 0, + }, + }, + mapping: SourceMapping::Transformed { + parent: Rc::new(parent), + mapping, + }, + } + } + + /// Combine two SourceInfo objects representing adjacent text + /// + /// This creates a Concat mapping that preserves both sources. + /// The resulting SourceInfo spans from the start of self to the end of other. + pub fn combine(&self, other: &SourceInfo) -> Self { + let self_length = self.range.end.offset - self.range.start.offset; + let other_length = other.range.end.offset - other.range.start.offset; + + SourceInfo::concat(vec![ + (self.clone(), self_length), + (other.clone(), other_length), + ]) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::types::{FileId, Location, Range}; + + #[test] + fn test_original_source_info() { + let file_id = FileId(0); + let range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }; + + let info = SourceInfo::original(file_id, range.clone()); + + assert_eq!(info.range, range); + match info.mapping { + SourceMapping::Original { file_id: mapped_id } => { + assert_eq!(mapped_id, file_id); + } + _ => panic!("Expected Original mapping"), + } + } + + #[test] + fn test_source_info_serialization() { + let file_id = FileId(0); + let range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }; + + let info = SourceInfo::original(file_id, range); + let json = serde_json::to_string(&info).unwrap(); + let deserialized: SourceInfo = serde_json::from_str(&json).unwrap(); + + assert_eq!(info, deserialized); + } + + #[test] + fn test_substring_source_info() { + let file_id = FileId(0); + let parent_range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 100, + row: 0, + column: 100, + }, + }; + let parent = SourceInfo::original(file_id, parent_range); + + let substring = SourceInfo::substring(parent, 10, 20); + + assert_eq!(substring.range.start.offset, 0); + assert_eq!(substring.range.end.offset, 10); // length = 20 - 10 = 10 + + match substring.mapping { + SourceMapping::Substring { offset, ..
} => { + assert_eq!(offset, 10); + } + _ => panic!("Expected Substring mapping"), + } + } + + #[test] + fn test_concat_source_info() { + let file_id1 = FileId(0); + let file_id2 = FileId(1); + + let info1 = SourceInfo::original( + file_id1, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + ); + + let info2 = SourceInfo::original( + file_id2, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 15, + row: 0, + column: 15, + }, + }, + ); + + let concat = SourceInfo::concat(vec![(info1, 10), (info2, 15)]); + + assert_eq!(concat.range.start.offset, 0); + assert_eq!(concat.range.end.offset, 25); // 10 + 15 + + match concat.mapping { + SourceMapping::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[0].offset_in_concat, 0); + assert_eq!(pieces[0].length, 10); + assert_eq!(pieces[1].offset_in_concat, 10); + assert_eq!(pieces[1].length, 15); + } + _ => panic!("Expected Concat mapping"), + } + } + + #[test] + fn test_transformed_source_info() { + let file_id = FileId(0); + let parent = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 50, + row: 0, + column: 50, + }, + }, + ); + + let mapping = vec![ + RangeMapping { + from_start: 0, + from_end: 10, + to_start: 0, + to_end: 10, + }, + RangeMapping { + from_start: 10, + from_end: 20, + to_start: 20, + to_end: 30, + }, + ]; + + let transformed = SourceInfo::transformed(parent, mapping.clone()); + + assert_eq!(transformed.range.start.offset, 0); + assert_eq!(transformed.range.end.offset, 20); // max from_end + + match transformed.mapping { + SourceMapping::Transformed { mapping: m, .. } => { + assert_eq!(m, mapping); + } + _ => panic!("Expected Transformed mapping"), + } + } + + #[test] + fn test_nested_transformations() { + let file_id = FileId(0); + let original = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 100, + row: 0, + column: 100, + }, + }, + ); + + // Extract a substring + let substring = SourceInfo::substring(original, 10, 50); + + // Then transform it + let transformed = SourceInfo::transformed( + substring, + vec![RangeMapping { + from_start: 0, + from_end: 10, + to_start: 0, + to_end: 10, + }], + ); + + // Verify the chain: Original -> Substring -> Transformed + match &transformed.mapping { + SourceMapping::Transformed { parent, .. 
} => match &parent.mapping { + SourceMapping::Substring { + parent: grandparent, + offset, + } => { + assert_eq!(*offset, 10); + match &grandparent.mapping { + SourceMapping::Original { file_id: id } => { + assert_eq!(*id, file_id); + } + _ => panic!("Expected Original at root"), + } + } + _ => panic!("Expected Substring as parent"), + }, + _ => panic!("Expected Transformed at top level"), + } + } + + #[test] + fn test_combine_two_sources() { + let file_id = FileId(0); + + // Create two separate source info objects + let info1 = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + ); + + let info2 = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 15, + row: 0, + column: 15, + }, + end: Location { + offset: 25, + row: 0, + column: 25, + }, + }, + ); + + // Combine them + let combined = info1.combine(&info2); + + // Should create a Concat with total length = 10 + 10 = 20 + assert_eq!(combined.range.start.offset, 0); + assert_eq!(combined.range.end.offset, 20); + + match combined.mapping { + SourceMapping::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + assert_eq!(pieces[0].length, 10); + assert_eq!(pieces[0].offset_in_concat, 0); + assert_eq!(pieces[1].length, 10); + assert_eq!(pieces[1].offset_in_concat, 10); + } + _ => panic!("Expected Concat mapping"), + } + } + + #[test] + fn test_combine_preserves_source_tracking() { + // Combine sources from different files + let file_id1 = FileId(5); + let file_id2 = FileId(10); + + let info1 = SourceInfo::original( + file_id1, + Range { + start: Location { + offset: 100, + row: 5, + column: 0, + }, + end: Location { + offset: 105, + row: 5, + column: 5, + }, + }, + ); + + let info2 = SourceInfo::original( + file_id2, + Range { + start: Location { + offset: 200, + row: 10, + column: 0, + }, + end: Location { + offset: 207, + row: 10, + column: 7, + }, + }, + ); + + let combined = info1.combine(&info2); + + // Verify both sources are preserved in the Concat + match combined.mapping { + SourceMapping::Concat { pieces } => { + assert_eq!(pieces.len(), 2); + + // First piece should come from file_id1 + match &pieces[0].source_info.mapping { + SourceMapping::Original { file_id } => assert_eq!(*file_id, file_id1), + _ => panic!("Expected Original mapping for first piece"), + } + + // Second piece should come from file_id2 + match &pieces[1].source_info.mapping { + SourceMapping::Original { file_id } => assert_eq!(*file_id, file_id2), + _ => panic!("Expected Original mapping for second piece"), + } + } + _ => panic!("Expected Concat mapping"), + } + } + + /// Test JSON serialization of Original mapping + #[test] + fn test_json_serialization_original() { + let file_id = FileId(0); + let range = Range { + start: Location { + offset: 10, + row: 1, + column: 5, + }, + end: Location { + offset: 50, + row: 3, + column: 10, + }, + }; + + let info = SourceInfo::original(file_id, range); + let json = serde_json::to_value(&info).unwrap(); + + // Verify JSON structure + assert_eq!(json["range"]["start"]["offset"], 10); + assert_eq!(json["range"]["start"]["row"], 1); + assert_eq!(json["range"]["start"]["column"], 5); + assert_eq!(json["range"]["end"]["offset"], 50); + assert_eq!(json["range"]["end"]["row"], 3); + assert_eq!(json["range"]["end"]["column"], 10); + assert_eq!(json["mapping"]["Original"]["file_id"], 0); + + // Verify round-trip + let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); + 
assert_eq!(info, deserialized); + } + + /// Test JSON serialization of Substring mapping + #[test] + fn test_json_serialization_substring() { + let file_id = FileId(0); + let parent_range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 100, + row: 5, + column: 20, + }, + }; + let parent = SourceInfo::original(file_id, parent_range); + + let substring = SourceInfo::substring(parent, 10, 30); + let json = serde_json::to_value(&substring).unwrap(); + + // Verify JSON structure + assert_eq!(json["range"]["start"]["offset"], 0); + assert_eq!(json["range"]["end"]["offset"], 20); // length = 30 - 10 = 20 + assert_eq!(json["mapping"]["Substring"]["offset"], 10); + + // Verify parent is serialized (with Rc, it's a full copy in JSON) + assert!(json["mapping"]["Substring"]["parent"].is_object()); + assert_eq!( + json["mapping"]["Substring"]["parent"]["mapping"]["Original"]["file_id"], + 0 + ); + + // Verify round-trip + let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); + assert_eq!(substring, deserialized); + } + + /// Test JSON serialization of nested Substring mappings (simulates .qmd frontmatter) + #[test] + fn test_json_serialization_nested_substring() { + let file_id = FileId(0); + + // Level 1: Original file + let file_range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 200, + row: 10, + column: 0, + }, + }; + let file_info = SourceInfo::original(file_id, file_range); + + // Level 2: YAML frontmatter (substring of file) + let yaml_info = SourceInfo::substring(file_info, 4, 150); + + // Level 3: YAML value (substring of frontmatter) + let value_info = SourceInfo::substring(yaml_info, 20, 35); + + let json = serde_json::to_value(&value_info).unwrap(); + + // Verify nested structure + assert_eq!(json["mapping"]["Substring"]["offset"], 20); + assert_eq!( + json["mapping"]["Substring"]["parent"]["mapping"]["Substring"]["offset"], + 4 + ); + assert_eq!( + json["mapping"]["Substring"]["parent"]["mapping"]["Substring"]["parent"]["mapping"]["Original"] + ["file_id"], + 0 + ); + + // Verify round-trip + let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); + assert_eq!(value_info, deserialized); + } + + /// Test JSON serialization of Concat mapping + #[test] + fn test_json_serialization_concat() { + let file_id1 = FileId(0); + let file_id2 = FileId(1); + + let info1 = SourceInfo::original( + file_id1, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + ); + + let info2 = SourceInfo::original( + file_id2, + Range { + start: Location { + offset: 20, + row: 2, + column: 0, + }, + end: Location { + offset: 30, + row: 2, + column: 10, + }, + }, + ); + + let combined = info1.combine(&info2); + let json = serde_json::to_value(&combined).unwrap(); + + // Verify JSON structure + assert!(json["mapping"]["Concat"]["pieces"].is_array()); + let pieces = json["mapping"]["Concat"]["pieces"].as_array().unwrap(); + assert_eq!(pieces.len(), 2); + + // First piece + assert_eq!(pieces[0]["offset_in_concat"], 0); + assert_eq!(pieces[0]["length"], 10); + assert_eq!( + pieces[0]["source_info"]["mapping"]["Original"]["file_id"], + 0 + ); + + // Second piece + assert_eq!(pieces[1]["offset_in_concat"], 10); + assert_eq!(pieces[1]["length"], 10); + assert_eq!( + pieces[1]["source_info"]["mapping"]["Original"]["file_id"], + 1 + ); + + // Verify round-trip + let deserialized: SourceInfo = 
serde_json::from_value(json).unwrap(); + assert_eq!(combined, deserialized); + } + + /// Test JSON serialization of Transformed mapping + #[test] + fn test_json_serialization_transformed() { + use crate::RangeMapping; + + let file_id = FileId(0); + let parent = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 20, + row: 0, + column: 20, + }, + }, + ); + + // Create a transformed source with range mappings + let mappings = vec![ + RangeMapping { + from_start: 0, + from_end: 5, + to_start: 0, + to_end: 5, + }, + RangeMapping { + from_start: 5, + from_end: 10, + to_start: 10, + to_end: 15, + }, + ]; + + let transformed = SourceInfo { + range: Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }, + mapping: SourceMapping::Transformed { + parent: Rc::new(parent), + mapping: mappings.clone(), + }, + }; + + let json = serde_json::to_value(&transformed).unwrap(); + + // Verify JSON structure + assert!(json["mapping"]["Transformed"]["mapping"].is_array()); + let json_mappings = json["mapping"]["Transformed"]["mapping"] + .as_array() + .unwrap(); + assert_eq!(json_mappings.len(), 2); + + // Verify first mapping + assert_eq!(json_mappings[0]["from_start"], 0); + assert_eq!(json_mappings[0]["from_end"], 5); + assert_eq!(json_mappings[0]["to_start"], 0); + assert_eq!(json_mappings[0]["to_end"], 5); + + // Verify second mapping + assert_eq!(json_mappings[1]["from_start"], 5); + assert_eq!(json_mappings[1]["from_end"], 10); + assert_eq!(json_mappings[1]["to_start"], 10); + assert_eq!(json_mappings[1]["to_end"], 15); + + // Verify parent is serialized + assert_eq!( + json["mapping"]["Transformed"]["parent"]["mapping"]["Original"]["file_id"], + 0 + ); + + // Verify round-trip + let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); + assert_eq!(transformed, deserialized); + } + + /// Test JSON serialization of complex nested structure (real-world example) + #[test] + fn test_json_serialization_complex_nested() { + let file_id = FileId(0); + + // Simulate a .qmd file structure + let qmd_file = SourceInfo::original( + file_id, + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 500, + row: 20, + column: 0, + }, + }, + ); + + // YAML frontmatter is a substring + let yaml_frontmatter = SourceInfo::substring(qmd_file.clone(), 4, 200); + + // A YAML key is a substring of frontmatter + let yaml_key = SourceInfo::substring(yaml_frontmatter.clone(), 10, 20); + + // A YAML value is another substring of frontmatter + let yaml_value = SourceInfo::substring(yaml_frontmatter, 25, 50); + + // Combine key and value (simulating metadata entry) + let combined = yaml_key.combine(&yaml_value); + + let json = serde_json::to_value(&combined).unwrap(); + + // Verify this complex structure serializes + assert!(json.is_object()); + assert!(json["mapping"]["Concat"].is_object()); + + // Verify round-trip + let deserialized: SourceInfo = serde_json::from_value(json).unwrap(); + assert_eq!(combined, deserialized); + } +} diff --git a/crates/quarto-source-map/src/types.rs b/crates/quarto-source-map/src/types.rs new file mode 100644 index 0000000..12bd564 --- /dev/null +++ b/crates/quarto-source-map/src/types.rs @@ -0,0 +1,169 @@ +//! 
Core types for source mapping + +use serde::{Deserialize, Serialize}; + +/// A unique identifier for a source file +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct FileId(pub usize); + +/// A location in source text (0-indexed) +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +pub struct Location { + /// Byte offset from start of source + pub offset: usize, + /// Row number (0-indexed) + pub row: usize, + /// Column number (0-indexed, in characters not bytes) + pub column: usize, +} + +/// A range in source text from start to end +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Range { + /// Start location (inclusive) + pub start: Location, + /// End location (exclusive) + pub end: Location, +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_file_id_equality() { + let id1 = FileId(0); + let id2 = FileId(0); + let id3 = FileId(1); + + assert_eq!(id1, id2); + assert_ne!(id1, id3); + } + + #[test] + fn test_location_ordering() { + let loc1 = Location { + offset: 0, + row: 0, + column: 0, + }; + let loc2 = Location { + offset: 5, + row: 0, + column: 5, + }; + let loc3 = Location { + offset: 10, + row: 1, + column: 0, + }; + + assert!(loc1 < loc2); + assert!(loc2 < loc3); + assert!(loc1 < loc3); + } + + #[test] + fn test_location_equality() { + let loc1 = Location { + offset: 5, + row: 0, + column: 5, + }; + let loc2 = Location { + offset: 5, + row: 0, + column: 5, + }; + let loc3 = Location { + offset: 6, + row: 0, + column: 6, + }; + + assert_eq!(loc1, loc2); + assert_ne!(loc1, loc3); + } + + #[test] + fn test_range_equality() { + let range1 = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 5, + row: 0, + column: 5, + }, + }; + let range2 = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 5, + row: 0, + column: 5, + }, + }; + let range3 = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 10, + row: 0, + column: 10, + }, + }; + + assert_eq!(range1, range2); + assert_ne!(range1, range3); + } + + #[test] + fn test_serialization_file_id() { + let id = FileId(42); + let json = serde_json::to_string(&id).unwrap(); + let deserialized: FileId = serde_json::from_str(&json).unwrap(); + assert_eq!(id, deserialized); + } + + #[test] + fn test_serialization_location() { + let loc = Location { + offset: 100, + row: 5, + column: 10, + }; + let json = serde_json::to_string(&loc).unwrap(); + let deserialized: Location = serde_json::from_str(&json).unwrap(); + assert_eq!(loc, deserialized); + } + + #[test] + fn test_serialization_range() { + let range = Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 50, + row: 2, + column: 10, + }, + }; + let json = serde_json::to_string(&range).unwrap(); + let deserialized: Range = serde_json::from_str(&json).unwrap(); + assert_eq!(range, deserialized); + } +} diff --git a/crates/quarto-source-map/src/utils.rs b/crates/quarto-source-map/src/utils.rs new file mode 100644 index 0000000..895058b --- /dev/null +++ b/crates/quarto-source-map/src/utils.rs @@ -0,0 +1,211 @@ +//! Utility functions for working with source positions + +use crate::types::{Location, Range}; + +/// Convert a byte offset to a Location with line and column info +/// +/// Returns None if the offset is out of bounds. 
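+/// +/// # Example +/// +/// A minimal illustrative doctest (this function is re-exported at the crate root): +/// +/// ``` +/// use quarto_source_map::offset_to_location; +/// +/// let loc = offset_to_location("hello\nworld", 7).unwrap(); +/// assert_eq!(loc.row, 1); +/// assert_eq!(loc.column, 1); +/// ```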
+pub fn offset_to_location(source: &str, offset: usize) -> Option<Location> {
+    if offset > source.len() {
+        return None;
+    }
+
+    let mut row = 0;
+    let mut column = 0;
+    let mut current_offset = 0;
+
+    for ch in source.chars() {
+        if current_offset >= offset {
+            break;
+        }
+
+        if ch == '\n' {
+            row += 1;
+            column = 0;
+        } else {
+            column += 1;
+        }
+
+        current_offset += ch.len_utf8();
+    }
+
+    Some(Location {
+        offset,
+        row,
+        column,
+    })
+}
+
+/// Convert line and column numbers to a byte offset
+///
+/// Line and column are 0-indexed. Returns None if out of bounds.
+pub fn line_col_to_offset(source: &str, line: usize, col: usize) -> Option<usize> {
+    let mut current_line = 0;
+    let mut current_col = 0;
+    let mut offset = 0;
+
+    for ch in source.chars() {
+        if current_line == line && current_col == col {
+            return Some(offset);
+        }
+
+        if ch == '\n' {
+            current_line += 1;
+            current_col = 0;
+        } else {
+            current_col += 1;
+        }
+
+        offset += ch.len_utf8();
+    }
+
+    // Check if we're at the end position
+    if current_line == line && current_col == col {
+        return Some(offset);
+    }
+
+    None
+}
+
+/// Create a Range from start and end byte offsets
+///
+/// This is a helper that creates a Range with Location structs
+/// that only have offsets filled in (row and column are 0).
+/// Use `offset_to_location` to get full Location info.
+pub fn range_from_offsets(start: usize, end: usize) -> Range {
+    Range {
+        start: Location {
+            offset: start,
+            row: 0,
+            column: 0,
+        },
+        end: Location {
+            offset: end,
+            row: 0,
+            column: 0,
+        },
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_offset_to_location_simple() {
+        let source = "hello\nworld";
+
+        // Beginning
+        let loc = offset_to_location(source, 0).unwrap();
+        assert_eq!(loc.offset, 0);
+        assert_eq!(loc.row, 0);
+        assert_eq!(loc.column, 0);
+
+        // Middle of first line
+        let loc = offset_to_location(source, 3).unwrap();
+        assert_eq!(loc.offset, 3);
+        assert_eq!(loc.row, 0);
+        assert_eq!(loc.column, 3);
+
+        // After newline (beginning of second line)
+        let loc = offset_to_location(source, 6).unwrap();
+        assert_eq!(loc.offset, 6);
+        assert_eq!(loc.row, 1);
+        assert_eq!(loc.column, 0);
+
+        // Middle of second line
+        let loc = offset_to_location(source, 9).unwrap();
+        assert_eq!(loc.offset, 9);
+        assert_eq!(loc.row, 1);
+        assert_eq!(loc.column, 3);
+    }
+
+    #[test]
+    fn test_offset_to_location_out_of_bounds() {
+        let source = "hello";
+        assert!(offset_to_location(source, 100).is_none());
+    }
+
+    #[test]
+    fn test_offset_to_location_end() {
+        let source = "hello";
+        let loc = offset_to_location(source, 5).unwrap();
+        assert_eq!(loc.offset, 5);
+        assert_eq!(loc.row, 0);
+        assert_eq!(loc.column, 5);
+    }
+
+    #[test]
+    fn test_line_col_to_offset_simple() {
+        let source = "hello\nworld";
+
+        // Beginning
+        let offset = line_col_to_offset(source, 0, 0).unwrap();
+        assert_eq!(offset, 0);
+
+        // Middle of first line
+        let offset = line_col_to_offset(source, 0, 3).unwrap();
+        assert_eq!(offset, 3);
+
+        // Beginning of second line
+        let offset = line_col_to_offset(source, 1, 0).unwrap();
+        assert_eq!(offset, 6);
+
+        // Middle of second line
+        let offset = line_col_to_offset(source, 1, 3).unwrap();
+        assert_eq!(offset, 9);
+    }
+
+    #[test]
+    fn test_line_col_to_offset_out_of_bounds() {
+        let source = "hello\nworld";
+        assert!(line_col_to_offset(source, 10, 0).is_none());
+        assert!(line_col_to_offset(source, 0, 100).is_none());
+    }
+
+    #[test]
+    fn test_line_col_to_offset_end() {
+        let source = "hello";
+        let offset = line_col_to_offset(source, 0, 5).unwrap();
assert_eq!(offset, 5); + } + + #[test] + fn test_roundtrip() { + let source = "hello\nworld\ntest"; + + // Test various positions + for test_offset in [0, 3, 6, 10, 16] { + let loc = offset_to_location(source, test_offset).unwrap(); + let back_to_offset = line_col_to_offset(source, loc.row, loc.column).unwrap(); + assert_eq!(test_offset, back_to_offset); + } + } + + #[test] + fn test_range_from_offsets() { + let range = range_from_offsets(10, 20); + assert_eq!(range.start.offset, 10); + assert_eq!(range.end.offset, 20); + assert_eq!(range.start.row, 0); + assert_eq!(range.start.column, 0); + } + + #[test] + fn test_offset_to_location_multiline() { + let source = "line1\nline2\nline3"; + + // Test each line start + let loc = offset_to_location(source, 0).unwrap(); + assert_eq!(loc.row, 0); + assert_eq!(loc.column, 0); + + let loc = offset_to_location(source, 6).unwrap(); + assert_eq!(loc.row, 1); + assert_eq!(loc.column, 0); + + let loc = offset_to_location(source, 12).unwrap(); + assert_eq!(loc.row, 2); + assert_eq!(loc.column, 0); + } +} diff --git a/crates/quarto-yaml/Cargo.toml b/crates/quarto-yaml/Cargo.toml new file mode 100644 index 0000000..9fe0894 --- /dev/null +++ b/crates/quarto-yaml/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "quarto-yaml" +version.workspace = true +edition.workspace = true +authors.workspace = true +license.workspace = true +repository.workspace = true + +[dependencies] +yaml-rust2 = { workspace = true } +serde = { workspace = true } +thiserror = { workspace = true } +quarto-source-map = { path = "../quarto-source-map" } + +[dev-dependencies] +regex = "1" + +[[bench]] +name = "memory_overhead" +harness = false + +[[bench]] +name = "scaling_overhead" +harness = false diff --git a/crates/quarto-yaml/README.md b/crates/quarto-yaml/README.md new file mode 100644 index 0000000..7c51906 --- /dev/null +++ b/crates/quarto-yaml/README.md @@ -0,0 +1,154 @@ +# quarto-yaml + +YAML parsing with source location tracking for the Quarto Rust port. + +## Overview + +This crate provides `YamlWithSourceInfo`, which wraps `yaml-rust2::Yaml` with source location information for every node in the YAML tree. This enables precise error reporting and source tracking through transformations. + +## Design Philosophy + +Uses the **owned data approach**: wraps owned `Yaml` values with a parallel children structure for source tracking. This follows rust-analyzer's precedent of using owned data for tree structures. 
+
+**Trade-offs:**
+- Simple API with no lifetime parameters
+- Compatible with config merging across different lifetimes
+- Enables LSP caching (serializable)
+- ~3x memory overhead (acceptable for config files <10KB)
+
+## Features
+
+- ✅ Parse YAML with complete source tracking
+- ✅ Access raw `yaml-rust2::Yaml` for direct manipulation
+- ✅ Source-tracked children for error reporting
+- ✅ Type-safe access methods
+- ⚠️ Basic alias support (converted to Null)
+- ⚠️ Tags parsed but not exposed
+- 🔴 Single document only (no multi-document streams yet)
+
+## Usage
+
+```rust
+use quarto_yaml::{parse, parse_file};
+
+// Parse from string
+let yaml = parse(r#"
+title: My Document
+author: John Doe
+tags:
+  - rust
+  - yaml
+"#).unwrap();
+
+// Parse with filename
+let yaml = parse_file(content, "config.yaml").unwrap();
+
+// Access raw Yaml
+println!("Title: {:?}", yaml.yaml["title"]);
+
+// Source-tracked access (rows/columns are 0-indexed)
+if let Some(title) = yaml.get_hash_value("title") {
+    let start = &title.source_info.range.start;
+    println!("Title at {}:{}", start.row + 1, start.column + 1);
+}
+
+// Navigate arrays
+if let Some(tags) = yaml.get_hash_value("tags") {
+    for tag in tags.as_array().unwrap() {
+        println!("{} at line {}",
+            tag.yaml.as_str().unwrap(),
+            tag.source_info.range.start.row + 1
+        );
+    }
+}
+```
+
+## API Overview
+
+### Core Types
+
+- **`YamlWithSourceInfo`** - Main wrapper with owned Yaml + source tracking
+- **`SourceInfo`** - Source location (re-exported from `quarto-source-map`; a `Range` of offset/row/column `Location`s)
+- **`YamlHashEntry`** - Hash entry with source spans for key, value, and entry
+
+### Functions
+
+- `parse(content: &str) -> Result<YamlWithSourceInfo>`
+- `parse_file(content: &str, filename: &str) -> Result<YamlWithSourceInfo>`
+
+### Methods on YamlWithSourceInfo
+
+- `get_hash_value(&self, key: &str) -> Option<&YamlWithSourceInfo>`
+- `get_array_item(&self, index: usize) -> Option<&YamlWithSourceInfo>`
+- `as_array(&self) -> Option<&[YamlWithSourceInfo]>`
+- `as_hash(&self) -> Option<&[YamlHashEntry]>`
+- `is_scalar()`, `is_array()`, `is_hash()` - Type checking
+- `len()`, `is_empty()` - Child count
+
+## Implementation Details
+
+### Data Structure
+
+```rust
+pub struct YamlWithSourceInfo {
+    pub yaml: Yaml,              // Direct access to raw Yaml
+    pub source_info: SourceInfo, // This node's location
+    children: Children,          // Source-tracked children (private)
+}
+```
+
+### Parser
+
+Uses yaml-rust2's `MarkedEventReceiver` API to build the tree:
+- Event-based parsing (push parser)
+- Stack-based tree construction
+- Marker provides source positions
+
+## Limitations
+
+1. **Scalar lengths**: Currently approximate (uses value length)
+2. **Aliases**: Converted to Null (anchor tracking not implemented)
+3. **Tags**: Parsed but not exposed in API
+4. **Multi-document**: Only first document parsed
+
+## Future Work
+
+See `claude-notes/implementation-plan.md` for roadmap:
+
+**Phase 2**: Parser improvements (accurate spans, aliases, tags)
+**Phase 3**: Public API enhancements (merging, validation)
+**Phase 4**: Advanced features (multi-document, streaming)
+**Phase 5**: Integration (unified SourceInfo, LSP support)
+
+## Dependencies
+
+- `yaml-rust2 = "0.9"` - YAML parsing with markers
+- `serde = "1.0"` - For future serialization
+- `thiserror = "1.0"` - Error types
+
+## Testing
+
+```bash
+cd crates/quarto-yaml
+cargo test
+```
+
+All 14 tests passing ✅
+
+## Documentation
+
+```bash
+cargo doc --open
+```
+
+## License
+
+MIT (same as Kyoto project)
+
+## Notes
+
+This crate is part of the Kyoto project - a Rust port of Quarto CLI. See the main project for context and architecture decisions.
+
+For implementation notes, see `claude-notes/` directory.
diff --git a/crates/quarto-yaml/YAML-1.2-REQUIREMENT.md b/crates/quarto-yaml/YAML-1.2-REQUIREMENT.md
new file mode 100644
index 0000000..a5c9198
--- /dev/null
+++ b/crates/quarto-yaml/YAML-1.2-REQUIREMENT.md
@@ -0,0 +1,113 @@
+# YAML 1.2 Requirement
+
+## Critical Constraint
+
+**We CANNOT use `serde_yaml` until it supports YAML 1.2.**
+
+## Background
+
+### YAML Version Differences
+
+- **YAML 1.1** (used by `yaml-rust` and `serde_yaml`): Older spec with ambiguous boolean parsing
+  - `yes`, `no`, `on`, `off` are parsed as booleans
+  - This breaks many real-world documents where `no` is meant to be a string
+
+- **YAML 1.2** (used by `yaml-rust2` and `quarto-yaml`): Fixed ambiguities
+  - Only `true`, `false` (and some case variants) are booleans
+  - `yes`, `no`, `on`, `off` are strings by default
+  - Much more predictable for users
+
+### Why This Matters for Quarto
+
+Quarto documents often contain YAML like:
+
+```yaml
+author:
+  name: John Doe
+  orcid: no  # Should be the string "no", not boolean false
+```
+
+With YAML 1.1 parsers, this would incorrectly parse `no` as `false`.
+
+## Current State
+
+- **quarto-yaml**: Uses `yaml-rust2` ✅ (YAML 1.2)
+- **quarto-yaml-validation**: Uses `serde_yaml` ❌ (YAML 1.1) for Schema deserialization
+
+## Problem
+
+The current `Schema` deserialization in `quarto-yaml-validation/src/schema.rs` uses serde:
+
+```rust
+impl<'de> Deserialize<'de> for Schema {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    // This uses serde_yaml, which only supports YAML 1.1
+}
+```
+
+This means:
+1. **User documents** are parsed with YAML 1.2 (correct)
+2. **Schema files** are parsed with YAML 1.1 (incorrect)
+
+This inconsistency is problematic because:
+- Users expect consistent YAML parsing behavior
+- Schema files may themselves contain ambiguous values like `no` in examples
+- Quarto extensions will define their own schemas and expect YAML 1.2
+
+## Solution
+
+**Use `YamlWithSourceInfo` for loading schemas, not serde deserialization.**
+
+Instead of:
+```rust
+// Current (WRONG - uses YAML 1.1)
+let schema: Schema = serde_yaml::from_str(yaml_str)?;
+```
+
+Do:
+```rust
+// Correct (uses YAML 1.2)
+let yaml = quarto_yaml::parse(yaml_str, Some(file_path))?;
+let schema = Schema::from_yaml(&yaml)?;  // Manual conversion
+```
+
+Benefits:
+1. ✅ Consistent YAML 1.2 parsing for both documents and schemas
+2. ✅ Source location tracking for schema files (enables better error messages)
+3. ✅ No dependency on `serde_yaml` (one less dependency)
+4. ✅ Extensions can use the same infrastructure
+
+Trade-offs:
+- More manual code to convert `YamlWithSourceInfo` → `Schema`
+- Cannot leverage serde's automatic deserialization
+- But: More control over error messages and validation
+
+## Implementation Plan
+
+1. Remove `serde::Deserialize` implementation from `Schema` enum
+2. Add `Schema::from_yaml(yaml: &YamlWithSourceInfo) -> Result<Schema>` method
+3. Add helper methods for parsing each schema type
+4. Update tests to use `quarto_yaml::parse()` instead of `serde_yaml`
+5. Add source location tracking to schema parsing errors
+
+## Timeline
+
+This should be done **before** implementing the `validate-yaml` binary, since it affects the fundamental architecture.
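+
+## Quick Behavior Check
+
+A minimal check of the 1.2 behavior we rely on, using `yaml-rust2`'s
+`YamlLoader` API (a sketch; the assertions are illustrative, not a committed
+test):
+
+```rust
+use yaml_rust2::{Yaml, YamlLoader};
+
+fn main() {
+    // Under YAML 1.2, a plain `no` stays a string...
+    let docs = YamlLoader::load_from_str("orcid: no").unwrap();
+    assert_eq!(docs[0]["orcid"], Yaml::String("no".into()));
+
+    // ...and only `true`/`false` parse as booleans.
+    let docs = YamlLoader::load_from_str("draft: false").unwrap();
+    assert_eq!(docs[0]["draft"], Yaml::Boolean(false));
+}
+```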
+ +## Related Files + +- `/crates/quarto-yaml-validation/src/schema.rs` - Schema deserialization (needs rewrite) +- `/claude-notes/yaml-schema-from-yaml-design.md` - Design document (needs revision) + +## Future: serde_yaml YAML 1.2 Support + +If `serde_yaml` ever adds YAML 1.2 support, we could: +1. Keep the `from_yaml()` approach for source tracking +2. Optionally add serde deserialization back as a convenience method +3. But `from_yaml()` should remain the primary API + +## References + +- yaml-rust2: https://docs.rs/yaml-rust2/ (YAML 1.2) +- serde_yaml: https://docs.rs/serde_yaml/ (YAML 1.1) +- YAML 1.2 spec: https://yaml.org/spec/1.2/spec.html diff --git a/crates/quarto-yaml/benches/memory_overhead.rs b/crates/quarto-yaml/benches/memory_overhead.rs new file mode 100644 index 0000000..fca2054 --- /dev/null +++ b/crates/quarto-yaml/benches/memory_overhead.rs @@ -0,0 +1,267 @@ +//! Memory overhead benchmark for YamlWithSourceInfo vs raw Yaml +//! +//! This benchmark measures the actual memory overhead of our owned data approach +//! compared to using yaml-rust2::Yaml directly. +//! +//! Run with: cargo bench --bench memory_overhead + +use quarto_yaml::parse; +use std::mem; +use yaml_rust2::YamlLoader; + +/// Calculate approximate memory usage of a Yaml tree +fn estimate_yaml_memory(yaml: &yaml_rust2::Yaml) -> usize { + let mut size = mem::size_of::(); + + match yaml { + yaml_rust2::Yaml::Real(s) | yaml_rust2::Yaml::String(s) => { + size += s.capacity(); + } + yaml_rust2::Yaml::Array(arr) => { + size += arr.capacity() * mem::size_of::(); + for item in arr { + size += estimate_yaml_memory(item); + } + } + yaml_rust2::Yaml::Hash(hash) => { + // HashMap overhead is complex, approximate + size += hash.capacity() * (mem::size_of::() * 2); + for (k, v) in hash { + size += estimate_yaml_memory(k); + size += estimate_yaml_memory(v); + } + } + _ => {} + } + + size +} + +/// Calculate approximate memory usage of a YamlWithSourceInfo tree +fn estimate_yaml_with_source_memory(yaml: &quarto_yaml::YamlWithSourceInfo) -> usize { + let mut size = mem::size_of::(); + + // Add the underlying Yaml + size += estimate_yaml_memory(&yaml.yaml); + + // Add SourceInfo + // Note: SourceInfo size is already included in sizeof(YamlWithSourceInfo) + // For basic parsing, SourceInfo uses Original variant with FileId (just a usize) + + // Add children + if let Some(children) = yaml.as_array() { + // Note: using len() not capacity() since we only have a slice + size += children.len() * mem::size_of::(); + for child in children { + size += estimate_yaml_with_source_memory(child); + } + } else if let Some(entries) = yaml.as_hash() { + // Note: using len() not capacity() since we only have a slice + size += entries.len() * mem::size_of::(); + for entry in entries { + size += estimate_yaml_with_source_memory(&entry.key); + size += estimate_yaml_with_source_memory(&entry.value); + // Add the 3 SourceInfo structs in YamlHashEntry + size += mem::size_of::() * 3; + } + } + + size +} + +/// Test case with name, YAML content, and description +struct TestCase { + name: &'static str, + yaml: &'static str, + description: &'static str, +} + +const TEST_CASES: &[TestCase] = &[ + TestCase { + name: "simple_scalar", + yaml: "hello world", + description: "Single scalar value", + }, + TestCase { + name: "small_hash", + yaml: r#" +title: My Document +author: John Doe +date: 2024-01-01 +"#, + description: "Small hash with 3 string values", + }, + TestCase { + name: "small_array", + yaml: r#" +- item1 +- item2 +- item3 +- item4 +- item5 +"#, 
+ description: "Small array with 5 items", + }, + TestCase { + name: "nested_structure", + yaml: r#" +project: + title: My Project + version: 1.0.0 + authors: + - name: Alice + email: alice@example.com + - name: Bob + email: bob@example.com + config: + port: 8080 + debug: true + features: + - feature1 + - feature2 + - feature3 +"#, + description: "Nested structure with arrays and hashes", + }, + TestCase { + name: "quarto_document", + yaml: r#" +title: "My Research Paper" +author: "Jane Smith" +date: "2024-01-01" +format: + html: + theme: cosmo + toc: true + toc-depth: 3 + code-fold: true + pdf: + documentclass: article + margin-left: 1in + margin-right: 1in +execute: + echo: true + warning: false + error: false +bibliography: references.bib +csl: apa.csl +"#, + description: "Typical Quarto document metadata", + }, + TestCase { + name: "quarto_project", + yaml: r#" +project: + type: website + output-dir: _site + +website: + title: "My Website" + navbar: + left: + - text: "Home" + href: index.qmd + - text: "About" + href: about.qmd + - text: "Blog" + href: blog/index.qmd + right: + - icon: github + href: https://github.com/user/repo + +format: + html: + theme: + light: flatly + dark: darkly + css: styles.css + toc: true + +execute: + freeze: auto +"#, + description: "Quarto project configuration", + }, +]; + +fn main() { + println!("Memory Overhead Analysis: YamlWithSourceInfo vs raw Yaml"); + println!("==========================================================\n"); + + println!("Size of base types:"); + println!( + " yaml_rust2::Yaml: {} bytes", + mem::size_of::() + ); + println!( + " YamlWithSourceInfo: {} bytes", + mem::size_of::() + ); + println!( + " SourceInfo: {} bytes", + mem::size_of::() + ); + println!( + " YamlHashEntry: {} bytes", + mem::size_of::() + ); + println!(); + + let mut total_raw = 0usize; + let mut total_tracked = 0usize; + + for test in TEST_CASES { + println!("Test: {} - {}", test.name, test.description); + println!("{}", "-".repeat(60)); + + // Parse with yaml-rust2 + let raw_docs = YamlLoader::load_from_str(test.yaml).expect("Failed to parse YAML"); + let raw_yaml = &raw_docs[0]; + let raw_size = estimate_yaml_memory(raw_yaml); + + // Parse with YamlWithSourceInfo + let tracked_yaml = parse(test.yaml).expect("Failed to parse YAML with source tracking"); + let tracked_size = estimate_yaml_with_source_memory(&tracked_yaml); + + let overhead = tracked_size as f64 / raw_size as f64; + let diff = tracked_size - raw_size; + + println!(" Raw Yaml size: {:>8} bytes", raw_size); + println!(" YamlWithSourceInfo size: {:>8} bytes", tracked_size); + println!( + " Overhead: {:>8} bytes ({:.2}x)", + diff, overhead + ); + println!(); + + total_raw += raw_size; + total_tracked += tracked_size; + } + + println!("=========================================================="); + println!("TOTALS across all test cases:"); + println!(" Total raw: {:>8} bytes", total_raw); + println!(" Total tracked: {:>8} bytes", total_tracked); + let total_overhead = total_tracked as f64 / total_raw as f64; + println!(" Average overhead: {:.2}x", total_overhead); + println!(); + + // Analysis + println!("Analysis:"); + if total_overhead < 2.0 { + println!(" ✅ Overhead is better than expected (<2x)"); + } else if total_overhead < 3.0 { + println!(" ✅ Overhead is within expected range (2-3x)"); + } else if total_overhead < 4.0 { + println!(" ⚠️ Overhead is slightly higher than expected (3-4x)"); + } else { + println!(" ❌ Overhead is significantly higher than expected (>4x)"); + } + + println!(); + 
println!("Notes:"); + println!(" - These are estimates based on size_of and capacity"); + println!(" - Actual memory usage may differ due to allocator overhead"); + println!(" - For typical Quarto configs (<10KB raw), overhead is acceptable"); + println!(" - The overhead provides precise error reporting and LSP support"); +} diff --git a/crates/quarto-yaml/benches/scaling_overhead.rs b/crates/quarto-yaml/benches/scaling_overhead.rs new file mode 100644 index 0000000..a8611a7 --- /dev/null +++ b/crates/quarto-yaml/benches/scaling_overhead.rs @@ -0,0 +1,305 @@ +//! Scaling analysis: verify overhead grows linearly with data size +//! +//! This benchmark tests whether memory overhead grows linearly (O(n)) or +//! superlinearly (O(n²), O(n log n), etc.) with increasing YAML data size. +//! +//! If overhead ratio stays constant as size increases → Linear (good!) +//! If overhead ratio increases as size increases → Superlinear (bad!) +//! +//! Run with: cargo bench --bench scaling_overhead + +use quarto_yaml::parse; +use std::mem; +use yaml_rust2::YamlLoader; + +/// Calculate approximate memory usage of a Yaml tree +fn estimate_yaml_memory(yaml: &yaml_rust2::Yaml) -> usize { + let mut size = mem::size_of::(); + + match yaml { + yaml_rust2::Yaml::Real(s) | yaml_rust2::Yaml::String(s) => { + size += s.capacity(); + } + yaml_rust2::Yaml::Array(arr) => { + size += arr.capacity() * mem::size_of::(); + for item in arr { + size += estimate_yaml_memory(item); + } + } + yaml_rust2::Yaml::Hash(hash) => { + size += hash.capacity() * (mem::size_of::() * 2); + for (k, v) in hash { + size += estimate_yaml_memory(k); + size += estimate_yaml_memory(v); + } + } + _ => {} + } + + size +} + +/// Calculate approximate memory usage of a YamlWithSourceInfo tree +fn estimate_yaml_with_source_memory(yaml: &quarto_yaml::YamlWithSourceInfo) -> usize { + let mut size = mem::size_of::(); + + size += estimate_yaml_memory(&yaml.yaml); + // Note: SourceInfo size is already included in sizeof(YamlWithSourceInfo) + // For basic parsing, SourceInfo uses Original variant with FileId (just a usize) + + if let Some(children) = yaml.as_array() { + size += children.len() * mem::size_of::(); + for child in children { + size += estimate_yaml_with_source_memory(child); + } + } else if let Some(entries) = yaml.as_hash() { + size += entries.len() * mem::size_of::(); + for entry in entries { + size += estimate_yaml_with_source_memory(&entry.key); + size += estimate_yaml_with_source_memory(&entry.value); + size += mem::size_of::() * 3; + } + } + + size +} + +struct ScalingResult { + size: usize, + raw_bytes: usize, + tracked_bytes: usize, + overhead_ratio: f64, +} + +/// Generate a flat array of N string items +fn generate_flat_array(n: usize) -> String { + let mut yaml = String::from("[\n"); + for i in 0..n { + yaml.push_str(&format!(" \"item_{}\",\n", i)); + } + yaml.push_str("]\n"); + yaml +} + +/// Generate a flat hash with N key-value pairs +fn generate_flat_hash(n: usize) -> String { + let mut yaml = String::new(); + for i in 0..n { + yaml.push_str(&format!("key_{}: \"value_{}\"\n", i, i)); + } + yaml +} + +/// Generate a nested structure with depth D and breadth B +/// (D levels deep, B children at each level) +fn generate_nested_structure(depth: usize, breadth: usize) -> String { + fn generate_level( + current_depth: usize, + max_depth: usize, + breadth: usize, + indent: usize, + ) -> String { + let ind = " ".repeat(indent); + + if current_depth >= max_depth { + return format!("{}value\n", ind); + } + + let mut yaml = String::new(); 
+ for i in 0..breadth { + yaml.push_str(&format!("{}child_{}:\n", ind, i)); + yaml.push_str(&generate_level( + current_depth + 1, + max_depth, + breadth, + indent + 1, + )); + } + yaml + } + + generate_level(0, depth, breadth, 0) +} + +/// Generate a mixed structure: top-level hash with N keys, each having a small nested structure +fn generate_mixed_structure(n: usize) -> String { + let mut yaml = String::new(); + for i in 0..n { + yaml.push_str(&format!( + "section_{}:\n title: \"Section {}\"\n enabled: true\n items:\n - item1\n - item2\n - item3\n", + i, i + )); + } + yaml +} + +fn test_scaling(name: &str, generator: impl Fn(usize) -> String, sizes: &[usize]) { + println!("\n{}", "=".repeat(70)); + println!("Scaling Test: {}", name); + println!("{}", "=".repeat(70)); + println!( + "{:>6} {:>12} {:>12} {:>12} {:>8}", + "Size", "Raw (bytes)", "Tracked", "Overhead", "Ratio" + ); + println!("{}", "-".repeat(70)); + + let mut results = Vec::new(); + + for &size in sizes { + let yaml_content = generator(size); + + // Parse with yaml-rust2 + let raw_docs = YamlLoader::load_from_str(&yaml_content).expect("Failed to parse YAML"); + let raw_yaml = &raw_docs[0]; + let raw_bytes = estimate_yaml_memory(raw_yaml); + + // Parse with YamlWithSourceInfo + let tracked_yaml = parse(&yaml_content).expect("Failed to parse YAML with source tracking"); + let tracked_bytes = estimate_yaml_with_source_memory(&tracked_yaml); + + let overhead = tracked_bytes - raw_bytes; + let ratio = tracked_bytes as f64 / raw_bytes as f64; + + println!( + "{:>6} {:>12} {:>12} {:>12} {:>8.2}x", + size, raw_bytes, tracked_bytes, overhead, ratio + ); + + results.push(ScalingResult { + size, + raw_bytes, + tracked_bytes, + overhead_ratio: ratio, + }); + } + + // Analyze scaling behavior + println!("\nScaling Analysis:"); + + if results.len() >= 2 { + let first = &results[0]; + let last = &results[results.len() - 1]; + + let size_ratio = last.size as f64 / first.size as f64; + let raw_ratio = last.raw_bytes as f64 / first.raw_bytes as f64; + let tracked_ratio = last.tracked_bytes as f64 / first.tracked_bytes as f64; + + println!(" Size increased: {:.1}x", size_ratio); + println!(" Raw memory increased: {:.1}x", raw_ratio); + println!(" Tracked memory increased: {:.1}x", tracked_ratio); + + // Check if overhead ratio is stable + let ratio_change = (last.overhead_ratio - first.overhead_ratio).abs(); + let ratio_change_pct = (ratio_change / first.overhead_ratio) * 100.0; + + println!( + "\n Overhead ratio change: {:.2}x → {:.2}x (Δ{:.1}%)", + first.overhead_ratio, last.overhead_ratio, ratio_change_pct + ); + + if ratio_change_pct < 10.0 { + println!(" ✅ Overhead is STABLE - scales linearly!"); + } else if ratio_change_pct < 25.0 { + println!(" ⚠️ Overhead grows slightly - possibly O(n log n)"); + } else { + println!(" ❌ Overhead grows significantly - possibly superlinear!"); + } + + // Check raw and tracked growth rates + let raw_per_item = last.raw_bytes as f64 / last.size as f64; + let tracked_per_item = last.tracked_bytes as f64 / last.size as f64; + + println!("\n At largest size:"); + println!(" Raw bytes per item: {:.1} bytes", raw_per_item); + println!(" Tracked bytes per item: {:.1} bytes", tracked_per_item); + println!( + " Overhead per item: {:.1} bytes", + tracked_per_item - raw_per_item + ); + } +} + +fn main() { + println!("Scaling Overhead Analysis: YamlWithSourceInfo"); + println!("============================================================="); + println!("Testing whether overhead grows linearly with data size"); + 
println!(); + + // Test 1: Flat arrays + let array_sizes = vec![10, 50, 100, 250, 500, 1000]; + test_scaling("Flat Array", generate_flat_array, &array_sizes); + + // Test 2: Flat hashes + let hash_sizes = vec![10, 50, 100, 250, 500, 1000]; + test_scaling("Flat Hash", generate_flat_hash, &hash_sizes); + + // Test 3: Mixed structures (realistic Quarto configs) + let mixed_sizes = vec![5, 10, 20, 50, 100]; + test_scaling("Mixed Structure", generate_mixed_structure, &mixed_sizes); + + // Test 4: Nested structures (depth=5, varying breadth) + println!("\n{}", "=".repeat(70)); + println!("Nested Structure Scaling (depth=5, varying breadth)"); + println!("{}", "=".repeat(70)); + println!( + "{:>8} {:>12} {:>12} {:>12} {:>8}", + "Breadth", "Raw (bytes)", "Tracked", "Overhead", "Ratio" + ); + println!("{}", "-".repeat(70)); + + let breadths = vec![2, 3, 4, 5]; + let mut nested_results = Vec::new(); + + for breadth in &breadths { + let yaml_content = generate_nested_structure(5, *breadth); + + let raw_docs = YamlLoader::load_from_str(&yaml_content).expect("Failed to parse YAML"); + let raw_yaml = &raw_docs[0]; + let raw_bytes = estimate_yaml_memory(raw_yaml); + + let tracked_yaml = parse(&yaml_content).expect("Failed to parse YAML with source tracking"); + let tracked_bytes = estimate_yaml_with_source_memory(&tracked_yaml); + + let overhead = tracked_bytes - raw_bytes; + let ratio = tracked_bytes as f64 / raw_bytes as f64; + + println!( + "{:>8} {:>12} {:>12} {:>12} {:>8.2}x", + breadth, raw_bytes, tracked_bytes, overhead, ratio + ); + + nested_results.push((breadth, raw_bytes, tracked_bytes, ratio)); + } + + println!("\nNested Structure Analysis:"); + if nested_results.len() >= 2 { + let first = nested_results.first().unwrap(); + let last = nested_results.last().unwrap(); + + let total_nodes_first = first.0.pow(5); // breadth^depth + let total_nodes_last = last.0.pow(5); + + println!( + " Total nodes: {} → {}", + total_nodes_first, total_nodes_last + ); + println!(" Overhead ratio: {:.2}x → {:.2}x", first.3, last.3); + + let ratio_change_pct = ((last.3 - first.3) / first.3) * 100.0; + if ratio_change_pct.abs() < 10.0 { + println!(" ✅ Overhead is STABLE even with deep nesting!"); + } else { + println!(" ⚠️ Overhead changes with nesting depth"); + } + } + + // Final summary + println!("\n{}", "=".repeat(70)); + println!("CONCLUSION"); + println!("{}", "=".repeat(70)); + println!("If overhead ratios stay roughly constant (within 10-25%)"); + println!("across all tests, then overhead scales linearly O(n)."); + println!(); + println!("This means larger configs use proportionally more memory,"); + println!("but don't suffer from superlinear growth."); +} diff --git a/crates/quarto-yaml/claude-notes/implementation-plan.md b/crates/quarto-yaml/claude-notes/implementation-plan.md new file mode 100644 index 0000000..2350bdc --- /dev/null +++ b/crates/quarto-yaml/claude-notes/implementation-plan.md @@ -0,0 +1,160 @@ +# quarto-yaml Implementation Plan + +## Overview + +This crate implements `YamlWithSourceInfo`, a data structure that wraps `yaml-rust2::Yaml` with source location tracking. + +## Architecture Decision: Owned Data + +Following rust-analyzer's precedent, we use owned `Yaml` values with a parallel children structure for source tracking. Trade-off: ~3x memory overhead for simplicity and compatibility with config merging across different lifetimes. + +## Core Data Structures + +### 1. 
YamlWithSourceInfo + +```rust +pub struct YamlWithSourceInfo { + /// The complete yaml-rust2::Yaml value (owned) + pub yaml: Yaml, + + /// Source location for this node + pub source_info: SourceInfo, + + /// Source-tracked children (parallel structure) + children: Children, +} +``` + +### 2. Children Enum + +```rust +enum Children { + None, + Array(Vec), + Hash(Vec), +} +``` + +### 3. YamlHashEntry + +```rust +pub struct YamlHashEntry { + pub key: YamlWithSourceInfo, + pub value: YamlWithSourceInfo, + pub key_span: SourceInfo, // Span of just the key + pub value_span: SourceInfo, // Span of just the value + pub entry_span: SourceInfo, // Span of key + value +} +``` + +## SourceInfo Type + +For Phase 1, we'll use a simple SourceInfo type: + +```rust +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct SourceInfo { + /// Optional filename + pub file: Option, + + /// Byte offset in source + pub offset: usize, + + /// Line number (1-based) + pub line: usize, + + /// Column number (1-based) + pub col: usize, + + /// Length in bytes + pub len: usize, +} +``` + +Later this will be replaced by the unified SourceInfo from the main project. + +## Implementation Phases + +### Phase 1: Core Data Structures (Current) +- [x] Create crate structure +- [ ] Define SourceInfo type +- [ ] Define YamlWithSourceInfo, Children, YamlHashEntry +- [ ] Implement basic constructors + +### Phase 2: Parser Implementation +- [ ] Implement MarkedEventReceiver trait +- [ ] Build tree from events +- [ ] Track source positions +- [ ] Handle errors + +### Phase 3: Public API +- [ ] `parse(content: &str) -> Result` +- [ ] `parse_file(content: &str, filename: &str) -> Result` +- [ ] Access methods: `get_hash_value()`, `get_array_item()`, etc. +- [ ] Error type with source positions + +### Phase 4: Testing +- [ ] Unit tests for data structures +- [ ] Parser tests with various YAML structures +- [ ] Source position tracking tests +- [ ] Error handling tests + +### Phase 5: Documentation +- [ ] API documentation +- [ ] Usage examples +- [ ] Integration guide + +## Parser Design + +The parser will use yaml-rust2's `MarkedEventReceiver` API: + +```rust +struct YamlBuilder { + stack: Vec, + source: String, + filename: Option, +} + +impl MarkedEventReceiver for YamlBuilder { + fn on_event(&mut self, event: Event, marker: Marker) { + // Build tree with source tracking + } +} +``` + +## Testing Strategy + +### Test Categories + +1. **Basic YAML structures** + - Scalars (string, int, float, bool) + - Arrays + - Hashes + - Nested structures + +2. **Source position tracking** + - Verify line/column accuracy + - Test multi-line values + - Test nested structures + +3. **Error handling** + - Invalid YAML + - Parse errors with positions + +4. **Edge cases** + - Empty documents + - Documents with only comments + - Multi-document streams (initially unsupported) + +## Dependencies + +- `yaml-rust2 = "0.9"` - YAML parsing with position tracking +- `serde = "1.0"` - For future SourceInfo serialization +- `thiserror = "1.0"` - Error types + +## Future Enhancements + +1. **Config merging** - Merge multiple YamlWithSourceInfo objects +2. **Validation** - Schema validation with source positions +3. **Unified SourceInfo** - Replace with project-wide SourceInfo type +4. 
**Multi-document** - Support YAML streams diff --git a/crates/quarto-yaml/claude-notes/implementation-status.md b/crates/quarto-yaml/claude-notes/implementation-status.md new file mode 100644 index 0000000..07ed812 --- /dev/null +++ b/crates/quarto-yaml/claude-notes/implementation-status.md @@ -0,0 +1,206 @@ +# quarto-yaml Implementation Status + +## Overview + +The `quarto-yaml` crate is now **functional** with basic parsing capabilities. It successfully parses YAML documents and tracks source locations for all nodes. + +## Completed Features + +### Core Data Structures ✅ + +- **SourceInfo**: Tracks source locations with file, offset, line, column, and length +- **YamlWithSourceInfo**: Main wrapper around yaml-rust2::Yaml with source tracking +- **YamlHashEntry**: Represents hash entries with source tracking for keys, values, and entire entries +- **Children enum**: Internal structure for tracking child nodes (Array/Hash/None) + +### Parser Implementation ✅ + +- **MarkedEventReceiver**: Implemented for YamlBuilder +- **Event-based parsing**: Handles all yaml-rust2 events +- **Tree construction**: Builds YamlWithSourceInfo tree from events +- **Source tracking**: Records source positions for all nodes + +### Public API ✅ + +- `parse(content: &str)` - Parse YAML from string +- `parse_file(content: &str, filename: &str)` - Parse with filename +- `get_hash_value(&self, key: &str)` - Access hash values +- `get_array_item(&self, index: usize)` - Access array elements +- `as_array()`, `as_hash()` - Access children +- `is_scalar()`, `is_array()`, `is_hash()` - Type checking + +### Tests ✅ + +All 14 tests passing: +- Scalar parsing (string, integer, boolean) +- Array parsing +- Hash parsing +- Nested structures +- Source info tracking +- Filename association + +## Architecture Decisions + +### Owned Data Approach ✅ + +Following rust-analyzer's precedent, we use **owned yaml-rust2::Yaml** values with a parallel Children structure for source tracking. + +**Trade-offs:** +- ✅ Simple API (no lifetime parameters) +- ✅ Enables config merging across different lifetimes +- ✅ Compatible with LSP caching +- ⚠️ ~3x memory overhead (acceptable for configs <10KB) + +### Design Pattern ✅ + +```rust +pub struct YamlWithSourceInfo { + pub yaml: Yaml, // Complete owned Yaml tree + pub source_info: SourceInfo, // This node's location + children: Children, // Source-tracked children +} +``` + +This provides **dual access**: +1. Direct Yaml access for code that doesn't need source tracking +2. Source-tracked access through children for error reporting + +## Known Limitations + +### 1. Scalar Length Computation ⚠️ + +Currently uses value length, not accounting for: +- Quotes and escapes +- Multi-line strings +- Block scalars + +**TODO**: Compute accurate lengths from source positions + +### 2. Alias Support 🔴 + +Aliases are currently converted to Null values. + +**TODO**: Track anchors and resolve aliases properly + +### 3. Tag Support 🔴 + +YAML tags (like `!expr`) are parsed but not exposed in the API. + +**TODO**: Add tag field to YamlWithSourceInfo + +### 4. Multi-Document Support 🔴 + +Currently only parses the first document in a stream. 
+ +**TODO**: Support multi-document parsing if needed + +## Code Quality + +### Warnings ⚠️ + +Two dead_code warnings (acceptable for now): +- `source` field in YamlBuilder (may be needed for accurate length computation) +- `Complete` variant in BuildNode (may be used in future refactoring) + +### Test Coverage ✅ + +Good coverage of: +- Basic types (scalar, array, hash) +- Nested structures +- Source tracking +- Edge cases + +## Next Steps + +### Phase 1: Core Improvements + +1. **Accurate source spans** - Compute real lengths from markers +2. **Alias support** - Track and resolve anchors +3. **Tag support** - Expose tags in API + +### Phase 2: Advanced Features + +4. **Config merging** - Implement merge operations with source tracking +5. **Validation** - Schema validation with source-aware errors +6. **Error reporting** - Better error messages with source context + +### Phase 3: Integration + +7. **Unified SourceInfo** - Replace with project-wide SourceInfo type +8. **quarto-markdown integration** - Use for YAML metadata in documents +9. **LSP support** - Provide hover/completion data + +## Usage Example + +```rust +use quarto_yaml::{parse_file, YamlWithSourceInfo}; + +let yaml = parse_file(r#" +title: My Document +author: John Doe +tags: + - rust + - yaml +"#, "config.yaml").unwrap(); + +// Direct Yaml access +println!("Title: {:?}", yaml.yaml["title"]); + +// Source-tracked access +if let Some(title) = yaml.get_hash_value("title") { + println!("Title at {}:{}", + title.source_info.line, + title.source_info.col + ); +} + +// Navigate structure +if let Some(tags) = yaml.get_hash_value("tags") { + for (i, tag) in tags.as_array().unwrap().iter().enumerate() { + println!("Tag {}: {} at line {}", + i, + tag.yaml.as_str().unwrap(), + tag.source_info.line + ); + } +} +``` + +## File Structure + +``` +crates/quarto-yaml/ +├── Cargo.toml +├── claude-notes/ +│ ├── implementation-plan.md # Original plan +│ └── implementation-status.md # This file +└── src/ + ├── lib.rs # Public API + ├── error.rs # Error types + ├── source_info.rs # SourceInfo struct + ├── yaml_with_source_info.rs # Core data structures + └── parser.rs # Parser implementation +``` + +## Dependencies + +- `yaml-rust2 = "0.9"` - YAML parsing with position tracking +- `serde = "1.0"` - For future SourceInfo serialization +- `thiserror = "1.0"` - Error types + +## Timeline + +**Total time: ~2-3 hours** + +- Planning: 30min +- Data structures: 1h +- Parser implementation: 1h +- Testing and debugging: 30min + +## Conclusion + +The `quarto-yaml` crate is now ready for basic use! It successfully parses YAML with source tracking, providing a solid foundation for config parsing, validation, and LSP features. + +The owned data approach has proven to be simple and effective, with no lifetime complexity and clean APIs. The memory overhead is acceptable for typical config file sizes. + +Next steps should focus on improving source span accuracy, adding alias/tag support, and implementing config merging operations. 
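+
+Under the unified `SourceInfo` now re-exported from `quarto-source-map` (see
+`src/lib.rs`), a caller-side error report might look like this (sketch only;
+the document content is made up):
+
+```rust
+use quarto_yaml::parse;
+
+fn main() {
+    let yaml = parse("title: My Document\ndraft: true").unwrap();
+    if let Some(value) = yaml.get_hash_value("draft") {
+        // Rows/columns are 0-indexed; print 1-indexed for humans.
+        let start = &value.source_info.range.start;
+        eprintln!("`draft` set at {}:{}", start.row + 1, start.column + 1);
+    }
+}
+```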
diff --git a/crates/quarto-yaml/claude-notes/memory-overhead-analysis.md b/crates/quarto-yaml/claude-notes/memory-overhead-analysis.md new file mode 100644 index 0000000..571e138 --- /dev/null +++ b/crates/quarto-yaml/claude-notes/memory-overhead-analysis.md @@ -0,0 +1,221 @@ +# Memory Overhead Analysis + +## Executive Summary + +**Measured overhead: 6.38x** (not the 3x estimated) + +However, this is still **acceptable** for Quarto's use case: +- Typical config files are <10KB +- 10KB × 6.38 = ~64KB total memory +- Provides precise error reporting and LSP support +- Memory is cheap, developer time is expensive + +## Benchmark Results + +### Base Type Sizes + +``` +yaml_rust2::Yaml: 56 bytes +YamlWithSourceInfo: 144 bytes (2.57x larger) +SourceInfo: 56 bytes +YamlHashEntry: 456 bytes (!!!) +``` + +### Test Cases + +| Test Case | Raw Yaml | YamlWithSourceInfo | Overhead | +|-----------|----------|---------------------|----------| +| Simple scalar | 67 bytes | 267 bytes | **3.99x** | +| Small hash (3 items) | 772 bytes | 4,424 bytes | **5.73x** | +| Small array (5 items) | 809 bytes | 2,866 bytes | **3.54x** | +| Nested structure | 4,402 bytes | 27,924 bytes | **6.34x** | +| Quarto document | 4,991 bytes | 32,175 bytes | **6.45x** | +| Quarto project | 8,275 bytes | 55,576 bytes | **6.72x** | +| **TOTAL** | **19,316 bytes** | **123,232 bytes** | **6.38x** | + +## Why Higher Than Expected? + +### 1. YamlHashEntry is Heavy (456 bytes!) + +Each hash entry contains: +- `key: YamlWithSourceInfo` (144 bytes) +- `value: YamlWithSourceInfo` (144 bytes) +- `key_span: SourceInfo` (56 bytes) +- `value_span: SourceInfo` (56 bytes) +- `entry_span: SourceInfo` (56 bytes) + +**Total: 456 bytes per entry** + +### 2. Recursive Duplication + +`YamlWithSourceInfo` contains: +- `yaml: Yaml` (56 bytes) - the original tree +- `source_info: SourceInfo` (56 bytes) +- `children: Children` (enum with Vec) + +The `children` field duplicates the entire tree structure, creating recursive overhead. + +### 3. SourceInfo is Not Small + +At 56 bytes, `SourceInfo` is as large as `Yaml` itself: +- `file: Option` (24 bytes) +- `offset: usize` (8 bytes) +- `line: usize` (8 bytes) +- `col: usize` (8 bytes) +- `len: usize` (8 bytes) + +### 4. Overhead Increases with Nesting + +Deeper structures have higher overhead because each level duplicates: +- The Yaml value +- SourceInfo for the node +- Children structure with more YamlWithSourceInfo nodes + +## Is This A Problem? + +### No, for several reasons: + +#### 1. Absolute Numbers Are Small + +Even "large" Quarto project configs: +- Raw: 8KB → With tracking: 56KB +- Still fits in L1 cache on modern CPUs +- Negligible compared to typical application memory usage + +#### 2. Temporary Data Structure + +Config parsing is a one-time operation: +- Parse → Validate → Extract values → Drop YamlWithSourceInfo +- Not held in memory throughout application lifetime +- Only kept for error reporting context + +#### 3. Value Proposition + +The overhead buys us: +- ✅ Precise error messages with line/col +- ✅ LSP hover showing where config came from +- ✅ Config merging with source tracking +- ✅ Validation errors pointing to exact location +- ✅ "Jump to definition" for config values + +#### 4. Proven At Scale + +rust-analyzer uses similar approach: +- Owned SyntaxNode with refcounting +- Handles entire Rust codebases (100K+ LOC) +- Memory overhead acceptable + +## Optimization Opportunities + +If we needed to reduce overhead (we don't), we could: + +### 1. 
Remove Redundant SourceInfo from YamlHashEntry + +Currently: +```rust +pub struct YamlHashEntry { + pub key: YamlWithSourceInfo, // has source_info + pub value: YamlWithSourceInfo, // has source_info + pub key_span: SourceInfo, // duplicate! + pub value_span: SourceInfo, // duplicate! + pub entry_span: SourceInfo, +} +``` + +Could just use: +```rust +pub struct YamlHashEntry { + pub key: YamlWithSourceInfo, // use key.source_info + pub value: YamlWithSourceInfo, // use value.source_info + pub entry_span: SourceInfo, // only this is unique +} +``` + +**Savings**: 112 bytes per hash entry → ~30% reduction for hashes + +### 2. Box SourceInfo + +```rust +pub struct YamlWithSourceInfo { + pub yaml: Yaml, + pub source_info: Box, // 8 bytes pointer vs 56 bytes struct + children: Children, +} +``` + +**Savings**: 48 bytes per node, but adds indirection (slower access) + +### 3. Interned Filenames + +Instead of `file: Option` in every SourceInfo: +```rust +pub struct SourceInfo { + pub file_id: Option, // index into global string table + // ... +} +``` + +**Savings**: ~16 bytes per node with filename + +### 4. Compact SourceInfo + +```rust +#[repr(C)] +pub struct CompactSourceInfo { + pub file_id: u16, // 65K files should be enough + pub offset: u32, // 4GB should be enough + pub line: u16, // 65K lines should be enough + pub col: u16, // 65K columns should be enough + pub len: u16, // 65K byte spans should be enough +} +// Total: 12 bytes vs 56 bytes +``` + +**Savings**: 44 bytes per node → ~70% reduction in SourceInfo overhead + +### 5. Single Allocation for Tree + +Like rust-analyzer's arena allocation: +- Allocate entire tree in single Vec +- Use indices instead of pointers +- Better cache locality + +**Savings**: Reduces allocator overhead, improves cache performance + +## Recommendation + +**Do nothing.** The current overhead is acceptable because: + +1. **Absolute cost is low** (~60KB for typical configs) +2. **Temporary data** (parsed, used, dropped) +3. **High value** (precise error reporting, LSP support) +4. **Simple implementation** (no lifetime complexity) +5. **Proven approach** (rust-analyzer does similar) + +If we later discover memory pressure (unlikely), we have clear optimization paths. + +## Updating Documentation + +Need to update these claims: + +### Before +"~3x memory overhead (acceptable for configs <10KB)" + +### After +"~6x memory overhead, but still acceptable: +- 10KB config → ~60KB in memory +- Temporary data structure (parse, validate, drop) +- Provides precise error reporting and LSP support" + +## Conclusion + +The **6.38x overhead is higher than estimated but still acceptable** for Quarto's use case. + +The owned data approach remains the right choice: +- ✅ Simple API (no lifetime parameters) +- ✅ Config merging across different lifetimes +- ✅ LSP caching support +- ✅ Memory cost is negligible for typical configs +- ✅ Follows rust-analyzer precedent + +**Status**: No changes needed. Ship it! 🚢 diff --git a/crates/quarto-yaml/claude-notes/scaling-analysis.md b/crates/quarto-yaml/claude-notes/scaling-analysis.md new file mode 100644 index 0000000..c93ef5b --- /dev/null +++ b/crates/quarto-yaml/claude-notes/scaling-analysis.md @@ -0,0 +1,238 @@ +# Scaling Analysis: Linear vs Superlinear Growth + +## Executive Summary + +✅ **Overhead scales LINEARLY with data size** - no superlinear growth detected. + +The overhead ratio stabilizes around 4-6x for realistic workloads, with only small variations (2-13%) as data size increases 100x. 
+ +## Test Results + +### Test 1: Flat Array (10 → 1000 items) + +``` +Size Raw Tracked Ratio +10 1,592 5,496 3.45x +50 6,840 26,536 3.88x +100 13,624 52,836 3.88x ← Stabilizes +250 30,392 132,036 4.34x +500 60,728 264,036 4.35x +1000 121,400 528,036 4.35x ← Stable +``` + +**Analysis**: +- Overhead ratio: 3.45x → 4.35x (26% change) +- Size increased: 100x +- Memory increased: Raw 76x, Tracked 96x +- **Verdict**: Small fixed cost at tiny sizes, then **linear** (ratio stabilizes at 4.35x) + +### Test 2: Flat Hash (10 → 1000 key-value pairs) + +``` +Size Raw Tracked Ratio +10 2,874 14,544 5.06x +50 12,618 70,288 5.57x +100 25,190 140,360 5.57x ← Stabilizes +250 83,072 369,992 4.45x +500 166,998 740,168 4.43x +1000 334,850 1,480,520 4.42x ← Stable +``` + +**Analysis**: +- Overhead ratio: 5.06x → 4.42x (12.6% change, actually *decreasing*) +- Size increased: 100x +- Memory increased: Raw 117x, Tracked 102x +- **Verdict**: **Linear** - ratio stabilizes, slight decrease due to amortization + +### Test 3: Mixed Structure (5 → 100 sections, most realistic) + +``` +Size Raw Tracked Ratio +5 7,005 42,860 6.12x +10 13,954 85,464 6.12x ← Same! +20 27,862 170,722 6.13x +50 68,018 424,928 6.25x +100 135,990 849,650 6.25x ← Stable +``` + +**Analysis**: +- Overhead ratio: 6.12x → 6.25x (**2.1% change** - excellent!) +- Size increased: 20x +- Memory increased: Raw 19.4x, Tracked 19.8x +- **Verdict**: ✅ **Perfectly linear!** This is closest to real Quarto configs + +### Test 4: Nested Structures (depth=5, breadth 2 → 5) + +``` +Breadth Total Nodes Raw Tracked Ratio +2 32 18,010 146,128 8.11x +3 243 85,124 801,526 9.42x +4 1,024 434,836 3,597,208 8.27x +5 3,125 1,092,680 9,674,890 8.85x +``` + +**Analysis**: +- Overhead ratio: 8.11x → 8.85x (9.1% change) +- Nodes increased: 98x (32 → 3,125) +- **Verdict**: ✅ **Linear** even with deep nesting + +## Why Flat Array Shows 26% Change? + +The "26% change" in flat arrays is **not** superlinear growth. It's **fixed costs amortizing**: + +### Small Size (10 items): 3.45x overhead +- Fixed overhead (YamlWithSourceInfo struct, Children enum, etc.) is significant +- Relative to tiny data size, fixed costs dominate + +### Large Size (1000 items): 4.35x overhead +- Same fixed overhead, but now spread over 1000 items +- Per-item overhead dominates, fixed costs negligible +- **Ratio stabilizes** at 4.35x + +This is **exactly what we want** - it means overhead is primarily per-item, not per-size-squared or worse. + +## Mathematical Verification + +For linear scaling, memory should follow: `M(n) = a + b·n` + +Where: +- `a` = fixed overhead +- `b` = per-item overhead +- `n` = number of items + +Looking at flat array results: + +``` +n=100: M = 52,836 +n=1000: M = 528,036 + +Per-item overhead: (528,036 - 52,836) / (1000 - 100) = 528 bytes/item +``` + +This matches the "528.0 bytes per item" reported at n=1000. ✅ + +## Practical Implications + +### For Quarto Configs + +Typical Quarto project config (~100 keys): +- Raw: ~136 KB +- Tracked: ~850 KB +- Overhead: 6.25x (stable ratio) + +Large Quarto project (1000 keys) - unlikely but possible: +- Raw: ~1.3 MB +- Tracked: ~8.5 MB +- Overhead: Still 6.25x (same ratio!) + +**No superlinear explosion** - memory grows proportionally. + +### Worst Case: Deep Nesting + +Even with pathological depth=5, breadth=5 (3,125 nodes): +- Raw: 1.1 MB +- Tracked: 9.7 MB +- Overhead: 8.85x + +This is still linear - the higher ratio (8.85x vs 6.25x) is because hash entries are expensive (456 bytes each), but it doesn't grow superlinearly. 
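+
+For completeness, the two-point linear fit from the Mathematical Verification
+section can be done mechanically (numbers taken from the flat-array table
+above):
+
+```rust
+// Fit M(n) = a + b*n from two flat-array measurements.
+fn main() {
+    let (n1, m1) = (100.0_f64, 52_836.0_f64);
+    let (n2, m2) = (1_000.0_f64, 528_036.0_f64);
+    let b = (m2 - m1) / (n2 - n1); // per-item cost: 528 bytes
+    let a = m1 - b * n1;           // fixed cost: 36 bytes
+    println!("fixed = {a:.0} bytes, per item = {b:.0} bytes");
+}
+```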
+
+## Comparison to Alternatives
+
+### If We Had O(n²) Scaling (hypothetical bad case):
+
+```
+Size    Linear (actual)    Quadratic (bad)
+10      5,496              ~5,000
+100     52,836             ~500,000 (10x worse!)
+1000    528,036            ~50,000,000 (100x worse!)
+```
+
+We're seeing **linear**, not quadratic. 🎉
+
+### If We Had O(n log n) Scaling:
+
+```
+Size    Linear (actual)    n log n (bad)
+10      5,496              ~5,000
+100     52,836             ~100,000 (2x worse)
+1000    528,036            ~3,000,000 (6x worse)
+```
+
+We're not seeing this either - ratio stays constant.
+
+## Why This Matters
+
+### Memory Usage is Predictable
+
+- 10 KB config → ~60 KB tracked (6x)
+- 100 KB config → ~600 KB tracked (6x)
+- 1 MB config → ~6 MB tracked (6x)
+
+**Predictable scaling** means no surprises with large configs.
+
+### No Performance Cliffs
+
+With superlinear growth, you'd hit a "cliff" where:
+- Small configs work fine
+- Medium configs slow down noticeably
+- Large configs become unusable
+
+**Linear scaling** means smooth, predictable performance across all sizes.
+
+### Validation for Design
+
+The owned-data approach with parallel children structure:
+- ✅ Scales linearly (verified)
+- ✅ Predictable memory usage
+- ✅ No pathological cases
+- ✅ Simple implementation
+- ✅ No lifetime complexity
+
+## Detailed Scaling Behavior
+
+### Per-Item Overhead by Structure Type
+
+| Structure Type | Bytes per Item | Notes |
+|---------------|----------------|-------|
+| Flat Array | 528 | YamlWithSourceInfo + SourceInfo |
+| Flat Hash | 1,480 | Includes YamlHashEntry (456 bytes!) |
+| Mixed (realistic) | 8,497 | Nested hashes + arrays + scalars |
+| Deep Nested | ~3,100 | More hash entries at each level |
+
+Hash entries are expensive (456 bytes each) because they store:
+- 2× YamlWithSourceInfo (288 bytes)
+- 3× SourceInfo (168 bytes)
+
+But even with expensive entries, scaling remains **linear**.
+
+## Conclusion
+
+✅ **Overhead scales linearly O(n)** - verified across multiple test cases:
+- Flat arrays: Stable at 4.35x (after initial warmup)
+- Flat hashes: Stable at 4.42x
+- Mixed structures: **2.1% variation** (excellent!)
+- Deep nesting: 9.1% variation (good)
+
+✅ **No superlinear growth** - memory increases proportionally with data size
+
+✅ **Predictable behavior** - can estimate memory usage for any config size
+
+✅ **Design validated** - owned data approach works well at scale
+
+**Recommendation**: The current implementation is production-ready. The linear scaling means we won't encounter performance cliffs or memory explosions with larger configs.
+
+## Benchmark Tool
+
+Run the scaling analysis:
+```bash
+cd crates/quarto-yaml
+cargo bench --bench scaling_overhead
+```
+
+Tests:
+- Flat arrays: 10 → 1000 items
+- Flat hashes: 10 → 1000 pairs
+- Mixed structures: 5 → 100 sections (realistic Quarto configs)
+- Nested structures: depth=5, breadth 2→5 (3,125 nodes max)
+
+All tests confirm **linear scaling**. 🚀
diff --git a/crates/quarto-yaml/src/error.rs b/crates/quarto-yaml/src/error.rs
new file mode 100644
index 0000000..842fb1a
--- /dev/null
+++ b/crates/quarto-yaml/src/error.rs
@@ -0,0 +1,81 @@
+//! Error types for YAML parsing with source locations.
+
+use crate::SourceInfo;
+use std::fmt;
+
+/// Result type alias for quarto-yaml operations.
+pub type Result<T> = std::result::Result<T, Error>;
+
+/// Errors that can occur during YAML parsing.
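+///
+/// Each variant carries an optional [`SourceInfo`]; when present, `Display`
+/// renders the location 1-indexed (e.g. `Parse error: ... at 3:5`).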
+#[derive(Debug, Clone, PartialEq)] +pub enum Error { + /// YAML syntax error + ParseError { + message: String, + location: Option, + }, + + /// Unexpected end of input + UnexpectedEof { location: Option }, + + /// Invalid YAML structure + InvalidStructure { + message: String, + location: Option, + }, +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Error::ParseError { message, location } => { + write!(f, "Parse error: {}", message)?; + if let Some(loc) = location { + // Display with 1-indexed row/column + write!( + f, + " at {}:{}", + loc.range.start.row + 1, + loc.range.start.column + 1 + )?; + } + Ok(()) + } + Error::UnexpectedEof { location } => { + write!(f, "Unexpected end of input")?; + if let Some(loc) = location { + write!( + f, + " at {}:{}", + loc.range.start.row + 1, + loc.range.start.column + 1 + )?; + } + Ok(()) + } + Error::InvalidStructure { message, location } => { + write!(f, "Invalid YAML structure: {}", message)?; + if let Some(loc) = location { + write!( + f, + " at {}:{}", + loc.range.start.row + 1, + loc.range.start.column + 1 + )?; + } + Ok(()) + } + } + } +} + +impl std::error::Error for Error {} + +impl From for Error { + fn from(err: yaml_rust2::ScanError) -> Self { + Error::ParseError { + message: err.to_string(), + location: None, + } + } +} diff --git a/crates/quarto-yaml/src/lib.rs b/crates/quarto-yaml/src/lib.rs new file mode 100644 index 0000000..ae9d644 --- /dev/null +++ b/crates/quarto-yaml/src/lib.rs @@ -0,0 +1,42 @@ +//! # quarto-yaml +//! +//! YAML parsing with source location tracking. +//! +//! This crate provides `YamlWithSourceInfo`, which wraps `yaml-rust2::Yaml` with +//! source location information for every node in the YAML tree. This enables +//! precise error reporting and source tracking through transformations. +//! +//! ## Design +//! +//! Uses the **owned data approach**: wraps owned `Yaml` values with a parallel +//! children structure for source tracking. Trade-off: ~3x memory overhead for +//! simplicity and compatibility with config merging across different lifetimes. +//! +//! Follows rust-analyzer's precedent of using owned data with reference counting +//! for tree structures. +//! +//! ## Example +//! +//! ```rust,no_run +//! use quarto_yaml::parse; +//! +//! let content = r#" +//! title: My Document +//! author: John Doe +//! "#; +//! +//! let yaml = parse(content).unwrap(); +//! // Access with source location tracking +//! if let Some(title) = yaml.get_hash_value("title") { +//! println!("Title at offset {}", title.source_info.range.start.offset); +//! } +//! ``` + +mod error; +mod parser; +mod yaml_with_source_info; + +pub use error::{Error, Result}; +pub use parser::{parse, parse_file, parse_with_parent}; +pub use quarto_source_map::SourceInfo; // Re-export from quarto-source-map +pub use yaml_with_source_info::{YamlHashEntry, YamlWithSourceInfo}; diff --git a/crates/quarto-yaml/src/parser.rs b/crates/quarto-yaml/src/parser.rs new file mode 100644 index 0000000..7218099 --- /dev/null +++ b/crates/quarto-yaml/src/parser.rs @@ -0,0 +1,1051 @@ +//! YAML parser that builds YamlWithSourceInfo trees. + +use crate::{Error, Result, SourceInfo, YamlHashEntry, YamlWithSourceInfo}; +use yaml_rust2::Yaml; +use yaml_rust2::parser::{Event, MarkedEventReceiver, Parser}; +use yaml_rust2::scanner::Marker; + +/// Parse YAML from a string, producing a YamlWithSourceInfo tree. +/// +/// This parses a single YAML document. 
+/// only the first one will be parsed.
+///
+/// # Example
+///
+/// ```rust
+/// use quarto_yaml::parse;
+///
+/// let yaml = parse("title: My Document").unwrap();
+/// assert!(yaml.is_hash());
+/// ```
+///
+/// # Errors
+///
+/// Returns an error if the YAML is invalid or if parsing fails.
+pub fn parse(content: &str) -> Result<YamlWithSourceInfo> {
+    parse_impl(content, None, None)
+}
+
+/// Parse YAML from a string with an associated filename.
+///
+/// The filename is included in source location information for better
+/// error reporting.
+///
+/// # Example
+///
+/// ```rust
+/// use quarto_yaml::parse_file;
+///
+/// let yaml = parse_file("title: My Document", "config.yaml").unwrap();
+/// // Filename tracking will be added in a future update
+/// assert!(yaml.source_info.range.end.offset > 0);
+/// ```
+///
+/// # Errors
+///
+/// Returns an error if the YAML is invalid or if parsing fails.
+pub fn parse_file(content: &str, filename: &str) -> Result<YamlWithSourceInfo> {
+    parse_impl(content, Some(filename), None)
+}
+
+/// Parse YAML that was extracted from a parent document.
+///
+/// This function is used when parsing YAML that is a substring of a larger
+/// document (e.g., YAML frontmatter extracted from a .qmd file). The resulting
+/// YamlWithSourceInfo will have Substring mappings that track back to the
+/// parent document.
+///
+/// # Arguments
+///
+/// * `content` - The YAML string to parse
+/// * `parent` - Source information for the parent document from which this YAML was extracted
+///
+/// # Example
+///
+/// ```rust,no_run
+/// use quarto_yaml::{parse_with_parent, SourceInfo};
+/// use quarto_source_map::{FileId, Location, Range};
+///
+/// // Create parent source info for a .qmd file
+/// let parent = SourceInfo::original(
+///     FileId(1),
+///     Range {
+///         start: Location { offset: 0, row: 0, column: 0 },
+///         end: Location { offset: 1000, row: 50, column: 0 },
+///     }
+/// );
+///
+/// // Parse YAML frontmatter (extracted from parent document at offset 10-50)
+/// let yaml_content = "title: My Document\nauthor: John";
+/// let yaml = parse_with_parent(yaml_content, parent).unwrap();
+///
+/// // The yaml now has Substring mappings tracking back to the parent
+/// ```
+///
+/// # Errors
+///
+/// Returns an error if the YAML is invalid or if parsing fails.
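+///
+/// Nested values in the returned tree carry `Substring` mappings that chain
+/// through `parent`, so an absolute position in the original file can be
+/// recovered by summing the offsets along the chain.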
+pub fn parse_with_parent(content: &str, parent: SourceInfo) -> Result<YamlWithSourceInfo> {
+    parse_impl(content, None, Some(parent))
+}
+
+fn parse_impl(
+    content: &str,
+    filename: Option<&str>,
+    parent: Option<SourceInfo>,
+) -> Result<YamlWithSourceInfo> {
+    // If parent is not provided but filename is, create a parent SourceInfo for the file
+    let parent = parent.or_else(|| {
+        filename.map(|name| {
+            // Create a FileId from filename hash
+            use std::collections::hash_map::DefaultHasher;
+            use std::hash::{Hash, Hasher};
+
+            let mut hasher = DefaultHasher::new();
+            name.hash(&mut hasher);
+            let file_id = quarto_source_map::FileId(hasher.finish() as usize);
+
+            // Create SourceInfo for the entire file content
+            use quarto_source_map::{Location, Range};
+            SourceInfo::original(
+                file_id,
+                Range {
+                    start: Location {
+                        offset: 0,
+                        row: 0,
+                        column: 0,
+                    },
+                    end: Location {
+                        offset: content.len(),
+                        row: content.lines().count().saturating_sub(1),
+                        column: content.lines().last().map(|l| l.len()).unwrap_or(0),
+                    },
+                },
+            )
+        })
+    });
+
+    let mut parser = Parser::new_from_str(content);
+    let mut builder = YamlBuilder::new(content, parent);
+
+    parser
+        .load(&mut builder, false) // false = single document only
+        .map_err(Error::from)?;
+
+    builder.result()
+}
+
+/// Builder that implements MarkedEventReceiver to construct YamlWithSourceInfo.
+struct YamlBuilder<'a> {
+    /// The source text being parsed (reserved for future use in accurate scalar length computation)
+    _source: &'a str,
+
+    /// Optional parent source info for substring tracking
+    parent: Option<SourceInfo>,
+
+    /// Stack of nodes being constructed
+    stack: Vec<BuildNode>,
+
+    /// The completed root node
+    root: Option<YamlWithSourceInfo>,
+}
+
+/// A node being constructed during parsing.
+enum BuildNode {
+    /// Building a sequence
+    Sequence {
+        start_marker: Marker,
+        items: Vec<YamlWithSourceInfo>,
+    },
+
+    /// Building a mapping
+    Mapping {
+        start_marker: Marker,
+        entries: Vec<(YamlWithSourceInfo, Option<YamlWithSourceInfo>)>,
+    },
+}
+
+impl<'a> YamlBuilder<'a> {
+    fn new(source: &'a str, parent: Option<SourceInfo>) -> Self {
+        Self {
+            _source: source,
+            parent,
+            stack: Vec::new(),
+            root: None,
+        }
+    }
+
+    fn result(self) -> Result<YamlWithSourceInfo> {
+        self.root.ok_or_else(|| Error::ParseError {
+            message: "No YAML document found".into(),
+            location: None,
+        })
+    }
+
+    fn push_complete(&mut self, node: YamlWithSourceInfo) {
+        if self.stack.is_empty() {
+            // This is the root
+            self.root = Some(node);
+            return;
+        }
+
+        // Add to the parent node
+        match self.stack.last_mut().unwrap() {
+            BuildNode::Sequence { items, .. } => {
+                items.push(node);
+            }
+            BuildNode::Mapping { entries, .. } => {
+                if let Some((_, value)) = entries.last_mut() {
+                    if value.is_none() {
+                        *value = Some(node);
+                    } else {
+                        // This is a new key
+                        entries.push((node, None));
+                    }
+                } else {
+                    // First key
+                    entries.push((node, None));
+                }
+            }
+        }
+    }
+
+    fn make_source_info(&self, marker: &Marker, len: usize) -> SourceInfo {
+        let start_offset = marker.index();
+        let end_offset = start_offset + len;
+
+        if let Some(ref parent) = self.parent {
+            // We're parsing a substring - create a Substring mapping
+            SourceInfo::substring(parent.clone(), start_offset, end_offset)
+        } else {
+            // We're parsing an original file - create an Original mapping
+            use quarto_source_map::{Location, Range};
+
+            let start_row = marker.line(); // yaml-rust2 uses 0-based
+            let start_column = marker.col(); // yaml-rust2 uses 0-based
+
+            SourceInfo::original(
+                quarto_source_map::FileId(0), // Dummy FileId for now
+                Range {
+                    start: Location {
+                        offset: start_offset,
+                        row: start_row,
+                        column: start_column,
+                    },
+                    end: Location {
+                        offset: end_offset,
+                        // TODO: Calculate accurate end row/column based on content
+                        row: start_row,
+                        column: start_column + len,
+                    },
+                },
+            )
+        }
+    }
+
+    fn compute_scalar_len(&self, _marker: &Marker, value: &str) -> usize {
+        // For now, use the value length
+        // TODO: This should be computed more accurately from the source
+        // considering quotes, escapes, etc.
+        value.len()
+    }
+}
+
+impl<'a> MarkedEventReceiver for YamlBuilder<'a> {
+    fn on_event(&mut self, ev: Event, marker: Marker) {
+        match ev {
+            Event::Nothing => {}
+
+            Event::StreamStart => {}
+            Event::StreamEnd => {}
+            Event::DocumentStart => {}
+            Event::DocumentEnd => {}
+
+            Event::Scalar(value, _style, _anchor_id, tag) => {
+                // Capture tag information if present
+                let tag_info = tag.as_ref().map(|t| {
+                    // Tag appears at marker position
+                    // Format: !<suffix> where suffix is what we care about
+                    let tag_len = 1 + t.suffix.len(); // 1 for '!', plus the suffix
+                    let tag_source_info = self.make_source_info(&marker, tag_len);
+                    (t.suffix.clone(), tag_source_info)
+                });
+
+                // Compute source info for the value itself
+                // For now, use the existing logic (marker + value length)
+                // TODO: This should account for tag length + whitespace for more accuracy
+                let len = self.compute_scalar_len(&marker, &value);
+                let source_info = self.make_source_info(&marker, len);
+
+                // Create the Yaml value
+                let yaml = parse_scalar_value(&value);
+                let node = YamlWithSourceInfo::new_scalar_with_tag(yaml, source_info, tag_info);
+
+                self.push_complete(node);
+            }
+
+            Event::SequenceStart(_anchor_id, _tag) => {
+                self.stack.push(BuildNode::Sequence {
+                    start_marker: marker,
+                    items: Vec::new(),
+                });
+            }
+
+            Event::SequenceEnd => {
+                let build_node = self.stack.pop().expect("SequenceEnd without SequenceStart");
+
+                if let BuildNode::Sequence {
+                    start_marker,
+                    items,
+                } = build_node
+                {
+                    // Compute the length from start to current marker
+                    let len = marker.index().saturating_sub(start_marker.index());
+                    let source_info = self.make_source_info(&start_marker, len);
+
+                    // Build the Yaml::Array
+                    let yaml_items: Vec<Yaml> = items.iter().map(|n| n.yaml.clone()).collect();
+                    let yaml = Yaml::Array(yaml_items);
+
+                    let node = YamlWithSourceInfo::new_array(yaml, source_info, items);
+                    self.push_complete(node);
+                } else {
+                    panic!("Expected Sequence build node");
+                }
+            }
+
+            Event::MappingStart(_anchor_id, _tag) => {
+                self.stack.push(BuildNode::Mapping {
+                    start_marker: marker,
+                    entries: Vec::new(),
+                });
+            }
+
+            Event::MappingEnd => {
+                let build_node = self.stack.pop().expect("MappingEnd without MappingStart");
+
+                if let BuildNode::Mapping {
+                    start_marker,
+                    entries,
+                } = build_node
+                {
+                    // Compute the length from start to current marker
+                    let len = marker.index().saturating_sub(start_marker.index());
+                    let source_info = self.make_source_info(&start_marker, len);
+
+                    // Build the hash entries
+                    let mut hash_entries = Vec::new();
+                    let mut yaml_pairs = Vec::new();
+
+                    for (key, value) in entries {
+                        let value = value.expect("Mapping entry without value");
+
+                        // Create YamlHashEntry
+                        let key_span = key.source_info.clone();
+                        let value_span = value.source_info.clone();
+
+                        // Entry span from key start to value end
+                        use quarto_source_map::Range;
+                        let entry_span = SourceInfo::original(
+                            quarto_source_map::FileId(0), // Dummy FileId
+                            Range {
+                                start: key_span.range.start.clone(),
+                                end: value_span.range.end.clone(),
+                            },
+                        );
+
+                        hash_entries.push(YamlHashEntry::new(
+                            key.clone(),
+                            value.clone(),
+                            key_span,
+                            value_span,
+                            entry_span,
+                        ));
+
+                        yaml_pairs.push((key.yaml.clone(), value.yaml.clone()));
+                    }
+
+                    // Build the Yaml::Hash
+                    let yaml = Yaml::Hash(yaml_pairs.into_iter().collect());
+
+                    let node = YamlWithSourceInfo::new_hash(yaml, source_info, hash_entries);
+                    self.push_complete(node);
+                } else {
+                    panic!("Expected Mapping build node");
+                }
+            }
+
+            Event::Alias(_anchor_id) => {
+                // For now, we don't support aliases
+                // We could add support later by tracking anchors
+                let source_info = self.make_source_info(&marker, 0);
+                let node = YamlWithSourceInfo::new_scalar(Yaml::Null, source_info);
+                self.push_complete(node);
+            }
+        }
+    }
+}
+
+/// Parse a scalar string value into the appropriate Yaml type.
+///
+/// This handles type inference: integers, floats, booleans, null, and strings.
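+///
+/// For example (illustrative): `"42"` → `Yaml::Integer(42)`, `"3.14"` →
+/// `Yaml::Real("3.14")`, `"yes"` → `Yaml::Boolean(true)`, `"~"` → `Yaml::Null`,
+/// and anything unrecognized stays a `Yaml::String`.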
+fn parse_scalar_value(value: &str) -> Yaml {
+    // Try to parse as integer
+    if let Ok(i) = value.parse::<i64>() {
+        return Yaml::Integer(i);
+    }
+
+    // Try to parse as float
+    if let Ok(_f) = value.parse::<f64>() {
+        return Yaml::Real(value.to_string());
+    }
+
+    // Check for boolean
+    match value {
+        "true" | "True" | "TRUE" | "yes" | "Yes" | "YES" | "on" | "On" | "ON" => {
+            return Yaml::Boolean(true);
+        }
+        "false" | "False" | "FALSE" | "no" | "No" | "NO" | "off" | "Off" | "OFF" => {
+            return Yaml::Boolean(false);
+        }
+        "null" | "Null" | "NULL" | "~" | "" => {
+            return Yaml::Null;
+        }
+        _ => {}
+    }
+
+    // Default to string
+    Yaml::String(value.to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_scalar() {
+        let yaml = parse("hello").unwrap();
+        assert!(yaml.is_scalar());
+        assert_eq!(yaml.yaml.as_str(), Some("hello"));
+    }
+
+    #[test]
+    fn test_parse_integer() {
+        let yaml = parse("42").unwrap();
+        assert!(yaml.is_scalar());
+        assert_eq!(yaml.yaml.as_i64(), Some(42));
+    }
+
+    #[test]
+    fn test_parse_boolean() {
+        let yaml = parse("true").unwrap();
+        assert!(yaml.is_scalar());
+        assert_eq!(yaml.yaml.as_bool(), Some(true));
+    }
+
+    #[test]
+    fn test_parse_array() {
+        let yaml = parse("[1, 2, 3]").unwrap();
+        assert!(yaml.is_array());
+        assert_eq!(yaml.len(), 3);
+
+        let items = yaml.as_array().unwrap();
+        assert_eq!(items[0].yaml.as_i64(), Some(1));
+        assert_eq!(items[1].yaml.as_i64(), Some(2));
+        assert_eq!(items[2].yaml.as_i64(), Some(3));
+    }
+
+    #[test]
+    fn test_parse_hash() {
+        let yaml = parse("title: My Document\nauthor: John Doe").unwrap();
+        assert!(yaml.is_hash());
+        assert_eq!(yaml.len(), 2);
+
+        let title = yaml.get_hash_value("title").unwrap();
+        assert_eq!(title.yaml.as_str(), Some("My Document"));
+
+        let author = yaml.get_hash_value("author").unwrap();
+        assert_eq!(author.yaml.as_str(), Some("John Doe"));
+    }
+
+    #[test]
+    fn test_nested_structure() {
+        let yaml = parse(
+            r#"
+project:
+  title: My Project
+  authors:
+    - Alice
+    - Bob
+"#,
+        )
+        .unwrap();
+
+        assert!(yaml.is_hash());
+
+        let project = yaml.get_hash_value("project").unwrap();
+        assert!(project.is_hash());
+
+        let authors = project.get_hash_value("authors").unwrap();
+        assert!(authors.is_array());
+        assert_eq!(authors.len(), 2);
+    }
+
+    #[test]
+    fn test_source_info_tracking() {
+        let yaml = parse("title: My Document").unwrap();
+
+        // Check that source info is present
+        // Note: row/column are 0-indexed in the new system
+        assert!(yaml.source_info.range.start.offset < yaml.source_info.range.end.offset);
+
+        let title = yaml.get_hash_value("title").unwrap();
+        // Verify the title value has a valid range
+        assert!(title.source_info.range.start.offset < title.source_info.range.end.offset);
+    }
+
+    #[test]
+    fn test_parse_with_filename() {
+        let yaml = parse_file("title: Test", "config.yaml").unwrap();
+        assert!(yaml.source_info.range.end.offset > 0);
+
+        // Verify that we're now using Substring mapping for files
+        match &yaml.source_info.mapping {
+            quarto_source_map::SourceMapping::Substring { ..
} => { + // Expected: Substring mapping to parent file + } + _ => panic!("Expected Substring mapping for file parsing"), + } + } + + #[test] + fn test_parse_with_parent_simple() { + use quarto_source_map::{FileId, Location, Range}; + + // Simulate extracting YAML from a .qmd file at offset 100-150 + let parent = SourceInfo::original( + FileId(42), + Range { + start: Location { + offset: 100, + row: 5, + column: 0, + }, + end: Location { + offset: 150, + row: 8, + column: 0, + }, + }, + ); + + let yaml_content = "title: My Document\nauthor: John"; + let yaml = parse_with_parent(yaml_content, parent).unwrap(); + + // Verify root has Substring mapping + match &yaml.source_info.mapping { + quarto_source_map::SourceMapping::Substring { + parent: p, + offset: _, + } => { + // Parent should point to our original parent + match &p.mapping { + quarto_source_map::SourceMapping::Original { file_id } => { + assert_eq!(file_id.0, 42); + } + _ => panic!("Expected parent to have Original mapping"), + } + } + _ => panic!("Expected Substring mapping"), + } + } + + #[test] + fn test_parse_with_parent_nested() { + use quarto_source_map::{FileId, Location, Range}; + + // Parent file + let parent = SourceInfo::original( + FileId(1), + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: 500, + row: 20, + column: 0, + }, + }, + ); + + let yaml_content = r#" +project: + title: My Project + authors: + - Alice + - Bob +"#; + let yaml = parse_with_parent(yaml_content, parent).unwrap(); + + // Get nested values + let project = yaml + .get_hash_value("project") + .expect("project key not found"); + let title = project + .get_hash_value("title") + .expect("title key not found"); + let authors = project + .get_hash_value("authors") + .expect("authors key not found"); + + // All should have Substring mappings + assert!(matches!( + project.source_info.mapping, + quarto_source_map::SourceMapping::Substring { .. } + )); + assert!(matches!( + title.source_info.mapping, + quarto_source_map::SourceMapping::Substring { .. } + )); + assert!(matches!( + authors.source_info.mapping, + quarto_source_map::SourceMapping::Substring { .. } + )); + + // Array elements should also have Substring mappings + if let Some(items) = authors.as_array() { + assert_eq!(items.len(), 2); + assert!(matches!( + items[0].source_info.mapping, + quarto_source_map::SourceMapping::Substring { .. } + )); + assert!(matches!( + items[1].source_info.mapping, + quarto_source_map::SourceMapping::Substring { .. } + )); + } else { + panic!("Expected array for authors"); + } + } + + #[test] + fn test_substring_offset_tracking() { + use quarto_source_map::{FileId, Location, Range}; + + // Parent document + let parent_content = "---\ntitle: Test\nauthor: John\n---\n\nDocument content"; + let parent = SourceInfo::original( + FileId(1), + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: parent_content.len(), + row: 4, + column: 0, + }, + }, + ); + + // YAML frontmatter (offset 4-31 in parent) + let yaml_content = "title: Test\nauthor: John"; + let yaml = parse_with_parent(yaml_content, parent).unwrap(); + + // Get title value + let title = yaml.get_hash_value("title").expect("title not found"); + + // Verify the title has a valid substring range + match &title.source_info.mapping { + quarto_source_map::SourceMapping::Substring { offset, .. 
} => { + // Offset should be relative to the yaml_content string + assert!(*offset < yaml_content.len()); + } + _ => panic!("Expected Substring mapping for title"), + } + + // Check that range makes sense + assert!(title.source_info.range.start.offset < title.source_info.range.end.offset); + } + + #[test] + fn test_parse_anonymous_no_substring() { + // Parse without filename or parent - should use Original mapping + let yaml = parse("title: Test").unwrap(); + + match &yaml.source_info.mapping { + quarto_source_map::SourceMapping::Original { file_id } => { + assert_eq!(file_id.0, 0); // Anonymous FileId + } + _ => panic!("Expected Original mapping for anonymous parse"), + } + } + + /// Helper function to resolve a SourceInfo through the mapping chain to get + /// the absolute offset in the original file. + fn resolve_to_original_offset(info: &SourceInfo) -> (usize, quarto_source_map::FileId) { + match &info.mapping { + quarto_source_map::SourceMapping::Original { file_id } => { + (info.range.start.offset, *file_id) + } + quarto_source_map::SourceMapping::Substring { parent, offset } => { + let (parent_offset, file_id) = resolve_to_original_offset(parent); + (parent_offset + offset, file_id) + } + _ => panic!("Unsupported mapping type for offset resolution"), + } + } + + #[test] + fn test_hash_key_and_value_locations() { + // Test that we can track both key and value locations in YAML hashes + let yaml_content = "hello: world\nfoo: bar\ncount: 42"; + let yaml = parse(yaml_content).unwrap(); + + assert!(yaml.is_hash()); + let entries = yaml.as_hash().expect("Should be a hash"); + + // Test 1: Verify "hello" key and "world" value locations + let hello_entry = entries + .iter() + .find(|e| e.key.yaml.as_str() == Some("hello")) + .expect("Should have 'hello' key"); + + // Verify key location + assert_eq!(hello_entry.key.yaml.as_str(), Some("hello")); + let key_offset = hello_entry.key_span.range.start.offset; + let key_str = &yaml_content[key_offset..key_offset + 5]; + assert_eq!(key_str, "hello", "Key location should point to 'hello'"); + + // Verify value location + assert_eq!(hello_entry.value.yaml.as_str(), Some("world")); + let value_offset = hello_entry.value_span.range.start.offset; + let value_str = &yaml_content[value_offset..value_offset + 5]; + assert_eq!(value_str, "world", "Value location should point to 'world'"); + + // Verify they are different locations + assert_ne!( + key_offset, value_offset, + "Key and value should have different offsets" + ); + + // Test 2: Verify "foo" key and "bar" value locations + let foo_entry = entries + .iter() + .find(|e| e.key.yaml.as_str() == Some("foo")) + .expect("Should have 'foo' key"); + + let foo_key_offset = foo_entry.key_span.range.start.offset; + let foo_key_str = &yaml_content[foo_key_offset..foo_key_offset + 3]; + assert_eq!(foo_key_str, "foo", "Key location should point to 'foo'"); + + let bar_value_offset = foo_entry.value_span.range.start.offset; + let bar_value_str = &yaml_content[bar_value_offset..bar_value_offset + 3]; + assert_eq!(bar_value_str, "bar", "Value location should point to 'bar'"); + + // Test 3: Verify "count" key and "42" value locations + let count_entry = entries + .iter() + .find(|e| e.key.yaml.as_str() == Some("count")) + .expect("Should have 'count' key"); + + let count_key_offset = count_entry.key_span.range.start.offset; + let count_key_str = &yaml_content[count_key_offset..count_key_offset + 5]; + assert_eq!( + count_key_str, "count", + "Key location should point to 'count'" + ); + + 
assert_eq!(count_entry.value.yaml.as_i64(), Some(42)); + let count_value_offset = count_entry.value_span.range.start.offset; + let count_value_str = &yaml_content[count_value_offset..count_value_offset + 2]; + assert_eq!(count_value_str, "42", "Value location should point to '42'"); + + // Test 4: Verify entry spans include both key and value + // The entry span should start at the key and end after the value + assert!( + hello_entry.entry_span.range.start.offset <= key_offset, + "Entry span should start at or before the key" + ); + assert!( + hello_entry.entry_span.range.end.offset >= value_offset + 5, + "Entry span should end at or after the value" + ); + } + + #[test] + fn test_qmd_frontmatter_extraction() { + use quarto_source_map::{FileId, Location, Range}; + + // Simulate a realistic .qmd file + let qmd_content = r#"--- +title: "My Research Paper" +author: "Jane Smith" +date: "2024-01-15" +format: + html: + theme: cosmo + toc: true + pdf: + documentclass: article +--- + +# Introduction + +This is my research paper with some **bold** text. + +## Methods + +We used the following approach... +"#; + + // Extract YAML frontmatter using regex (simple approach - just for testing) + let re = regex::Regex::new(r"(?s)^---\n(.*?)\n---").unwrap(); + let captures = re + .captures(qmd_content) + .expect("Failed to find YAML frontmatter"); + + let yaml_match = captures.get(1).expect("No YAML content found"); + let yaml_start = yaml_match.start(); + let yaml_end = yaml_match.end(); + let yaml_content = yaml_match.as_str(); + + // Create parent SourceInfo for the entire .qmd file + let parent = SourceInfo::original( + FileId(123), // Simulated FileId for test.qmd + Range { + start: Location { + offset: 0, + row: 0, + column: 0, + }, + end: Location { + offset: qmd_content.len(), + row: qmd_content.lines().count().saturating_sub(1), + column: qmd_content.lines().last().unwrap_or("").len(), + }, + }, + ); + + // Create parent SourceInfo for just the YAML portion + let yaml_parent = SourceInfo::substring(parent.clone(), yaml_start, yaml_end); + + // Parse the YAML with parent tracking + let yaml = parse_with_parent(yaml_content, yaml_parent).unwrap(); + + // Verify the YAML was parsed correctly + assert!(yaml.is_hash()); + let title = yaml.get_hash_value("title").expect("title not found"); + assert_eq!(title.yaml.as_str(), Some("My Research Paper")); + + // Verify that the title's location maps back through the substring chain + match &title.source_info.mapping { + quarto_source_map::SourceMapping::Substring { parent: p, offset } => { + // The offset should be within the YAML content + assert!(*offset < yaml_content.len()); + + // The parent should be another Substring pointing to the .qmd file + match &p.mapping { + quarto_source_map::SourceMapping::Substring { + parent: grandparent, + offset: yaml_offset, + } => { + // This should point to the original .qmd file + assert_eq!(*yaml_offset, yaml_start); + + // Grandparent should be the Original .qmd file + match &grandparent.mapping { + quarto_source_map::SourceMapping::Original { file_id } => { + assert_eq!(file_id.0, 123); + } + _ => panic!("Expected Original mapping for .qmd file"), + } + } + _ => panic!("Expected Substring mapping for YAML within .qmd"), + } + } + _ => panic!("Expected Substring mapping for title"), + } + + // Verify nested structures also have correct mappings + let format = yaml.get_hash_value("format").expect("format not found"); + assert!(format.is_hash()); + + let html = format.get_hash_value("html").expect("html not found"); + 
assert!(html.is_hash()); + + let theme = html.get_hash_value("theme").expect("theme not found"); + assert_eq!(theme.yaml.as_str(), Some("cosmo")); + + // The theme value should also have Substring mapping through the chain + match &theme.source_info.mapping { + quarto_source_map::SourceMapping::Substring { .. } => { + // Good - it has substring mapping + } + _ => panic!("Expected Substring mapping for deeply nested theme value"), + } + + // Verify that the 'toc' boolean value is correctly located + let toc = html.get_hash_value("toc").expect("toc not found"); + assert_eq!(toc.yaml.as_bool(), Some(true)); + + // Calculate where "true" appears in the original .qmd file + let toc_true_in_qmd = qmd_content + .find("toc: true") + .expect("toc: true not found in qmd"); + let toc_value_offset = toc_true_in_qmd + "toc: ".len(); + + // The toc value should be located within the YAML frontmatter region + assert!( + toc_value_offset >= yaml_start && toc_value_offset < yaml_end, + "toc value offset {} should be within YAML range {}-{}", + toc_value_offset, + yaml_start, + yaml_end + ); + + // ===== NOW TEST OFFSET RESOLUTION ===== + + // Test 1: Verify the title value resolves to correct position in .qmd file + let (resolved_title_offset, resolved_file_id) = + resolve_to_original_offset(&title.source_info); + assert_eq!( + resolved_file_id.0, 123, + "Title should resolve to FileId 123" + ); + + // Extract the exact string at the resolved position + let title_expected = "\"My Research Paper\""; // YAML parser includes quotes + let resolved_title_str = + &qmd_content[resolved_title_offset..resolved_title_offset + title_expected.len()]; + assert_eq!( + resolved_title_str, title_expected, + "Resolved title offset should point to exactly '{}'", + title_expected + ); + + // Test 2: Verify the theme value "cosmo" resolves correctly + let (resolved_cosmo_offset, resolved_file_id) = + resolve_to_original_offset(&theme.source_info); + assert_eq!( + resolved_file_id.0, 123, + "Theme should resolve to FileId 123" + ); + + // Extract the exact string at the resolved position + let cosmo_expected = "cosmo"; + let resolved_cosmo_str = + &qmd_content[resolved_cosmo_offset..resolved_cosmo_offset + cosmo_expected.len()]; + assert_eq!( + resolved_cosmo_str, cosmo_expected, + "Resolved theme offset should point to exactly '{}'", + cosmo_expected + ); + + // Test 3: Verify the author value resolves correctly + let author = yaml.get_hash_value("author").expect("author not found"); + assert_eq!(author.yaml.as_str(), Some("Jane Smith")); + + let (resolved_author_offset, resolved_file_id) = + resolve_to_original_offset(&author.source_info); + assert_eq!( + resolved_file_id.0, 123, + "Author should resolve to FileId 123" + ); + + // Extract the exact string at the resolved position + let author_expected = "\"Jane Smith\""; // YAML parser includes quotes + let resolved_author_str = + &qmd_content[resolved_author_offset..resolved_author_offset + author_expected.len()]; + assert_eq!( + resolved_author_str, author_expected, + "Resolved author offset should point to exactly '{}'", + author_expected + ); + + // Test 4: Verify the YAML root offset resolution + let (resolved_yaml_offset, _) = resolve_to_original_offset(&yaml.source_info); + + // The resolved position should be within the YAML frontmatter + assert!( + resolved_yaml_offset >= yaml_start && resolved_yaml_offset < yaml_end, + "YAML root offset {} should be within YAML content range {}-{}", + resolved_yaml_offset, + yaml_start, + yaml_end + ); + + // Extract and verify the 
exact string - yaml-rust2 reports the first value, not the first key + let yaml_root_expected = ": \"My Research Paper\""; // Colon and first value + let resolved_yaml_str = + &qmd_content[resolved_yaml_offset..resolved_yaml_offset + yaml_root_expected.len()]; + assert_eq!( + resolved_yaml_str, yaml_root_expected, + "Resolved YAML root offset should point to exactly '{}'", + yaml_root_expected + ); + + // Test 5: Verify nested hash entry offsets + let pdf = format.get_hash_value("pdf").expect("pdf not found"); + let documentclass = pdf + .get_hash_value("documentclass") + .expect("documentclass not found"); + assert_eq!(documentclass.yaml.as_str(), Some("article")); + + let (resolved_article_offset, resolved_file_id) = + resolve_to_original_offset(&documentclass.source_info); + assert_eq!( + resolved_file_id.0, 123, + "Documentclass should resolve to FileId 123" + ); + + // Extract the exact string at the resolved position + let article_expected = "article"; + let resolved_article_str = + &qmd_content[resolved_article_offset..resolved_article_offset + article_expected.len()]; + assert_eq!( + resolved_article_str, article_expected, + "Resolved documentclass offset should point to exactly '{}'", + article_expected + ); + + // Test 6: Verify that hash entry key spans resolve correctly + if let Some(entries) = yaml.as_hash() { + for entry in entries { + let (entry_key_start, entry_file_id) = resolve_to_original_offset(&entry.key_span); + assert_eq!( + entry_file_id.0, 123, + "Entry key should resolve to FileId 123" + ); + + // All top-level keys should be within the YAML frontmatter region + assert!( + entry_key_start >= yaml_start && entry_key_start < yaml_end, + "Entry key at offset {} should be within YAML range {}-{}", + entry_key_start, + yaml_start, + yaml_end + ); + + // Verify the key actually points to the key string + let key_str = entry.key.yaml.as_str().unwrap_or(""); + if !key_str.is_empty() && entry_key_start + key_str.len() <= qmd_content.len() { + let resolved_key_str = + &qmd_content[entry_key_start..entry_key_start + key_str.len()]; + assert_eq!( + resolved_key_str, key_str, + "Entry key '{}' should resolve to exact position", + key_str + ); + } + } + } + + // All tests passed - offset resolution works correctly through the double-substring chain! + } +} diff --git a/crates/quarto-yaml/src/yaml_with_source_info.rs b/crates/quarto-yaml/src/yaml_with_source_info.rs new file mode 100644 index 0000000..ee758a1 --- /dev/null +++ b/crates/quarto-yaml/src/yaml_with_source_info.rs @@ -0,0 +1,310 @@ +//! YAML value with source location tracking. + +use crate::SourceInfo; +use yaml_rust2::Yaml; + +/// A YAML value with source location information. +/// +/// This structure wraps a `yaml-rust2::Yaml` value with source location tracking +/// for the value itself and all its children. Uses the **owned data approach**: +/// stores an owned `Yaml` value with a parallel `Children` structure for source +/// tracking. +/// +/// ## Design Trade-offs +/// +/// - **Memory**: ~3x overhead (owned Yaml + source-tracked children) +/// - **Simplicity**: No lifetime parameters, clean API +/// - **Config merging**: Can merge configs from different lifetimes +/// - **LSP caching**: Can serialize/deserialize for caching +/// +/// Follows rust-analyzer's precedent of using owned data for tree structures. 
+///
+/// ## Example
+///
+/// ```rust,no_run
+/// use quarto_yaml::{parse, YamlWithSourceInfo};
+/// use yaml_rust2::Yaml;
+///
+/// let yaml = parse("title: My Document").unwrap();
+/// if let Some(title) = yaml.get_hash_value("title") {
+///     println!("Title: {:?}", title.yaml);
+///     println!("Location: offset {}", title.source_info.range.start.offset);
+/// }
+/// ```
+#[derive(Debug, Clone)]
+pub struct YamlWithSourceInfo {
+    /// The complete yaml-rust2::Yaml value (owned).
+    ///
+    /// This provides direct access to the raw Yaml for code that doesn't
+    /// need source tracking. It's a complete, independent Yaml tree.
+    pub yaml: Yaml,
+
+    /// Source location for this node.
+    pub source_info: SourceInfo,
+
+    /// YAML tag information (e.g., !path, !glob, !str).
+    ///
+    /// If present, contains the tag suffix (e.g., "path" for !path) and
+    /// the source location of the tag itself. Used to bypass markdown parsing
+    /// for tagged strings and enable error reporting on tags.
+    pub tag: Option<(String, SourceInfo)>,
+
+    /// Source-tracked children (parallel structure).
+    ///
+    /// This mirrors the structure of `yaml` but includes source location
+    /// information for each child. The structure matches the `yaml` field:
+    /// - None for scalars and Null
+    /// - Array for sequences
+    /// - Hash for mappings
+    children: Children,
+}
+
+/// Source-tracked children of a YAML node.
+///
+/// This is a parallel structure to the children in `Yaml`, providing
+/// source location information for each child element.
+#[derive(Debug, Clone)]
+enum Children {
+    /// No children (for scalars, Null, BadValue)
+    None,
+
+    /// Array elements with source tracking
+    Array(Vec<YamlWithSourceInfo>),
+
+    /// Hash entries with source tracking
+    Hash(Vec<YamlHashEntry>),
+}
+
+/// A key-value pair in a YAML hash/mapping with source tracking.
+///
+/// Tracks source locations for the key, value, and the entire entry.
+#[derive(Debug, Clone)]
+pub struct YamlHashEntry {
+    /// The key with source tracking
+    pub key: YamlWithSourceInfo,
+
+    /// The value with source tracking
+    pub value: YamlWithSourceInfo,
+
+    /// Source location of just the key
+    pub key_span: SourceInfo,
+
+    /// Source location of just the value
+    pub value_span: SourceInfo,
+
+    /// Source location of the entire entry (key + value)
+    pub entry_span: SourceInfo,
+}
+
+impl YamlWithSourceInfo {
+    /// Create a new YamlWithSourceInfo for a scalar or leaf node.
+    pub fn new_scalar(yaml: Yaml, source_info: SourceInfo) -> Self {
+        Self {
+            yaml,
+            source_info,
+            tag: None,
+            children: Children::None,
+        }
+    }
+
+    /// Create a new YamlWithSourceInfo for a scalar with tag information.
+    pub fn new_scalar_with_tag(
+        yaml: Yaml,
+        source_info: SourceInfo,
+        tag: Option<(String, SourceInfo)>,
+    ) -> Self {
+        Self {
+            yaml,
+            source_info,
+            tag,
+            children: Children::None,
+        }
+    }
+
+    /// Create a new YamlWithSourceInfo for an array/sequence.
+    pub fn new_array(
+        yaml: Yaml,
+        source_info: SourceInfo,
+        children: Vec<YamlWithSourceInfo>,
+    ) -> Self {
+        Self {
+            yaml,
+            source_info,
+            tag: None,
+            children: Children::Array(children),
+        }
+    }
+
+    /// Create a new YamlWithSourceInfo for a hash/mapping.
+    pub fn new_hash(yaml: Yaml, source_info: SourceInfo, entries: Vec<YamlHashEntry>) -> Self {
+        Self {
+            yaml,
+            source_info,
+            tag: None,
+            children: Children::Hash(entries),
+        }
+    }
+
+    /// Check if this is a scalar value (not array or hash).
+    pub fn is_scalar(&self) -> bool {
+        matches!(self.children, Children::None)
+    }
+
+    /// Check if this is an array.
+    pub fn is_array(&self) -> bool {
+        matches!(self.children, Children::Array(_))
+    }
+
+    /// Check if this is a hash.
+    pub fn is_hash(&self) -> bool {
+        matches!(self.children, Children::Hash(_))
+    }
+
+    /// Get array children if this is an array.
+    pub fn as_array(&self) -> Option<&[YamlWithSourceInfo]> {
+        match &self.children {
+            Children::Array(items) => Some(items),
+            _ => None,
+        }
+    }
+
+    /// Get hash entries if this is a hash.
+    pub fn as_hash(&self) -> Option<&[YamlHashEntry]> {
+        match &self.children {
+            Children::Hash(entries) => Some(entries),
+            _ => None,
+        }
+    }
+
+    /// Get a value from a hash by key (string comparison).
+    ///
+    /// This searches through hash entries and compares keys as strings.
+    /// Returns None if this is not a hash or the key is not found.
+    pub fn get_hash_value(&self, key: &str) -> Option<&YamlWithSourceInfo> {
+        match &self.children {
+            Children::Hash(entries) => entries.iter().find_map(|entry| {
+                if entry.key.yaml.as_str() == Some(key) {
+                    Some(&entry.value)
+                } else {
+                    None
+                }
+            }),
+            _ => None,
+        }
+    }
+
+    /// Get an array element by index.
+    pub fn get_array_item(&self, index: usize) -> Option<&YamlWithSourceInfo> {
+        match &self.children {
+            Children::Array(items) => items.get(index),
+            _ => None,
+        }
+    }
+
+    /// Get the number of children (array length or hash entry count).
+    pub fn len(&self) -> usize {
+        match &self.children {
+            Children::None => 0,
+            Children::Array(items) => items.len(),
+            Children::Hash(entries) => entries.len(),
+        }
+    }
+
+    /// Check if this node has no children.
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Consume self and return array children if this is an array.
+    ///
+    /// Returns a tuple of (items, source_info) where items are the owned
+    /// YamlWithSourceInfo elements and source_info is the SourceInfo for
+    /// the whole array.
+    pub fn into_array(self) -> Option<(Vec<YamlWithSourceInfo>, SourceInfo)> {
+        match self.children {
+            Children::Array(items) => Some((items, self.source_info)),
+            _ => None,
+        }
+    }
+
+    /// Consume self and return hash entries if this is a hash.
+    ///
+    /// Returns a tuple of (entries, source_info) where entries are the owned
+    /// YamlHashEntry elements and source_info is the SourceInfo for
+    /// the whole hash.
+    pub fn into_hash(self) -> Option<(Vec<YamlHashEntry>, SourceInfo)> {
+        match self.children {
+            Children::Hash(entries) => Some((entries, self.source_info)),
+            _ => None,
+        }
+    }
+}
+
+impl YamlHashEntry {
+    /// Create a new YamlHashEntry.
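+    ///
+    /// The spans are supplied by the caller; the parser uses the key and
+    /// value nodes' own source info plus an entry span covering both (see
+    /// the `MappingEnd` handling in `parser.rs`).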
+ pub fn new( + key: YamlWithSourceInfo, + value: YamlWithSourceInfo, + key_span: SourceInfo, + value_span: SourceInfo, + entry_span: SourceInfo, + ) -> Self { + Self { + key, + value, + key_span, + value_span, + entry_span, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_scalar_creation() { + let yaml = Yaml::String("test".into()); + let info = SourceInfo::default(); + let node = YamlWithSourceInfo::new_scalar(yaml.clone(), info.clone()); + + assert_eq!(node.yaml, yaml); + assert_eq!(node.source_info, info); + assert!(node.is_scalar()); + assert!(!node.is_array()); + assert!(!node.is_hash()); + assert_eq!(node.len(), 0); + } + + #[test] + fn test_array_creation() { + let child1 = + YamlWithSourceInfo::new_scalar(Yaml::String("a".into()), SourceInfo::default()); + let child2 = + YamlWithSourceInfo::new_scalar(Yaml::String("b".into()), SourceInfo::default()); + + let yaml = Yaml::Array(vec![Yaml::String("a".into()), Yaml::String("b".into())]); + let node = YamlWithSourceInfo::new_array(yaml, SourceInfo::default(), vec![child1, child2]); + + assert!(node.is_array()); + assert_eq!(node.len(), 2); + assert!(node.as_array().is_some()); + assert_eq!(node.as_array().unwrap().len(), 2); + } + + #[test] + fn test_get_array_item() { + let child1 = + YamlWithSourceInfo::new_scalar(Yaml::String("a".into()), SourceInfo::default()); + let child2 = + YamlWithSourceInfo::new_scalar(Yaml::String("b".into()), SourceInfo::default()); + + let yaml = Yaml::Array(vec![Yaml::String("a".into()), Yaml::String("b".into())]); + let node = YamlWithSourceInfo::new_array(yaml, SourceInfo::default(), vec![child1, child2]); + + assert_eq!(node.get_array_item(0).unwrap().yaml.as_str(), Some("a")); + assert_eq!(node.get_array_item(1).unwrap().yaml.as_str(), Some("b")); + assert!(node.get_array_item(2).is_none()); + } +} diff --git a/crates/wasm-qmd-parser/src/utils.rs b/crates/wasm-qmd-parser/src/utils.rs index cc13879..ee549d9 100644 --- a/crates/wasm-qmd-parser/src/utils.rs +++ b/crates/wasm-qmd-parser/src/utils.rs @@ -3,6 +3,7 @@ * Copyright (c) 2025 Posit, PBC */ +#[allow(dead_code)] pub fn set_panic_hook() { // When the `console_error_panic_hook` feature is enabled, we can call the // `set_panic_hook` function at least once during initialization, and then diff --git a/docs/writers/json.qmd b/docs/writers/json.qmd new file mode 100644 index 0000000..97e2438 --- /dev/null +++ b/docs/writers/json.qmd @@ -0,0 +1,177 @@ +--- +title: "JSON Output Format" +--- + +The `quarto-markdown-pandoc` binary can output AST in JSON format using `-t json`. This format is designed to be compatible with Pandoc's JSON AST while adding source tracking information. + +## Basic Structure + +The JSON output contains three main sections: + +```json +{ + "pandoc-api-version": [1, 23, 1], + "meta": { /* metadata */ }, + "blocks": [ /* block elements */ ], + "astContext": { + "filenames": [ /* array of source files */ ], + "sourceInfoPool": [ /* source location data */ ] + } +} +``` + +## Source Information Tracking + +Unlike Pandoc, `quarto-markdown-pandoc` tracks the exact source location of every AST node. This information is encoded compactly using a pool-and-reference system. + +### How It Works + +1. **Pool**: All unique source location information is stored once in `astContext.sourceInfoPool` +2. **References**: Each AST node has an `"s"` field containing a numeric index into the pool +3. 
**Deduplication**: Shared source information (e.g., siblings in YAML) reuses the same pool entry + +### Example + +```json +{ + "astContext": { + "filenames": ["example.qmd"], + "sourceInfoPool": [ + {"r": [0, 0, 0, 4, 0, 4], "t": 0, "d": 0} + ] + }, + "blocks": [ + { + "t": "Para", + "s": 0, + "c": [ + {"t": "Str", "c": "Hello", "s": 0} + ] + } + ] +} +``` + +The `"s": 0` field means "look up source info at index 0 in the pool". + +## SourceInfoPool Encoding + +Each entry in the `sourceInfoPool` array has this compact format: + +```json +{"r": [start_offset, start_row, start_col, end_offset, end_row, end_col], "t": type, "d": data} +``` + +### Fields + +- **`r`** (range): 6-element array `[start_offset, start_row, start_col, end_offset, end_row, end_col]` + - All positions are 0-indexed + - `offset` is byte offset from start of source + - `row` and `col` are line and column numbers + +- **`t`** (type): Integer indicating the source mapping type + - `0` = Original (direct position in source file) + - `1` = Substring (extracted from a parent source) + - `2` = Concat (multiple sources joined together) + - `3` = Transformed (source that was modified with explicit mapping) + +- **`d`** (data): Type-specific data (see below) + +### Type 0: Original + +Represents text directly from a source file. + +```json +{"r": [0, 0, 0, 10, 0, 10], "t": 0, "d": 0} +``` + +- **`d`**: The file ID (index into `astContext.filenames`) + +**Example**: The word "Hello" at bytes 0-5 in the first file (file_id=0). + +### Type 1: Substring + +Represents a substring extracted from another source. + +```json +{"r": [0, 0, 0, 5, 0, 5], "t": 1, "d": [3, 10]} +``` + +- **`d`**: `[parent_id, offset]` + - `parent_id`: Index of the parent source in the pool + - `offset`: Byte offset within the parent where this substring starts + +**Example**: A 5-byte substring starting at byte 10 of source #3 (e.g., extracting YAML value from frontmatter). + +### Type 2: Concat + +Represents multiple sources concatenated together. + +```json +{"r": [0, 0, 0, 10, 0, 10], "t": 2, "d": [[1, 0, 5], [2, 5, 5]]} +``` + +- **`d`**: Array of pieces, where each piece is `[source_info_id, offset_in_concat, length]` + - `source_info_id`: Index of this piece's source in the pool + - `offset_in_concat`: Where this piece starts in the concatenated result + - `length`: Length of this piece in bytes + +**Example**: Joining sources #1 (5 bytes) and #2 (5 bytes) to create a 10-byte result. + +### Type 3: Transformed + +Represents source text that was transformed (e.g., entity decoding, shortcode expansion) with explicit range mappings. + +```json +{"r": [0, 0, 0, 8, 0, 8], "t": 3, "d": [4, [[0, 4, 0, 4], [4, 8, 6, 10]]]} +``` + +- **`d`**: `[parent_id, range_mappings]` + - `parent_id`: Index of the parent source in the pool + - `range_mappings`: Array of `[from_start, from_end, to_start, to_end]` + - `from_start`, `from_end`: Range in the transformed text (this source) + - `to_start`, `to_end`: Corresponding range in the parent text + +**Example**: 8 bytes of transformed text derived from bytes 0-4 and 6-10 of source #4. 
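+
+### Resolving Positions
+
+To recover an absolute position in an original file, a consumer walks the pool from an entry back to its `Original` root, summing `Substring` offsets along the way. The sketch below is a minimal, hypothetical decoder: the `Mapping`, `PoolEntry`, and `resolve` names are illustrative and not part of any published API, it assumes the `"t"`/`"d"` fields have already been decoded into an enum, and `Concat`/`Transformed` resolution is elided.
+
+```rust
+/// Simplified mirror of a sourceInfoPool entry (Original/Substring only).
+#[derive(Clone, Copy)]
+enum Mapping {
+    Original { file_id: usize },
+    Substring { parent: usize, offset: usize },
+}
+
+struct PoolEntry {
+    start_offset: usize, // first element of "r" (start byte offset)
+    mapping: Mapping,    // decoded from "t" and "d"
+}
+
+/// Follow Substring links up to the Original file, summing "d" offsets.
+/// Returns (file_id, absolute start offset in that file).
+fn resolve(pool: &[PoolEntry], id: usize) -> (usize, usize) {
+    match pool[id].mapping {
+        Mapping::Original { file_id } => (file_id, pool[id].start_offset),
+        Mapping::Substring { parent, offset } => {
+            let (file_id, base) = resolve(pool, parent);
+            (file_id, base + offset)
+        }
+    }
+}
+
+fn main() {
+    // Entry 0: a whole file; entry 1: a substring extracted at byte 10 of it.
+    let pool = [
+        PoolEntry { start_offset: 0, mapping: Mapping::Original { file_id: 0 } },
+        PoolEntry { start_offset: 0, mapping: Mapping::Substring { parent: 0, offset: 10 } },
+    ];
+    assert_eq!(resolve(&pool, 1), (0, 10));
+}
+```
+
+For `Concat`, each piece would be resolved independently through its own `source_info_id`; for `Transformed`, the range mappings would translate a position into the parent's coordinates before recursing.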
+ +## Complete Example + +```json +{ + "pandoc-api-version": [1, 23, 1], + "meta": {}, + "blocks": [ + { + "t": "Para", + "s": 3, + "c": [ + {"t": "Str", "c": "Hello", "s": 0}, + {"t": "Space", "s": 1}, + {"t": "Str", "c": "world", "s": 2} + ] + } + ], + "astContext": { + "filenames": ["example.qmd"], + "sourceInfoPool": [ + {"r": [0, 0, 0, 5, 0, 5], "t": 0, "d": 0}, + {"r": [5, 0, 5, 6, 0, 6], "t": 0, "d": 0}, + {"r": [6, 0, 6, 11, 0, 11], "t": 0, "d": 0}, + {"r": [0, 0, 0, 11, 0, 11], "t": 2, "d": [[0, 0, 5], [1, 5, 1], [2, 6, 5]]} + ] + } +} +``` + +### Explanation + +- Pool entry 0: "Hello" at bytes 0-5 +- Pool entry 1: Space at byte 5-6 +- Pool entry 2: "world" at bytes 6-11 +- Pool entry 3: Concatenation of all three pieces +- The Para block references entry 3 (the full concatenated range) +- Each inline element references its individual piece + +## Pandoc compatibility + +For compatibility with tools expecting Pandoc JSON, either ignore the `"s"` fields and `astContext` section (that's what Pandoc will do) or remove them from the JSON object ahead of time. From d1aef425aad225dd71facd4847f4b8f1ebcca238 Mon Sep 17 00:00:00 2001 From: Carlos Scheidegger Date: Mon, 20 Oct 2025 17:18:34 -0500 Subject: [PATCH 2/2] remove bad section --- CLAUDE.md | 7 ------- 1 file changed, 7 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 6928df1..a35965d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -113,13 +113,6 @@ When fixing ANY bug: - `crates/tree-sitter-qmd`: tree-sitter grammars for block and inline parsers - `crates/wasm-qmd-parser`: A WASM module with some entry points from `crates/quarto-markdown-pandoc` -### `private-crates` - private crates we are not going to release yet - -- `private-crates/quarto-yaml-validation`: A library to validate YAML objects using schemas -- `private-crates/validate-yaml`: A binary to exercise `quarto-yaml-validation` -- `private-crates/quarto`: The future main entry point for the `quarto` command line binary. -- `private-crates/quarto-core`: supporting library for `quarto` - ## General Instructions - in this repository, "qmd" means "quarto markdown", the dialect of markdown we are developing. Although we aim to be largely compatible with Pandoc, discrepancies in the behavior might not be bugs.