fix abbreviation coalescing behavior

cscheid · cscheid · commit abcbdf74edd7 · 2025-10-04T14:44:32.000-05:00
diff --git a/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs b/crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs
@@ -467,6 +467,84 @@ fn process_backslash_escape(
     PandocNativeIntermediate::IntermediateBaseText(content.to_string(), node_location(node))
 }
 
+/// Returns true when `text` is exactly one of Pandoc's default abbreviations
+/// (list taken from Text.Pandoc.Parsing in the pandoc source); case-sensitive.
+fn is_abbreviation(text: &str) -> bool {
+    const ABBREVIATIONS: &[&str] = &[
+        "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", "Gen.", "Gov.", "e.g.", "i.e.",
+        "Sgt.", "St.", "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", "Rev.", "Ph.D.",
+        "M.D.", "M.A.", "p.", "pp.", "ch.", "chap.", "sec.", "cf.", "cp.",
+    ];
+    ABBREVIATIONS.contains(&text)
+}
+
+/// Merges each abbreviation `Str` with the word that follows it.
+///
+/// When a `Str` whose text satisfies `is_abbreviation` is followed by a `Space` and
+/// another `Str`, the three are replaced by a single `Str` whose text is the two texts
+/// joined by one ASCII space and whose source range runs from the start of the
+/// abbreviation to the end of the absorbed word. If the absorbed word is itself an
+/// abbreviation, the word after it is considered as well. A word that starts with an
+/// uppercase letter is never absorbed (it may begin a new sentence). All other inlines
+/// pass through unchanged and in order.
+/// For example `[Str "e.g.", Space, Str "this"]` becomes `[Str "e.g. this"]`.
+///
+/// Returns `(result, did_coalesce)`; `did_coalesce` is true if any merge happened.
+fn coalesce_abbreviations(inlines: Vec<Inline>) -> (Vec<Inline>, bool) {
+    let mut result: Vec<Inline> = Vec::with_capacity(inlines.len());
+    // The input is owned, so drain it from the front instead of cloning every element.
+    // A deque allows the two-element lookahead (Space, Str) needed before consuming.
+    let mut pending = std::collections::VecDeque::from(inlines);
+    let mut did_coalesce = false;
+
+    while let Some(inline) = pending.pop_front() {
+        let mut head = match inline {
+            Inline::Str(head) => head,
+            other => {
+                result.push(other);
+                continue;
+            }
+        };
+
+        // End of the last absorbed word; stays `None` when nothing was merged.
+        let mut merged_end = None;
+        let mut absorbing = is_abbreviation(&head.text);
+        while absorbing {
+            // Only a Space followed by a Str that does not start uppercase is absorbed.
+            let can_absorb = matches!(
+                (pending.front(), pending.get(1)),
+                (Some(Inline::Space(_)), Some(Inline::Str(word)))
+                    if !word.text.chars().next().map_or(false, |c| c.is_uppercase())
+            );
+            if !can_absorb {
+                break;
+            }
+
+            pending.pop_front(); // drop the Space; it becomes part of the merged text
+            if let Some(Inline::Str(word)) = pending.pop_front() {
+                head.text.push(' ');
+                head.text.push_str(&word.text);
+                merged_end = Some(word.source_info.range.end.clone());
+                did_coalesce = true;
+                // Keep going only while the word just absorbed is also an abbreviation.
+                absorbing = is_abbreviation(&word.text);
+            }
+        }
+
+        // Widen the source range only if something was merged.
+        if let Some(end) = merged_end {
+            head.source_info = SourceInfo::with_range(Range {
+                start: head.source_info.range.start.clone(),
+                end,
+            });
+        }
+        result.push(Inline::Str(head));
+    }
+
+    (result, did_coalesce)
+}
+
 fn process_paragraph(
     node: &tree_sitter::Node,
     children: Vec<(String, PandocNativeIntermediate)>,
@@ -2464,10 +2542,15 @@ fn merge_strs(pandoc: Pandoc) -> Pandoc {
                     source_info: current_source_info.unwrap_or_else(empty_source_info),
                 }));
             }
+
+            // Apply abbreviation coalescing after merging strings
+            let (coalesced_result, did_coalesce) = coalesce_abbreviations(result);
+            did_merge = did_merge || did_coalesce;
+
             if did_merge {
-                FilterResult(result, true)
+                FilterResult(coalesced_result, true)
             } else {
-                Unchanged(result)
+                Unchanged(coalesced_result)
             }
         }),
     )
@@ -2487,5 +2570,6 @@ pub fn treesitter_to_pandoc<T: Write>(
         panic!("Expected Pandoc, got {:?}", result)
     };
     let result = desugar(pandoc)?;
-    Ok(merge_strs(result))
+    let result = merge_strs(result);
+    Ok(result)
 }
diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/002.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/002.qmd.snapshot
@@ -1 +1 @@
-[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp.", Space, Str "33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap.", Space, Str "1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
+[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp. 33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap. 1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/003.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/003.qmd.snapshot
@@ -1 +1 @@
-[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [], citationSuffix = [Str ",", Space, Str "pp.", Space, Str "33-35,", Space, Str "38-39", Space, Str "and", Space, Str "passim"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
+[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [], citationSuffix = [Str ",", Space, Str "pp. 33-35,", Space, Str "38-39", Space, Str "and", Space, Str "passim"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/006.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/006.qmd.snapshot
@@ -1 +1 @@
-[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p.", Space, Str "33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]
+[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p. 33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]
diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/014.qmd b/crates/quarto-markdown-pandoc/tests/snapshots/native/014.qmd
@@ -0,0 +1 @@
+i think e.g. this is good? did 1--30 work? wait---really---did it?
diff --git a/crates/quarto-markdown-pandoc/tests/snapshots/native/014.qmd.snapshot b/crates/quarto-markdown-pandoc/tests/snapshots/native/014.qmd.snapshot
@@ -0,0 +1 @@
+[ Para [Str "i", Space, Str "think", Space, Str "e.g. this", Space, Str "is", Space, Str "good?", Space, Str "did", Space, Str "1–30", Space, Str "work?", Space, Str "wait—really—did", Space, Str "it?"] ]

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp.", Space, Str "33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap.", Space, Str "1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]`
	`1`	`+[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp. 33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap. 1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p.", Space, Str "33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]`
	`1`	`+[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p. 33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+i think e.g. this is good? did 1--30 work? wait---really---did it?`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+[ Para [Str "i", Space, Str "think", Space, Str "e.g. this", Space, Str "is", Space, Str "good?", Space, Str "did", Space, Str "1–30", Space, Str "work?", Space, Str "wait—really—did", Space, Str "it?"] ]`