Skip to content

Commit abcbdf7

Browse files
committed
fix abbreviation coalescing behavior
1 parent 64aed65 commit abcbdf7

File tree

6 files changed

+92
-6
lines changed

6 files changed

+92
-6
lines changed

crates/quarto-markdown-pandoc/src/pandoc/treesitter.rs

Lines changed: 87 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,84 @@ fn process_backslash_escape(
467467
PandocNativeIntermediate::IntermediateBaseText(content.to_string(), node_location(node))
468468
}
469469

470+
/// Reports whether `text` is, in its entirety, one of the known abbreviations.
///
/// The comparison is exact and case-sensitive: the token must include every
/// period shown in the table (`"Mr."` matches, `"Mr"` and `"mr."` do not), and
/// no surrounding whitespace or punctuation is tolerated.
///
/// The table is described by its author as Pandoc's default abbreviation list
/// (attributed to `Text.Pandoc.Parsing`).
/// NOTE(review): confirm the list is complete against the upstream Pandoc source.
fn is_abbreviation(text: &str) -> bool {
    const ABBREVIATIONS: &[&str] = &[
        // Titles and honorifics.
        "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
        "Gen.", "Gov.", "Sgt.", "St.",
        "Sen.", "Rep.", "Pres.", "Hon.", "Rev.",
        // Degrees.
        "Ph.D.", "M.D.", "M.A.",
        // Latin and comparison shorthands.
        "e.g.", "i.e.", "vs.", "cf.", "cp.",
        // Bibliographic locators.
        "vol.", "p.", "pp.", "ch.", "chap.", "sec.",
    ];

    ABBREVIATIONS.iter().any(|&known| known == text)
}
480+
481+
// Coalesce Str nodes that end with abbreviations with following words
482+
// This matches Pandoc's behavior of keeping abbreviations with the next word
483+
// Returns (result, did_coalesce) tuple
484+
fn coalesce_abbreviations(inlines: Vec<Inline>) -> (Vec<Inline>, bool) {
485+
let mut result: Vec<Inline> = Vec::new();
486+
let mut i = 0;
487+
let mut did_coalesce = false;
488+
489+
while i < inlines.len() {
490+
if let Inline::Str(ref str_inline) = inlines[i] {
491+
let mut current_text = str_inline.text.clone();
492+
let start_info = str_inline.source_info.clone();
493+
let mut end_info = str_inline.source_info.clone();
494+
let mut j = i + 1;
495+
496+
// Check if current text is an abbreviation
497+
if is_abbreviation(&current_text) {
498+
// Coalesce with following Space + Str until we hit a capital letter
499+
while j + 1 < inlines.len() {
500+
if let (Inline::Space(_), Inline::Str(next_str)) = (&inlines[j], &inlines[j + 1]) {
501+
// Stop before uppercase letters (potential sentence boundaries)
502+
if next_str.text.chars().next().map_or(false, |c| c.is_uppercase()) {
503+
break;
504+
}
505+
506+
// Coalesce
507+
current_text.push(' ');
508+
current_text.push_str(&next_str.text);
509+
end_info = next_str.source_info.clone();
510+
j += 2;
511+
did_coalesce = true;
512+
513+
// If this word is also an abbreviation, continue coalescing
514+
// Otherwise, stop after this word
515+
if !is_abbreviation(&next_str.text) {
516+
break;
517+
}
518+
} else {
519+
break;
520+
}
521+
}
522+
}
523+
524+
// Create the Str node (possibly coalesced)
525+
let source_info = if j > i + 1 {
526+
SourceInfo::with_range(Range {
527+
start: start_info.range.start.clone(),
528+
end: end_info.range.end.clone(),
529+
})
530+
} else {
531+
start_info
532+
};
533+
534+
result.push(Inline::Str(Str {
535+
text: current_text,
536+
source_info,
537+
}));
538+
i = j;
539+
} else {
540+
result.push(inlines[i].clone());
541+
i += 1;
542+
}
543+
}
544+
545+
(result, did_coalesce)
546+
}
547+
470548
fn process_paragraph(
471549
node: &tree_sitter::Node,
472550
children: Vec<(String, PandocNativeIntermediate)>,
@@ -2464,10 +2542,15 @@ fn merge_strs(pandoc: Pandoc) -> Pandoc {
24642542
source_info: current_source_info.unwrap_or_else(empty_source_info),
24652543
}));
24662544
}
2545+
2546+
// Apply abbreviation coalescing after merging strings
2547+
let (coalesced_result, did_coalesce) = coalesce_abbreviations(result);
2548+
did_merge = did_merge || did_coalesce;
2549+
24672550
if did_merge {
2468-
FilterResult(result, true)
2551+
FilterResult(coalesced_result, true)
24692552
} else {
2470-
Unchanged(result)
2553+
Unchanged(coalesced_result)
24712554
}
24722555
}),
24732556
)
@@ -2487,5 +2570,6 @@ pub fn treesitter_to_pandoc<T: Write>(
24872570
panic!("Expected Pandoc, got {:?}", result)
24882571
};
24892572
let result = desugar(pandoc)?;
2490-
Ok(merge_strs(result))
2573+
let result = merge_strs(result);
2574+
Ok(result)
24912575
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp.", Space, Str "33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap.", Space, Str "1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
1+
[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [Str "see", Space], citationSuffix = [Str ",", Space, Str "pp. 33-35"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }, Citation { citationId = "wickham2015", citationPrefix = [Space, Str "also", Space], citationSuffix = [Str ",", Space, Str "chap. 1"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [], citationSuffix = [Str ",", Space, Str "pp.", Space, Str "33-35,", Space, Str "38-39", Space, Str "and", Space, Str "passim"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
1+
[ Para [Str "Blah", Space, Str "Blah", Space, Cite [Citation { citationId = "knuth1984", citationPrefix = [], citationSuffix = [Str ",", Space, Str "pp. 33-35,", Space, Str "38-39", Space, Str "and", Space, Str "passim"], citationMode = NormalCitation, citationNoteNum = 1, citationHash = 0 }] []] ]
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p.", Space, Str "33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]
1+
[ Para [Cite [Citation { citationId = "smith04", citationPrefix = [], citationSuffix = [Str "p. 33"], citationMode = AuthorInText, citationNoteNum = 1, citationHash = 0 }] [Str "@smith04"]] ]
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
i think e.g. this is good? did 1--30 work? wait---really---did it?
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[ Para [Str "i", Space, Str "think", Space, Str "e.g. this", Space, Str "is", Space, Str "good?", Space, Str "did", Space, Str "1–30", Space, Str "work?", Space, Str "wait—really—did", Space, Str "it?"] ]

0 commit comments

Comments
 (0)