@@ -467,6 +467,84 @@ fn process_backslash_escape(
467467 PandocNativeIntermediate :: IntermediateBaseText ( content. to_string ( ) , node_location ( node) )
468468}
469469
/// Reports whether `text` is exactly one of the recognized abbreviations.
///
/// The comparison is case-sensitive and covers the whole string, trailing
/// period included: `"Mr."` is recognized, while `"Mr"`, `"mr."` and
/// `"Mr. "` are not.
///
/// NOTE(review): the list is meant to follow Pandoc's default abbreviations;
/// confirm it against the Pandoc sources before relying on parity.
fn is_abbreviation(text: &str) -> bool {
    // Titles, ranks and honorifics first, then Latin/editorial shorthands.
    const ABBREVIATIONS: &[&str] = &[
        "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.",
        "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
        "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.",
        "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
        "ch.", "chap.", "sec.", "cf.", "cp.",
    ];

    ABBREVIATIONS.iter().any(|&known| known == text)
}
480+
481+ // Coalesce Str nodes that end with abbreviations with following words
482+ // This matches Pandoc's behavior of keeping abbreviations with the next word
483+ // Returns (result, did_coalesce) tuple
484+ fn coalesce_abbreviations ( inlines : Vec < Inline > ) -> ( Vec < Inline > , bool ) {
485+ let mut result: Vec < Inline > = Vec :: new ( ) ;
486+ let mut i = 0 ;
487+ let mut did_coalesce = false ;
488+
489+ while i < inlines. len ( ) {
490+ if let Inline :: Str ( ref str_inline) = inlines[ i] {
491+ let mut current_text = str_inline. text . clone ( ) ;
492+ let start_info = str_inline. source_info . clone ( ) ;
493+ let mut end_info = str_inline. source_info . clone ( ) ;
494+ let mut j = i + 1 ;
495+
496+ // Check if current text is an abbreviation
497+ if is_abbreviation ( & current_text) {
498+ // Coalesce with following Space + Str until we hit a capital letter
499+ while j + 1 < inlines. len ( ) {
500+ if let ( Inline :: Space ( _) , Inline :: Str ( next_str) ) = ( & inlines[ j] , & inlines[ j + 1 ] ) {
501+ // Stop before uppercase letters (potential sentence boundaries)
502+ if next_str. text . chars ( ) . next ( ) . map_or ( false , |c| c. is_uppercase ( ) ) {
503+ break ;
504+ }
505+
506+ // Coalesce
507+ current_text. push ( ' ' ) ;
508+ current_text. push_str ( & next_str. text ) ;
509+ end_info = next_str. source_info . clone ( ) ;
510+ j += 2 ;
511+ did_coalesce = true ;
512+
513+ // If this word is also an abbreviation, continue coalescing
514+ // Otherwise, stop after this word
515+ if !is_abbreviation ( & next_str. text ) {
516+ break ;
517+ }
518+ } else {
519+ break ;
520+ }
521+ }
522+ }
523+
524+ // Create the Str node (possibly coalesced)
525+ let source_info = if j > i + 1 {
526+ SourceInfo :: with_range ( Range {
527+ start : start_info. range . start . clone ( ) ,
528+ end : end_info. range . end . clone ( ) ,
529+ } )
530+ } else {
531+ start_info
532+ } ;
533+
534+ result. push ( Inline :: Str ( Str {
535+ text : current_text,
536+ source_info,
537+ } ) ) ;
538+ i = j;
539+ } else {
540+ result. push ( inlines[ i] . clone ( ) ) ;
541+ i += 1 ;
542+ }
543+ }
544+
545+ ( result, did_coalesce)
546+ }
547+
470548fn process_paragraph (
471549 node : & tree_sitter:: Node ,
472550 children : Vec < ( String , PandocNativeIntermediate ) > ,
@@ -2464,10 +2542,15 @@ fn merge_strs(pandoc: Pandoc) -> Pandoc {
24642542 source_info : current_source_info. unwrap_or_else ( empty_source_info) ,
24652543 } ) ) ;
24662544 }
2545+
2546+ // Apply abbreviation coalescing after merging strings
2547+ let ( coalesced_result, did_coalesce) = coalesce_abbreviations ( result) ;
2548+ did_merge = did_merge || did_coalesce;
2549+
24672550 if did_merge {
2468- FilterResult ( result , true )
2551+ FilterResult ( coalesced_result , true )
24692552 } else {
2470- Unchanged ( result )
2553+ Unchanged ( coalesced_result )
24712554 }
24722555 } ) ,
24732556 )
@@ -2487,5 +2570,6 @@ pub fn treesitter_to_pandoc<T: Write>(
24872570 panic ! ( "Expected Pandoc, got {:?}" , result)
24882571 } ;
24892572 let result = desugar ( pandoc) ?;
2490- Ok ( merge_strs ( result) )
2573+ let result = merge_strs ( result) ;
2574+ Ok ( result)
24912575}
0 commit comments