Skip to content

Commit 5c19014

Browse files
committed
more abbreviation merging fixes
1 parent 0ea1576 commit 5c19014

File tree

4 files changed

+38
-61
lines changed

4 files changed

+38
-61
lines changed

crates/quarto-markdown-pandoc/src/pandoc/treesitter_utils/postprocess.rs

Lines changed: 35 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -49,73 +49,47 @@ pub fn trim_inlines(inlines: Inlines) -> (Inlines, bool) {
4949
(result, changed)
5050
}
5151

52+
/// List of known abbreviations
///
/// Every entry is spelled with its trailing period, and entries are compared
/// case-sensitively by the helpers in this file: `is_abbreviation` requires
/// the whole string to equal an entry, while `ends_with_abbreviation` looks
/// for an entry at the end of a longer string.
53+
const ABBREVIATIONS: &[&str] = &[
54+
"Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.",
55+
"vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.",
56+
"ch.", "chap.", "sec.", "cf.", "cp.",
57+
];
58+
5259
/// Check if a text string is a known abbreviation
///
/// Returns `true` only when `text` is exactly equal to one of the known
/// abbreviations, trailing period included. The comparison is case-sensitive
/// and does not look for an abbreviation inside a longer string.
5360
fn is_abbreviation(text: &str) -> bool {
54-
matches!(
55-
text,
56-
"Mr."
57-
| "Mrs."
58-
| "Ms."
59-
| "Capt."
60-
| "Dr."
61-
| "Prof."
62-
| "Gen."
63-
| "Gov."
64-
| "e.g."
65-
| "i.e."
66-
| "Sgt."
67-
| "St."
68-
| "vol."
69-
| "vs."
70-
| "Sen."
71-
| "Rep."
72-
| "Pres."
73-
| "Hon."
74-
| "Rev."
75-
| "Ph.D."
76-
| "M.D."
77-
| "M.A."
78-
| "p."
79-
| "pp."
80-
| "ch."
81-
| "chap."
82-
| "sec."
83-
| "cf."
84-
| "cp."
85-
)
61+
ABBREVIATIONS.contains(&text)
62+
}
63+
64+
/// Check if `text` ends with `abbrev` AND has a valid word boundary before it.
///
/// A valid boundary means the abbreviation is either the entire string, or is
/// immediately preceded by a non-alphanumeric character (punctuation and
/// whitespace are OK, letters and digits are not). This keeps a word that
/// merely ends in the same letters (`"branch."` vs. `"ch."`) from being
/// treated as an abbreviation.
///
/// Returns `false` when `text` does not end with `abbrev` at all.
fn has_valid_abbrev_boundary(text: &str, abbrev: &str) -> bool {
    // `strip_suffix` performs the suffix test and hands back the prefix in one
    // step, so no manual byte-offset slicing is needed.
    match text.strip_suffix(abbrev) {
        // An empty prefix (abbreviation is the whole string) ends with no
        // character at all, so the negated test yields `true` for it.
        Some(prefix) => !prefix.ends_with(char::is_alphanumeric),
        None => false,
    }
}
8787

8888
/// Check if a text string ends with a known abbreviation
///
/// The abbreviation must sit at a word boundary: it is either the whole of
/// `text` or is immediately preceded by a non-alphanumeric character, as
/// decided by `has_valid_abbrev_boundary`. A word that merely ends in the same
/// letters (for example `"branch."` against `"ch."`) therefore does not count.
/// The plain `ends_with` chain this replaces had no such boundary check.
8989
fn ends_with_abbreviation(text: &str) -> bool {
90-
text.ends_with("Mr.")
91-
|| text.ends_with("Mrs.")
92-
|| text.ends_with("Ms.")
93-
|| text.ends_with("Capt.")
94-
|| text.ends_with("Dr.")
95-
|| text.ends_with("Prof.")
96-
|| text.ends_with("Gen.")
97-
|| text.ends_with("Gov.")
98-
|| text.ends_with("e.g.")
99-
|| text.ends_with("i.e.")
100-
|| text.ends_with("Sgt.")
101-
|| text.ends_with("St.")
102-
|| text.ends_with("vol.")
103-
|| text.ends_with("vs.")
104-
|| text.ends_with("Sen.")
105-
|| text.ends_with("Rep.")
106-
|| text.ends_with("Pres.")
107-
|| text.ends_with("Hon.")
108-
|| text.ends_with("Rev.")
109-
|| text.ends_with("Ph.D.")
110-
|| text.ends_with("M.D.")
111-
|| text.ends_with("M.A.")
112-
|| text.ends_with("p.")
113-
|| text.ends_with("pp.")
114-
|| text.ends_with("ch.")
115-
|| text.ends_with("chap.")
116-
|| text.ends_with("sec.")
117-
|| text.ends_with("cf.")
118-
|| text.ends_with("cp.")
90+
ABBREVIATIONS
91+
.iter()
92+
.any(|abbrev| has_valid_abbrev_boundary(text, abbrev))
11993
}
12094

12195
/// Coalesce Str nodes that end with abbreviations with following words
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
This is a branch. 3 things
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
See (ch. 3)
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
See ch. top. 3

0 commit comments

Comments
 (0)