Skip to content

Commit 74b7839

Browse files
authored
Merge pull request #40 from wikimedia/fix/boundary-symbol-detection
Fix boundary_symbol detection when followed by whitespace
2 parents ebdc8ca + ad9462f commit 74b7839

File tree

2 files changed

+148
-3
lines changed

2 files changed

+148
-3
lines changed

src/languages/language.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,19 @@ pub trait Language {
203203

204204
let sentence_text = &paragraph[start..end];
205205
let boundary_symbol = if end > 0 && end <= paragraph.len() {
206-
// Use char_indices for more efficient character iteration
207-
paragraph[..end]
206+
// Trim trailing whitespace before looking for the boundary symbol.
207+
// This fixes the issue where boundary symbols are not detected when
208+
// followed by whitespace (e.g., "Hello. " should detect "." as symbol).
209+
let sentence_slice = &paragraph[..end];
210+
let trimmed_slice = sentence_slice.trim_end();
211+
212+
// Use char_indices for more efficient character iteration on the trimmed slice
213+
trimmed_slice
208214
.char_indices()
209215
.next_back()
210216
.and_then(|(idx, _)| {
211-
let char_str = &paragraph[idx..end];
217+
// Extract the last character from the trimmed slice
218+
let char_str = &trimmed_slice[idx..];
212219
if GLOBAL_SENTENCE_TERMINATORS.contains(&char_str) {
213220
Some(char_str.to_string())
214221
} else {

src/lib.rs

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -558,4 +558,142 @@ mod tests {
558558
"Segment reconstruction failed for multi-byte text"
559559
);
560560
}
561+
562+
#[test]
563+
fn test_boundary_symbol_detection_with_trailing_space() {
564+
// This test reproduces the bug from issue #35:
565+
// boundary_symbols are not detected if space follows boundary_symbol
566+
let text = "Hello world. This is a test.Another test. And another test.";
567+
let boundaries = get_sentence_boundaries("en", text);
568+
569+
// Verify we have at least 4 boundaries (4 sentences)
570+
assert!(
571+
boundaries.len() >= 4,
572+
"Expected at least 4 boundaries, got {}",
573+
boundaries.len()
574+
);
575+
576+
// Check each boundary has the correct boundary_symbol
577+
let non_paragraph: Vec<_> = boundaries
578+
.iter()
579+
.filter(|b| !b.is_paragraph_break)
580+
.collect();
581+
582+
// All 4 sentences should have period as boundary symbol
583+
for (i, boundary) in non_paragraph.iter().enumerate() {
584+
assert!(
585+
boundary.boundary_symbol.is_some(),
586+
"Boundary {} should have a boundary_symbol, got None",
587+
i
588+
);
589+
assert_eq!(
590+
boundary.boundary_symbol.as_deref(),
591+
Some("."),
592+
"Boundary {} should have period as symbol",
593+
i
594+
);
595+
}
596+
}
597+
598+
#[test]
599+
fn test_boundary_symbol_with_multiple_trailing_spaces() {
600+
// Test that multiple trailing spaces don't prevent symbol detection
601+
let text = "Hello. This is another. Yet another.";
602+
let boundaries = get_sentence_boundaries("en", text);
603+
604+
let non_paragraph = boundaries
605+
.iter()
606+
.filter(|b| !b.is_paragraph_break)
607+
.collect::<Vec<_>>();
608+
609+
// All sentence boundaries should have period symbol
610+
for (i, boundary) in non_paragraph.iter().enumerate() {
611+
assert_eq!(
612+
boundary.boundary_symbol.as_deref(),
613+
Some("."),
614+
"Boundary {} should have period symbol despite trailing spaces",
615+
i
616+
);
617+
}
618+
}
619+
620+
#[test]
621+
fn test_boundary_symbol_with_mixed_terminators() {
622+
// Test various sentence terminators with trailing spaces
623+
let text = "Hello! How are you? I'm fine. Yes.";
624+
let boundaries = get_sentence_boundaries("en", text);
625+
626+
let non_paragraph = boundaries
627+
.iter()
628+
.filter(|b| !b.is_paragraph_break)
629+
.collect::<Vec<_>>();
630+
631+
// Verify we detect the different terminators
632+
let symbols: Vec<_> = non_paragraph
633+
.iter()
634+
.filter_map(|b| b.boundary_symbol.as_deref())
635+
.collect();
636+
637+
assert!(
638+
symbols.iter().any(|&s| s == "!"),
639+
"Should detect exclamation mark"
640+
);
641+
assert!(
642+
symbols.iter().any(|&s| s == "?"),
643+
"Should detect question mark"
644+
);
645+
assert!(symbols.iter().any(|&s| s == "."), "Should detect period");
646+
}
647+
648+
#[test]
649+
fn test_boundary_symbol_with_cjk_terminator() {
650+
// Test with CJK full stop (。) which is a valid sentence terminator
651+
let text = "日本語です。中文です。";
652+
let boundaries = get_sentence_boundaries("en", text);
653+
654+
let non_paragraph = boundaries
655+
.iter()
656+
.filter(|b| !b.is_paragraph_break)
657+
.collect::<Vec<_>>();
658+
659+
// Should detect CJK terminators
660+
let with_cjk_stop = non_paragraph
661+
.iter()
662+
.filter(|b| b.boundary_symbol.as_deref() == Some("。"))
663+
.count();
664+
665+
assert!(
666+
with_cjk_stop >= 1,
667+
"Should detect at least one CJK full stop, got {}",
668+
with_cjk_stop
669+
);
670+
}
671+
672+
#[test]
673+
fn test_boundary_symbol_with_tabs_and_spaces() {
674+
// Test with mixed whitespace (tabs and spaces)
675+
let text = "First sentence.\t\n Second sentence. Third one!";
676+
let boundaries = get_sentence_boundaries("en", text);
677+
678+
let non_paragraph = boundaries
679+
.iter()
680+
.filter(|b| !b.is_paragraph_break)
681+
.collect::<Vec<_>>();
682+
683+
// All should have boundary symbols
684+
for (i, boundary) in non_paragraph.iter().enumerate() {
685+
assert!(
686+
boundary.boundary_symbol.is_some(),
687+
"Boundary {} should have boundary_symbol despite mixed whitespace",
688+
i
689+
);
690+
}
691+
692+
// Verify reconstruction still works
693+
let reconstructed: String = boundaries.iter().map(|b| b.text).collect();
694+
assert_eq!(
695+
reconstructed, text,
696+
"Text reconstruction failed with mixed whitespace"
697+
);
698+
}
561699
}

0 commit comments

Comments
 (0)