
Commit e8ed98f

Merge pull request #38 from wikimedia/boundary-follow-up
Boundary follow up
2 parents (9aa1e1f + 6b97fa4) · commit e8ed98f

File tree: 2 files changed (+175, -2 lines)

src/languages/language.rs

Lines changed: 14 additions & 2 deletions
@@ -89,6 +89,15 @@ pub trait Language {
         let paragraphs: Vec<&str> = para_split_re.split(text).collect();

         // Pre-calculate all paragraph offsets in one pass
+        // CRITICAL: We track both byte offsets AND character offsets separately.
+        // This is essential for correct handling of multi-byte UTF-8 characters (e.g., CJK, emoji).
+        //
+        // - `paragraph_offsets`: byte indices into the original text (for slicing with &text[start..end])
+        // - `paragraph_char_offsets`: character counts (for SentenceBoundary.start_index/end_index)
+        //
+        // Example: "日本語" is 3 characters but 9 bytes in UTF-8:
+        //   - byte offset: 0..9
+        //   - char offset: 0..3
         let mut paragraph_offsets = Vec::with_capacity(paragraphs.len());
         let mut current_offset = 0;
         let mut paragraph_char_offsets = Vec::with_capacity(paragraphs.len());
@@ -99,8 +108,8 @@ pub trait Language {
             current_offset += paragraph.len();
             current_char_offset += paragraph.chars().count();
             if i < paragraphs.len() - 1 {
-                current_offset += 2; // for "\n\n"
-                current_char_offset += 2;
+                current_offset += 2; // for "\n\n" bytes
+                current_char_offset += 2; // for "\n\n" chars (always 2, regardless of encoding)
             }
         }

@@ -213,6 +222,9 @@ pub trait Language {
         let start_byte = paragraph_start_offset + start;
         let end_byte = paragraph_start_offset + end;

+        // Calculate character index from paragraph char offset + character count within paragraph
+        // This is crucial: start_index is NOT the same as start_byte when UTF-8 multi-byte
+        // characters are present (e.g., CJK or emoji).
         let start_index = paragraph_start_char_offset
             + paragraph[..paragraph.floor_char_boundary(start)]
                 .chars()
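
The byte-vs-character bookkeeping described in the new comment can be demonstrated with plain std Rust. The sketch below is illustrative only and is not part of the commit: it splits on a literal "\n\n" (the real code uses the para_split_re regex) and accumulates both kinds of offsets the same way the loop above does, so the divergence caused by multi-byte UTF-8 is easy to see.

// Standalone sketch (not from this commit): track byte and character offsets
// per paragraph, mirroring the bookkeeping in language.rs.
fn main() {
    let text = "日本語です。\n\n中文文章。";
    let paragraphs: Vec<&str> = text.split("\n\n").collect();

    let mut byte_offset = 0; // index into &text[..], used for slicing
    let mut char_offset = 0; // count of chars, used for user-facing indices
    for (i, paragraph) in paragraphs.iter().enumerate() {
        println!(
            "paragraph {i}: starts at byte {byte_offset}, char {char_offset} \
             ({} bytes, {} chars)",
            paragraph.len(),
            paragraph.chars().count()
        );
        byte_offset += paragraph.len();           // bytes: "日本語です。" adds 18
        char_offset += paragraph.chars().count(); // chars: "日本語です。" adds 6
        if i < paragraphs.len() - 1 {
            byte_offset += 2; // "\n\n" is 2 bytes...
            char_offset += 2; // ...and also 2 chars
        }
    }
    assert_eq!(byte_offset, text.len());           // 35 bytes in total
    assert_eq!(char_offset, text.chars().count()); // 13 chars in total
}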

src/lib.rs

Lines changed: 161 additions & 0 deletions
@@ -397,4 +397,165 @@ mod tests {
             paragraph_breaks.len()
         );
     }
+
+    #[test]
+    fn test_get_sentence_boundaries_with_multibyte_cjk() {
+        // Test with Japanese and Chinese characters (multi-byte UTF-8)
+        let text = "日本語です。\n\n中文文章。";
+        let boundaries = get_sentence_boundaries("en", text);
+
+        // Should have sentences and paragraph break
+        assert!(
+            boundaries.len() >= 2,
+            "Expected at least 2 boundaries, got {}",
+            boundaries.len()
+        );
+
+        // Verify indices don't overlap and are monotonically increasing
+        for i in 1..boundaries.len() {
+            assert!(
+                boundaries[i].start_index >= boundaries[i - 1].end_index,
+                "Boundary {} starts at {} but previous ends at {}",
+                i,
+                boundaries[i].start_index,
+                boundaries[i - 1].end_index
+            );
+        }
+
+        // Verify text can be reconstructed (most important for multi-byte UTF-8)
+        let reconstructed: String = boundaries.iter().map(|b| b.text).collect();
+        assert_eq!(
+            reconstructed, text,
+            "Reconstructed text doesn't match original.\nOriginal: {:?}\nReconstructed: {:?}",
+            text, reconstructed
+        );
+    }
+
+    #[test]
+    fn test_get_sentence_boundaries_with_emoji() {
+        // Test with emoji characters (4-byte UTF-8)
+        let text = "Hello world 👋.\n\nGoodbye 👋.";
+        let boundaries = get_sentence_boundaries("en", text);
+
+        // Should have sentences and paragraph break
+        assert!(
+            boundaries.len() >= 2,
+            "Expected at least 2 boundaries, got {}",
+            boundaries.len()
+        );
+
+        // Verify text can be reconstructed (critical for emoji)
+        let reconstructed: String = boundaries.iter().map(|b| b.text).collect();
+        assert_eq!(
+            reconstructed, text,
+            "Reconstructed text doesn't match original with emojis"
+        );
+
+        // Verify boundary indices are valid
+        for boundary in &boundaries {
+            assert!(
+                boundary.start_index <= boundary.end_index,
+                "Invalid boundary: start_index {} > end_index {}",
+                boundary.start_index,
+                boundary.end_index
+            );
+        }
+    }
+
+    #[test]
+    fn test_get_sentence_boundaries_with_mixed_scripts() {
+        // Test with mixed ASCII, Latin extended, and CJK
+        let text = "English text. Café résumé.\n\n日本語のテキスト。";
+        let boundaries = get_sentence_boundaries("en", text);
+
+        // Verify text reconstruction
+        let reconstructed: String = boundaries.iter().map(|b| b.text).collect();
+        assert_eq!(
+            reconstructed, text,
+            "Mixed script text failed reconstruction"
+        );
+
+        // Verify indices are properly ordered
+        for i in 1..boundaries.len() {
+            assert!(
+                boundaries[i].start_index >= boundaries[i - 1].end_index,
+                "Boundaries not ordered correctly at index {}",
+                i
+            );
+        }
+
+        // Verify all boundaries have valid indices
+        for (i, boundary) in boundaries.iter().enumerate() {
+            assert!(
+                boundary.start_index <= boundary.end_index,
+                "Boundary {} has invalid indices: {} > {}",
+                i,
+                boundary.start_index,
+                boundary.end_index
+            );
+        }
+    }
+
+    #[test]
+    fn test_get_sentence_boundaries_character_vs_byte_offsets() {
+        // This test specifically validates that character indices (start_index/end_index)
+        // are correctly distinguished from byte offsets (start_byte/end_byte)
+        let text = "Short. 日本語.";
+        let boundaries = get_sentence_boundaries("en", text);
+
+        // Count actual characters in original text
+        let total_chars = text.chars().count();
+        let total_bytes = text.len();
+
+        // Verify that the character indices sum correctly:
+        // all boundaries together should cover all characters
+        let mut last_char_end = 0;
+        for boundary in &boundaries {
+            // Each boundary should start where previous ended (no gaps)
+            assert_eq!(
+                boundary.start_index, last_char_end,
+                "Gap in character indices: expected start {}, got {}",
+                last_char_end, boundary.start_index
+            );
+            last_char_end = boundary.end_index;
+        }
+        assert_eq!(
+            last_char_end, total_chars,
+            "Character coverage mismatch: ended at {}, total chars = {}",
+            last_char_end, total_chars
+        );
+
+        // Similarly verify byte offsets
+        let mut last_byte_end = 0;
+        for boundary in &boundaries {
+            assert_eq!(
+                boundary.start_byte, last_byte_end,
+                "Gap in byte indices: expected start {}, got {}",
+                last_byte_end, boundary.start_byte
+            );
+            last_byte_end = boundary.end_byte;
+        }
+        assert_eq!(
+            last_byte_end, total_bytes,
+            "Byte coverage mismatch: ended at {}, total bytes = {}",
+            last_byte_end, total_bytes
+        );
+    }
+
+    #[test]
+    fn test_segment_with_multibyte_characters() {
+        // Test basic segment function with multi-byte characters
+        let text = "日本語です。中文文章。";
+        let sentences = segment("en", text);
+
+        // Should segment into sentences
+        assert!(sentences.len() > 0, "Should find at least one sentence");
+
+        // Verify reconstruction
+        let reconstructed: String = sentences.join("");
+        assert_eq!(
+            reconstructed, text,
+            "Segment reconstruction failed for multi-byte text"
+        );
+    }
 }
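
The new tests rely on the invariant that start_byte..end_byte slices the original string while start_index..end_index counts characters. The standalone sketch below is not part of the commit and does not depend on the crate; it shows how a consumer holding only character indices (like SentenceBoundary.start_index/end_index) could map them back to a byte range with char_indices in order to slice the original text.

// Standalone sketch (not from this commit): convert a character-index range
// back into a byte range that can safely slice the original &str.
fn char_range_to_byte_range(text: &str, start_char: usize, end_char: usize) -> (usize, usize) {
    // Byte position of the char at `char_idx`, or text.len() when the index
    // is one past the final character.
    let byte_at = |char_idx: usize| {
        text.char_indices()
            .nth(char_idx)
            .map(|(byte_idx, _)| byte_idx)
            .unwrap_or(text.len())
    };
    (byte_at(start_char), byte_at(end_char))
}

fn main() {
    let text = "Short. 日本語.";
    // The first 7 characters are "Short. "; characters 7..10 are "日本語".
    let (start_byte, end_byte) = char_range_to_byte_range(text, 7, 10);
    assert_eq!(&text[start_byte..end_byte], "日本語");
    // The same span measured two ways: 3 characters but 9 bytes.
    assert_eq!(10 - 7, 3);
    assert_eq!(end_byte - start_byte, 9);
    println!("chars 7..10 -> bytes {start_byte}..{end_byte}");
}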
