Skip to content

Commit 3869889

Browse files
committed
fix: Avoid ignoring trailing whitespace in lines
Ignoring trailing whitespace causes buggy HTML segmentation when plain text is extracted from formatted HTML with nesting.
1 parent d623839 commit 3869889

File tree

2 files changed

+2
-3
lines changed

2 files changed

+2
-3
lines changed

src/languages/language.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,9 +42,8 @@ pub trait Language {
4242
let mut boundaries = Vec::with_capacity(estimated_sentences);
4343

4444
// Split by paragraph breaks (one or more newlines with optional whitespace)
45-
let para_split_re = Regex::new(r"\n[\r\s]*\n").unwrap();
45+
let para_split_re = Regex::new(r"\n[\r]*\n").unwrap();
4646
let paragraphs: Vec<&str> = para_split_re.split(text).collect();
47-
4847
// Pre-calculate all paragraph offsets in one pass
4948
let mut paragraph_offsets = Vec::with_capacity(paragraphs.len());
5049
let mut current_offset = 0;

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
107107
let mut chunks = Vec::new();
108108

109109
// Split by paragraph breaks (one or more newlines with optional whitespace)
110-
let re = Regex::new(r"\n[\r\s]*\n").unwrap();
110+
let re = Regex::new(r"\n[\r]*\n").unwrap();
111111

112112
// Get paragraph parts and their positions
113113
let mut paragraphs = Vec::new();

0 commit comments

Comments
 (0)