fix: Avoid ignoring trailing whitespace in lines

santhoshtr · santhoshtr · commit 3869889e97ba · 2025-10-30T14:34:05.000+05:30
Ignoring it causes buggy HTML segmentation when plain text is extracted
from formatted HTML with nesting.
diff --git a/src/languages/language.rs b/src/languages/language.rs
@@ -42,9 +42,8 @@ pub trait Language {
         let mut boundaries = Vec::with_capacity(estimated_sentences);
 
         // Split by paragraph breaks (one or more newlines with optional whitespace)
-        let para_split_re = Regex::new(r"\n[\r\s]*\n").unwrap();
+        let para_split_re = Regex::new(r"\n[\r]*\n").unwrap();
         let paragraphs: Vec<&str> = para_split_re.split(text).collect();
-
         // Pre-calculate all paragraph offsets in one pass
         let mut paragraph_offsets = Vec::with_capacity(paragraphs.len());
         let mut current_offset = 0;
diff --git a/src/lib.rs b/src/lib.rs
@@ -107,7 +107,7 @@ fn chunk_text(text: &str, chunk_size: usize) -> Vec<&str> {
     let mut chunks = Vec::new();
 
     // Split by paragraph breaks (one or more newlines with optional whitespace)
-    let re = Regex::new(r"\n[\r\s]*\n").unwrap();
+    let re = Regex::new(r"\n[\r]*\n").unwrap();
 
     // Get paragraph parts and their positions
     let mut paragraphs = Vec::new();