Merge pull request #33 from oiwn/dev

oiwn · web-flow · commit 8b1c2347c6a5 · 2025-05-11T23:09:41.000+07:00
Improve CE scoring, fix a Unicode error, and reduce processing time.
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,6 @@
 /tmp
 /data
 *.profraw
-all_code.txt
+dom_content_extracton.txt
 .code
 .amc.toml
diff --git a/.tmuxp.yaml b/.tmuxp.yaml
@@ -11,12 +11,12 @@ windows:
     start-directory: ./
     panes:
       - shell_command:
-        - exa
+        - eza
   - window_name: srv
     start-directory: ./
     panes:
       - shell_command:
-        - exa --long
+        - eza --long
   - window_name: notes
     panes:
       - shell_command:
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "dom-content-extraction"
-version = "0.3.10"
+version = "0.3.11"
 
 description = "Rust implementation of Content extraction via text density paper"
 license = "MPL-2.0"
@@ -40,8 +40,8 @@ reqwest = { version = "0.12", features = ["blocking"], optional = true }
 tempfile = { version = "3.19", optional = true }
 url = { version = "2.5", optional = true }
 anyhow = { version = "1.0", optional = true }
-unicode-normalization = "0.1.24"
-unicode-segmentation = "1.12.0"
+unicode-normalization = "0.1"
+unicode-segmentation = "1.12"
 
 [dev-dependencies]
 criterion = "0.5"
diff --git a/README.md b/README.md
@@ -12,7 +12,9 @@ via Text Density (CETD) algorithm described in the paper by
 
 ## What Problem Does This Solve?
 
-Web pages often contain a lot of peripheral content like navigation menus, advertisements, footers, and sidebars. This makes it challenging to extract just the main content programmatically. This library helps solve this problem by:
+Web pages often contain a lot of peripheral content like navigation menus,
+advertisements, footers, and sidebars. This makes it challenging to extract just
+the main content programmatically. This library helps solve this problem by:
 
 - Analyzing the text density patterns in HTML documents
 - Identifying content-rich sections versus navigational/peripheral elements
@@ -47,7 +49,7 @@ This ensures accurate content extraction from web pages in any language, with pr
 
 ## Usage
 
-Due to "LazyLock" MSRV is 1.80
+The MSRV is 1.85 due to the 2024 edition. Living on the edge!
 
 Basic usage example:
 
diff --git a/examples/ce_score.rs b/examples/ce_score.rs
@@ -30,8 +30,11 @@ fn clean_and_normalize_text(text: &str) -> String {
 }
 
 fn extract_content_from_html(file_path: &Path) -> Result<String> {
-    let content = fs::read_to_string(file_path)
+    // let content = fs::read_to_string(file_path)
+    //     .with_context(|| format!("Failed to read file: {:?}", file_path))?;
+    let content = fs::read(file_path)
         .with_context(|| format!("Failed to read file: {:?}", file_path))?;
+    let content = String::from_utf8_lossy(&content).into_owned();
 
     let document = Html::parse_document(&content);
     let mut dtree = DensityTree::from_document(&document).unwrap();
@@ -42,8 +45,11 @@ fn extract_content_from_html(file_path: &Path) -> Result<String> {
 }
 
 fn clean_txt_file(file_path: &Path) -> Result<String> {
-    let content = fs::read_to_string(file_path)
+    // let content = fs::read_to_string(file_path)
+    //     .with_context(|| format!("Failed to read file: {:?}", file_path))?;
+    let content = fs::read(file_path)
         .with_context(|| format!("Failed to read file: {:?}", file_path))?;
+    let content = String::from_utf8_lossy(&content).into_owned();
 
     // Remove URL line from the top
     let content = content.lines().skip(1).collect::<Vec<&str>>().join("\n");
@@ -64,6 +70,48 @@ fn clean_txt_file(file_path: &Path) -> Result<String> {
 }
 
 fn calculate_lcs(s1: &str, s2: &str) -> usize {
+    // Tokenize on whitespace so the LCS is computed over words, not characters
+    let s1: Vec<&str> = s1.split_whitespace().collect();
+    let s2: Vec<&str> = s2.split_whitespace().collect();
+    let (m, n) = (s1.len(), s2.len());
+    let mut prev = vec![0; n + 1];
+    let mut curr = vec![0; n + 1];
+
+    for i in 1..=m {
+        for j in 1..=n {
+            if s1[i - 1] == s2[j - 1] {
+                curr[j] = prev[j - 1] + 1;
+            } else {
+                curr[j] = curr[j - 1].max(prev[j]);
+            }
+        }
+        std::mem::swap(&mut prev, &mut curr); // the row just filled becomes `prev`
+    }
+
+    // `prev[n]` is the LCS length in words; no shared words means a score of 0
+    let lcs_words = prev[n];
+    if lcs_words == 0 {
+        return 0;
+    }
+
+    // Average word length of each input, measured in UTF-8 bytes (`str::len`), not chars
+    let avg_word_len1 = if s1.is_empty() {
+        0.0
+    } else {
+        s1.iter().map(|w| w.len()).sum::<usize>() as f64 / s1.len() as f64
+    };
+    let avg_word_len2 = if s2.is_empty() {
+        0.0
+    } else {
+        s2.iter().map(|w| w.len()).sum::<usize>() as f64 / s2.len() as f64
+    };
+    let avg_word_len = (avg_word_len1 + avg_word_len2) / 2.0; // mean of the two averages
+
+    // Scale the word count to an approximate text length (+1 per word for a separating space)
+    (lcs_words as f64 * (avg_word_len + 1.0)) as usize
+}
+
+/* fn calculate_lcs(s1: &str, s2: &str) -> usize {
     let s1: Vec<char> = s1.chars().collect();
     let s2: Vec<char> = s2.chars().collect();
     let (m, n) = (s1.len(), s2.len());
@@ -82,7 +130,7 @@ fn calculate_lcs(s1: &str, s2: &str) -> usize {
     }
 
     prev[n]
-}
+} */
 
 fn process_file_pair(
     txt_path: &Path,
diff --git a/notes.org b/notes.org
@@ -2,7 +2,7 @@
 
 * Unicode documents handling
 ** TODO I think there are problems processing unicode.
-** TODO refactoring to make certain tags processing logic configurable.
+** DONE refactoring to make certain tags processing logic configurable.
 
 * Microtasks
 ** DONE Better split for CI/CD workflows