Skip to content

Commit 5cc523f

Browse files
authored
Merge pull request #32 from oiwn/dev
add unicode support
2 parents 3ec387a + f60d6ea commit 5cc523f

File tree

11 files changed

+295
-95
lines changed

11 files changed

+295
-95
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
fail-fast: false
7777
matrix:
7878
os: [ubuntu-latest, windows-latest, macos-latest]
79-
toolchain: ["1.80", "stable"]
79+
toolchain: ["1.85", "stable"]
8080
runs-on: ${{ matrix.os }}
8181
steps:
8282
- uses: actions/checkout@v4

Cargo.toml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "dom-content-extraction"
3-
version = "0.3.9"
3+
version = "0.3.10"
44

55
description = "Rust implementation of Content extraction via text density paper"
66
license = "MPL-2.0"
@@ -17,10 +17,11 @@ exclude = [
1717
".github/*",
1818
".gitignore",
1919
".tmuxp.yaml",
20+
".amc.toml",
2021
"notes.org"
2122
]
2223

23-
edition = "2021"
24+
edition = "2024"
2425

2526
[profile.release]
2627
opt-level = "z"
@@ -30,16 +31,17 @@ strip = true
3031
panic = "abort"
3132

3233
[dependencies]
33-
# library
3434
ego-tree = "0.10"
35-
scraper = "0.22"
35+
scraper = "0.23"
3636
thiserror = "2"
3737
# binary
3838
clap = { version = "4.5", features = ["derive"], optional = true }
3939
reqwest = { version = "0.12", features = ["blocking"], optional = true }
40-
tempfile = { version = "3.16", optional = true }
40+
tempfile = { version = "3.19", optional = true }
4141
url = { version = "2.5", optional = true }
4242
anyhow = { version = "1.0", optional = true }
43+
unicode-normalization = "0.1.24"
44+
unicode-segmentation = "1.12.0"
4345

4446
[dev-dependencies]
4547
criterion = "0.5"

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,27 @@ Web pages often contain a lot of peripheral content like navigation menus, adver
2424
- Build a density tree representing text distribution in the HTML document
2525
- Calculate composite text density using multiple metrics
2626
- Extract main content blocks based on density patterns
27+
- Unicode Support
2728
- Support for nested HTML structures
2829
- Efficient processing of large documents
2930
- Error handling for malformed HTML
3031

32+
## Unicode Support
33+
34+
DOM Content Extraction includes robust Unicode support for handling multilingual content:
35+
36+
- Proper character counting using Unicode grapheme clusters
37+
- Unicode normalization (NFC) for consistent text representation
38+
- Support for various writing systems including Latin, Cyrillic, and CJK scripts
39+
- Accurate text density calculations across different languages
40+
41+
This ensures accurate content extraction from web pages in any language, with proper handling of:
42+
43+
- Combining characters (like accents in European languages)
44+
- Bidirectional text
45+
- Complex script rendering
46+
- Multi-code-point graphemes (like emojis)
47+
3148
## Usage
3249

3350
Due to the use of "LazyLock", the MSRV is 1.80

examples/basic.rs

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,6 @@
1-
/* use dom_content_extraction::DensityTree;
2-
use scraper::Html;
3-
4-
fn main() -> Result<(), Box<dyn std::error::Error>> {
5-
let html_content = r#"
6-
<!DOCTYPE html>
7-
<html>
8-
<head><title>Test Page</title></head>
9-
<body>
10-
<nav>Menu Item 1 | Menu Item 2</nav>
11-
<div class="sidebar">Side content</div>
12-
<article class="main-content">
13-
This is the main article content.
14-
It has multiple paragraphs and should be extracted.
15-
<p>This is another paragraph with important information.</p>
16-
<a href="\#">Some link</a>
17-
</article>
18-
<footer>Copyright 2024</footer>
19-
</body>
20-
</html>
21-
"#;
22-
23-
let document = Html::parse_document(html_content);
24-
let mut dtree = DensityTree::from_document(&document)?;
25-
dtree.calculate_density_sum()?;
26-
let extracted_content = dtree.extract_content(&document)?;
27-
println!("Extracted content:\n{}", extracted_content);
28-
29-
Ok(())
30-
} */
31-
321
use dom_content_extraction::{get_content, scraper::Html};
332

343
fn main() -> Result<(), Box<dyn std::error::Error>> {
35-
/* let html = r#"
36-
<!DOCTYPE html><html><body>
37-
<nav>Navigation</nav>
38-
<article>
39-
<h1>Main Article</h1>
40-
<p>This is the primary content that should be extracted.</p>
41-
<p>A second paragraph with more content details, and
42-
information that elaborates fdfdsfsdfs fsdfsdfsdfsdfsdf
43-
fsdfsdfs fsdfs fdfs fsdfsdf</p>
44-
</article>
45-
<footer>Footer</footer>
46-
</body></html>
47-
"#;
48-
49-
let html = r#"<!DOCTYPE html><html><body>
50-
<header>
51-
<nav>Home | About | Contact</nav>
52-
</header>
53-
<aside>
54-
<ul>
55-
<li>Sidebar link 1</li>
56-
<li>Sidebar link 2</li>
57-
</ul>
58-
</aside>
59-
<main>
60-
<article>
61-
<h1>Main Article Title</h1>
62-
<p>This is the primary content paragraph that should be extracted. It contains actual meaningful text that would be considered the main content of the page.</p>
63-
<p>A second paragraph with more content details and information that elaborates on the main topic.</p>
64-
<a href="\#">Related link</a>
65-
</article>
66-
</main>
67-
<footer>Copyright 2024</footer>
68-
</body></html>"#; */
69-
704
let html = r#"<!DOCTYPE html><html><body>
715
<nav>Home | About</nav>
726
<main>

examples/ce_score.rs

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
// TODO: the whole thing should be optimized; it is currently far too slow!
12
use anyhow::{Context, Result};
2-
use dom_content_extraction::scraper::Html;
3-
use dom_content_extraction::DensityTree;
3+
use dom_content_extraction::{DensityTree, scraper::Html};
44
use rayon::prelude::*;
55
use regex::Regex;
6-
use std::{fs, path::Path};
6+
use std::{
7+
fs,
8+
path::Path,
9+
time::{Duration, Instant},
10+
};
711

812
fn normalize_text(text: &str) -> String {
913
text.split_whitespace().collect::<Vec<&str>>().join(" ")
@@ -80,7 +84,11 @@ fn calculate_lcs(s1: &str, s2: &str) -> usize {
8084
prev[n]
8185
}
8286

83-
fn process_file_pair(txt_path: &Path, html_path: &Path) -> Result<(f64, f64, f64)> {
87+
fn process_file_pair(
88+
txt_path: &Path,
89+
html_path: &Path,
90+
) -> Result<(f64, f64, f64, Duration)> {
91+
let file_start = Instant::now();
8492
let clean_content = clean_txt_file(txt_path)?;
8593
let clean_content = clean_and_normalize_text(&clean_content);
8694

@@ -95,12 +103,15 @@ fn process_file_pair(txt_path: &Path, html_path: &Path) -> Result<(f64, f64, f64
95103
let recall = lcs_length as f64 / clean_content.len() as f64;
96104
let f1_score = 2.0 * (precision * recall) / (precision + recall);
97105

98-
Ok((precision, recall, f1_score))
106+
let duration = file_start.elapsed();
107+
108+
Ok((precision, recall, f1_score, duration))
99109
}
100110

101111
fn main() -> Result<()> {
102112
let gold_standard_dir = Path::new("data/GoldStandard");
103113
let html_input_dir = Path::new("data/finalrun-input");
114+
let start_time = Instant::now();
104115

105116
let entries: Vec<_> =
106117
fs::read_dir(gold_standard_dir)?.collect::<std::io::Result<Vec<_>>>()?;
@@ -115,7 +126,7 @@ fn main() -> Result<()> {
115126

116127
if html_path.exists() {
117128
match process_file_pair(&path, &html_path) {
118-
Ok((precision, recall, f1))
129+
Ok((precision, recall, f1, duration))
119130
if !precision.is_nan()
120131
&& !recall.is_nan()
121132
&& !f1.is_nan() =>
@@ -124,6 +135,11 @@ fn main() -> Result<()> {
124135
println!(" Precision: {:.2}", precision);
125136
println!(" Recall: {:.2}", recall);
126137
println!(" F1 Score: {:.2}", f1);
138+
println!(" Processing time: {:.2?}", duration);
139+
// If you want to highlight slow files:
140+
if duration > Duration::from_millis(500) {
141+
println!(" ⚠️ SLOW PROCESSING");
142+
}
127143
println!();
128144
Some((precision, recall, f1))
129145
}
@@ -172,5 +188,12 @@ fn main() -> Result<()> {
172188
println!(" Average Recall: {:.2}", avg_recall);
173189
println!(" Average F1 Score: {:.2}", avg_f1);
174190

191+
let total_duration = start_time.elapsed();
192+
println!("Total processing time: {:.2?}", total_duration);
193+
println!(
194+
"Average time per file: {:.2?}",
195+
total_duration / total_results as u32
196+
);
197+
175198
Ok(())
176199
}

notes.org

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22

33
* Unicode documents handling
44
** TODO I think there are problems processing Unicode.
5-
** TODO refactoring to make certain tags processing logic
6-
configurable.
5+
** TODO refactoring to make certain tags processing logic configurable.
76

87
* Microtasks
98
** DONE Better split for CI/CD workflows
@@ -12,7 +11,7 @@ configurable.
1211
*** DONE forbid unwrap in linting rules
1312
** DONE coverage should be >80%
1413
** DONE integrate cargo-tarpaulin or gcov into the github ci pipeline
15-
** TODO cargo publish workflow
14+
** DONE cargo publish workflow
1615
** DONE add clear copy-pastable example into readme.md
1716
** DONE need "examples" command for "lorem ipsum" test page
1817
** DONE need "benchmark"

src/cetd.rs

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use crate::{
2-
get_node_text,
3-
tree::{NodeMetrics, BODY_SELECTOR},
4-
DomExtractionError,
2+
DomExtractionError, get_node_text,
3+
tree::{BODY_SELECTOR, NodeMetrics},
54
};
65
use ego_tree::{NodeId, NodeRef, Tree};
76
use scraper::Html;
@@ -203,7 +202,10 @@ impl<'a> DensityTree {
203202
// Process current node
204203
match node.value() {
205204
scraper::Node::Text(text) => {
206-
let char_count = text.trim().len() as u32;
205+
// let char_count = text.trim().len() as u32;
206+
// density_node.value().metrics.char_count += char_count;
207+
// NOTE: adding unicode support
208+
let char_count = crate::unicode::count_graphemes(text.trim());
207209
density_node.value().metrics.char_count += char_count;
208210
}
209211
scraper::Node::Element(elem) => {
@@ -355,7 +357,8 @@ impl<'a> DensityTree {
355357
seen_text.insert(node_text);
356358
}
357359
}
358-
Ok(content.trim().to_string())
360+
// Ok(content.trim().to_string())
361+
Ok(crate::unicode::normalize_text(&content))
359362
} else {
360363
Ok(String::new())
361364
}
@@ -442,7 +445,13 @@ mod tests {
442445
let dtree = DensityTree::from_document(&document).unwrap();
443446
let sorted_nodes = dtree.sorted_nodes();
444447
let node_id = sorted_nodes.last().unwrap().node_id;
445-
assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
448+
assert_eq!(
449+
crate::unicode::count_graphemes(
450+
&get_node_text(node_id, &document).unwrap()
451+
),
452+
186
453+
);
454+
// assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
446455
}
447456

448457
#[test]
@@ -535,10 +544,12 @@ mod tests {
535544
}
536545

537546
// Verify that at least one node has the maximum density_sum
538-
assert!(dtree
539-
.tree
540-
.values()
541-
.any(|node| node.density_sum.unwrap() == max_density_sum));
547+
assert!(
548+
dtree
549+
.tree
550+
.values()
551+
.any(|node| node.density_sum.unwrap() == max_density_sum)
552+
);
542553
}
543554

544555
#[test]

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ use ego_tree::NodeId;
118118

119119
pub mod cetd;
120120
pub mod tree;
121+
pub mod unicode;
121122
pub mod utils;
122123
pub use cetd::{DensityNode, DensityTree};
123124
pub use utils::{get_node_links, get_node_text};

src/tree.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ impl TreeBuilder for HtmlTreeBuilder<'_> {
4444

4545
match node.value() {
4646
scraper::Node::Text(text) => {
47-
metrics.char_count = text.trim().len() as u32;
47+
// NOTE: old method calculation
48+
// metrics.char_count = text.trim().len() as u32;
49+
metrics.char_count = crate::unicode::count_graphemes(text.trim());
4850
}
4951
scraper::Node::Element(elem) => {
5052
metrics.tag_count = 1;

0 commit comments

Comments
 (0)