Skip to content

Commit 5cc523f

Browse files
authored
Merge pull request #32 from oiwn/dev
add unicode support
2 parents 3ec387a + f60d6ea commit 5cc523f

File tree

11 files changed

+295
-95
lines changed

11 files changed

+295
-95
lines changed

.github/workflows/ci.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ jobs:
7676
fail-fast: false
7777
matrix:
7878
os: [ubuntu-latest, windows-latest, macos-latest]
79-
toolchain: ["1.80", "stable"]
79+
toolchain: ["1.85", "stable"]
8080
runs-on: ${{ matrix.os }}
8181
steps:
8282
- uses: actions/checkout@v4

Cargo.toml

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "dom-content-extraction"
3-
version = "0.3.9"
3+
version = "0.3.10"
44

55
description = "Rust implementation of Content extraction via text density paper"
66
license = "MPL-2.0"
@@ -17,10 +17,11 @@ exclude = [
1717
".github/*",
1818
".gitignore",
1919
".tmuxp.yaml",
20+
".amc.toml",
2021
"notes.org"
2122
]
2223

23-
edition = "2021"
24+
edition = "2024"
2425

2526
[profile.release]
2627
opt-level = "z"
@@ -30,16 +31,17 @@ strip = true
3031
panic = "abort"
3132

3233
[dependencies]
33-
# library
3434
ego-tree = "0.10"
35-
scraper = "0.22"
35+
scraper = "0.23"
3636
thiserror = "2"
3737
# binary
3838
clap = { version = "4.5", features = ["derive"], optional = true }
3939
reqwest = { version = "0.12", features = ["blocking"], optional = true }
40-
tempfile = { version = "3.16", optional = true }
40+
tempfile = { version = "3.19", optional = true }
4141
url = { version = "2.5", optional = true }
4242
anyhow = { version = "1.0", optional = true }
43+
unicode-normalization = "0.1.24"
44+
unicode-segmentation = "1.12.0"
4345

4446
[dev-dependencies]
4547
criterion = "0.5"

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,27 @@ Web pages often contain a lot of peripheral content like navigation menus, adver
2424
- Build a density tree representing text distribution in the HTML document
2525
- Calculate composite text density using multiple metrics
2626
- Extract main content blocks based on density patterns
27+
- Unicode Support
2728
- Support for nested HTML structures
2829
- Efficient processing of large documents
2930
- Error handling for malformed HTML
3031

32+
## Unicode Support
33+
34+
DOM Content Extraction includes robust Unicode support for handling multilingual content:
35+
36+
- Proper character counting using Unicode grapheme clusters
37+
- Unicode normalization (NFC) for consistent text representation
38+
- Support for various writing systems including Latin, Cyrillic, and CJK scripts
39+
- Accurate text density calculations across different languages
40+
41+
This ensures accurate content extraction from web pages in any language, with proper handling of:
42+
43+
- Combining characters (like accents in European languages)
44+
- Bidirectional text
45+
- Complex script rendering
46+
- Multi-code-point graphemes (like emojis)
47+
3148
## Usage
3249

3350
Due to the use of "LazyLock", the MSRV is 1.80

examples/basic.rs

Lines changed: 0 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,6 @@
1-
/* use dom_content_extraction::DensityTree;
2-
use scraper::Html;
3-
4-
fn main() -> Result<(), Box<dyn std::error::Error>> {
5-
let html_content = r#"
6-
<!DOCTYPE html>
7-
<html>
8-
<head><title>Test Page</title></head>
9-
<body>
10-
<nav>Menu Item 1 | Menu Item 2</nav>
11-
<div class="sidebar">Side content</div>
12-
<article class="main-content">
13-
This is the main article content.
14-
It has multiple paragraphs and should be extracted.
15-
<p>This is another paragraph with important information.</p>
16-
<a href="\#">Some link</a>
17-
</article>
18-
<footer>Copyright 2024</footer>
19-
</body>
20-
</html>
21-
"#;
22-
23-
let document = Html::parse_document(html_content);
24-
let mut dtree = DensityTree::from_document(&document)?;
25-
dtree.calculate_density_sum()?;
26-
let extracted_content = dtree.extract_content(&document)?;
27-
println!("Extracted content:\n{}", extracted_content);
28-
29-
Ok(())
30-
} */
31-
321
use dom_content_extraction::{get_content, scraper::Html};
332

343
fn main() -> Result<(), Box<dyn std::error::Error>> {
35-
/* let html = r#"
36-
<!DOCTYPE html><html><body>
37-
<nav>Navigation</nav>
38-
<article>
39-
<h1>Main Article</h1>
40-
<p>This is the primary content that should be extracted.</p>
41-
<p>A second paragraph with more content details, and
42-
information that elaborates fdfdsfsdfs fsdfsdfsdfsdfsdf
43-
fsdfsdfs fsdfs fdfs fsdfsdf</p>
44-
</article>
45-
<footer>Footer</footer>
46-
</body></html>
47-
"#;
48-
49-
let html = r#"<!DOCTYPE html><html><body>
50-
<header>
51-
<nav>Home | About | Contact</nav>
52-
</header>
53-
<aside>
54-
<ul>
55-
<li>Sidebar link 1</li>
56-
<li>Sidebar link 2</li>
57-
</ul>
58-
</aside>
59-
<main>
60-
<article>
61-
<h1>Main Article Title</h1>
62-
<p>This is the primary content paragraph that should be extracted. It contains actual meaningful text that would be considered the main content of the page.</p>
63-
<p>A second paragraph with more content details and information that elaborates on the main topic.</p>
64-
<a href="\#">Related link</a>
65-
</article>
66-
</main>
67-
<footer>Copyright 2024</footer>
68-
</body></html>"#; */
69-
704
let html = r#"<!DOCTYPE html><html><body>
715
<nav>Home | About</nav>
726
<main>

examples/ce_score.rs

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
// TODO: the whole thing should be optimized; it is currently far too slow!
12
use anyhow::{Context, Result};
2-
use dom_content_extraction::scraper::Html;
3-
use dom_content_extraction::DensityTree;
3+
use dom_content_extraction::{DensityTree, scraper::Html};
44
use rayon::prelude::*;
55
use regex::Regex;
6-
use std::{fs, path::Path};
6+
use std::{
7+
fs,
8+
path::Path,
9+
time::{Duration, Instant},
10+
};
711

812
fn normalize_text(text: &str) -> String {
913
text.split_whitespace().collect::<Vec<&str>>().join(" ")
@@ -80,7 +84,11 @@ fn calculate_lcs(s1: &str, s2: &str) -> usize {
8084
prev[n]
8185
}
8286

83-
fn process_file_pair(txt_path: &Path, html_path: &Path) -> Result<(f64, f64, f64)> {
87+
fn process_file_pair(
88+
txt_path: &Path,
89+
html_path: &Path,
90+
) -> Result<(f64, f64, f64, Duration)> {
91+
let file_start = Instant::now();
8492
let clean_content = clean_txt_file(txt_path)?;
8593
let clean_content = clean_and_normalize_text(&clean_content);
8694

@@ -95,12 +103,15 @@ fn process_file_pair(txt_path: &Path, html_path: &Path) -> Result<(f64, f64, f64
95103
let recall = lcs_length as f64 / clean_content.len() as f64;
96104
let f1_score = 2.0 * (precision * recall) / (precision + recall);
97105

98-
Ok((precision, recall, f1_score))
106+
let duration = file_start.elapsed();
107+
108+
Ok((precision, recall, f1_score, duration))
99109
}
100110

101111
fn main() -> Result<()> {
102112
let gold_standard_dir = Path::new("data/GoldStandard");
103113
let html_input_dir = Path::new("data/finalrun-input");
114+
let start_time = Instant::now();
104115

105116
let entries: Vec<_> =
106117
fs::read_dir(gold_standard_dir)?.collect::<std::io::Result<Vec<_>>>()?;
@@ -115,7 +126,7 @@ fn main() -> Result<()> {
115126

116127
if html_path.exists() {
117128
match process_file_pair(&path, &html_path) {
118-
Ok((precision, recall, f1))
129+
Ok((precision, recall, f1, duration))
119130
if !precision.is_nan()
120131
&& !recall.is_nan()
121132
&& !f1.is_nan() =>
@@ -124,6 +135,11 @@ fn main() -> Result<()> {
124135
println!(" Precision: {:.2}", precision);
125136
println!(" Recall: {:.2}", recall);
126137
println!(" F1 Score: {:.2}", f1);
138+
println!(" Processing time: {:.2?}", duration);
139+
// If you want to highlight slow files:
140+
if duration > Duration::from_millis(500) {
141+
println!(" ⚠️ SLOW PROCESSING");
142+
}
127143
println!();
128144
Some((precision, recall, f1))
129145
}
@@ -172,5 +188,12 @@ fn main() -> Result<()> {
172188
println!(" Average Recall: {:.2}", avg_recall);
173189
println!(" Average F1 Score: {:.2}", avg_f1);
174190

191+
let total_duration = start_time.elapsed();
192+
println!("Total processing time: {:.2?}", total_duration);
193+
println!(
194+
"Average time per file: {:.2?}",
195+
total_duration / total_results as u32
196+
);
197+
175198
Ok(())
176199
}

notes.org

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,7 @@
22

33
* Unicode documents handling
44
** TODO I think there are problems processing Unicode.
5-
** TODO refactoring to make certain tags processing logic
6-
configurable.
5+
** TODO refactoring to make certain tags processing logic configurable.
76

87
* Microtasks
98
** DONE Better split for CI/CD workflows
@@ -12,7 +11,7 @@ configurable.
1211
*** DONE forbid unwrap in linting rules
1312
** DONE coverage should be >80%
1413
** DONE integrate cargo-tarpaulin or gcov into the github ci pipeline
15-
** TODO cargo publish workflow
14+
** DONE cargo publish workflow
1615
** DONE add clear copy-pastable example into readme.md
1716
** DONE need "examples" command for "lorem ipsum" test page
1817
** DONE need "benchmark"

src/cetd.rs

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
use crate::{
2-
get_node_text,
3-
tree::{NodeMetrics, BODY_SELECTOR},
4-
DomExtractionError,
2+
DomExtractionError, get_node_text,
3+
tree::{BODY_SELECTOR, NodeMetrics},
54
};
65
use ego_tree::{NodeId, NodeRef, Tree};
76
use scraper::Html;
@@ -203,7 +202,10 @@ impl<'a> DensityTree {
203202
// Process current node
204203
match node.value() {
205204
scraper::Node::Text(text) => {
206-
let char_count = text.trim().len() as u32;
205+
// let char_count = text.trim().len() as u32;
206+
// density_node.value().metrics.char_count += char_count;
207+
// NOTE: adding unicode support
208+
let char_count = crate::unicode::count_graphemes(text.trim());
207209
density_node.value().metrics.char_count += char_count;
208210
}
209211
scraper::Node::Element(elem) => {
@@ -355,7 +357,8 @@ impl<'a> DensityTree {
355357
seen_text.insert(node_text);
356358
}
357359
}
358-
Ok(content.trim().to_string())
360+
// Ok(content.trim().to_string())
361+
Ok(crate::unicode::normalize_text(&content))
359362
} else {
360363
Ok(String::new())
361364
}
@@ -442,7 +445,13 @@ mod tests {
442445
let dtree = DensityTree::from_document(&document).unwrap();
443446
let sorted_nodes = dtree.sorted_nodes();
444447
let node_id = sorted_nodes.last().unwrap().node_id;
445-
assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
448+
assert_eq!(
449+
crate::unicode::count_graphemes(
450+
&get_node_text(node_id, &document).unwrap()
451+
),
452+
186
453+
);
454+
// assert_eq!(get_node_text(node_id, &document).unwrap().len(), 200);
446455
}
447456

448457
#[test]
@@ -535,10 +544,12 @@ mod tests {
535544
}
536545

537546
// Verify that at least one node has the maximum density_sum
538-
assert!(dtree
539-
.tree
540-
.values()
541-
.any(|node| node.density_sum.unwrap() == max_density_sum));
547+
assert!(
548+
dtree
549+
.tree
550+
.values()
551+
.any(|node| node.density_sum.unwrap() == max_density_sum)
552+
);
542553
}
543554

544555
#[test]

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,7 @@ use ego_tree::NodeId;
118118

119119
pub mod cetd;
120120
pub mod tree;
121+
pub mod unicode;
121122
pub mod utils;
122123
pub use cetd::{DensityNode, DensityTree};
123124
pub use utils::{get_node_links, get_node_text};

src/tree.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ impl TreeBuilder for HtmlTreeBuilder<'_> {
4444

4545
match node.value() {
4646
scraper::Node::Text(text) => {
47-
metrics.char_count = text.trim().len() as u32;
47+
// NOTE: old method calculation
48+
// metrics.char_count = text.trim().len() as u32;
49+
metrics.char_count = crate::unicode::count_graphemes(text.trim());
4850
}
4951
scraper::Node::Element(elem) => {
5052
metrics.tag_count = 1;

0 commit comments

Comments
 (0)