Skip to content

Commit 8b1c234

Browse files
authored
Merge pull request #33 from oiwn/dev
improve ce scoring, fixed unicode error and reduce processing time.
2 parents 5cc523f + 12413d8 commit 8b1c234

File tree

6 files changed

+62
-12
lines changed

6 files changed

+62
-12
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@
88
/tmp
99
/data
1010
*.profraw
11-
all_code.txt
11+
dom_content_extracton.txt
1212
.code
1313
.amc.toml

.tmuxp.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@ windows:
1111
start-directory: ./
1212
panes:
1313
- shell_command:
14-
- exa
14+
- eza
1515
- window_name: srv
1616
start-directory: ./
1717
panes:
1818
- shell_command:
19-
- exa --long
19+
- eza --long
2020
- window_name: notes
2121
panes:
2222
- shell_command:

Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "dom-content-extraction"
3-
version = "0.3.10"
3+
version = "0.3.11"
44

55
description = "Rust implementation of Content extraction via text density paper"
66
license = "MPL-2.0"
@@ -40,8 +40,8 @@ reqwest = { version = "0.12", features = ["blocking"], optional = true }
4040
tempfile = { version = "3.19", optional = true }
4141
url = { version = "2.5", optional = true }
4242
anyhow = { version = "1.0", optional = true }
43-
unicode-normalization = "0.1.24"
44-
unicode-segmentation = "1.12.0"
43+
unicode-normalization = "0.1"
44+
unicode-segmentation = "1.12"
4545

4646
[dev-dependencies]
4747
criterion = "0.5"

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,9 @@ via Text Density (CETD) algorithm described in the paper by
1212

1313
## What Problem Does This Solve?
1414

15-
Web pages often contain a lot of peripheral content like navigation menus, advertisements, footers, and sidebars. This makes it challenging to extract just the main content programmatically. This library helps solve this problem by:
15+
Web pages often contain a lot of peripheral content like navigation menus,
16+
advertisements, footers, and sidebars. This makes it challenging to extract just
17+
the main content programmatically. This library helps solve this problem by:
1618

1719
- Analyzing the text density patterns in HTML documents
1820
- Identifying content-rich sections versus navigational/peripheral elements
@@ -47,7 +49,7 @@ This ensures accurate content extraction from web pages in any language, with pr
4749

4850
## Usage
4951

50-
Due to "LazyLock" MSRV is 1.80
52+
MSRV is 1.85 due to 2024 edition. Living on the edge!
5153

5254
Basic usage example:
5355

examples/ce_score.rs

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,11 @@ fn clean_and_normalize_text(text: &str) -> String {
3030
}
3131

3232
fn extract_content_from_html(file_path: &Path) -> Result<String> {
33-
let content = fs::read_to_string(file_path)
33+
// let content = fs::read_to_string(file_path)
34+
// .with_context(|| format!("Failed to read file: {:?}", file_path))?;
35+
let content = fs::read(file_path)
3436
.with_context(|| format!("Failed to read file: {:?}", file_path))?;
37+
let content = String::from_utf8_lossy(&content).into_owned();
3538

3639
let document = Html::parse_document(&content);
3740
let mut dtree = DensityTree::from_document(&document).unwrap();
@@ -42,8 +45,11 @@ fn extract_content_from_html(file_path: &Path) -> Result<String> {
4245
}
4346

4447
fn clean_txt_file(file_path: &Path) -> Result<String> {
45-
let content = fs::read_to_string(file_path)
48+
// let content = fs::read_to_string(file_path)
49+
// .with_context(|| format!("Failed to read file: {:?}", file_path))?;
50+
let content = fs::read(file_path)
4651
.with_context(|| format!("Failed to read file: {:?}", file_path))?;
52+
let content = String::from_utf8_lossy(&content).into_owned();
4753

4854
// Remove URL line from the top
4955
let content = content.lines().skip(1).collect::<Vec<&str>>().join("\n");
@@ -64,6 +70,48 @@ fn clean_txt_file(file_path: &Path) -> Result<String> {
6470
}
6571

6672
/// Estimates the size, in bytes, of the longest common subsequence (LCS) of
/// `s1` and `s2`.
///
/// Both inputs are split on whitespace and the LCS is computed over whole
/// words rather than characters, which keeps the dynamic-programming table
/// at `words(s1) x words(s2)` cells. The number of matched words is then
/// converted to an approximate byte count using the mean word length
/// (`str::len`, i.e. UTF-8 bytes) of the two inputs, plus one separator
/// between each pair of adjacent matched words.
///
/// The estimate is clamped so it never exceeds the whitespace-normalized
/// length of either input: a common subsequence cannot be longer than the
/// texts it was taken from.
///
/// Returns `0` when the inputs share no words (including when either input
/// is empty or contains only whitespace).
fn calculate_lcs(s1: &str, s2: &str) -> usize {
    let words1: Vec<&str> = s1.split_whitespace().collect();
    let words2: Vec<&str> = s2.split_whitespace().collect();

    // Two-row LCS table: `prev` is the finished row for the words of `s1`
    // seen so far, `curr` is the row being filled in. Column 0 is never
    // written, so it stays 0 in both rows across swaps.
    let cols = words2.len();
    let mut prev = vec![0usize; cols + 1];
    let mut curr = vec![0usize; cols + 1];
    for w1 in &words1 {
        for (j, w2) in words2.iter().enumerate() {
            curr[j + 1] = if w1 == w2 {
                prev[j] + 1
            } else {
                curr[j].max(prev[j + 1])
            };
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    let lcs_words = prev[cols];
    if lcs_words == 0 {
        return 0;
    }

    // At least one word matched, so both word lists are non-empty and the
    // divisions below are well defined.
    let total_len1: usize = words1.iter().map(|w| w.len()).sum();
    let total_len2: usize = words2.iter().map(|w| w.len()).sum();
    let avg_word_len1 = total_len1 as f64 / words1.len() as f64;
    let avg_word_len2 = total_len2 as f64 / words2.len() as f64;
    let avg_word_len = (avg_word_len1 + avg_word_len2) / 2.0;

    // N matched words are joined by N - 1 separators, not N.
    let separators = lcs_words - 1;
    let estimate = (lcs_words as f64 * avg_word_len) as usize + separators;

    // Length of each input once runs of whitespace are collapsed to a
    // single separator; the estimate must not exceed either of them.
    let normalized_len1 = total_len1 + words1.len() - 1;
    let normalized_len2 = total_len2 + words2.len() - 1;
    estimate.min(normalized_len1).min(normalized_len2)
}
113+
114+
/* fn calculate_lcs(s1: &str, s2: &str) -> usize {
67115
let s1: Vec<char> = s1.chars().collect();
68116
let s2: Vec<char> = s2.chars().collect();
69117
let (m, n) = (s1.len(), s2.len());
@@ -82,7 +130,7 @@ fn calculate_lcs(s1: &str, s2: &str) -> usize {
82130
}
83131
84132
prev[n]
85-
}
133+
} */
86134

87135
fn process_file_pair(
88136
txt_path: &Path,

notes.org

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
* Unicode documents handling
44
** TODO I think there are problems processing unicode.
5-
** TODO refactoring to make certain tags processing logic configurable.
5+
** DONE refactoring to make certain tags processing logic configurable.
66

77
* Microtasks
88
** DONE Better split for CI/CD workflows

0 commit comments

Comments
 (0)