Skip to content

Commit 0543aa4

Browse files
authored
Merge pull request #41 from oiwn/dev
conversion to markdown using htmd crate
2 parents 92d3162 + dfb485d commit 0543aa4

File tree

6 files changed

+159
-4
lines changed

6 files changed

+159
-4
lines changed

.deny.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@ allow = [
55
"Unicode-3.0",
66
"MPL-2.0",
77
"ISC",
8-
"BSD-3-Clause"
8+
"BSD-3-Clause",
9+
"Zlib"
910
]
1011

1112
[advisories]

CLAUDE.md

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,4 +79,46 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file
7979
## Features
8080

8181
- Default features include CLI functionality (`cli` feature)
82-
- Library can be used without CLI dependencies by disabling default features
82+
- Library can be used without CLI dependencies by disabling default features
83+
- Optional `markdown` feature for structured markdown extraction using density analysis
84+
85+
## Current Task: Markdown Extraction Implementation
86+
87+
**Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown.
88+
89+
**Approach**:
90+
- Create completely separate `src/markdown.rs` module (do not modify CETD algorithm)
91+
- Use existing density analysis to identify high-density content nodes
92+
- Extract HTML subtrees for those nodes using their NodeIDs
93+
- Convert HTML to markdown using `htmd` library
94+
- Add as optional `markdown` feature flag
95+
96+
**Implementation Steps**:
97+
1. ✅ Add `htmd` dependency with `markdown` feature flag to Cargo.toml
98+
2. ✅ Create `src/markdown.rs` with main API: `extract_content_as_markdown()`
99+
3. ✅ Add markdown module to `src/lib.rs` with feature gating
100+
4. ✅ Mirror logic from `DensityTree::extract_content()` but collect NodeIDs instead of text
101+
5. ✅ Implement HTML container extraction using scraper's NodeID→HTML mapping
102+
6. ✅ Integrate `htmd` for HTML→Markdown conversion
103+
7. ✅ Add error handling and basic tests
104+
105+
**Current Status**: ✅ Implementation complete and working
106+
107+
**Resolution**:
108+
- Simplified approach: Use `get_max_density_sum_node()` to find highest density content
109+
- Handle text nodes by walking up the tree to find parent elements
110+
- Extract HTML using `ElementRef::inner_html()` method
111+
- Convert to markdown using `htmd::HtmlToMarkdown` with script/style tags skipped
112+
- Proper error handling following existing patterns
113+
114+
**Key Implementation Details**:
115+
- Uses `ElementRef::wrap()` to convert scraper nodes to elements
116+
- Walks up parent tree when max density node is text (whitespace)
117+
- Returns empty string when no content found (consistent with existing behavior)
118+
- Trims markdown output for clean results
119+
120+
**Test Results**:
121+
- ✅ Test `test_extract_content_as_markdown` passes
122+
- ✅ All existing tests continue to pass
123+
- ✅ Generated markdown includes proper formatting (headers, paragraphs)
124+
- ✅ Works with both markdown feature enabled and disabled

Cargo.toml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ url = { version = "2.5", optional = true }
4242
anyhow = { version = "1", optional = true }
4343
unicode-normalization = "0.1"
4444
unicode-segmentation = "1.12"
45+
htmd = { version = "0.3", optional = true }
4546

4647
[dev-dependencies]
4748
criterion = "0.7"
@@ -63,7 +64,8 @@ path = "src/main.rs"
6364
required-features = ["cli"]
6465

6566
[features]
66-
default = ["cli"]
67+
default = ["cli", "markdown"]
68+
markdown = ["dep:htmd"]
6769
cli = [
6870
"dep:clap",
6971
"dep:reqwest",

src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,9 +120,15 @@ pub mod cetd;
120120
pub mod tree;
121121
pub mod unicode;
122122
pub mod utils;
123+
124+
#[cfg(feature = "markdown")]
125+
pub mod markdown;
123126
pub use cetd::{DensityNode, DensityTree};
124127
pub use utils::{get_node_links, get_node_text};
125128

129+
#[cfg(feature = "markdown")]
130+
pub use markdown::extract_content_as_markdown;
131+
126132
// Re-export
127133
pub use scraper;
128134

src/markdown.rs

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
use crate::{DensityTree, DomExtractionError};
2+
use scraper::{ElementRef, Html};
3+
4+
/// Extracts the main content from an HTML document as markdown using CETD analysis.
5+
///
6+
/// This function identifies the highest density content node using the CETD algorithm
7+
/// and converts its HTML content to markdown format.
8+
///
9+
/// # Arguments
10+
/// * `dtree` - A DensityTree that has been built and analyzed
11+
/// * `document` - The original HTML document for node reference
12+
///
13+
/// # Returns
14+
/// A Result containing the extracted markdown content or an error
15+
#[cfg(feature = "markdown")]
16+
pub fn extract_content_as_markdown(
17+
dtree: &DensityTree,
18+
document: &Html,
19+
) -> Result<String, DomExtractionError> {
20+
// Get the node with maximum density sum
21+
let max_node = match dtree.get_max_density_sum_node() {
22+
Some(node) => node,
23+
None => return Ok(String::new()), // No content found
24+
};
25+
26+
// Get the NodeId from the density node
27+
let node_id = max_node.value().node_id;
28+
29+
// Get the scraper node from the document
30+
let scraper_node = document
31+
.tree
32+
.get(node_id)
33+
.ok_or(DomExtractionError::NodeAccessError(node_id))?;
34+
35+
// Find the nearest parent element that can be wrapped as ElementRef
36+
let mut current_node = scraper_node;
37+
let element_ref = loop {
38+
if let Some(element) = ElementRef::wrap(current_node) {
39+
break element;
40+
}
41+
42+
// Move to parent if current node is not an element
43+
if let Some(parent) = current_node.parent() {
44+
current_node = parent;
45+
} else {
46+
return Err(DomExtractionError::NodeAccessError(node_id));
47+
}
48+
};
49+
50+
// Extract the HTML content
51+
let html_content = element_ref.inner_html();
52+
53+
// Convert HTML to markdown using htmd
54+
let converter = htmd::HtmlToMarkdown::builder()
55+
.skip_tags(vec!["script", "style"])
56+
.build();
57+
58+
converter
59+
.convert(&html_content)
60+
.map_err(|_| DomExtractionError::NodeAccessError(node_id))
61+
.map(|md| md.trim().to_string())
62+
}
63+
64+
#[cfg(test)]
65+
mod tests {
66+
use super::*;
67+
use crate::DensityTree;
68+
69+
#[test]
70+
#[cfg(feature = "markdown")]
71+
fn test_extract_content_as_markdown() {
72+
let html = r#"
73+
<html>
74+
<body>
75+
<div class="header">Navigation</div>
76+
<article>
77+
<h1>Main Article</h1>
78+
<p>This is the main content with lots of text that should have high density.</p>
79+
<p>Another paragraph with substantial content for density analysis.</p>
80+
</article>
81+
<div class="sidebar">Sidebar content</div>
82+
</body>
83+
</html>
84+
"#;
85+
86+
let document = Html::parse_document(html);
87+
let mut dtree = DensityTree::from_document(&document).unwrap();
88+
dtree.calculate_density_sum().unwrap();
89+
90+
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
91+
92+
// Debug: print what we actually got
93+
println!("Generated markdown: '{}'", markdown);
94+
95+
// Should contain the main content
96+
assert!(!markdown.is_empty(), "Markdown should not be empty");
97+
// Relaxed assertions for debugging
98+
assert!(markdown.contains("Main Article"));
99+
assert!(markdown.contains("main content"));
100+
}
101+
}

src/utils.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,10 @@ pub fn get_node_text(
5252
}
5353

5454
/// Recursively collect text from nodes while filtering out script/style content
55-
fn collect_text_filtered(node: &ego_tree::NodeRef<'_, scraper::node::Node>, text_fragments: &mut Vec<String>) {
55+
fn collect_text_filtered(
56+
node: &ego_tree::NodeRef<'_, scraper::node::Node>,
57+
text_fragments: &mut Vec<String>,
58+
) {
5659
match node.value() {
5760
scraper::Node::Text(txt) => {
5861
let clean_text = txt.trim();

0 commit comments

Comments
 (0)