Skip to content

Commit 20eb6cb

Browse files
authored
Merge pull request #45 from oiwn/dev
Trying to render markdown
2 parents 0543aa4 + 6e621ed commit 20eb6cb

File tree

6 files changed

+160
-22
lines changed

6 files changed

+160
-22
lines changed

.llvm-cov

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[llvm-cov]
2+
ignore-filename-regex = ["src/main.rs"]

.tarpaulin.toml

Lines changed: 0 additions & 14 deletions
This file was deleted.

CLAUDE.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file
8282
- Library can be used without CLI dependencies by disabling default features
8383
- Optional `markdown` feature for structured markdown extraction using density analysis
8484

85-
## Current Task: Markdown Extraction Implementation
85+
## Markdown Extraction Implementation
8686

8787
**Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown.
8888

examples/check.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
use clap::{Parser, Subcommand};
2+
#[cfg(feature = "markdown")]
3+
use dom_content_extraction::extract_content_as_markdown;
24
use dom_content_extraction::{
35
DensityTree, get_content, get_node_text, scraper::Html,
46
};
@@ -16,6 +18,8 @@ enum Commands {
1618
LoremIpsum,
1719
Test4,
1820
TestToy,
21+
#[cfg(feature = "markdown")]
22+
LoremIpsumMarkdown,
1923
}
2024

2125
fn main() {
@@ -31,6 +35,10 @@ fn main() {
3135
Commands::TestToy => {
3236
process_toy();
3337
}
38+
#[cfg(feature = "markdown")]
39+
Commands::LoremIpsumMarkdown => {
40+
process_lorem_ipsum_markdown();
41+
}
3442
}
3543
}
3644

@@ -80,3 +88,16 @@ fn process_toy() {
8088
let content = get_content(&document).unwrap();
8189
println!("{}", content);
8290
}
91+
92+
#[cfg(feature = "markdown")]
93+
fn process_lorem_ipsum_markdown() {
94+
println!("Processing Lorem Ipsum example as Markdown...");
95+
let html_content =
96+
fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
97+
let document = Html::parse_document(&html_content);
98+
let mut dtree = DensityTree::from_document(&document).unwrap();
99+
dtree.calculate_density_sum().unwrap();
100+
101+
let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap();
102+
println!("Extracted markdown content:\n{}", markdown_content);
103+
}

src/main.rs

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,10 @@ struct Cli {
2121
/// Output file (stdout if not specified)
2222
#[arg(short, long)]
2323
output: Option<PathBuf>,
24+
25+
/// Output format (text or markdown)
26+
#[arg(long, default_value = "text", value_parser = ["text", "markdown"])]
27+
format: String,
2428
}
2529

2630
fn parse_url(s: &str) -> Result<Url, String> {
@@ -40,9 +44,36 @@ fn fetch_url(url: &Url) -> Result<String> {
4044
.and_then(|r| r.text())?)
4145
}
4246

43-
fn process_html(html: &str) -> Result<String> {
47+
fn process_html(html: &str, format: &str) -> Result<String> {
4448
let document = Html::parse_document(html);
45-
get_content(&document).context("Failed to extract content")
49+
50+
match format {
51+
"text" => get_content(&document).context("Failed to extract content"),
52+
"markdown" => {
53+
#[cfg(not(feature = "markdown"))]
54+
{
55+
anyhow::bail!(
56+
"Markdown output requires the 'markdown' feature to be enabled"
57+
);
58+
}
59+
60+
#[cfg(feature = "markdown")]
61+
{
62+
use dom_content_extraction::{
63+
DensityTree, extract_content_as_markdown,
64+
};
65+
let mut dtree = DensityTree::from_document(&document)
66+
.context("Failed to create density tree")?;
67+
dtree
68+
.calculate_density_sum()
69+
.context("Failed to calculate density sums")?;
70+
extract_content_as_markdown(&dtree, &document)
71+
.map_err(|e| anyhow::anyhow!(e))
72+
.context("Failed to extract content as markdown")
73+
}
74+
}
75+
_ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format),
76+
}
4677
}
4778

4879
fn write_output(content: &str, output_path: Option<PathBuf>) -> Result<()> {
@@ -81,7 +112,7 @@ fn main() -> Result<()> {
81112
};
82113

83114
// Process HTML and extract content
84-
let extracted_content = process_html(&html_content)?;
115+
let extracted_content = process_html(&html_content, &cli.format)?;
85116

86117
// Write output
87118
write_output(&extracted_content, cli.output)?;

src/markdown.rs

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ pub fn extract_content_as_markdown(
6565
mod tests {
6666
use super::*;
6767
use crate::DensityTree;
68+
use std::fs;
6869

6970
#[test]
7071
#[cfg(feature = "markdown")]
@@ -89,13 +90,110 @@ mod tests {
8990

9091
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
9192

92-
// Debug: print what we actually got
93-
println!("Generated markdown: '{}'", markdown);
94-
9593
// Should contain the main content
9694
assert!(!markdown.is_empty(), "Markdown should not be empty");
97-
// Relaxed assertions for debugging
9895
assert!(markdown.contains("Main Article"));
9996
assert!(markdown.contains("main content"));
10097
}
98+
99+
#[test]
100+
#[cfg(feature = "markdown")]
101+
fn test_extract_from_test1_html() {
102+
let html_content = fs::read_to_string("html/test_1.html")
103+
.expect("Unable to read test_1.html");
104+
let document = Html::parse_document(&html_content);
105+
let mut dtree = DensityTree::from_document(&document).unwrap();
106+
dtree.calculate_density_sum().unwrap();
107+
108+
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
109+
110+
// Debug: print what we actually got
111+
println!("test1 markdown: '{}'", markdown);
112+
113+
// Should extract article body content (highest density)
114+
assert!(!markdown.is_empty(), "Markdown should not be empty");
115+
// Check for content that should be present in article body
116+
assert!(markdown.contains("Here is text"));
117+
assert!(markdown.contains("Paragraph text"));
118+
assert!(markdown.contains("huge paragraph"));
119+
// Should not contain footer navigation
120+
assert!(!markdown.contains("Menu"));
121+
assert!(!markdown.contains("link1"));
122+
}
123+
124+
#[test]
125+
#[cfg(feature = "markdown")]
126+
fn test_extract_from_test2_html() {
127+
let html_content = fs::read_to_string("html/test_2.html")
128+
.expect("Unable to read test_2.html");
129+
let document = Html::parse_document(&html_content);
130+
let mut dtree = DensityTree::from_document(&document).unwrap();
131+
dtree.calculate_density_sum().unwrap();
132+
133+
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
134+
135+
// Debug: print what we actually got
136+
println!("test2 markdown: '{}'", markdown);
137+
138+
// Should extract article body content (highest density)
139+
assert!(!markdown.is_empty(), "Markdown should not be empty");
140+
// Check for content that should be present in article body
141+
assert!(markdown.contains("Here is text"));
142+
assert!(markdown.contains("long paragraph"));
143+
// Links should be converted to markdown format
144+
assert!(markdown.contains("wikipedia"));
145+
}
146+
147+
#[test]
148+
#[cfg(feature = "markdown")]
149+
fn test_extract_from_test4_html() {
150+
let html_content = fs::read_to_string("html/test_4.html")
151+
.expect("Unable to read test_4.html");
152+
let document = Html::parse_document(&html_content);
153+
let mut dtree = DensityTree::from_document(&document).unwrap();
154+
dtree.calculate_density_sum().unwrap();
155+
156+
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
157+
158+
// Debug: print what we actually got
159+
println!("test4 markdown: '{}'", markdown);
160+
161+
// Should extract article content and filter out scripts/comments
162+
assert!(!markdown.is_empty(), "Markdown should not be empty");
163+
// Check for content that should be present
164+
assert!(markdown.contains("Lorem ipsum"));
165+
assert!(markdown.contains("long paragraph"));
166+
assert!(markdown.contains("wikipedia"));
167+
// Should not contain script content
168+
assert!(!markdown.contains("myFunction"));
169+
assert!(!markdown.contains("Some comments"));
170+
}
171+
172+
#[test]
173+
#[cfg(feature = "markdown")]
174+
fn test_empty_content_returns_empty_markdown() {
175+
let html = r#"
176+
<html>
177+
<body>
178+
<script>console.log("empty")</script>
179+
</body>
180+
</html>
181+
"#;
182+
183+
let document = Html::parse_document(html);
184+
let mut dtree = DensityTree::from_document(&document).unwrap();
185+
dtree.calculate_density_sum().unwrap();
186+
187+
let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
188+
189+
// Debug: print what we actually got
190+
println!("empty content markdown: '{}'", markdown);
191+
192+
// Empty content should return an empty string
193+
assert!(
194+
markdown.is_empty(),
195+
"Expected empty markdown for content-less HTML, got: '{}'",
196+
markdown
197+
);
198+
}
101199
}

0 commit comments

Comments
 (0)