diff --git a/.llvm-cov b/.llvm-cov new file mode 100644 index 0000000..bdcdab7 --- /dev/null +++ b/.llvm-cov @@ -0,0 +1,2 @@ +[llvm-cov] +ignore-filename-regex = ["src/main.rs"] \ No newline at end of file diff --git a/.tarpaulin.toml b/.tarpaulin.toml deleted file mode 100644 index 59ef03d..0000000 --- a/.tarpaulin.toml +++ /dev/null @@ -1,14 +0,0 @@ -[coverage] -# Exclude benches directory -exclude-files = [ - "benches/*", - "examples/*" -] - -[report] -# Output options -out = ["Xml", "Html", "Json"] -output-dir = "target/tarpaulin" - -# Report configuration -fail-under = 80 # Fail if coverage is under 80% diff --git a/CLAUDE.md b/CLAUDE.md index a24e6ea..1d43ac1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file - Library can be used without CLI dependencies by disabling default features - Optional `markdown` feature for structured markdown extraction using density analysis -## Current Task: Markdown Extraction Implementation +## Markdown Extraction Implementation **Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown. diff --git a/examples/check.rs b/examples/check.rs index 6ec14f5..d3df7e0 100644 --- a/examples/check.rs +++ b/examples/check.rs @@ -1,4 +1,6 @@ use clap::{Parser, Subcommand}; +#[cfg(feature = "markdown")] +use dom_content_extraction::extract_content_as_markdown; use dom_content_extraction::{ DensityTree, get_content, get_node_text, scraper::Html, }; @@ -16,6 +18,8 @@ enum Commands { LoremIpsum, Test4, TestToy, + #[cfg(feature = "markdown")] + LoremIpsumMarkdown, } fn main() { @@ -31,6 +35,10 @@ fn main() { Commands::TestToy => { process_toy(); } + #[cfg(feature = "markdown")] + Commands::LoremIpsumMarkdown => { + process_lorem_ipsum_markdown(); + } } } @@ -80,3 +88,16 @@ fn process_toy() { let content = get_content(&document).unwrap(); println!("{}", content); } + +#[cfg(feature = "markdown")] +fn process_lorem_ipsum_markdown() { + println!("Processing Lorem Ipsum example as Markdown..."); + let html_content = + fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap(); + println!("Extracted markdown content:\n{}", markdown_content); +} diff --git a/src/main.rs b/src/main.rs index c7ead1a..a475b18 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,6 +21,10 @@ struct Cli { /// Output file (stdout if not specified) #[arg(short, long)] output: Option, + + /// Output format (text or markdown) + #[arg(long, default_value = "text", value_parser = ["text", "markdown"])] + format: String, } fn parse_url(s: &str) -> Result { @@ -40,9 +44,36 @@ fn fetch_url(url: &Url) -> Result { .and_then(|r| r.text())?) } -fn process_html(html: &str) -> Result { +fn process_html(html: &str, format: &str) -> Result { let document = Html::parse_document(html); - get_content(&document).context("Failed to extract content") + + match format { + "text" => get_content(&document).context("Failed to extract content"), + "markdown" => { + #[cfg(not(feature = "markdown"))] + { + anyhow::bail!( + "Markdown output requires the 'markdown' feature to be enabled" + ); + } + + #[cfg(feature = "markdown")] + { + use dom_content_extraction::{ + DensityTree, extract_content_as_markdown, + }; + let mut dtree = DensityTree::from_document(&document) + .context("Failed to create density tree")?; + dtree + .calculate_density_sum() + .context("Failed to calculate density sums")?; + extract_content_as_markdown(&dtree, &document) + .map_err(|e| anyhow::anyhow!(e)) + .context("Failed to extract content as markdown") + } + } + _ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format), + } } fn write_output(content: &str, output_path: Option) -> Result<()> { @@ -81,7 +112,7 @@ fn main() -> Result<()> { }; // Process HTML and extract content - let extracted_content = process_html(&html_content)?; + let extracted_content = process_html(&html_content, &cli.format)?; // Write output write_output(&extracted_content, cli.output)?; diff --git a/src/markdown.rs b/src/markdown.rs index 19850dc..2753328 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -65,6 +65,7 @@ pub fn extract_content_as_markdown( mod tests { use super::*; use crate::DensityTree; + use std::fs; #[test] #[cfg(feature = "markdown")] @@ -89,13 +90,110 @@ mod tests { let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); - // Debug: print what we actually got - println!("Generated markdown: '{}'", markdown); - // Should contain the main content assert!(!markdown.is_empty(), "Markdown should not be empty"); - // Relaxed assertions for debugging assert!(markdown.contains("Main Article")); assert!(markdown.contains("main content")); } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test1_html() { + let html_content = fs::read_to_string("html/test_1.html") + .expect("Unable to read test_1.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test1 markdown: '{}'", markdown); + + // Should extract article body content (highest density) + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present in article body + assert!(markdown.contains("Here is text")); + assert!(markdown.contains("Paragraph text")); + assert!(markdown.contains("huge paragraph")); + // Should not contain footer navigation + assert!(!markdown.contains("Menu")); + assert!(!markdown.contains("link1")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test2_html() { + let html_content = fs::read_to_string("html/test_2.html") + .expect("Unable to read test_2.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test2 markdown: '{}'", markdown); + + // Should extract article body content (highest density) + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present in article body + assert!(markdown.contains("Here is text")); + assert!(markdown.contains("long paragraph")); + // Links should be converted to markdown format + assert!(markdown.contains("wikipedia")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test4_html() { + let html_content = fs::read_to_string("html/test_4.html") + .expect("Unable to read test_4.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test4 markdown: '{}'", markdown); + + // Should extract article content and filter out scripts/comments + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present + assert!(markdown.contains("Lorem ipsum")); + assert!(markdown.contains("long paragraph")); + assert!(markdown.contains("wikipedia")); + // Should not contain script content + assert!(!markdown.contains("myFunction")); + assert!(!markdown.contains("Some comments")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_empty_content_returns_empty_markdown() { + let html = r#" + + + + + + "#; + + let document = Html::parse_document(html); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("empty content markdown: '{}'", markdown); + + // Empty content should return empty string + assert!( + markdown.is_empty(), + "Expected empty markdown for content-less HTML, got: '{}'", + markdown + ); + } }