From 11d92bbcde4273558428cf7c00dddb07bb35ce55 Mon Sep 17 00:00:00 2001 From: oiwn Date: Sat, 20 Sep 2025 18:38:14 +0700 Subject: [PATCH 1/3] adding markdown renderer --- CLAUDE.md | 2 +- examples/check.rs | 21 +++++++++ src/markdown.rs | 106 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 124 insertions(+), 5 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index a24e6ea..1d43ac1 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file - Library can be used without CLI dependencies by disabling default features - Optional `markdown` feature for structured markdown extraction using density analysis -## Current Task: Markdown Extraction Implementation +## Markdown Extraction Implementation **Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown. diff --git a/examples/check.rs b/examples/check.rs index 6ec14f5..1bdee67 100644 --- a/examples/check.rs +++ b/examples/check.rs @@ -2,6 +2,8 @@ use clap::{Parser, Subcommand}; use dom_content_extraction::{ DensityTree, get_content, get_node_text, scraper::Html, }; +#[cfg(feature = "markdown")] +use dom_content_extraction::extract_content_as_markdown; use std::fs; #[derive(Parser)] @@ -16,6 +18,8 @@ enum Commands { LoremIpsum, Test4, TestToy, + #[cfg(feature = "markdown")] + LoremIpsumMarkdown, } fn main() { @@ -31,6 +35,10 @@ fn main() { Commands::TestToy => { process_toy(); } + #[cfg(feature = "markdown")] + Commands::LoremIpsumMarkdown => { + process_lorem_ipsum_markdown(); + } } } @@ -80,3 +88,16 @@ fn process_toy() { let content = get_content(&document).unwrap(); println!("{}", content); } + +#[cfg(feature = "markdown")] +fn process_lorem_ipsum_markdown() { + println!("Processing Lorem Ipsum example as Markdown..."); + let html_content = + fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap(); + println!("Extracted markdown content:\n{}", markdown_content); +} diff --git a/src/markdown.rs b/src/markdown.rs index 19850dc..2753328 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -65,6 +65,7 @@ pub fn extract_content_as_markdown( mod tests { use super::*; use crate::DensityTree; + use std::fs; #[test] #[cfg(feature = "markdown")] @@ -89,13 +90,110 @@ mod tests { let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); - // Debug: print what we actually got - println!("Generated markdown: '{}'", markdown); - // Should contain the main content assert!(!markdown.is_empty(), "Markdown should not be empty"); - // Relaxed assertions for debugging assert!(markdown.contains("Main Article")); assert!(markdown.contains("main content")); } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test1_html() { + let html_content = fs::read_to_string("html/test_1.html") + .expect("Unable to read test_1.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test1 markdown: '{}'", markdown); + + // Should extract article body content (highest density) + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present in article body + assert!(markdown.contains("Here is text")); + assert!(markdown.contains("Paragraph text")); + assert!(markdown.contains("huge paragraph")); + // Should not contain footer navigation + assert!(!markdown.contains("Menu")); + assert!(!markdown.contains("link1")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test2_html() { + let html_content = fs::read_to_string("html/test_2.html") + .expect("Unable to read test_2.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test2 markdown: '{}'", markdown); + + // Should extract article body content (highest density) + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present in article body + assert!(markdown.contains("Here is text")); + assert!(markdown.contains("long paragraph")); + // Links should be converted to markdown format + assert!(markdown.contains("wikipedia")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_extract_from_test4_html() { + let html_content = fs::read_to_string("html/test_4.html") + .expect("Unable to read test_4.html"); + let document = Html::parse_document(&html_content); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("test4 markdown: '{}'", markdown); + + // Should extract article content and filter out scripts/comments + assert!(!markdown.is_empty(), "Markdown should not be empty"); + // Check for content that should be present + assert!(markdown.contains("Lorem ipsum")); + assert!(markdown.contains("long paragraph")); + assert!(markdown.contains("wikipedia")); + // Should not contain script content + assert!(!markdown.contains("myFunction")); + assert!(!markdown.contains("Some comments")); + } + + #[test] + #[cfg(feature = "markdown")] + fn test_empty_content_returns_empty_markdown() { + let html = r#" + + + + + + "#; + + let document = Html::parse_document(html); + let mut dtree = DensityTree::from_document(&document).unwrap(); + dtree.calculate_density_sum().unwrap(); + + let markdown = extract_content_as_markdown(&dtree, &document).unwrap(); + + // Debug: print what we actually got + println!("empty content markdown: '{}'", markdown); + + // Empty content should return empty string + assert!( + markdown.is_empty(), + "Expected empty markdown for content-less HTML, got: '{}'", + markdown + ); + } } From 22bb33fd78162b315a65ead1767789ea08d5097a Mon Sep 17 00:00:00 2001 From: oiwn Date: Sat, 20 Sep 2025 21:56:51 +0700 Subject: [PATCH 2/3] can pick render format text or markdown --- src/main.rs | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/main.rs b/src/main.rs index c7ead1a..049c207 100644 --- a/src/main.rs +++ b/src/main.rs @@ -21,6 +21,10 @@ struct Cli { /// Output file (stdout if not specified) #[arg(short, long)] output: Option, + + /// Output format (text or markdown) + #[arg(long, default_value = "text", value_parser = ["text", "markdown"])] + format: String, } fn parse_url(s: &str) -> Result { @@ -40,9 +44,31 @@ fn fetch_url(url: &Url) -> Result { .and_then(|r| r.text())?) } -fn process_html(html: &str) -> Result { +fn process_html(html: &str, format: &str) -> Result { let document = Html::parse_document(html); - get_content(&document).context("Failed to extract content") + + match format { + "text" => get_content(&document).context("Failed to extract content"), + "markdown" => { + #[cfg(not(feature = "markdown"))] + { + anyhow::bail!("Markdown output requires the 'markdown' feature to be enabled"); + } + + #[cfg(feature = "markdown")] + { + use dom_content_extraction::{DensityTree, extract_content_as_markdown}; + let mut dtree = DensityTree::from_document(&document) + .context("Failed to create density tree")?; + dtree.calculate_density_sum() + .context("Failed to calculate density sums")?; + extract_content_as_markdown(&dtree, &document) + .map_err(|e| anyhow::anyhow!(e)) + .context("Failed to extract content as markdown") + } + } + _ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format), + } } fn write_output(content: &str, output_path: Option) -> Result<()> { @@ -81,7 +107,7 @@ fn main() -> Result<()> { }; // Process HTML and extract content - let extracted_content = process_html(&html_content)?; + let extracted_content = process_html(&html_content, &cli.format)?; // Write output write_output(&extracted_content, cli.output)?; From 6e621ed46133737f8e45cc377e4a8b7b7c91f603 Mon Sep 17 00:00:00 2001 From: oiwn Date: Sat, 20 Sep 2025 22:55:03 +0700 Subject: [PATCH 3/3] fixes --- .llvm-cov | 2 ++ .tarpaulin.toml | 14 -------------- examples/check.rs | 6 +++--- src/main.rs | 15 ++++++++++----- 4 files changed, 15 insertions(+), 22 deletions(-) create mode 100644 .llvm-cov delete mode 100644 .tarpaulin.toml diff --git a/.llvm-cov b/.llvm-cov new file mode 100644 index 0000000..bdcdab7 --- /dev/null +++ b/.llvm-cov @@ -0,0 +1,2 @@ +[llvm-cov] +ignore-filename-regex = ["src/main.rs"] \ No newline at end of file diff --git a/.tarpaulin.toml b/.tarpaulin.toml deleted file mode 100644 index 59ef03d..0000000 --- a/.tarpaulin.toml +++ /dev/null @@ -1,14 +0,0 @@ -[coverage] -# Exclude benches directory -exclude-files = [ - "benches/*", - "examples/*" -] - -[report] -# Output options -out = ["Xml", "Html", "Json"] -output-dir = "target/tarpaulin" - -# Report configuration -fail-under = 80 # Fail if coverage is under 80% diff --git a/examples/check.rs b/examples/check.rs index 1bdee67..d3df7e0 100644 --- a/examples/check.rs +++ b/examples/check.rs @@ -1,9 +1,9 @@ use clap::{Parser, Subcommand}; +#[cfg(feature = "markdown")] +use dom_content_extraction::extract_content_as_markdown; use dom_content_extraction::{ DensityTree, get_content, get_node_text, scraper::Html, }; -#[cfg(feature = "markdown")] -use dom_content_extraction::extract_content_as_markdown; use std::fs; #[derive(Parser)] @@ -97,7 +97,7 @@ fn process_lorem_ipsum_markdown() { let document = Html::parse_document(&html_content); let mut dtree = DensityTree::from_document(&document).unwrap(); dtree.calculate_density_sum().unwrap(); - + let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap(); println!("Extracted markdown content:\n{}", markdown_content); } diff --git a/src/main.rs b/src/main.rs index 049c207..a475b18 100644 --- a/src/main.rs +++ b/src/main.rs @@ -46,21 +46,26 @@ fn fetch_url(url: &Url) -> Result { fn process_html(html: &str, format: &str) -> Result { let document = Html::parse_document(html); - + match format { "text" => get_content(&document).context("Failed to extract content"), "markdown" => { #[cfg(not(feature = "markdown"))] { - anyhow::bail!("Markdown output requires the 'markdown' feature to be enabled"); + anyhow::bail!( + "Markdown output requires the 'markdown' feature to be enabled" + ); } - + #[cfg(feature = "markdown")] { - use dom_content_extraction::{DensityTree, extract_content_as_markdown}; + use dom_content_extraction::{ + DensityTree, extract_content_as_markdown, + }; let mut dtree = DensityTree::from_document(&document) .context("Failed to create density tree")?; - dtree.calculate_density_sum() + dtree + .calculate_density_sum() .context("Failed to calculate density sums")?; extract_content_as_markdown(&dtree, &document) .map_err(|e| anyhow::anyhow!(e))