From 11d92bbcde4273558428cf7c00dddb07bb35ce55 Mon Sep 17 00:00:00 2001
From: oiwn <alex@imscraping.ninja>
Date: Sat, 20 Sep 2025 18:38:14 +0700
Subject: [PATCH 1/3] add markdown extraction example and tests

---
 CLAUDE.md         |   2 +-
 examples/check.rs |  21 +++++++++
 src/markdown.rs   | 106 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/CLAUDE.md b/CLAUDE.md
index a24e6ea..1d43ac1 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file
 - Library can be used without CLI dependencies by disabling default features
 - Optional `markdown` feature for structured markdown extraction using density analysis
 
-## Current Task: Markdown Extraction Implementation
+## Markdown Extraction Implementation
 
 **Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown.
 
diff --git a/examples/check.rs b/examples/check.rs
index 6ec14f5..1bdee67 100644
--- a/examples/check.rs
+++ b/examples/check.rs
@@ -2,6 +2,8 @@ use clap::{Parser, Subcommand};
 use dom_content_extraction::{
     DensityTree, get_content, get_node_text, scraper::Html,
 };
+#[cfg(feature = "markdown")]
+use dom_content_extraction::extract_content_as_markdown;
 use std::fs;
 
 #[derive(Parser)]
@@ -16,6 +18,8 @@ enum Commands {
     LoremIpsum,
     Test4,
     TestToy,
+    #[cfg(feature = "markdown")]
+    LoremIpsumMarkdown,
 }
 
 fn main() {
@@ -31,6 +35,10 @@ fn main() {
         Commands::TestToy => {
             process_toy();
         }
+        #[cfg(feature = "markdown")]
+        Commands::LoremIpsumMarkdown => {
+            process_lorem_ipsum_markdown();
+        }
     }
 }
 
@@ -80,3 +88,16 @@ fn process_toy() {
     let content = get_content(&document).unwrap();
     println!("{}", content);
 }
+
+#[cfg(feature = "markdown")]
+fn process_lorem_ipsum_markdown() {
+    println!("Processing Lorem Ipsum example as Markdown...");
+    let html_content =
+        fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
+    let document = Html::parse_document(&html_content);
+    let mut dtree = DensityTree::from_document(&document).unwrap();
+    dtree.calculate_density_sum().unwrap();
+    
+    let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap();
+    println!("Extracted markdown content:\n{}", markdown_content);
+}
diff --git a/src/markdown.rs b/src/markdown.rs
index 19850dc..2753328 100644
--- a/src/markdown.rs
+++ b/src/markdown.rs
@@ -65,6 +65,7 @@ pub fn extract_content_as_markdown(
 mod tests {
     use super::*;
     use crate::DensityTree;
+    use std::fs;
 
     #[test]
     #[cfg(feature = "markdown")]
@@ -89,13 +90,110 @@ mod tests {
 
         let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
 
-        // Debug: print what we actually got
-        println!("Generated markdown: '{}'", markdown);
-
         // Should contain the main content
         assert!(!markdown.is_empty(), "Markdown should not be empty");
-        // Relaxed assertions for debugging
         assert!(markdown.contains("Main Article"));
         assert!(markdown.contains("main content"));
     }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test1_html() {
+        let html_content = fs::read_to_string("html/test_1.html")
+            .expect("Unable to read test_1.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug aid: the harness captures this; shown on failure or with --nocapture
+        println!("test1 markdown: '{}'", markdown);
+
+        // Should extract article body content (highest density)
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Fixture phrases expected in the article body (see html/test_1.html)
+        assert!(markdown.contains("Here is text"));
+        assert!(markdown.contains("Paragraph text"));
+        assert!(markdown.contains("huge paragraph"));
+        // Navigation strings must be filtered out of the extracted content
+        assert!(!markdown.contains("Menu"));
+        assert!(!markdown.contains("link1"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test2_html() {
+        let html_content = fs::read_to_string("html/test_2.html")
+            .expect("Unable to read test_2.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug aid: the harness captures this; shown on failure or with --nocapture
+        println!("test2 markdown: '{}'", markdown);
+
+        // Should extract article body content (highest density)
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Fixture phrases expected in the article body (see html/test_2.html)
+        assert!(markdown.contains("Here is text"));
+        assert!(markdown.contains("long paragraph"));
+        // The wikipedia link must survive in some form; exact markdown link syntax is not asserted
+        assert!(markdown.contains("wikipedia"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test4_html() {
+        let html_content = fs::read_to_string("html/test_4.html")
+            .expect("Unable to read test_4.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug aid: the harness captures this; shown on failure or with --nocapture
+        println!("test4 markdown: '{}'", markdown);
+
+        // Should extract article content and filter out scripts/comments
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Fixture phrases expected in the extracted content (see html/test_4.html)
+        assert!(markdown.contains("Lorem ipsum"));
+        assert!(markdown.contains("long paragraph"));
+        assert!(markdown.contains("wikipedia"));
+        // Should not contain script code or comment text (fixture strings; see html/test_4.html)
+        assert!(!markdown.contains("myFunction"));
+        assert!(!markdown.contains("Some comments"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_empty_content_returns_empty_markdown() {
+        let html = r#"
+            <html>
+            <body>
+                <script>console.log("empty")</script>
+            </body>
+            </html>
+        "#;
+
+        let document = Html::parse_document(html);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug aid: the harness captures this; shown on failure or with --nocapture
+        println!("empty content markdown: '{}'", markdown);
+
+        // The body holds only a <script>, so no text should be extracted and the result must be ""
+        assert!(
+            markdown.is_empty(),
+            "Expected empty markdown for content-less HTML, got: '{}'",
+            markdown
+        );
+    }
 }

From 22bb33fd78162b315a65ead1767789ea08d5097a Mon Sep 17 00:00:00 2001
From: oiwn <alex@imscraping.ninja>
Date: Sat, 20 Sep 2025 21:56:51 +0700
Subject: [PATCH 2/3] add --format option to choose text or markdown output

---
 src/main.rs | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/main.rs b/src/main.rs
index c7ead1a..049c207 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,6 +21,10 @@ struct Cli {
     /// Output file (stdout if not specified)
     #[arg(short, long)]
     output: Option<PathBuf>,
+
+    /// Output format (text or markdown)
+    #[arg(long, default_value = "text", value_parser = ["text", "markdown"])]
+    format: String,
 }
 
 fn parse_url(s: &str) -> Result<Url, String> {
@@ -40,9 +44,31 @@ fn fetch_url(url: &Url) -> Result<String> {
         .and_then(|r| r.text())?)
 }
 
-fn process_html(html: &str) -> Result<String> {
+fn process_html(html: &str, format: &str) -> Result<String> {
     let document = Html::parse_document(html);
-    get_content(&document).context("Failed to extract content")
+    
+    match format {
+        "text" => get_content(&document).context("Failed to extract content"),
+        "markdown" => {
+            #[cfg(not(feature = "markdown"))]
+            {
+                anyhow::bail!("Markdown output requires the 'markdown' feature to be enabled");
+            }
+            
+            #[cfg(feature = "markdown")]
+            {
+                use dom_content_extraction::{DensityTree, extract_content_as_markdown};
+                let mut dtree = DensityTree::from_document(&document)
+                    .context("Failed to create density tree")?;
+                dtree.calculate_density_sum()
+                    .context("Failed to calculate density sums")?;
+                extract_content_as_markdown(&dtree, &document)
+                    .map_err(|e| anyhow::anyhow!(e))
+                    .context("Failed to extract content as markdown")
+            }
+        }
+        _ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format),
+    }
 }
 
 fn write_output(content: &str, output_path: Option<PathBuf>) -> Result<()> {
@@ -81,7 +107,7 @@ fn main() -> Result<()> {
     };
 
     // Process HTML and extract content
-    let extracted_content = process_html(&html_content)?;
+    let extracted_content = process_html(&html_content, &cli.format)?;
 
     // Write output
     write_output(&extracted_content, cli.output)?;

From 6e621ed46133737f8e45cc377e4a8b7b7c91f603 Mon Sep 17 00:00:00 2001
From: oiwn <alex@imscraping.ninja>
Date: Sat, 20 Sep 2025 22:55:03 +0700
Subject: [PATCH 3/3] apply rustfmt fixes and replace tarpaulin config with llvm-cov

---
 .llvm-cov         |  2 ++
 .tarpaulin.toml   | 14 --------------
 examples/check.rs |  6 +++---
 src/main.rs       | 15 ++++++++++-----
 4 files changed, 15 insertions(+), 22 deletions(-)
 create mode 100644 .llvm-cov
 delete mode 100644 .tarpaulin.toml

diff --git a/.llvm-cov b/.llvm-cov
new file mode 100644
index 0000000..bdcdab7
--- /dev/null
+++ b/.llvm-cov
@@ -0,0 +1,2 @@
+[llvm-cov]
+ignore-filename-regex = ["src/main.rs"]
\ No newline at end of file
diff --git a/.tarpaulin.toml b/.tarpaulin.toml
deleted file mode 100644
index 59ef03d..0000000
--- a/.tarpaulin.toml
+++ /dev/null
@@ -1,14 +0,0 @@
-[coverage]
-# Exclude benches directory
-exclude-files = [
-    "benches/*",
-    "examples/*"
-]
-
-[report]
-# Output options
-out = ["Xml", "Html", "Json"]
-output-dir = "target/tarpaulin"
-
-# Report configuration
-fail-under = 80 # Fail if coverage is under 80%
diff --git a/examples/check.rs b/examples/check.rs
index 1bdee67..d3df7e0 100644
--- a/examples/check.rs
+++ b/examples/check.rs
@@ -1,9 +1,9 @@
 use clap::{Parser, Subcommand};
+#[cfg(feature = "markdown")]
+use dom_content_extraction::extract_content_as_markdown;
 use dom_content_extraction::{
     DensityTree, get_content, get_node_text, scraper::Html,
 };
-#[cfg(feature = "markdown")]
-use dom_content_extraction::extract_content_as_markdown;
 use std::fs;
 
 #[derive(Parser)]
@@ -97,7 +97,7 @@ fn process_lorem_ipsum_markdown() {
     let document = Html::parse_document(&html_content);
     let mut dtree = DensityTree::from_document(&document).unwrap();
     dtree.calculate_density_sum().unwrap();
-    
+
     let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap();
     println!("Extracted markdown content:\n{}", markdown_content);
 }
diff --git a/src/main.rs b/src/main.rs
index 049c207..a475b18 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -46,21 +46,26 @@ fn fetch_url(url: &Url) -> Result<String> {
 
 fn process_html(html: &str, format: &str) -> Result<String> {
     let document = Html::parse_document(html);
-    
+
     match format {
         "text" => get_content(&document).context("Failed to extract content"),
         "markdown" => {
             #[cfg(not(feature = "markdown"))]
             {
-                anyhow::bail!("Markdown output requires the 'markdown' feature to be enabled");
+                anyhow::bail!(
+                    "Markdown output requires the 'markdown' feature to be enabled"
+                );
             }
-            
+
             #[cfg(feature = "markdown")]
             {
-                use dom_content_extraction::{DensityTree, extract_content_as_markdown};
+                use dom_content_extraction::{
+                    DensityTree, extract_content_as_markdown,
+                };
                 let mut dtree = DensityTree::from_document(&document)
                     .context("Failed to create density tree")?;
-                dtree.calculate_density_sum()
+                dtree
+                    .calculate_density_sum()
                     .context("Failed to calculate density sums")?;
                 extract_content_as_markdown(&dtree, &document)
                     .map_err(|e| anyhow::anyhow!(e))