Merge pull request #45 from oiwn/dev

oiwn · web-flow · commit 20eb6cb1fb99 · 2025-09-20T23:11:38.000+07:00
Trying to render markdown
diff --git a/.llvm-cov b/.llvm-cov
@@ -0,0 +1,2 @@
+[llvm-cov]
+ignore-filename-regex = ["src/main.rs"]
diff --git a/.tarpaulin.toml b/.tarpaulin.toml
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file
 - Library can be used without CLI dependencies by disabling default features
 - Optional `markdown` feature for structured markdown extraction using density analysis
 
-## Current Task: Markdown Extraction Implementation
+## Markdown Extraction Implementation
 
 **Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown.
 
diff --git a/examples/check.rs b/examples/check.rs
@@ -1,4 +1,6 @@
 use clap::{Parser, Subcommand};
+#[cfg(feature = "markdown")]
+use dom_content_extraction::extract_content_as_markdown;
 use dom_content_extraction::{
     DensityTree, get_content, get_node_text, scraper::Html,
 };
@@ -16,6 +18,8 @@ enum Commands {
     LoremIpsum,
     Test4,
     TestToy,
+    #[cfg(feature = "markdown")]
+    LoremIpsumMarkdown,
 }
 
 fn main() {
@@ -31,6 +35,10 @@ fn main() {
         Commands::TestToy => {
             process_toy();
         }
+        #[cfg(feature = "markdown")]
+        Commands::LoremIpsumMarkdown => {
+            process_lorem_ipsum_markdown();
+        }
     }
 }
 
@@ -80,3 +88,16 @@ fn process_toy() {
     let content = get_content(&document).unwrap();
     println!("{}", content);
 }
+
+#[cfg(feature = "markdown")]
+fn process_lorem_ipsum_markdown() {
+    println!("Processing Lorem Ipsum example as Markdown...");
+    let html_content =
+        fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
+    let document = Html::parse_document(&html_content);
+    let mut dtree = DensityTree::from_document(&document).unwrap();
+    dtree.calculate_density_sum().unwrap();
+
+    let markdown_content = extract_content_as_markdown(&dtree, &document).unwrap();
+    println!("Extracted markdown content:\n{}", markdown_content);
+}
diff --git a/src/main.rs b/src/main.rs
@@ -21,6 +21,10 @@ struct Cli {
     /// Output file (stdout if not specified)
     #[arg(short, long)]
     output: Option<PathBuf>,
+
+    /// Output format (text or markdown)
+    #[arg(long, default_value = "text", value_parser = ["text", "markdown"])]
+    format: String,
 }
 
 fn parse_url(s: &str) -> Result<Url, String> {
@@ -40,9 +44,36 @@ fn fetch_url(url: &Url) -> Result<String> {
         .and_then(|r| r.text())?)
 }
 
-fn process_html(html: &str) -> Result<String> {
+fn process_html(html: &str, format: &str) -> Result<String> {
     let document = Html::parse_document(html);
-    get_content(&document).context("Failed to extract content")
+
+    match format {
+        "text" => get_content(&document).context("Failed to extract content"),
+        "markdown" => {
+            #[cfg(not(feature = "markdown"))]
+            {
+                anyhow::bail!(
+                    "Markdown output requires the 'markdown' feature to be enabled"
+                );
+            }
+
+            #[cfg(feature = "markdown")]
+            {
+                use dom_content_extraction::{
+                    DensityTree, extract_content_as_markdown,
+                };
+                let mut dtree = DensityTree::from_document(&document)
+                    .context("Failed to create density tree")?;
+                dtree
+                    .calculate_density_sum()
+                    .context("Failed to calculate density sums")?;
+                extract_content_as_markdown(&dtree, &document)
+                    .map_err(|e| anyhow::anyhow!(e))
+                    .context("Failed to extract content as markdown")
+            }
+        }
+        _ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format),
+    }
 }
 
 fn write_output(content: &str, output_path: Option<PathBuf>) -> Result<()> {
@@ -81,7 +112,7 @@ fn main() -> Result<()> {
     };
 
     // Process HTML and extract content
-    let extracted_content = process_html(&html_content)?;
+    let extracted_content = process_html(&html_content, &cli.format)?;
 
     // Write output
     write_output(&extracted_content, cli.output)?;
diff --git a/src/markdown.rs b/src/markdown.rs
@@ -65,6 +65,7 @@ pub fn extract_content_as_markdown(
 mod tests {
     use super::*;
     use crate::DensityTree;
+    use std::fs;
 
     #[test]
     #[cfg(feature = "markdown")]
@@ -89,13 +90,110 @@ mod tests {
 
         let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
 
-        // Debug: print what we actually got
-        println!("Generated markdown: '{}'", markdown);
-
         // Should contain the main content
         assert!(!markdown.is_empty(), "Markdown should not be empty");
-        // Relaxed assertions for debugging
         assert!(markdown.contains("Main Article"));
         assert!(markdown.contains("main content"));
     }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test1_html() {
+        let html_content = fs::read_to_string("html/test_1.html")
+            .expect("Unable to read test_1.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug: print what we actually got
+        println!("test1 markdown: '{}'", markdown);
+
+        // Should extract article body content (highest density)
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Check for content that should be present in article body
+        assert!(markdown.contains("Here is text"));
+        assert!(markdown.contains("Paragraph text"));
+        assert!(markdown.contains("huge paragraph"));
+        // Should not contain footer navigation
+        assert!(!markdown.contains("Menu"));
+        assert!(!markdown.contains("link1"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test2_html() {
+        let html_content = fs::read_to_string("html/test_2.html")
+            .expect("Unable to read test_2.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug: print what we actually got
+        println!("test2 markdown: '{}'", markdown);
+
+        // Should extract article body content (highest density)
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Check for content that should be present in article body
+        assert!(markdown.contains("Here is text"));
+        assert!(markdown.contains("long paragraph"));
+        // Links should be converted to markdown format
+        assert!(markdown.contains("wikipedia"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_extract_from_test4_html() {
+        let html_content = fs::read_to_string("html/test_4.html")
+            .expect("Unable to read test_4.html");
+        let document = Html::parse_document(&html_content);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug: print what we actually got
+        println!("test4 markdown: '{}'", markdown);
+
+        // Should extract article content and filter out scripts/comments
+        assert!(!markdown.is_empty(), "Markdown should not be empty");
+        // Check for content that should be present
+        assert!(markdown.contains("Lorem ipsum"));
+        assert!(markdown.contains("long paragraph"));
+        assert!(markdown.contains("wikipedia"));
+        // Should not contain script content
+        assert!(!markdown.contains("myFunction"));
+        assert!(!markdown.contains("Some comments"));
+    }
+
+    #[test]
+    #[cfg(feature = "markdown")]
+    fn test_empty_content_returns_empty_markdown() {
+        let html = r#"
+            <html>
+            <body>
+                <script>console.log("empty")</script>
+            </body>
+            </html>
+        "#;
+
+        let document = Html::parse_document(html);
+        let mut dtree = DensityTree::from_document(&document).unwrap();
+        dtree.calculate_density_sum().unwrap();
+
+        let markdown = extract_content_as_markdown(&dtree, &document).unwrap();
+
+        // Debug: print what we actually got
+        println!("empty content markdown: '{}'", markdown);
+
+        // Empty content should return empty string
+        assert!(
+            markdown.is_empty(),
+            "Expected empty markdown for content-less HTML, got: '{}'",
+            markdown
+        );
+    }
 }

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -0,0 +1,2 @@` |
|  | `1` | `+[llvm-cov]` |
|  | `2` | `+ignore-filename-regex = ["src/main.rs"]` |