Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .llvm-cov
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[llvm-cov]
ignore-filename-regex = ["src/main.rs"]
14 changes: 0 additions & 14 deletions .tarpaulin.toml

This file was deleted.

2 changes: 1 addition & 1 deletion CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ cargo run --bin dce -- --file input.html --output out.txt # Extract from file
- Library can be used without CLI dependencies by disabling default features
- Optional `markdown` feature for structured markdown extraction using density analysis

## Current Task: Markdown Extraction Implementation
## Markdown Extraction Implementation

**Goal**: Add markdown extraction capability that leverages CETD density analysis to extract main content as structured markdown.

Expand Down
21 changes: 21 additions & 0 deletions examples/check.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use clap::{Parser, Subcommand};
#[cfg(feature = "markdown")]
use dom_content_extraction::extract_content_as_markdown;
use dom_content_extraction::{
DensityTree, get_content, get_node_text, scraper::Html,
};
Expand All @@ -16,6 +18,8 @@ enum Commands {
LoremIpsum,
Test4,
TestToy,
#[cfg(feature = "markdown")]
LoremIpsumMarkdown,
}

fn main() {
Expand All @@ -31,6 +35,10 @@ fn main() {
Commands::TestToy => {
process_toy();
}
#[cfg(feature = "markdown")]
Commands::LoremIpsumMarkdown => {
process_lorem_ipsum_markdown();
}
}
}

Expand Down Expand Up @@ -80,3 +88,16 @@ fn process_toy() {
let content = get_content(&document).unwrap();
println!("{}", content);
}

/// Reads `html/lorem_ipsum.html`, runs the markdown extraction over it and
/// prints the resulting markdown to stdout.
///
/// Compiled only when the `markdown` feature is enabled. Panics if the file
/// cannot be read or if any extraction step returns an error.
#[cfg(feature = "markdown")]
fn process_lorem_ipsum_markdown() {
    println!("Processing Lorem Ipsum example as Markdown...");

    let raw_html = fs::read_to_string("html/lorem_ipsum.html").expect("Unable to read file");
    let page = Html::parse_document(&raw_html);

    // Build the density tree and compute its density sums, then hand the tree
    // and the parsed page to the markdown extractor.
    let mut tree = DensityTree::from_document(&page).unwrap();
    tree.calculate_density_sum().unwrap();
    let rendered = extract_content_as_markdown(&tree, &page).unwrap();

    println!("Extracted markdown content:\n{}", rendered);
}
37 changes: 34 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ struct Cli {
/// Output file (stdout if not specified)
#[arg(short, long)]
output: Option<PathBuf>,

/// Output format (text or markdown)
#[arg(long, default_value = "text", value_parser = ["text", "markdown"])]
format: String,
}

fn parse_url(s: &str) -> Result<Url, String> {
Expand All @@ -40,9 +44,36 @@ fn fetch_url(url: &Url) -> Result<String> {
.and_then(|r| r.text())?)
}

fn process_html(html: &str) -> Result<String> {
/// Parses `html` and extracts its content in the requested `format`.
///
/// * `"text"` returns the result of `get_content` on the parsed document.
/// * `"markdown"` builds a `DensityTree`, calculates its density sums and
///   calls `extract_content_as_markdown`. When the crate is built without the
///   `markdown` feature this branch returns an error instead.
/// * Any other value returns an "Invalid format" error.
///
/// # Errors
///
/// Returns an error (with added context) when any extraction step fails, when
/// markdown is requested without the `markdown` feature, or when `format` is
/// not one of the two recognised values.
fn process_html(html: &str, format: &str) -> Result<String> {
let document = Html::parse_document(html);
// NOTE(review): the next line appears to be the removed pre-change body
// carried over from the diff view — confirm it is not part of the new function.
get_content(&document).context("Failed to extract content")

match format {
"text" => get_content(&document).context("Failed to extract content"),
"markdown" => {
// Built without the `markdown` feature: fail with an explanatory error.
#[cfg(not(feature = "markdown"))]
{
anyhow::bail!(
"Markdown output requires the 'markdown' feature to be enabled"
);
}

// Built with the `markdown` feature: run the density-based extraction.
#[cfg(feature = "markdown")]
{
use dom_content_extraction::{
DensityTree, extract_content_as_markdown,
};
let mut dtree = DensityTree::from_document(&document)
.context("Failed to create density tree")?;
dtree
.calculate_density_sum()
.context("Failed to calculate density sums")?;
// Wrap the extractor's error in an `anyhow::Error` before attaching context.
extract_content_as_markdown(&dtree, &document)
.map_err(|e| anyhow::anyhow!(e))
.context("Failed to extract content as markdown")
}
}
// The CLI's `value_parser` already limits `--format` to "text" and
// "markdown"; this arm covers any other string passed by a direct caller.
_ => anyhow::bail!("Invalid format: {}. Use 'text' or 'markdown'", format),
}
}

fn write_output(content: &str, output_path: Option<PathBuf>) -> Result<()> {
Expand Down Expand Up @@ -81,7 +112,7 @@ fn main() -> Result<()> {
};

// Process HTML and extract content
let extracted_content = process_html(&html_content)?;
let extracted_content = process_html(&html_content, &cli.format)?;

// Write output
write_output(&extracted_content, cli.output)?;
Expand Down
106 changes: 102 additions & 4 deletions src/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ pub fn extract_content_as_markdown(
mod tests {
use super::*;
use crate::DensityTree;
use std::fs;

#[test]
#[cfg(feature = "markdown")]
Expand All @@ -89,13 +90,110 @@ mod tests {

let markdown = extract_content_as_markdown(&dtree, &document).unwrap();

// Debug: print what we actually got
println!("Generated markdown: '{}'", markdown);

// Should contain the main content
assert!(!markdown.is_empty(), "Markdown should not be empty");
// Relaxed assertions for debugging
assert!(markdown.contains("Main Article"));
assert!(markdown.contains("main content"));
}

/// Runs markdown extraction on `html/test_1.html` and checks that the
/// expected body snippets are present while the "Menu" / "link1" strings
/// are absent.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test1_html() {
    let raw_html =
        fs::read_to_string("html/test_1.html").expect("Unable to read test_1.html");
    let page = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&page).unwrap();
    tree.calculate_density_sum().unwrap();
    let markdown = extract_content_as_markdown(&tree, &page).unwrap();

    // Echo the output so it is visible when the test is run with --nocapture.
    println!("test1 markdown: '{}'", markdown);

    // Something must have been extracted at all.
    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Snippets expected in the extracted content.
    assert!(markdown.contains("Here is text"));
    assert!(markdown.contains("Paragraph text"));
    assert!(markdown.contains("huge paragraph"));

    // Strings that must have been left out of the extracted content.
    assert!(!markdown.contains("Menu"));
    assert!(!markdown.contains("link1"));
}

/// Runs markdown extraction on `html/test_2.html` and checks that the
/// expected snippets appear in the output.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test2_html() {
    let raw_html =
        fs::read_to_string("html/test_2.html").expect("Unable to read test_2.html");
    let page = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&page).unwrap();
    tree.calculate_density_sum().unwrap();
    let markdown = extract_content_as_markdown(&tree, &page).unwrap();

    // Echo the output so it is visible when the test is run with --nocapture.
    println!("test2 markdown: '{}'", markdown);

    // Something must have been extracted at all.
    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Snippets expected in the extracted content.
    assert!(markdown.contains("Here is text"));
    assert!(markdown.contains("long paragraph"));

    // Only checks that the string "wikipedia" survives extraction; the
    // markdown link syntax itself is not verified here.
    assert!(markdown.contains("wikipedia"));
}

/// Runs markdown extraction on `html/test_4.html` and checks that the
/// expected snippets are present while the "myFunction" and "Some comments"
/// strings are absent.
#[test]
#[cfg(feature = "markdown")]
fn test_extract_from_test4_html() {
    let raw_html =
        fs::read_to_string("html/test_4.html").expect("Unable to read test_4.html");
    let page = Html::parse_document(&raw_html);

    let mut tree = DensityTree::from_document(&page).unwrap();
    tree.calculate_density_sum().unwrap();
    let markdown = extract_content_as_markdown(&tree, &page).unwrap();

    // Echo the output so it is visible when the test is run with --nocapture.
    println!("test4 markdown: '{}'", markdown);

    // Something must have been extracted at all.
    assert!(!markdown.is_empty(), "Markdown should not be empty");

    // Snippets expected in the extracted content.
    assert!(markdown.contains("Lorem ipsum"));
    assert!(markdown.contains("long paragraph"));
    assert!(markdown.contains("wikipedia"));

    // Strings that must have been filtered out of the extracted content.
    assert!(!markdown.contains("myFunction"));
    assert!(!markdown.contains("Some comments"));
}

/// A page whose body holds nothing but a `<script>` element must yield an
/// empty markdown string.
#[test]
#[cfg(feature = "markdown")]
fn test_empty_content_returns_empty_markdown() {
    // The raw string is kept flush-left so its bytes match the input exactly.
    let script_only_page = r#"
<html>
<body>
<script>console.log("empty")</script>
</body>
</html>
"#;

    let page = Html::parse_document(script_only_page);
    let mut tree = DensityTree::from_document(&page).unwrap();
    tree.calculate_density_sum().unwrap();
    let markdown = extract_content_as_markdown(&tree, &page).unwrap();

    // Echo the output so it is visible when the test is run with --nocapture.
    println!("empty content markdown: '{}'", markdown);

    // Nothing should have been extracted from this page.
    assert!(
        markdown.is_empty(),
        "Expected empty markdown for content-less HTML, got: '{}'",
        markdown
    );
}
}
Loading