Skip to content

Commit c9ccbcd

Browse files
carles-abarca and claude
committed
feat(chunking): Improve RAG chunking with embedded tokenizer and table support
v1.0.3 release with major chunking improvements: Embedded Tokenizer: - Bundle sentence-transformers/all-MiniLM-L6-v2 tokenizer in binary - Add HuggingFaceTokenizer::default_embedded() for zero-config usage - Fix truncation issue that was limiting token counts to 128 Chunking Defaults: - Change default strategy from hierarchical to hybrid - Reduce default max_tokens from 256 to 128 for better RAG granularity - Set merge threshold to 75% of max_tokens Table Chunking (BUG FIX): - Fix CSV/XLSX producing 0 chunks - Add rows_as_chunks() method to serialize table rows - Format: "Name, Column = Value. Name, Column2 = Value2." Markdown Output: - Add structured TableData with to_markdown() formatting - Improve table rendering with proper column alignment CLI Updates: - Rename --chunk-size to --chunk-max-tokens - Add --tokenizer flag for custom HuggingFace models - Update help text with new defaults 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 0c30861 commit c9ccbcd

File tree

20 files changed

+459
-202
lines changed

20 files changed

+459
-202
lines changed

Cargo.toml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ members = [
1414
resolver = "2"
1515

1616
[workspace.package]
17-
version = "1.0.0"
17+
version = "1.0.3"
1818
edition = "2021"
1919
rust-version = "1.75"
2020
authors = ["Docling-rs Contributors"]
@@ -24,14 +24,14 @@ description = "Native Rust document processing library"
2424

2525
[workspace.dependencies]
2626
# Internal crates
27-
docling-rs-core = { version = "1.0.0", path = "crates/docling-rs-core" }
28-
docling-rs-markdown = { version = "1.0.0", path = "crates/docling-rs-formats/markdown" }
29-
docling-rs-html = { version = "1.0.0", path = "crates/docling-rs-formats/html" }
30-
docling-rs-csv = { version = "1.0.0", path = "crates/docling-rs-formats/csv" }
31-
docling-rs-docx = { version = "1.0.0", path = "crates/docling-rs-formats/docx" }
32-
docling-rs-xlsx = { version = "1.0.0", path = "crates/docling-rs-formats/xlsx" }
33-
docling-rs-pptx = { version = "1.0.0", path = "crates/docling-rs-formats/pptx" }
34-
docling-rs-pdf = { version = "1.0.0", path = "crates/docling-rs-formats/pdf" }
27+
docling-rs-core = { version = "1.0.3", path = "crates/docling-rs-core" }
28+
docling-rs-markdown = { version = "1.0.3", path = "crates/docling-rs-formats/markdown" }
29+
docling-rs-html = { version = "1.0.3", path = "crates/docling-rs-formats/html" }
30+
docling-rs-csv = { version = "1.0.3", path = "crates/docling-rs-formats/csv" }
31+
docling-rs-docx = { version = "1.0.3", path = "crates/docling-rs-formats/docx" }
32+
docling-rs-xlsx = { version = "1.0.3", path = "crates/docling-rs-formats/xlsx" }
33+
docling-rs-pptx = { version = "1.0.3", path = "crates/docling-rs-formats/pptx" }
34+
docling-rs-pdf = { version = "1.0.3", path = "crates/docling-rs-formats/pdf" }
3535

3636
# Core dependencies shared across crates
3737
serde = { version = "1.0", features = ["derive"] }

README.md

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ The original [Docling](https://github.com/DS4SD/docling) by IBM is an excellent
2727

2828
## Status
2929

30-
**v1.0.1** - Production-ready with 7 format backends (all enabled by default)
30+
**v1.0.3** - Production-ready with 7 format backends (all enabled by default)
3131

3232
| Component | Status |
3333
|-----------|--------|
@@ -90,8 +90,8 @@ docling-rs document.pdf --to markdown --output-dir ./output
9090
# Batch convert a directory
9191
docling-rs ./documents/ --to json --output-dir ./converted
9292

93-
# Enable chunking for RAG
94-
docling-rs document.pdf --chunk --chunk-size 512 --to json
93+
# Enable chunking for RAG (uses embedded all-MiniLM-L6-v2 tokenizer)
94+
docling-rs document.pdf --chunk --to json
9595

9696
# Filter by input format
9797
docling-rs ./docs/ --from pdf,docx --to markdown
@@ -148,29 +148,36 @@ let result = converter.convert_bytes(
148148

149149
## Document Chunking
150150

151-
Intelligent chunking for RAG and embedding applications:
151+
Intelligent chunking for RAG and embedding applications with **embedded tokenizer** (`sentence-transformers/all-MiniLM-L6-v2`):
152152

153153
```rust
154-
use docling_rs::{DocumentConverter, HierarchicalChunker};
154+
use docling_rs::{DocumentConverter, chunking::{HybridChunker, HuggingFaceTokenizer}};
155155

156156
let converter = DocumentConverter::new();
157157
let result = converter.convert_file("document.pdf")?;
158158
let doc = result.document();
159159

160-
// Create hierarchical chunker
161-
let chunker = HierarchicalChunker::new()
162-
.with_max_chunk_size(512)
163-
.with_overlap(50);
160+
// Hybrid chunker with embedded tokenizer (recommended for RAG)
161+
let tokenizer = HuggingFaceTokenizer::default_embedded()?;
162+
let chunker = HybridChunker::builder()
163+
.tokenizer(Box::new(tokenizer))
164+
.max_tokens(128) // Default: 128, optimized for embeddings
165+
.merge_peers(true)
166+
.build()?;
164167

165168
// Generate chunks
166-
let chunks = chunker.chunk(doc)?;
167-
168-
for chunk in &chunks {
169-
println!("Chunk: {} chars", chunk.text().len());
170-
println!("Context: {:?}", chunk.metadata().headings());
169+
for chunk in chunker.chunk(&doc) {
170+
println!("Chunk: {} chars", chunk.text.len());
171171
}
172172
```
173173

174+
### Chunking Features (v1.0.3)
175+
176+
- **Embedded Tokenizer**: `all-MiniLM-L6-v2` tokenizer bundled in the binary
177+
- **Hybrid Strategy Default**: Token-aware chunking optimized for RAG
178+
- **Table Chunking**: CSV/XLSX tables are chunked row-by-row in `key=value` format
179+
- **Smart Merging**: Undersized chunks are merged while preserving semantic boundaries
180+
174181
## CLI Options
175182

176183
```
@@ -184,9 +191,10 @@ Options:
184191
-o, --output-dir <DIR> Output directory
185192
-f, --from <FORMATS> Filter input formats (comma-separated)
186193
--chunk Enable document chunking
187-
--chunk-strategy <STRAT> Chunking strategy: hierarchical, hybrid [default: hierarchical]
188-
--chunk-max-tokens <N> Max tokens per chunk (hybrid) [default: 512]
189-
--chunk-merge-peers Merge undersized peer chunks (hybrid) [default: true]
194+
--chunk-strategy <STRAT> Chunking strategy: hierarchical, hybrid [default: hybrid]
195+
--chunk-max-tokens <N> Max tokens per chunk [default: 128]
196+
--chunk-merge-peers Merge undersized peer chunks [default: true]
197+
--tokenizer <MODEL> HuggingFace tokenizer model [default: embedded all-MiniLM-L6-v2]
190198
--continue-on-error Continue on errors (batch mode)
191199
--abort-on-error Stop on first error (batch mode)
192200
-v, --verbose Verbose output
@@ -198,14 +206,17 @@ Options:
198206
### Chunking Strategies
199207

200208
```bash
201-
# Hierarchical chunking (default) - preserves document structure
209+
# Hybrid chunking (default) - token-aware with embedded tokenizer, ideal for RAG
202210
docling-rs document.pdf --chunk --to json
203211

204-
# Hybrid chunking - token-aware, ideal for embeddings
205-
docling-rs document.pdf --chunk --chunk-strategy hybrid --chunk-max-tokens 512 --to json
212+
# Custom max tokens
213+
docling-rs document.pdf --chunk --chunk-max-tokens 256 --to json
214+
215+
# Hierarchical chunking - preserves document structure
216+
docling-rs document.pdf --chunk --chunk-strategy hierarchical --to json
206217

207-
# Hybrid without merging small chunks
208-
docling-rs document.pdf --chunk --chunk-strategy hybrid --chunk-merge-peers false --to json
218+
# Disable chunk merging for more granular output
219+
docling-rs document.pdf --chunk --chunk-merge-peers false --to json
209220
```
210221

211222
## Architecture

crates/docling-rs-cli/src/cli/args.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,27 @@ pub struct CliArgs {
5858
pub chunk: bool,
5959

6060
/// Chunking strategy: hierarchical (structure-based) or hybrid (token-aware)
61+
/// Default is hybrid which uses the embedded all-MiniLM-L6-v2 tokenizer
6162
#[arg(
6263
long = "chunk-strategy",
6364
value_name = "STRATEGY",
64-
default_value = "hierarchical"
65+
default_value = "hybrid"
6566
)]
6667
pub chunk_strategy: ChunkStrategy,
6768

68-
/// Maximum tokens per chunk (for hybrid strategy, default: 512)
69-
#[arg(long = "chunk-max-tokens", value_name = "TOKENS", default_value = "512", value_parser = validate_chunk_size)]
69+
/// Maximum tokens per chunk (default: 128, optimized for RAG granularity)
70+
#[arg(long = "chunk-max-tokens", value_name = "TOKENS", default_value = "128", value_parser = validate_chunk_size)]
7071
pub chunk_max_tokens: usize,
7172

7273
/// Merge undersized peer chunks (for hybrid strategy)
7374
#[arg(long = "chunk-merge-peers", default_value = "true")]
7475
pub chunk_merge_peers: bool,
7576

77+
/// HuggingFace tokenizer model (default: embedded all-MiniLM-L6-v2)
78+
/// Specify a different model if needed, requires tokenizer.json in cache
79+
#[arg(long = "tokenizer", value_name = "MODEL")]
80+
pub tokenizer_model: Option<String>,
81+
7682
/// Continue processing on error (batch mode)
7783
#[arg(long = "continue-on-error")]
7884
pub continue_on_error: bool,
@@ -154,9 +160,9 @@ pub enum OutputFormat {
154160
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
155161
pub enum ChunkStrategy {
156162
/// Structure-based chunking (preserves document hierarchy)
157-
#[default]
158163
Hierarchical,
159-
/// Token-aware chunking (respects token limits for embeddings)
164+
/// Token-aware chunking with embedded all-MiniLM-L6-v2 tokenizer (recommended for RAG)
165+
#[default]
160166
Hybrid,
161167
}
162168

crates/docling-rs-cli/src/cli/converter.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use crate::cli::args::{ChunkStrategy, CliArgs, InputFormat, OutputFormat};
44
use crate::cli::output;
55
use anyhow::{Context, Result};
66
use docling_rs::chunking::{
7-
BaseChunk, BaseChunker, HierarchicalChunker, HybridChunker, SimpleTokenizer,
7+
BaseChunk, BaseChunker, HierarchicalChunker, HuggingFaceTokenizer, HybridChunker, Tokenizer,
88
};
99
use docling_rs::DocumentConverter;
1010
use std::fs;
@@ -359,9 +359,29 @@ impl Converter {
359359
chunker.chunk(doc).collect()
360360
}
361361
ChunkStrategy::Hybrid => {
362-
let tokenizer = SimpleTokenizer::with_max_tokens(self.args.chunk_max_tokens);
362+
// Create tokenizer: custom model if specified, otherwise embedded all-MiniLM-L6-v2
363+
let tokenizer: Box<dyn Tokenizer> =
364+
if let Some(ref model) = self.args.tokenizer_model {
365+
Box::new(
366+
HuggingFaceTokenizer::from_pretrained(model).with_context(|| {
367+
format!(
368+
"Failed to load HuggingFace tokenizer '{}'. \
369+
Please download tokenizer.json from https://huggingface.co/{}/tree/main",
370+
model, model
371+
)
372+
})?,
373+
)
374+
} else {
375+
// Use embedded all-MiniLM-L6-v2 tokenizer (default for RAG)
376+
Box::new(
377+
HuggingFaceTokenizer::default_embedded()
378+
.context("Failed to load embedded tokenizer")?
379+
.with_max_tokens(self.args.chunk_max_tokens),
380+
)
381+
};
382+
363383
let chunker = HybridChunker::builder()
364-
.tokenizer(Box::new(tokenizer))
384+
.tokenizer(tokenizer)
365385
.max_tokens(self.args.chunk_max_tokens)
366386
.merge_peers(self.args.chunk_merge_peers)
367387
.build()
Lines changed: 6 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,21 @@
11
//! Output file generation (markdown, JSON, text).
2+
//!
3+
//! Re-exports output functions from docling-rs for CLI use.
24
35
use anyhow::Result;
4-
use docling_rs::{DoclingDocument, NodeType};
6+
use docling_rs::DoclingDocument;
57

68
/// Convert document to Markdown format
79
pub fn to_markdown(doc: &DoclingDocument) -> String {
8-
let mut output = String::new();
9-
10-
// Title
11-
output.push_str(&format!("# {}\n\n", doc.name()));
12-
13-
// Content (iterate through document nodes)
14-
for node in doc.nodes() {
15-
let text = node.text_content().unwrap_or("");
16-
match node.node_type() {
17-
NodeType::Heading => {
18-
output.push_str(&format!("## {}\n\n", text));
19-
}
20-
NodeType::Paragraph | NodeType::Text => {
21-
output.push_str(&format!("{}\n\n", text));
22-
}
23-
NodeType::Table => {
24-
output.push_str("(Table content)\n\n");
25-
}
26-
_ => {
27-
output.push_str(&format!("{}\n\n", text));
28-
}
29-
}
30-
}
31-
32-
output
10+
docling_rs::output::to_markdown(doc)
3311
}
3412

3513
/// Convert document to JSON format
3614
pub fn to_json(doc: &DoclingDocument) -> Result<String> {
37-
Ok(serde_json::to_string_pretty(doc)?)
15+
Ok(docling_rs::output::to_json(doc)?)
3816
}
3917

4018
/// Convert document to plain text format
4119
pub fn to_text(doc: &DoclingDocument) -> String {
42-
let mut output = String::new();
43-
44-
// Title
45-
output.push_str(&format!("{}\n\n", doc.name()));
46-
47-
// Extract all text from nodes
48-
for node in doc.nodes() {
49-
if let Some(text) = node.text_content() {
50-
if !text.is_empty() {
51-
output.push_str(&format!("{}\n\n", text));
52-
}
53-
}
54-
}
55-
56-
output
20+
docling_rs::output::to_text(doc)
5721
}

crates/docling-rs-cli/tests/contract_cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ fn test_ct010_enrichment_options() {
192192
let mut cmd = Command::cargo_bin("docling-rs").unwrap();
193193
cmd.arg(&input)
194194
.arg("--chunk")
195-
.arg("--chunk-size")
195+
.arg("--chunk-max-tokens")
196196
.arg("1000")
197197
.arg("--output-dir")
198198
.arg(&output_dir)

crates/docling-rs-cli/tests/integration_cli_errors.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,19 @@ fn test_invalid_output_format() {
6464
}
6565

6666
#[test]
67-
fn test_invalid_chunk_size() {
67+
fn test_invalid_chunk_max_tokens() {
6868
let temp = TempDir::new().unwrap();
6969
let input = temp.path().join("test.md");
7070
fs::write(&input, "# Test").unwrap();
7171

7272
let mut cmd = Command::cargo_bin("docling-rs").unwrap();
7373
cmd.arg(&input)
74-
.arg("--chunk-size")
74+
.arg("--chunk-max-tokens")
7575
.arg("0")
7676
.assert()
7777
.failure()
7878
.code(1)
79-
.stderr(predicate::str::contains("invalid").or(predicate::str::contains("must be")));
79+
.stderr(predicate::str::contains("invalid").or(predicate::str::contains("must be").or(predicate::str::contains("greater than"))));
8080
}
8181

8282
#[test]

crates/docling-rs-core/assets/tokenizer-all-MiniLM-L6-v2.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments (0)