Skip to content

Commit c9ccbcd

Browse files
carles-abarca and claude
committed
feat(chunking): Improve RAG chunking with embedded tokenizer and table support
v1.0.3 release with major chunking improvements: Embedded Tokenizer: - Bundle sentence-transformers/all-MiniLM-L6-v2 tokenizer in binary - Add HuggingFaceTokenizer::default_embedded() for zero-config usage - Fix truncation issue that was limiting token counts to 128 Chunking Defaults: - Change default strategy from hierarchical to hybrid - Reduce default max_tokens from 256 to 128 for better RAG granularity - Set merge threshold to 75% of max_tokens Table Chunking (BUG FIX): - Fix CSV/XLSX producing 0 chunks - Add rows_as_chunks() method to serialize table rows - Format: "Name, Column = Value. Name, Column2 = Value2." Markdown Output: - Add structured TableData with to_markdown() formatting - Improve table rendering with proper column alignment CLI Updates: - Rename --chunk-size to --chunk-max-tokens - Add --tokenizer flag for custom HuggingFace models - Update help text with new defaults 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 0c30861 commit c9ccbcd

File tree

20 files changed

+459
-202
lines changed

20 files changed

+459
-202
lines changed

Cargo.toml

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ members = [
1414
resolver = "2"
1515

1616
[workspace.package]
17-
version = "1.0.0"
17+
version = "1.0.3"
1818
edition = "2021"
1919
rust-version = "1.75"
2020
authors = ["Docling-rs Contributors"]
@@ -24,14 +24,14 @@ description = "Native Rust document processing library"
2424

2525
[workspace.dependencies]
2626
# Internal crates
27-
docling-rs-core = { version = "1.0.0", path = "crates/docling-rs-core" }
28-
docling-rs-markdown = { version = "1.0.0", path = "crates/docling-rs-formats/markdown" }
29-
docling-rs-html = { version = "1.0.0", path = "crates/docling-rs-formats/html" }
30-
docling-rs-csv = { version = "1.0.0", path = "crates/docling-rs-formats/csv" }
31-
docling-rs-docx = { version = "1.0.0", path = "crates/docling-rs-formats/docx" }
32-
docling-rs-xlsx = { version = "1.0.0", path = "crates/docling-rs-formats/xlsx" }
33-
docling-rs-pptx = { version = "1.0.0", path = "crates/docling-rs-formats/pptx" }
34-
docling-rs-pdf = { version = "1.0.0", path = "crates/docling-rs-formats/pdf" }
27+
docling-rs-core = { version = "1.0.3", path = "crates/docling-rs-core" }
28+
docling-rs-markdown = { version = "1.0.3", path = "crates/docling-rs-formats/markdown" }
29+
docling-rs-html = { version = "1.0.3", path = "crates/docling-rs-formats/html" }
30+
docling-rs-csv = { version = "1.0.3", path = "crates/docling-rs-formats/csv" }
31+
docling-rs-docx = { version = "1.0.3", path = "crates/docling-rs-formats/docx" }
32+
docling-rs-xlsx = { version = "1.0.3", path = "crates/docling-rs-formats/xlsx" }
33+
docling-rs-pptx = { version = "1.0.3", path = "crates/docling-rs-formats/pptx" }
34+
docling-rs-pdf = { version = "1.0.3", path = "crates/docling-rs-formats/pdf" }
3535

3636
# Core dependencies shared across crates
3737
serde = { version = "1.0", features = ["derive"] }

README.md

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ The original [Docling](https://github.com/DS4SD/docling) by IBM is an excellent
2727

2828
## Status
2929

30-
**v1.0.1** - Production-ready with 7 format backends (all enabled by default)
30+
**v1.0.3** - Production-ready with 7 format backends (all enabled by default)
3131

3232
| Component | Status |
3333
|-----------|--------|
@@ -90,8 +90,8 @@ docling-rs document.pdf --to markdown --output-dir ./output
9090
# Batch convert a directory
9191
docling-rs ./documents/ --to json --output-dir ./converted
9292

93-
# Enable chunking for RAG
94-
docling-rs document.pdf --chunk --chunk-size 512 --to json
93+
# Enable chunking for RAG (uses embedded all-MiniLM-L6-v2 tokenizer)
94+
docling-rs document.pdf --chunk --to json
9595

9696
# Filter by input format
9797
docling-rs ./docs/ --from pdf,docx --to markdown
@@ -148,29 +148,36 @@ let result = converter.convert_bytes(
148148

149149
## Document Chunking
150150

151-
Intelligent chunking for RAG and embedding applications:
151+
Intelligent chunking for RAG and embedding applications with **embedded tokenizer** (`sentence-transformers/all-MiniLM-L6-v2`):
152152

153153
```rust
154-
use docling_rs::{DocumentConverter, HierarchicalChunker};
154+
use docling_rs::{DocumentConverter, chunking::{HybridChunker, HuggingFaceTokenizer}};
155155

156156
let converter = DocumentConverter::new();
157157
let result = converter.convert_file("document.pdf")?;
158158
let doc = result.document();
159159

160-
// Create hierarchical chunker
161-
let chunker = HierarchicalChunker::new()
162-
.with_max_chunk_size(512)
163-
.with_overlap(50);
160+
// Hybrid chunker with embedded tokenizer (recommended for RAG)
161+
let tokenizer = HuggingFaceTokenizer::default_embedded()?;
162+
let chunker = HybridChunker::builder()
163+
.tokenizer(Box::new(tokenizer))
164+
.max_tokens(128) // Default: 128, optimized for embeddings
165+
.merge_peers(true)
166+
.build()?;
164167

165168
// Generate chunks
166-
let chunks = chunker.chunk(doc)?;
167-
168-
for chunk in &chunks {
169-
println!("Chunk: {} chars", chunk.text().len());
170-
println!("Context: {:?}", chunk.metadata().headings());
169+
for chunk in chunker.chunk(&doc) {
170+
println!("Chunk: {} chars", chunk.text.len());
171171
}
172172
```
173173

174+
### Chunking Features (v1.0.3)
175+
176+
- **Embedded Tokenizer**: `all-MiniLM-L6-v2` tokenizer bundled in the binary
177+
- **Hybrid Strategy Default**: Token-aware chunking optimized for RAG
178+
- **Table Chunking**: CSV/XLSX tables are chunked row-by-row in `key=value` format
179+
- **Smart Merging**: Undersized chunks are merged while preserving semantic boundaries
180+
174181
## CLI Options
175182

176183
```
@@ -184,9 +191,10 @@ Options:
184191
-o, --output-dir <DIR> Output directory
185192
-f, --from <FORMATS> Filter input formats (comma-separated)
186193
--chunk Enable document chunking
187-
--chunk-strategy <STRAT> Chunking strategy: hierarchical, hybrid [default: hierarchical]
188-
--chunk-max-tokens <N> Max tokens per chunk (hybrid) [default: 512]
189-
--chunk-merge-peers Merge undersized peer chunks (hybrid) [default: true]
194+
--chunk-strategy <STRAT> Chunking strategy: hierarchical, hybrid [default: hybrid]
195+
--chunk-max-tokens <N> Max tokens per chunk [default: 128]
196+
--chunk-merge-peers Merge undersized peer chunks [default: true]
197+
--tokenizer <MODEL> HuggingFace tokenizer model [default: embedded all-MiniLM-L6-v2]
190198
--continue-on-error Continue on errors (batch mode)
191199
--abort-on-error Stop on first error (batch mode)
192200
-v, --verbose Verbose output
@@ -198,14 +206,17 @@ Options:
198206
### Chunking Strategies
199207

200208
```bash
201-
# Hierarchical chunking (default) - preserves document structure
209+
# Hybrid chunking (default) - token-aware with embedded tokenizer, ideal for RAG
202210
docling-rs document.pdf --chunk --to json
203211

204-
# Hybrid chunking - token-aware, ideal for embeddings
205-
docling-rs document.pdf --chunk --chunk-strategy hybrid --chunk-max-tokens 512 --to json
212+
# Custom max tokens
213+
docling-rs document.pdf --chunk --chunk-max-tokens 256 --to json
214+
215+
# Hierarchical chunking - preserves document structure
216+
docling-rs document.pdf --chunk --chunk-strategy hierarchical --to json
206217

207-
# Hybrid without merging small chunks
208-
docling-rs document.pdf --chunk --chunk-strategy hybrid --chunk-merge-peers false --to json
218+
# Disable chunk merging for more granular output
219+
docling-rs document.pdf --chunk --chunk-merge-peers false --to json
209220
```
210221

211222
## Architecture

crates/docling-rs-cli/src/cli/args.rs

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,21 +58,27 @@ pub struct CliArgs {
5858
pub chunk: bool,
5959

6060
/// Chunking strategy: hierarchical (structure-based) or hybrid (token-aware)
61+
/// Default is hybrid which uses the embedded all-MiniLM-L6-v2 tokenizer
6162
#[arg(
6263
long = "chunk-strategy",
6364
value_name = "STRATEGY",
64-
default_value = "hierarchical"
65+
default_value = "hybrid"
6566
)]
6667
pub chunk_strategy: ChunkStrategy,
6768

68-
/// Maximum tokens per chunk (for hybrid strategy, default: 512)
69-
#[arg(long = "chunk-max-tokens", value_name = "TOKENS", default_value = "512", value_parser = validate_chunk_size)]
69+
/// Maximum tokens per chunk (default: 128, optimized for RAG granularity)
70+
#[arg(long = "chunk-max-tokens", value_name = "TOKENS", default_value = "128", value_parser = validate_chunk_size)]
7071
pub chunk_max_tokens: usize,
7172

7273
/// Merge undersized peer chunks (for hybrid strategy)
7374
#[arg(long = "chunk-merge-peers", default_value = "true")]
7475
pub chunk_merge_peers: bool,
7576

77+
/// HuggingFace tokenizer model (default: embedded all-MiniLM-L6-v2)
78+
/// Specify a different model if needed, requires tokenizer.json in cache
79+
#[arg(long = "tokenizer", value_name = "MODEL")]
80+
pub tokenizer_model: Option<String>,
81+
7682
/// Continue processing on error (batch mode)
7783
#[arg(long = "continue-on-error")]
7884
pub continue_on_error: bool,
@@ -154,9 +160,9 @@ pub enum OutputFormat {
154160
#[derive(Debug, Clone, Copy, PartialEq, Eq, ValueEnum, Default)]
155161
pub enum ChunkStrategy {
156162
/// Structure-based chunking (preserves document hierarchy)
157-
#[default]
158163
Hierarchical,
159-
/// Token-aware chunking (respects token limits for embeddings)
164+
/// Token-aware chunking with embedded all-MiniLM-L6-v2 tokenizer (recommended for RAG)
165+
#[default]
160166
Hybrid,
161167
}
162168

crates/docling-rs-cli/src/cli/converter.rs

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use crate::cli::args::{ChunkStrategy, CliArgs, InputFormat, OutputFormat};
44
use crate::cli::output;
55
use anyhow::{Context, Result};
66
use docling_rs::chunking::{
7-
BaseChunk, BaseChunker, HierarchicalChunker, HybridChunker, SimpleTokenizer,
7+
BaseChunk, BaseChunker, HierarchicalChunker, HuggingFaceTokenizer, HybridChunker, Tokenizer,
88
};
99
use docling_rs::DocumentConverter;
1010
use std::fs;
@@ -359,9 +359,29 @@ impl Converter {
359359
chunker.chunk(doc).collect()
360360
}
361361
ChunkStrategy::Hybrid => {
362-
let tokenizer = SimpleTokenizer::with_max_tokens(self.args.chunk_max_tokens);
362+
// Create tokenizer: custom model if specified, otherwise embedded all-MiniLM-L6-v2
363+
let tokenizer: Box<dyn Tokenizer> =
364+
if let Some(ref model) = self.args.tokenizer_model {
365+
Box::new(
366+
HuggingFaceTokenizer::from_pretrained(model).with_context(|| {
367+
format!(
368+
"Failed to load HuggingFace tokenizer '{}'. \
369+
Please download tokenizer.json from https://huggingface.co/{}/tree/main",
370+
model, model
371+
)
372+
})?,
373+
)
374+
} else {
375+
// Use embedded all-MiniLM-L6-v2 tokenizer (default for RAG)
376+
Box::new(
377+
HuggingFaceTokenizer::default_embedded()
378+
.context("Failed to load embedded tokenizer")?
379+
.with_max_tokens(self.args.chunk_max_tokens),
380+
)
381+
};
382+
363383
let chunker = HybridChunker::builder()
364-
.tokenizer(Box::new(tokenizer))
384+
.tokenizer(tokenizer)
365385
.max_tokens(self.args.chunk_max_tokens)
366386
.merge_peers(self.args.chunk_merge_peers)
367387
.build()
Lines changed: 6 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,21 @@
11
//! Output file generation (markdown, JSON, text).
2+
//!
3+
//! Re-exports output functions from docling-rs for CLI use.
24
35
use anyhow::Result;
4-
use docling_rs::{DoclingDocument, NodeType};
6+
use docling_rs::DoclingDocument;
57

68
/// Convert document to Markdown format
79
pub fn to_markdown(doc: &DoclingDocument) -> String {
8-
let mut output = String::new();
9-
10-
// Title
11-
output.push_str(&format!("# {}\n\n", doc.name()));
12-
13-
// Content (iterate through document nodes)
14-
for node in doc.nodes() {
15-
let text = node.text_content().unwrap_or("");
16-
match node.node_type() {
17-
NodeType::Heading => {
18-
output.push_str(&format!("## {}\n\n", text));
19-
}
20-
NodeType::Paragraph | NodeType::Text => {
21-
output.push_str(&format!("{}\n\n", text));
22-
}
23-
NodeType::Table => {
24-
output.push_str("(Table content)\n\n");
25-
}
26-
_ => {
27-
output.push_str(&format!("{}\n\n", text));
28-
}
29-
}
30-
}
31-
32-
output
10+
docling_rs::output::to_markdown(doc)
3311
}
3412

3513
/// Convert document to JSON format
3614
pub fn to_json(doc: &DoclingDocument) -> Result<String> {
37-
Ok(serde_json::to_string_pretty(doc)?)
15+
Ok(docling_rs::output::to_json(doc)?)
3816
}
3917

4018
/// Convert document to plain text format
4119
pub fn to_text(doc: &DoclingDocument) -> String {
42-
let mut output = String::new();
43-
44-
// Title
45-
output.push_str(&format!("{}\n\n", doc.name()));
46-
47-
// Extract all text from nodes
48-
for node in doc.nodes() {
49-
if let Some(text) = node.text_content() {
50-
if !text.is_empty() {
51-
output.push_str(&format!("{}\n\n", text));
52-
}
53-
}
54-
}
55-
56-
output
20+
docling_rs::output::to_text(doc)
5721
}

crates/docling-rs-cli/tests/contract_cli.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@ fn test_ct010_enrichment_options() {
192192
let mut cmd = Command::cargo_bin("docling-rs").unwrap();
193193
cmd.arg(&input)
194194
.arg("--chunk")
195-
.arg("--chunk-size")
195+
.arg("--chunk-max-tokens")
196196
.arg("1000")
197197
.arg("--output-dir")
198198
.arg(&output_dir)

crates/docling-rs-cli/tests/integration_cli_errors.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,19 +64,19 @@ fn test_invalid_output_format() {
6464
}
6565

6666
#[test]
67-
fn test_invalid_chunk_size() {
67+
fn test_invalid_chunk_max_tokens() {
6868
let temp = TempDir::new().unwrap();
6969
let input = temp.path().join("test.md");
7070
fs::write(&input, "# Test").unwrap();
7171

7272
let mut cmd = Command::cargo_bin("docling-rs").unwrap();
7373
cmd.arg(&input)
74-
.arg("--chunk-size")
74+
.arg("--chunk-max-tokens")
7575
.arg("0")
7676
.assert()
7777
.failure()
7878
.code(1)
79-
.stderr(predicate::str::contains("invalid").or(predicate::str::contains("must be")));
79+
.stderr(predicate::str::contains("invalid").or(predicate::str::contains("must be").or(predicate::str::contains("greater than"))));
8080
}
8181

8282
#[test]

crates/docling-rs-core/assets/tokenizer-all-MiniLM-L6-v2.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

0 commit comments

Comments (0)