Skip to content

Commit 42997dc

Browse files
authored
Merge pull request #386 from kreuzberg-dev/fix-rust-docs
Align core Rust API docs and snippets
2 parents 2e57ec7 + 8304f52 commit 42997dc

40 files changed

+3008
-1342
lines changed

crates/kreuzberg/src/core/config/extraction/core.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,10 @@ pub struct ExtractionConfig {
8787
#[serde(default)]
8888
pub html_options: Option<html_to_markdown_rs::ConversionOptions>,
8989

90-
/// Maximum concurrent extractions in batch operations (None = num_cpus * 2).
90+
/// Maximum concurrent extractions in batch operations (None = `(num_cpus * 1.5).ceil()`).
9191
///
9292
/// Limits parallelism to prevent resource exhaustion when processing
93-
/// large batches. Defaults to twice the number of CPU cores.
93+
/// large batches. Defaults to `(num_cpus * 1.5).ceil()` when not set.
9494
#[serde(default)]
9595
pub max_concurrent_extractions: Option<usize>,
9696

crates/kreuzberg/src/core/extractor/batch.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ use super::file::extract_file;
1919
/// This function processes multiple files in parallel, automatically managing
2020
/// concurrency to prevent resource exhaustion. The concurrency limit can be
2121
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
22-
/// to `num_cpus * 2`.
22+
/// to `(num_cpus * 1.5).ceil()`.
2323
///
2424
/// # Arguments
2525
///
@@ -153,7 +153,7 @@ pub async fn batch_extract_file(
153153
/// This function processes multiple byte arrays in parallel, automatically managing
154154
/// concurrency to prevent resource exhaustion. The concurrency limit can be
155155
/// configured via `ExtractionConfig::max_concurrent_extractions` or defaults
156-
/// to `num_cpus * 2`.
156+
/// to `(num_cpus * 1.5).ceil()`.
157157
///
158158
/// # Arguments
159159
///

crates/kreuzberg/src/plugins/mod.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
//! # -> kreuzberg::Result<ExtractionResult> {
4242
//! # Ok(ExtractionResult {
4343
//! # content: String::new(),
44-
//! # mime_type: String::new().into(),
44+
//! # mime_type: std::borrow::Cow::Borrowed("text/plain"),
4545
//! # metadata: Metadata::default(),
4646
//! # tables: vec![],
4747
//! # detected_languages: None,
@@ -130,7 +130,7 @@
130130
//!
131131
//! Ok(ExtractionResult {
132132
//! content: extracted_text,
133-
//! mime_type: "application/json".to_string().into(),
133+
//! mime_type: std::borrow::Cow::Borrowed("application/json"),
134134
//! metadata,
135135
//! tables: vec![],
136136
//! detected_languages: None,

crates/kreuzberg/src/plugins/ocr.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@ pub enum OcrBackendType {
4343
/// use kreuzberg::plugins::{Plugin, OcrBackend, OcrBackendType};
4444
/// use kreuzberg::{Result, OcrConfig};
4545
/// use async_trait::async_trait;
46-
/// use std::path::Path;
4746
/// use std::borrow::Cow;
47+
/// use std::path::Path;
4848
/// use kreuzberg::types::{ExtractionResult, Metadata};
4949
///
5050
/// struct CustomOcrBackend;
@@ -119,8 +119,8 @@ pub trait OcrBackend: Plugin {
119119
/// # use kreuzberg::plugins::{Plugin, OcrBackend};
120120
/// # use kreuzberg::{Result, OcrConfig};
121121
/// # use async_trait::async_trait;
122-
/// # use std::path::Path;
123122
/// # use std::borrow::Cow;
123+
/// # use std::path::Path;
124124
/// # use kreuzberg::types::{ExtractionResult, Metadata};
125125
/// # struct MyOcr;
126126
/// # impl Plugin for MyOcr {
@@ -310,9 +310,9 @@ pub trait OcrBackend: Plugin {
310310
/// use kreuzberg::{Result, OcrConfig};
311311
/// use kreuzberg::types::{ExtractionResult, Metadata};
312312
/// use async_trait::async_trait;
313+
/// use std::borrow::Cow;
313314
/// use std::sync::Arc;
314315
/// use std::path::Path;
315-
/// use std::borrow::Cow;
316316
///
317317
/// struct CustomOcr;
318318
///

crates/kreuzberg/src/types/extraction.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ pub struct ExtractionResult {
5050
#[serde(skip_serializing_if = "Option::is_none")]
5151
pub pages: Option<Vec<PageContent>>,
5252

53-
/// Semantic elements when element-based output format is enabled.
53+
/// Semantic elements when element-based result format is enabled.
5454
///
55-
/// When output_format is set to ElementBased, this field contains semantic
55+
/// When result_format is set to ElementBased, this field contains semantic
5656
/// elements with type classification, unique identifiers, and metadata for
5757
/// Unstructured-compatible element-based processing.
5858
#[serde(skip_serializing_if = "Option::is_none", default)]

docs/cli/usage.md

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ The Kreuzberg CLI provides command-line access to all extraction features. This
2525
--8<-- "snippets/cli/install_go_sdk.md"
2626

2727
!!! info "Feature Availability"
28-
**Homebrew Installation:**
28+
**Homebrew Installation:**
2929

3030
- ✅ Text extraction (PDF, Office, images, 75+ formats)
3131
- ✅ OCR with Tesseract
@@ -165,22 +165,37 @@ Configure OCR backend, language, and Tesseract options in your config file (see
165165

166166
### Using Config Files
167167

168-
Kreuzberg automatically discovers configuration files by searching the current directory and parent directories for:
169-
170-
1. `./kreuzberg.{toml,yaml,yml,json}` in the current directory
171-
2. `../kreuzberg.{toml,yaml,yml,json}` in the parent directory (and so on, up the directory tree)
168+
Kreuzberg automatically discovers a configuration file by searching the current directory and parent directories for **`kreuzberg.toml`** only. If you use YAML or JSON, specify the file explicitly with `--config`.
172169

173170
```bash title="Terminal"
174-
# Extract using discovered configuration
171+
# Extract using discovered configuration (finds kreuzberg.toml)
175172
kreuzberg extract document.pdf
176173
```
177174

178175
### Specify Config File
179176

177+
You can load TOML, YAML (`.yaml` or `.yml`), or JSON via `--config`:
178+
180179
```bash title="Terminal"
181180
kreuzberg extract document.pdf --config my-config.toml
181+
kreuzberg extract document.pdf --config kreuzberg.yaml
182+
kreuzberg extract document.pdf --config my-config.json
183+
```
184+
185+
### Inline JSON Config
186+
187+
Override or supply config without a file using inline JSON (merged after the config file, before individual flags):
188+
189+
```bash title="Terminal"
190+
# Inline JSON (applied after config file)
191+
kreuzberg extract document.pdf --config-json '{"ocr":{"backend":"tesseract"},"chunking":{"max_chars":1000}}'
192+
193+
# Base64-encoded JSON (useful in shells where quoting is awkward)
194+
kreuzberg extract document.pdf --config-json-base64 eyJvY3IiOnsiYmFja2VuZCI6InRlc3NlcmFjdCJ9fQ==
182195
```
183196

197+
Both `extract` and `batch` support `--config-json` and `--config-json-base64`.
198+
184199
### Example Config Files
185200

186201
**kreuzberg.toml:**
@@ -447,23 +462,25 @@ kreuzberg detect document.pdf
447462

448463
## Docker Usage
449464

465+
Use the CLI image `ghcr.io/kreuzberg-dev/kreuzberg-cli:latest` for command-line usage. The full image `ghcr.io/kreuzberg-dev/kreuzberg:latest` also includes the CLI.
466+
450467
### Basic Docker
451468

452469
```bash title="Terminal"
453470
# Extract document using Docker with mounted directory
454-
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
471+
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg-cli:latest \
455472
extract /data/document.pdf
456473

457474
# Extract and save output to host directory using shell redirection
458-
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
475+
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg-cli:latest \
459476
extract /data/document.pdf > output.txt
460477
```
461478

462479
### Docker with OCR
463480

464481
```bash title="Terminal"
465482
# Extract with OCR using Docker
466-
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
483+
docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg-cli:latest \
467484
extract /data/scanned.pdf --ocr true
468485
```
469486

@@ -472,11 +489,11 @@ docker run -v $(pwd):/data ghcr.io/kreuzberg-dev/kreuzberg:latest \
472489
**docker-compose.yaml:**
473490

474491
```yaml title="docker-compose.yaml"
475-
version: '3.8'
492+
version: "3.8"
476493

477494
services:
478495
kreuzberg:
479-
image: ghcr.io/kreuzberg-dev/kreuzberg:latest
496+
image: ghcr.io/kreuzberg-dev/kreuzberg-cli:latest
480497
volumes:
481498
- ./documents:/input
482499
command: extract /input/document.pdf --ocr true
@@ -558,8 +575,9 @@ The `serve` command starts a RESTful HTTP API server:
558575
# Start server on default host (127.0.0.1) and port (8000)
559576
kreuzberg serve
560577

561-
# Start server on specific host and port
578+
# Start server on specific host and port (-H / -p are short forms)
562579
kreuzberg serve --host 0.0.0.0 --port 8000
580+
kreuzberg serve -H 0.0.0.0 -p 8000
563581

564582
# Start server with custom configuration file
565583
kreuzberg serve --config kreuzberg.toml --host 0.0.0.0 --port 8000
@@ -568,6 +586,7 @@ kreuzberg serve --config kreuzberg.toml --host 0.0.0.0 --port 8000
568586
### Server Endpoints
569587

570588
The server provides the following endpoints:
589+
571590
- `POST /extract` - Extract text from uploaded files
572591
- `POST /batch` - Batch extract from multiple files
573592
- `GET /detect` - Detect MIME type of file
@@ -597,6 +616,7 @@ kreuzberg mcp --config kreuzberg.toml --transport stdio
597616
```
598617

599618
The MCP server provides tools for AI agents:
619+
600620
- `extract_file` - Extract text from a file path
601621
- `extract_bytes` - Extract text from base64-encoded bytes
602622
- `batch_extract` - Extract from multiple files
@@ -643,9 +663,12 @@ kreuzberg --help
643663
kreuzberg extract --help
644664
kreuzberg batch --help
645665
kreuzberg detect --help
666+
kreuzberg version --help
646667
kreuzberg serve --help
647668
kreuzberg mcp --help
648669
kreuzberg cache --help
670+
kreuzberg cache stats --help
671+
kreuzberg cache clear --help
649672
```
650673

651674
### Version Information

docs/concepts/mime-detection.md

Lines changed: 62 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,11 @@ let mime_type = EXT_TO_MIME.get(extension.as_str())
5959
Whether the MIME type was detected or explicitly provided, it must be supported:
6060

6161
```rust title="mime_detection.rs"
62-
pub fn validate_mime_type(mime_type: &str) -> Result<()> {
62+
pub fn validate_mime_type(mime_type: &str) -> Result<String> {
6363
if SUPPORTED_TYPES.contains(mime_type) {
64-
Ok(())
64+
Ok(mime_type.to_string())
6565
} else {
66-
Err(KreuzbergError::UnsupportedFormat {
67-
mime_type: mime_type.to_string(),
68-
})
66+
Err(KreuzbergError::UnsupportedFormat(mime_type.to_string()))
6967
}
7068
}
7169
```
@@ -76,80 +74,80 @@ Kreuzberg supports multiple file formats across many categories:
7674

7775
### Documents
7876

79-
| Extension | MIME Type |
80-
|-----------|-----------|
81-
| `.pdf` | `application/pdf` |
82-
| `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` |
83-
| `.doc` | `application/msword` |
84-
| `.odt` | `application/vnd.oasis.opendocument.text` |
85-
| `.rtf` | `application/rtf` |
77+
| Extension | MIME Type |
78+
| --------- | ------------------------------------------------------------------------- |
79+
| `.pdf` | `application/pdf` |
80+
| `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` |
81+
| `.doc` | `application/msword` |
82+
| `.odt` | `application/vnd.oasis.opendocument.text` |
83+
| `.rtf` | `application/rtf` |
8684

8785
### Spreadsheets
8886

89-
| Extension | MIME Type |
90-
|-----------|-----------|
91-
| `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` |
92-
| `.xls` | `application/vnd.ms-excel` |
93-
| `.xlsm` | `application/vnd.ms-excel.sheet.macroEnabled.12` |
94-
| `.xlsb` | `application/vnd.ms-excel.sheet.binary.macroEnabled.12` |
95-
| `.ods` | `application/vnd.oasis.opendocument.spreadsheet` |
96-
| `.csv` | `text/csv` |
97-
| `.tsv` | `text/tab-separated-values` |
87+
| Extension | MIME Type |
88+
| --------- | ------------------------------------------------------------------- |
89+
| `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` |
90+
| `.xls` | `application/vnd.ms-excel` |
91+
| `.xlsm` | `application/vnd.ms-excel.sheet.macroEnabled.12` |
92+
| `.xlsb` | `application/vnd.ms-excel.sheet.binary.macroEnabled.12` |
93+
| `.ods` | `application/vnd.oasis.opendocument.spreadsheet` |
94+
| `.csv` | `text/csv` |
95+
| `.tsv` | `text/tab-separated-values` |
9896

9997
### Presentations
10098

101-
| Extension | MIME Type |
102-
|-----------|-----------|
103-
| `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` |
104-
| `.ppt` | `application/vnd.ms-powerpoint` |
105-
| `.odp` | `application/vnd.oasis.opendocument.presentation` |
99+
| Extension | MIME Type |
100+
| --------- | --------------------------------------------------------------------------- |
101+
| `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` |
102+
| `.ppt` | `application/vnd.ms-powerpoint` |
103+
| `.odp` | `application/vnd.oasis.opendocument.presentation` |
106104

107105
### Images
108106

109-
| Extension | MIME Type |
110-
|-----------|-----------|
111-
| `.jpg`, `.jpeg` | `image/jpeg` |
112-
| `.png` | `image/png` |
113-
| `.gif` | `image/gif` |
114-
| `.bmp` | `image/bmp` |
115-
| `.tiff`, `.tif` | `image/tiff` |
116-
| `.webp` | `image/webp` |
117-
| `.svg` | `image/svg+xml` |
107+
| Extension | MIME Type |
108+
| --------------- | --------------- |
109+
| `.jpg`, `.jpeg` | `image/jpeg` |
110+
| `.png` | `image/png` |
111+
| `.gif` | `image/gif` |
112+
| `.bmp` | `image/bmp` |
113+
| `.tiff`, `.tif` | `image/tiff` |
114+
| `.webp` | `image/webp` |
115+
| `.svg` | `image/svg+xml` |
118116

119117
### Text and Markup
120118

121-
| Extension | MIME Type |
122-
|-----------|-----------|
123-
| `.txt` | `text/plain` |
124-
| `.md`, `.markdown` | `text/markdown` |
125-
| `.html`, `.htm` | `text/html` |
126-
| `.xml` | `application/xml` |
127-
| `.json` | `application/json` |
128-
| `.yaml` | `application/x-yaml` |
129-
| `.toml` | `application/toml` |
119+
| Extension | MIME Type |
120+
| ------------------ | -------------------- |
121+
| `.txt` | `text/plain` |
122+
| `.md`, `.markdown` | `text/markdown` |
123+
| `.html`, `.htm` | `text/html` |
124+
| `.xml` | `application/xml` |
125+
| `.json` | `application/json` |
126+
| `.yaml` | `application/x-yaml` |
127+
| `.toml` | `application/toml` |
130128

131129
### Email
132130

133-
| Extension | MIME Type |
134-
|-----------|-----------|
135-
| `.eml` | `message/rfc822` |
136-
| `.msg` | `application/vnd.ms-outlook` |
131+
| Extension | MIME Type |
132+
| --------- | ---------------------------- |
133+
| `.eml` | `message/rfc822` |
134+
| `.msg` | `application/vnd.ms-outlook` |
137135

138136
### Archives
139137

140-
| Extension | MIME Type |
141-
|-----------|-----------|
142-
| `.zip` | `application/zip` |
143-
| `.tar` | `application/x-tar` |
144-
| `.gz` | `application/gzip` |
145-
| `.7z` | `application/x-7z-compressed` |
138+
| Extension | MIME Type |
139+
| --------- | ----------------------------- |
140+
| `.zip` | `application/zip` |
141+
| `.tar` | `application/x-tar` |
142+
| `.gz` | `application/gzip` |
143+
| `.7z` | `application/x-7z-compressed` |
146144

147145
### Ebooks
148146

149-
| Extension | MIME Type |
150-
|-----------|-----------|
151-
| `.epub` | `application/epub+zip` |
152-
| `.mobi` | `application/x-mobipocket-ebook` |
147+
| Extension | MIME Type |
148+
| --------- | -------------------------------- |
149+
| `.epub` | `application/epub+zip` |
150+
| `.mobi` | `application/x-mobipocket-ebook` |
153151

154152
## Explicit MIME Type Override
155153

@@ -189,15 +187,15 @@ result = extract_file("document.pdf", mime_type=PDF_MIME_TYPE, config=config)
189187
Kreuzberg provides utility functions for MIME type operations:
190188

191189
```rust title="mime_detection.rs"
192-
// Automatically detect MIME type from file path extension
193-
pub fn detect_mime_type(path: impl AsRef<Path>) -> Result<String>
190+
// Detect MIME type from a file path (by extension). If check_exists is true, the file must exist.
191+
pub fn detect_mime_type(path: impl AsRef<Path>, check_exists: bool) -> Result<String>
194192

195-
// Verify that a MIME type is supported by Kreuzberg
196-
pub fn validate_mime_type(mime_type: &str) -> Result<()>
193+
// Validate that a MIME type is supported; returns the validated (possibly normalized) string.
194+
pub fn validate_mime_type(mime_type: &str) -> Result<String>
197195

198-
// Auto-detect MIME type or validate explicit type if provided
196+
// If mime_type is provided, validate it; otherwise detect from path. Errors if both are None.
199197
pub fn detect_or_validate(
200-
path: impl AsRef<Path>,
198+
path: Option<&Path>,
201199
mime_type: Option<&str>
202200
) -> Result<String>
203201
```

0 commit comments

Comments
 (0)