Skip to content

Commit e266428

Browse files
committed
fix: fix doc tests across Rust and Python, add C to lint workflow, update README
- Fix 9 Rust doc tests missing `extracted_keywords` field in plugin examples - Enable Python doctests via pytest --doctest-modules (50 passing, 4 skipped) - Fix Python docstring examples: proper traceback format, +SKIP for FFI calls - Fix PyO3 keyword enum variants and invalid kwargs in Python-facing Rust docs - Add C to lint workflow (all, check, systems groups + standalone tasks) - Add C-FFI to README Installation section, Platform Support table, Polyglot desc
1 parent 366a6fc commit e266428

File tree

14 files changed

+137
-63
lines changed

14 files changed

+137
-63
lines changed

.task/workflows/lint.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ tasks:
1515
- task: php:lint
1616
- task: elixir:lint
1717
- task: r:lint
18+
- task: c:lint
1819
- task: typescript:typecheck
1920

2021
check:
@@ -31,6 +32,7 @@ tasks:
3132
- task: php:lint:check
3233
- task: elixir:lint:check
3334
- task: r:lint:check
35+
- task: c:lint:check
3436
- task: typescript:typecheck:check
3537

3638
core:
@@ -70,18 +72,20 @@ tasks:
7072
- task: typescript:typecheck:check
7173

7274
systems:
73-
desc: Lint systems languages (Rust, Go, C#) with auto-fix
75+
desc: Lint systems languages (Rust, Go, C#, C) with auto-fix
7476
cmds:
7577
- task: rust:lint
7678
- task: go:lint
7779
- task: csharp:lint
80+
- task: c:lint
7881

7982
systems:check:
80-
desc: Check systems languages linting without modifications
83+
desc: Check systems languages linting (Rust, Go, C#, C) without modifications
8184
cmds:
8285
- task: rust:lint:check
8386
- task: go:lint:check
8487
- task: csharp:lint:check
88+
- task: c:lint:check
8589

8690
jvm-bindings:
8791
desc: Lint JVM-dependent bindings (Java, C#, Ruby, PHP) with auto-fix
@@ -215,6 +219,16 @@ tasks:
215219
cmds:
216220
- task: r:lint:check
217221

222+
c:
223+
desc: Lint C code with auto-fix
224+
cmds:
225+
- task: c:lint
226+
227+
c:check:
228+
desc: Check C code linting without modifications
229+
cmds:
230+
- task: c:lint:check
231+
218232
typescript:
219233
desc: Run TypeScript type checking
220234
cmds:

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ Extract text and metadata from a wide range of file formats (75+), generate embe
6565
## Key Features
6666

6767
- **Extensible architecture** – Plugin system for custom OCR backends, validators, post-processors, and document extractors
68-
- **Polyglot** – Native bindings for Rust, Python, TypeScript/Node.js, Ruby, Go, Java, C#, PHP, and Elixir
68+
- **Polyglot** – Native bindings for Rust, Python, TypeScript/Node.js, Ruby, Go, Java, C#, PHP, Elixir, R, and C
6969
- **75+ file formats** – PDF, Office documents, images, HTML, XML, emails, archives, academic formats across 8 categories
7070
- **OCR support** – Tesseract (all bindings, including Tesseract-WASM for browsers), PaddleOCR (all native bindings), EasyOCR (Python), extensible via plugin API
7171
- **High performance** – Rust core with native PDFium, SIMD optimizations and full parallelism
@@ -96,6 +96,7 @@ Each language binding provides comprehensive documentation with examples and bes
9696

9797
**Native:**
9898
- **[Rust](https://github.com/kreuzberg-dev/kreuzberg/tree/main/crates/kreuzberg)** – Core library, flexible feature flags, zero-copy APIs
99+
- **[C (FFI)](https://github.com/kreuzberg-dev/kreuzberg/tree/main/crates/kreuzberg-ffi)** – C header + shared library, pkg-config/CMake support, cross-platform
99100

100101
**Containers:**
101102
- **[Docker](https://docs.kreuzberg.dev/guides/docker/)** – Official images with API, CLI, and MCP server modes (Core: ~1.0-1.3GB, Full: ~1.0-1.3GB with OCR + legacy format support)
@@ -122,6 +123,7 @@ Complete architecture coverage across all language bindings:
122123
| C# | ✅ | ✅ | ✅ | ✅ |
123124
| PHP | ✅ | ✅ | ✅ | ✅ |
124125
| Rust | ✅ | ✅ | ✅ | ✅ |
126+
| C (FFI) | ✅ | ✅ | ✅ | ✅ |
125127
| CLI | ✅ | ✅ | ✅ | ✅ |
126128
| Docker | ✅ | ✅ | ✅ | - |
127129

crates/kreuzberg-py/src/keywords.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ use pyo3::prelude::*;
99
///
1010
/// Example:
1111
/// >>> from kreuzberg import KeywordAlgorithm
12-
/// >>> algo = KeywordAlgorithm.YAKE
12+
/// >>> algo = KeywordAlgorithm.Yake
13+
/// >>> assert algo == KeywordAlgorithm.Yake
1314
#[pyclass(name = "KeywordAlgorithm", module = "kreuzberg")]
1415
#[derive(Clone, Copy, PartialEq, Eq)]
1516
pub enum KeywordAlgorithm {
@@ -42,7 +43,8 @@ impl From<kreuzberg::keywords::KeywordAlgorithm> for KeywordAlgorithm {
4243
///
4344
/// Example:
4445
/// >>> from kreuzberg import YakeParams
45-
/// >>> params = YakeParams(window_size=3, deduplicate=True, dedup_threshold=0.8)
46+
/// >>> params = YakeParams(window_size=3)
47+
/// >>> assert params.window_size == 3
4648
#[pyclass(name = "YakeParams", module = "kreuzberg")]
4749
#[derive(Clone)]
4850
pub struct YakeParams {
@@ -157,11 +159,12 @@ impl From<kreuzberg::keywords::RakeParams> for RakeParams {
157159
/// Example:
158160
/// >>> from kreuzberg import KeywordConfig, KeywordAlgorithm
159161
/// >>> config = KeywordConfig(
160-
/// ... algorithm=KeywordAlgorithm.YAKE,
162+
/// ... algorithm=KeywordAlgorithm.Yake,
161163
/// ... max_keywords=15,
162164
/// ... min_score=0.1,
163165
/// ... language="en"
164166
/// ... )
167+
/// >>> assert config.max_keywords == 15
165168
#[pyclass(name = "KeywordConfig", module = "kreuzberg")]
166169
#[derive(Clone)]
167170
pub struct KeywordConfig {

crates/kreuzberg-py/src/types.rs

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ use crate::plugins::json_value_to_py;
2323
///
2424
/// Example:
2525
/// >>> from kreuzberg import extract_file_sync, ExtractionConfig
26-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
27-
/// >>> print(result.content)
28-
/// >>> print(result.metadata)
29-
/// >>> print(len(result.tables))
30-
/// >>> if result.detected_languages:
26+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
27+
/// >>> print(result.content) # doctest: +SKIP
28+
/// >>> print(result.metadata) # doctest: +SKIP
29+
/// >>> print(len(result.tables)) # doctest: +SKIP
30+
/// >>> if result.detected_languages: # doctest: +SKIP
3131
/// ... print(result.detected_languages)
32-
/// >>> if result.document:
32+
/// >>> if result.document: # doctest: +SKIP
3333
/// ... print(f"Document has {len(result.document['nodes'])} nodes")
3434
#[pyclass(name = "ExtractionResult", module = "kreuzberg")]
3535
pub struct ExtractionResult {
@@ -167,9 +167,9 @@ impl ExtractionResult {
167167
/// int: Total page count
168168
///
169169
/// Example:
170-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
171-
/// >>> page_count = result.get_page_count()
172-
/// >>> print(f"Document has {page_count} pages")
170+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
171+
/// >>> page_count = result.get_page_count() # doctest: +SKIP
172+
/// >>> print(f"Document has {page_count} pages") # doctest: +SKIP
173173
#[pyo3(name = "get_page_count")]
174174
fn get_page_count(&self) -> usize {
175175
Python::attach(|py| self.pages.as_ref().map(|pages_py| pages_py.bind(py).len()).unwrap_or(0))
@@ -186,9 +186,9 @@ impl ExtractionResult {
186186
/// Example:
187187
/// >>> from kreuzberg import ChunkingConfig, ExtractionConfig
188188
/// >>> config = ExtractionConfig(chunking=ChunkingConfig(max_chars=500))
189-
/// >>> result = extract_file_sync("document.pdf", None, config)
190-
/// >>> chunk_count = result.get_chunk_count()
191-
/// >>> print(f"Document has {chunk_count} chunks")
189+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
190+
/// >>> chunk_count = result.get_chunk_count() # doctest: +SKIP
191+
/// >>> print(f"Document has {chunk_count} chunks") # doctest: +SKIP
192192
#[pyo3(name = "get_chunk_count")]
193193
fn get_chunk_count(&self) -> usize {
194194
Python::attach(|py| {
@@ -212,9 +212,9 @@ impl ExtractionResult {
212212
/// >>> config = ExtractionConfig(
213213
/// ... language_detection=LanguageDetectionConfig(enabled=True)
214214
/// ... )
215-
/// >>> result = extract_file_sync("document.pdf", None, config)
216-
/// >>> lang = result.get_detected_language()
217-
/// >>> if lang:
215+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
216+
/// >>> lang = result.get_detected_language() # doctest: +SKIP
217+
/// >>> if lang: # doctest: +SKIP
218218
/// ... print(f"Document language: {lang}")
219219
#[pyo3(name = "get_detected_language")]
220220
fn get_detected_language(&self) -> Option<String> {
@@ -242,12 +242,12 @@ impl ExtractionResult {
242242
/// Any | None: Field value (type depends on field), or None if not found
243243
///
244244
/// Example:
245-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
246-
/// >>> title = result.get_metadata_field("title")
247-
/// >>> if title:
245+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
246+
/// >>> title = result.get_metadata_field("title") # doctest: +SKIP
247+
/// >>> if title: # doctest: +SKIP
248248
/// ... print(f"Title: {title}")
249-
/// >>> authors = result.get_metadata_field("authors")
250-
/// >>> if authors:
249+
/// >>> authors = result.get_metadata_field("authors") # doctest: +SKIP
250+
/// >>> if authors: # doctest: +SKIP
251251
/// ... print(f"Authors: {authors}")
252252
#[pyo3(name = "get_metadata_field")]
253253
fn get_metadata_field(&self, field_name: &str) -> PyResult<Option<Py<PyAny>>> {
@@ -786,8 +786,8 @@ mod tests {
786786
/// Example:
787787
/// >>> from kreuzberg import ChunkingConfig, ExtractionConfig
788788
/// >>> config = ExtractionConfig(chunking=ChunkingConfig(max_chars=500))
789-
/// >>> result = extract_file_sync("document.pdf", None, config)
790-
/// >>> for chunk in result.chunks:
789+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
790+
/// >>> for chunk in result.chunks: # doctest: +SKIP
791791
/// ... print(f"Chunk: {chunk.content[:50]}...")
792792
/// ... print(f"Metadata: {chunk.metadata}")
793793
#[pyclass(name = "Chunk", module = "kreuzberg")]
@@ -842,8 +842,8 @@ impl PyChunk {
842842
/// page_number (int): Page number where table was found
843843
///
844844
/// Example:
845-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
846-
/// >>> for table in result.tables:
845+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
846+
/// >>> for table in result.tables: # doctest: +SKIP
847847
/// ... print(f"Table on page {table.page_number}:")
848848
/// ... print(table.markdown)
849849
/// ... print(f"Dimensions: {len(table.cells)} rows x {len(table.cells[0])} cols")

crates/kreuzberg/src/plugins/extractor/registry.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ use std::sync::Arc;
6161
/// elements: None,
6262
/// ocr_elements: None,
6363
/// document: None,
64+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
65+
/// extracted_keywords: None,
6466
/// quality_score: None,
6567
/// processing_warnings: vec![],
6668
/// annotations: None,

crates/kreuzberg/src/plugins/extractor/trait.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ use crate::KreuzbergError;
6868
/// elements: None,
6969
/// ocr_elements: None,
7070
/// document: None,
71+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
72+
/// extracted_keywords: None,
7173
/// quality_score: None,
7274
/// processing_warnings: vec![],
7375
/// annotations: None,
@@ -155,6 +157,8 @@ pub trait DocumentExtractor: Plugin {
155157
/// elements: None,
156158
/// ocr_elements: None,
157159
/// document: None,
160+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
161+
/// extracted_keywords: None,
158162
/// quality_score: None,
159163
/// processing_warnings: vec![],
160164
/// annotations: None,
@@ -233,6 +237,8 @@ pub trait DocumentExtractor: Plugin {
233237
/// elements: None,
234238
/// ocr_elements: None,
235239
/// document: None,
240+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
241+
/// extracted_keywords: None,
236242
/// quality_score: None,
237243
/// processing_warnings: vec![],
238244
/// annotations: None,

crates/kreuzberg/src/plugins/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
//! # elements: None,
5353
//! # ocr_elements: None,
5454
//! # document: None,
55+
//! # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
56+
//! # extracted_keywords: None,
5557
//! # quality_score: None,
5658
//! # processing_warnings: vec![],
5759
//! # annotations: None,
@@ -72,6 +74,8 @@
7274
//! # elements: None,
7375
//! # ocr_elements: None,
7476
//! # document: None,
77+
//! # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
78+
//! # extracted_keywords: None,
7579
//! # quality_score: None,
7680
//! # processing_warnings: vec![],
7781
//! # annotations: None,
@@ -141,6 +145,8 @@
141145
//! elements: None,
142146
//! ocr_elements: None,
143147
//! document: None,
148+
//! #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
149+
//! extracted_keywords: None,
144150
//! quality_score: None,
145151
//! processing_warnings: vec![],
146152
//! annotations: None,

crates/kreuzberg/src/plugins/ocr.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ pub enum OcrBackendType {
7373
/// elements: None,
7474
/// ocr_elements: None,
7575
/// document: None,
76+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
77+
/// extracted_keywords: None,
7678
/// quality_score: None,
7779
/// processing_warnings: vec![],
7880
/// annotations: None,
@@ -160,6 +162,8 @@ pub trait OcrBackend: Plugin {
160162
/// elements: None,
161163
/// ocr_elements: None,
162164
/// document: None,
165+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
166+
/// extracted_keywords: None,
163167
/// quality_score: None,
164168
/// processing_warnings: vec![],
165169
/// annotations: None,
@@ -339,6 +343,8 @@ pub trait OcrBackend: Plugin {
339343
/// elements: None,
340344
/// ocr_elements: None,
341345
/// document: None,
346+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
347+
/// extracted_keywords: None,
342348
/// quality_score: None,
343349
/// processing_warnings: vec![],
344350
/// annotations: None,

0 commit comments

Comments
 (0)