Skip to content

Commit e266428

Browse files
committed
fix: fix doc tests across Rust and Python, add C to lint workflow, update README
- Fix 9 Rust doc tests missing `extracted_keywords` field in plugin examples - Enable Python doctests via pytest --doctest-modules (50 passing, 4 skipped) - Fix Python docstring examples: proper traceback format, +SKIP for FFI calls - Fix PyO3 keyword enum variants and invalid kwargs in Python-facing Rust docs - Add C to lint workflow (all, check, systems groups + standalone tasks) - Add C-FFI to README Installation section, Platform Support table, Polyglot desc
1 parent 366a6fc commit e266428

File tree

14 files changed

+137
-63
lines changed

14 files changed

+137
-63
lines changed

.task/workflows/lint.yml

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ tasks:
1515
- task: php:lint
1616
- task: elixir:lint
1717
- task: r:lint
18+
- task: c:lint
1819
- task: typescript:typecheck
1920

2021
check:
@@ -31,6 +32,7 @@ tasks:
3132
- task: php:lint:check
3233
- task: elixir:lint:check
3334
- task: r:lint:check
35+
- task: c:lint:check
3436
- task: typescript:typecheck:check
3537

3638
core:
@@ -70,18 +72,20 @@ tasks:
7072
- task: typescript:typecheck:check
7173

7274
systems:
73-
desc: Lint systems languages (Rust, Go, C#) with auto-fix
75+
desc: Lint systems languages (Rust, Go, C#, C) with auto-fix
7476
cmds:
7577
- task: rust:lint
7678
- task: go:lint
7779
- task: csharp:lint
80+
- task: c:lint
7881

7982
systems:check:
80-
desc: Check systems languages linting without modifications
83+
desc: Check systems languages linting (Rust, Go, C#, C) without modifications
8184
cmds:
8285
- task: rust:lint:check
8386
- task: go:lint:check
8487
- task: csharp:lint:check
88+
- task: c:lint:check
8589

8690
jvm-bindings:
8791
desc: Lint JVM-dependent bindings (Java, C#, Ruby, PHP) with auto-fix
@@ -215,6 +219,16 @@ tasks:
215219
cmds:
216220
- task: r:lint:check
217221

222+
c:
223+
desc: Lint C code with auto-fix
224+
cmds:
225+
- task: c:lint
226+
227+
c:check:
228+
desc: Check C code linting without modifications
229+
cmds:
230+
- task: c:lint:check
231+
218232
typescript:
219233
desc: Run TypeScript type checking
220234
cmds:

README.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ Extract text and metadata from a wide range of file formats (75+), generate embe
6565
## Key Features
6666

6767
- **Extensible architecture** – Plugin system for custom OCR backends, validators, post-processors, and document extractors
68-
- **Polyglot** – Native bindings for Rust, Python, TypeScript/Node.js, Ruby, Go, Java, C#, PHP, and Elixir
68+
- **Polyglot** – Native bindings for Rust, Python, TypeScript/Node.js, Ruby, Go, Java, C#, PHP, Elixir, R, and C
6969
- **75+ file formats** – PDF, Office documents, images, HTML, XML, emails, archives, academic formats across 8 categories
7070
- **OCR support** – Tesseract (all bindings, including Tesseract-WASM for browsers), PaddleOCR (all native bindings), EasyOCR (Python), extensible via plugin API
7171
- **High performance** – Rust core with native PDFium, SIMD optimizations and full parallelism
@@ -96,6 +96,7 @@ Each language binding provides comprehensive documentation with examples and bes
9696

9797
**Native:**
9898
- **[Rust](https://github.com/kreuzberg-dev/kreuzberg/tree/main/crates/kreuzberg)** – Core library, flexible feature flags, zero-copy APIs
99+
- **[C (FFI)](https://github.com/kreuzberg-dev/kreuzberg/tree/main/crates/kreuzberg-ffi)** – C header + shared library, pkg-config/CMake support, cross-platform
99100

100101
**Containers:**
101102
- **[Docker](https://docs.kreuzberg.dev/guides/docker/)** – Official images with API, CLI, and MCP server modes (Core: ~1.0-1.3GB, Full: ~1.0-1.3GB with OCR + legacy format support)
@@ -122,6 +123,7 @@ Complete architecture coverage across all language bindings:
122123
| C# | ✅ | ✅ | ✅ | ✅ |
123124
| PHP | ✅ | ✅ | ✅ | ✅ |
124125
| Rust | ✅ | ✅ | ✅ | ✅ |
126+
| C (FFI) | ✅ | ✅ | ✅ | ✅ |
125127
| CLI | ✅ | ✅ | ✅ | ✅ |
126128
| Docker | ✅ | ✅ | ✅ | - |
127129

crates/kreuzberg-py/src/keywords.rs

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ use pyo3::prelude::*;
99
///
1010
/// Example:
1111
/// >>> from kreuzberg import KeywordAlgorithm
12-
/// >>> algo = KeywordAlgorithm.YAKE
12+
/// >>> algo = KeywordAlgorithm.Yake
13+
/// >>> assert algo == KeywordAlgorithm.Yake
1314
#[pyclass(name = "KeywordAlgorithm", module = "kreuzberg")]
1415
#[derive(Clone, Copy, PartialEq, Eq)]
1516
pub enum KeywordAlgorithm {
@@ -42,7 +43,8 @@ impl From<kreuzberg::keywords::KeywordAlgorithm> for KeywordAlgorithm {
4243
///
4344
/// Example:
4445
/// >>> from kreuzberg import YakeParams
45-
/// >>> params = YakeParams(window_size=3, deduplicate=True, dedup_threshold=0.8)
46+
/// >>> params = YakeParams(window_size=3)
47+
/// >>> assert params.window_size == 3
4648
#[pyclass(name = "YakeParams", module = "kreuzberg")]
4749
#[derive(Clone)]
4850
pub struct YakeParams {
@@ -157,11 +159,12 @@ impl From<kreuzberg::keywords::RakeParams> for RakeParams {
157159
/// Example:
158160
/// >>> from kreuzberg import KeywordConfig, KeywordAlgorithm
159161
/// >>> config = KeywordConfig(
160-
/// ... algorithm=KeywordAlgorithm.YAKE,
162+
/// ... algorithm=KeywordAlgorithm.Yake,
161163
/// ... max_keywords=15,
162164
/// ... min_score=0.1,
163165
/// ... language="en"
164166
/// ... )
167+
/// >>> assert config.max_keywords == 15
165168
#[pyclass(name = "KeywordConfig", module = "kreuzberg")]
166169
#[derive(Clone)]
167170
pub struct KeywordConfig {

crates/kreuzberg-py/src/types.rs

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@ use crate::plugins::json_value_to_py;
2323
///
2424
/// Example:
2525
/// >>> from kreuzberg import extract_file_sync, ExtractionConfig
26-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
27-
/// >>> print(result.content)
28-
/// >>> print(result.metadata)
29-
/// >>> print(len(result.tables))
30-
/// >>> if result.detected_languages:
26+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
27+
/// >>> print(result.content) # doctest: +SKIP
28+
/// >>> print(result.metadata) # doctest: +SKIP
29+
/// >>> print(len(result.tables)) # doctest: +SKIP
30+
/// >>> if result.detected_languages: # doctest: +SKIP
3131
/// ... print(result.detected_languages)
32-
/// >>> if result.document:
32+
/// >>> if result.document: # doctest: +SKIP
3333
/// ... print(f"Document has {len(result.document['nodes'])} nodes")
3434
#[pyclass(name = "ExtractionResult", module = "kreuzberg")]
3535
pub struct ExtractionResult {
@@ -167,9 +167,9 @@ impl ExtractionResult {
167167
/// int: Total page count
168168
///
169169
/// Example:
170-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
171-
/// >>> page_count = result.get_page_count()
172-
/// >>> print(f"Document has {page_count} pages")
170+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
171+
/// >>> page_count = result.get_page_count() # doctest: +SKIP
172+
/// >>> print(f"Document has {page_count} pages") # doctest: +SKIP
173173
#[pyo3(name = "get_page_count")]
174174
fn get_page_count(&self) -> usize {
175175
Python::attach(|py| self.pages.as_ref().map(|pages_py| pages_py.bind(py).len()).unwrap_or(0))
@@ -186,9 +186,9 @@ impl ExtractionResult {
186186
/// Example:
187187
/// >>> from kreuzberg import ChunkingConfig, ExtractionConfig
188188
/// >>> config = ExtractionConfig(chunking=ChunkingConfig(max_chars=500))
189-
/// >>> result = extract_file_sync("document.pdf", None, config)
190-
/// >>> chunk_count = result.get_chunk_count()
191-
/// >>> print(f"Document has {chunk_count} chunks")
189+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
190+
/// >>> chunk_count = result.get_chunk_count() # doctest: +SKIP
191+
/// >>> print(f"Document has {chunk_count} chunks") # doctest: +SKIP
192192
#[pyo3(name = "get_chunk_count")]
193193
fn get_chunk_count(&self) -> usize {
194194
Python::attach(|py| {
@@ -212,9 +212,9 @@ impl ExtractionResult {
212212
/// >>> config = ExtractionConfig(
213213
/// ... language_detection=LanguageDetectionConfig(enabled=True)
214214
/// ... )
215-
/// >>> result = extract_file_sync("document.pdf", None, config)
216-
/// >>> lang = result.get_detected_language()
217-
/// >>> if lang:
215+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
216+
/// >>> lang = result.get_detected_language() # doctest: +SKIP
217+
/// >>> if lang: # doctest: +SKIP
218218
/// ... print(f"Document language: {lang}")
219219
#[pyo3(name = "get_detected_language")]
220220
fn get_detected_language(&self) -> Option<String> {
@@ -242,12 +242,12 @@ impl ExtractionResult {
242242
/// Any | None: Field value (type depends on field), or None if not found
243243
///
244244
/// Example:
245-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
246-
/// >>> title = result.get_metadata_field("title")
247-
/// >>> if title:
245+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
246+
/// >>> title = result.get_metadata_field("title") # doctest: +SKIP
247+
/// >>> if title: # doctest: +SKIP
248248
/// ... print(f"Title: {title}")
249-
/// >>> authors = result.get_metadata_field("authors")
250-
/// >>> if authors:
249+
/// >>> authors = result.get_metadata_field("authors") # doctest: +SKIP
250+
/// >>> if authors: # doctest: +SKIP
251251
/// ... print(f"Authors: {authors}")
252252
#[pyo3(name = "get_metadata_field")]
253253
fn get_metadata_field(&self, field_name: &str) -> PyResult<Option<Py<PyAny>>> {
@@ -786,8 +786,8 @@ mod tests {
786786
/// Example:
787787
/// >>> from kreuzberg import ChunkingConfig, ExtractionConfig
788788
/// >>> config = ExtractionConfig(chunking=ChunkingConfig(max_chars=500))
789-
/// >>> result = extract_file_sync("document.pdf", None, config)
790-
/// >>> for chunk in result.chunks:
789+
/// >>> result = extract_file_sync("document.pdf", None, config) # doctest: +SKIP
790+
/// >>> for chunk in result.chunks: # doctest: +SKIP
791791
/// ... print(f"Chunk: {chunk.content[:50]}...")
792792
/// ... print(f"Metadata: {chunk.metadata}")
793793
#[pyclass(name = "Chunk", module = "kreuzberg")]
@@ -842,8 +842,8 @@ impl PyChunk {
842842
/// page_number (int): Page number where table was found
843843
///
844844
/// Example:
845-
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig())
846-
/// >>> for table in result.tables:
845+
/// >>> result = extract_file_sync("document.pdf", None, ExtractionConfig()) # doctest: +SKIP
846+
/// >>> for table in result.tables: # doctest: +SKIP
847847
/// ... print(f"Table on page {table.page_number}:")
848848
/// ... print(table.markdown)
849849
/// ... print(f"Dimensions: {len(table.cells)} rows x {len(table.cells[0])} cols")

crates/kreuzberg/src/plugins/extractor/registry.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,8 @@ use std::sync::Arc;
6161
/// elements: None,
6262
/// ocr_elements: None,
6363
/// document: None,
64+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
65+
/// extracted_keywords: None,
6466
/// quality_score: None,
6567
/// processing_warnings: vec![],
6668
/// annotations: None,

crates/kreuzberg/src/plugins/extractor/trait.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ use crate::KreuzbergError;
6868
/// elements: None,
6969
/// ocr_elements: None,
7070
/// document: None,
71+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
72+
/// extracted_keywords: None,
7173
/// quality_score: None,
7274
/// processing_warnings: vec![],
7375
/// annotations: None,
@@ -155,6 +157,8 @@ pub trait DocumentExtractor: Plugin {
155157
/// elements: None,
156158
/// ocr_elements: None,
157159
/// document: None,
160+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
161+
/// extracted_keywords: None,
158162
/// quality_score: None,
159163
/// processing_warnings: vec![],
160164
/// annotations: None,
@@ -233,6 +237,8 @@ pub trait DocumentExtractor: Plugin {
233237
/// elements: None,
234238
/// ocr_elements: None,
235239
/// document: None,
240+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
241+
/// extracted_keywords: None,
236242
/// quality_score: None,
237243
/// processing_warnings: vec![],
238244
/// annotations: None,

crates/kreuzberg/src/plugins/mod.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
//! # elements: None,
5353
//! # ocr_elements: None,
5454
//! # document: None,
55+
//! # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
56+
//! # extracted_keywords: None,
5557
//! # quality_score: None,
5658
//! # processing_warnings: vec![],
5759
//! # annotations: None,
@@ -72,6 +74,8 @@
7274
//! # elements: None,
7375
//! # ocr_elements: None,
7476
//! # document: None,
77+
//! # #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
78+
//! # extracted_keywords: None,
7579
//! # quality_score: None,
7680
//! # processing_warnings: vec![],
7781
//! # annotations: None,
@@ -141,6 +145,8 @@
141145
//! elements: None,
142146
//! ocr_elements: None,
143147
//! document: None,
148+
//! #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
149+
//! extracted_keywords: None,
144150
//! quality_score: None,
145151
//! processing_warnings: vec![],
146152
//! annotations: None,

crates/kreuzberg/src/plugins/ocr.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ pub enum OcrBackendType {
7373
/// elements: None,
7474
/// ocr_elements: None,
7575
/// document: None,
76+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
77+
/// extracted_keywords: None,
7678
/// quality_score: None,
7779
/// processing_warnings: vec![],
7880
/// annotations: None,
@@ -160,6 +162,8 @@ pub trait OcrBackend: Plugin {
160162
/// elements: None,
161163
/// ocr_elements: None,
162164
/// document: None,
165+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
166+
/// extracted_keywords: None,
163167
/// quality_score: None,
164168
/// processing_warnings: vec![],
165169
/// annotations: None,
@@ -339,6 +343,8 @@ pub trait OcrBackend: Plugin {
339343
/// elements: None,
340344
/// ocr_elements: None,
341345
/// document: None,
346+
/// #[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
347+
/// extracted_keywords: None,
342348
/// quality_score: None,
343349
/// processing_warnings: vec![],
344350
/// annotations: None,

0 commit comments

Comments
 (0)