kreuzberg-dev · Goldziher · Jan 18, 2026 · Jan 18, 2026 · Jan 18, 2026 · Jan 18, 2026
diff --git a/.github/workflows/ci-ruby.yaml b/.github/workflows/ci-ruby.yaml
@@ -656,17 +656,17 @@ jobs:
           cd packages/ruby
           echo "lib directory contents:"
           if [ -d "lib" ]; then
-            find lib -type f | head -20
+            find lib -type f | head -20 || true
           else
             echo "ERROR: lib directory not found"
           fi
           echo ""
           echo "=== Looking for compiled extension ==="
-          find . \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" \) 2>/dev/null | head -20
+          find . \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" \) 2>/dev/null | head -20 || true
           echo ""
           if [ -f "mkmf.log" ]; then
             echo "=== mkmf.log (last 100 lines) ==="
-            tail -100 mkmf.log
+            tail -100 mkmf.log || true
           fi
 
       - name: Build kreuzberg CLI binary

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,90 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+#### API
+- **POST /chunk endpoint**: New text chunking endpoint for breaking text into smaller pieces
+  - Accepts JSON body with `text`, `chunker_type` (text/markdown), and optional `config`
+  - Returns chunks with byte offsets, indices, and metadata
+  - Configuration options: `max_characters` (default: 2000), `overlap` (default: 100), `trim` (default: true)
+  - Supports both text and markdown chunking strategies
+  - Case-insensitive `chunker_type` parameter
+  - Comprehensive error handling for invalid inputs
+
+#### Core
+- **Element-based output format**: New `OutputFormat::ElementBased` option provides Unstructured.io-compatible semantic element extraction
+  - Extracts structured elements: titles, paragraphs, lists, tables, images, page breaks, headings, code blocks, block quotes, headers, footers
+  - Each element includes rich metadata: bounding boxes, page numbers, confidence scores, hierarchy information
+  - Transformation pipeline converts unified output to element-based format via `extraction::transform` module
+  - Added `Element`, `ElementType`, `ElementMetadata`, and `BoundingBox` types to core types module
+  - Supports PDF hierarchy detection for semantic heading levels
+  - Configuration via `config.output_format` field (defaults to `Unified`)
+
+#### Language Bindings
+- **Python**: Element-based output support with full type hints
+  - New `output_format` parameter in extraction config accepting `"unified"` or `"element_based"`
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` types exported from `kreuzberg.types`
+  - Result includes `elements` field when using element-based format
+  - Compatible with Unstructured.io API for migration
+
+- **TypeScript/Node.js**: Element-based output with strict TypeScript interfaces
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` interfaces in `@kreuzberg/core`
+  - `outputFormat: "unified" | "element_based"` configuration option
+  - Result type includes optional `elements` array
+
+- **Ruby**: Element-based output with idiomatic Ruby types
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes in `Kreuzberg::Types`
+  - snake_case serialization following Ruby conventions
+  - `output_format: :unified` or `:element_based` symbol-based configuration
+
+- **PHP**: Element-based output with typed classes
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes in `Kreuzberg\Types`
+  - `outputFormat` field in extraction config
+  - `$result->elements` array when using element-based format
+
+- **Go**: Element-based output with idiomatic Go structs
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` types with JSON tags
+  - `OutputFormat` field in extraction config
+  - Result struct includes `Elements` slice
+
+- **Java**: Element-based output with builder pattern
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes with builders
+  - `outputFormat` field in `ExtractionConfig`
+  - `ExtractionResult.getElements()` method
+
+- **C#**: Element-based output with nullable reference types
+  - `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes
+  - `OutputFormat` property in extraction config
+  - `ExtractionResult.Elements` property
+
+- **Elixir**: Element-based output with pattern matching
+  - `Kreuzberg.Element` module with typespecs
+  - `:output_format` option in config accepting `:unified` or `:element_based`
+  - Result map includes `:elements` key with element list
+
+- **WASM**: Element-based output with TypeScript definitions
+  - Element types exported to WASM TypeScript bindings
+  - `output_format` configuration option
+  - Elements accessible from extraction result
+
+#### Documentation
+- **Migration guides**: New documentation for Unstructured.io users
+  - `docs/migration/from-unstructured.md`: Step-by-step migration guide with code examples
+  - `docs/comparisons/kreuzberg-vs-unstructured.md`: Feature comparison and compatibility matrix
+  - Element-based output guide: `docs/guides/element-based-output.md` covering all 11 element types
+  - Type reference updates: Added Element, ElementType, ElementMetadata, BoundingBox, OutputFormat
+  - Code snippets for element-based extraction in all 10 languages
+
+### Fixed
+
+#### Python
+- **Type exports**: Fixed missing type exports in `kreuzberg.types.__all__`
+  - Added `Element`, `ElementMetadata`, `ElementType`, `BoundingBox` to exported types
+  - Added `HtmlImageMetadata` for HTML image metadata
+  - A total of 32 public types are now properly exported for IDE autocomplete and type checking
+  - Resolves import failures where types were defined but not accessible
+
 ---
 
 ## [4.0.8] - 2026-01-17

diff --git a/crates/kreuzberg-ffi/benches/result_view_benchmark.rs b/crates/kreuzberg-ffi/benches/result_view_benchmark.rs
@@ -70,6 +70,7 @@ fn create_test_result(content_size: usize, chunk_count: usize) -> ExtractionResu
         chunks,
         images: None,
         pages: None,
+        elements: None,
     }
 }
 

diff --git a/crates/kreuzberg-ffi/src/helpers.rs b/crates/kreuzberg-ffi/src/helpers.rs
@@ -66,6 +66,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
         chunks,
         images,
         pages,
+        elements: _,
     } = result;
 
     let sanitized_content = if content.contains('\0') {
@@ -345,6 +346,7 @@ mod tests {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         };
 
         let c_result = to_c_extraction_result(result);
@@ -382,6 +384,7 @@ mod tests {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         };
 
         let c_result = to_c_extraction_result(result);
@@ -429,6 +432,7 @@ mod tests {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         };
 
         let c_result = to_c_extraction_result(result);
@@ -506,6 +510,7 @@ mod tests {
             chunks: Some(vec![chunk]),
             images: None,
             pages: None,
+            elements: None,
         };
 
         let c_result = to_c_extraction_result(result);

diff --git a/crates/kreuzberg-ffi/src/plugins/ocr_backend.rs b/crates/kreuzberg-ffi/src/plugins/ocr_backend.rs
@@ -167,6 +167,7 @@ impl OcrBackend for FfiOcrBackend {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         })
     }
 

diff --git a/crates/kreuzberg-ffi/src/result.rs b/crates/kreuzberg-ffi/src/result.rs
@@ -399,6 +399,7 @@ mod tests {
             ]),
             images: None,
             pages: None,
+            elements: None,
         }
     }
 

diff --git a/crates/kreuzberg-ffi/src/result_view.rs b/crates/kreuzberg-ffi/src/result_view.rs
@@ -453,6 +453,7 @@ mod tests {
             ]),
             images: None,
             pages: None,
+            elements: None,
         }
     }
 
@@ -717,6 +718,7 @@ mod tests {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         };
 
         let result_ptr = &result as *const ExtractionResult;

diff --git a/crates/kreuzberg-node/src/lib.rs b/crates/kreuzberg-node/src/lib.rs
@@ -24,8 +24,8 @@ use kreuzberg::{
     Chunk as RustChunk, ChunkMetadata as RustChunkMetadata, ChunkingConfig as RustChunkingConfig,
     EmbeddingConfig as RustEmbeddingConfig, EmbeddingModelType as RustEmbeddingModelType, ExtractionConfig,
     ExtractionResult as RustExtractionResult, ImageExtractionConfig as RustImageExtractionConfig, KNOWN_FORMATS,
-    LanguageDetectionConfig as RustLanguageDetectionConfig, OcrConfig as RustOcrConfig, PdfConfig as RustPdfConfig,
-    PostProcessorConfig as RustPostProcessorConfig, TesseractConfig as RustTesseractConfig,
+    LanguageDetectionConfig as RustLanguageDetectionConfig, OcrConfig as RustOcrConfig, OutputFormat,
+    PdfConfig as RustPdfConfig, PostProcessorConfig as RustPostProcessorConfig, TesseractConfig as RustTesseractConfig,
     TokenReductionConfig as RustTokenReductionConfig,
 };
 use lazy_static::lazy_static;
@@ -1277,6 +1277,7 @@ impl TryFrom<JsExtractionConfig> for ExtractionConfig {
             html_options,
             max_concurrent_extractions: val.max_concurrent_extractions.map(|v| v as usize),
             pages: val.pages.map(|p| p.try_into()).transpose()?,
+            output_format: OutputFormat::Unified,
         })
     }
 }
@@ -1898,6 +1899,7 @@ impl TryFrom<JsExtractionResult> for RustExtractionResult {
             chunks,
             images,
             pages: None,
+            elements: None,
         })
     }
 }
@@ -2949,6 +2951,7 @@ impl RustOcrBackend for JsOcrBackend {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         })
     }
 

diff --git a/crates/kreuzberg-php/src/extraction.rs b/crates/kreuzberg-php/src/extraction.rs
@@ -173,6 +173,7 @@ pub fn kreuzberg_extract_bytes(
                         chunks: None,
                         images: None,
                         pages: None,
+                        elements: None,
                     };
 
                     return ExtractionResult::from_rust(rust_result);

diff --git a/crates/kreuzberg-py/src/config.rs b/crates/kreuzberg-py/src/config.rs
@@ -92,6 +92,7 @@ impl ExtractionConfig {
                 html_options: html_options_inner,
                 max_concurrent_extractions,
                 pages: pages.map(Into::into),
+                output_format: Default::default(),
             },
             html_options_dict,
         })

diff --git a/crates/kreuzberg-py/src/plugins.rs b/crates/kreuzberg-py/src/plugins.rs
@@ -675,6 +675,7 @@ fn dict_to_extraction_result(_py: Python<'_>, dict: &Bound<'_, PyAny>) -> Result
         chunks: None,
         images: None,
         pages: None,
+        elements: None,
     })
 }
 

diff --git a/crates/kreuzberg-py/src/types.rs b/crates/kreuzberg-py/src/types.rs
@@ -467,6 +467,7 @@ mod tests {
                 chunks: None,
                 images: None,
                 pages: None,
+                elements: None,
             };
 
             let py_result = ExtractionResult::from_rust(rust_result, py).expect("conversion should succeed");
@@ -493,6 +494,7 @@ mod tests {
                 chunks: None,
                 images: None,
                 pages: None,
+                elements: None,
             };
             rust_result
                 .metadata

diff --git a/crates/kreuzberg-wasm/src/plugins.rs b/crates/kreuzberg-wasm/src/plugins.rs
@@ -901,6 +901,7 @@ impl OcrBackend for JsOcrBackendWrapper {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         })
     }
 
@@ -937,6 +938,7 @@ impl OcrBackend for JsOcrBackendWrapper {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         })
     }
 
@@ -950,6 +952,7 @@ impl OcrBackend for JsOcrBackendWrapper {
             chunks: None,
             images: None,
             pages: None,
+            elements: None,
         })
     }
 

diff --git a/crates/kreuzberg-wasm/typescript/types.ts b/crates/kreuzberg-wasm/typescript/types.ts
@@ -214,6 +214,94 @@ export interface LanguageDetectionConfig {
 	enabled?: boolean;
 }
 
+/**
+ * Semantic element type classification.
+ *
+ * Categorizes text content extracted from documents into semantic units for downstream processing.
+ * Supports element types commonly found in documents processed by Unstructured-compatible systems.
+ *
+ * WASM serialization note: This is serialized from Rust using serde with snake_case transformation.
+ */
+export type ElementType =
+	| "title"
+	| "narrative_text"
+	| "heading"
+	| "list_item"
+	| "table"
+	| "image"
+	| "page_break"
+	| "code_block"
+	| "block_quote"
+	| "footer"
+	| "header";
+
+/**
+ * Bounding box coordinates for element positioning.
+ *
+ * Represents the spatial boundaries of an element on a page.
+ * Coordinates are in document space (typically PDF or image coordinates).
+ *
+ * WASM serialization note: All fields are serialized as numbers (floats) by serde.
+ */
+export interface BoundingBox {
+	/** Left x-coordinate */
+	x0: number;
+	/** Bottom y-coordinate */
+	y0: number;
+	/** Right x-coordinate */
+	x1: number;
+	/** Top y-coordinate */
+	y1: number;
+}
+
+/**
+ * Metadata for a semantic element.
+ *
+ * Contains optional contextual information about the element including its page location,
+ * source filename, bounding box coordinates, and custom metadata fields.
+ *
+ * WASM serialization note: Field names keep Rust's snake_case; optional fields may be omitted (serde skip_serializing_if).
+ */
+export interface ElementMetadata {
+	/** Page number (1-indexed) */
+	page_number?: number | null;
+	/** Source filename or document name */
+	filename?: string | null;
+	/** Bounding box coordinates if available */
+	coordinates?: BoundingBox | null;
+	/** Position index in the element sequence */
+	element_index?: number | null;
+	/** Additional custom metadata fields */
+	additional?: Record<string, string>;
+}
+
+/**
+ * Semantic element extracted from document.
+ *
+ * Represents a logical unit of content with semantic classification, unique identifier,
+ * and metadata for tracking origin and position. Compatible with Unstructured.io element
+ * format when using element-based output.
+ *
+ * This type mirrors the serde-serialized form of the Rust Element struct and includes:
+ * - A deterministic element ID based on content and location
+ * - Semantic type classification for downstream processing
+ * - Full text content
+ * - Comprehensive metadata including page numbers and coordinates
+ *
+ * WASM serialization note: All fields are serialized directly from Rust types with snake_case
+ * field name transformation applied by serde.
+ */
+export interface Element {
+	/** Unique element identifier (deterministic hash-based ID) */
+	element_id: string;
+	/** Semantic type classification */
+	element_type: ElementType;
+	/** Text content of the element */
+	text: string;
+	/** Metadata about the element including page number, coordinates, etc. */
+	metadata: ElementMetadata;
+}
+
 /**
  * Result of document extraction
  */
@@ -236,6 +324,8 @@ export interface ExtractionResult {
 	pages?: PageContent[] | null;
 	/** Extracted keywords when keyword extraction is enabled */
 	keywords?: ExtractedKeyword[] | null;
+	/** Semantic elements when element-based output format is used */
+	elements?: Element[] | null;
 }
 
 /**
-Original file line number
+Diff line change
@@ Expand Up @@
             chunks,
             images: None,
             pages: None,
+            elements: None,
         }
     }
@@ Expand Down @@