Skip to content

Commit deca129

Browse files
authored
Merge pull request #311 from kreuzberg-dev/feature/unstructured-compatibility
feat: add comprehensive element-based output support
2 parents 38785b0 + cdad394 commit deca129

File tree

107 files changed

+6852
-585
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

107 files changed

+6852
-585
lines changed

.github/workflows/ci-ruby.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -656,17 +656,17 @@ jobs:
656656
cd packages/ruby
657657
echo "lib directory contents:"
658658
if [ -d "lib" ]; then
659-
find lib -type f | head -20
659+
find lib -type f | head -20 || true
660660
else
661661
echo "ERROR: lib directory not found"
662662
fi
663663
echo ""
664664
echo "=== Looking for compiled extension ==="
665-
find . \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" \) 2>/dev/null | head -20
665+
find . \( -name "*.so" -o -name "*.dll" -o -name "*.dylib" \) 2>/dev/null | head -20 || true
666666
echo ""
667667
if [ -f "mkmf.log" ]; then
668668
echo "=== mkmf.log (last 100 lines) ==="
669-
tail -100 mkmf.log
669+
tail -100 mkmf.log || true
670670
fi
671671
672672
- name: Build kreuzberg CLI binary

CHANGELOG.md

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,90 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
## [Unreleased]
1111

12+
### Added
13+
14+
#### API
15+
- **POST /chunk endpoint**: New text chunking endpoint for breaking text into smaller pieces
16+
- Accepts JSON body with `text`, `chunker_type` (text/markdown), and optional `config`
17+
- Returns chunks with byte offsets, indices, and metadata
18+
- Configuration options: `max_characters` (default: 2000), `overlap` (default: 100), `trim` (default: true)
19+
- Supports both text and markdown chunking strategies
20+
- Case-insensitive `chunker_type` parameter
21+
- Comprehensive error handling for invalid inputs
22+
23+
#### Core
24+
- **Element-based output format**: New `OutputFormat::ElementBased` option provides Unstructured.io-compatible semantic element extraction
25+
- Extracts structured elements: titles, paragraphs, lists, tables, images, page breaks, headings, code blocks, block quotes, headers, footers
26+
- Each element includes rich metadata: bounding boxes, page numbers, confidence scores, hierarchy information
27+
- Transformation pipeline converts unified output to element-based format via `extraction::transform` module
28+
- Added `Element`, `ElementType`, `ElementMetadata`, and `BoundingBox` types to core types module
29+
- Supports PDF hierarchy detection for semantic heading levels
30+
- Configuration via `config.output_format` field (defaults to `Unified`)
31+
32+
#### Language Bindings
33+
- **Python**: Element-based output support with full type hints
34+
- New `output_format` parameter in extraction config accepting `"unified"` or `"element_based"`
35+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` types exported from `kreuzberg.types`
36+
- Result includes `elements` field when using element-based format
37+
- Compatible with Unstructured.io API for migration
38+
39+
- **TypeScript/Node.js**: Element-based output with strict TypeScript interfaces
40+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` interfaces in `@kreuzberg/core`
41+
- `outputFormat: "unified" | "element_based"` configuration option
42+
- Result type includes optional `elements` array
43+
44+
- **Ruby**: Element-based output with idiomatic Ruby types
45+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes in `Kreuzberg::Types`
46+
- Snake_case serialization for Ruby conventions
47+
- `output_format: :unified` or `:element_based` symbol-based configuration
48+
49+
- **PHP**: Element-based output with typed classes
50+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes in `Kreuzberg\Types`
51+
- `outputFormat` field in extraction config
52+
- `$result->elements` array when using element-based format
53+
54+
- **Go**: Element-based output with idiomatic Go structs
55+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` types with JSON tags
56+
- `OutputFormat` field in extraction config
57+
- Result struct includes `Elements` slice
58+
59+
- **Java**: Element-based output with builder pattern
60+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes with builders
61+
- `outputFormat` field in `ExtractionConfig`
62+
- `ExtractionResult.getElements()` method
63+
64+
- **C#**: Element-based output with nullable reference types
65+
- `Element`, `ElementType`, `ElementMetadata`, `BoundingBox` classes
66+
- `OutputFormat` property in extraction config
67+
- `ExtractionResult.Elements` property
68+
69+
- **Elixir**: Element-based output with pattern matching
70+
- `Kreuzberg.Element` module with typespecs
71+
- `:output_format` option in config accepting `:unified` or `:element_based`
72+
- Result map includes `:elements` key with element list
73+
74+
- **WASM**: Element-based output with TypeScript definitions
75+
- Element types exported to WASM TypeScript bindings
76+
- `output_format` configuration option
77+
- Elements accessible from extraction result
78+
79+
#### Documentation
80+
- **Migration guides**: New documentation for Unstructured.io users
81+
- `docs/migration/from-unstructured.md`: Step-by-step migration guide with code examples
82+
- `docs/comparisons/kreuzberg-vs-unstructured.md`: Feature comparison and compatibility matrix
83+
- Element-based output guide: `docs/guides/element-based-output.md` covering all 11 element types
84+
- Type reference updates: Added Element, ElementType, ElementMetadata, BoundingBox, OutputFormat
85+
- Code snippets for element-based extraction in all 10 languages
86+
87+
### Fixed
88+
89+
#### Python
90+
- **Type exports**: Fixed missing type exports in `kreuzberg.types.__all__`
91+
- Added `Element`, `ElementMetadata`, `ElementType`, `BoundingBox` to exported types
92+
- Added `HtmlImageMetadata` for HTML image metadata
93+
- A total of 32 public types are now properly exported for IDE autocomplete and type checking
94+
- Resolves import failures where types were defined but not accessible
95+
1296
---
1397

1498
## [4.0.8] - 2026-01-17

crates/kreuzberg-ffi/benches/result_view_benchmark.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ fn create_test_result(content_size: usize, chunk_count: usize) -> ExtractionResu
7070
chunks,
7171
images: None,
7272
pages: None,
73+
elements: None,
7374
}
7475
}
7576

crates/kreuzberg-ffi/src/helpers.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ pub fn to_c_extraction_result(result: ExtractionResult) -> std::result::Result<*
6666
chunks,
6767
images,
6868
pages,
69+
elements: _,
6970
} = result;
7071

7172
let sanitized_content = if content.contains('\0') {
@@ -345,6 +346,7 @@ mod tests {
345346
chunks: None,
346347
images: None,
347348
pages: None,
349+
elements: None,
348350
};
349351

350352
let c_result = to_c_extraction_result(result);
@@ -382,6 +384,7 @@ mod tests {
382384
chunks: None,
383385
images: None,
384386
pages: None,
387+
elements: None,
385388
};
386389

387390
let c_result = to_c_extraction_result(result);
@@ -429,6 +432,7 @@ mod tests {
429432
chunks: None,
430433
images: None,
431434
pages: None,
435+
elements: None,
432436
};
433437

434438
let c_result = to_c_extraction_result(result);
@@ -506,6 +510,7 @@ mod tests {
506510
chunks: Some(vec![chunk]),
507511
images: None,
508512
pages: None,
513+
elements: None,
509514
};
510515

511516
let c_result = to_c_extraction_result(result);

crates/kreuzberg-ffi/src/plugins/ocr_backend.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ impl OcrBackend for FfiOcrBackend {
167167
chunks: None,
168168
images: None,
169169
pages: None,
170+
elements: None,
170171
})
171172
}
172173

crates/kreuzberg-ffi/src/result.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,7 @@ mod tests {
399399
]),
400400
images: None,
401401
pages: None,
402+
elements: None,
402403
}
403404
}
404405

crates/kreuzberg-ffi/src/result_view.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ mod tests {
453453
]),
454454
images: None,
455455
pages: None,
456+
elements: None,
456457
}
457458
}
458459

@@ -717,6 +718,7 @@ mod tests {
717718
chunks: None,
718719
images: None,
719720
pages: None,
721+
elements: None,
720722
};
721723

722724
let result_ptr = &result as *const ExtractionResult;

crates/kreuzberg-node/src/lib.rs

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ use kreuzberg::{
2424
Chunk as RustChunk, ChunkMetadata as RustChunkMetadata, ChunkingConfig as RustChunkingConfig,
2525
EmbeddingConfig as RustEmbeddingConfig, EmbeddingModelType as RustEmbeddingModelType, ExtractionConfig,
2626
ExtractionResult as RustExtractionResult, ImageExtractionConfig as RustImageExtractionConfig, KNOWN_FORMATS,
27-
LanguageDetectionConfig as RustLanguageDetectionConfig, OcrConfig as RustOcrConfig, PdfConfig as RustPdfConfig,
28-
PostProcessorConfig as RustPostProcessorConfig, TesseractConfig as RustTesseractConfig,
27+
LanguageDetectionConfig as RustLanguageDetectionConfig, OcrConfig as RustOcrConfig, OutputFormat,
28+
PdfConfig as RustPdfConfig, PostProcessorConfig as RustPostProcessorConfig, TesseractConfig as RustTesseractConfig,
2929
TokenReductionConfig as RustTokenReductionConfig,
3030
};
3131
use lazy_static::lazy_static;
@@ -1277,6 +1277,7 @@ impl TryFrom<JsExtractionConfig> for ExtractionConfig {
12771277
html_options,
12781278
max_concurrent_extractions: val.max_concurrent_extractions.map(|v| v as usize),
12791279
pages: val.pages.map(|p| p.try_into()).transpose()?,
1280+
output_format: OutputFormat::Unified,
12801281
})
12811282
}
12821283
}
@@ -1898,6 +1899,7 @@ impl TryFrom<JsExtractionResult> for RustExtractionResult {
18981899
chunks,
18991900
images,
19001901
pages: None,
1902+
elements: None,
19011903
})
19021904
}
19031905
}
@@ -2949,6 +2951,7 @@ impl RustOcrBackend for JsOcrBackend {
29492951
chunks: None,
29502952
images: None,
29512953
pages: None,
2954+
elements: None,
29522955
})
29532956
}
29542957

crates/kreuzberg-php/src/extraction.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ pub fn kreuzberg_extract_bytes(
173173
chunks: None,
174174
images: None,
175175
pages: None,
176+
elements: None,
176177
};
177178

178179
return ExtractionResult::from_rust(rust_result);

crates/kreuzberg-py/src/config.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ impl ExtractionConfig {
9292
html_options: html_options_inner,
9393
max_concurrent_extractions,
9494
pages: pages.map(Into::into),
95+
output_format: Default::default(),
9596
},
9697
html_options_dict,
9798
})

0 commit comments

Comments
 (0)