kreuzberg-dev
diff --git a/‎.github/actions/install-system-deps/action.yml‎
Lines changed: 1 addition & 19 deletions b/‎.github/actions/install-system-deps/action.yml‎
Lines changed: 1 addition & 19 deletions
diff --git a/‎CHANGELOG.md‎
Lines changed: 57 additions & 51 deletions b/‎CHANGELOG.md‎
Lines changed: 57 additions & 51 deletions
diff --git a/‎Cargo.lock‎
Lines changed: 8 additions & 25 deletions b/‎Cargo.lock‎
Lines changed: 8 additions & 25 deletions
diff --git a/‎README.md‎
Lines changed: 1 addition & 1 deletion b/‎README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/kreuzberg-cli/README.md‎
Lines changed: 0 additions & 7 deletions b/‎crates/kreuzberg-cli/README.md‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎crates/kreuzberg-node/README.md‎
Lines changed: 3 additions & 5 deletions b/‎crates/kreuzberg-node/README.md‎
Lines changed: 3 additions & 5 deletions
diff --git a/‎crates/kreuzberg-node/typescript/errors.ts‎
Lines changed: 2 additions & 3 deletions b/‎crates/kreuzberg-node/typescript/errors.ts‎
Lines changed: 2 additions & 3 deletions
diff --git a/‎crates/kreuzberg-node/typescript/index.ts‎
Lines changed: 1 addition & 1 deletion b/‎crates/kreuzberg-node/typescript/index.ts‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/kreuzberg-py/README.md‎
Lines changed: 1 addition & 1 deletion b/‎crates/kreuzberg-py/README.md‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎crates/kreuzberg-wasm/src/errors.rs‎
Lines changed: 1 addition & 1 deletion b/‎crates/kreuzberg-wasm/src/errors.rs‎
Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 name: Install System Dependencies
 description: |
   Install and cache platform-specific dependencies required for document conversion.
-  Includes: Tesseract OCR, LibreOffice, fonts, and build tools.
+  Includes: Tesseract OCR, fonts, and build tools.
   Features robust caching with architecture/version awareness, timeout handling, and retry logic.
 
 inputs:
@@ -35,14 +35,6 @@ runs:
           tesseract-macos-${{ runner.arch }}-v5-
           tesseract-macos-${{ runner.arch }}-
 
-    - name: Cache LibreOffice (macOS)
-      if: runner.os == 'macOS'
-      id: cache-libreoffice-macos
-      uses: actions/cache@v5
-      with:
-        path: /Applications/LibreOffice.app
-        key: libreoffice-macos-${{ runner.arch }}-v3
-
     - name: Install dependencies (macOS)
       if: runner.os == 'macOS'
       shell: bash
@@ -86,16 +78,6 @@ runs:
         restore-keys: |
           tesseract-windows-${{ runner.arch }}-
 
-    - name: Cache LibreOffice (Windows)
-      if: runner.os == 'Windows'
-      id: cache-libreoffice-windows
-      uses: actions/cache@v5
-      with:
-        path: |
-          C:\Program Files\LibreOffice
-          C:\ProgramData\chocolatey\lib\libreoffice
-        key: libreoffice-windows-${{ runner.arch }}-v3
-
     - name: Cache LLVM (Windows)
       if: runner.os == 'Windows'
       id: cache-llvm-windows
 
@@ -7,73 +7,88 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ---
 
-## [4.2.15] - 2026-02-08
+## [Unreleased]
 
 ### Added
 
-#### Agent Skill for AI Coding Assistants
-
-- **Agent Skill for document extraction**: Added `skills/kreuzberg/SKILL.md` following the [Agent Skills](https://agentskills.io) open standard, with comprehensive instructions for Python, Node.js, Rust, and CLI usage. Includes 8 detailed reference files covering API signatures, configuration, supported formats, plugins, and all language bindings. Works with Claude Code, Codex, Gemini CLI, Cursor, VS Code, Amp, Goose, Roo Code, and any compatible tool.
+#### PaddleOCR Backend
+- **PaddleOCR backend via ONNX Runtime**: New OCR backend (`kreuzberg-paddle-ocr`) using PaddlePaddle's PP-OCRv4 models converted to ONNX format, run via ONNX Runtime. Supports 6 languages (English, Chinese, Japanese, Korean, German, French) with automatic model downloading and caching. Provides superior CJK recognition compared to Tesseract.
+- **PaddleOCR support in all bindings**: Available across Python, Rust, TypeScript/Node.js, Go, Java, PHP, Ruby, C#, and Elixir bindings via the `paddle-ocr` feature flag.
+- **PaddleOCR CLI support**: The `kreuzberg-cli` binary supports `--ocr-backend paddle-ocr` for PaddleOCR extraction.
 
-#### MIME Type Mappings
-- Added `.docbook` (`application/docbook+xml`) and `.jats` (`application/x-jats+xml`) file extension mappings.
+#### Unified OCR Element Output
+- **Structured OCR element data**: Extraction results now include `OcrElement` data with bounding geometry (rectangles and quadrilaterals), per-element confidence scores, rotation information, and hierarchical levels (word, line, block, page). Available from both PaddleOCR and Tesseract backends.
 
-### Added
+#### Shared ONNX Runtime Discovery
+- **`ort_discovery` module**: Finds ONNX Runtime shared libraries across platforms, shared between PaddleOCR and future ONNX-based backends.
 
-#### OCR
-- **PaddleOCR backend via ONNX Runtime**: Added a new OCR backend (`kreuzberg-paddle-ocr`) using PaddlePaddle's PP-OCRv4 models converted to ONNX format, run via ONNX Runtime. Supports 6 languages (English, Chinese, Japanese, Korean, German, French) with automatic model downloading and caching. Provides superior CJK recognition compared to Tesseract.
-- **Unified OCR element output architecture**: Extraction results now include structured `OcrElement` data with bounding geometry (rectangles and quadrilaterals), per-element confidence scores, rotation information, and hierarchical levels (word, line, block, page). Available from both PaddleOCR and Tesseract backends.
-- **PaddleOCR support in all bindings**: PaddleOCR is available across Python, Rust, TypeScript/Node.js, Go, Java, PHP, Ruby, C#, and Elixir bindings via the `paddle-ocr` feature flag.
-- **PaddleOCR CLI support**: The `kreuzberg-cli` binary supports `--ocr-backend paddle-ocr` for PaddleOCR extraction.
-- **Shared ORT discovery**: Added `ort_discovery` module for finding ONNX Runtime shared libraries across platforms, shared between PaddleOCR and future ONNX-based backends.
-- **PaddleOCR model setup GitHub Action**: Added `.github/actions/setup-paddle-ocr-models/` action for CI pipelines to download and cache PaddleOCR model files.
+#### Document Structure Output
+- **`DocumentStructure` support across all bindings**: Added structured document output with the `include_document_structure` configuration option to the Python, TypeScript/Node.js, Go, Java, PHP, Ruby, C#, Elixir, and WASM bindings.
 
-#### CI
-- **PaddleOCR CI integration**: Added PaddleOCR to the CI/publish pipelines with dedicated test jobs and model caching.
+#### Native DOC/PPT Extraction
+- **OLE/CFB-based extraction**: Added native DOC and PPT extraction via OLE/CFB binary parsing. Legacy Office formats no longer require any external tools.
 
 #### musl Linux Support
 - **Re-enabled musl targets**: Added `x86_64-unknown-linux-musl` and `aarch64-unknown-linux-musl` targets for CLI binaries, Python wheels (musllinux), and Node.js native bindings. Resolves glibc 2.38+ requirement for prebuilt CLI binaries on older distros like Ubuntu 22.04 (#364).
-- **musl CI workflows**: Added dedicated `ci-musl.yaml` workflow for CLI musl build validation with Alpine container smoke tests, and musllinux Python wheel builds to `ci-python.yaml`.
-- **PDFium musl awareness**: Build script now downloads musl-specific PDFium binaries and uses `libstdc++` consistently for all Linux targets (including musl).
-- **musl C++ cross-compilation**: Added `resolve_cxx_compiler()` and `create_musl_cxx_wrapper()` to `kreuzberg-tesseract` build script for correct C++ header resolution when cross-compiling from glibc host to musl target. Skips `-ldl` linking on musl (not available/needed).
-
-#### Build System
-- **Tesseract 5.5.2**: Bumped vendored Tesseract from 5.5.1 to 5.5.2 with `BUILD_TESSERACT_BINARY=OFF` to skip unnecessary binary compilation.
-- **Leptonica 1.87.0**: Bumped vendored Leptonica from 1.86.0 to 1.87.0.
-- **ONNX Runtime 1.24.1**: Bumped ONNX Runtime from 1.23.2 to 1.24.1.
-- **Dead code cleanup**: Removed unused EMSDK constants and `apply_patches()` function from `kreuzberg-tesseract` build script.
-
-### Removed
-
-#### Node.js Bindings
-- **Guten OCR references**: Removed all references to the unused Guten OCR backend. Renamed `KREUZBERG_DEBUG_GUTEN` env var to `KREUZBERG_DEBUG_OCR`.
-
-#### PHP Bindings
-- **Guten OCR backend option**: Removed `'guten'` from the documented backend choices in `OcrConfig`.
 
 ### Fixed
 
-#### PaddleOCR Recognition Model Shape Inference
-- Fixed PaddleOCR recognition model (`en_PP-OCRv4_rec_infer.onnx`) failing to load with `ShapeInferenceError` on ONNX Runtime 1.23.x. A `Squeeze` node incorrectly reduced a rank-1 tensor to a scalar before a `Concat` operation. The fixed model has been re-uploaded to the HuggingFace model repository.
+#### MSG Extraction Hang on Large Attachments (#372)
+- Fixed `.msg` (Outlook) extraction hanging indefinitely on files with large attachments. Replaced the `msg_parser` crate with direct OLE/CFB parsing using the `cfb` crate — attachment binary data is now read directly without hex-encoding overhead.
+- Added lenient FAT padding for MSG files with truncated sector tables produced by some Outlook versions.
+
+#### Rotated PDF Text Extraction
+- Fixed text extraction returning empty content for PDFs with 90° or 270° page rotation. Kreuzberg now strips `/Rotate` entries from page dictionaries before loading, restoring correct text extraction for all rotation angles.
 
 #### CSV and Excel Extraction Quality
 - Fixed CSV extraction producing near-zero quality scores (0.024) by outputting proper delimited text instead of debug format.
 - Fixed Excel extraction producing low quality scores (0.22) by outputting clean tab/newline-delimited cell text.
 
-#### Native DOC/PPT Extraction
-- Added native DOC and PPT extraction via OLE/CFB parsing, replacing the LibreOffice subprocess dependency for legacy Office formats.
-
 #### XML Extraction Quality
 - Improved XML text extraction to better handle namespaced elements, CDATA sections, and mixed content, improving quality scores.
 
 #### WASM Table Extraction
 - Fixed WASM adapter not recognizing `page_number` field (snake_case) from Rust FFI, causing table data to be silently dropped in Deno and Cloudflare Workers tests.
 
-#### Ruby CI ONNX Runtime Discovery
-- Fixed Ruby E2E tests failing with `dlopen failed` for `libonnxruntime.so` by adding ONNX Runtime setup and library path export to the Ruby CI test job.
+#### PaddleOCR Recognition Model
+- Fixed PaddleOCR recognition model (`en_PP-OCRv4_rec_infer.onnx`) failing to load with `ShapeInferenceError` on ONNX Runtime 1.23.x.
+- Fixed incorrect detection model filename in Docker and CI action (`en_PP-OCRv4_det_infer.onnx` → `ch_PP-OCRv4_det_infer.onnx`).
+
+#### Python Bindings
+- Fixed `OcrConfig` constructor silently ignoring `paddle_ocr_config` and `element_config` keyword arguments.
+
+### Changed
+
+#### Build System
+- Bumped ONNX Runtime from 1.23.2 to 1.24.1 across CI, Docker images, and documentation.
+- Bumped vendored Tesseract from 5.5.1 to 5.5.2.
+- Bumped vendored Leptonica from 1.86.0 to 1.87.0.
+
+### Removed
+
+#### LibreOffice Dependency
+- **LibreOffice is no longer required**: Legacy .doc and .ppt files are now extracted natively via OLE/CFB parsing. LibreOffice has been removed from Docker images, CI pipelines, and system dependency requirements, reducing the full Docker image size by ~500-800MB. Users on Kreuzberg <4.3 still need LibreOffice for these formats.
+
+#### `msg_parser` Dependency
+- Replaced the `msg_parser` crate with direct CFB parsing for MSG extraction. This eliminates hex-encoding overhead and reduces the dependency count.
+
+#### Guten OCR Backend
+- Removed all references to the unused Guten OCR backend from Node.js and PHP bindings. Renamed `KREUZBERG_DEBUG_GUTEN` env var to `KREUZBERG_DEBUG_OCR`.
 
-#### Java E2E Test Compilation
-- Fixed Java E2E helper compilation errors caused by `Metadata` type not being directly castable to `Map` and `Element.getType()` method not existing. Updated to use `Metadata.getAdditional()` and `Element.getElementType()`.
+---
+
+## [4.2.15] - 2026-02-08
+
+### Added
+
+#### Agent Skill for AI Coding Assistants
+
+- **Agent Skill for document extraction**: Added `skills/kreuzberg/SKILL.md` following the [Agent Skills](https://agentskills.io) open standard, with comprehensive instructions for Python, Node.js, Rust, and CLI usage. Includes 8 detailed reference files covering API signatures, configuration, supported formats, plugins, and all language bindings. Works with Claude Code, Codex, Gemini CLI, Cursor, VS Code, Amp, Goose, Roo Code, and any compatible tool.
+
+#### MIME Type Mappings
+- Added `.docbook` (`application/docbook+xml`) and `.jats` (`application/x-jats+xml`) file extension mappings.
+
+### Fixed
 
 #### ODT List and Section Extraction
 - Fixed ODT extractor not handling `text:list` and `text:section` elements. Documents containing bulleted/numbered lists or sections returned empty content.
@@ -99,17 +114,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 #### PDF Error Handling Regression
 - Reverted incorrect change from v4.2.14 that silently returned empty results for corrupted/malformed PDFs instead of propagating errors. Corrupted PDFs now correctly return `PdfError::InvalidPdf` and password-protected PDFs return `PdfError::PasswordRequired` as expected.
 
-#### PaddleOCR Model URLs
-- Fixed incorrect detection model filename in Docker and CI action (`en_PP-OCRv4_det_infer.onnx` → `ch_PP-OCRv4_det_infer.onnx`).
-
-#### Python Bindings
-- Fixed `OcrConfig` constructor silently ignoring `paddle_ocr_config` and `element_config` keyword arguments.
-
 ### Changed
 
-#### ONNX Runtime
-- Bumped ONNX Runtime from 1.23.2 to 1.24.1 across CI, Docker images, and documentation. Minimum supported ORT version is 1.23+.
-
 #### API Parity
 - Added `security_limits` field to all 9 language bindings (TypeScript, Go, Python, Ruby, PHP, Java, C#, WASM, Elixir) for API parity with Rust core `ExtractionConfig`.
 
 
@@ -91,7 +91,7 @@ Each language binding provides comprehensive documentation with examples and bes
 - **[Rust](https://github.com/kreuzberg-dev/kreuzberg/tree/main/crates/kreuzberg)** – Core library, flexible feature flags, zero-copy APIs
 
 **Containers:**
-- **[Docker](https://docs.kreuzberg.dev/guides/docker/)** – Official images with API, CLI, and MCP server modes (Core: ~1.0-1.3GB, Full: ~1.5-2.1GB with LibreOffice)
+- **[Docker](https://docs.kreuzberg.dev/guides/docker/)** – Official images with API, CLI, and MCP server modes (Core: ~1.0-1.3GB, Full: ~1.0-1.3GB with OCR + legacy format support)
 
 **Command-Line:**
 - **[CLI](https://docs.kreuzberg.dev/cli/usage/)** – Cross-platform binary, batch processing, MCP server mode
 
@@ -84,13 +84,6 @@ To enable optical character recognition for scanned documents:
 - **Ubuntu/Debian**: `sudo apt-get install tesseract-ocr`
 - **Windows**: Download from [tesseract-ocr/tesseract](https://github.com/tesseract-ocr/tesseract)
 
-#### Legacy Office Format Support (Optional)
-
-For `.doc` and `.ppt` file extraction:
-
-- **macOS**: `brew install libreoffice`
-- **Ubuntu/Debian**: `sudo apt-get install libreoffice`
-
 ## Quick Start
 
 > The CLI is available for Linux (x86_64/aarch64), macOS (Apple Silicon), and Windows with consistent behavior across all platforms.
 
@@ -98,12 +98,10 @@ yarn add @kreuzberg/node
 - Optional: [ONNX Runtime](https://github.com/microsoft/onnxruntime/releases) version 1.23+ for embeddings support
 - Optional: [Tesseract OCR](https://github.com/tesseract-ocr/tesseract) for OCR functionality
 
-- Optional: [LibreOffice](https://www.libreoffice.org/download/download/) for legacy Office formats (DOC, XLS, PPT, RTF, ODT, ODS, ODP)
-
 **Format Support Notes:**
-- Modern Office formats (DOCX, XLSX, PPTX) work without LibreOffice
-- Legacy formats (DOC, XLS, PPT) require LibreOffice installation
-- WASM binding supports DOCX, XLSX, PPTX, and ODT (no LibreOffice required)
+- Legacy formats (DOC, XLS, PPT) are now extracted natively without external tools
+- Modern Office formats (DOCX, XLSX, PPTX) are fully supported
+- WASM binding supports all document formats via in-memory parsing
 
 
 
 
@@ -380,7 +380,6 @@ export class PluginError extends KreuzbergError {
  * Error thrown when a required system dependency is missing.
  *
  * Missing dependency errors occur when external tools or libraries are not available, such as:
- * - LibreOffice (for DOC/PPT/XLS files)
  * - Tesseract OCR (for OCR processing)
  * - ImageMagick (for image processing)
  * - Poppler (for PDF rendering)
@@ -390,11 +389,11 @@ export class PluginError extends KreuzbergError {
  * import { extractFile, MissingDependencyError } from '@kreuzberg/node';
  *
  * try {
- *   const result = await extractFile('document.doc');
+ *   const result = await extractFile('document.pdf');
  * } catch (error) {
  *   if (error instanceof MissingDependencyError) {
  *     console.error('Missing dependency:', error.message);
- *     console.log('Please install LibreOffice to process DOC files');
+ *     console.log('Please install Tesseract OCR for OCR processing');
  *   }
  * }
  * ```
 
@@ -36,7 +36,7 @@
  *
  * ## Supported Formats
  *
- * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)
+ * - **Documents**: PDF, DOCX, PPTX, XLSX, DOC, PPT
  * - **Text**: Markdown, Plain Text, XML
  * - **Web**: HTML (converted to Markdown)
  * - **Data**: JSON, YAML, TOML
 
@@ -38,7 +38,7 @@ result = await extract_file("document.pdf")          # 140ms
 result = extract_file_sync("document.pdf")           # 140ms
 ```
 
-**Why?** The subprocess call (pdftotext, libreoffice) accounts for 95-99% of time. With only one file, there's nothing to do concurrently, so async provides no benefit.
+**Why?** The extraction call accounts for 95-99% of the total time. With only one file, there's nothing to do concurrently, so async provides no benefit.
 
 ### Batch/Concurrent Processing
 
 
@@ -161,7 +161,7 @@ mod tests {
 
     #[wasm_bindgen_test]
     fn test_convert_error_missing_dependency_returns_jsvalue() {
-        let err = KreuzbergError::MissingDependency("libreoffice".to_string());
+        let err = KreuzbergError::MissingDependency("tesseract".to_string());
         let result = convert_error(err);
 
         assert!(!result.is_null());
Original file line number	Diff line number	Diff line change
`@@ -36,7 +36,7 @@`
`36`	`36`	`*`
`37`	`37`	`* ## Supported Formats`
`38`	`38`	`*`
`39`		`- * - Documents: PDF, DOCX, PPTX, XLSX, DOC, PPT (with LibreOffice)`
	`39`	`+ * - Documents: PDF, DOCX, PPTX, XLSX, DOC, PPT`
`40`	`40`	`* - Text: Markdown, Plain Text, XML`
`41`	`41`	`* - Web: HTML (converted to Markdown)`
`42`	`42`	`* - Data: JSON, YAML, TOML`