kreuzberg-dev
diff --git a/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎crates/kreuzberg/src/extraction/xml.rs‎
Lines changed: 53 additions & 4 deletions b/‎crates/kreuzberg/src/extraction/xml.rs‎
Lines changed: 53 additions & 4 deletions
diff --git a/‎crates/kreuzberg/src/pdf/text.rs‎
Lines changed: 31 additions & 0 deletions b/‎crates/kreuzberg/src/pdf/text.rs‎
Lines changed: 31 additions & 0 deletions
diff --git a/‎packages/r/DESCRIPTION‎
Lines changed: 2 additions & 2 deletions b/‎packages/r/DESCRIPTION‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎packages/r/NAMESPACE‎
Lines changed: 42 additions & 66 deletions b/‎packages/r/NAMESPACE‎
Lines changed: 42 additions & 66 deletions
diff --git a/‎packages/r/R/extendr-wrappers.R‎
Lines changed: 23 additions & 24 deletions b/‎packages/r/R/extendr-wrappers.R‎
Lines changed: 23 additions & 24 deletions
diff --git a/‎packages/r/R/kreuzberg-package.R‎
Lines changed: 6 additions & 0 deletions b/‎packages/r/R/kreuzberg-package.R‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎packages/r/src/rust/src/helpers.rs‎
Lines changed: 6 additions & 1 deletion b/‎packages/r/src/rust/src/helpers.rs‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎test_documents/xlsx/data-with-macros.xla‎
48.5 KB b/‎test_documents/xlsx/data-with-macros.xla‎
48.5 KB
diff --git a/‎test_documents/xlsx/data-with-macros.xlam‎
166 KB b/‎test_documents/xlsx/data-with-macros.xlam‎
166 KB
@@ -22,6 +22,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Fixed
 
+- **XML UTF-16 parsing fails on files with odd byte count**: The XML extractor rejected valid UTF-16 encoded files that had a trailing odd byte (e.g. `factbook-utf-16.xml`) with "Invalid UTF-16: odd byte count". The decoder now truncates to the nearest even byte boundary, matching the lenient approach already used in email extraction.
+- **R bindings crash on strings with embedded NUL bytes**: Extraction results containing NUL (`\0`) characters (e.g. from RTF files) caused the R FFI layer to error with "embedded nul in string" since R strings are C-based. NUL bytes are now stripped before passing strings to R.
+- **R bindings `%||%` operator incompatible with R < 4.4**: The R package used the `%||%` null-coalescing operator which is only available in base R >= 4.4, but the package declares `R >= 4.2`. Added a package-local polyfill for backwards compatibility.
 - **API returns HTTP 500 for unsupported file formats** (#414): Uploading files with unsupported or undetectable MIME types (e.g. DOCX via `curl -F`) returned HTTP 500 Internal Server Error instead of HTTP 400 Bad Request. The `/extract` endpoint now falls back to extension-based MIME detection from the filename when the client sends `application/octet-stream`, and `UnsupportedFormat` errors are mapped to HTTP 400 with a clear `UnsupportedFormatError` response.
 - **PDF markdown extraction missing headings/bold for flat structure trees** (#391): PDFs where the structure tree tags everything as `<P>` (common with Adobe InDesign) now produce proper headings and bold text. The structure tree path previously bypassed font-size-based heading classification entirely. Pages with font size variation but no heading tags are now enriched via K-means font-size clustering. Additionally, bold detection now recognizes fonts with "Bold" in the name (e.g. `MyriadPro-Bold`) even when the PDF doesn't set the font weight descriptor.
 - **PaddleOCR backend not found when using `backend="paddleocr"`** (#403): The PaddleOCR backend registered itself as `"paddle-ocr"` but users and documentation use `"paddleocr"`. The OCR backend registry now resolves the `"paddleocr"` alias to the canonical `"paddle-ocr"` name.
 
@@ -200,11 +200,10 @@ pub fn parse_xml(xml_bytes: &[u8], preserve_whitespace: bool) -> Result<XmlExtra
 
 /// Decode UTF-16 bytes (with BOM) to UTF-8 bytes.
 fn decode_utf16_to_utf8(data: &[u8], big_endian: bool) -> Result<Vec<u8>> {
-    // Skip BOM (first 2 bytes)
+    // Skip BOM (first 2 bytes) and truncate to even length
     let data = &data[2..];
-    if !data.len().is_multiple_of(2) {
-        return Err(KreuzbergError::parsing("Invalid UTF-16: odd byte count".to_string()));
-    }
+    let even_len = data.len() & !1;
+    let data = &data[..even_len];
 
     let u16_iter = data.chunks_exact(2).map(|chunk| {
         if big_endian {
@@ -463,4 +462,54 @@ mod tests {
         let result = parse_xml(xml, false);
         let _ = result;
     }
+
+    #[test]
+    fn test_utf16_le_xml() {
+        // Little-endian BOM (FF FE) followed by "<r>A</r>" as UTF-16 LE code units.
+        let xml: Vec<u8> = [0xFF, 0xFE]
+            .into_iter()
+            .chain("<r>A</r>".encode_utf16().flat_map(u16::to_le_bytes))
+            .collect();
+        let parsed = parse_xml(&xml, false).unwrap();
+        assert!(parsed.content.contains("A"));
+    }
+
+    #[test]
+    fn test_utf16_be_xml() {
+        // Big-endian BOM (FE FF) followed by "<r>B</r>" as UTF-16 BE code units.
+        let xml: Vec<u8> = [0xFE, 0xFF]
+            .into_iter()
+            .chain("<r>B</r>".encode_utf16().flat_map(u16::to_be_bytes))
+            .collect();
+        let parsed = parse_xml(&xml, false).unwrap();
+        assert!(parsed.content.contains("B"));
+    }
+
+    #[test]
+    fn test_utf16_odd_byte_count_truncates_gracefully() {
+        // Little-endian BOM, then "<r>X</r>" in UTF-16 LE, then one stray byte
+        // that leaves the payload with an odd length.
+        let xml: Vec<u8> = [0xFF, 0xFE]
+            .into_iter()
+            .chain("<r>X</r>".encode_utf16().flat_map(u16::to_le_bytes))
+            .chain(std::iter::once(0x0A))
+            .collect();
+        let parsed = parse_xml(&xml, false).unwrap();
+        assert!(parsed.content.contains("X"));
+    }
+
+    #[test]
+    fn test_utf16_factbook_file() {
+        // Exercises a real UTF-16 fixture; the test does nothing when the
+        // vendored file is not present on disk.
+        let fixture = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
+            .join("../../test_documents/vendored/unstructured/xml/factbook-utf-16.xml");
+        if !fixture.exists() {
+            return;
+        }
+
+        let bytes = std::fs::read(&fixture).unwrap();
+        let parsed = parse_xml(&bytes, false).unwrap();
+        assert!(
+            !parsed.content.is_empty(),
+            "factbook-utf-16.xml should produce non-empty content"
+        );
+        assert!(parsed.content.contains("United States"));
+        assert!(parsed.content.contains("Canada"));
+    }
 }
@@ -555,6 +555,37 @@ mod tests {
         let result = extractor.extract_text_with_passwords(b"not a pdf", &[]);
         assert!(result.is_err());
     }
+
+    #[test]
+    fn test_strip_page_rotation_no_rotate() {
+        // No /Rotate entry in the input: the result is expected to borrow it.
+        let input = b"%PDF-1.4\n1 0 obj\n<< /Type /Page >>\nendobj";
+        let stripped = strip_page_rotation(input);
+        assert!(matches!(stripped, Cow::Borrowed(_)));
+    }
+
+    #[test]
+    fn test_strip_page_rotation_90() {
+        // A single page dictionary carrying /Rotate 90.
+        let input = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 90 >>\nendobj";
+        let stripped = strip_page_rotation(input);
+        // The input had to be rewritten, so an owned buffer comes back...
+        assert!(matches!(stripped, Cow::Owned(_)));
+        // ...and it no longer contains a rotate marker.
+        assert!(!has_rotate_marker(&stripped));
+    }
+
+    #[test]
+    fn test_strip_page_rotation_270() {
+        // A single page dictionary carrying a three-digit /Rotate 270.
+        let input = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 270 >>\nendobj";
+        let stripped = strip_page_rotation(input);
+        assert!(matches!(stripped, Cow::Owned(_)));
+        assert!(!has_rotate_marker(&stripped));
+    }
+
+    #[test]
+    fn test_strip_page_rotation_multiple() {
+        // Two objects, each with its own /Rotate entry (90 and 180).
+        let input = b"%PDF-1.4\n1 0 obj\n<< /Rotate 90 >>\n2 0 obj\n<< /Rotate 180 >>\nendobj";
+        let stripped = strip_page_rotation(input);
+        assert!(matches!(stripped, Cow::Owned(_)));
+        // No rotate marker may remain after stripping.
+        assert!(!has_rotate_marker(&stripped));
+    }
 }
 
 #[cfg(test)]
 
@@ -15,8 +15,8 @@ Suggests:
     withr,
     roxygen2
 SystemRequirements: Cargo (Rust's package manager), rustc (>= 1.91)
-Config/rextendr/version: 0.4.0
+Config/rextendr/version: 0.4.2
 Encoding: UTF-8
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 7.3.2
+RoxygenNote: 7.3.3
 Config/testthat/edition: 3
@@ -1,79 +1,55 @@
 # Generated by roxygen2: do not edit by hand
 
-useDynLib(kreuzberg, .registration = TRUE)
-
-# Config builders
-export(extraction_config)
-export(ocr_config)
-export(chunking_config)
-export(discover)
-export(from_file)
-
-# Extraction functions
-export(extract_file)
-export(extract_file_sync)
-export(extract_bytes)
-export(extract_bytes_sync)
-
-# Batch extraction
-export(batch_extract_files)
-export(batch_extract_files_sync)
+S3method(chunk_count,kreuzberg_result)
+S3method(content,kreuzberg_result)
+S3method(detected_language,kreuzberg_result)
+S3method(format,kreuzberg_result)
+S3method(metadata_field,kreuzberg_result)
+S3method(mime_type,kreuzberg_result)
+S3method(page_count,kreuzberg_result)
+S3method(print,kreuzberg_result)
+S3method(summary,kreuzberg_result)
 export(batch_extract_bytes)
 export(batch_extract_bytes_sync)
-
-# Plugin management
-export(register_ocr_backend)
-export(unregister_ocr_backend)
-export(list_ocr_backends)
+export(batch_extract_files)
+export(batch_extract_files_sync)
+export(cache_stats)
+export(chunk_count)
+export(chunking_config)
+export(clear_cache)
+export(clear_document_extractors)
 export(clear_ocr_backends)
-export(register_post_processor)
-export(unregister_post_processor)
-export(list_post_processors)
 export(clear_post_processors)
-export(register_validator)
-export(unregister_validator)
-export(list_validators)
 export(clear_validators)
-export(list_document_extractors)
-export(unregister_document_extractor)
-export(clear_document_extractors)
-
-# Metadata
+export(content)
 export(detect_mime_type)
 export(detect_mime_type_from_path)
+export(detected_language)
+export(discover)
+export(extract_bytes)
+export(extract_bytes_sync)
+export(extract_file)
+export(extract_file_sync)
+export(extraction_config)
+export(from_file)
 export(get_extensions_for_mime)
+export(list_document_extractors)
+export(list_ocr_backends)
+export(list_post_processors)
+export(list_validators)
+export(metadata_field)
+export(mime_type)
+export(ocr_config)
+export(page_count)
+export(register_ocr_backend)
+export(register_post_processor)
+export(register_validator)
+export(unregister_document_extractor)
+export(unregister_ocr_backend)
+export(unregister_post_processor)
+export(unregister_validator)
+export(validate_language_code)
 export(validate_mime_type)
-
-# Cache
-export(clear_cache)
-export(cache_stats)
-
-# Validation
 export(validate_ocr_backend_name)
-export(validate_language_code)
 export(validate_output_format)
-
-# Result S3 class
-export(as_kreuzberg_result)
-export(print.kreuzberg_result)
-export(summary.kreuzberg_result)
-export(format.kreuzberg_result)
-
-# S3 generics
-export(content)
-export(mime_type)
-export(page_count)
-export(chunk_count)
-export(detected_language)
-export(metadata_field)
-
-# S3 method registrations
-S3method(print, kreuzberg_result)
-S3method(summary, kreuzberg_result)
-S3method(format, kreuzberg_result)
-S3method(content, kreuzberg_result)
-S3method(mime_type, kreuzberg_result)
-S3method(page_count, kreuzberg_result)
-S3method(chunk_count, kreuzberg_result)
-S3method(detected_language, kreuzberg_result)
-S3method(metadata_field, kreuzberg_result)
+useDynLib(kreuzberg, .registration = TRUE)
@@ -1,19 +1,23 @@
 # Generated by extendr: Do not edit by hand
+
+# nolint start
+
 #
 # This file was created with the following call:
 #   .Call("wrap__make_kreuzberg_wrappers", use_symbols = TRUE, package_name = "kreuzberg")
 
-#' @docType package
 #' @usage NULL
 #' @useDynLib kreuzberg, .registration = TRUE
 NULL
 
-# Cache functions
-clear_cache_native <- function() invisible(.Call(wrap__clear_cache))
+#' Clear the extraction cache
+#' @export
+clear_cache <- function() .Call(wrap__clear_cache)
 
-cache_stats_native <- function() .Call(wrap__cache_stats)
+#' Get cache statistics
+#' @export
+cache_stats <- function() .Call(wrap__cache_stats)
 
-# Extraction functions
 extract_file_sync_native <- function(path, mime_type, config_json) .Call(wrap__extract_file_sync_native, path, mime_type, config_json)
 
 extract_file_native <- function(path, mime_type, config_json) .Call(wrap__extract_file_native, path, mime_type, config_json)
@@ -22,7 +26,6 @@ extract_bytes_sync_native <- function(data, mime_type, config_json) .Call(wrap__
 
 extract_bytes_native <- function(data, mime_type, config_json) .Call(wrap__extract_bytes_native, data, mime_type, config_json)
 
-# Batch extraction functions
 batch_extract_files_sync_native <- function(paths, config_json) .Call(wrap__batch_extract_files_sync_native, paths, config_json)
 
 batch_extract_files_native <- function(paths, config_json) .Call(wrap__batch_extract_files_native, paths, config_json)
@@ -31,7 +34,6 @@ batch_extract_bytes_sync_native <- function(data_list, mime_types, config_json)
 
 batch_extract_bytes_native <- function(data_list, mime_types, config_json) .Call(wrap__batch_extract_bytes_native, data_list, mime_types, config_json)
 
-# Metadata functions
 detect_mime_type_native <- function(data) .Call(wrap__detect_mime_type_native, data)
 
 detect_mime_type_from_path_native <- function(path) .Call(wrap__detect_mime_type_from_path_native, path)
@@ -40,48 +42,45 @@ get_extensions_for_mime_native <- function(mime_type) .Call(wrap__get_extensions
 
 validate_mime_type_native <- function(mime_type) .Call(wrap__validate_mime_type_native, mime_type)
 
-# Plugin functions - Post-processors
-register_post_processor_native <- function(name, callback) invisible(.Call(wrap__register_post_processor_native, name, callback))
+register_post_processor_native <- function(name, callback) .Call(wrap__register_post_processor_native, name, callback)
 
-unregister_post_processor_native <- function(name) invisible(.Call(wrap__unregister_post_processor_native, name))
+unregister_post_processor_native <- function(name) .Call(wrap__unregister_post_processor_native, name)
 
 list_post_processors_native <- function() .Call(wrap__list_post_processors_native)
 
-clear_post_processors_native <- function() invisible(.Call(wrap__clear_post_processors_native))
+clear_post_processors_native <- function() .Call(wrap__clear_post_processors_native)
 
-# Plugin functions - Validators
-register_validator_native <- function(name, callback) invisible(.Call(wrap__register_validator_native, name, callback))
+register_validator_native <- function(name, callback) .Call(wrap__register_validator_native, name, callback)
 
-unregister_validator_native <- function(name) invisible(.Call(wrap__unregister_validator_native, name))
+unregister_validator_native <- function(name) .Call(wrap__unregister_validator_native, name)
 
 list_validators_native <- function() .Call(wrap__list_validators_native)
 
-clear_validators_native <- function() invisible(.Call(wrap__clear_validators_native))
+clear_validators_native <- function() .Call(wrap__clear_validators_native)
 
-# Plugin functions - OCR backends
-register_ocr_backend_native <- function(name, callback) invisible(.Call(wrap__register_ocr_backend_native, name, callback))
+register_ocr_backend_native <- function(name, callback) .Call(wrap__register_ocr_backend_native, name, callback)
 
-unregister_ocr_backend_native <- function(name) invisible(.Call(wrap__unregister_ocr_backend_native, name))
+unregister_ocr_backend_native <- function(name) .Call(wrap__unregister_ocr_backend_native, name)
 
 list_ocr_backends_native <- function() .Call(wrap__list_ocr_backends_native)
 
-clear_ocr_backends_native <- function() invisible(.Call(wrap__clear_ocr_backends_native))
+clear_ocr_backends_native <- function() .Call(wrap__clear_ocr_backends_native)
 
-# Plugin functions - Document extractors
 list_document_extractors_native <- function() .Call(wrap__list_document_extractors_native)
 
-unregister_document_extractor_native <- function(name) invisible(.Call(wrap__unregister_document_extractor_native, name))
+unregister_document_extractor_native <- function(name) .Call(wrap__unregister_document_extractor_native, name)
 
-clear_document_extractors_native <- function() invisible(.Call(wrap__clear_document_extractors_native))
+clear_document_extractors_native <- function() .Call(wrap__clear_document_extractors_native)
 
-# Config loading functions
 config_from_file_native <- function(path) .Call(wrap__config_from_file_native, path)
 
 config_discover_native <- function() .Call(wrap__config_discover_native)
 
-# Validation functions
 validate_ocr_backend_name_native <- function(backend) .Call(wrap__validate_ocr_backend_name_native, backend)
 
 validate_language_code_native <- function(code) .Call(wrap__validate_language_code_native, code)
 
 validate_output_format_native <- function(format) .Call(wrap__validate_output_format_native, format)
+
+
+# nolint end
@@ -3,3 +3,9 @@
 
 #' @useDynLib kreuzberg, .registration = TRUE
 NULL
+
+# Package-local backport of the null-default operator: yields `x` unless it
+# is NULL, in which case `y` is used. Base R ships this from 4.4 onwards, but
+# the package still has to work on R >= 4.2.
+`%||%` <- function(x, y) {
+  if (!is.null(x)) {
+    return(x)
+  }
+  y
+}
@@ -18,7 +18,12 @@ pub fn json_to_robj(value: &Value) -> extendr_api::Result<Robj> {
                 Ok(().into())
             }
         }
-        Value::String(s) => Ok(s.as_str().into_robj()),
+        Value::String(s) => {
+            // R strings (CHARSXP) are C strings and cannot contain embedded NUL bytes.
+            // Strip them to avoid conversion errors.
+            let sanitized = s.replace('\0', "");
+            Ok(sanitized.into_robj())
+        }
         Value::Array(arr) => {
             let items: Vec<Robj> = arr.iter().map(json_to_robj).collect::<extendr_api::Result<Vec<_>>>()?;
             Ok(List::from_values(items).into_robj())
Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,12 @@ pub fn json_to_robj(value: &Value) -> extendr_api::Result<Robj> {`
`18`	`18`	`Ok(().into())`
`19`	`19`	`}`
`20`	`20`	`}`
`21`		`- Value::String(s) => Ok(s.as_str().into_robj()),`
	`21`	`+ Value::String(s) => {`
	`22`	`+ // R strings (CHARSXP) are C strings and cannot contain embedded NUL bytes.`
	`23`	`+ // Strip them to avoid conversion errors.`
	`24`	`+ let sanitized = s.replace('\0', "");`
	`25`	`+ Ok(sanitized.into_robj())`
	`26`	`+ }`
`22`	`27`	`Value::Array(arr) => {`
`23`	`28`	`let items: Vec<Robj> = arr.iter().map(json_to_robj).collect::<extendr_api::Result<Vec<_>>>()?;`
`24`	`29`	`Ok(List::from_values(items).into_robj())`