Skip to content

Commit a4e8f33

Browse files
committed
fix: fix R bindings, UTF-16 XML parsing, and benchmark harness issues
- Fix UTF-16 XML extraction rejecting files with odd byte counts by truncating to even length instead of erroring
- Fix R NUL byte crash by stripping embedded NUL from strings at FFI boundary
- Add %||% polyfill for R < 4.4 compatibility
- Register kreuzberg-r adapter in benchmark harness
- Fix R server-mode stdin reading (file("stdin") vs stdin())
- Fix uv pip show error handling when uv is not installed
- Replace empty XLA/XLAM test fixtures with data-containing files
- Remove PDF fixtures that require OCR or trigger pdfium bugs
- Remove duplicate pdf_searchable_ocr fixture
- Add unit tests for UTF-16 XML parsing and PDF rotation stripping
1 parent a38876c commit a4e8f33

File tree

21 files changed

+191
-181
lines changed

21 files changed

+191
-181
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2222

2323
### Fixed
2424

25+
- **XML UTF-16 parsing fails on files with odd byte count**: The XML extractor rejected otherwise well-formed UTF-16 encoded files that had a single stray trailing byte (e.g. `factbook-utf-16.xml`) with "Invalid UTF-16: odd byte count". The decoder now drops the trailing byte, rounding the payload length down to an even byte count, matching the lenient approach already used in email extraction.
26+
- **R bindings crash on strings with embedded NUL bytes**: Extraction results containing NUL (`\0`) characters (e.g. from RTF files) caused the R FFI layer to error with "embedded nul in string", because R stores character strings as C strings, which cannot contain embedded NUL bytes. NUL bytes are now stripped before passing strings to R.
27+
- **R bindings `%||%` operator incompatible with R < 4.4**: The R package used the `%||%` null-coalescing operator which is only available in base R >= 4.4, but the package declares `R >= 4.2`. Added a package-local polyfill for backwards compatibility.
2528
- **API returns HTTP 500 for unsupported file formats** (#414): Uploading files with unsupported or undetectable MIME types (e.g. DOCX via `curl -F`) returned HTTP 500 Internal Server Error instead of HTTP 400 Bad Request. The `/extract` endpoint now falls back to extension-based MIME detection from the filename when the client sends `application/octet-stream`, and `UnsupportedFormat` errors are mapped to HTTP 400 with a clear `UnsupportedFormatError` response.
2629
- **PDF markdown extraction missing headings/bold for flat structure trees** (#391): PDFs where the structure tree tags everything as `<P>` (common with Adobe InDesign) now produce proper headings and bold text. The structure tree path previously bypassed font-size-based heading classification entirely. Pages with font size variation but no heading tags are now enriched via K-means font-size clustering. Additionally, bold detection now recognizes fonts with "Bold" in the name (e.g. `MyriadPro-Bold`) even when the PDF doesn't set the font weight descriptor.
2730
- **PaddleOCR backend not found when using `backend="paddleocr"`** (#403): The PaddleOCR backend registered itself as `"paddle-ocr"` but users and documentation use `"paddleocr"`. The OCR backend registry now resolves the `"paddleocr"` alias to the canonical `"paddle-ocr"` name.

crates/kreuzberg/src/extraction/xml.rs

Lines changed: 53 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -200,11 +200,10 @@ pub fn parse_xml(xml_bytes: &[u8], preserve_whitespace: bool) -> Result<XmlExtra
200200

201201
/// Decode UTF-16 bytes (with BOM) to UTF-8 bytes.
202202
fn decode_utf16_to_utf8(data: &[u8], big_endian: bool) -> Result<Vec<u8>> {
203-
// Skip BOM (first 2 bytes)
203+
// Skip BOM (first 2 bytes) and truncate to even length
204204
let data = &data[2..];
205-
if !data.len().is_multiple_of(2) {
206-
return Err(KreuzbergError::parsing("Invalid UTF-16: odd byte count".to_string()));
207-
}
205+
let even_len = data.len() & !1;
206+
let data = &data[..even_len];
208207

209208
let u16_iter = data.chunks_exact(2).map(|chunk| {
210209
if big_endian {
@@ -463,4 +462,54 @@ mod tests {
463462
let result = parse_xml(xml, false);
464463
let _ = result;
465464
}
465+
466+
#[test]
467+
fn test_utf16_le_xml() {
468+
// UTF-16 LE BOM + "<r>A</r>" encoded as UTF-16 LE
469+
let mut xml = vec![0xFF, 0xFE]; // BOM
470+
for c in "<r>A</r>".encode_utf16() {
471+
xml.extend_from_slice(&c.to_le_bytes());
472+
}
473+
let result = parse_xml(&xml, false).unwrap();
474+
assert!(result.content.contains("A"));
475+
}
476+
477+
#[test]
478+
fn test_utf16_be_xml() {
479+
// UTF-16 BE BOM + "<r>B</r>" encoded as UTF-16 BE
480+
let mut xml = vec![0xFE, 0xFF]; // BOM
481+
for c in "<r>B</r>".encode_utf16() {
482+
xml.extend_from_slice(&c.to_be_bytes());
483+
}
484+
let result = parse_xml(&xml, false).unwrap();
485+
assert!(result.content.contains("B"));
486+
}
487+
488+
#[test]
489+
fn test_utf16_odd_byte_count_truncates_gracefully() {
490+
// UTF-16 LE BOM + "<r>X</r>" + trailing odd byte
491+
let mut xml = vec![0xFF, 0xFE]; // BOM
492+
for c in "<r>X</r>".encode_utf16() {
493+
xml.extend_from_slice(&c.to_le_bytes());
494+
}
495+
xml.push(0x0A); // trailing odd byte
496+
let result = parse_xml(&xml, false).unwrap();
497+
assert!(result.content.contains("X"));
498+
}
499+
500+
#[test]
501+
fn test_utf16_factbook_file() {
502+
let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR"))
503+
.join("../../test_documents/vendored/unstructured/xml/factbook-utf-16.xml");
504+
if path.exists() {
505+
let xml = std::fs::read(&path).unwrap();
506+
let result = parse_xml(&xml, false).unwrap();
507+
assert!(
508+
!result.content.is_empty(),
509+
"factbook-utf-16.xml should produce non-empty content"
510+
);
511+
assert!(result.content.contains("United States"));
512+
assert!(result.content.contains("Canada"));
513+
}
514+
}
466515
}

crates/kreuzberg/src/pdf/text.rs

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -555,6 +555,37 @@ mod tests {
555555
let result = extractor.extract_text_with_passwords(b"not a pdf", &[]);
556556
assert!(result.is_err());
557557
}
558+
559+
#[test]
560+
fn test_strip_page_rotation_no_rotate() {
561+
let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page >>\nendobj";
562+
let result = strip_page_rotation(pdf);
563+
assert!(matches!(result, Cow::Borrowed(_)));
564+
}
565+
566+
#[test]
567+
fn test_strip_page_rotation_90() {
568+
let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 90 >>\nendobj";
569+
let result = strip_page_rotation(pdf);
570+
assert!(matches!(result, Cow::Owned(_)));
571+
assert!(!has_rotate_marker(&result));
572+
}
573+
574+
#[test]
575+
fn test_strip_page_rotation_270() {
576+
let pdf = b"%PDF-1.4\n1 0 obj\n<< /Type /Page /Rotate 270 >>\nendobj";
577+
let result = strip_page_rotation(pdf);
578+
assert!(matches!(result, Cow::Owned(_)));
579+
assert!(!has_rotate_marker(&result));
580+
}
581+
582+
#[test]
583+
fn test_strip_page_rotation_multiple() {
584+
let pdf = b"%PDF-1.4\n1 0 obj\n<< /Rotate 90 >>\n2 0 obj\n<< /Rotate 180 >>\nendobj";
585+
let result = strip_page_rotation(pdf);
586+
assert!(matches!(result, Cow::Owned(_)));
587+
assert!(!has_rotate_marker(&result));
588+
}
558589
}
559590

560591
#[cfg(test)]

packages/r/DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ Suggests:
1515
withr,
1616
roxygen2
1717
SystemRequirements: Cargo (Rust's package manager), rustc (>= 1.91)
18-
Config/rextendr/version: 0.4.0
18+
Config/rextendr/version: 0.4.2
1919
Encoding: UTF-8
2020
Roxygen: list(markdown = TRUE)
21-
RoxygenNote: 7.3.2
21+
RoxygenNote: 7.3.3
2222
Config/testthat/edition: 3

packages/r/NAMESPACE

Lines changed: 42 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -1,79 +1,55 @@
11
# Generated by roxygen2: do not edit by hand
22

3-
useDynLib(kreuzberg, .registration = TRUE)
4-
5-
# Config builders
6-
export(extraction_config)
7-
export(ocr_config)
8-
export(chunking_config)
9-
export(discover)
10-
export(from_file)
11-
12-
# Extraction functions
13-
export(extract_file)
14-
export(extract_file_sync)
15-
export(extract_bytes)
16-
export(extract_bytes_sync)
17-
18-
# Batch extraction
19-
export(batch_extract_files)
20-
export(batch_extract_files_sync)
3+
S3method(chunk_count,kreuzberg_result)
4+
S3method(content,kreuzberg_result)
5+
S3method(detected_language,kreuzberg_result)
6+
S3method(format,kreuzberg_result)
7+
S3method(metadata_field,kreuzberg_result)
8+
S3method(mime_type,kreuzberg_result)
9+
S3method(page_count,kreuzberg_result)
10+
S3method(print,kreuzberg_result)
11+
S3method(summary,kreuzberg_result)
2112
export(batch_extract_bytes)
2213
export(batch_extract_bytes_sync)
23-
24-
# Plugin management
25-
export(register_ocr_backend)
26-
export(unregister_ocr_backend)
27-
export(list_ocr_backends)
14+
export(batch_extract_files)
15+
export(batch_extract_files_sync)
16+
export(cache_stats)
17+
export(chunk_count)
18+
export(chunking_config)
19+
export(clear_cache)
20+
export(clear_document_extractors)
2821
export(clear_ocr_backends)
29-
export(register_post_processor)
30-
export(unregister_post_processor)
31-
export(list_post_processors)
3222
export(clear_post_processors)
33-
export(register_validator)
34-
export(unregister_validator)
35-
export(list_validators)
3623
export(clear_validators)
37-
export(list_document_extractors)
38-
export(unregister_document_extractor)
39-
export(clear_document_extractors)
40-
41-
# Metadata
24+
export(content)
4225
export(detect_mime_type)
4326
export(detect_mime_type_from_path)
27+
export(detected_language)
28+
export(discover)
29+
export(extract_bytes)
30+
export(extract_bytes_sync)
31+
export(extract_file)
32+
export(extract_file_sync)
33+
export(extraction_config)
34+
export(from_file)
4435
export(get_extensions_for_mime)
36+
export(list_document_extractors)
37+
export(list_ocr_backends)
38+
export(list_post_processors)
39+
export(list_validators)
40+
export(metadata_field)
41+
export(mime_type)
42+
export(ocr_config)
43+
export(page_count)
44+
export(register_ocr_backend)
45+
export(register_post_processor)
46+
export(register_validator)
47+
export(unregister_document_extractor)
48+
export(unregister_ocr_backend)
49+
export(unregister_post_processor)
50+
export(unregister_validator)
51+
export(validate_language_code)
4552
export(validate_mime_type)
46-
47-
# Cache
48-
export(clear_cache)
49-
export(cache_stats)
50-
51-
# Validation
5253
export(validate_ocr_backend_name)
53-
export(validate_language_code)
5454
export(validate_output_format)
55-
56-
# Result S3 class
57-
export(as_kreuzberg_result)
58-
export(print.kreuzberg_result)
59-
export(summary.kreuzberg_result)
60-
export(format.kreuzberg_result)
61-
62-
# S3 generics
63-
export(content)
64-
export(mime_type)
65-
export(page_count)
66-
export(chunk_count)
67-
export(detected_language)
68-
export(metadata_field)
69-
70-
# S3 method registrations
71-
S3method(print, kreuzberg_result)
72-
S3method(summary, kreuzberg_result)
73-
S3method(format, kreuzberg_result)
74-
S3method(content, kreuzberg_result)
75-
S3method(mime_type, kreuzberg_result)
76-
S3method(page_count, kreuzberg_result)
77-
S3method(chunk_count, kreuzberg_result)
78-
S3method(detected_language, kreuzberg_result)
79-
S3method(metadata_field, kreuzberg_result)
55+
useDynLib(kreuzberg, .registration = TRUE)

packages/r/R/extendr-wrappers.R

Lines changed: 23 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,23 @@
11
# Generated by extendr: Do not edit by hand
2+
3+
# nolint start
4+
25
#
36
# This file was created with the following call:
47
# .Call("wrap__make_kreuzberg_wrappers", use_symbols = TRUE, package_name = "kreuzberg")
58

6-
#' @docType package
79
#' @usage NULL
810
#' @useDynLib kreuzberg, .registration = TRUE
911
NULL
1012

11-
# Cache functions
12-
clear_cache_native <- function() invisible(.Call(wrap__clear_cache))
13+
#' Clear the extraction cache
14+
#' @export
15+
clear_cache <- function() .Call(wrap__clear_cache)
1316

14-
cache_stats_native <- function() .Call(wrap__cache_stats)
17+
#' Get cache statistics
18+
#' @export
19+
cache_stats <- function() .Call(wrap__cache_stats)
1520

16-
# Extraction functions
1721
extract_file_sync_native <- function(path, mime_type, config_json) .Call(wrap__extract_file_sync_native, path, mime_type, config_json)
1822

1923
extract_file_native <- function(path, mime_type, config_json) .Call(wrap__extract_file_native, path, mime_type, config_json)
@@ -22,7 +26,6 @@ extract_bytes_sync_native <- function(data, mime_type, config_json) .Call(wrap__
2226

2327
extract_bytes_native <- function(data, mime_type, config_json) .Call(wrap__extract_bytes_native, data, mime_type, config_json)
2428

25-
# Batch extraction functions
2629
batch_extract_files_sync_native <- function(paths, config_json) .Call(wrap__batch_extract_files_sync_native, paths, config_json)
2730

2831
batch_extract_files_native <- function(paths, config_json) .Call(wrap__batch_extract_files_native, paths, config_json)
@@ -31,7 +34,6 @@ batch_extract_bytes_sync_native <- function(data_list, mime_types, config_json)
3134

3235
batch_extract_bytes_native <- function(data_list, mime_types, config_json) .Call(wrap__batch_extract_bytes_native, data_list, mime_types, config_json)
3336

34-
# Metadata functions
3537
detect_mime_type_native <- function(data) .Call(wrap__detect_mime_type_native, data)
3638

3739
detect_mime_type_from_path_native <- function(path) .Call(wrap__detect_mime_type_from_path_native, path)
@@ -40,48 +42,45 @@ get_extensions_for_mime_native <- function(mime_type) .Call(wrap__get_extensions
4042

4143
validate_mime_type_native <- function(mime_type) .Call(wrap__validate_mime_type_native, mime_type)
4244

43-
# Plugin functions - Post-processors
44-
register_post_processor_native <- function(name, callback) invisible(.Call(wrap__register_post_processor_native, name, callback))
45+
register_post_processor_native <- function(name, callback) .Call(wrap__register_post_processor_native, name, callback)
4546

46-
unregister_post_processor_native <- function(name) invisible(.Call(wrap__unregister_post_processor_native, name))
47+
unregister_post_processor_native <- function(name) .Call(wrap__unregister_post_processor_native, name)
4748

4849
list_post_processors_native <- function() .Call(wrap__list_post_processors_native)
4950

50-
clear_post_processors_native <- function() invisible(.Call(wrap__clear_post_processors_native))
51+
clear_post_processors_native <- function() .Call(wrap__clear_post_processors_native)
5152

52-
# Plugin functions - Validators
53-
register_validator_native <- function(name, callback) invisible(.Call(wrap__register_validator_native, name, callback))
53+
register_validator_native <- function(name, callback) .Call(wrap__register_validator_native, name, callback)
5454

55-
unregister_validator_native <- function(name) invisible(.Call(wrap__unregister_validator_native, name))
55+
unregister_validator_native <- function(name) .Call(wrap__unregister_validator_native, name)
5656

5757
list_validators_native <- function() .Call(wrap__list_validators_native)
5858

59-
clear_validators_native <- function() invisible(.Call(wrap__clear_validators_native))
59+
clear_validators_native <- function() .Call(wrap__clear_validators_native)
6060

61-
# Plugin functions - OCR backends
62-
register_ocr_backend_native <- function(name, callback) invisible(.Call(wrap__register_ocr_backend_native, name, callback))
61+
register_ocr_backend_native <- function(name, callback) .Call(wrap__register_ocr_backend_native, name, callback)
6362

64-
unregister_ocr_backend_native <- function(name) invisible(.Call(wrap__unregister_ocr_backend_native, name))
63+
unregister_ocr_backend_native <- function(name) .Call(wrap__unregister_ocr_backend_native, name)
6564

6665
list_ocr_backends_native <- function() .Call(wrap__list_ocr_backends_native)
6766

68-
clear_ocr_backends_native <- function() invisible(.Call(wrap__clear_ocr_backends_native))
67+
clear_ocr_backends_native <- function() .Call(wrap__clear_ocr_backends_native)
6968

70-
# Plugin functions - Document extractors
7169
list_document_extractors_native <- function() .Call(wrap__list_document_extractors_native)
7270

73-
unregister_document_extractor_native <- function(name) invisible(.Call(wrap__unregister_document_extractor_native, name))
71+
unregister_document_extractor_native <- function(name) .Call(wrap__unregister_document_extractor_native, name)
7472

75-
clear_document_extractors_native <- function() invisible(.Call(wrap__clear_document_extractors_native))
73+
clear_document_extractors_native <- function() .Call(wrap__clear_document_extractors_native)
7674

77-
# Config loading functions
7875
config_from_file_native <- function(path) .Call(wrap__config_from_file_native, path)
7976

8077
config_discover_native <- function() .Call(wrap__config_discover_native)
8178

82-
# Validation functions
8379
validate_ocr_backend_name_native <- function(backend) .Call(wrap__validate_ocr_backend_name_native, backend)
8480

8581
validate_language_code_native <- function(code) .Call(wrap__validate_language_code_native, code)
8682

8783
validate_output_format_native <- function(format) .Call(wrap__validate_output_format_native, format)
84+
85+
86+
# nolint end

packages/r/R/kreuzberg-package.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,9 @@
33

44
#' @useDynLib kreuzberg, .registration = TRUE
55
NULL
6+
7+
# Null-coalescing operator for R < 4.4 compatibility.
8+
# In R >= 4.4 this is available in base, but we need to support R >= 4.2.
9+
`%||%` <- function(x, y) {
10+
if (is.null(x)) y else x
11+
}

packages/r/src/rust/src/helpers.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,12 @@ pub fn json_to_robj(value: &Value) -> extendr_api::Result<Robj> {
1818
Ok(().into())
1919
}
2020
}
21-
Value::String(s) => Ok(s.as_str().into_robj()),
21+
Value::String(s) => {
22+
// R strings (CHARSXP) are C strings and cannot contain embedded NUL bytes.
23+
// Strip them to avoid conversion errors.
24+
let sanitized = s.replace('\0', "");
25+
Ok(sanitized.into_robj())
26+
}
2227
Value::Array(arr) => {
2328
let items: Vec<Robj> = arr.iter().map(json_to_robj).collect::<extendr_api::Result<Vec<_>>>()?;
2429
Ok(List::from_values(items).into_robj())
48.5 KB
Binary file not shown.
166 KB
Binary file not shown.

0 commit comments

Comments
 (0)