Skip to content

Commit 0a5608c

Browse files
committed
fix(node,ocr,docx): fix e2e test failures and DOCX UTF-8 panic
- Fix Tesseract TSV level mapping (was off-by-one): levels are 1=Page, 2=Block, 3=Paragraph, 4=Line, 5=Word
- Fix parse_tsv_to_elements filter to include word-level (5) entries
- Fix ocr_elements dropped in image_ocr.rs (was hardcoded to None)
- Fix DOCX extractor panic on multi-byte UTF-8 page boundaries (#401)
- Add djot_content field to JsExtractionResult Node bindings
- Add mapPageConfig and mapHtmlOptions to e2e generator template
- Fix config_djot_content fixture (djot_content only for native .djot)
1 parent b1407b9 commit 0a5608c

File tree

10 files changed

+170
-24
lines changed

10 files changed

+170
-24
lines changed

crates/kreuzberg-node/src/result.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,8 @@ pub struct JsExtractionResult {
162162
pub elements: Option<Vec<JsElement>>,
163163
#[napi(ts_type = "DocumentStructure | null")]
164164
pub document: Option<serde_json::Value>,
165+
#[napi(ts_type = "DjotContent | null", js_name = "djotContent")]
166+
pub djot_content: Option<serde_json::Value>,
165167
#[napi(ts_type = "OcrElement[] | null")]
166168
pub ocr_elements: Option<serde_json::Value>,
167169
#[napi(js_name = "extractedKeywords")]
@@ -359,6 +361,18 @@ impl TryFrom<RustExtractionResult> for JsExtractionResult {
359361
)
360362
})?;
361363

364+
let djot_content = val
365+
.djot_content
366+
.as_ref()
367+
.map(serde_json::to_value)
368+
.transpose()
369+
.map_err(|e| {
370+
Error::new(
371+
Status::GenericFailure,
372+
format!("Failed to serialize djot_content: {}", e),
373+
)
374+
})?;
375+
362376
let ocr_elements = val
363377
.ocr_elements
364378
.map(|elems| serde_json::to_value(&elems))
@@ -448,6 +462,7 @@ impl TryFrom<RustExtractionResult> for JsExtractionResult {
448462
pages,
449463
elements,
450464
document,
465+
djot_content,
451466
ocr_elements,
452467
extracted_keywords,
453468
quality_score: val.quality_score,
@@ -676,7 +691,7 @@ impl TryFrom<JsExtractionResult> for RustExtractionResult {
676691
.collect()
677692
}),
678693
document,
679-
djot_content: None,
694+
djot_content: val.djot_content.and_then(|v| serde_json::from_value(v).ok()),
680695
ocr_elements: val.ocr_elements.and_then(|v| serde_json::from_value(v).ok()),
681696
extracted_keywords: val.extracted_keywords.map(|keywords| {
682697
keywords

crates/kreuzberg/src/extraction/image_ocr.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ pub async fn process_images_with_ocr(
6060
djot_content: None,
6161
pages: None,
6262
elements: None,
63-
ocr_elements: None,
63+
ocr_elements: ocr_extraction.ocr_elements,
6464
document: None,
6565
#[cfg(any(feature = "keywords-yake", feature = "keywords-rake"))]
6666
extracted_keywords: None,

crates/kreuzberg/src/extractors/docx.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -704,8 +704,15 @@ impl DocumentExtractor for DocxExtractor {
704704
let page_num = boundary.page_number;
705705
// Extract text slice for this page
706706
let page_text = if boundary.byte_start < text.len() {
707-
let end = boundary.byte_end.min(text.len());
708-
text[boundary.byte_start..end].trim().to_string()
707+
let mut start = boundary.byte_start.min(text.len());
708+
while start < text.len() && !text.is_char_boundary(start) {
709+
start += 1;
710+
}
711+
let mut end = boundary.byte_end.min(text.len());
712+
while end > start && !text.is_char_boundary(end) {
713+
end -= 1;
714+
}
715+
text[start..end].trim().to_string()
709716
} else {
710717
String::new()
711718
};

crates/kreuzberg/src/ocr/conversion.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,14 +146,15 @@ pub fn tsv_row_to_element(row: &TsvRow) -> OcrElement {
146146
let confidence = OcrConfidence::from_tesseract(row.conf);
147147
let level = OcrElementLevel::from_tesseract_level(row.level);
148148

149-
// Generate a hierarchical parent ID for word-level elements
150-
let parent_id = if row.level == 4 {
149+
// Generate a hierarchical parent ID
150+
// Tesseract levels: 1=Page, 2=Block, 3=Paragraph, 4=Line, 5=Word
151+
let parent_id = if row.level == 5 {
151152
// Word-level: parent is the line
152153
Some(format!(
153154
"p{}_b{}_par{}_l{}",
154155
row.page_num, row.block_num, row.par_num, row.line_num
155156
))
156-
} else if row.level == 3 {
157+
} else if row.level == 4 {
157158
// Line-level: parent is the paragraph
158159
Some(format!("p{}_b{}_par{}", row.page_num, row.block_num, row.par_num))
159160
} else {

crates/kreuzberg/src/ocr/processor/execution.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ fn parse_tsv_to_elements(tsv_data: &str, min_confidence: f64) -> Vec<OcrElement>
6464
continue;
6565
}
6666

67-
// Only include word-level (4) and line-level (3) entries
68-
if level != 3 && level != 4 {
67+
// Only include word-level (5) and line-level (4) entries
68+
// Tesseract TSV levels: 1=page, 2=block, 3=paragraph, 4=line, 5=word
69+
if level != 4 && level != 5 {
6970
continue;
7071
}
7172

crates/kreuzberg/src/types/ocr_elements.rs

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -195,14 +195,14 @@ pub enum OcrElementLevel {
195195
impl OcrElementLevel {
196196
/// Convert from Tesseract's numeric level (1-5).
197197
///
198-
/// Tesseract levels: 1=Block, 2=Para, 3=Line, 4=Word, 5=Symbol
198+
/// Tesseract levels: 1=Page, 2=Block, 3=Paragraph, 4=Line, 5=Word
199199
pub fn from_tesseract_level(level: i32) -> Self {
200200
match level {
201-
1 => Self::Block,
202-
2 => Self::Block, // Para treated as Block
203-
3 => Self::Line,
204-
4 => Self::Word,
205-
5 => Self::Word, // Symbol treated as Word
201+
1 => Self::Page,
202+
2 => Self::Block,
203+
3 => Self::Block, // Paragraph treated as Block
204+
4 => Self::Line,
205+
5 => Self::Word,
206206
_ => Self::Line,
207207
}
208208
}
@@ -398,9 +398,11 @@ mod tests {
398398

399399
#[test]
400400
fn test_element_level_from_tesseract() {
401-
assert_eq!(OcrElementLevel::from_tesseract_level(1), OcrElementLevel::Block);
402-
assert_eq!(OcrElementLevel::from_tesseract_level(3), OcrElementLevel::Line);
403-
assert_eq!(OcrElementLevel::from_tesseract_level(4), OcrElementLevel::Word);
401+
assert_eq!(OcrElementLevel::from_tesseract_level(1), OcrElementLevel::Page);
402+
assert_eq!(OcrElementLevel::from_tesseract_level(2), OcrElementLevel::Block);
403+
assert_eq!(OcrElementLevel::from_tesseract_level(3), OcrElementLevel::Block);
404+
assert_eq!(OcrElementLevel::from_tesseract_level(4), OcrElementLevel::Line);
405+
assert_eq!(OcrElementLevel::from_tesseract_level(5), OcrElementLevel::Word);
404406
}
405407

406408
#[test]

e2e/typescript/tests/contract.spec.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -365,7 +365,6 @@ describe("contract fixtures", () => {
365365
}
366366
assertions.assertExpectedMime(result, ["application/pdf"]);
367367
assertions.assertMinContentLength(result, 10);
368-
assertions.assertDjotContent(result, true, undefined);
369368
},
370369
TEST_TIMEOUT_MS,
371370
);

e2e/typescript/tests/helpers.ts

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,73 @@ function mapPostProcessorConfig(raw: PlainRecord): PostProcessorConfig {
187187
return config;
188188
}
189189

190+
function mapPageConfig(raw: PlainRecord): PlainRecord {
191+
const config: PlainRecord = {};
192+
assignBooleanField(config, raw, "extract_pages", "extractPages");
193+
assignBooleanField(config, raw, "insert_page_markers", "insertPageMarkers");
194+
if (typeof raw.marker_format === "string") {
195+
config.markerFormat = raw.marker_format;
196+
}
197+
return config;
198+
}
199+
200+
function mapHtmlOptions(raw: PlainRecord): PlainRecord {
201+
const config: PlainRecord = {};
202+
if (typeof raw.heading_style === "string") {
203+
config.headingStyle = raw.heading_style;
204+
}
205+
if (typeof raw.list_indent_type === "string") {
206+
config.listIndentType = raw.list_indent_type;
207+
}
208+
assignNumberField(config, raw, "list_indent_width", "listIndentWidth");
209+
if (typeof raw.bullets === "string") {
210+
config.bullets = raw.bullets;
211+
}
212+
if (typeof raw.strong_em_symbol === "string") {
213+
config.strongEmSymbol = raw.strong_em_symbol;
214+
}
215+
assignBooleanField(config, raw, "escape_asterisks", "escapeAsterisks");
216+
assignBooleanField(config, raw, "escape_underscores", "escapeUnderscores");
217+
assignBooleanField(config, raw, "escape_misc", "escapeMisc");
218+
assignBooleanField(config, raw, "escape_ascii", "escapeAscii");
219+
if (typeof raw.code_language === "string") {
220+
config.codeLanguage = raw.code_language;
221+
}
222+
assignBooleanField(config, raw, "autolinks", "autolinks");
223+
assignBooleanField(config, raw, "default_title", "defaultTitle");
224+
assignBooleanField(config, raw, "br_in_tables", "brInTables");
225+
assignBooleanField(config, raw, "hocr_spatial_tables", "hocrSpatialTables");
226+
if (typeof raw.highlight_style === "string") {
227+
config.highlightStyle = raw.highlight_style;
228+
}
229+
assignBooleanField(config, raw, "extract_metadata", "extractMetadata");
230+
if (typeof raw.whitespace_mode === "string") {
231+
config.whitespaceMode = raw.whitespace_mode;
232+
}
233+
assignBooleanField(config, raw, "strip_newlines", "stripNewlines");
234+
assignBooleanField(config, raw, "wrap", "wrap");
235+
assignNumberField(config, raw, "wrap_width", "wrapWidth");
236+
assignBooleanField(config, raw, "convert_as_inline", "convertAsInline");
237+
if (typeof raw.sub_symbol === "string") {
238+
config.subSymbol = raw.sub_symbol;
239+
}
240+
if (typeof raw.sup_symbol === "string") {
241+
config.supSymbol = raw.sup_symbol;
242+
}
243+
if (typeof raw.newline_style === "string") {
244+
config.newlineStyle = raw.newline_style;
245+
}
246+
if (typeof raw.code_block_style === "string") {
247+
config.codeBlockStyle = raw.code_block_style;
248+
}
249+
if (typeof raw.encoding === "string") {
250+
config.encoding = raw.encoding;
251+
}
252+
assignBooleanField(config, raw, "debug", "debug");
253+
assignBooleanField(config, raw, "include_links", "includeLinks");
254+
return config;
255+
}
256+
190257
function mapKeywordConfig(raw: PlainRecord): KeywordConfig {
191258
const config: KeywordConfig = {};
192259
const target = config as PlainRecord;
@@ -265,6 +332,14 @@ export function buildConfig(raw: unknown): ExtractionConfig {
265332
result.keywords = mapKeywordConfig(source.keywords as PlainRecord);
266333
}
267334

335+
if (isPlainRecord(source.pages)) {
336+
target.pages = mapPageConfig(source.pages as PlainRecord);
337+
}
338+
339+
if (isPlainRecord(source.html_options)) {
340+
target.htmlOptions = mapHtmlOptions(source.html_options as PlainRecord);
341+
}
342+
268343
if (typeof source.output_format === "string") {
269344
result.outputFormat = source.output_format as string;
270345
}

fixtures/contract/config_djot_content.json

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"id": "config_djot_content",
3-
"description": "Tests djot output format produces djot_content field",
3+
"description": "Tests djot output format converts content to djot markup",
44
"tags": ["contract", "config", "djot"],
55
"document": {
66
"path": "pdf/fake_memo.pdf"
@@ -15,10 +15,7 @@
1515
"assertions": {
1616
"expected_mime": "application/pdf",
1717
"min_content_length": 10,
18-
"output_format_is": "djot",
19-
"djot_content": {
20-
"has_content": true
21-
}
18+
"output_format_is": "djot"
2219
},
2320
"skip": {
2421
"requires_feature": ["pdf"]

tools/e2e-generator/src/typescript.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,47 @@ function mapPostProcessorConfig(raw: PlainRecord): PostProcessorConfig {
190190
return config;
191191
}
192192
193+
function mapPageConfig(raw: PlainRecord): PlainRecord {
194+
const config: PlainRecord = {};
195+
assignBooleanField(config, raw, "extract_pages", "extractPages");
196+
assignBooleanField(config, raw, "insert_page_markers", "insertPageMarkers");
197+
if (typeof raw.marker_format === "string") { config.markerFormat = raw.marker_format; }
198+
return config;
199+
}
200+
201+
function mapHtmlOptions(raw: PlainRecord): PlainRecord {
202+
const config: PlainRecord = {};
203+
if (typeof raw.heading_style === "string") { config.headingStyle = raw.heading_style; }
204+
if (typeof raw.list_indent_type === "string") { config.listIndentType = raw.list_indent_type; }
205+
assignNumberField(config, raw, "list_indent_width", "listIndentWidth");
206+
if (typeof raw.bullets === "string") { config.bullets = raw.bullets; }
207+
if (typeof raw.strong_em_symbol === "string") { config.strongEmSymbol = raw.strong_em_symbol; }
208+
assignBooleanField(config, raw, "escape_asterisks", "escapeAsterisks");
209+
assignBooleanField(config, raw, "escape_underscores", "escapeUnderscores");
210+
assignBooleanField(config, raw, "escape_misc", "escapeMisc");
211+
assignBooleanField(config, raw, "escape_ascii", "escapeAscii");
212+
if (typeof raw.code_language === "string") { config.codeLanguage = raw.code_language; }
213+
assignBooleanField(config, raw, "autolinks", "autolinks");
214+
assignBooleanField(config, raw, "default_title", "defaultTitle");
215+
assignBooleanField(config, raw, "br_in_tables", "brInTables");
216+
assignBooleanField(config, raw, "hocr_spatial_tables", "hocrSpatialTables");
217+
if (typeof raw.highlight_style === "string") { config.highlightStyle = raw.highlight_style; }
218+
assignBooleanField(config, raw, "extract_metadata", "extractMetadata");
219+
if (typeof raw.whitespace_mode === "string") { config.whitespaceMode = raw.whitespace_mode; }
220+
assignBooleanField(config, raw, "strip_newlines", "stripNewlines");
221+
assignBooleanField(config, raw, "wrap", "wrap");
222+
assignNumberField(config, raw, "wrap_width", "wrapWidth");
223+
assignBooleanField(config, raw, "convert_as_inline", "convertAsInline");
224+
if (typeof raw.sub_symbol === "string") { config.subSymbol = raw.sub_symbol; }
225+
if (typeof raw.sup_symbol === "string") { config.supSymbol = raw.sup_symbol; }
226+
if (typeof raw.newline_style === "string") { config.newlineStyle = raw.newline_style; }
227+
if (typeof raw.code_block_style === "string") { config.codeBlockStyle = raw.code_block_style; }
228+
if (typeof raw.encoding === "string") { config.encoding = raw.encoding; }
229+
assignBooleanField(config, raw, "debug", "debug");
230+
assignBooleanField(config, raw, "include_links", "includeLinks");
231+
return config;
232+
}
233+
193234
function mapKeywordConfig(raw: PlainRecord): KeywordConfig {
194235
const config: KeywordConfig = {};
195236
const target = config as PlainRecord;
@@ -264,6 +305,14 @@ export function buildConfig(raw: unknown): ExtractionConfig {
264305
result.keywords = mapKeywordConfig(source.keywords as PlainRecord);
265306
}
266307
308+
if (isPlainRecord(source.pages)) {
309+
target.pages = mapPageConfig(source.pages as PlainRecord);
310+
}
311+
312+
if (isPlainRecord(source.html_options)) {
313+
target.htmlOptions = mapHtmlOptions(source.html_options as PlainRecord);
314+
}
315+
267316
if (typeof source.output_format === "string") {
268317
result.outputFormat = source.output_format as string;
269318
}

0 commit comments

Comments
 (0)