Skip to content

Commit b515630

Browse files
committed
fix(paddle-ocr): correct dict index offset, default angle cls off, configurable padding (#395)
- Prepend CTC blank token and append space token in read_keys_from_file() to match get_keys() layout, fixing off-by-one character mapping errors
- Default use_angle_cls to false (misfires on short text regions)
- Replace hardcoded padding=50 with configurable padding (default 10)
- Add unit tests for key loading and CTC decoding
1 parent 5bb4af2 commit b515630

File tree

4 files changed

+91
-7
lines changed

4 files changed

+91
-7
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1414
- **PDF markdown output format**: Native PDF text extraction now supports `output_format: Markdown`, producing structured markdown with headings (via font-size clustering), paragraphs, inline bold/italic markup, and list detection — instead of flat text with visual line breaks.
1515
- **Multi-column PDF layout detection**: Histogram-based column gutter detection identifies 2+ column layouts (academic papers, magazines) and processes each column independently, preventing text interleaving across columns.
1616
- **Bold/italic detection via font name fallback**: When PDF font descriptor flags don't indicate bold/italic, the extractor checks font names for "Bold"/"Italic"/"Oblique" substrings and font weight >= 700 as secondary signals.
17+
- **musl/Alpine Linux native builds for Elixir, Java, and C#**: New Docker-based CI jobs build native libraries (`libkreuzberg_rustler.so`, `libkreuzberg_ffi.so`) targeting `x86_64-unknown-linux-musl` and `aarch64-unknown-linux-musl`. Enables instant install on Alpine Linux and musl-based distributions without compiling from source.
18+
- **Pre-compiled platform-specific Ruby gems**: The publish workflow now ships pre-compiled native gems for `x86_64-linux`, `aarch64-linux`, `arm64-darwin`, and `x64-mingw-ucrt`, eliminating the 30+ minute compile-from-source on `gem install kreuzberg`. A fallback source gem is still published for unsupported platforms.
1719

1820
### Fixed
1921

22+
- **PaddleOCR dict index offset causing wrong character recognition (#395)**: `read_keys_from_file()` was missing the CTC blank token (`#`) at index 0 and the space token at the end, causing off-by-one character mapping errors. Now matches the `get_keys()` layout used for embedded models.
23+
- **PaddleOCR angle classifier misfiring on short text (#395)**: Changed `use_angle_cls` default from `true` to `false`. The angle classifier can misfire on short text regions (e.g., 2-3 character table cells), rotating crops incorrectly before recognition. Users can re-enable via `PaddleOcrConfig::with_angle_cls(true)` for rotated documents.
24+
- **PaddleOCR excessive padding including table gridlines (#395)**: Replaced the hardcoded 50px detection padding with a configurable value that defaults to 10px, set via `PaddleOcrConfig::with_padding()` (values above 100px are clamped to 100px). Large padding on small images caused table gridlines to be included in text crops.
25+
- **Ruby CI Bundler gems destroyed by vendoring script**: The `vendor-kreuzberg-core.py` script was deleting the entire `vendor/` directory including `vendor/bundle/` (Bundler's gem installation). Now only cleans crate subdirectories, preserving Bundler state.
2026
- **PDF document loaded twice for markdown rendering**: Eliminated redundant Pdfium initialization and document parsing by rendering markdown speculatively during the first document load, saving 25-40ms per PDF.
2127
- **NaN panics in PDF text clustering and block merging**: Replaced `expect()` calls on `partial_cmp` with `unwrap_or(Ordering::Equal)` across clustering, extraction, and markdown modules to handle corrupt PDF coordinates gracefully.
2228
- **PDF heading detection false positives**: Added distance threshold to font-size centroid matching — decorative elements with extreme font sizes no longer receive heading levels.

crates/kreuzberg-paddle-ocr/src/crnn_net.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,12 @@ impl CrnnNet {
107107
let content = std::fs::read_to_string(path)?;
108108
let mut keys = Vec::new();
109109

110+
// Prepend CTC blank token and append space token to match get_keys() layout.
111+
// The ONNX model's output index 0 = blank, last index = space.
112+
keys.push("#".to_string());
110113
keys.extend(content.split('\n').map(|s| s.to_string()));
114+
keys.push(" ".to_string());
115+
111116
self.keys = keys;
112117
Ok(())
113118
}
@@ -223,3 +228,56 @@ impl CrnnNet {
223228
Ok(text_line)
224229
}
225230
}
231+
232+
#[cfg(test)]
mod tests {
    use super::*;

    /// The arg-max at a timestep that lands on index 0 (the CTC blank) must
    /// not contribute a character to the decoded text.
    #[test]
    fn test_score_to_text_line_skips_blank_index() {
        // keys[0] = "#" (CTC blank), keys[1] = "a", keys[2] = "b"
        let keys = vec!["#".to_string(), "a".to_string(), "b".to_string()];
        // 3 timesteps, 3 classes each. Simulate: blank, "a", "b"
        let output = vec![
            1.0, 0.0, 0.0, // timestep 0: max at index 0 (blank) -> skip
            0.0, 0.9, 0.1, // timestep 1: max at index 1 ("a")
            0.0, 0.1, 0.8, // timestep 2: max at index 2 ("b")
        ];
        let result = CrnnNet::score_to_text_line(&output, 3, 3, &keys).unwrap();
        assert_eq!(result.text, "ab");
    }

    /// Consecutive timesteps that select the same class index must collapse
    /// into a single emitted character.
    #[test]
    fn test_score_to_text_line_deduplicates_consecutive() {
        let keys = vec!["#".to_string(), "h".to_string(), "i".to_string()];
        // 4 timesteps: "h", "h", "i", "i" -> should deduplicate to "hi"
        let output = vec![
            0.0, 0.9, 0.0, // "h"
            0.0, 0.8, 0.0, // "h" again (same index, skip)
            0.0, 0.0, 0.9, // "i"
            0.0, 0.0, 0.8, // "i" again (same index, skip)
        ];
        let result = CrnnNet::score_to_text_line(&output, 4, 3, &keys).unwrap();
        assert_eq!(result.text, "hi");
    }

    /// Loading a dictionary file must wrap its entries with the CTC blank
    /// token ("#") at index 0 and the space token at the end.
    #[test]
    fn test_read_keys_from_file_has_blank_and_space() {
        // Per-process directory name so concurrent test runs on the same
        // machine cannot overwrite or delete each other's fixture.
        let dir = std::env::temp_dir().join(format!("kreuzberg_test_dict_{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        let dict_path = dir.join("test_dict.txt");
        std::fs::write(&dict_path, "a\nb\nc").unwrap();

        let mut net = CrnnNet::new();
        let load_result = net.read_keys_from_file(dict_path.to_str().unwrap());

        // Remove the fixture before asserting so a failing assertion does not
        // leave files behind in the temp directory.
        std::fs::remove_dir_all(&dir).ok();

        load_result.unwrap();

        // Compare the whole table: checking only the first and last entries
        // would miss a stray key inserted between "c" and the space token.
        assert_eq!(net.keys, ["#", "a", "b", "c", " "]);
    }
}

crates/kreuzberg/src/paddle_ocr/backend.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,7 @@ impl PaddleOcrBackend {
260260
plugin_name: "paddle-ocr".to_string(),
261261
})?;
262262

263-
let padding = 50u32;
263+
let padding = config.padding;
264264
let max_side_len = config.det_limit_side_len;
265265
let box_score_thresh = config.det_db_thresh;
266266
let box_thresh = config.det_db_box_thresh;

crates/kreuzberg/src/paddle_ocr/config.rs

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ pub struct PaddleOcrConfig {
3737
/// Optional custom cache directory for model files
3838
pub cache_dir: Option<PathBuf>,
3939

40-
/// Enable angle classification for rotated text (default: true)
40+
/// Enable angle classification for rotated text (default: false).
41+
/// Can misfire on short text regions, rotating crops incorrectly before recognition.
4142
pub use_angle_cls: bool,
4243

4344
/// Enable table structure detection (default: false)
@@ -62,6 +63,10 @@ pub struct PaddleOcrConfig {
6263
/// Batch size for recognition inference (default: 6)
6364
/// Number of text regions to process simultaneously
6465
pub rec_batch_num: u32,
66+
67+
/// Padding in pixels added around the image before detection (default: 10).
68+
/// Large values can include surrounding content like table gridlines.
69+
pub padding: u32,
6570
}
6671

6772
impl PaddleOcrConfig {
@@ -82,13 +87,14 @@ impl PaddleOcrConfig {
8287
Self {
8388
language: language.into(),
8489
cache_dir: None,
85-
use_angle_cls: true,
90+
use_angle_cls: false,
8691
enable_table_detection: false,
8792
det_db_thresh: 0.3,
8893
det_db_box_thresh: 0.5,
8994
det_db_unclip_ratio: 1.6,
9095
det_limit_side_len: 960,
9196
rec_batch_num: 6,
97+
padding: 10,
9298
}
9399
}
94100

@@ -191,6 +197,16 @@ impl PaddleOcrConfig {
191197
self
192198
}
193199

200+
/// Sets padding in pixels added around images before detection.
201+
///
202+
/// # Arguments
203+
///
204+
/// * `padding` - Padding in pixels (0-100)
205+
pub fn with_padding(mut self, padding: u32) -> Self {
206+
self.padding = padding.clamp(0, 100);
207+
self
208+
}
209+
194210
/// Resolves the cache directory, checking in order:
195211
/// 1. Configured `cache_dir` if set
196212
/// 2. `KREUZBERG_CACHE_DIR` environment variable + `/paddle-ocr`
@@ -383,8 +399,9 @@ mod tests {
383399
fn test_new_config() {
384400
let config = PaddleOcrConfig::new("en");
385401
assert_eq!(config.language, "en");
386-
assert!(config.use_angle_cls);
402+
assert!(!config.use_angle_cls);
387403
assert!(!config.enable_table_detection);
404+
assert_eq!(config.padding, 10);
388405
}
389406

390407
#[test]
@@ -396,21 +413,24 @@ mod tests {
396413
assert_eq!(config.det_db_unclip_ratio, 1.6);
397414
assert_eq!(config.det_limit_side_len, 960);
398415
assert_eq!(config.rec_batch_num, 6);
416+
assert_eq!(config.padding, 10);
399417
}
400418

401419
#[test]
402420
fn test_builder_pattern() {
403421
let config = PaddleOcrConfig::new("ch")
404-
.with_angle_cls(false)
422+
.with_angle_cls(true)
405423
.with_table_detection(true)
406424
.with_det_db_thresh(0.4)
407-
.with_rec_batch_num(12);
425+
.with_rec_batch_num(12)
426+
.with_padding(25);
408427

409428
assert_eq!(config.language, "ch");
410-
assert!(!config.use_angle_cls);
429+
assert!(config.use_angle_cls);
411430
assert!(config.enable_table_detection);
412431
assert_eq!(config.det_db_thresh, 0.4);
413432
assert_eq!(config.rec_batch_num, 12);
433+
assert_eq!(config.padding, 25);
414434
}
415435

416436
#[test]

0 commit comments

Comments
 (0)