Skip to content

Commit ed32c91

Browse files
committed
feat: use geometric sorting as fallback for reading order
- When no structure tree exists, fall back to geometric sorting instead of raw stream order
- Sorts by Y (top→bottom), then X (left→right)
- Includes automatic two-column layout detection
- Add Reading Order section to README explaining the approach
1 parent dd68ba8 commit ed32c91

File tree

2 files changed

+47
-7
lines changed

2 files changed

+47
-7
lines changed

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,20 @@ python/zpdf/ # Python bindings (cffi)
146146
examples/ # Usage examples
147147
```
148148

149+
## Reading Order
150+
151+
zpdf uses a two-tier approach for extracting text in logical reading order:
152+
153+
1. **Structure Tree** (preferred): Uses the PDF's semantic structure as defined by the document author. This is the correct approach for tagged/accessible PDFs (PDF/UA) and properly handles multi-column layouts, sidebars, tables, and captions.
154+
155+
2. **Geometric Sorting** (fallback): When no structure tree exists, zpdf falls back to sorting text by Y-coordinate (top→bottom), then X-coordinate (left→right), with automatic two-column detection. If geometric analysis itself fails, raw stream order is used as a last resort.
156+
157+
| Method | Pros | Cons |
158+
|--------|------|------|
159+
| Structure tree | Correct semantic order, handles complex layouts | Only works on tagged PDFs |
160+
| Geometric sort | Works on any PDF, handles two-column layouts | Can fail on complex layouts |
161+
| Stream order | Fast, raw extraction | Often wrong order |
162+
149163
## Comparison
150164

151165
| Feature | zpdf | pdfium | MuPDF |

src/root.zig

Lines changed: 33 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -483,13 +483,13 @@ fn ensurePageFonts(self: *Document, page_idx: usize) void {
483483

484484
// Parse structure tree to get reading order
485485
var tree = structtree.parseStructTree(arena, self.data, &self.xref_table, &self.object_cache) catch
486-
return self.extractTextToBuffer(page_num, allocator);
486+
return self.extractTextGeometric(page_num, allocator);
487487

488488
defer tree.deinit();
489489

490490
if (tree.root == null) {
491-
// No structure tree, fall back to stream order
492-
return self.extractTextToBuffer(page_num, allocator);
491+
// No structure tree, fall back to geometric sorting
492+
return self.extractTextGeometric(page_num, allocator);
493493
}
494494

495495
// Build page index mapping (object number -> page index)
@@ -533,17 +533,43 @@ fn ensurePageFonts(self: *Document, page_idx: usize) void {
533533
}
534534
}
535535

536-
// If we got no content from structure tree, fall back to stream order
536+
// If we got no content from structure tree, fall back to geometric sorting
537537
if (result.items.len == 0) {
538538
result.deinit(allocator);
539-
return self.extractTextToBuffer(page_num, allocator);
539+
return self.extractTextGeometric(page_num, allocator);
540540
}
541541

542542
return result.toOwnedSlice(allocator);
543543
}
544544

545-
/// Extract text to a buffer (helper for fallback)
546-
fn extractTextToBuffer(self: *Document, page_num: usize, allocator: std.mem.Allocator) ![]u8 {
545+
/// Extract text using geometric sorting (fallback when no structure tree)
546+
/// Sorts text by Y (top to bottom), then X (left to right), with two-column detection
547+
fn extractTextGeometric(self: *Document, page_num: usize, allocator: std.mem.Allocator) ![]u8 {
548+
const page = self.pages.items[page_num];
549+
const page_width = page.media_box[2] - page.media_box[0];
550+
551+
const spans = self.extractTextWithBounds(page_num, allocator) catch |err| {
552+
// If bounds extraction fails, fall back to stream order
553+
if (err == error.OutOfMemory) return err;
554+
return self.extractTextStreamOrder(page_num, allocator);
555+
};
556+
557+
if (spans.len == 0) {
558+
allocator.free(spans);
559+
return allocator.alloc(u8, 0);
560+
}
561+
defer allocator.free(spans);
562+
563+
var layout_result = layout.analyzeLayout(allocator, spans, page_width) catch {
564+
return self.extractTextStreamOrder(page_num, allocator);
565+
};
566+
defer layout_result.deinit();
567+
568+
return layout_result.getTextInOrder(allocator);
569+
}
570+
571+
/// Extract text in raw stream order (last resort fallback)
572+
fn extractTextStreamOrder(self: *Document, page_num: usize, allocator: std.mem.Allocator) ![]u8 {
547573
var output: std.ArrayList(u8) = .empty;
548574
errdefer output.deinit(allocator);
549575

0 commit comments

Comments
 (0)