Skip to content

Commit 0a9614d

Browse files
committed
refactor: simplify API to use reading order by default
- Remove parallel extraction option (sequential is still 5x faster)
- All extract functions now use reading order (structure tree + geometric fallback)
- Add sortGeometric to match PyMuPDF's simple Y→X sorting algorithm
- Clean up 248 lines of unused parallel code from C API
- Simplify Python bindings: extract_all() takes no parameters
1 parent ed32c91 commit 0a9614d

File tree

5 files changed

+95
-343
lines changed

5 files changed

+95
-343
lines changed

benchmark/verapdf_bench.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def find_pdfs():
3737
return list(CORPUS_DIR.rglob("*.pdf"))
3838

3939
def extract_zpdf(pdf_path):
40-
"""Extract text using zpdf Python bindings."""
40+
"""Extract text using zpdf Python bindings (reading order by default)."""
4141
if not HAS_ZPDF:
4242
return ""
4343
try:

python/zpdf/__init__.py

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -130,28 +130,15 @@ def extract_page(self, page_num: int, reading_order: bool = False) -> str:
130130
finally:
131131
lib.zpdf_free_buffer(buf_ptr, out_len[0])
132132

133-
def extract_all(self, parallel: bool = True, reading_order: bool = False) -> str:
134-
"""Extract text from all pages.
133+
def extract_all(self) -> str:
134+
"""Extract text from all pages in reading order.
135135
136-
Args:
137-
parallel: If True (default), use multi-threaded extraction for speed.
138-
reading_order: If True, returns text in visual reading order
139-
(left-to-right, top-to-bottom with column detection).
140-
If False (default), returns text in PDF stream order.
136+
Uses structure tree when available, falls back to geometric sorting (Y→X).
141137
"""
142138
self._check_open()
143139
out_len = ffi.new("size_t*")
144140

145-
if reading_order:
146-
if parallel:
147-
buf_ptr = lib.zpdf_extract_all_reading_order_parallel(self._handle, out_len)
148-
else:
149-
buf_ptr = lib.zpdf_extract_all_reading_order(self._handle, out_len)
150-
else:
151-
if parallel:
152-
buf_ptr = lib.zpdf_extract_all_parallel(self._handle, out_len)
153-
else:
154-
buf_ptr = lib.zpdf_extract_all(self._handle, out_len)
141+
buf_ptr = lib.zpdf_extract_all_reading_order(self._handle, out_len)
155142

156143
if buf_ptr == ffi.NULL:
157144
if out_len[0] == 0:

src/capi.zig

Lines changed: 11 additions & 317 deletions
Original file line numberDiff line numberDiff line change
@@ -50,28 +50,15 @@ export fn zpdf_extract_page(handle: ?*ZpdfDocument, page_num: c_int, out_len: *u
5050
return null;
5151
}
5252

53+
/// Extract text from all pages in reading order
54+
/// Uses structure tree when available, falls back to geometric sorting
5355
export fn zpdf_extract_all(handle: ?*ZpdfDocument, out_len: *usize) ?[*]u8 {
54-
if (handle) |h| {
55-
const doc: *zpdf.Document = @ptrCast(@alignCast(h));
56-
57-
var buffer: std.ArrayList(u8) = .empty;
58-
doc.extractAllText(buffer.writer(c_allocator)) catch return null;
59-
60-
const slice = buffer.toOwnedSlice(c_allocator) catch return null;
61-
out_len.* = slice.len;
62-
return slice.ptr;
63-
}
64-
return null;
56+
return zpdf_extract_all_reading_order(handle, out_len);
6557
}
6658

59+
/// Alias for zpdf_extract_all (parallel is deprecated, uses sequential)
6760
export fn zpdf_extract_all_parallel(handle: ?*ZpdfDocument, out_len: *usize) ?[*]u8 {
68-
if (handle) |h| {
69-
const doc: *zpdf.Document = @ptrCast(@alignCast(h));
70-
const result = doc.extractAllTextParallel(c_allocator) catch return null;
71-
out_len.* = result.len;
72-
return result.ptr;
73-
}
74-
return null;
61+
return zpdf_extract_all_reading_order(handle, out_len);
7562
}
7663

7764
export fn zpdf_free_buffer(ptr: ?[*]u8, len: usize) void {
@@ -171,311 +158,18 @@ export fn zpdf_extract_page_reading_order(handle: ?*ZpdfDocument, page_num: c_in
171158
}
172159

173160
/// Extract text from all pages in reading order (sequential)
161+
/// Uses structure tree when available, falls back to geometric sorting
174162
export fn zpdf_extract_all_reading_order(handle: ?*ZpdfDocument, out_len: *usize) ?[*]u8 {
175163
if (handle) |h| {
176164
const doc: *zpdf.Document = @ptrCast(@alignCast(h));
177-
const num_pages = doc.pages.items.len;
178-
if (num_pages == 0) {
179-
out_len.* = 0;
180-
return null;
181-
}
182-
183-
var result: std.ArrayList(u8) = .empty;
184-
errdefer result.deinit(c_allocator);
185-
186-
for (0..num_pages) |page_idx| {
187-
if (page_idx > 0) {
188-
result.append(c_allocator, '\x0c') catch continue; // Form feed between pages
189-
}
190-
191-
const page = doc.pages.items[page_idx];
192-
const page_width = page.media_box[2] - page.media_box[0];
193-
194-
const spans = doc.extractTextWithBounds(page_idx, c_allocator) catch continue;
195-
if (spans.len == 0) continue;
196-
defer c_allocator.free(spans);
197-
198-
var layout_result = zpdf.layout.analyzeLayout(c_allocator, spans, page_width) catch continue;
199-
defer layout_result.deinit();
200-
201-
const text = layout_result.getTextInOrder(c_allocator) catch continue;
202-
defer c_allocator.free(text);
203-
204-
result.appendSlice(c_allocator, text) catch continue;
205-
}
206-
207-
const slice = result.toOwnedSlice(c_allocator) catch return null;
208-
out_len.* = slice.len;
209-
return slice.ptr;
165+
const result = doc.extractAllTextStructured(c_allocator) catch return null;
166+
out_len.* = result.len;
167+
return result.ptr;
210168
}
211169
return null;
212170
}
213171

214-
/// Extract text from all pages in reading order (parallel)
215-
/// On WASM, falls back to sequential extraction since threads are not available.
172+
/// Alias for zpdf_extract_all_reading_order (parallel is deprecated)
216173
export fn zpdf_extract_all_reading_order_parallel(handle: ?*ZpdfDocument, out_len: *usize) ?[*]u8 {
217-
if (comptime is_wasm) {
218-
// WASM doesn't support threads, fall back to sequential
219-
return zpdf_extract_all_reading_order(handle, out_len);
220-
}
221-
222-
if (handle) |h| {
223-
const doc: *zpdf.Document = @ptrCast(@alignCast(h));
224-
const num_pages = doc.pages.items.len;
225-
if (num_pages == 0) {
226-
out_len.* = 0;
227-
return null;
228-
}
229-
230-
// Allocate result buffers for each page
231-
const results = c_allocator.alloc([]u8, num_pages) catch return null;
232-
defer c_allocator.free(results);
233-
@memset(results, &[_]u8{});
234-
235-
const Thread = std.Thread;
236-
const cpu_count = Thread.getCpuCount() catch 4;
237-
const num_threads: usize = @min(num_pages, @min(cpu_count, 8));
238-
239-
const Context = struct {
240-
doc: *zpdf.Document,
241-
results: [][]u8,
242-
};
243-
244-
const ctx = Context{
245-
.doc = doc,
246-
.results = results,
247-
};
248-
249-
const worker = struct {
250-
fn run(c: Context, start: usize, end: usize) void {
251-
// Thread-local arena for all allocations
252-
var arena = std.heap.ArenaAllocator.init(c_allocator);
253-
defer arena.deinit();
254-
const thread_alloc = arena.allocator();
255-
256-
// Thread-local object cache (required for thread safety)
257-
var local_cache = std.AutoHashMap(u32, zpdf.Object).init(thread_alloc);
258-
defer local_cache.deinit();
259-
260-
for (start..end) |page_idx| {
261-
const page = c.doc.pages.items[page_idx];
262-
const page_width = page.media_box[2] - page.media_box[0];
263-
264-
// Get content stream with thread-local cache
265-
const content = zpdf.pagetree.getPageContents(
266-
thread_alloc,
267-
c.doc.data,
268-
&c.doc.xref_table,
269-
page,
270-
&local_cache,
271-
) catch continue;
272-
273-
if (content.len == 0) continue;
274-
275-
// Extract spans with bounds
276-
var collector = zpdf.interpreter.SpanCollector.init(c_allocator);
277-
extractTextFromContentWithBoundsLocal(content, &collector) catch continue;
278-
collector.flush() catch continue;
279-
280-
const spans = collector.spans.toOwnedSlice(c_allocator) catch continue;
281-
if (spans.len == 0) continue;
282-
defer c_allocator.free(spans);
283-
284-
// Analyze layout
285-
var layout_result = zpdf.layout.analyzeLayout(c_allocator, spans, page_width) catch continue;
286-
defer layout_result.deinit();
287-
288-
const text = layout_result.getTextInOrder(c_allocator) catch continue;
289-
c.results[page_idx] = text;
290-
}
291-
}
292-
}.run;
293-
294-
// Spawn threads
295-
var threads: [8]?Thread = [_]?Thread{null} ** 8;
296-
const pages_per_thread = (num_pages + num_threads - 1) / num_threads;
297-
298-
for (0..num_threads) |i| {
299-
const start = i * pages_per_thread;
300-
const end = @min(start + pages_per_thread, num_pages);
301-
if (start < end) {
302-
threads[i] = Thread.spawn(.{}, worker, .{ ctx, start, end }) catch null;
303-
}
304-
}
305-
306-
// Wait for all threads
307-
for (&threads) |*t| {
308-
if (t.*) |thread| thread.join();
309-
}
310-
311-
// Calculate total size
312-
var total_size: usize = 0;
313-
var non_empty_count: usize = 0;
314-
for (results) |r| {
315-
if (r.len > 0) {
316-
total_size += r.len;
317-
non_empty_count += 1;
318-
}
319-
}
320-
if (non_empty_count > 1) {
321-
total_size += non_empty_count - 1; // separators
322-
}
323-
324-
if (total_size == 0) {
325-
out_len.* = 0;
326-
return null;
327-
}
328-
329-
var output = c_allocator.alloc(u8, total_size) catch return null;
330-
var pos: usize = 0;
331-
var first_written = false;
332-
for (results) |r| {
333-
if (r.len > 0) {
334-
if (first_written) {
335-
output[pos] = '\x0c';
336-
pos += 1;
337-
}
338-
@memcpy(output[pos..][0..r.len], r);
339-
pos += r.len;
340-
c_allocator.free(r);
341-
first_written = true;
342-
}
343-
}
344-
345-
out_len.* = pos;
346-
return output.ptr;
347-
}
348-
return null;
349-
}
350-
351-
/// Local version of content extraction with bounds (for thread safety)
352-
fn extractTextFromContentWithBoundsLocal(content: []const u8, collector: *zpdf.interpreter.SpanCollector) !void {
353-
var lexer = zpdf.interpreter.ContentLexer.init(collector.allocator, content);
354-
var operands: [64]zpdf.interpreter.Operand = undefined;
355-
var operand_count: usize = 0;
356-
357-
var current_x: f64 = 0;
358-
var current_y: f64 = 0;
359-
var font_size: f64 = 12;
360-
361-
while (try lexer.next()) |token| {
362-
switch (token) {
363-
.number => |n| {
364-
if (operand_count < 64) {
365-
operands[operand_count] = .{ .number = n };
366-
operand_count += 1;
367-
}
368-
},
369-
.string => |s| {
370-
if (operand_count < 64) {
371-
operands[operand_count] = .{ .string = s };
372-
operand_count += 1;
373-
}
374-
},
375-
.hex_string => |s| {
376-
if (operand_count < 64) {
377-
operands[operand_count] = .{ .hex_string = s };
378-
operand_count += 1;
379-
}
380-
},
381-
.name => |n| {
382-
if (operand_count < 64) {
383-
operands[operand_count] = .{ .name = n };
384-
operand_count += 1;
385-
}
386-
},
387-
.array => |arr| {
388-
if (operand_count < 64) {
389-
operands[operand_count] = .{ .array = arr };
390-
operand_count += 1;
391-
}
392-
},
393-
.operator => |op| {
394-
if (op.len > 0) switch (op[0]) {
395-
'T' => if (op.len == 2) switch (op[1]) {
396-
'f' => if (operand_count >= 2) {
397-
font_size = operands[1].number;
398-
collector.setFontSize(font_size);
399-
},
400-
'd', 'D' => if (operand_count >= 2) {
401-
current_x += operands[0].number;
402-
current_y += operands[1].number;
403-
try collector.flush();
404-
collector.setPosition(current_x, current_y);
405-
},
406-
'm' => if (operand_count >= 6) {
407-
current_x = operands[4].number;
408-
current_y = operands[5].number;
409-
try collector.flush();
410-
collector.setPosition(current_x, current_y);
411-
},
412-
'*' => {
413-
try collector.flush();
414-
},
415-
'j' => if (operand_count >= 1) {
416-
try writeTextOperandLocal(operands[0], collector);
417-
},
418-
'J' => if (operand_count >= 1) {
419-
try writeTJArrayWithBoundsLocal(operands[0], collector);
420-
},
421-
else => {},
422-
},
423-
'\'' => if (operand_count >= 1) {
424-
try collector.flush();
425-
try writeTextOperandLocal(operands[0], collector);
426-
},
427-
'"' => if (operand_count >= 3) {
428-
try collector.flush();
429-
try writeTextOperandLocal(operands[2], collector);
430-
},
431-
else => {},
432-
};
433-
operand_count = 0;
434-
},
435-
}
436-
}
437-
}
438-
439-
fn writeTextOperandLocal(operand: zpdf.interpreter.Operand, collector: *zpdf.interpreter.SpanCollector) !void {
440-
const data = switch (operand) {
441-
.string => |s| s,
442-
.hex_string => |s| s,
443-
else => return,
444-
};
445-
446-
for (data) |byte| {
447-
if (byte >= 32 and byte < 127) {
448-
try collector.writeByte(byte);
449-
} else if (byte == 0) {
450-
// CID separator
451-
} else {
452-
const codepoint = zpdf.encoding.win_ansi_encoding[byte];
453-
if (codepoint != 0 and codepoint < 128) {
454-
try collector.writeByte(@truncate(codepoint));
455-
} else if (codepoint != 0) {
456-
var buf: [4]u8 = undefined;
457-
const len = std.unicode.utf8Encode(codepoint, &buf) catch 1;
458-
try collector.writeAll(buf[0..len]);
459-
}
460-
}
461-
}
462-
}
463-
464-
fn writeTJArrayWithBoundsLocal(operand: zpdf.interpreter.Operand, collector: *zpdf.interpreter.SpanCollector) !void {
465-
const arr = switch (operand) {
466-
.array => |a| a,
467-
else => return,
468-
};
469-
470-
for (arr) |item| {
471-
switch (item) {
472-
.string, .hex_string => try writeTextOperandLocal(item, collector),
473-
.number => |n| {
474-
if (n < -100) {
475-
try collector.flush();
476-
}
477-
},
478-
else => {},
479-
}
480-
}
174+
return zpdf_extract_all_reading_order(handle, out_len);
481175
}

0 commit comments

Comments
 (0)