@@ -50,28 +50,15 @@ export fn zpdf_extract_page(handle: ?*ZpdfDocument, page_num: c_int, out_len: *u
5050 return null ;
5151}
5252
53+ /// Extract text from all pages in reading order
54+ /// Uses structure tree when available, falls back to geometric sorting
5355export fn zpdf_extract_all (handle : ? * ZpdfDocument , out_len : * usize ) ? [* ]u8 {
54- if (handle ) | h | {
55- const doc : * zpdf.Document = @ptrCast (@alignCast (h ));
56-
57- var buffer : std .ArrayList (u8 ) = .empty ;
58- doc .extractAllText (buffer .writer (c_allocator )) catch return null ;
59-
60- const slice = buffer .toOwnedSlice (c_allocator ) catch return null ;
61- out_len .* = slice .len ;
62- return slice .ptr ;
63- }
64- return null ;
56+ return zpdf_extract_all_reading_order (handle , out_len );
6557}
6658
59+ /// Alias for zpdf_extract_all (parallel is deprecated, uses sequential)
6760export fn zpdf_extract_all_parallel (handle : ? * ZpdfDocument , out_len : * usize ) ? [* ]u8 {
68- if (handle ) | h | {
69- const doc : * zpdf.Document = @ptrCast (@alignCast (h ));
70- const result = doc .extractAllTextParallel (c_allocator ) catch return null ;
71- out_len .* = result .len ;
72- return result .ptr ;
73- }
74- return null ;
61+ return zpdf_extract_all_reading_order (handle , out_len );
7562}
7663
7764export fn zpdf_free_buffer (ptr : ? [* ]u8 , len : usize ) void {
@@ -171,311 +158,18 @@ export fn zpdf_extract_page_reading_order(handle: ?*ZpdfDocument, page_num: c_in
171158}
172159
173160/// Extract text from all pages in reading order (sequential)
161+ /// Uses structure tree when available, falls back to geometric sorting
174162export fn zpdf_extract_all_reading_order (handle : ? * ZpdfDocument , out_len : * usize ) ? [* ]u8 {
175163 if (handle ) | h | {
176164 const doc : * zpdf.Document = @ptrCast (@alignCast (h ));
177- const num_pages = doc .pages .items .len ;
178- if (num_pages == 0 ) {
179- out_len .* = 0 ;
180- return null ;
181- }
182-
183- var result : std .ArrayList (u8 ) = .empty ;
184- errdefer result .deinit (c_allocator );
185-
186- for (0.. num_pages ) | page_idx | {
187- if (page_idx > 0 ) {
188- result .append (c_allocator , '\x0c ' ) catch continue ; // Form feed between pages
189- }
190-
191- const page = doc .pages .items [page_idx ];
192- const page_width = page .media_box [2 ] - page .media_box [0 ];
193-
194- const spans = doc .extractTextWithBounds (page_idx , c_allocator ) catch continue ;
195- if (spans .len == 0 ) continue ;
196- defer c_allocator .free (spans );
197-
198- var layout_result = zpdf .layout .analyzeLayout (c_allocator , spans , page_width ) catch continue ;
199- defer layout_result .deinit ();
200-
201- const text = layout_result .getTextInOrder (c_allocator ) catch continue ;
202- defer c_allocator .free (text );
203-
204- result .appendSlice (c_allocator , text ) catch continue ;
205- }
206-
207- const slice = result .toOwnedSlice (c_allocator ) catch return null ;
208- out_len .* = slice .len ;
209- return slice .ptr ;
165+ const result = doc .extractAllTextStructured (c_allocator ) catch return null ;
166+ out_len .* = result .len ;
167+ return result .ptr ;
210168 }
211169 return null ;
212170}
213171
214- /// Extract text from all pages in reading order (parallel)
215- /// On WASM, falls back to sequential extraction since threads are not available.
172+ /// Alias for zpdf_extract_all_reading_order (parallel is deprecated)
216173export fn zpdf_extract_all_reading_order_parallel (handle : ? * ZpdfDocument , out_len : * usize ) ? [* ]u8 {
217- if (comptime is_wasm ) {
218- // WASM doesn't support threads, fall back to sequential
219- return zpdf_extract_all_reading_order (handle , out_len );
220- }
221-
222- if (handle ) | h | {
223- const doc : * zpdf.Document = @ptrCast (@alignCast (h ));
224- const num_pages = doc .pages .items .len ;
225- if (num_pages == 0 ) {
226- out_len .* = 0 ;
227- return null ;
228- }
229-
230- // Allocate result buffers for each page
231- const results = c_allocator .alloc ([]u8 , num_pages ) catch return null ;
232- defer c_allocator .free (results );
233- @memset (results , &[_ ]u8 {});
234-
235- const Thread = std .Thread ;
236- const cpu_count = Thread .getCpuCount () catch 4 ;
237- const num_threads : usize = @min (num_pages , @min (cpu_count , 8 ));
238-
239- const Context = struct {
240- doc : * zpdf.Document ,
241- results : [][]u8 ,
242- };
243-
244- const ctx = Context {
245- .doc = doc ,
246- .results = results ,
247- };
248-
249- const worker = struct {
250- fn run (c : Context , start : usize , end : usize ) void {
251- // Thread-local arena for all allocations
252- var arena = std .heap .ArenaAllocator .init (c_allocator );
253- defer arena .deinit ();
254- const thread_alloc = arena .allocator ();
255-
256- // Thread-local object cache (required for thread safety)
257- var local_cache = std .AutoHashMap (u32 , zpdf .Object ).init (thread_alloc );
258- defer local_cache .deinit ();
259-
260- for (start .. end ) | page_idx | {
261- const page = c .doc .pages .items [page_idx ];
262- const page_width = page .media_box [2 ] - page .media_box [0 ];
263-
264- // Get content stream with thread-local cache
265- const content = zpdf .pagetree .getPageContents (
266- thread_alloc ,
267- c .doc .data ,
268- & c .doc .xref_table ,
269- page ,
270- & local_cache ,
271- ) catch continue ;
272-
273- if (content .len == 0 ) continue ;
274-
275- // Extract spans with bounds
276- var collector = zpdf .interpreter .SpanCollector .init (c_allocator );
277- extractTextFromContentWithBoundsLocal (content , & collector ) catch continue ;
278- collector .flush () catch continue ;
279-
280- const spans = collector .spans .toOwnedSlice (c_allocator ) catch continue ;
281- if (spans .len == 0 ) continue ;
282- defer c_allocator .free (spans );
283-
284- // Analyze layout
285- var layout_result = zpdf .layout .analyzeLayout (c_allocator , spans , page_width ) catch continue ;
286- defer layout_result .deinit ();
287-
288- const text = layout_result .getTextInOrder (c_allocator ) catch continue ;
289- c .results [page_idx ] = text ;
290- }
291- }
292- }.run ;
293-
294- // Spawn threads
295- var threads : [8 ]? Thread = [_ ]? Thread {null } ** 8 ;
296- const pages_per_thread = (num_pages + num_threads - 1 ) / num_threads ;
297-
298- for (0.. num_threads ) | i | {
299- const start = i * pages_per_thread ;
300- const end = @min (start + pages_per_thread , num_pages );
301- if (start < end ) {
302- threads [i ] = Thread .spawn (.{}, worker , .{ ctx , start , end }) catch null ;
303- }
304- }
305-
306- // Wait for all threads
307- for (& threads ) | * t | {
308- if (t .* ) | thread | thread .join ();
309- }
310-
311- // Calculate total size
312- var total_size : usize = 0 ;
313- var non_empty_count : usize = 0 ;
314- for (results ) | r | {
315- if (r .len > 0 ) {
316- total_size += r .len ;
317- non_empty_count += 1 ;
318- }
319- }
320- if (non_empty_count > 1 ) {
321- total_size += non_empty_count - 1 ; // separators
322- }
323-
324- if (total_size == 0 ) {
325- out_len .* = 0 ;
326- return null ;
327- }
328-
329- var output = c_allocator .alloc (u8 , total_size ) catch return null ;
330- var pos : usize = 0 ;
331- var first_written = false ;
332- for (results ) | r | {
333- if (r .len > 0 ) {
334- if (first_written ) {
335- output [pos ] = '\x0c ' ;
336- pos += 1 ;
337- }
338- @memcpy (output [pos .. ][0.. r .len ], r );
339- pos += r .len ;
340- c_allocator .free (r );
341- first_written = true ;
342- }
343- }
344-
345- out_len .* = pos ;
346- return output .ptr ;
347- }
348- return null ;
349- }
350-
351- /// Local version of content extraction with bounds (for thread safety)
352- fn extractTextFromContentWithBoundsLocal (content : []const u8 , collector : * zpdf.interpreter.SpanCollector ) ! void {
353- var lexer = zpdf .interpreter .ContentLexer .init (collector .allocator , content );
354- var operands : [64 ]zpdf.interpreter.Operand = undefined ;
355- var operand_count : usize = 0 ;
356-
357- var current_x : f64 = 0 ;
358- var current_y : f64 = 0 ;
359- var font_size : f64 = 12 ;
360-
361- while (try lexer .next ()) | token | {
362- switch (token ) {
363- .number = > | n | {
364- if (operand_count < 64 ) {
365- operands [operand_count ] = .{ .number = n };
366- operand_count += 1 ;
367- }
368- },
369- .string = > | s | {
370- if (operand_count < 64 ) {
371- operands [operand_count ] = .{ .string = s };
372- operand_count += 1 ;
373- }
374- },
375- .hex_string = > | s | {
376- if (operand_count < 64 ) {
377- operands [operand_count ] = .{ .hex_string = s };
378- operand_count += 1 ;
379- }
380- },
381- .name = > | n | {
382- if (operand_count < 64 ) {
383- operands [operand_count ] = .{ .name = n };
384- operand_count += 1 ;
385- }
386- },
387- .array = > | arr | {
388- if (operand_count < 64 ) {
389- operands [operand_count ] = .{ .array = arr };
390- operand_count += 1 ;
391- }
392- },
393- .operator = > | op | {
394- if (op .len > 0 ) switch (op [0 ]) {
395- 'T' = > if (op .len == 2 ) switch (op [1 ]) {
396- 'f' = > if (operand_count >= 2 ) {
397- font_size = operands [1 ].number ;
398- collector .setFontSize (font_size );
399- },
400- 'd' , 'D' = > if (operand_count >= 2 ) {
401- current_x += operands [0 ].number ;
402- current_y += operands [1 ].number ;
403- try collector .flush ();
404- collector .setPosition (current_x , current_y );
405- },
406- 'm' = > if (operand_count >= 6 ) {
407- current_x = operands [4 ].number ;
408- current_y = operands [5 ].number ;
409- try collector .flush ();
410- collector .setPosition (current_x , current_y );
411- },
412- '*' = > {
413- try collector .flush ();
414- },
415- 'j' = > if (operand_count >= 1 ) {
416- try writeTextOperandLocal (operands [0 ], collector );
417- },
418- 'J' = > if (operand_count >= 1 ) {
419- try writeTJArrayWithBoundsLocal (operands [0 ], collector );
420- },
421- else = > {},
422- },
423- '\' ' = > if (operand_count >= 1 ) {
424- try collector .flush ();
425- try writeTextOperandLocal (operands [0 ], collector );
426- },
427- '"' = > if (operand_count >= 3 ) {
428- try collector .flush ();
429- try writeTextOperandLocal (operands [2 ], collector );
430- },
431- else = > {},
432- };
433- operand_count = 0 ;
434- },
435- }
436- }
437- }
438-
439- fn writeTextOperandLocal (operand : zpdf.interpreter.Operand , collector : * zpdf.interpreter.SpanCollector ) ! void {
440- const data = switch (operand ) {
441- .string = > | s | s ,
442- .hex_string = > | s | s ,
443- else = > return ,
444- };
445-
446- for (data ) | byte | {
447- if (byte >= 32 and byte < 127 ) {
448- try collector .writeByte (byte );
449- } else if (byte == 0 ) {
450- // CID separator
451- } else {
452- const codepoint = zpdf .encoding .win_ansi_encoding [byte ];
453- if (codepoint != 0 and codepoint < 128 ) {
454- try collector .writeByte (@truncate (codepoint ));
455- } else if (codepoint != 0 ) {
456- var buf : [4 ]u8 = undefined ;
457- const len = std .unicode .utf8Encode (codepoint , & buf ) catch 1 ;
458- try collector .writeAll (buf [0.. len ]);
459- }
460- }
461- }
462- }
463-
464- fn writeTJArrayWithBoundsLocal (operand : zpdf.interpreter.Operand , collector : * zpdf.interpreter.SpanCollector ) ! void {
465- const arr = switch (operand ) {
466- .array = > | a | a ,
467- else = > return ,
468- };
469-
470- for (arr ) | item | {
471- switch (item ) {
472- .string , .hex_string = > try writeTextOperandLocal (item , collector ),
473- .number = > | n | {
474- if (n < -100 ) {
475- try collector .flush ();
476- }
477- },
478- else = > {},
479- }
480- }
174+ return zpdf_extract_all_reading_order (handle , out_len );
481175}
0 commit comments