@@ -11,7 +11,6 @@ using CSV
1111using DocOpt
1212
1313using Poppler_jll
14- using unpaper_jll
1514using Tesseract_jll
1615
1716export ocr
@@ -117,14 +116,6 @@ function get_images(pdf, page_range::UnitRange{Int}, tmp, total_pages; exit_on_e
117116 return paths, (; binary= " pdftoppm" , logs... )
118117end
119118
120- # Clean up an image with unpaper
121- function unpaper (img; exit_on_error)
122- img_base, img_ext = splitext (img)
123- img_unpaper = img_base * " _unpaper" * img_ext
124- logs = run_and_collect_logs (` $(unpaper_jll. unpaper ()) $img $img_unpaper ` ; exit_on_error)
125- return (; img_unpaper, logs= (; binary= " unpaper" , logs... ))
126- end
127-
128119# ####
129120# #### Step 2: Use tesseract to generate a one-page searchable PDF from an image
130121# ####
178169# ####
179170
180171"""
181- ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_unpaper=false,
172+ ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf");
182173 ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
183174 cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
184175 verbose=true)
@@ -189,14 +180,13 @@ Keyword arguments:
189180
190181* `ntasks`: how many parallel tasks to use for launching `tesseract` and `pdftoppm`.
191182* `tesseract_nthreads`: how many threads to direct Tesseract to use
192- * `apply_unpaper`: whether or not to apply `unpaper` to try to improve the image quality
193183* `tmp`: a directory to store intermediate files. This directory is deleted at the end of the function if `cleanup_after` is set to `true`, and when the Julia session is ended if `cleanup_at_exit` is set to `true`.
194184* `pages=nothing`: the number of pages of the PDF to process; the default of `nothing` indicates all pages in the PDF. It can help in debugging to set this to something small.
195185* `verbose`: show a progress bar for each step of the process.
196186
197187Set `ENV["JULIA_DEBUG"] = SearchablePDFs` to see (many) debug messages.
198188"""
199- function ocr (pdf, output_path= string (splitext (pdf)[1 ], " _OCR" , " .pdf" ); apply_unpaper = false ,
189+ function ocr (pdf, output_path= string (splitext (pdf)[1 ], " _OCR" , " .pdf" );
200190 ntasks= Sys. CPU_THREADS - 1 , tesseract_nthreads= 1 , pages= nothing ,
201191 cleanup_after= true , cleanup_at_exit= true , tmp= get_scratch_dir (pdf),
202192 verbose= true , force= false , max_files_per_unite= 100 ,
@@ -240,10 +230,6 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
240230 ocr_prog = Progress (pages; desc= " (2/3) OCRing: " , enabled= verbose)
241231 pdfs = asyncmap (enumerate (img_paths); ntasks) do (page, img)
242232 @debug " img" page img
243- if apply_unpaper
244- img, unpaper_logs = unpaper (img; exit_on_error)
245- put! (all_logs, (; page, unpaper_logs... ))
246- end
247233 pdf, tesseract_logs = make_pdf (img; tesseract_nthreads, exit_on_error)
248234 put! (all_logs, (; page, tesseract_logs... ))
249235 next! (ocr_prog)
@@ -272,19 +258,16 @@ end
272258# #### CLI interface
273259# ####
274260
275- CAN_USE_UNPAPER:: Bool = unpaper_jll. is_available () && Sys. ARCH != :aarch64
276-
277261doc:: String = """ Searchable PDFs (OCR).
278262
279263Usage:
280- searchable-pdf <input_pdf> [<output_path>] [--apply_unpaper] [-- keep_intermediates] [--quiet] [--force] [--logfile=<logfile>] [--tmp=<tmp>] [-n=<ntasks>] [-t=<tesseract_nthreads>]
264+ searchable-pdf <input_pdf> [<output_path>] [--keep_intermediates] [--quiet] [--force] [--logfile=<logfile>] [--tmp=<tmp>] [-n=<ntasks>] [-t=<tesseract_nthreads>]
281265 searchable-pdf -h | --help
282266 searchable-pdf --version
283267
284268Options:
285269 -h --help Show this screen.
286270 --version Show version.
287- --apply_unpaper todo
288271 --keep_intermediates xyz
289272 --quiet todo
290273 --force todo
@@ -310,7 +293,6 @@ function main(args=ARGS)
310293
311294 tesseract_nthreads = parse (Int, parsed[" --tesseract_nthreads" ])
312295 result = _main (input_pdf, output_path;
313- apply_unpaper= parsed[" --apply_unpaper" ],
314296 ntasks,
315297 tesseract_nthreads,
316298 keep_intermediates= parsed[" --keep_intermediates" ],
@@ -334,19 +316,12 @@ end
334316
335317function _main (input_pdf:: String ,
336318 output_path:: String = string (splitext (input_pdf)[1 ], " _OCR" ,
337- " .pdf" ); apply_unpaper :: Bool = false ,
319+ " .pdf" );
338320 ntasks:: Int = Sys. CPU_THREADS - 1 , tesseract_nthreads:: Int = 1 ,
339321 keep_intermediates:: Bool = false ,
340322 tmp:: String = get_scratch_dir (input_pdf), quiet:: Bool = false ,
341323 logfile:: Union{Nothing,String} = nothing , force:: Bool = false ,
342324 exit_on_error= false )
343- if apply_unpaper && ! CAN_USE_UNPAPER
344- if Sys. ARCH == :aarch64
345- argument_error (" Cannot use `unpaper` on `aarch64` systems" ; exit_on_error)
346- else
347- argument_error (" `unpaper` is not available on this system" ; exit_on_error)
348- end
349- end
350325 # some of these are redundant with checks inside `ocr`; that's because we want to do them before the "Starting to ocr" message,
351326 # and we want them to exit if they fail in a non-interactive context, instead of printing a stacktrace.
352327 isfile (input_pdf) ||
@@ -361,7 +336,7 @@ function _main(input_pdf::String,
361336 verbose = ! quiet
362337 verbose &&
363338 println (" Starting to ocr `$(input_pdf) `; result will be located at `$(output_path) `." )
364- result = ocr (input_pdf, output_path; apply_unpaper, ntasks, tesseract_nthreads,
339+ result = ocr (input_pdf, output_path; ntasks, tesseract_nthreads,
365340 cleanup_after= ! keep_intermediates, cleanup_at_exit= ! keep_intermediates,
366341 tmp, verbose, force, exit_on_error)
367342 verbose && println (" \n Output is located at `$(output_path) `." )
0 commit comments