@@ -7,7 +7,6 @@ using Random
77using ProgressMeter
88using Scratch
99using CSV
10- using Comonicon
1110
1211using Poppler_jll
1312using unpaper_jll
@@ -33,7 +32,7 @@ function require_extension(path, ext; exception=isinteractive())
3332 _ext = splitext (path)[2 ]
3433 _ext == ext ||
3534 argument_error (" Expected $path to have file extension `$ext `; got `$(_ext) `" ;
36- exception)
35+ exception)
3736 return nothing
3837end
3938
5352# more in charge of the cleanup, which can be good for debugging.
5453function get_scratch_dir (pdf)
5554 return joinpath (@get_scratch! (" pdf_tmps" ),
56- splitext (basename (pdf))[1 ] * " _" * string (randstring (10 )))
55+ splitext (basename (pdf))[1 ] * " _" * string (randstring (10 )))
5756end
5857
5958# https://discourse.julialang.org/t/collecting-all-output-from-shell-commands/15592/7
@@ -127,11 +126,11 @@ end
127126# I ran into "too many open files" errors otherwise
128127# (which seems weird... maybe <https://github.com/JuliaLang/julia/issues/31126>? It was on MacOS)
129128function unite_many_pdfs! (unite_progress_meter, all_logs, tmp, pdfs, output_path;
130- max_files_per_unite= 100 )
129+ max_files_per_unite= 100 )
131130 isdir (tmp) || mkdir (tmp)
132131
133132 output_paths = map (enumerate (Iterators. partition (pdfs, max_files_per_unite))) do (i,
134- current_pdfs)
133+ current_pdfs)
135134 current_output_path = joinpath (tmp, string (" section_" , i, " .pdf" ))
136135 unite_logs = unite_pdfs (current_pdfs, current_output_path)
137136 put! (all_logs, (; page= missing , unite_logs... ))
@@ -170,9 +169,9 @@ Keyword arguments:
170169Set `ENV["JULIA_DEBUG"] = SearchablePDFs` to see (many) debug messages.
171170"""
172171function ocr (pdf, output_path= string (splitext (pdf)[1 ], " _OCR" , " .pdf" ); apply_unpaper= false ,
173- ntasks= Sys. CPU_THREADS - 1 , tesseract_nthreads= 1 , pages= nothing ,
174- cleanup_after= true , cleanup_at_exit= true , tmp= get_scratch_dir (pdf),
175- verbose= true , force= false , max_files_per_unite= 100 )
172+ ntasks= Sys. CPU_THREADS - 1 , tesseract_nthreads= 1 , pages= nothing ,
173+ cleanup_after= true , cleanup_at_exit= true , tmp= get_scratch_dir (pdf),
174+ verbose= true , force= false , max_files_per_unite= 100 )
176175 isfile (pdf) || argument_error (" Input file not found at `$pdf `" ; exception= true )
177176 force || require_no_file (output_path; exception= true )
178177 require_extension (pdf, " .pdf" ; exception= true )
@@ -184,7 +183,7 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
184183 pages = total_pages
185184 elseif pages > total_pages
186185 argument_error (" `pages` must be less than the total number of pages ($(total_pages) )" ;
187- exception= true )
186+ exception= true )
188187 end
189188
190189 mkpath (tmp)
@@ -194,8 +193,8 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
194193
195194 @debug " Found file" pdf pages tmp
196195
197- all_logs = Channel{@NamedTuple {page:: Union{Int,UnitRange{Int},Missing} ,binary:: String ,
198- stdout :: String ,stderr :: String ,code:: Int }}(Inf )
196+ all_logs = Channel{@NamedTuple {page:: Union{Int,UnitRange{Int},Missing} , binary:: String ,
197+ stdout :: String , stderr :: String , code:: Int }}(Inf )
199198
200199 @debug " Generating images..."
201200 imag_prog = Progress (pages; desc= " (1/3) Extracting images: " , enabled= verbose)
@@ -225,9 +224,9 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
225224 @debug " Finished processing pages. Uniting..."
226225 unite_dir = joinpath (tmp, " unite" )
227226 unite_progress_meter = Progress (pages + cld (pages, max_files_per_unite) + 1 ;
228- desc= " (3/3) Collecting pages: " , enabled= verbose)
227+ desc= " (3/3) Collecting pages: " , enabled= verbose)
229228 unite_many_pdfs! (unite_progress_meter, all_logs, unite_dir, pdfs, output_path;
230- max_files_per_unite)
229+ max_files_per_unite)
231230 @debug " Done uniting pdfs"
232231 if cleanup_after
233232 @debug " Cleaning up"
@@ -248,13 +247,13 @@ end
248247"""
249248Create a searchable version of a PDF.
250249"""
251- @main function searchable (input_pdf:: String ,
252- output_path:: String = string (splitext (input_pdf)[1 ], " _OCR" ,
253- " .pdf" ); apply_unpaper:: Bool = false ,
254- ntasks:: Int = Sys. CPU_THREADS - 1 , tesseract_nthreads:: Int = 1 ,
255- keep_intermediates:: Bool = false ,
256- tmp:: String = get_scratch_dir (input_pdf), quiet:: Bool = false ,
257- logfile:: Union{Nothing,String} = nothing , force:: Bool = false )
250+ function searchable (input_pdf:: String ,
251+ output_path:: String = string (splitext (input_pdf)[1 ], " _OCR" ,
252+ " .pdf" ); apply_unpaper:: Bool = false ,
253+ ntasks:: Int = Sys. CPU_THREADS - 1 , tesseract_nthreads:: Int = 1 ,
254+ keep_intermediates:: Bool = false ,
255+ tmp:: String = get_scratch_dir (input_pdf), quiet:: Bool = false ,
256+ logfile:: Union{Nothing,String} = nothing , force:: Bool = false )
258257 # some of these are redundant with checks inside `ocr`; that's because we want to do them before the "Starting to ocr" message,
259258 # and we want them to exit if they fail in a non-interactive context, instead of printing a stacktrace.
260259 isfile (input_pdf) || argument_error (" Input file not found at `$(input_pdf) `" )
@@ -269,8 +268,8 @@ Create a searchable version of a PDF.
269268 verbose &&
270269 println (" Starting to ocr `$(input_pdf) `; result will be located at `$(output_path) `." )
271270 result = ocr (input_pdf, output_path; apply_unpaper, ntasks, tesseract_nthreads,
272- cleanup_after= ! keep_intermediates, cleanup_at_exit= ! keep_intermediates,
273- tmp, verbose)
271+ cleanup_after= ! keep_intermediates, cleanup_at_exit= ! keep_intermediates,
272+ tmp, verbose)
274273 verbose && println (" \n Output is located at `$(output_path) `." )
275274 if keep_intermediates && verbose
276275 println (" Intermediate files located at `$tmp `." )
0 commit comments