Skip to content

Commit 9bdab46

Browse files
committed
fixes & format
1 parent 90ba0eb commit 9bdab46

File tree

4 files changed

+24
-43
lines changed

4 files changed

+24
-43
lines changed

Comonicon.toml

Lines changed: 0 additions & 12 deletions
This file was deleted.

Project.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ version = "0.1.0"
55

66
[deps]
77
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
8-
Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542"
98
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
109
Poppler_jll = "9c32591e-4766-534b-9725-b71a8799265b"
1110
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -16,8 +15,7 @@ unpaper_jll = "d52248c9-e08a-51c2-9066-05d0bf3e6245"
1615

1716
[compat]
1817
Aqua = "0.5"
19-
CSV = "0.8, 0.9"
20-
Comonicon = "0.11"
18+
CSV = "0.10"
2119
Poppler_jll = "21.9"
2220
ProgressMeter = "1.5"
2321
Scratch = "1"

README.md

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,10 +42,6 @@ Next steps:
4242
using SearchablePDFs
4343
file = ocr("test/test_rasterized.pdf")
4444
```
45+
or use `searchable`.
4546

46-
Call
47-
```julia
48-
using SearchablePDFs
49-
SearchablePDFs.comonicon_install()
50-
```
51-
to install a CLI script powered by [Comonicon.jl](https://github.com/comonicon/Comonicon.jl) to `~/.julia/bin/searchable`. Add that folder to your PATH to be able to use `searchable` as an executable.
47+
TODO: CLI using `@main`.

src/SearchablePDFs.jl

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@ using Random
77
using ProgressMeter
88
using Scratch
99
using CSV
10-
using Comonicon
1110

1211
using Poppler_jll
1312
using unpaper_jll
@@ -33,7 +32,7 @@ function require_extension(path, ext; exception=isinteractive())
3332
_ext = splitext(path)[2]
3433
_ext == ext ||
3534
argument_error("Expected $path to have file extension `$ext`; got `$(_ext)`";
36-
exception)
35+
exception)
3736
return nothing
3837
end
3938

@@ -53,7 +52,7 @@ end
5352
# more in charge of the cleanup, which can be good for debugging.
5453
function get_scratch_dir(pdf)
5554
return joinpath(@get_scratch!("pdf_tmps"),
56-
splitext(basename(pdf))[1] * "_" * string(randstring(10)))
55+
splitext(basename(pdf))[1] * "_" * string(randstring(10)))
5756
end
5857

5958
# https://discourse.julialang.org/t/collecting-all-output-from-shell-commands/15592/7
@@ -127,11 +126,11 @@ end
127126
# I ran into "too many open files" errors otherwise
128127
# (which seems weird... maybe <https://github.com/JuliaLang/julia/issues/31126>? It was on MacOS)
129128
function unite_many_pdfs!(unite_progress_meter, all_logs, tmp, pdfs, output_path;
130-
max_files_per_unite=100)
129+
max_files_per_unite=100)
131130
isdir(tmp) || mkdir(tmp)
132131

133132
output_paths = map(enumerate(Iterators.partition(pdfs, max_files_per_unite))) do (i,
134-
current_pdfs)
133+
current_pdfs)
135134
current_output_path = joinpath(tmp, string("section_", i, ".pdf"))
136135
unite_logs = unite_pdfs(current_pdfs, current_output_path)
137136
put!(all_logs, (; page=missing, unite_logs...))
@@ -170,9 +169,9 @@ Keyword arguments:
170169
Set `ENV["JULIA_DEBUG"] = SearchablePDFs` to see (many) debug messages.
171170
"""
172171
function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_unpaper=false,
173-
ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
174-
cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
175-
verbose=true, force=false, max_files_per_unite=100)
172+
ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
173+
cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
174+
verbose=true, force=false, max_files_per_unite=100)
176175
isfile(pdf) || argument_error("Input file not found at `$pdf`"; exception=true)
177176
force || require_no_file(output_path; exception=true)
178177
require_extension(pdf, ".pdf"; exception=true)
@@ -184,7 +183,7 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
184183
pages = total_pages
185184
elseif pages > total_pages
186185
argument_error("`pages` must be less than the total number of pages ($(total_pages))";
187-
exception=true)
186+
exception=true)
188187
end
189188

190189
mkpath(tmp)
@@ -194,8 +193,8 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
194193

195194
@debug "Found file" pdf pages tmp
196195

197-
all_logs = Channel{@NamedTuple{page::Union{Int,UnitRange{Int},Missing},binary::String,
198-
stdout::String,stderr::String,code::Int}}(Inf)
196+
all_logs = Channel{@NamedTuple{page::Union{Int,UnitRange{Int},Missing}, binary::String,
197+
stdout::String, stderr::String, code::Int}}(Inf)
199198

200199
@debug "Generating images..."
201200
imag_prog = Progress(pages; desc="(1/3) Extracting images: ", enabled=verbose)
@@ -225,9 +224,9 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
225224
@debug "Finished processing pages. Uniting..."
226225
unite_dir = joinpath(tmp, "unite")
227226
unite_progress_meter = Progress(pages + cld(pages, max_files_per_unite) + 1;
228-
desc="(3/3) Collecting pages: ", enabled=verbose)
227+
desc="(3/3) Collecting pages: ", enabled=verbose)
229228
unite_many_pdfs!(unite_progress_meter, all_logs, unite_dir, pdfs, output_path;
230-
max_files_per_unite)
229+
max_files_per_unite)
231230
@debug "Done uniting pdfs"
232231
if cleanup_after
233232
@debug "Cleaning up"
@@ -248,13 +247,13 @@ end
248247
"""
249248
Create a searchable version of a PDF.
250249
"""
251-
@main function searchable(input_pdf::String,
252-
output_path::String=string(splitext(input_pdf)[1], "_OCR",
253-
".pdf"); apply_unpaper::Bool=false,
254-
ntasks::Int=Sys.CPU_THREADS - 1, tesseract_nthreads::Int=1,
255-
keep_intermediates::Bool=false,
256-
tmp::String=get_scratch_dir(input_pdf), quiet::Bool=false,
257-
logfile::Union{Nothing,String}=nothing, force::Bool=false)
250+
function searchable(input_pdf::String,
251+
output_path::String=string(splitext(input_pdf)[1], "_OCR",
252+
".pdf"); apply_unpaper::Bool=false,
253+
ntasks::Int=Sys.CPU_THREADS - 1, tesseract_nthreads::Int=1,
254+
keep_intermediates::Bool=false,
255+
tmp::String=get_scratch_dir(input_pdf), quiet::Bool=false,
256+
logfile::Union{Nothing,String}=nothing, force::Bool=false)
258257
# some of these are redundant with checks inside `ocr`; that's because we want to do them before the "Starting to ocr" message,
259258
# and we want them to exit if they fail in a non-interactive context, instead of printing a stacktrace.
260259
isfile(input_pdf) || argument_error("Input file not found at `$(input_pdf)`")
@@ -269,8 +268,8 @@ Create a searchable version of a PDF.
269268
verbose &&
270269
println("Starting to ocr `$(input_pdf)`; result will be located at `$(output_path)`.")
271270
result = ocr(input_pdf, output_path; apply_unpaper, ntasks, tesseract_nthreads,
272-
cleanup_after=!keep_intermediates, cleanup_at_exit=!keep_intermediates,
273-
tmp, verbose)
271+
cleanup_after=!keep_intermediates, cleanup_at_exit=!keep_intermediates,
272+
tmp, verbose)
274273
verbose && println("\nOutput is located at `$(output_path)`.")
275274
if keep_intermediates && verbose
276275
println("Intermediate files located at `$tmp`.")

0 commit comments

Comments
 (0)