Skip to content

Commit c5e21f7

Browse files
committed
rm unpaper
1 parent ba8d2f0 commit c5e21f7

File tree

3 files changed

+12
-43
lines changed

3 files changed

+12
-43
lines changed

Project.toml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,26 @@ ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
1313
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1414
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
1515
Tesseract_jll = "efd95c89-babc-5260-8753-618084eaf9d7"
16-
unpaper_jll = "d52248c9-e08a-51c2-9066-05d0bf3e6245"
1716

1817
[compat]
1918
Aqua = "0.8"
2019
CSV = "0.10"
2120
DocOpt = "0.5.0"
2221
OutputCollectors = "0.1.1"
22+
Pkg = "1"
2323
Poppler_jll = "21.9"
2424
ProgressMeter = "1.5"
25+
Random = "1"
2526
Scratch = "1"
2627
Tesseract_jll = "4.1.100"
27-
julia = "1.10"
28-
unpaper_jll = "6.1.100"
2928
Test = "1"
30-
Random = "1"
31-
Pkg = "1"
29+
julia = "1.10"
30+
31+
[apps.searchable-pdf]
3232

3333
[extras]
3434
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
3535
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
3636

3737
[targets]
3838
test = ["Aqua", "Test"]
39-
40-
[apps.searchable-pdf]

src/SearchablePDFs.jl

Lines changed: 5 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@ using CSV
1111
using DocOpt
1212

1313
using Poppler_jll
14-
using unpaper_jll
1514
using Tesseract_jll
1615

1716
export ocr
@@ -117,14 +116,6 @@ function get_images(pdf, page_range::UnitRange{Int}, tmp, total_pages; exit_on_e
117116
return paths, (; binary="pdftoppm", logs...)
118117
end
119118

120-
# Clean up an image with unpaper
121-
function unpaper(img; exit_on_error)
122-
img_base, img_ext = splitext(img)
123-
img_unpaper = img_base * "_unpaper" * img_ext
124-
logs = run_and_collect_logs(`$(unpaper_jll.unpaper()) $img $img_unpaper`; exit_on_error)
125-
return (; img_unpaper, logs=(; binary="unpaper", logs...))
126-
end
127-
128119
#####
129120
##### Step 2: Use tesseract to generate a one-page searchable PDF from an image
130121
#####
@@ -178,7 +169,7 @@ end
178169
#####
179170

180171
"""
181-
ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_unpaper=false,
172+
ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf");
182173
ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
183174
cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
184175
verbose=true)
@@ -189,14 +180,13 @@ Keyword arguments:
189180
190181
* `ntasks`: how many parallel tasks to use for launching `tesseract` and `pdftoppm`.
191182
* `tesseract_nthreads`: how many threads to direct Tesseract to use
192-
* `apply_unpaper`: whether or not to apply `unpaper` to try to improve the image quality
193183
* `tmp`: a directory to store intermediate files. This directory is deleted at the end of the function if `cleanup_after` is set to `true`, and when the Julia session is ended if `cleanup_at_exit` is set to `true`.
194184
* `pages=nothing`: the number of pages of the PDF to process; the default of `nothing` indicates all pages in the PDF. It can help in debugging to set this to something small.
195185
* `verbose`: show a progress bar for each step of the process.
196186
197187
Set `ENV["JULIA_DEBUG"] = SearchablePDFs` to see (many) debug messages.
198188
"""
199-
function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_unpaper=false,
189+
function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf");
200190
ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
201191
cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
202192
verbose=true, force=false, max_files_per_unite=100,
@@ -240,10 +230,6 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
240230
ocr_prog = Progress(pages; desc="(2/3) OCRing: ", enabled=verbose)
241231
pdfs = asyncmap(enumerate(img_paths); ntasks) do (page, img)
242232
@debug "img" page img
243-
if apply_unpaper
244-
img, unpaper_logs = unpaper(img; exit_on_error)
245-
put!(all_logs, (; page, unpaper_logs...))
246-
end
247233
pdf, tesseract_logs = make_pdf(img; tesseract_nthreads, exit_on_error)
248234
put!(all_logs, (; page, tesseract_logs...))
249235
next!(ocr_prog)
@@ -272,19 +258,16 @@ end
272258
##### CLI interface
273259
#####
274260

275-
CAN_USE_UNPAPER::Bool = unpaper_jll.is_available() && Sys.ARCH != :aarch64
276-
277261
doc::String = """Searchable PDFs (OCR).
278262
279263
Usage:
280-
searchable-pdf <input_pdf> [<output_path>] [--apply_unpaper] [--keep_intermediates] [--quiet] [--force] [--logfile=<logfile>] [--tmp=<tmp>] [-n=<ntasks>] [-t=<tesseract_nthreads>]
264+
searchable-pdf <input_pdf> [<output_path>] [--keep_intermediates] [--quiet] [--force] [--logfile=<logfile>] [--tmp=<tmp>] [-n=<ntasks>] [-t=<tesseract_nthreads>]
281265
searchable-pdf -h | --help
282266
searchable-pdf --version
283267
284268
Options:
285269
-h --help Show this screen.
286270
--version Show version.
287-
--apply_unpaper todo
288271
--keep_intermediates xyz
289272
--quiet todo
290273
--force todo
@@ -310,7 +293,6 @@ function main(args=ARGS)
310293

311294
tesseract_nthreads = parse(Int, parsed["--tesseract_nthreads"])
312295
result = _main(input_pdf, output_path;
313-
apply_unpaper=parsed["--apply_unpaper"],
314296
ntasks,
315297
tesseract_nthreads,
316298
keep_intermediates=parsed["--keep_intermediates"],
@@ -334,19 +316,12 @@ end
334316

335317
function _main(input_pdf::String,
336318
output_path::String=string(splitext(input_pdf)[1], "_OCR",
337-
".pdf"); apply_unpaper::Bool=false,
319+
".pdf");
338320
ntasks::Int=Sys.CPU_THREADS - 1, tesseract_nthreads::Int=1,
339321
keep_intermediates::Bool=false,
340322
tmp::String=get_scratch_dir(input_pdf), quiet::Bool=false,
341323
logfile::Union{Nothing,String}=nothing, force::Bool=false,
342324
exit_on_error=false)
343-
if apply_unpaper && !CAN_USE_UNPAPER
344-
if Sys.ARCH == :aarch64
345-
argument_error("Cannot use `unpaper` on `aarch64` systems"; exit_on_error)
346-
else
347-
argument_error("`unpaper` is not available on this system"; exit_on_error)
348-
end
349-
end
350325
# some of these are redundant with checks inside `ocr`; that's because we want to do them before the "Starting to ocr" message,
351326
# and we want them to exit if they fail in a non-interactive context, instead of printing a stacktrace.
352327
isfile(input_pdf) ||
@@ -361,7 +336,7 @@ function _main(input_pdf::String,
361336
verbose = !quiet
362337
verbose &&
363338
println("Starting to ocr `$(input_pdf)`; result will be located at `$(output_path)`.")
364-
result = ocr(input_pdf, output_path; apply_unpaper, ntasks, tesseract_nthreads,
339+
result = ocr(input_pdf, output_path; ntasks, tesseract_nthreads,
365340
cleanup_after=!keep_intermediates, cleanup_at_exit=!keep_intermediates,
366341
tmp, verbose, force, exit_on_error)
367342
verbose && println("\nOutput is located at `$(output_path)`.")

test/runtests.jl

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,12 @@ using Aqua
77
TEST_PDF_PATH = joinpath(@__DIR__, "test.pdf")
88
TEST_PDF_RASTERIZED_PATH = joinpath(@__DIR__, "test_rasterized.pdf")
99

10-
unpaper_settings = SearchablePDFs.CAN_USE_UNPAPER ? (true, false) : (false,)
11-
1210
@testset "SearchablePDFs.jl" begin
1311
@test SearchablePDFs.num_pages(TEST_PDF_PATH; exit_on_error=false) == 3
1412

15-
@testset "verbose=$verbose apply_unpaper=$apply_unpaper f=$f opt=$opt" for verbose in
13+
@testset "verbose=$verbose f=$f opt=$opt" for verbose in
1614
(false,
1715
true),
18-
apply_unpaper in
19-
unpaper_settings,
2016
f in
2117
(_main,
2218
ocr),
@@ -28,7 +24,7 @@ unpaper_settings = SearchablePDFs.CAN_USE_UNPAPER ? (true, false) : (false,)
2824
keep_intermediates=opt) :
2925
(; verbose=verbose, max_files_per_unite=opt ? 2 : 100)
3026

31-
result = f(TEST_PDF_RASTERIZED_PATH, joinpath(@__DIR__, "out.pdf"); apply_unpaper,
27+
result = f(TEST_PDF_RASTERIZED_PATH, joinpath(@__DIR__, "out.pdf");
3228
kwargs...)
3329
# make sure we delete the generated files eventually, even if the tests throw
3430
atexit(() -> rm(result.output_path; force=true))

0 commit comments

Comments
 (0)