fixes & format

ericphanson · ericphanson · commit 9bdab468259d · 2024-12-16T15:30:00.000+01:00
diff --git a/Comonicon.toml b/Comonicon.toml
diff --git a/Project.toml b/Project.toml
@@ -5,7 +5,6 @@ version = "0.1.0"
 
 [deps]
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
-Comonicon = "863f3e99-da2a-4334-8734-de3dacbe5542"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Poppler_jll = "9c32591e-4766-534b-9725-b71a8799265b"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -16,8 +15,7 @@ unpaper_jll = "d52248c9-e08a-51c2-9066-05d0bf3e6245"
 
 [compat]
 Aqua = "0.5"
-CSV = "0.8, 0.9"
-Comonicon = "0.11"
+CSV = "0.10"
 Poppler_jll = "21.9"
 ProgressMeter = "1.5"
 Scratch = "1"
diff --git a/README.md b/README.md
@@ -42,10 +42,6 @@ Next steps:
 using SearchablePDFs
 file = ocr("test/test_rasterized.pdf")
 ```
+Alternatively, use `searchable`.
 
-Call
-```julia
-using SearchablePDFs
-SearchablePDFs.comonicon_install()
-```
-to install a CLI script powered by [Comonicon.jl](https://github.com/comonicon/Comonicon.jl) to `~/.julia/bin/searchable`. Add that folder to your PATH to be able to use `searchable` as an executable.
+TODO: reintroduce a command-line interface using `@main`.
diff --git a/src/SearchablePDFs.jl b/src/SearchablePDFs.jl
@@ -7,7 +7,6 @@ using Random
 using ProgressMeter
 using Scratch
 using CSV
-using Comonicon
 
 using Poppler_jll
 using unpaper_jll
@@ -33,7 +32,7 @@ function require_extension(path, ext; exception=isinteractive())
     _ext = splitext(path)[2]
     _ext == ext ||
         argument_error("Expected $path to have file extension `$ext`; got `$(_ext)`";
-                       exception)
+            exception)
     return nothing
 end
 
@@ -53,7 +52,7 @@ end
 # more in charge of the cleanup, which can be good for debugging.
 function get_scratch_dir(pdf)
     return joinpath(@get_scratch!("pdf_tmps"),
-                    splitext(basename(pdf))[1] * "_" * string(randstring(10)))
+        splitext(basename(pdf))[1] * "_" * string(randstring(10)))
 end
 
 # https://discourse.julialang.org/t/collecting-all-output-from-shell-commands/15592/7
@@ -127,11 +126,11 @@ end
 # I ran into "too many open files" errors otherwise
 # (which seems weird... maybe <https://github.com/JuliaLang/julia/issues/31126>? It was on MacOS)
 function unite_many_pdfs!(unite_progress_meter, all_logs, tmp, pdfs, output_path;
-                          max_files_per_unite=100)
+    max_files_per_unite=100)
     isdir(tmp) || mkdir(tmp)
 
     output_paths = map(enumerate(Iterators.partition(pdfs, max_files_per_unite))) do (i,
-                                                                                      current_pdfs)
+        current_pdfs)
         current_output_path = joinpath(tmp, string("section_", i, ".pdf"))
         unite_logs = unite_pdfs(current_pdfs, current_output_path)
         put!(all_logs, (; page=missing, unite_logs...))
@@ -170,9 +169,9 @@ Keyword arguments:
 Set `ENV["JULIA_DEBUG"] = SearchablePDFs` to see (many) debug messages.
 """
 function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_unpaper=false,
-             ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
-             cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
-             verbose=true, force=false, max_files_per_unite=100)
+    ntasks=Sys.CPU_THREADS - 1, tesseract_nthreads=1, pages=nothing,
+    cleanup_after=true, cleanup_at_exit=true, tmp=get_scratch_dir(pdf),
+    verbose=true, force=false, max_files_per_unite=100)
     isfile(pdf) || argument_error("Input file not found at `$pdf`"; exception=true)
     force || require_no_file(output_path; exception=true)
     require_extension(pdf, ".pdf"; exception=true)
@@ -184,7 +183,7 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
         pages = total_pages
     elseif pages > total_pages
         argument_error("`pages` must be less than the total number of pages ($(total_pages))";
-                       exception=true)
+            exception=true)
     end
 
     mkpath(tmp)
@@ -194,8 +193,8 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
 
     @debug "Found file" pdf pages tmp
 
-    all_logs = Channel{@NamedTuple{page::Union{Int,UnitRange{Int},Missing},binary::String,
-                                   stdout::String,stderr::String,code::Int}}(Inf)
+    all_logs = Channel{@NamedTuple{page::Union{Int,UnitRange{Int},Missing}, binary::String,
+        stdout::String, stderr::String, code::Int}}(Inf)
 
     @debug "Generating images..."
     imag_prog = Progress(pages; desc="(1/3) Extracting images: ", enabled=verbose)
@@ -225,9 +224,9 @@ function ocr(pdf, output_path=string(splitext(pdf)[1], "_OCR", ".pdf"); apply_un
     @debug "Finished processing pages. Uniting..."
     unite_dir = joinpath(tmp, "unite")
     unite_progress_meter = Progress(pages + cld(pages, max_files_per_unite) + 1;
-                                    desc="(3/3) Collecting pages: ", enabled=verbose)
+        desc="(3/3) Collecting pages: ", enabled=verbose)
     unite_many_pdfs!(unite_progress_meter, all_logs, unite_dir, pdfs, output_path;
-                     max_files_per_unite)
+        max_files_per_unite)
     @debug "Done uniting pdfs"
     if cleanup_after
         @debug "Cleaning up"
@@ -248,13 +247,13 @@ end
 """
 Create a searchable version of a PDF.
 """
-@main function searchable(input_pdf::String,
-                          output_path::String=string(splitext(input_pdf)[1], "_OCR",
-                                                     ".pdf"); apply_unpaper::Bool=false,
-                          ntasks::Int=Sys.CPU_THREADS - 1, tesseract_nthreads::Int=1,
-                          keep_intermediates::Bool=false,
-                          tmp::String=get_scratch_dir(input_pdf), quiet::Bool=false,
-                          logfile::Union{Nothing,String}=nothing, force::Bool=false)
+function searchable(input_pdf::String,
+    output_path::String=string(splitext(input_pdf)[1], "_OCR",
+        ".pdf"); apply_unpaper::Bool=false,
+    ntasks::Int=Sys.CPU_THREADS - 1, tesseract_nthreads::Int=1,
+    keep_intermediates::Bool=false,
+    tmp::String=get_scratch_dir(input_pdf), quiet::Bool=false,
+    logfile::Union{Nothing,String}=nothing, force::Bool=false)
     # some of these are redundant with checks inside `ocr`; that's because we want to do them before the "Starting to ocr" message,
     # and we want them to exit if they fail in a non-interactive context, instead of printing a stacktracee.
     isfile(input_pdf) || argument_error("Input file not found at `$(input_pdf)`")
@@ -269,8 +268,8 @@ Create a searchable version of a PDF.
     verbose &&
         println("Starting to ocr `$(input_pdf)`; result will be located at `$(output_path)`.")
     result = ocr(input_pdf, output_path; apply_unpaper, ntasks, tesseract_nthreads,
-                 cleanup_after=!keep_intermediates, cleanup_at_exit=!keep_intermediates,
-                 tmp, verbose)
+        cleanup_after=!keep_intermediates, cleanup_at_exit=!keep_intermediates,
+        tmp, verbose)
     verbose && println("\nOutput is located at `$(output_path)`.")
     if keep_intermediates && verbose
         println("Intermediate files located at `$tmp`.")