Skip to content

Commit 2a79399

Browse files
committed
focus on pdf documents
1 parent 3ada958 commit 2a79399

File tree

104 files changed

+118
-25285
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

104 files changed

+118
-25285
lines changed

NAMESPACE

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# Generated by roxygen2: do not edit by hand
22

3-
S3method(print,tesseract)
43
export(ocr)
5-
export(ocr_data)
64
export(tesseract)
75
export(tesseract_contributed_download)
86
export(tesseract_download)

R/cpp11.R

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -32,22 +32,10 @@ get_param_values <- function(api, params) {
3232
.Call(`_cpp11tesseract_get_param_values`, api, params)
3333
}
3434

35-
ocr_raw <- function(input, ptr, HOCR) {
36-
.Call(`_cpp11tesseract_ocr_raw`, input, ptr, HOCR)
37-
}
38-
3935
ocr_file <- function(file, ptr, HOCR) {
4036
.Call(`_cpp11tesseract_ocr_file`, file, ptr, HOCR)
4137
}
4238

43-
ocr_raw_data <- function(input, ptr) {
44-
.Call(`_cpp11tesseract_ocr_raw_data`, input, ptr)
45-
}
46-
47-
ocr_file_data <- function(file, ptr) {
48-
.Call(`_cpp11tesseract_ocr_file_data`, file, ptr)
49-
}
50-
5139
n_pages <- function(file_path, opw, upw) {
5240
.Call(`_cpp11tesseract_n_pages`, file_path, opw, upw)
5341
}

R/images.R

Lines changed: 0 additions & 47 deletions
This file was deleted.

R/ocr.R

Lines changed: 48 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66
#' the package vignette for image preprocessing tips.
77
#'
88
#' The `ocr()` function returns plain text by default, or hOCR text if `HOCR` is set to `TRUE`.
9-
#' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for
10-
#' each word in the text.
119
#'
1210
#' @export
1311
#' @return character vector of text extracted from the file. If the file
1412
#' has a TIFF or PDF extension, it will be a vector of length equal to the
1513
#' number of pages.
1614
#' @family tesseract
1715
#' @param file file path or raw vector (png, tiff, jpeg, etc).
16+
#' @param pages a numeric vector of pages to extract text from. If `NULL` all
17+
#' pages will be extracted.
1818
#' @param engine a tesseract engine created with [tesseract()]. Alternatively a
1919
#' language string which will be passed to [tesseract()].
2020
#' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
@@ -26,63 +26,57 @@
2626
#' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
2727
#' @examples
2828
#' # Simple example
29-
#' file <- system.file("examples", "testocr.png", package = "cpp11tesseract")
29+
#' file <- system.file("examples", "oscarwilde.pdf", package = "cpp11tesseract")
3030
#' text <- ocr(file)
3131
#' cat(text)
32-
ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") {
33-
if (is.character(engine)) {
34-
engine <- tesseract(engine)
35-
}
32+
ocr <- function(file, pages = NULL, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") {
33+
if (is.character(engine)) { engine <- tesseract(engine) }
34+
if (is.numeric(pages)) { pages <- as.integer(pages) }
3635
stopifnot(inherits(engine, "externalptr"))
37-
if (isTRUE(inherits(file, "magick-image"))) {
38-
vapply(file, function(x) {
39-
tmp <- tempfile(fileext = ".png")
40-
on.exit(unlink(tmp))
41-
magick::image_write(x, tmp, format = "PNG", density = "300x300")
42-
ocr(tmp, engine = engine, HOCR = HOCR)
43-
}, character(1))
44-
} else if (isTRUE(is.character(file)) && isFALSE(is.pdf(file))) {
45-
if (isFALSE(is.tiff(file))) {
46-
vapply(file, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE)
47-
} else {
48-
ocr(tiff_convert(file), engine, HOCR = HOCR)
49-
}
50-
} else if (isTRUE(is.raw(file))) {
51-
ocr_raw(file, engine, HOCR = HOCR)
52-
} else if (isTRUE(is.pdf(file))) {
53-
n <- n_pages(file, opw = opw, upw = upw)
54-
fout <- pdf_convert(file, format = "png", pages = 1:n, opw = opw, upw = upw)
55-
out <- vapply(fout, function(x) ocr(x, engine = engine, HOCR = HOCR), character(1))
56-
unlink(fout)
57-
names(out) <- NULL
58-
out
59-
} else {
60-
stop("Argument 'file' must be file-path, url or raw vector")
61-
}
36+
stopifnot(file.exists(file))
37+
stopifnot(is.pdf(file))
38+
39+
if (is.null(pages)) { pages <- seq_len(n_pages(file, opw = opw, upw = upw)) }
40+
fout <- pdf_convert(file, format = "png", pages = pages, opw = opw, upw = upw)
41+
out <- vapply(fout, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE)
42+
unlink(fout)
43+
out
6244
}
6345

64-
#' @rdname ocr
65-
#' @export
66-
ocr_data <- function(file, engine = tesseract("eng")) {
67-
if (is.character(engine)) {
68-
engine <- tesseract(engine)
46+
is.pdf <- function(x) {
47+
grepl("\\.pdf$", tolower(x))
48+
}
49+
50+
pdf_convert <- function(pdf, format = "png", pages = NULL, dpi = 72,
51+
antialias = TRUE, opw = "", upw = "") {
52+
config <- get_poppler_config()
53+
54+
if (isFALSE(config$render) || isFALSE(length(config$format) > 0)) {
55+
stop("Your version of libpoppler does not support rendering")
6956
}
70-
stopifnot(inherits(engine, "externalptr"))
71-
df_list <- if (inherits(file, "magick-image")) {
72-
lapply(file, function(x) {
73-
tmp <- tempfile(fileext = ".png")
74-
on.exit(unlink(tmp))
75-
magick::image_write(x, tmp, format = "PNG", density = "300x300")
76-
ocr_data(tmp, engine = engine)
77-
})
78-
} else if (is.character(file)) {
79-
lapply(file, function(im) {
80-
ocr_file_data(im, ptr = engine)
81-
})
82-
} else if (is.raw(file)) {
83-
list(ocr_raw_data(file, engine))
84-
} else {
85-
stop("Argument 'file' must be file-path, url or raw vector")
57+
58+
format <- match.arg(format, config$format)
59+
60+
if (is.null(pages)) {
61+
pages <- seq_len(n_pages(pdf, opw = opw, upw = upw))
8662
}
87-
df_as_tibble(do.call(rbind.data.frame, unname(df_list)))
63+
64+
if (isFALSE(is.numeric(pages)) || isFALSE(length(pages) > 0)) {
65+
stop("Argument 'pages' must be a one-indexed vector of page numbers")
66+
}
67+
68+
antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw")
69+
70+
text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text")
71+
72+
dout <- tempdir()
73+
suppressWarnings(try(dir.create(dout)))
74+
75+
filenames <- file.path(dout, sprintf(
76+
"%s-%03d.%s",
77+
tools::file_path_sans_ext(basename(pdf)),
78+
pages, format
79+
))
80+
81+
poppler_convert(pdf, format, pages, filenames, dpi, opw, upw, antialiasing, text_antialiasing)
8882
}

R/tessdata.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,14 +35,14 @@
3535
#'
3636
#' if (any("fra" %in% tesseract_info()$available)) {
3737
#' french <- tesseract("fra")
38-
#' file <- system.file("examples", "french.png", package = "cpp11tesseract")
38+
#' file <- system.file("examples", "french.pdf", package = "cpp11tesseract")
3939
#' text <- ocr(file, engine = french)
4040
#' cat(text)
4141
#' }
4242
tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"), progress = interactive()) {
4343
stopifnot(is.character(lang))
4444
model <- match.arg(model)
45-
if (!length(datapath)) {
45+
if (length(datapath) == 0) {
4646
warn_on_linux()
4747
datapath <- tesseract_info()$datapath
4848
}
@@ -85,7 +85,7 @@ tesseract_download <- function(lang, datapath = NULL, model = c("fast", "best"),
8585
#'
8686
#' if (any("grc_hist" %in% tesseract_info()$available)) {
8787
#' greek <- tesseract("grc_hist")
88-
#' file <- system.file("examples", "polytonicgreek.png", package = "cpp11tesseract")
88+
#' file <- system.file("examples", "polytonicgreek.pdf", package = "cpp11tesseract")
8989
#' text <- ocr(file, engine = greek)
9090
#' cat(text)
9191
#' }

R/tesseract.R

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#' Tesseract Engine
22
#'
33
#' Create an OCR engine for a given language and control parameters. This can be used by
4-
#' the [ocr] and [ocr_data] functions to recognize text.
4+
#' the [ocr] function to recognize text.
55
#'
66
#' Tesseract control parameters can be set either via a named list in the
77
#' `options` parameter, or in a `config` file text file which contains the parameter name
@@ -99,16 +99,6 @@ tesseract_engine <- function(datapath, language, configs, options) {
9999
tesseract_engine_internal(datapath, language, configs, opt_names, opt_values)
100100
}
101101

102-
#' @export
103-
#' @noRd
104-
"print.tesseract" <- function(x, ...) {
105-
info <- engine_info_internal(x)
106-
cat("<tesseract engine>\n")
107-
cat(" loaded:", info$loaded, "\n")
108-
cat(" datapath:", info$datapath, "\n")
109-
cat(" available:", info$available, "\n")
110-
}
111-
112102
bail <- function(...) {
113103
stop(sprintf(...), call. = FALSE)
114104
}

docs/404.html

Lines changed: 0 additions & 87 deletions
This file was deleted.

0 commit comments

Comments
 (0)