|
3 | 3 | #' Extract text from an image. Requires that you have training data for the language you |
4 | 4 | #' are reading. Works best for images with high contrast, little noise and horizontal text. |
5 | 5 | #' See [tesseract wiki](https://github.com/tesseract-ocr/tessdoc) and |
6 | | -#' our package vignette for image preprocessing tips. |
| 6 | +#' the package vignette for image preprocessing tips. |
7 | 7 | #' |
8 | 8 | #' The `ocr()` function returns plain text by default, or hOCR text if `HOCR` is set to `TRUE`. |
9 | 9 | #' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for |
10 | 10 | #' each word in the text. |
11 | 11 | #' |
12 | 12 | #' @export |
13 | | -#' @return character vector of text extracted from the image |
| 13 | +#' @return character vector of text extracted from the file. If the file |
| 14 | +#' has a TIFF or PDF extension, it will be a vector of length equal to the |
| 15 | +#' number of pages. |
14 | 16 | #' @family tesseract |
15 | | -#' @param image file path, url, or raw vector to image (png, tiff, jpeg, etc) |
| 17 | +#' @param file file path or raw vector (png, tiff, jpeg, etc). |
16 | 18 | #' @param engine a tesseract engine created with [tesseract()]. Alternatively a |
17 | 19 | #' language string which will be passed to [tesseract()]. |
18 | 20 | #' @param HOCR if `TRUE` return results as HOCR xml instead of plain text |
| 21 | +#' @param opw owner password to open pdf (please pass it as an environment |
| 22 | +#' variable to avoid leaking sensitive information) |
| 23 | +#' @param upw user password to open pdf (please pass it as an environment |
| 24 | +#' variable to avoid leaking sensitive information) |
19 | 25 | #' @rdname ocr |
20 | 26 | #' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality) |
21 | 27 | #' @examples |
22 | 28 | #' # Simple example |
23 | 29 | #' file <- system.file("examples", "testocr.png", package = "cpp11tesseract") |
24 | 30 | #' text <- ocr(file) |
25 | 31 | #' cat(text) |
26 | | -ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) { |
| 32 | +ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") { |
27 | 33 | if (is.character(engine)) { |
28 | 34 | engine <- tesseract(engine) |
29 | 35 | } |
30 | 36 | stopifnot(inherits(engine, "externalptr")) |
31 | | - if (inherits(image, "magick-image")) { |
32 | | - vapply(image, function(x) { |
| 37 | + if (isTRUE(inherits(file, "magick-image"))) { |
| 38 | + vapply(file, function(x) { |
33 | 39 | tmp <- tempfile(fileext = ".png") |
34 | 40 | on.exit(unlink(tmp)) |
35 | 41 | magick::image_write(x, tmp, format = "PNG", density = "300x300") |
36 | 42 | ocr(tmp, engine = engine, HOCR = HOCR) |
37 | 43 | }, character(1)) |
38 | | - } else if (is.character(image)) { |
39 | | - image <- download_files(image) |
40 | | - vapply(image, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE) |
41 | | - } else if (is.raw(image)) { |
42 | | - ocr_raw(image, engine, HOCR = HOCR) |
| 44 | + } else if (isTRUE(is.character(file)) && isFALSE(is.pdf(file))) { |
| 45 | + if (isFALSE(is.tiff(file))) { |
| 46 | + vapply(file, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE) |
| 47 | + } else { |
| 48 | + ocr(tiff_convert(file), engine, HOCR = HOCR) |
| 49 | + } |
| 50 | + } else if (isTRUE(is.raw(file))) { |
| 51 | + ocr_raw(file, engine, HOCR = HOCR) |
| 52 | + } else if (isTRUE(is.pdf(file))) { |
| 53 | + n <- n_pages(file, opw = opw, upw = upw) |
| 54 | + fout <- pdf_convert(file, format = "png", pages = 1:n, opw = opw, upw = upw) |
| 55 | + out <- vapply(fout, function(x) ocr(x, engine = engine, HOCR = HOCR), character(1)) |
| 56 | + unlink(fout) |
| 57 | + names(out) <- NULL |
| 58 | + out |
43 | 59 | } else { |
44 | | - stop("Argument 'image' must be file-path, url or raw vector") |
| 60 | + stop("Argument 'file' must be file-path, url or raw vector") |
45 | 61 | } |
46 | 62 | } |
47 | 63 |
|
48 | 64 | #' @rdname ocr |
49 | 65 | #' @export |
50 | | -ocr_data <- function(image, engine = tesseract("eng")) { |
| 66 | +ocr_data <- function(file, engine = tesseract("eng")) { |
51 | 67 | if (is.character(engine)) { |
52 | 68 | engine <- tesseract(engine) |
53 | 69 | } |
54 | 70 | stopifnot(inherits(engine, "externalptr")) |
55 | | - df_list <- if (inherits(image, "magick-image")) { |
56 | | - lapply(image, function(x) { |
| 71 | + df_list <- if (inherits(file, "magick-image")) { |
| 72 | + lapply(file, function(x) { |
57 | 73 | tmp <- tempfile(fileext = ".png") |
58 | 74 | on.exit(unlink(tmp)) |
59 | 75 | magick::image_write(x, tmp, format = "PNG", density = "300x300") |
60 | 76 | ocr_data(tmp, engine = engine) |
61 | 77 | }) |
62 | | - } else if (is.character(image)) { |
63 | | - image <- download_files(image) |
64 | | - lapply(image, function(im) { |
| 78 | + } else if (is.character(file)) { |
| 79 | + lapply(file, function(im) { |
65 | 80 | ocr_file_data(im, ptr = engine) |
66 | 81 | }) |
67 | | - } else if (is.raw(image)) { |
68 | | - list(ocr_raw_data(image, engine)) |
| 82 | + } else if (is.raw(file)) { |
| 83 | + list(ocr_raw_data(file, engine)) |
69 | 84 | } else { |
70 | | - stop("Argument 'image' must be file-path, url or raw vector") |
| 85 | + stop("Argument 'file' must be file-path, url or raw vector") |
71 | 86 | } |
72 | 87 | df_as_tibble(do.call(rbind.data.frame, unname(df_list))) |
73 | 88 | } |
0 commit comments