|
6 | 6 | #' the package vignette for image preprocessing tips. |
7 | 7 | #' |
8 | 8 | #' The `ocr()` function returns plain text by default, or hOCR text if `HOCR` is set to `TRUE`.
9 | | -#' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for |
10 | | -#' each word in the text. |
11 | 9 | #' |
12 | 10 | #' @export |
13 | 11 | #' @return character vector of text extracted from the file. If the file |
14 | 12 | #' has a TIFF or PDF extension, it will be a vector of length equal to the
15 | 13 | #' number of pages. |
16 | 14 | #' @family tesseract |
17 | 15 | #' @param file file path or raw vector (png, tiff, jpeg, etc). |
| 16 | +#' @param pages a numeric vector of pages to extract text from. If `NULL` all |
| 17 | +#' pages will be extracted. |
18 | 18 | #' @param engine a tesseract engine created with [tesseract()]. Alternatively a |
19 | 19 | #' language string which will be passed to [tesseract()]. |
20 | 20 | #' @param HOCR if `TRUE` return results as HOCR xml instead of plain text |
|
26 | 26 | #' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality) |
27 | 27 | #' @examples |
28 | 28 | #' # Simple example |
29 | | -#' file <- system.file("examples", "testocr.png", package = "cpp11tesseract") |
| 29 | +#' file <- system.file("examples", "oscarwilde.pdf", package = "cpp11tesseract") |
30 | 30 | #' text <- ocr(file) |
31 | 31 | #' cat(text) |
32 | | -ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") { |
33 | | - if (is.character(engine)) { |
34 | | - engine <- tesseract(engine) |
35 | | - } |
| 32 | +ocr <- function(file, pages = NULL, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") { |
| 33 | + if (is.character(engine)) { engine <- tesseract(engine) } |
| 34 | + if (is.numeric(pages)) { pages <- as.integer(pages) } |
36 | 35 | stopifnot(inherits(engine, "externalptr")) |
37 | | - if (isTRUE(inherits(file, "magick-image"))) { |
38 | | - vapply(file, function(x) { |
39 | | - tmp <- tempfile(fileext = ".png") |
40 | | - on.exit(unlink(tmp)) |
41 | | - magick::image_write(x, tmp, format = "PNG", density = "300x300") |
42 | | - ocr(tmp, engine = engine, HOCR = HOCR) |
43 | | - }, character(1)) |
44 | | - } else if (isTRUE(is.character(file)) && isFALSE(is.pdf(file))) { |
45 | | - if (isFALSE(is.tiff(file))) { |
46 | | - vapply(file, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE) |
47 | | - } else { |
48 | | - ocr(tiff_convert(file), engine, HOCR = HOCR) |
49 | | - } |
50 | | - } else if (isTRUE(is.raw(file))) { |
51 | | - ocr_raw(file, engine, HOCR = HOCR) |
52 | | - } else if (isTRUE(is.pdf(file))) { |
53 | | - n <- n_pages(file, opw = opw, upw = upw) |
54 | | - fout <- pdf_convert(file, format = "png", pages = 1:n, opw = opw, upw = upw) |
55 | | - out <- vapply(fout, function(x) ocr(x, engine = engine, HOCR = HOCR), character(1)) |
56 | | - unlink(fout) |
57 | | - names(out) <- NULL |
58 | | - out |
59 | | - } else { |
60 | | - stop("Argument 'file' must be file-path, url or raw vector") |
61 | | - } |
| 36 | + stopifnot(file.exists(file)) |
| 37 | + stopifnot(is.pdf(file)) |
| 38 | + |
| 39 | + if (is.null(pages)) { pages <- seq_len(n_pages(file, opw = opw, upw = upw)) } |
| 40 | + fout <- pdf_convert(file, format = "png", pages = pages, opw = opw, upw = upw) |
| 41 | + out <- vapply(fout, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE) |
| 42 | + unlink(fout) |
| 43 | + out |
62 | 44 | } |
63 | 45 |
|
64 | | -#' @rdname ocr |
65 | | -#' @export |
66 | | -ocr_data <- function(file, engine = tesseract("eng")) { |
67 | | - if (is.character(engine)) { |
68 | | - engine <- tesseract(engine) |
| 46 | +is.pdf <- function(x) { |
| 47 | + grepl("\\.pdf$", tolower(x)) |
| 48 | +} |
| 49 | + |
| 50 | +pdf_convert <- function(pdf, format = "png", pages = NULL, dpi = 72, |
| 51 | + antialias = TRUE, opw = "", upw = "") { |
| 52 | + config <- get_poppler_config() |
| 53 | + |
| 54 | + if (isFALSE(config$render) || isFALSE(length(config$format) > 0)) { |
| 55 | + stop("You version of libppoppler does not support rendering") |
69 | 56 | } |
70 | | - stopifnot(inherits(engine, "externalptr")) |
71 | | - df_list <- if (inherits(file, "magick-image")) { |
72 | | - lapply(file, function(x) { |
73 | | - tmp <- tempfile(fileext = ".png") |
74 | | - on.exit(unlink(tmp)) |
75 | | - magick::image_write(x, tmp, format = "PNG", density = "300x300") |
76 | | - ocr_data(tmp, engine = engine) |
77 | | - }) |
78 | | - } else if (is.character(file)) { |
79 | | - lapply(file, function(im) { |
80 | | - ocr_file_data(im, ptr = engine) |
81 | | - }) |
82 | | - } else if (is.raw(file)) { |
83 | | - list(ocr_raw_data(file, engine)) |
84 | | - } else { |
85 | | - stop("Argument 'file' must be file-path, url or raw vector") |
| 57 | + |
| 58 | + format <- match.arg(format, config$format) |
| 59 | + |
| 60 | + if (is.null(pages)) { |
| 61 | + pages <- seq_len(n_pages(file, opw = opw, upw = upw)) |
86 | 62 | } |
87 | | - df_as_tibble(do.call(rbind.data.frame, unname(df_list))) |
| 63 | + |
| 64 | + if (isFALSE(is.numeric(pages)) || isFALSE(length(pages) > 0)) { |
| 65 | + stop("Argument 'pages' must be a one-indexed vector of page numbers") |
| 66 | + } |
| 67 | + |
| 68 | + antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw") |
| 69 | + |
| 70 | + text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text") |
| 71 | + |
| 72 | + dout <- tempdir() |
| 73 | + suppressWarnings(try(dir.create(dout))) |
| 74 | + |
| 75 | + filenames <- file.path(dout, sprintf( |
| 76 | + "%s-%03d.%s", |
| 77 | + tools::file_path_sans_ext(basename(pdf)), |
| 78 | + pages, format |
| 79 | + )) |
| 80 | + |
| 81 | + poppler_convert(pdf, format, pages, filenames, dpi, opw, upw, antialiasing, text_antialiasing) |
88 | 82 | } |
0 commit comments