pachadotdev
diff --git a/‎DESCRIPTION‎
Lines changed: 7 additions & 3 deletions b/‎DESCRIPTION‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 0 additions & 1 deletion b/‎NAMESPACE‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎NEWS.md‎
Lines changed: 6 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎R/cpp11.R‎
Lines changed: 12 additions & 0 deletions b/‎R/cpp11.R‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎R/images.R‎
Lines changed: 47 additions & 0 deletions b/‎R/images.R‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎R/ocr.R‎
Lines changed: 36 additions & 21 deletions b/‎R/ocr.R‎
Lines changed: 36 additions & 21 deletions
diff --git a/‎R/tesseract.R‎
Lines changed: 0 additions & 32 deletions b/‎R/tesseract.R‎
Lines changed: 0 additions & 32 deletions
diff --git a/‎configure‎
Lines changed: 50 additions & 18 deletions b/‎configure‎
Lines changed: 50 additions & 18 deletions
diff --git a/‎dev/test-with-valgrind.r‎
Lines changed: 3 additions & 0 deletions b/‎dev/test-with-valgrind.r‎
Lines changed: 3 additions & 0 deletions
@@ -1,7 +1,7 @@
 Package: cpp11tesseract
 Type: Package
 Title: Open Source OCR Engine
-Version: 5.3.3
+Version: 5.3.4
 Authors@R: c(person("Jeroen", "Ooms",
                     role = c("aut"),
                     email = "[email protected]",
@@ -25,6 +25,11 @@ SystemRequirements:
         deb: tesseract-ocr libtesseract-dev libleptonica-dev,
         rpm: tesseract-devel leptonica-devel,
         brew: tesseract
+    ),
+    Poppler (
+        deb: libpoppler-cpp-dev,
+        rpm: poppler-cpp-devel,
+        brew: poppler
     )
 Imports:
     curl,
@@ -39,8 +44,7 @@ Suggests:
     knitr,
     tibble,
     rmarkdown,
-    testthat (>= 3.0.0),
-    pdftools
+    testthat (>= 3.0.0)
 Encoding: UTF-8
 VignetteBuilder: knitr
 Language: en-US
 
@@ -3,7 +3,6 @@
 S3method(print,tesseract)
 export(ocr)
 export(ocr_data)
-export(pdf_to_png)
 export(tesseract)
 export(tesseract_contributed_download)
 export(tesseract_download)
 
@@ -1,3 +1,9 @@
+# 5.3.4
+
+- The function to download images from the web was removed to comply with Munk
+  security policies. The images should be downloaded locally, and once verified,
+  these should be uploaded to the server.
+
 # 5.3.3
 
 - Tests that the installed Tesseract version is compatible with the C++ compiler
 
@@ -47,3 +47,15 @@ ocr_raw_data <- function(input, ptr) {
 ocr_file_data <- function(file, ptr) {
   .Call(`_cpp11tesseract_ocr_file_data`, file, ptr)
 }
+
+# R-level binding: forwards `file_path`, `opw` and `upw` unchanged to the
+# compiled routine `_cpp11tesseract_n_pages` via .Call() and returns its result.
+n_pages <- function(file_path, opw, upw) {
+  .Call(`_cpp11tesseract_n_pages`, file_path, opw, upw)
+}
+
+# R-level binding: calls the compiled routine
+# `_cpp11tesseract_get_poppler_config` via .Call() with no arguments and
+# returns its result.
+get_poppler_config <- function() {
+  .Call(`_cpp11tesseract_get_poppler_config`)
+}
+
+# R-level binding: forwards all nine arguments, in order and unchanged, to the
+# compiled routine `_cpp11tesseract_poppler_convert` via .Call() and returns
+# its result.
+poppler_convert <- function(file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing) {
+  .Call(`_cpp11tesseract_poppler_convert`, file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing)
+}
@@ -0,0 +1,47 @@
+# Vectorised check: TRUE for each element of `x` that ends in ".pdf",
+# compared case-insensitively.
+is.pdf <- function(x) {
+  lowered <- tolower(x)
+  grepl("\\.pdf$", lowered)
+}
+
+# Vectorised check: TRUE for each element of `x` that ends in ".tif" or
+# ".tiff", compared case-insensitively. Both spellings of the extension are
+# accepted so that ".tif" files also take the TIFF path.
+is.tiff <- function(x) {
+  grepl("\\.tiff?$", tolower(x))
+}
+
+#' Render pages of a PDF file to image files
+#'
+#' Internal helper. Builds one output path per requested page inside the
+#' session temporary directory, named
+#' `<pdf basename without extension>-<page, zero-padded to 3 digits>.<format>`,
+#' and hands the rendering to `poppler_convert()`.
+#'
+#' @param pdf path to the PDF file
+#' @param format output image format; must be one of the formats reported by
+#'  `get_poppler_config()`
+#' @param pages one-indexed vector of page numbers, or `NULL` for every page
+#' @param dpi resolution forwarded to `poppler_convert()`
+#' @param antialias `TRUE` to antialias everything, `"draw"` or `"text"` to
+#'  antialias only drawings or only text
+#' @param opw owner password to open the PDF
+#' @param upw user password to open the PDF
+#' @return the value returned by `poppler_convert()` (presumably the paths of
+#'  the written files, which is how `ocr()` uses it)
+#' @noRd
+pdf_convert <- function(pdf, format = "png", pages = NULL, dpi = 72,
+                        antialias = TRUE, opw = "", upw = "") {
+  config <- get_poppler_config()
+
+  if (isFALSE(config$render) || length(config$format) == 0L) {
+    stop("Your version of libpoppler does not support rendering")
+  }
+
+  format <- match.arg(format, config$format)
+
+  if (is.null(pages)) {
+    # Default to every page; the page count must be read from `pdf` itself
+    pages <- seq_len(n_pages(pdf, opw = opw, upw = upw))
+  }
+
+  if (!is.numeric(pages) || length(pages) == 0L) {
+    stop("Argument 'pages' must be a one-indexed vector of page numbers")
+  }
+
+  antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw")
+
+  text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text")
+
+  # The temporary directory normally exists already; only the "already
+  # exists" warning is silenced, genuine errors still surface
+  dout <- tempdir()
+  dir.create(dout, showWarnings = FALSE, recursive = TRUE)
+
+  # One output file per page, e.g. "report-001.png"
+  filenames <- file.path(dout, sprintf(
+    "%s-%03d.%s",
+    tools::file_path_sans_ext(basename(pdf)),
+    pages, format
+  ))
+
+  poppler_convert(pdf, format, pages, filenames, dpi, opw, upw, antialiasing, text_antialiasing)
+}
+
+# Read a TIFF file with magick and convert it to `format`; returns the value
+# of magick::image_convert().
+# Fails with an explicit message when the optional magick package is missing.
+# NOTE(review): `dpi` is accepted but not used by the body.
+tiff_convert <- function(tiff, format = "png", dpi = 72) {
+  if (!requireNamespace("magick", quietly = TRUE)) {
+    stop("magick package is required to read TIFF files")
+  }
+  magick::image_convert(magick::image_read(tiff), format = format)
+}
@@ -3,71 +3,86 @@
 #' Extract text from an image. Requires that you have training data for the language you
 #' are reading. Works best for images with high contrast, little noise and horizontal text.
 #' See [tesseract wiki](https://github.com/tesseract-ocr/tessdoc) and
-#' our package vignette for image preprocessing tips.
+#' the package vignette for image preprocessing tips.
 #'
 #' The `ocr()` function returns plain text by default, or hOCR text if hOCR is set to `TRUE`.
 #' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for
 #' each word in the text.
 #'
 #' @export
-#' @return character vector of text extracted from the image
+#' @return character vector of text extracted from the file. If the file
+#'  has a TIFF or PDF extension, it will be a vector of length equal to the
+#'  number of pages.
 #' @family tesseract
-#' @param image file path, url, or raw vector to image (png, tiff, jpeg, etc)
+#' @param file file path or raw vector (png, tiff, jpeg, etc).
 #' @param engine a tesseract engine created with [tesseract()]. Alternatively a
 #' language string which will be passed to [tesseract()].
 #' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
+#' @param opw owner password to open pdf (please pass it as an environment
+#'  variable to avoid leaking sensitive information)
+#' @param upw user password to open pdf (please pass it as an environment
+#'  variable to avoid leaking sensitive information)
 #' @rdname ocr
 #' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
 #' @examples
 #' # Simple example
 #' file <- system.file("examples", "testocr.png", package = "cpp11tesseract")
 #' text <- ocr(file)
 #' cat(text)
-ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
+ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE, opw = "", upw = "") {
   if (is.character(engine)) {
     engine <- tesseract(engine)
   }
   stopifnot(inherits(engine, "externalptr"))
-  if (inherits(image, "magick-image")) {
-    vapply(image, function(x) {
+  if (isTRUE(inherits(file, "magick-image"))) {
+    vapply(file, function(x) {
       tmp <- tempfile(fileext = ".png")
       on.exit(unlink(tmp))
       magick::image_write(x, tmp, format = "PNG", density = "300x300")
       ocr(tmp, engine = engine, HOCR = HOCR)
     }, character(1))
-  } else if (is.character(image)) {
-    image <- download_files(image)
-    vapply(image, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE)
-  } else if (is.raw(image)) {
-    ocr_raw(image, engine, HOCR = HOCR)
+  } else if (is.character(file) && length(file) != 1L) {
+    # Several (or zero) paths: handle each on its own so that PDF, TIFF and
+    # plain image files can be mixed; the results are concatenated in order
+    as.character(unlist(lapply(file, ocr, engine = engine, HOCR = HOCR, opw = opw, upw = upw)))
+  } else if (is.character(file) && is.tiff(file)) {
+    ocr(tiff_convert(file), engine, HOCR = HOCR)
+  } else if (is.character(file) && is.pdf(file)) {
+    # Render every page to a temporary PNG, OCR the pages one by one, and
+    # remove the PNGs even when OCR fails part-way
+    n <- n_pages(file, opw = opw, upw = upw)
+    fout <- pdf_convert(file, format = "png", pages = seq_len(n), opw = opw, upw = upw)
+    on.exit(unlink(fout), add = TRUE)
+    vapply(fout, function(x) ocr(x, engine = engine, HOCR = HOCR), character(1), USE.NAMES = FALSE)
+  } else if (is.character(file)) {
+    vapply(file, ocr_file, character(1), ptr = engine, HOCR = HOCR, USE.NAMES = FALSE)
+  } else if (is.raw(file)) {
+    ocr_raw(file, engine, HOCR = HOCR)
   } else {
-    stop("Argument 'image' must be file-path, url or raw vector")
+    stop("Argument 'file' must be file-path or raw vector")
   }
 }
 
 #' @rdname ocr
 #' @export
-ocr_data <- function(image, engine = tesseract("eng")) {
+ocr_data <- function(file, engine = tesseract("eng")) {
   if (is.character(engine)) {
     engine <- tesseract(engine)
   }
   stopifnot(inherits(engine, "externalptr"))
-  df_list <- if (inherits(image, "magick-image")) {
-    lapply(image, function(x) {
+  df_list <- if (inherits(file, "magick-image")) {
+    lapply(file, function(x) {
       tmp <- tempfile(fileext = ".png")
       on.exit(unlink(tmp))
       magick::image_write(x, tmp, format = "PNG", density = "300x300")
       ocr_data(tmp, engine = engine)
     })
-  } else if (is.character(image)) {
-    image <- download_files(image)
-    lapply(image, function(im) {
+  } else if (is.character(file)) {
+    # Each path goes straight to ocr_file_data(); URLs are no longer
+    # downloaded first
+    lapply(file, function(im) {
       ocr_file_data(im, ptr = engine)
     })
-  } else if (is.raw(image)) {
-    list(ocr_raw_data(image, engine))
+  } else if (is.raw(file)) {
+    list(ocr_raw_data(file, engine))
   } else {
-    stop("Argument 'image' must be file-path, url or raw vector")
+    stop("Argument 'file' must be file-path or raw vector")
   }
   df_as_tibble(do.call(rbind.data.frame, unname(df_list)))
 }
@@ -99,38 +99,6 @@ tesseract_engine <- function(datapath, language, configs, options) {
   tesseract_engine_internal(datapath, language, configs, opt_names, opt_values)
 }
 
-#' Export a PDF file to PNG files
-#' @param path path to the PDF file
-#' @param dpi resolution in DPI
-#' @return a "magick-image" object
-#' @examples
-#' if (requireNamespace("magick", quietly = TRUE)) {
-#'  file <- system.file("examples", "ocrscan.pdf", package = "cpp11tesseract")
-#'  pdf_to_png(file)
-#' }
-#' @export
-pdf_to_png <- function(path, dpi = 600) {
-  if (!requireNamespace("magick", quietly = TRUE)) {
-    stop("magick package is required to read PDF files")
-  } else {
-    magick::image_read_pdf(path, density = dpi)
-  }
-}
-
-download_files <- function(urls) {
-  files <- vapply(urls, function(path) {
-    if (grepl("^https?://", path)) {
-      tmp <- tempfile(fileext = basename(path))
-      curl::curl_download(path, tmp)
-      path <- tmp
-    }
-    normalizePath(path, mustWork = TRUE)
-  }, character(1))
-  is_pdf <- grepl(".pdf$", files)
-  out <- unlist(lapply(files[is_pdf], pdf_to_png))
-  c(files[!is_pdf], out)
-}
-
 #' @export
 #' @noRd
 "print.tesseract" <- function(x, ...) {
 
@@ -7,34 +7,45 @@
 # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib'
 
 # Library settings
-PKG_CONFIG_NAME="tesseract"
-PKG_DEB_NAME="tesseract-ocr libtesseract-dev libleptonica-dev"
-PKG_RPM_NAME="tesseract-devel leptonica-devel"
-PKG_BREW_NAME="tesseract"
-PKG_TEST_HEADER="<baseapi.h>"
-PKG_CFLAGS="-I/usr/include/tesseract -I/usr/include/leptonica"
-PKG_LIBS="-ltesseract"
+# pkg-config module names for the two required system libraries
+PKG_CONFIG_NAME_TESSERACT="tesseract"
+PKG_CONFIG_NAME_POPPLER="poppler-cpp"
+# Package names shown in the installation hint when configuration fails
+PKG_DEB_NAME="tesseract-ocr libtesseract-dev libleptonica-dev libpoppler-cpp-dev"
+PKG_RPM_NAME="tesseract-devel leptonica-devel poppler-cpp-devel"
+# On Homebrew the poppler-cpp library and headers come from the "poppler" formula
+PKG_BREW_NAME="tesseract poppler"
+PKG_TEST_HEADER_TESSERACT="<tesseract/baseapi.h>"
+PKG_TEST_HEADER_POPPLER="<poppler-document.h>"
+# Default flags, replaced below when pkg-config reports its own. The poppler-cpp
+# headers (poppler-document.h, poppler-version.h) live in include/poppler/cpp.
+PKG_CFLAGS="-I/usr/include/tesseract -I/usr/include/leptonica -I/usr/include/poppler/cpp -I/usr/include/poppler"
+PKG_LIBS="-ltesseract -lpoppler-cpp"
 
 # Use pkg-config if available
 pkg-config --version >/dev/null 2>&1
 if [ $? -eq 0 ]; then
-  PKGCONFIG_CFLAGS=`pkg-config --cflags --silence-errors ${PKG_CONFIG_NAME}`
-  PKGCONFIG_LIBS=`pkg-config --libs ${PKG_CONFIG_NAME}`
+  PKGCONFIG_CFLAGS_TESSERACT=`pkg-config --cflags ${PKG_CONFIG_NAME_TESSERACT}`
+  PKGCONFIG_LIBS_TESSERACT=`pkg-config --libs ${PKG_CONFIG_NAME_TESSERACT}`
+  PKGCONFIG_CFLAGS_POPPLER=`pkg-config --cflags ${PKG_CONFIG_NAME_POPPLER}`
+  PKGCONFIG_LIBS_POPPLER=`pkg-config --libs ${PKG_CONFIG_NAME_POPPLER}`
 fi
+
+# Debugging information
+echo "PKGCONFIG_CFLAGS_TESSERACT: $PKGCONFIG_CFLAGS_TESSERACT"
+echo "PKGCONFIG_LIBS_TESSERACT: $PKGCONFIG_LIBS_TESSERACT"
+echo "PKGCONFIG_CFLAGS_POPPLER: $PKGCONFIG_CFLAGS_POPPLER"
+echo "PKGCONFIG_LIBS_POPPLER: $PKGCONFIG_LIBS_POPPLER"
+
 # Note that cflags may be empty in case of success
 if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then
   echo "Found INCLUDE_DIR and/or LIB_DIR!"
   PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS"
   PKG_LIBS="-L$LIB_DIR $PKG_LIBS"
-elif [ "$PKGCONFIG_CFLAGS" ] || [ "$PKGCONFIG_LIBS" ]; then
+elif [ "$PKGCONFIG_CFLAGS_TESSERACT" ] || [ "$PKGCONFIG_LIBS_TESSERACT" ] || [ "$PKGCONFIG_CFLAGS_POPPLER" ] || [ "$PKGCONFIG_LIBS_POPPLER" ]; then
   echo "Found pkg-config cflags and libs!"
-  PKG_CFLAGS=${PKGCONFIG_CFLAGS}
-  PKG_LIBS=${PKGCONFIG_LIBS}
+  PKG_CFLAGS="${PKGCONFIG_CFLAGS_TESSERACT} ${PKGCONFIG_CFLAGS_POPPLER}"
+  PKG_LIBS="${PKGCONFIG_LIBS_TESSERACT} ${PKGCONFIG_LIBS_POPPLER}"
 elif [ `uname` = "Darwin" ]; then
   test ! "$CI" && brew --version 2>/dev/null
   if [ $? -eq 0 ]; then
     BREWDIR=`brew --prefix`
-    PKG_CFLAGS="-I$BREWDIR/include/tesseract -I$BREWDIR/include/leptonica"
+    PKG_CFLAGS="-I$BREWDIR/include/tesseract -I$BREWDIR/include/leptonica -I$BREWDIR/include/poppler"
     PKG_LIBS="-L$BREWDIR/lib $PKG_LIBS"
   else
     echo "Homebrew is not installed. Visit https://brew.sh/ for information on how to install it."
@@ -64,12 +75,12 @@ ${CXX11CPP} ${CPPFLAGS} ${PKG_CFLAGS} tools/test.cpp >/dev/null 2>configure.log
 # Customize the error
 if [ $? -ne 0 ]; then
   echo "--------------------------- [ANTICONF] --------------------------------"
-  echo "Configuration failed to find '$PKG_CONFIG_NAME' system library. Try installing:"
+  echo "Configuration failed to find system libraries. Try installing:"
   echo " * deb: $PKG_DEB_NAME (Debian, Ubuntu, etc)"
   echo " * rpm: $PKG_RPM_NAME (Fedora, CentOS, RHEL)"
   echo " * brew: $PKG_BREW_NAME (Mac OSX)"
-  echo "If $PKG_CONFIG_NAME is already installed, check that 'pkg-config' is in your"
-  echo "PATH and PKG_CONFIG_PATH contains a $PKG_CONFIG_NAME.pc file. If pkg-config"
+  echo "If the libraries are already installed, check that 'pkg-config' is in your"
+  echo "PATH and PKG_CONFIG_PATH contains the necessary .pc files. If pkg-config"
   echo "is unavailable you can set INCLUDE_DIR and LIB_DIR manually via:"
   echo "R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'"
   echo "-------------------------- [ERROR MESSAGE] ---------------------------"
@@ -78,7 +89,7 @@ if [ $? -ne 0 ]; then
   exit 1
 fi
 
-# Create a temporary C++ file to test the compatibility
+# Create a temporary C++ file to test the compatibility with Tesseract
 cat <<EOF > conftest.cpp
 #include <tesseract/baseapi.h>
 int main() {
@@ -87,7 +98,7 @@ int main() {
 }
 EOF
 
-# Compile the temporary C++ file to an object file
+# Test Tesseract
 # Tesseract enforces C++11
 if ! ${CXX11} -std=gnu++11 -c conftest.cpp -o conftest.o ${PKG_CFLAGS}
 then
@@ -99,6 +110,27 @@ else
     rm -rf conftest.cpp conftest.o
 fi
 
+# Create a temporary C++ file to test the compatibility with Poppler
+# (it includes <poppler-version.h> and calls poppler::version_string())
+cat <<EOF > conftest2.cpp
+#include <poppler-version.h>
+
+int main() {
+    poppler::version_string();
+    return 0;
+}
+EOF
+
+# Compile the temporary C++ file to an object file
+# (compile only, no link: -c with ${PKG_CFLAGS}). On failure the script
+# reports the problem and exits with status 1; the scratch files are removed
+# on both paths.
+if ! ${CXX11} -std=gnu++11 -c conftest2.cpp -o conftest2.o ${PKG_CFLAGS}
+then
+    echo "Poppler is not compatible with the C++ compiler used by R."
+    rm -rf conftest2.cpp conftest2.o
+    exit 1
+else
+    echo "Poppler is compatible with the C++ compiler used by R."
+    rm -rf conftest2.cpp conftest2.o
+fi
+
 # Write to Makevars
 sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars
 
 
@@ -0,0 +1,3 @@
+# Run OCR on an example PDF shipped with the package so the native PDF
+# rendering and OCR code paths are exercised (intended to be run under
+# valgrind). `pdf_to_png()` is no longer part of the package, so `ocr()` is
+# used as the entry point.
+library(cpp11tesseract)
+file <- system.file("examples", "bondargentina.pdf", package = "cpp11tesseract")
+d <- ocr(file)
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+library(cpp11tesseract)`
	`2`	`+file <- system.file("examples", "bondargentina.pdf", package = "cpp11tesseract")`
	`3`	`+d <- pdf_to_png(file)`