Skip to content

Commit 97a613b

Browse files
committed
scan multipage pdf
1 parent f61dfcd commit 97a613b

40 files changed

+28224
-198
lines changed

DESCRIPTION

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: cpp11tesseract
22
Type: Package
33
Title: Open Source OCR Engine
4-
Version: 5.3.3
4+
Version: 5.3.4
55
Authors@R: c(person("Jeroen", "Ooms",
66
role = c("aut"),
77
email = "[email protected]",
@@ -25,6 +25,11 @@ SystemRequirements:
2525
deb: tesseract-ocr libtesseract-dev libleptonica-dev,
2626
rpm: tesseract-devel leptonica-devel,
2727
brew: tesseract
28+
),
29+
Poppler (
30+
deb: libpoppler-cpp-dev,
31+
rpm: poppler-cpp-devel,
32+
brew: poppler
2833
)
2934
Imports:
3035
curl,
@@ -39,8 +44,7 @@ Suggests:
3944
knitr,
4045
tibble,
4146
rmarkdown,
42-
testthat (>= 3.0.0),
43-
pdftools
47+
testthat (>= 3.0.0)
4448
Encoding: UTF-8
4549
VignetteBuilder: knitr
4650
Language: en-US

NAMESPACE

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
S3method(print,tesseract)
44
export(ocr)
55
export(ocr_data)
6-
export(pdf_to_png)
76
export(tesseract)
87
export(tesseract_contributed_download)
98
export(tesseract_download)

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# 5.3.4
2+
3+
- The function to download images from the web was removed to comply with Munk
4+
security policies. The images should be downloaded locally, and once verified,
5+
these should be uploaded to the server.
6+
17
# 5.3.3
28

39
- Tests that the installed Tesseract version is compatible with the C++ compiler

R/cpp11.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,15 @@ ocr_raw_data <- function(input, ptr) {
4747
# Thin R-side wrapper: forwards `file` and `ptr` unchanged to the registered
# native routine `_cpp11tesseract_ocr_file_data` via .Call() and returns
# whatever that routine returns.
# NOTE(review): R/cpp11.R is normally regenerated by cpp11::cpp_register();
# confirm before hand-editing, as manual comments may be overwritten.
ocr_file_data <- function(file, ptr) {
4848
.Call(`_cpp11tesseract_ocr_file_data`, file, ptr)
4949
}
50+
51+
# Thin R-side wrapper: forwards `file_path`, `opw` and `upw` unchanged to the
# registered native routine `_cpp11tesseract_n_pages` via .Call().
# Callers in this package use the result as a page count
# (e.g. `seq_len(n_pages(...))`).
# NOTE(review): R/cpp11.R is normally regenerated by cpp11::cpp_register();
# confirm before hand-editing, as manual comments may be overwritten.
n_pages <- function(file_path, opw, upw) {
52+
.Call(`_cpp11tesseract_n_pages`, file_path, opw, upw)
53+
}
54+
55+
# Thin R-side wrapper: calls the registered native routine
# `_cpp11tesseract_get_poppler_config` with no arguments via .Call().
# `pdf_convert()` reads the `render` and `format` elements of the result.
# NOTE(review): R/cpp11.R is normally regenerated by cpp11::cpp_register();
# confirm before hand-editing, as manual comments may be overwritten.
get_poppler_config <- function() {
56+
.Call(`_cpp11tesseract_get_poppler_config`)
57+
}
58+
59+
# Thin R-side wrapper: forwards all nine arguments, in order and unchanged, to
# the registered native routine `_cpp11tesseract_poppler_convert` via .Call().
# `pdf_convert()` passes one output file name per requested page in `names`.
# NOTE(review): R/cpp11.R is normally regenerated by cpp11::cpp_register();
# confirm before hand-editing, as manual comments may be overwritten.
poppler_convert <- function(file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing) {
60+
.Call(`_cpp11tesseract_poppler_convert`, file_path, format, pages, names, dpi, opw, upw, antialiasing, text_antialiasing)
61+
}

R/images.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Vectorised extension check: returns a logical vector that is TRUE for every
# element of `x` whose lower-cased text ends in ".pdf".
is.pdf <- function(x) {
  lowered <- tolower(x)
  grepl(pattern = "\\.pdf$", x = lowered)
}
4+
5+
# Vectorised extension check: returns a logical vector that is TRUE for every
# element of `x` whose lower-cased text ends in ".tiff".
# NOTE(review): the three-letter ".tif" extension is not matched here; confirm
# whether that is intentional.
is.tiff <- function(x) {
  lowered <- tolower(x)
  grepl(pattern = "\\.tiff$", x = lowered)
}
8+
9+
# Render pages of a PDF to image files in the session temporary directory.
#
# Arguments:
#   pdf       path to the PDF file.
#   format    output image format; must be one of the formats reported by
#             get_poppler_config()$format.
#   pages     one-indexed numeric vector of pages to render; NULL (default)
#             renders every page reported by n_pages().
#   dpi       resolution forwarded to poppler_convert().
#   antialias TRUE enables both drawing and text antialiasing; "draw" or
#             "text" enables only that one; anything else disables both.
#   opw, upw  owner / user passwords forwarded to the native routines.
#
# Output files are named "<pdf basename without extension>-<page, 3 digits>.
# <format>" inside tempdir(). Returns the value of poppler_convert().
# NOTE(review): ocr() treats that value as the vector of written file paths;
# confirm against the C++ implementation.
pdf_convert <- function(pdf, format = "png", pages = NULL, dpi = 72,
                        antialias = TRUE, opw = "", upw = "") {
  config <- get_poppler_config()

  if (isFALSE(config$render) || length(config$format) == 0) {
    stop("Your version of libpoppler does not support rendering")
  }

  format <- match.arg(format, config$format)

  if (is.null(pages)) {
    # Count the pages of `pdf` itself. The previous code passed `file`, which
    # is not a variable in this scope and resolved to the base::file function.
    pages <- seq_len(n_pages(pdf, opw = opw, upw = upw))
  }

  if (!is.numeric(pages) || length(pages) == 0) {
    stop("Argument 'pages' must be a one-indexed vector of page numbers")
  }

  antialiasing <- isTRUE(antialias) || isTRUE(antialias == "draw")
  text_antialiasing <- isTRUE(antialias) || isTRUE(antialias == "text")

  # tempdir() normally exists already; recreate it quietly if it was removed.
  dout <- tempdir()
  dir.create(dout, showWarnings = FALSE, recursive = TRUE)

  # One output path per requested page, e.g. "<tempdir>/report-001.png".
  stem <- tools::file_path_sans_ext(basename(pdf))
  filenames <- file.path(dout, sprintf("%s-%03d.%s", stem, pages, format))

  poppler_convert(
    pdf, format, pages, filenames, dpi, opw, upw,
    antialiasing, text_antialiasing
  )
}
43+
44+
# Read a TIFF with magick and convert it to `format` (default "png").
# Stops if the optional magick package is not installed.
# Returns the object produced by magick::image_convert().
# NOTE(review): `dpi` is accepted but not used anywhere in the body.
tiff_convert <- function(tiff, format = "png", dpi = 72) {
  stopifnot(requireNamespace("magick", quietly = TRUE))
  img <- magick::image_read(tiff)
  magick::image_convert(img, format = format)
}

R/ocr.R

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,71 +3,86 @@
33
#' Extract text from an image. Requires that you have training data for the language you
44
#' are reading. Works best for images with high contrast, little noise and horizontal text.
55
#' See [tesseract wiki](https://github.com/tesseract-ocr/tessdoc) and
6-
#' our package vignette for image preprocessing tips.
6+
#' the package vignette for image preprocessing tips.
77
#'
88
#' The `ocr()` function returns plain text by default, or hOCR text if hOCR is set to `TRUE`.
99
#' The `ocr_data()` function returns a data frame with a confidence rate and bounding box for
1010
#' each word in the text.
1111
#'
1212
#' @export
13-
#' @return character vector of text extracted from the image
13+
#' @return character vector of text extracted from the file. If the file
14+
#' has a TIFF or PDF extension, it will be a vector of length equal to the
15+
#' number of pages.
1416
#' @family tesseract
15-
#' @param image file path, url, or raw vector to image (png, tiff, jpeg, etc)
17+
#' @param file file path or raw vector (png, tiff, jpeg, etc).
1618
#' @param engine a tesseract engine created with [tesseract()]. Alternatively a
1719
#' language string which will be passed to [tesseract()].
1820
#' @param HOCR if `TRUE` return results as HOCR xml instead of plain text
21+
#' @param opw owner password to open pdf (please pass it as an environment
22+
#' variable to avoid leaking sensitive information)
23+
#' @param upw user password to open pdf (please pass it as an environment
24+
#' variable to avoid leaking sensitive information)
1925
#' @rdname ocr
2026
#' @references [Tesseract: Improving Quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality)
2127
#' @examples
2228
#' # Simple example
2329
#' file <- system.file("examples", "testocr.png", package = "cpp11tesseract")
2430
#' text <- ocr(file)
2531
#' cat(text)
26-
ocr <- function(image, engine = tesseract("eng"), HOCR = FALSE) {
32+
ocr <- function(file, engine = tesseract("eng"), HOCR = FALSE,
                opw = "", upw = "") {
  if (is.character(engine)) {
    engine <- tesseract(engine)
  }
  stopifnot(inherits(engine, "externalptr"))

  # magick images: write each frame to a temporary PNG and OCR that file.
  if (inherits(file, "magick-image")) {
    return(vapply(file, function(x) {
      tmp <- tempfile(fileext = ".png")
      on.exit(unlink(tmp), add = TRUE)
      magick::image_write(x, tmp, format = "PNG", density = "300x300")
      ocr(tmp, engine = engine, HOCR = HOCR)
    }, character(1)))
  }

  # Raw vectors are handed straight to the native routine.
  if (is.raw(file)) {
    return(ocr_raw(file, engine, HOCR = HOCR))
  }

  # URL input is no longer supported (the download helper was removed), so the
  # message only lists the input kinds that are actually handled.
  if (!is.character(file)) {
    stop("Argument 'file' must be a file path or raw vector")
  }

  # Several paths (or none): process each path on its own and concatenate the
  # results. The per-extension checks below are only meaningful for a single
  # path; isTRUE()/isFALSE() on a longer logical vector are always FALSE, which
  # previously made every multi-path call fall through to the error branch.
  if (length(file) != 1L) {
    texts <- lapply(file, ocr,
      engine = engine, HOCR = HOCR, opw = opw, upw = upw
    )
    return(as.character(unlist(texts, use.names = FALSE)))
  }

  if (is.pdf(file)) {
    # seq_len() instead of 1:n so a zero page count never yields c(1, 0).
    n <- n_pages(file, opw = opw, upw = upw)
    pngs <- pdf_convert(file,
      format = "png", pages = seq_len(n), opw = opw, upw = upw
    )
    # Remove the rendered pages even when OCR of one of them fails.
    # NOTE(review): assumes pdf_convert() returns the written file paths, as
    # the previous code did when it called unlink() on the result.
    on.exit(unlink(pngs), add = TRUE)
    return(vapply(
      pngs,
      function(png) ocr(png, engine = engine, HOCR = HOCR),
      character(1),
      USE.NAMES = FALSE
    ))
  }

  if (is.tiff(file)) {
    # tiff_convert() yields a magick image, handled by the first branch.
    return(ocr(tiff_convert(file), engine = engine, HOCR = HOCR))
  }

  vapply(file, ocr_file, character(1),
    ptr = engine, HOCR = HOCR, USE.NAMES = FALSE
  )
}
4763

4864
#' @rdname ocr
4965
#' @export
50-
ocr_data <- function(image, engine = tesseract("eng")) {
66+
ocr_data <- function(file, engine = tesseract("eng")) {
  if (is.character(engine)) {
    engine <- tesseract(engine)
  }
  stopifnot(inherits(engine, "externalptr"))

  # Build one data frame per input item, then row-bind them below.
  df_list <- if (inherits(file, "magick-image")) {
    # magick images: write each frame to a temporary PNG and process that file.
    lapply(file, function(x) {
      tmp <- tempfile(fileext = ".png")
      on.exit(unlink(tmp), add = TRUE)
      magick::image_write(x, tmp, format = "PNG", density = "300x300")
      ocr_data(tmp, engine = engine)
    })
  } else if (is.character(file)) {
    lapply(file, ocr_file_data, ptr = engine)
  } else if (is.raw(file)) {
    list(ocr_raw_data(file, engine))
  } else {
    # URL input is no longer supported (the download helper was removed), so
    # the message only lists the input kinds that are actually handled.
    stop("Argument 'file' must be a file path or raw vector")
  }

  df_as_tibble(do.call(rbind.data.frame, unname(df_list)))
}

R/tesseract.R

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -99,38 +99,6 @@ tesseract_engine <- function(datapath, language, configs, options) {
9999
tesseract_engine_internal(datapath, language, configs, opt_names, opt_values)
100100
}
101101

102-
#' Export a PDF file to PNG files
103-
#' @param path path to the PDF file
104-
#' @param dpi resolution in DPI
105-
#' @return a "magick-image" object
106-
#' @examples
107-
#' if (requireNamespace("magick", quietly = TRUE)) {
108-
#' file <- system.file("examples", "ocrscan.pdf", package = "cpp11tesseract")
109-
#' pdf_to_png(file)
110-
#' }
111-
#' @export
112-
pdf_to_png <- function(path, dpi = 600) {
113-
if (!requireNamespace("magick", quietly = TRUE)) {
114-
stop("magick package is required to read PDF files")
115-
} else {
116-
magick::image_read_pdf(path, density = dpi)
117-
}
118-
}
119-
120-
download_files <- function(urls) {
121-
files <- vapply(urls, function(path) {
122-
if (grepl("^https?://", path)) {
123-
tmp <- tempfile(fileext = basename(path))
124-
curl::curl_download(path, tmp)
125-
path <- tmp
126-
}
127-
normalizePath(path, mustWork = TRUE)
128-
}, character(1))
129-
is_pdf <- grepl(".pdf$", files)
130-
out <- unlist(lapply(files[is_pdf], pdf_to_png))
131-
c(files[!is_pdf], out)
132-
}
133-
134102
#' @export
135103
#' @noRd
136104
"print.tesseract" <- function(x, ...) {

configure

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,45 @@
77
# R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib'
88

99
# Library settings
10-
PKG_CONFIG_NAME="tesseract"
11-
PKG_DEB_NAME="tesseract-ocr libtesseract-dev libleptonica-dev"
12-
PKG_RPM_NAME="tesseract-devel leptonica-devel"
13-
PKG_BREW_NAME="tesseract"
14-
PKG_TEST_HEADER="<baseapi.h>"
15-
PKG_CFLAGS="-I/usr/include/tesseract -I/usr/include/leptonica"
16-
PKG_LIBS="-ltesseract"
10+
# pkg-config module names of the two required system libraries
PKG_CONFIG_NAME_TESSERACT="tesseract"
PKG_CONFIG_NAME_POPPLER="poppler-cpp"
# Package names printed in the installation hint when configuration fails.
# The Homebrew formula that ships the poppler-cpp library is "poppler";
# "poppler-data" only contains encoding data.
PKG_DEB_NAME="tesseract-ocr libtesseract-dev libleptonica-dev libpoppler-cpp-dev"
PKG_RPM_NAME="tesseract-devel leptonica-devel poppler-cpp-devel"
PKG_BREW_NAME="tesseract poppler"
PKG_TEST_HEADER_TESSERACT="<tesseract/baseapi.h>"
PKG_TEST_HEADER_POPPLER="<poppler-document.h>"
# Default flags; later branches prepend to or replace these.
# The poppler-cpp headers (poppler-document.h, poppler-version.h) are installed
# under <prefix>/include/poppler/cpp, so that directory must be on the path.
PKG_CFLAGS="-I/usr/include/tesseract -I/usr/include/leptonica -I/usr/include/poppler/cpp -I/usr/include/poppler"
PKG_LIBS="-ltesseract -lpoppler-cpp"
1719

1820
# Use pkg-config if available
1921
pkg-config --version >/dev/null 2>&1
2022
if [ $? -eq 0 ]; then
21-
PKGCONFIG_CFLAGS=`pkg-config --cflags --silence-errors ${PKG_CONFIG_NAME}`
22-
PKGCONFIG_LIBS=`pkg-config --libs ${PKG_CONFIG_NAME}`
23+
PKGCONFIG_CFLAGS_TESSERACT=`pkg-config --cflags ${PKG_CONFIG_NAME_TESSERACT}`
24+
PKGCONFIG_LIBS_TESSERACT=`pkg-config --libs ${PKG_CONFIG_NAME_TESSERACT}`
25+
PKGCONFIG_CFLAGS_POPPLER=`pkg-config --cflags ${PKG_CONFIG_NAME_POPPLER}`
26+
PKGCONFIG_LIBS_POPPLER=`pkg-config --libs ${PKG_CONFIG_NAME_POPPLER}`
2327
fi
28+
29+
# Debugging information
30+
echo "PKGCONFIG_CFLAGS_TESSERACT: $PKGCONFIG_CFLAGS_TESSERACT"
31+
echo "PKGCONFIG_LIBS_TESSERACT: $PKGCONFIG_LIBS_TESSERACT"
32+
echo "PKGCONFIG_CFLAGS_POPPLER: $PKGCONFIG_CFLAGS_POPPLER"
33+
echo "PKGCONFIG_LIBS_POPPLER: $PKGCONFIG_LIBS_POPPLER"
34+
2435
# Note that cflags may be empty in case of success
2536
if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then
2637
echo "Found INCLUDE_DIR and/or LIB_DIR!"
2738
PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS"
2839
PKG_LIBS="-L$LIB_DIR $PKG_LIBS"
29-
elif [ "$PKGCONFIG_CFLAGS" ] || [ "$PKGCONFIG_LIBS" ]; then
40+
elif [ "$PKGCONFIG_CFLAGS_TESSERACT" ] || [ "$PKGCONFIG_LIBS_TESSERACT" ] || [ "$PKGCONFIG_CFLAGS_POPPLER" ] || [ "$PKGCONFIG_LIBS_POPPLER" ]; then
3041
echo "Found pkg-config cflags and libs!"
31-
PKG_CFLAGS=${PKGCONFIG_CFLAGS}
32-
PKG_LIBS=${PKGCONFIG_LIBS}
42+
PKG_CFLAGS="${PKGCONFIG_CFLAGS_TESSERACT} ${PKGCONFIG_CFLAGS_POPPLER}"
43+
PKG_LIBS="${PKGCONFIG_LIBS_TESSERACT} ${PKGCONFIG_LIBS_POPPLER}"
3344
elif [ `uname` = "Darwin" ]; then
3445
test ! "$CI" && brew --version 2>/dev/null
3546
if [ $? -eq 0 ]; then
3647
BREWDIR=`brew --prefix`
37-
PKG_CFLAGS="-I$BREWDIR/include/tesseract -I$BREWDIR/include/leptonica"
48+
PKG_CFLAGS="-I$BREWDIR/include/tesseract -I$BREWDIR/include/leptonica -I$BREWDIR/include/poppler"
3849
PKG_LIBS="-L$BREWDIR/lib $PKG_LIBS"
3950
else
4051
echo "Homebrew is not installed. Visit https://brew.sh/ for information on how to install it."
@@ -64,12 +75,12 @@ ${CXX11CPP} ${CPPFLAGS} ${PKG_CFLAGS} tools/test.cpp >/dev/null 2>configure.log
6475
# Customize the error
6576
if [ $? -ne 0 ]; then
6677
echo "--------------------------- [ANTICONF] --------------------------------"
67-
echo "Configuration failed to find '$PKG_CONFIG_NAME' system library. Try installing:"
78+
echo "Configuration failed to find system libraries. Try installing:"
6879
echo " * deb: $PKG_DEB_NAME (Debian, Ubuntu, etc)"
6980
echo " * rpm: $PKG_RPM_NAME (Fedora, CentOS, RHEL)"
7081
echo " * brew: $PKG_BREW_NAME (Mac OSX)"
71-
echo "If $PKG_CONFIG_NAME is already installed, check that 'pkg-config' is in your"
72-
echo "PATH and PKG_CONFIG_PATH contains a $PKG_CONFIG_NAME.pc file. If pkg-config"
82+
echo "If the libraries are already installed, check that 'pkg-config' is in your"
83+
echo "PATH and PKG_CONFIG_PATH contains the necessary .pc files. If pkg-config"
7384
echo "is unavailable you can set INCLUDE_DIR and LIB_DIR manually via:"
7485
echo "R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'"
7586
echo "-------------------------- [ERROR MESSAGE] ---------------------------"
@@ -78,7 +89,7 @@ if [ $? -ne 0 ]; then
7889
exit 1
7990
fi
8091

81-
# Create a temporary C++ file to test the compatibility
92+
# Create a temporary C++ file to test the compatibility with Tesseract
8293
cat <<EOF > conftest.cpp
8394
#include <tesseract/baseapi.h>
8495
int main() {
@@ -87,7 +98,7 @@ int main() {
8798
}
8899
EOF
89100

90-
# Compile the temporary C++ file to an object file
101+
# Test Tesseract
91102
# Tesseract enforces C++11
92103
if ! ${CXX11} -std=gnu++11 -c conftest.cpp -o conftest.o ${PKG_CFLAGS}
93104
then
@@ -99,6 +110,27 @@ else
99110
rm -rf conftest.cpp conftest.o
100111
fi
101112

113+
# Create a temporary C++ file to test the compatibility with Poppler
114+
cat <<EOF > conftest2.cpp
115+
#include <poppler-version.h>
116+
117+
int main() {
118+
poppler::version_string();
119+
return 0;
120+
}
121+
EOF
122+
123+
# Compile the temporary C++ file to an object file
124+
if ! ${CXX11} -std=gnu++11 -c conftest2.cpp -o conftest2.o ${PKG_CFLAGS}
125+
then
126+
echo "Poppler is not compatible with the C++ compiler used by R."
127+
rm -rf conftest2.cpp conftest2.o
128+
exit 1
129+
else
130+
echo "Poppler is compatible with the C++ compiler used by R."
131+
rm -rf conftest2.cpp conftest2.o
132+
fi
133+
102134
# Write to Makevars
103135
sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars
104136

dev/test-with-valgrind.r

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Exercise the PDF rendering + OCR path on a bundled example (run under
# valgrind). pdf_to_png() was removed from the package, so the PDF is passed
# to ocr(), which renders PDF input itself.
library(cpp11tesseract)
file <- system.file("examples", "bondargentina.pdf", package = "cpp11tesseract")
d <- ocr(file)

0 commit comments

Comments
 (0)