Skip to content

Commit 2246ebb

Browse files
Authored and committed by hornik:
Add url_db_from_package_PDF_files().
git-svn-id: https://svn.r-project.org/R/trunk@89128 00db46b3-68df-0310-9c12-caf00c1e9a41
1 parent 2776121 commit 2246ebb

File tree

1 file changed

+21
-4
lines changed

1 file changed

+21
-4
lines changed

src/library/tools/R/urltools.R

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -82,26 +82,29 @@ function(x, href = TRUE, ifdef = FALSE)
8282
.get_urls_from_HTML_file <-
8383
function(f)
8484
{
85-
doc <- xml2::read_html(f)
85+
doc <- tryCatch(xml2::read_html(f), error = identity)
8686
if(!inherits(doc, "xml_node")) return(character())
8787
nodes <- xml2::xml_find_all(doc, "//a")
8888
hrefs <- xml2::xml_attr(nodes, "href")
8989
unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
9090
}
9191

9292
.get_urls_from_PDF_file <-
93-
function(f)
93+
function(f, exe = NULL)
9494
{
9595
## Seems there is no straightforward way to extract hyperrefs from a
9696
## PDF, hence first convert to HTML.
97+
if(is.null(exe))
98+
exe <- Sys.which("pdftohtml")
99+
if(!nzchar(exe)) return(character())
97100
## Note that pdftohtml always outputs in cwd ...
98101
owd <- getwd()
99102
dir.create(d <- tempfile())
100103
on.exit({ unlink(d, recursive = TRUE); setwd(owd) })
101104
file.copy(normalizePath(f), d)
102105
setwd(d)
103106
g <- tempfile(tmpdir = d, fileext = ".xml")
104-
system2("pdftohtml",
107+
system2(exe,
105108
c("-s -q -i -c -xml", shQuote(basename(f)), shQuote(basename(g))))
106109
## Oh dear: seems that pdftohtml can fail without a non-zero exit
107110
## status.
@@ -151,6 +154,9 @@ url_db_from_PDF_files <-
151154
function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
152155
{
153156
urls <- parents <- character()
157+
exe <- Sys.which("pdftohtml")
158+
if(!nzchar(exe))
159+
return(url_db(urls, parents))
154160
if(is.null(files))
155161
files <- list.files(dir, pattern = "[.]pdf$",
156162
full.names = TRUE,
@@ -161,7 +167,7 @@ function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
161167
if(verbose)
162168
message(sprintf("processing %s",
163169
.file_path_relative_to_dir(f, dir)))
164-
.get_urls_from_PDF_file(f)
170+
.get_urls_from_PDF_file(f, exe)
165171
})
166172
names(urls) <- files
167173
urls <- Filter(length, urls)
@@ -313,6 +319,14 @@ function(dir, installed = FALSE)
313319
url_db(urls, rep.int(path, length(urls)))
314320
}
315321

322+
url_db_from_package_PDF_files <-
323+
function(dir, installed = FALSE)
324+
{
325+
path <- if(installed) "doc" else file.path("inst", "doc")
326+
files <- Sys.glob(file.path(dir, path, "*.pdf"))
327+
url_db_from_PDF_files(dir, files = files)
328+
}
329+
316330
url_db_from_package_sources <-
317331
function(dir, add = FALSE) {
318332
meta <- .get_package_metadata(dir, FALSE)
@@ -322,6 +336,7 @@ function(dir, add = FALSE) {
322336
url_db_from_package_news(dir))
323337
if(requireNamespace("xml2", quietly = TRUE)) {
324338
db <- rbind(db,
339+
url_db_from_package_PDF_files(dir),
325340
url_db_from_package_HTML_files(dir),
326341
url_db_from_package_README_md(dir),
327342
url_db_from_package_NEWS_md(dir)
@@ -350,6 +365,8 @@ function(packages, lib.loc = NULL, verbose = FALSE)
350365
url_db_from_package_news(dir, installed = TRUE))
351366
if(requireNamespace("xml2", quietly = TRUE)) {
352367
db <- rbind(db,
368+
url_db_from_package_PDF_files(dir,
369+
installed = TRUE),
353370
url_db_from_package_HTML_files(dir,
354371
installed = TRUE),
355372
url_db_from_package_README_md(dir,

0 commit comments

Comments
 (0)