@@ -82,26 +82,29 @@ function(x, href = TRUE, ifdef = FALSE)
## Extract the hyperlink targets from the HTML file 'f'.
##
## Returns a character vector with the unique 'href' attribute values of
## all <a> elements in the document, dropping elements without an 'href'
## and references starting with "#" (fragments within the same page).
## If reading 'f' signals an error, an empty character vector is
## returned instead of propagating the error.
.get_urls_from_HTML_file <-
function(f)
{
    ## Capture a read error as a condition object, so that the class
    ## check below can bail out gracefully.
    doc <- tryCatch(xml2::read_html(f), error = identity)
    if(!inherits(doc, "xml_node")) return(character())
    nodes <- xml2::xml_find_all(doc, "//a")
    hrefs <- xml2::xml_attr(nodes, "href")
    ## xml_attr() gives NA for <a> elements without an href: test for
    ## these first so that the startsWith() result is never used for
    ## them.
    unique(hrefs[!is.na(hrefs) & !startsWith(hrefs, "#")])
}
9191
9292.get_urls_from_PDF_file <-
93- function (f )
93+ function (f , exe = NULL )
9494{
9595 # # Seems there is no straightforward way to extract hyperrefs from a
9696 # # PDF, hence first convert to HTML.
97+ if (is.null(exe ))
98+ exe <- Sys.which(" pdftohtml" )
99+ if (! nzchar(exe )) return (character ())
97100 # # Note that pdftohtml always outputs in cwd ...
98101 owd <- getwd()
99102 dir.create(d <- tempfile())
100103 on.exit({ unlink(d , recursive = TRUE ); setwd(owd ) })
101104 file.copy(normalizePath(f ), d )
102105 setwd(d )
103106 g <- tempfile(tmpdir = d , fileext = " .xml" )
104- system2(" pdftohtml " ,
107+ system2(exe ,
105108 c(" -s -q -i -c -xml" , shQuote(basename(f )), shQuote(basename(g ))))
106109 # # Oh dear: seems that pdftohtml can fail without a non-zero exit
107110 # # status.
@@ -151,6 +154,9 @@ url_db_from_PDF_files <-
151154function (dir , recursive = FALSE , files = NULL , verbose = FALSE )
152155{
153156 urls <- parents <- character ()
157+ exe <- Sys.which(" pdftohtml" )
158+ if (! nzchar(exe ))
159+ return (url_db(urls , parents ))
154160 if (is.null(files ))
155161 files <- list.files(dir , pattern = " [.]pdf$" ,
156162 full.names = TRUE ,
@@ -161,7 +167,7 @@ function(dir, recursive = FALSE, files = NULL, verbose = FALSE)
161167 if (verbose )
162168 message(sprintf(" processing %s" ,
163169 .file_path_relative_to_dir(f , dir )))
164- .get_urls_from_PDF_file(f )
170+ .get_urls_from_PDF_file(f , exe )
165171 })
166172 names(urls ) <- files
167173 urls <- Filter(length , urls )
@@ -313,6 +319,14 @@ function(dir, installed = FALSE)
313319 url_db(urls , rep.int(path , length(urls )))
314320}
315321
## Collect the URLs from the PDF files in a package's 'doc' directory.
##
## 'dir' is the top-level package directory.  The PDF files are looked
## for in 'inst/doc' below 'dir' by default, or in 'doc' when
## 'installed' is TRUE.  The matching files are handed to
## url_db_from_PDF_files(), whose result is returned as is.
url_db_from_package_PDF_files <-
function(dir, installed = FALSE)
{
    path <- if(installed) "doc" else file.path("inst", "doc")
    ## Only the files directly in the doc directory are globbed.
    files <- Sys.glob(file.path(dir, path, "*.pdf"))
    url_db_from_PDF_files(dir, files = files)
}
329+
316330url_db_from_package_sources <-
317331function (dir , add = FALSE ) {
318332 meta <- .get_package_metadata(dir , FALSE )
@@ -322,6 +336,7 @@ function(dir, add = FALSE) {
322336 url_db_from_package_news(dir ))
323337 if (requireNamespace(" xml2" , quietly = TRUE )) {
324338 db <- rbind(db ,
339+ url_db_from_package_PDF_files(dir ),
325340 url_db_from_package_HTML_files(dir ),
326341 url_db_from_package_README_md(dir ),
327342 url_db_from_package_NEWS_md(dir )
@@ -350,6 +365,8 @@ function(packages, lib.loc = NULL, verbose = FALSE)
350365 url_db_from_package_news(dir , installed = TRUE ))
351366 if (requireNamespace(" xml2" , quietly = TRUE )) {
352367 db <- rbind(db ,
368+ url_db_from_package_PDF_files(dir ,
369+ installed = TRUE ),
353370 url_db_from_package_HTML_files(dir ,
354371 installed = TRUE ),
355372 url_db_from_package_README_md(dir ,
0 commit comments