Fixes for ReadPDFs for single-page files

mwmclean · mwmclean · commit 0420362fc566 · 2022-09-30T21:28:55.000+10:00
* Use metadata to calculate the number of pages in each document, avoiding the message/error raised by a second-pass read of the document when there is no second page * Fix the check for NA when combining lists from both passes of pdftotext * Update ReadPDFs unit tests * Closes #95
diff --git a/R/ReadPDFs.R b/R/ReadPDFs.R
@@ -84,8 +84,14 @@ ReadPDFs <- function (path, .enc = 'UTF-8', recursive = TRUE,
 
     dois <- vapply(out, SearchDOIText, "")
     doi.meta.ind <- vapply(dois, nzchar, FALSE)  # !is.na(dois)
+    pages.idx <- lapply(out, grep, patt = "^Pages:")
+    pages <- as.numeric(mapply(function(md, idx) sub("^[^0-9]*", "", md[idx]),
+                               out, pages.idx))
   }else
+  {
     doi.meta.ind <- logical(n.files)
+    pages <- rep(Inf, n.files)
+  }
 
   ########################################
   # search first two pages of pdf for DOI
@@ -99,9 +105,12 @@ ReadPDFs <- function (path, .enc = 'UTF-8', recursive = TRUE,
 
   tfile1 <- tempfile(fileext = '.txt')
   txt.files1 <- lapply(files, GetPDFTxt, page = 1, tfile = tfile1, enc = .enc)
-  txt.files2 <- lapply(files, GetPDFTxt, page = 2, tfile = tfile1, enc = .enc)
+  txt.files2 <- vector("list", n.files)
+  for (i in seq_len(n.files))
+      if (pages[i] > 1)
+          txt.files2[[i]] <- GetPDFTxt(files[[i]], page = 2, tfile = tfile1, enc = .enc)
   file.remove(tfile1)
-
+  
   ## check first page for JSTOR, if yes grab info from both pages, else NA
   resJSTOR <- mapply(CheckJSTOR, txt.files1, txt.files2, files, SIMPLIFY=FALSE)
   JSTOR.ind <- !is.na(resJSTOR)
diff --git a/R/ReadPDFsSupport.R b/R/ReadPDFsSupport.R
@@ -3,6 +3,8 @@
 ReadFirstPages <- function(doc, page.one = TRUE){
   doc <- unlist(doc)
   res <- list()
+  if (length(doc) == 0)
+      return(list(found.abstract = FALSE))
   found.abstract <- FALSE
 
   # arXiv
@@ -405,17 +407,17 @@ GetAuthorTitle <- function(doc, found.abstract, kw){
 #' @keywords internal
 #' @noRd
 CleanAuthorTitle <- function(bib1, bib2, bibMeta, file){
-  # browser()
-  if (!is.null(bibMeta)){ # Don't let Metadata date overwrite year from pdf text
+  has.meta <- !is.null(bibMeta) && !all(is.na(bibMeta))
+  if (has.meta){ # Don't let Metadata date overwrite year from pdf text
     if (!is.null(bib1$year) || !is.null(bib2$year))
       bibMeta$date <- NULL
   }
-    if (bib2$found.abstract && (!is.null(bib2$author) || !is.null(bib2$title))){
-    if(!is.null(bibMeta))
+  if (bib2$found.abstract && (!is.null(bib2$author) || !is.null(bib2$title))){
+    if(has.meta)
       bib1 <- AddListToList(bib1, bibMeta)
     bib <- AddListToList(bib2, bib1)
   }else{
-    if(!is.null(bibMeta))
+    if(has.meta)
       bib2 <- AddListToList(bib2, bibMeta)
     bib <- AddListToList(bib1, bib2)
   }
@@ -581,8 +583,8 @@ ProcessPDFSubject <- function(subj){
 #' @keywords internal
 #' @noRd
 AddListToList <- function(list1, list2){
-  c1 <- is.na(list1) || length(list1)==0
-  c2 <- is.na(list2) || length(list2)==0
+  c1 <- all(is.na(list1)) || length(list1)==0
+  c2 <- all(is.na(list2)) || length(list2)==0
 
   if (c1 && c2)
     return(NA)
diff --git a/tests/testthat/test-readPDF.R b/tests/testthat/test-readPDF.R
@@ -74,8 +74,8 @@ test_that("Creates a BibEntry object", {
                                      use.crossref = TRUE))
     expect_is(bib, "BibEntry")
     if (!biomet.fail)
-        expect_equal(as.character(bib[["azzalini1996multivariate"]]$author),
-                     c("A AZZALINI", "A DALLA VALLE"))
+        expect_equal(as.character(bib[["multivariate1996"]]$title),
+                     "The Multivariate Skew-normal Distribution")
 })
 
 test_that("Add file field", {
@@ -103,7 +103,7 @@ test_that("Recognizes JSTOR", {
     bib <- ReadPDFs(exe.path, progress = FALSE, use.crossref = FALSE)
     expect_equal(bib[author = "carrol"]$eprinttype, "jstor")
     expect_equal(bib[author = "carrol"]$url,
-                 "https://www.jstor.org/stable/25050155")
+                 "https://www.jstor.org/stable/24538366")
 })
 
 test_that("Recognizes arxiv", {
@@ -144,7 +144,7 @@ test_that("Reading journal and title", {
     expect_match(bib[year = "1996"]$journal, "Biometrika")
     expect_equal(bib[year = "1996"]$title,
                  "The Multivariate Skew-normal Distribution")
-    expect_equal(bib[year = "1996"]$bibtype, "Article")
+    expect_equal(bib[year = "1996"]$bibtype, "Misc")
 })
 
 test_that("use.metadata = FALSE", {