Skip to content

Commit 0420362

Browse files
committed
Fixes for ReadPDFs for 1 page files
* Use metadata to calculate the number of pages in each document, avoiding the message/error caused by a second-pass read of the document when there is no second page
* Fix the check for NA when combining lists from both passes of pdftotext
* Update ReadPDFs unit tests
* Closes #95
1 parent 23c9f41 commit 0420362

File tree

3 files changed

+24
-13
lines changed

3 files changed

+24
-13
lines changed

R/ReadPDFs.R

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,14 @@ ReadPDFs <- function (path, .enc = 'UTF-8', recursive = TRUE,
8484

8585
dois <- vapply(out, SearchDOIText, "")
8686
doi.meta.ind <- vapply(dois, nzchar, FALSE) # !is.na(dois)
87+
pages.idx <- lapply(out, grep, patt = "^Pages:")
88+
pages <- as.numeric(mapply(function(md, idx) sub("^[^0-9]*", "", md[idx]),
89+
out, pages.idx))
8790
}else
91+
{
8892
doi.meta.ind <- logical(n.files)
93+
pages <- rep(Inf, n.files)
94+
}
8995

9096
########################################
9197
# search first two pages of pdf for DOI
@@ -99,9 +105,12 @@ ReadPDFs <- function (path, .enc = 'UTF-8', recursive = TRUE,
99105

100106
tfile1 <- tempfile(fileext = '.txt')
101107
txt.files1 <- lapply(files, GetPDFTxt, page = 1, tfile = tfile1, enc = .enc)
102-
txt.files2 <- lapply(files, GetPDFTxt, page = 2, tfile = tfile1, enc = .enc)
108+
txt.files2 <- vector("list", n.files)
109+
for (i in seq_len(n.files))
110+
if (pages[i] > 1)
111+
txt.files2[[i]] <- GetPDFTxt(files[[i]], page = 2, tfile = tfile1, enc = .enc)
103112
file.remove(tfile1)
104-
113+
105114
## check first page for JSTOR, if yes grab info from both pages, else NA
106115
resJSTOR <- mapply(CheckJSTOR, txt.files1, txt.files2, files, SIMPLIFY=FALSE)
107116
JSTOR.ind <- !is.na(resJSTOR)

R/ReadPDFsSupport.R

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
ReadFirstPages <- function(doc, page.one = TRUE){
44
doc <- unlist(doc)
55
res <- list()
6+
if (length(doc) == 0)
7+
return(list(found.abstract = FALSE))
68
found.abstract <- FALSE
79

810
# arXiv
@@ -405,17 +407,17 @@ GetAuthorTitle <- function(doc, found.abstract, kw){
405407
#' @keywords internal
406408
#' @noRd
407409
CleanAuthorTitle <- function(bib1, bib2, bibMeta, file){
408-
# browser()
409-
if (!is.null(bibMeta)){ # Don't let Metadata date overwrite year from pdf text
410+
has.meta <- !is.null(bibMeta) && !all(is.na(bibMeta))
411+
if (has.meta){ # Don't let Metadata date overwrite year from pdf text
410412
if (!is.null(bib1$year) || !is.null(bib2$year))
411413
bibMeta$date <- NULL
412414
}
413-
if (bib2$found.abstract && (!is.null(bib2$author) || !is.null(bib2$title))){
414-
if(!is.null(bibMeta))
415+
if (bib2$found.abstract && (!is.null(bib2$author) || !is.null(bib2$title))){
416+
if(has.meta)
415417
bib1 <- AddListToList(bib1, bibMeta)
416418
bib <- AddListToList(bib2, bib1)
417419
}else{
418-
if(!is.null(bibMeta))
420+
if(has.meta)
419421
bib2 <- AddListToList(bib2, bibMeta)
420422
bib <- AddListToList(bib1, bib2)
421423
}
@@ -581,8 +583,8 @@ ProcessPDFSubject <- function(subj){
581583
#' @keywords internal
582584
#' @noRd
583585
AddListToList <- function(list1, list2){
584-
c1 <- is.na(list1) || length(list1)==0
585-
c2 <- is.na(list2) || length(list2)==0
586+
c1 <- all(is.na(list1)) || length(list1)==0
587+
c2 <- all(is.na(list2)) || length(list2)==0
586588

587589
if (c1 && c2)
588590
return(NA)

tests/testthat/test-readPDF.R

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,8 @@ test_that("Creates a BibEntry object", {
7474
use.crossref = TRUE))
7575
expect_is(bib, "BibEntry")
7676
if (!biomet.fail)
77-
expect_equal(as.character(bib[["azzalini1996multivariate"]]$author),
78-
c("A AZZALINI", "A DALLA VALLE"))
77+
expect_equal(as.character(bib[["multivariate1996"]]$title),
78+
"The Multivariate Skew-normal Distribution")
7979
})
8080

8181
test_that("Add file field", {
@@ -103,7 +103,7 @@ test_that("Recognizes JSTOR", {
103103
bib <- ReadPDFs(exe.path, progress = FALSE, use.crossref = FALSE)
104104
expect_equal(bib[author = "carrol"]$eprinttype, "jstor")
105105
expect_equal(bib[author = "carrol"]$url,
106-
"https://www.jstor.org/stable/25050155")
106+
"https://www.jstor.org/stable/24538366")
107107
})
108108

109109
test_that("Recognizes arxiv", {
@@ -144,7 +144,7 @@ test_that("Reading journal and title", {
144144
expect_match(bib[year = "1996"]$journal, "Biometrika")
145145
expect_equal(bib[year = "1996"]$title,
146146
"The Multivariate Skew-normal Distribution")
147-
expect_equal(bib[year = "1996"]$bibtype, "Article")
147+
expect_equal(bib[year = "1996"]$bibtype, "Misc")
148148
})
149149

150150
test_that("use.metadata = FALSE", {

0 commit comments

Comments
 (0)