From c0d7fdc5fe93c73c93d4c57db4966a8217a215e2 Mon Sep 17 00:00:00 2001 From: Diego Doe Date: Thu, 10 Jan 2019 22:03:25 +0100 Subject: [PATCH 1/5] Fixed problem apparently due to using http protocol instead of https. --- R/pmcOAI.R | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/R/pmcOAI.R b/R/pmcOAI.R index b697fd4..4cf1af5 100644 --- a/R/pmcOAI.R +++ b/R/pmcOAI.R @@ -1,6 +1,6 @@ # Get XML from PMC-OAI service (Pubmed Central Open Archives Initiative) -# http://www.ncbi.nlm.nih.gov/pmc/tools/oai/ +# https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ pmcOAI <- function(id, ...){ @@ -11,11 +11,11 @@ pmcOAI <- function(id, ...){ id2 <- gsub("PMC", "", id) # file name for attributes - file <- paste("http://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="") + file <- paste("https://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="") # use getURL in RCurl package (readlines returns incomplete line warning and does not get errors (just 404 NOT found) - # url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" - url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" + # url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" + url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" x <- getURL( paste0(url, id2), .encoding="UTF-8", ...) 
@@ -25,8 +25,8 @@ pmcOAI <- function(id, ...){ if(error=="idDoesNotExist") stop("No results found using ", id) message("No full text in Open Access Subset, downloading metadata only" ) - # url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" - url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" + # url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" + url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" x <- getURL( paste0(url, id2), .encoding="UTF-8", ...) From 9ef297b25ce88b3a55cde6bdd34ae107dea7b2b7 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Fri, 25 Jan 2019 23:16:12 +0100 Subject: [PATCH 2/5] split downloading and parsing xml apart This allows pre-downloading of XML files and then simply calling `processXML` (defined in R/pmcOAI.R) with the contents of those XML files. --- R/pmcOAI.R | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/R/pmcOAI.R b/R/pmcOAI.R index 4cf1af5..ef098e5 100644 --- a/R/pmcOAI.R +++ b/R/pmcOAI.R @@ -31,6 +31,12 @@ pmcOAI <- function(id, ...){ x <- getURL( paste0(url, id2), .encoding="UTF-8", ...) 
} + + doc <- processXML(x) + doc +} + +processXML <- function(xmlFile, id=NULL, file=NULL) { # Remove namespace for easier XPath queries # x[1] <- gsub(" xmlns=[^ ]*" , "", x[1]) # see PMC4515827 with tab before xmlns, \txmlns= @@ -43,10 +49,15 @@ pmcOAI <- function(id, ...){ x[n] <- gsub(">([^<])", ">^\\1", x[n]) doc <- xmlParse(x) - + ## ADD attributes - attr(doc, "id") <- id - attr(doc, "file") <- file + if (id) { + attr(doc, "id") <- id + } + + if (file) { + attr(doc, "file") <- file + } + doc } - From 140d337840a9c6080520eb2459cbbd63081c3984 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Sat, 26 Jan 2019 12:20:48 +0100 Subject: [PATCH 3/5] minor fix: rename processXML argument xmlFile to x, the name its body uses --- R/pmcOAI.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pmcOAI.R b/R/pmcOAI.R index ef098e5..d788fde 100644 --- a/R/pmcOAI.R +++ b/R/pmcOAI.R @@ -36,7 +36,7 @@ pmcOAI <- function(id, ...){ doc } -processXML <- function(xmlFile, id=NULL, file=NULL) { +processXML <- function(x, id=NULL, file=NULL) { # Remove namespace for easier XPath queries # x[1] <- gsub(" xmlns=[^ ]*" , "", x[1]) # see PMC4515827 with tab before xmlns, \txmlns= From 3c80ead4a433fa81e35d292b2b7ae0e8a76d88c2 Mon Sep 17 00:00:00 2001 From: MFreidank Date: Sat, 26 Jan 2019 12:22:03 +0100 Subject: [PATCH 4/5] turn check to !is.null --- R/pmcOAI.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pmcOAI.R b/R/pmcOAI.R index d788fde..f745ceb 100644 --- a/R/pmcOAI.R +++ b/R/pmcOAI.R @@ -51,11 +51,11 @@ processXML <- function(x, id=NULL, file=NULL) { doc <- xmlParse(x) ## ADD attributes - if (id) { + if (!is.null(id)) { attr(doc, "id") <- id } - if (file) { + if (!is.null(file)) { attr(doc, "file") <- file } From ced317588dca0cae0ef0fad3796f36248f0c0d09 Mon Sep 17 00:00:00 2001 From: Diego Doe Date: Sat, 27 Jul 2019 16:54:49 +0200 Subject: [PATCH 5/5] added spaces --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1afa227..8e0e531 100644 --- a/README.md +++ 
b/README.md @@ -16,3 +16,5 @@ to read zip, word tables and pdf supplementary files. Additional details about the package are on the [wiki pages](https://github.com/cstubben/pmcXML/wiki/Overview) and in [BMC Bioinformatics](http://www.biomedcentral.com/1471-2105/15/43/abstract). Stubben, CJ and JC Challacombe, 2014. Mining locus tags in PubMed Central to improve microbial gene annotation. BMC Bioinformatics 15:43. + +