diff --git a/R/pmcOAI.R b/R/pmcOAI.R index b697fd4..f745ceb 100644 --- a/R/pmcOAI.R +++ b/R/pmcOAI.R @@ -1,6 +1,6 @@ # Get XML from PMC-OAI service (Pubmed Central Open Archives Initiative) -# http://www.ncbi.nlm.nih.gov/pmc/tools/oai/ +# https://www.ncbi.nlm.nih.gov/pmc/tools/oai/ pmcOAI <- function(id, ...){ @@ -11,11 +11,11 @@ pmcOAI <- function(id, ...){ id2 <- gsub("PMC", "", id) # file name for attributes - file <- paste("http://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="") + file <- paste("https://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="") # use getURL in RCurl package (readlines returns incomplete line warning and does not get errors (just 404 NOT found) - # url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" - url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" + # url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" + url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:" x <- getURL( paste0(url, id2), .encoding="UTF-8", ...) @@ -25,12 +25,18 @@ pmcOAI <- function(id, ...){ if(error=="idDoesNotExist") stop("No results found using ", id) message("No full text in Open Access Subset, downloading metadata only" ) - # url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" - url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" + # url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" + url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:" x <- getURL( paste0(url, id2), .encoding="UTF-8", ...) } + + doc <- processXML(x) + doc +} + +processXML <- function(x, id=NULL, file=NULL) { # Remove namespace for easier XPath queries # x[1] <- gsub(" xmlns=[^ ]*" , "", x[1]) # see PMC4515827 with tab before xmlns, \txmlns= @@ -43,10 +49,15 @@ pmcOAI <- function(id, ...){ x[n] <- gsub(">([^<])", ">^\\1", x[n]) doc <- xmlParse(x) - + ## ADD attributes - attr(doc, "id") <- id - attr(doc, "file") <- file + if (!is.null(id)) { + attr(doc, "id") <- id + } + + if (!is.null(file)) { + attr(doc, "file") <- file + } + doc } - diff --git a/README.md b/README.md index 1afa227..8e0e531 100644 --- a/README.md +++ b/README.md @@ -16,3 +16,5 @@ to read zip, word tables and pdf supplementary files. Additional details about the package are on the [wiki pages](https://github.com/cstubben/pmcXML/wiki/Overview) and in [BMC Bioinformatics](http://www.biomedcentral.com/1471-2105/15/43/abstract). Stubben, CJ and JC Challacombe, 2014. Mining locus tags in PubMed Central to improve microbial gene annotation. BMC Bioinformatics 15:43. + +