Skip to content
This repository was archived by the owner on Mar 21, 2019. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions R/pmcOAI.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Get XML from PMC-OAI service (Pubmed Central Open Archives Initiative)

# http://www.ncbi.nlm.nih.gov/pmc/tools/oai/
# https://www.ncbi.nlm.nih.gov/pmc/tools/oai/

pmcOAI <- function(id, ...){

Expand All @@ -11,11 +11,11 @@ pmcOAI <- function(id, ...){
id2 <- gsub("PMC", "", id)

# file name for attributes
file <- paste("http://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="")
file <- paste("https://www.ncbi.nlm.nih.gov/pmc/articles/", id, sep="")

# use getURL in RCurl package (readLines returns an incomplete line warning and does not get errors, just 404 Not Found)
# url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:"
url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:"
# url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:"
url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc&identifier=oai:pubmedcentral.nih.gov:"

x <- getURL( paste0(url, id2), .encoding="UTF-8", ...)

Expand All @@ -25,12 +25,18 @@ pmcOAI <- function(id, ...){
if(error=="idDoesNotExist") stop("No results found using ", id)

message("No full text in Open Access Subset, downloading metadata only" )
# url <- "http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:"
url <- "http://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:"
# url <- "https://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:"
url <- "https://www.ncbi.nlm.nih.gov/pmc/oai/oai.cgi?verb=GetRecord&metadataPrefix=pmc_fm&identifier=oai:pubmedcentral.nih.gov:"

x <- getURL( paste0(url, id2), .encoding="UTF-8", ...)

}

doc <- processXML(x)
doc
}

processXML <- function(x, id=NULL, file=NULL) {
# Remove namespace for easier XPath queries
# x[1] <- gsub(" xmlns=[^ ]*" , "", x[1])
# see PMC4515827 with tab before xmlns, \txmlns=
Expand All @@ -43,10 +49,15 @@ pmcOAI <- function(id, ...){
x[n] <- gsub(">([^<])</xref>", ">^\\1</xref>", x[n])

doc <- xmlParse(x)

## ADD attributes
attr(doc, "id") <- id
attr(doc, "file") <- file
if (!is.null(id)) {
attr(doc, "id") <- id
}

if (!is.null(file)) {
attr(doc, "file") <- file
}

doc
}

2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ to read zip, word tables and pdf supplementary files.
Additional details about the package are on the [wiki pages](https://github.com/cstubben/pmcXML/wiki/Overview) and in [BMC Bioinformatics](http://www.biomedcentral.com/1471-2105/15/43/abstract).

Stubben, CJ and JC Challacombe, 2014. Mining locus tags in PubMed Central to improve microbial gene annotation. BMC Bioinformatics 15:43.