Skip to content

Commit dac00b7

Browse files
authored
change get-literature.R to use rentrez only instead of easyPubMed (#306)
## Description The PubMed queries kept failing and I was struggling to debug them. I switched to using rentrez only instead of easyPubMed and things appear to be working now. I also changed the way the queries are created to make them a bit easier to read. @laurelhiatt did I mess any of the logic up? Here is the latest Literature Update using this script (to check the output): #307 ## Major Changes - change get-literature.R to use rentrez only instead of easyPubMed ## Minor Changes - Reformat queries ## Checklist - [x] All changes are well summarized - [x] Check all tests pass - [ ] Check that the website preview looks good - [ ] Update the STRchive version in `CITATION.cff`, format X.Y.Z. If any major changes, increment Y. If only minor changes, increment Z. If there is a breaking change (rare), increment X. - [x] Ask someone to review this PR
1 parent 2b3fe6d commit dac00b7

File tree

1 file changed

+155
-86
lines changed

1 file changed

+155
-86
lines changed

scripts/get-literature.R

Lines changed: 155 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@ suppressPackageStartupMessages({
2929
library(dplyr)
3030
library(biomaRt)
3131
library(rentrez)
32-
library(easyPubMed)
3332
library(stringr)
3433
library(purrr)
3534
})
@@ -160,6 +159,33 @@ consolidated_strings <- gsub("BMD", "Becker muscular dystrophy", consolidated_st
160159
#where results will be stored
161160
base_directory <- args[2]
162161

162+
# Optional: set NCBI email from environment (recommended by NCBI)
163+
# if (nzchar(Sys.getenv("ENTREZ_EMAIL"))) {
164+
# set_entrez_email(Sys.getenv("ENTREZ_EMAIL"))
165+
# } else {
166+
# cat("Note: Set ENTREZ_EMAIL env var to comply with NCBI policies.\n", file=stderr())
167+
# }
168+
169+
# Helper to fetch MEDLINE text in batches and write to file prefix
170+
# Fetch MEDLINE-format records for a set of PubMed IDs and write them to disk.
#
# The IDs are requested with entrez_fetch() in consecutive batches. A batch
# whose request fails is reported on stderr and skipped (best effort), and a
# summary line is printed when any batch was lost so that a truncated output
# file is easy to spot in the log.
#
# Arguments:
#   pmids          Vector of PubMed IDs; NULL, NA and empty-string entries are
#                  ignored.
#   outfile_prefix Path prefix of the output file; the records are written to
#                  "<outfile_prefix>_batch_01.txt".
#   batch_size     Maximum number of IDs sent per request (default 200).
#
# Returns TRUE when the output file exists after writing, and FALSE when there
# were no usable IDs or no batch could be fetched (nothing is written then).
fetch_medline_and_write <- function(pmids, outfile_prefix, batch_size = 200) {
  # Never send missing or blank IDs to NCBI
  pmids <- pmids[!is.na(pmids) & nzchar(as.character(pmids))]
  if (length(pmids) == 0) {
    return(FALSE)
  }
  stopifnot(
    is.numeric(batch_size),
    length(batch_size) == 1L,
    !is.na(batch_size),
    batch_size >= 1
  )

  # One list slot per batch instead of growing a vector inside a loop
  batch_starts <- seq(1, length(pmids), by = batch_size)
  fetched <- lapply(batch_starts, function(first) {
    chunk <- pmids[first:min(first + batch_size - 1, length(pmids))]
    tryCatch(
      entrez_fetch(db = "pubmed", id = chunk, rettype = "medline", retmode = "text"),
      error = function(e) {
        cat("entrez_fetch error:", conditionMessage(e), "\n", file = stderr())
        NULL
      }
    )
  })

  failed <- vapply(fetched, is.null, logical(1))
  if (any(failed)) {
    cat(
      "Warning:", sum(failed), "of", length(fetched),
      "PubMed batches could not be fetched; output for", outfile_prefix,
      "is incomplete.\n",
      file = stderr()
    )
  }

  medline_texts <- unlist(fetched[!failed], use.names = FALSE)
  if (length(medline_texts) == 0) {
    return(FALSE)
  }

  out_file <- paste0(outfile_prefix, "_batch_01.txt")
  writeLines(medline_texts, con = out_file, useBytes = TRUE)
  file.exists(out_file)
}
188+
163189
# function to perform the pubmed query
164190
# Function printout includes gene name and if there are results, confirms
165191
# that a file has been created
@@ -178,50 +204,82 @@ perform_pubmed_query <- function(gene_info) {
178204

179205
# Adding [Title/Abstract] to each term
180206
joined_terms <- paste0(individual_terms, "[Title/Abstract]")
181-
#joined_terms <- paste0('(', paste(or_terms, collapse = '[Title/Abstract] OR '), ')[Title/Abstract]')
182207
# Construct the query with organized or_terms
183-
query <- paste0('("repeat expansion"[Title/Abstract] OR "tandem repeat"[Title/Abstract] OR "repeat expansions"[Title/Abstract] OR "tandem repeats"[Title/Abstract] OR "repeat sequence"[Title/Abstract] OR "repeat sequences"[Title/Abstract] OR "repeat length"[Title/Abstract] OR "repeat lengths"[Title/Abstract] OR "expansion"[Title] OR "expansions"[Title] OR "repeats"[Title]) AND (', paste(joined_terms, collapse = " OR "),') AND "English"[Language] AND ("disease"[Title/Abstract] OR "disorder"[Title/Abstract] OR "diseases"[Title/Abstract] OR "disorders"[Title/Abstract] OR "syndrome"[Title/Abstract] OR "syndromes"[Title/Abstract] OR "patient"[Title/Abstract] OR "patients"[Title/Abstract] OR "proband"[Title/Abstract] OR "probands"[Title/Abstract]) AND ("journal article"[Publication Type] OR "letter"[Publication Type] or "Case Reports"[Publication Type]) NOT "review"[Publication Type]')
184-
185-
186-
# Clean up any unnecessary slashes from the query
187-
query <- gsub(" ", " ", query) # Remove double spaces
188-
#print(query)
208+
terms_repeat <- c(
209+
'"repeat expansion"[Title/Abstract]',
210+
'"repeat expansions"[Title/Abstract]',
211+
'"tandem repeat"[Title/Abstract]',
212+
'"tandem repeats"[Title/Abstract]',
213+
'"repeat sequence"[Title/Abstract]',
214+
'"repeat sequences"[Title/Abstract]',
215+
'"repeat length"[Title/Abstract]',
216+
'"repeat lengths"[Title/Abstract]',
217+
'"expansion"[Title]',
218+
'"expansions"[Title]',
219+
'"repeats"[Title]'
220+
)
221+
222+
terms_gene <- joined_terms # e.g., "FMR1"[Title/Abstract] OR "FMR-1"[Title/Abstract] ...
223+
terms_language <- '"English"[Language]'
224+
terms_disease <- c(
225+
"disease*[Title/Abstract]",
226+
"disorder*[Title/Abstract]",
227+
"syndrome*[Title/Abstract]",
228+
"patient*[Title/Abstract]",
229+
"proband*[Title/Abstract]"
230+
)
231+
terms_pubtype <- c(
232+
'"journal article"[Publication Type]',
233+
'"letter"[Publication Type]',
234+
'"Case Reports"[Publication Type]'
235+
)
236+
terms_exclude <- '"review"[Publication Type]'
237+
238+
query <- paste(
239+
"(" , paste(terms_repeat, collapse = " OR "), ")",
240+
"AND (", paste(terms_gene, collapse = " OR "), ")",
241+
"AND", terms_language,
242+
"AND (", paste(terms_disease, collapse = " OR "), ")",
243+
"AND (", paste(terms_pubtype, collapse = " OR "), ")",
244+
"NOT", terms_exclude
245+
)
246+
247+
# Clean up double spaces
248+
query <- gsub("\\s{2,}", " ", query) # Remove double spaces
189249
gene_name <- gsub('"', '', gene_name)
190250
out_prefix <- paste0(base_directory, "/", gene_name)
191251

192-
# Include a separator ("/") between base_directory and gene_name
193-
# Modify dest_file_prefix to include the full file path
252+
# print query for debugging
253+
# cat("PubMed query for gene", gene_name, ":\n", query, "\n")
254+
255+
# Use rentrez to search and fetch
194256
tryCatch({
195-
epm_object <- epm_query(query)
196-
cat("Found", epm_object@meta$exp_count, "articles for gene:", gene_name, "\n", file=stderr())
197-
if (epm_object@meta$exp_count == 0) {
257+
srch0 <- entrez_search(db = "pubmed", term = query, retmax = 0, use_history = FALSE)
258+
count <- ifelse(is.null(srch0$count), 0L, as.integer(srch0$count))
259+
cat("Found", count, "articles for gene:", gene_name, "\n", file=stderr())
260+
if (count == 0L) {
198261
cat("Skipping fetch for:", gene_name, "\n", file=stderr())
199-
next # Skip to the next gene if no articles found
262+
next
200263
}
201-
202-
epm_fetch(epm_object,
203-
write_to_file = TRUE,
204-
outfile_path = NULL, # Uses current working directory if NULL
205-
format = "medline",
206-
encoding = "UTF-8",
207-
outfile_prefix = out_prefix)
264+
retmax <- min(count, 10000L)
265+
srch <- entrez_search(db = "pubmed", term = query, retmax = retmax)
266+
ok <- fetch_medline_and_write(srch$ids, out_prefix)
267+
if (!ok) stop("Failed to write MEDLINE file")
208268
}, error = function(e) {
209-
cat("batch pubmed download error.\n", file=stderr())
269+
cat("batch pubmed download error: ", conditionMessage(e), "\n", file=stderr())
210270
quit(status = 1)
211271
})
212272

213273
# output file name
214274
out_file <- paste0(out_prefix, "_batch_01.txt")
215275
cat(out_file, "\n", file=stderr())
216276

217-
# Check if the file was created successfully XXX What if file existed before script ran?
277+
# Check if the file was created successfully
218278
cat("Full file path:", out_file, "\n", file=stderr())
219279
if (file.exists(out_file)) {
220-
#cat("File exists.\n", file=stderr())
221280
file_paths[[gene_name]] <- out_file
222281
} else {
223282
cat("Error: File not found -", out_file, "\n", file=stderr())
224-
#quit(status = 1)
225283
}
226284
}
227285

@@ -257,21 +315,15 @@ for (gene_name in names(file_paths)) {
257315
pub_info_list <- list()
258316

259317
extract_citation_info <- function(medline_data_list, gene_name) {
260-
# Combine the list of XML strings into a single string
318+
# Combine the MEDLINE records into a single string
261319
medline_string <- paste(medline_data_list, collapse = "")
262320

263-
# Use regular expressions to extract PMID, publication years, and titles
264-
# Extract PMIDs
321+
# Extract PMIDs, dates, and titles
265322
pmids <- str_extract_all(medline_string, "(?<=PMID- )\\d+")[[1]]
266-
#print(pmids)
267-
# Extract Publication Dates
268323
publication_dates <- str_extract_all(medline_string, "(?<=DP - )\\d+")[[1]]
269-
#print(publication_dates)
270-
# Extract Titles
271324
title <- str_extract_all(medline_string, "(?<=TI - ).+?(?=\\.|\\?)")[[1]]
272-
#print(title)
273325

274-
# Ensure all vectors have the same length
326+
# Align vector lengths
275327
length_diff <- length(pmids) - length(publication_dates)
276328
if (length_diff > 0) {
277329
publication_dates <- c(publication_dates, rep(NA, length_diff))
@@ -286,45 +338,47 @@ extract_citation_info <- function(medline_data_list, gene_name) {
286338
pmids <- c(pmids, rep(NA, -length_diff))
287339
}
288340

289-
# Create a dataframe with gene_name, PMID, PublicationYear, and Title
290-
pub_info_df <- data.frame(gene = rep(gene_name, length(pmids)),
291-
PMID = pmids,
292-
PublicationDate = publication_dates,
293-
Title = title,
294-
stringsAsFactors = FALSE)
295-
296-
return(pub_info_df)
341+
# Data frame output
342+
data.frame(
343+
gene = rep(gene_name, length(pmids)),
344+
PMID = pmids,
345+
PublicationDate = publication_dates,
346+
Title = title,
347+
stringsAsFactors = FALSE
348+
)
297349
}
298350

299351
# Extract citation information for every gene that has MEDLINE text; genes
# whose file was empty are skipped and get no entry in pub_info_list.
for (gene_name in names(all_publications)) {
  medline_data_list <- all_publications[[gene_name]]
  if (length(medline_data_list) == 0) next # skip empty files
  pub_info_df <- extract_citation_info(medline_data_list, gene_name)
  pub_info_list[[gene_name]] <- pub_info_df
}

# Combine all the dataframes into a single dataframe; fall back to an empty
# frame with the same columns so the code below works when nothing was parsed
if (length(pub_info_list) == 0) {
  all_pub_info_df <- data.frame(
    gene = character(),
    PMID = character(),
    PublicationDate = character(),
    Title = character(),
    stringsAsFactors = FALSE
  )
} else {
  all_pub_info_df <- do.call(rbind, pub_info_list)
}

# add pubmed search results to literature field for entry
data <- data %>%
  mutate(additional_literature = map_chr(gene, function(g) {
    # Unique, non-missing PMIDs recorded for this gene
    matching_pmids <- all_pub_info_df %>%
      filter(gene == g, !is.na(PMID)) %>%
      pull(PMID) %>%
      unique()

    # Test for "no matches" BEFORE pasting: paste0() treats a zero-length
    # argument as "", so pasting first would yield the bogus entry "@pmid:"
    # and the emptiness check could never be TRUE.
    if (length(matching_pmids) == 0) {
      return("")
    }
    paste0("@pmid:", matching_pmids, collapse = ",")
  }))

330384
# remove any redundant pmids from additional_literature that are in references
@@ -360,56 +414,71 @@ write_json(lit_data, args[4])
360414
#new locus query found from reviewing pertinent terms in discovery papers
361415
# New-locus query, built from terms found by reviewing discovery papers.
#
# Searches PubMed for English-language, non-review articles that combine
# repeat-expansion terms with discovery and disease vocabulary, then writes
# the MEDLINE records to "<base_directory>/new_loci_batch_01.txt" using
# fetch_medline_and_write().
#
# Returns the output file path when the file exists afterwards, NULL when the
# search finds no articles, and an empty list if the file is missing after the
# fetch. Any error during search or fetch is reported on stderr and ends the
# script with exit status 1.
perform_new_pubmed_query <- function() {
  file_path <- list() # returned unchanged if the output file is missing

  # Largest number of PMIDs requested from a single search
  max_records <- 10000L

  terms_repeat <- c(
    '"repeat expansion"[Title/Abstract]',
    '"tandem repeat"[Title/Abstract]'
  )
  terms_discovery <- c(
    '"discovered"[Title/Abstract]',
    '"identified"[Title/Abstract]',
    '"causative"[Title/Abstract]',
    '"underlie"[Title/Abstract]',
    '"basis"[Title/Abstract]'
  )
  terms_language <- '"English"[Language]'
  terms_disease <- c(
    '"disease"[Title/Abstract]',
    '"disorder"[Title/Abstract]',
    '"syndrome"[Title/Abstract]',
    '"condition*"[Title/Abstract]'
  )
  terms_pubtype <- c(
    '"journal article"[Publication Type]',
    '"letter"[Publication Type]',
    '"Case Reports"[Publication Type]'
  )
  terms_exclude <- '"review"[Publication Type]'

  # Each group is OR-ed internally; the groups are AND-ed, reviews excluded
  query <- paste(
    "(", paste(terms_repeat, collapse = " OR "), ")",
    "AND (", paste(terms_discovery, collapse = " OR "), ")",
    "AND", terms_language,
    "AND (", paste(terms_disease, collapse = " OR "), ")",
    "AND (", paste(terms_pubtype, collapse = " OR "), ")",
    "NOT", terms_exclude
  )

  # Collapse runs of whitespace to a single space
  query <- gsub("\\s{2,}", " ", query)
  out_prefix <- paste0(base_directory, "/new_loci")

  tryCatch({
    # First search only asks for the hit count (retmax = 0)
    srch0 <- entrez_search(db = "pubmed", term = query, retmax = 0, use_history = FALSE)
    count <- if (is.null(srch0$count)) 0L else as.integer(srch0$count)
    if (is.na(count)) {
      stop("PubMed search did not return a usable hit count")
    }
    if (count == 0L) {
      cat("Skipping fetch for new loci - no articles found.\n", file = stderr())
      return(NULL)
    }
    if (count > max_records) {
      # Make the cap visible instead of silently dropping records
      cat(
        "Warning: PubMed reports", count, "articles for new loci; only the first",
        max_records, "will be fetched.\n",
        file = stderr()
      )
    }
    retmax <- min(count, max_records)
    srch <- entrez_search(db = "pubmed", term = query, retmax = retmax)
    ok <- fetch_medline_and_write(srch$ids, out_prefix)
    if (!ok) stop("Failed to write MEDLINE file")
  }, error = function(e) {
    cat("batch pubmed download error: ", conditionMessage(e), "\n", file = stderr())
    quit(status = 1)
  })

  out_file <- paste0(out_prefix, "_batch_01.txt")
  cat("Full file path:", out_file, "\n", file = stderr())
  if (file.exists(out_file)) {
    file_path <- out_file
  } else {
    cat("Error: File not found -", out_file, "\n", file = stderr())
  }

  return(file_path)
}
409481

410-
411-
412-
perform_new_pubmed_query()
413482
#
414483
# ### Let's get all the citations to run manubot on
415484
extract_citations <- function(column) {

0 commit comments

Comments
 (0)