@@ -29,7 +29,6 @@ suppressPackageStartupMessages({
2929 library(dplyr )
3030 library(biomaRt )
3131 library(rentrez )
32- library(easyPubMed )
3332 library(stringr )
3433 library(purrr )
3534})
@@ -160,6 +159,33 @@ consolidated_strings <- gsub("BMD", "Becker muscular dystrophy", consolidated_st
160159# where results will be stored
161160base_directory <- args [2 ]
162161
162+ # Optional: set NCBI email from environment (recommended by NCBI)
163+ # if (nzchar(Sys.getenv("ENTREZ_EMAIL"))) {
164+ # set_entrez_email(Sys.getenv("ENTREZ_EMAIL"))
165+ # } else {
166+ # cat("Note: Set ENTREZ_EMAIL env var to comply with NCBI policies.\n", file=stderr())
167+ # }
168+
# Fetch PubMed records as MEDLINE text, one request per batch of IDs, and
# write everything to a single file named from outfile_prefix.
#   pmids          vector of PubMed IDs
#   outfile_prefix path prefix; the batch suffix literal below is appended
#   batch_size     number of IDs sent per entrez_fetch() request
# Returns FALSE when there are no IDs or no batch could be fetched; otherwise
# the result of file.exists() on the written file.
# NOTE(review): string literals are kept exactly as found, including their
# leading spaces — confirm they match the upstream file.
fetch_medline_and_write <- function(pmids, outfile_prefix, batch_size = 200) {
  n_ids <- length(pmids)
  if (n_ids == 0) {
    return(FALSE)
  }

  # One request per batch; a failed request is reported on stderr and
  # contributes nothing to the output
  fetch_batch <- function(first) {
    last <- min(first + batch_size - 1, n_ids)
    tryCatch(
      entrez_fetch(
        db = " pubmed",
        id = pmids[first:last],
        rettype = " medline",
        retmode = " text"
      ),
      error = function(e) {
        cat(" entrez_fetch error:", conditionMessage(e), " \n ", file = stderr())
        NULL
      }
    )
  }

  batch_starts <- seq(1, n_ids, by = batch_size)
  # unlist() drops the NULL entries left by failed batches
  medline_texts <- unlist(lapply(batch_starts, fetch_batch), use.names = FALSE)
  if (length(medline_texts) == 0) {
    return(FALSE)
  }

  target <- paste0(outfile_prefix, " _batch_01.txt")
  writeLines(medline_texts, con = target, useBytes = TRUE)
  file.exists(target)
}
188+
163189# function to perform the pubmed query
164190# Function printout includes gene name and if there are results, confirms
165191# that a file has been created
@@ -178,50 +204,82 @@ perform_pubmed_query <- function(gene_info) {
178204
179205 # Adding [Title/Abstract] to each term
180206 joined_terms <- paste0(individual_terms , " [Title/Abstract]" )
181- # joined_terms <- paste0('(', paste(or_terms, collapse = '[Title/Abstract] OR '), ')[Title/Abstract]')
182207 # Construct the query with organized or_terms
183- query <- paste0(' ("repeat expansion"[Title/Abstract] OR "tandem repeat"[Title/Abstract] OR "repeat expansions"[Title/Abstract] OR "tandem repeats"[Title/Abstract] OR "repeat sequence"[Title/Abstract] OR "repeat sequences"[Title/Abstract] OR "repeat length"[Title/Abstract] OR "repeat lengths"[Title/Abstract] OR "expansion"[Title] OR "expansions"[Title] OR "repeats"[Title]) AND (' , paste(joined_terms , collapse = " OR " ),' ) AND "English"[Language] AND ("disease"[Title/Abstract] OR "disorder"[Title/Abstract] OR "diseases"[Title/Abstract] OR "disorders"[Title/Abstract] OR "syndrome"[Title/Abstract] OR "syndromes"[Title/Abstract] OR "patient"[Title/Abstract] OR "patients"[Title/Abstract] OR "proband"[Title/Abstract] OR "probands"[Title/Abstract]) AND ("journal article"[Publication Type] OR "letter"[Publication Type] or "Case Reports"[Publication Type]) NOT "review"[Publication Type]' )
184-
185-
186- # Clean up any unnecessary slashes from the query
187- query <- gsub(" " , " " , query ) # Remove double spaces
188- # print(query)
208+ terms_repeat <- c(
209+ ' "repeat expansion"[Title/Abstract]' ,
210+ ' "repeat expansions"[Title/Abstract]' ,
211+ ' "tandem repeat"[Title/Abstract]' ,
212+ ' "tandem repeats"[Title/Abstract]' ,
213+ ' "repeat sequence"[Title/Abstract]' ,
214+ ' "repeat sequences"[Title/Abstract]' ,
215+ ' "repeat length"[Title/Abstract]' ,
216+ ' "repeat lengths"[Title/Abstract]' ,
217+ ' "expansion"[Title]' ,
218+ ' "expansions"[Title]' ,
219+ ' "repeats"[Title]'
220+ )
221+
222+ terms_gene <- joined_terms # e.g., "FMR1"[Title/Abstract] OR "FMR-1"[Title/Abstract] ...
223+ terms_language <- ' "English"[Language]'
224+ terms_disease <- c(
225+ " disease*[Title/Abstract]" ,
226+ " disorder*[Title/Abstract]" ,
227+ " syndrome*[Title/Abstract]" ,
228+ " patient*[Title/Abstract]" ,
229+ " proband*[Title/Abstract]"
230+ )
231+ terms_pubtype <- c(
232+ ' "journal article"[Publication Type]' ,
233+ ' "letter"[Publication Type]' ,
234+ ' "Case Reports"[Publication Type]'
235+ )
236+ terms_exclude <- ' "review"[Publication Type]'
237+
238+ query <- paste(
239+ " (" , paste(terms_repeat , collapse = " OR " ), " )" ,
240+ " AND (" , paste(terms_gene , collapse = " OR " ), " )" ,
241+ " AND" , terms_language ,
242+ " AND (" , paste(terms_disease , collapse = " OR " ), " )" ,
243+ " AND (" , paste(terms_pubtype , collapse = " OR " ), " )" ,
244+ " NOT" , terms_exclude
245+ )
246+
247+ # Clean up double spaces
248+ query <- gsub(" \\ s{2,}" , " " , query ) # Remove double spaces
189249 gene_name <- gsub(' "' , ' ' , gene_name )
190250 out_prefix <- paste0(base_directory , " /" , gene_name )
191251
192- # Include a separator ("/") between base_directory and gene_name
193- # Modify dest_file_prefix to include the full file path
252+ # print query for debugging
253+ # cat("PubMed query for gene", gene_name, ":\n", query, "\n")
254+
255+ # Use rentrez to search and fetch
194256 tryCatch({
195- epm_object <- epm_query(query )
196- cat(" Found" , epm_object @ meta $ exp_count , " articles for gene:" , gene_name , " \n " , file = stderr())
197- if (epm_object @ meta $ exp_count == 0 ) {
257+ srch0 <- entrez_search(db = " pubmed" , term = query , retmax = 0 , use_history = FALSE )
258+ count <- ifelse(is.null(srch0 $ count ), 0L , as.integer(srch0 $ count ))
259+ cat(" Found" , count , " articles for gene:" , gene_name , " \n " , file = stderr())
260+ if (count == 0L ) {
198261 cat(" Skipping fetch for:" , gene_name , " \n " , file = stderr())
199- next # Skip to the next gene if no articles found
262+ next
200263 }
201-
202- epm_fetch(epm_object ,
203- write_to_file = TRUE ,
204- outfile_path = NULL , # Uses current working directory if NULL
205- format = " medline" ,
206- encoding = " UTF-8" ,
207- outfile_prefix = out_prefix )
264+ retmax <- min(count , 10000L )
265+ srch <- entrez_search(db = " pubmed" , term = query , retmax = retmax )
266+ ok <- fetch_medline_and_write(srch $ ids , out_prefix )
267+ if (! ok ) stop(" Failed to write MEDLINE file" )
208268 }, error = function (e ) {
209- cat(" batch pubmed download error. \n " , file = stderr())
269+ cat(" batch pubmed download error: " , conditionMessage( e ), " \n " , file = stderr())
210270 quit(status = 1 )
211271 })
212272
213273 # output file name
214274 out_file <- paste0(out_prefix , " _batch_01.txt" )
215275 cat(out_file , " \n " , file = stderr())
216276
217- # Check if the file was created successfully XXX What if file existed before script ran?
277+ # Check if the file was created successfully
218278 cat(" Full file path:" , out_file , " \n " , file = stderr())
219279 if (file.exists(out_file )) {
220- # cat("File exists.\n", file=stderr())
221280 file_paths [[gene_name ]] <- out_file
222281 } else {
223282 cat(" Error: File not found -" , out_file , " \n " , file = stderr())
224- # quit(status = 1)
225283 }
226284 }
227285
@@ -257,21 +315,15 @@ for (gene_name in names(file_paths)) {
257315pub_info_list <- list ()
258316
259317extract_citation_info <- function (medline_data_list , gene_name ) {
260- # Combine the list of XML strings into a single string
318+ # Combine the MEDLINE records into a single string
261319 medline_string <- paste(medline_data_list , collapse = " " )
262320
263- # Use regular expressions to extract PMID, publication years, and titles
264- # Extract PMIDs
321+ # Extract PMIDs, dates, and titles
265322 pmids <- str_extract_all(medline_string , " (?<=PMID- )\\ d+" )[[1 ]]
266- # print(pmids)
267- # Extract Publication Dates
268323 publication_dates <- str_extract_all(medline_string , " (?<=DP - )\\ d+" )[[1 ]]
269- # print(publication_dates)
270- # Extract Titles
271324 title <- str_extract_all(medline_string , " (?<=TI - ).+?(?=\\ .|\\ ?)" )[[1 ]]
272- # print(title)
273325
274- # Ensure all vectors have the same length
326+ # Align vector lengths
275327 length_diff <- length(pmids ) - length(publication_dates )
276328 if (length_diff > 0 ) {
277329 publication_dates <- c(publication_dates , rep(NA , length_diff ))
@@ -286,45 +338,47 @@ extract_citation_info <- function(medline_data_list, gene_name) {
286338 pmids <- c(pmids , rep(NA , - length_diff ))
287339 }
288340
289- # Create a dataframe with gene_name, PMID, PublicationYear, and Title
290- pub_info_df <- data.frame (gene = rep( gene_name , length( pmids )),
291- PMID = pmids ,
292- PublicationDate = publication_dates ,
293- Title = title ,
294- stringsAsFactors = FALSE )
295-
296- return ( pub_info_df )
341+ # Data frame output
342+ data.frame (
343+ gene = rep( gene_name , length( pmids )) ,
344+ PMID = pmids ,
345+ PublicationDate = publication_dates ,
346+ Title = title ,
347+ stringsAsFactors = FALSE
348+ )
297349}
298350
# Parse each gene's MEDLINE records into a citation data frame; genes whose
# record list is empty get no entry in pub_info_list
for (gene_name in names(all_publications)) {
  medline_data_list <- all_publications[[gene_name]]
  if (length(medline_data_list) > 0) {
    pub_info_df <- extract_citation_info(medline_data_list, gene_name)
    pub_info_list[[gene_name]] <- pub_info_df
  }
}
309357
# Stack the per-gene data frames into one table. When nothing was parsed,
# fall back to a zero-row frame that still carries the expected columns.
all_pub_info_df <- if (length(pub_info_list) > 0) {
  do.call(rbind, pub_info_list)
} else {
  data.frame(
    gene = character(),
    PMID = character(),
    PublicationDate = character(),
    Title = character(),
    stringsAsFactors = FALSE
  )
}
312370
# Add the PubMed search results to each entry's additional_literature field:
# the gene's unique PMIDs, each prefixed and joined into one string. Genes
# with no usable PMID get the empty marker instead.
# NOTE(review): string literals are kept exactly as found, including their
# leading spaces — confirm they match the upstream file.
data <- data %>%
  mutate(additional_literature = map_chr(gene, function(g) {
    # Select first, paste last: paste0(..., collapse = ) always returns one
    # string, so emptiness has to be tested on the PMID vector itself.
    # which() skips NA comparisons, as filter(gene == g) did.
    hits <- which(all_pub_info_df$gene == g)
    gene_pmids <- unique(all_pub_info_df$PMID[hits])
    # extract_citation_info pads short PMID vectors with NA; drop those
    gene_pmids <- gene_pmids[!is.na(gene_pmids)]
    if (length(gene_pmids) == 0) {
      return(" ")
    }
    paste0(" @pmid:", gene_pmids, collapse = " ,")
  }))
329383
330384# remove any redundant pmids from additional_literature that are in references
@@ -360,56 +414,71 @@ write_json(lit_data, args[4])
# New-locus query, built from pertinent terms found while reviewing discovery
# papers. Searches PubMed, fetches the hits as MEDLINE text through
# fetch_medline_and_write(), and reports the output file on stderr.
# Returns the output file path when that file exists, an empty list when it
# does not, and NULL when the search matches no articles. An error during
# search or fetch is reported on stderr and quits the session with status 1.
# NOTE(review): string literals are kept exactly as found, including their
# leading spaces — confirm they match the upstream file.
perform_new_pubmed_query <- function() {
  or_join <- function(terms) paste(terms, collapse = " OR ")

  repeat_terms <- c(
    ' "repeat expansion"[Title/Abstract]',
    ' "tandem repeat"[Title/Abstract]'
  )
  discovery_terms <- c(
    ' "discovered"[Title/Abstract]',
    ' "identified"[Title/Abstract]',
    ' "causative"[Title/Abstract]',
    ' "underlie"[Title/Abstract]',
    ' "basis"[Title/Abstract]'
  )
  disease_terms <- c(
    ' "disease"[Title/Abstract]',
    ' "disorder"[Title/Abstract]',
    ' "syndrome"[Title/Abstract]',
    ' "condition*"[Title/Abstract]'
  )
  pubtype_terms <- c(
    ' "journal article"[Publication Type]',
    ' "letter"[Publication Type]',
    ' "Case Reports"[Publication Type]'
  )
  language_term <- ' "English"[Language]'
  excluded_term <- ' "review"[Publication Type]'

  # Each OR-group is AND-ed together; reviews are excluded at the end
  query_parts <- c(
    " (", or_join(repeat_terms), " )",
    " AND (", or_join(discovery_terms), " )",
    " AND", language_term,
    " AND (", or_join(disease_terms), " )",
    " AND (", or_join(pubtype_terms), " )",
    " NOT", excluded_term
  )
  query <- paste(query_parts, collapse = " ")

  # Whitespace clean-up pass over the assembled query
  query <- gsub(" \\ s{2,}", " ", query)
  out_prefix <- paste0(base_directory, " /new_loci")

  tryCatch({
    # First search asks only for the hit count (retmax = 0)
    probe <- entrez_search(
      db = " pubmed", term = query, retmax = 0, use_history = FALSE
    )
    n_hits <- if (is.null(probe$count)) 0L else as.integer(probe$count)
    if (n_hits == 0L) {
      cat(" Skipping fetch for new loci - no articles found.\n ", file = stderr())
      return(NULL)
    }
    # Second search retrieves the IDs, capped at 10000
    hits <- entrez_search(
      db = " pubmed", term = query, retmax = min(n_hits, 10000L)
    )
    if (!fetch_medline_and_write(hits$ids, out_prefix)) {
      stop(" Failed to write MEDLINE file")
    }
  }, error = function(e) {
    cat(" batch pubmed download error: ", conditionMessage(e), " \n ", file = stderr())
    quit(status = 1)
  })

  out_file <- paste0(out_prefix, " _batch_01.txt")
  cat(" Full file path:", out_file, " \n ", file = stderr())
  if (!file.exists(out_file)) {
    cat(" Error: File not found -", out_file, " \n ", file = stderr())
    return(list())
  }
  out_file
}
410-
411-
412- perform_new_pubmed_query()
413482#
414483# ### Let's get all the citations to run manubot on
415484extract_citations <- function (column ) {
0 commit comments