@@ -27,7 +27,11 @@ class PubMedRetrieverError(Exception):
2727
2828
2929class PubMedRetriever :
30- """Retrieves paper metadata and full text from PubMed using NCBI E-Utilities API."""
30+ """Fetches papers from PubMed/PMC via NCBI E-utilities.
31+
32+ Handles rate limiting (NCBI allows at most 3 requests per second without an API key, i.e. roughly 0.34s between requests) and retries.
33+ Caches results to avoid hitting API limits.
34+ """
3135
3236 BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
3337 DEFAULT_TIMEOUT = 10
@@ -36,22 +40,27 @@ class PubMedRetriever:
3640 def __init__ (
3741 self , api_key : Optional [str ] = None , email : str = "bioanalyzer@example.com"
3842 ):
39- """Initialize PubMed retriever with API key and email."""
43+ """Create retriever instance.
44+
45+ Args:
46+ api_key: NCBI API key (optional but recommended for higher rate limits).
47+ email: Contact email for NCBI requests (required by their ToS).
48+ """
4049 self .api_key = api_key
4150 self .email = email
4251 self .session = self ._create_session ()
4352 self ._verify_connectivity ()
4453
4554 def _create_session (self ) -> requests .Session :
46- """Create a configured requests session."""
55+ """Set up HTTP session with proper User-Agent header."""
4756 session = requests .Session ()
4857 session .headers .update (
4958 {"User-Agent" : f"BioAnalyzer/1.0 (contact: { self .email } )" }
5059 )
5160 return session
5261
5362 def _verify_connectivity (self , retries : int = 3 ) -> None :
54- """Test NCBI E-utilities reachability on startup with retries ."""
63+ """Verify NCBI E-utilities connectivity on startup, making up to `retries` attempts."""
5564 test_url = f"{ self .BASE_URL } /esearch.fcgi"
5665 params = {"db" : "pubmed" , "term" : "cancer" , "retmax" : 1 }
5766 for attempt in range (retries ):
@@ -235,12 +244,19 @@ def fetch_paper_metadata(self, pmid: str) -> Dict[str, Any]:
235244 fields ["authors" ] = [a .text for a in item .findall ("Item" ) if a .text ]
236245 fields .setdefault ("abstract" , "" )
237246 return fields
238- except Exception as e :
247+ except ( ElementTree . ParseError , AttributeError , ValueError ) as e :
239248 safe_error = mask_exception_message (e )
240249 logger .error (
241250 f"Error parsing fallback esummary for PMID { pmid } : { safe_error } "
242251 )
243252 return {"error" : "Fallback retrieval failed." }
253+ except Exception as e :
254+ # Catch-all for unexpected parsing errors
255+ safe_error = mask_exception_message (e )
256+ logger .error (
257+ f"Unexpected error parsing fallback esummary for PMID { pmid } : { safe_error } "
258+ )
259+ return {"error" : "Fallback retrieval failed." }
244260
245261 def search (self , query : str , max_results : int = 10 ) -> List [str ]:
246262 xml_data = self ._make_request (
@@ -271,9 +287,17 @@ def get_pmc_fulltext(self, pmid: str) -> str:
271287
272288 return self ._get_pmc_fulltext_by_id (pmc_id )
273289
290+ except (requests .exceptions .RequestException , PubMedRetrieverError ) as e :
291+ safe_error = mask_exception_message (e )
292+ logger .warning (f"Network error retrieving full text for PMID { pmid } : { safe_error } " )
293+ return ""
294+ except (ElementTree .ParseError , ValueError ) as e :
295+ safe_error = mask_exception_message (e )
296+ logger .warning (f"Parse error retrieving full text for PMID { pmid } : { safe_error } " )
297+ return ""
274298 except Exception as e :
275299 safe_error = mask_exception_message (e )
276- logger .warning (f"Error retrieving full text for PMID { pmid } : { safe_error } " )
300+ logger .warning (f"Unexpected error retrieving full text for PMID { pmid } : { safe_error } " )
277301 return ""
278302
279303 def _get_pmc_id_from_pmid (self , pmid : str ) -> Optional [str ]:
@@ -305,9 +329,17 @@ def _get_pmc_id_from_pmid(self, pmid: str) -> Optional[str]:
305329 return pmc_id
306330 return None
307331
332+ except (requests .exceptions .RequestException , PubMedRetrieverError ) as e :
333+ safe_error = mask_exception_message (e )
334+ logger .warning (f"Network error getting PMC ID for PMID { pmid } : { safe_error } " )
335+ return None
336+ except (ElementTree .ParseError , AttributeError ) as e :
337+ safe_error = mask_exception_message (e )
338+ logger .warning (f"Parse error getting PMC ID for PMID { pmid } : { safe_error } " )
339+ return None
308340 except Exception as e :
309341 safe_error = mask_exception_message (e )
310- logger .warning (f"Error getting PMC ID for PMID { pmid } : { safe_error } " )
342+ logger .warning (f"Unexpected error getting PMC ID for PMID { pmid } : { safe_error } " )
311343 return None
312344
313345 def _get_pmc_fulltext_by_id (self , pmc_id : str ) -> str :
@@ -351,20 +383,25 @@ def _get_pmc_fulltext_by_id(self, pmc_id: str) -> str:
351383
352384 return "\n \n " .join (full_text_parts )
353385
386+ except (requests .exceptions .RequestException , PubMedRetrieverError ) as e :
387+ safe_error = mask_exception_message (e )
388+ logger .warning (f"Network error retrieving PMC full text for { pmc_id } : { safe_error } " )
389+ return ""
390+ except (ElementTree .ParseError , AttributeError , ValueError ) as e :
391+ safe_error = mask_exception_message (e )
392+ logger .warning (f"Parse error retrieving PMC full text for { pmc_id } : { safe_error } " )
393+ return ""
354394 except Exception as e :
355395 safe_error = mask_exception_message (e )
356- logger .warning (f"Error retrieving PMC full text for { pmc_id } : { safe_error } " )
396+ logger .warning (f"Unexpected error retrieving PMC full text for { pmc_id } : { safe_error } " )
357397 return ""
358398
359399 async def get_pmc_fulltext_async (self , pmid : str ) -> str :
360400 """Async wrapper for PMC full text retrieval."""
361401 return await asyncio .to_thread (self .get_pmc_fulltext , pmid )
362402
363403 def get_full_paper_data (self , pmid : str ) -> Dict [str , Any ]:
364- """
365- Retrieve complete paper data including metadata and full text.
366- This is the main method for comprehensive paper retrieval.
367- """
404+ """Retrieve complete paper data including metadata and full text."""
368405 try :
369406 logger .info (f"Retrieving full paper data for PMID: { pmid } " )
370407
@@ -392,14 +429,14 @@ def get_full_paper_data(self, pmid: str) -> Dict[str, Any]:
392429 logger .info (f"Successfully retrieved paper data for PMID: { pmid } " )
393430 return paper_data
394431
395- except Exception as e :
432+ except ( requests . exceptions . RequestException , PubMedRetrieverError ) as e :
396433 safe_error = mask_exception_message (e )
397434 logger .error (
398- f"Error retrieving full paper data for PMID { pmid } : { safe_error } "
435+ f"Network error retrieving full paper data for PMID { pmid } : { safe_error } "
399436 )
400437 return {
401438 "pmid" : pmid ,
402- "error" : f "Failed to retrieve paper data: { str ( e ) } " ,
439+ "error" : "Failed to retrieve paper data due to network error " ,
403440 "title" : "" ,
404441 "abstract" : "" ,
405442 "journal" : "" ,
@@ -420,19 +457,33 @@ async def fetch_metadata():
420457 return await asyncio .wait_for (
421458 self .get_paper_metadata_async (pmid ), timeout = 6
422459 )
460+ except asyncio .TimeoutError :
461+ logger .error (f"Timeout fetching metadata for PMID { pmid } " )
462+ return {}
463+ except (requests .exceptions .RequestException , PubMedRetrieverError ) as e :
464+ safe_error = mask_exception_message (e )
465+ logger .error (f"Network error fetching metadata for PMID { pmid } : { safe_error } " )
466+ return {}
423467 except Exception as e :
424468 safe_error = mask_exception_message (e )
425- logger .error (f"Metadata fetch error for PMID { pmid } : { safe_error } " )
469+ logger .error (f"Unexpected error fetching metadata for PMID { pmid } : { safe_error } " )
426470 return {}
427471
428472 async def fetch_fulltext ():
429473 try :
430474 return await asyncio .wait_for (
431475 self .get_pmc_fulltext_async (pmid ), timeout = 8
432476 )
477+ except asyncio .TimeoutError :
478+ logger .warning (f"Timeout fetching full text for PMID { pmid } " )
479+ return ""
480+ except (requests .exceptions .RequestException , PubMedRetrieverError ) as e :
481+ safe_error = mask_exception_message (e )
482+ logger .warning (f"Network error fetching full text for PMID { pmid } : { safe_error } " )
483+ return ""
433484 except Exception as e :
434485 safe_error = mask_exception_message (e )
435- logger .warning (f"Full text fetch error for PMID { pmid } : { safe_error } " )
486+ logger .warning (f"Unexpected error fetching full text for PMID { pmid } : { safe_error } " )
436487 return ""
437488
438489 if USE_FULLTEXT :
0 commit comments