Skip to content

Commit ea46e42

Browse files
authored
Merge pull request #64 from waldronlab/refactor/improve-maintainability
Refactor: improve code maintainability and error handling
2 parents 66d6ca6 + ea51461 commit ea46e42

File tree

9 files changed

+440
-1396
lines changed

9 files changed

+440
-1396
lines changed

README.md

Lines changed: 126 additions & 779 deletions
Large diffs are not rendered by default.

app/api/app.py

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,12 @@
11
"""FastAPI application for BioAnalyzer backend API."""
22

3+
import logging
4+
import os
5+
import traceback
36
from fastapi import FastAPI, Request
47
from fastapi.middleware.cors import CORSMiddleware
58
from fastapi.responses import JSONResponse
69
from fastapi.exceptions import RequestValidationError
7-
import logging
8-
import os
9-
import sys
10-
import traceback
11-
12-
sys.path.append(
13-
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
14-
)
1510

1611
from app.api.routers import (
1712
bugsigdb_analysis,
@@ -119,19 +114,20 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
119114

120115
@app.exception_handler(Exception)
121116
async def global_exception_handler(request: Request, exc: Exception):
122-
"""Handle unexpected exceptions."""
117+
"""Handle unexpected exceptions with credential masking."""
123118
from app.utils.credential_masking import mask_exception_message, mask_string
124119

125-
# Mask any credentials in exception message and traceback
126120
safe_exc_msg = mask_exception_message(exc)
127121
safe_traceback = mask_string(traceback.format_exc())
122+
request_id = getattr(request.state, "request_id", None)
128123

129124
logger.error(
130125
f"Unhandled exception: {safe_exc_msg}\n"
131126
f"Traceback: {safe_traceback}\n"
132-
f"Request ID: {getattr(request.state, 'request_id', None)}"
127+
f"Request ID: {request_id}"
133128
)
134129

130+
# Hide internal details in production
135131
if ENVIRONMENT == "production":
136132
detail = "An internal error occurred. Please try again later."
137133
else:
@@ -142,7 +138,7 @@ async def global_exception_handler(request: Request, exc: Exception):
142138
content={
143139
"error": "Internal Server Error",
144140
"detail": detail,
145-
"request_id": getattr(request.state, "request_id", None),
141+
"request_id": request_id,
146142
},
147143
)
148144

app/models/unified_qa.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@
2828

2929

3030
class UnifiedQA:
31-
"""Unified QA system supporting multiple LLM providers."""
31+
"""Wrapper around LLM providers for question answering.
32+
33+
Tries LiteLLM first (supports multiple providers), falls back to Paper-QA,
34+
then GeminiQA if those fail. Auto-detects provider from available API keys.
35+
"""
3236

3337
def __init__(
3438
self,
@@ -38,7 +42,15 @@ def __init__(
3842
gemini_api_key: Optional[str] = None,
3943
use_paperqa: bool = True,
4044
):
41-
"""Initialize QA system with specified provider and model."""
45+
"""Set up QA system with the specified provider.
46+
47+
Args:
48+
provider: LLM provider (gemini, openai, anthropic, ollama). Auto-detects if None.
49+
model: Specific model name. Uses provider default if None.
50+
use_gemini: Deprecated. Use provider='gemini' instead.
51+
gemini_api_key: Override GEMINI_API_KEY env var.
52+
use_paperqa: Try Paper-QA as fallback (default: True).
53+
"""
4254
if use_gemini is not None:
4355
logger.warning(
4456
"use_gemini parameter is deprecated. Use provider='gemini' instead."

app/services/advanced_rag.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
"""Advanced RAG service with contextual summarization and re-ranking."""
1+
"""RAG pipeline for better field extraction accuracy.
2+
3+
Re-ranks text chunks by relevance to each field query, then generates
4+
query-aware summaries. More accurate than simple analysis but slower and
5+
more expensive (multiple LLM calls per field).
6+
"""
27

38
import logging
49
from typing import List, Dict, Optional, Tuple, Any
@@ -26,7 +31,11 @@
2631

2732

2833
class AdvancedRAGService:
29-
"""Advanced RAG service combining contextual summarization and chunk re-ranking."""
34+
"""RAG pipeline: re-rank chunks, then summarize before querying LLM.
35+
36+
This is what makes v2 API more accurate than v1. The tradeoff is speed
37+
and cost - expect 2-3x more LLM API calls per field.
38+
"""
3039

3140
def __init__(
3241
self,
@@ -38,7 +47,17 @@ def __init__(
3847
max_sources: Optional[int] = None,
3948
use_10_scale: bool = True,
4049
):
41-
"""Initialize advanced RAG service."""
50+
"""Set up RAG service with configuration.
51+
52+
Args:
53+
summary_provider: LLM provider for summarization (default: auto-detect).
54+
summary_model: Model for summarization (default: provider default).
55+
rerank_method: keyword|llm|hybrid (default: hybrid).
56+
cache_dir: Where to cache summaries (default: cache/).
57+
evidence_k: Initial chunks to retrieve before re-ranking.
58+
max_sources: Max chunks to use after re-ranking.
59+
use_10_scale: Use 0-10 relevance scale instead of 0-1.
60+
"""
4261
summary_config = SummarizationConfig(
4362
summary_length=RAG_SUMMARY_LENGTH,
4463
quality=RAG_SUMMARY_QUALITY,

app/services/bugsigdb_analyzer.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
"""Analysis service for extracting BugSigDB fields from papers."""
1+
"""Extracts six BugSigDB fields from papers using LLMs.
2+
3+
This is the main analysis service. It orchestrates paper retrieval, text preparation,
4+
and field extraction. Results are cached to avoid redundant API calls.
5+
"""
26

37
import logging
48
from typing import Dict, Optional, List
@@ -76,7 +80,14 @@ def get_cache_manager():
7680

7781

7882
async def analyze_paper_simple(pmid: str) -> Optional[Dict]:
79-
"""Analyze paper and extract BugSigDB fields."""
83+
"""Extract BugSigDB fields from a paper.
84+
85+
Uses v1 API flow: direct LLM queries per field. Fast but less accurate than RAG.
86+
Checks cache first, then fetches from PubMed if needed.
87+
88+
Returns:
89+
Dict with field results, or None if paper can't be retrieved.
90+
"""
8091
try:
8192
cache_manager = get_cache_manager()
8293
pubmed_retriever = get_pubmed_retriever()

app/services/data_retrieval.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,11 @@ class PubMedRetrieverError(Exception):
2727

2828

2929
class PubMedRetriever:
30-
"""Retrieves paper metadata and full text from PubMed using NCBI E-Utilities API."""
30+
"""Fetches papers from PubMed/PMC via NCBI E-utilities.
31+
32+
Handles rate limiting (NCBI requires 0.34s between requests) and retries.
33+
Caches results to avoid hitting API limits.
34+
"""
3135

3236
BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
3337
DEFAULT_TIMEOUT = 10
@@ -36,22 +40,27 @@ class PubMedRetriever:
3640
def __init__(
3741
self, api_key: Optional[str] = None, email: str = "bioanalyzer@example.com"
3842
):
39-
"""Initialize PubMed retriever with API key and email."""
43+
"""Create retriever instance.
44+
45+
Args:
46+
api_key: NCBI API key (optional but recommended for higher rate limits).
47+
email: Contact email for NCBI requests (required by their ToS).
48+
"""
4049
self.api_key = api_key
4150
self.email = email
4251
self.session = self._create_session()
4352
self._verify_connectivity()
4453

4554
def _create_session(self) -> requests.Session:
46-
"""Create a configured requests session."""
55+
"""Set up HTTP session with proper User-Agent header."""
4756
session = requests.Session()
4857
session.headers.update(
4958
{"User-Agent": f"BioAnalyzer/1.0 (contact: {self.email})"}
5059
)
5160
return session
5261

5362
def _verify_connectivity(self, retries: int = 3) -> None:
54-
"""Test NCBI E-utilities reachability on startup with retries."""
63+
"""Verify NCBI E-utilities connectivity on startup."""
5564
test_url = f"{self.BASE_URL}/esearch.fcgi"
5665
params = {"db": "pubmed", "term": "cancer", "retmax": 1}
5766
for attempt in range(retries):
@@ -235,12 +244,19 @@ def fetch_paper_metadata(self, pmid: str) -> Dict[str, Any]:
235244
fields["authors"] = [a.text for a in item.findall("Item") if a.text]
236245
fields.setdefault("abstract", "")
237246
return fields
238-
except Exception as e:
247+
except (ElementTree.ParseError, AttributeError, ValueError) as e:
239248
safe_error = mask_exception_message(e)
240249
logger.error(
241250
f"Error parsing fallback esummary for PMID {pmid}: {safe_error}"
242251
)
243252
return {"error": "Fallback retrieval failed."}
253+
except Exception as e:
254+
# Catch-all for unexpected parsing errors
255+
safe_error = mask_exception_message(e)
256+
logger.error(
257+
f"Unexpected error parsing fallback esummary for PMID {pmid}: {safe_error}"
258+
)
259+
return {"error": "Fallback retrieval failed."}
244260

245261
def search(self, query: str, max_results: int = 10) -> List[str]:
246262
xml_data = self._make_request(
@@ -271,9 +287,17 @@ def get_pmc_fulltext(self, pmid: str) -> str:
271287

272288
return self._get_pmc_fulltext_by_id(pmc_id)
273289

290+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
291+
safe_error = mask_exception_message(e)
292+
logger.warning(f"Network error retrieving full text for PMID {pmid}: {safe_error}")
293+
return ""
294+
except (ElementTree.ParseError, ValueError) as e:
295+
safe_error = mask_exception_message(e)
296+
logger.warning(f"Parse error retrieving full text for PMID {pmid}: {safe_error}")
297+
return ""
274298
except Exception as e:
275299
safe_error = mask_exception_message(e)
276-
logger.warning(f"Error retrieving full text for PMID {pmid}: {safe_error}")
300+
logger.warning(f"Unexpected error retrieving full text for PMID {pmid}: {safe_error}")
277301
return ""
278302

279303
def _get_pmc_id_from_pmid(self, pmid: str) -> Optional[str]:
@@ -305,9 +329,17 @@ def _get_pmc_id_from_pmid(self, pmid: str) -> Optional[str]:
305329
return pmc_id
306330
return None
307331

332+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
333+
safe_error = mask_exception_message(e)
334+
logger.warning(f"Network error getting PMC ID for PMID {pmid}: {safe_error}")
335+
return None
336+
except (ElementTree.ParseError, AttributeError) as e:
337+
safe_error = mask_exception_message(e)
338+
logger.warning(f"Parse error getting PMC ID for PMID {pmid}: {safe_error}")
339+
return None
308340
except Exception as e:
309341
safe_error = mask_exception_message(e)
310-
logger.warning(f"Error getting PMC ID for PMID {pmid}: {safe_error}")
342+
logger.warning(f"Unexpected error getting PMC ID for PMID {pmid}: {safe_error}")
311343
return None
312344

313345
def _get_pmc_fulltext_by_id(self, pmc_id: str) -> str:
@@ -351,20 +383,25 @@ def _get_pmc_fulltext_by_id(self, pmc_id: str) -> str:
351383

352384
return "\n\n".join(full_text_parts)
353385

386+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
387+
safe_error = mask_exception_message(e)
388+
logger.warning(f"Network error retrieving PMC full text for {pmc_id}: {safe_error}")
389+
return ""
390+
except (ElementTree.ParseError, AttributeError, ValueError) as e:
391+
safe_error = mask_exception_message(e)
392+
logger.warning(f"Parse error retrieving PMC full text for {pmc_id}: {safe_error}")
393+
return ""
354394
except Exception as e:
355395
safe_error = mask_exception_message(e)
356-
logger.warning(f"Error retrieving PMC full text for {pmc_id}: {safe_error}")
396+
logger.warning(f"Unexpected error retrieving PMC full text for {pmc_id}: {safe_error}")
357397
return ""
358398

359399
async def get_pmc_fulltext_async(self, pmid: str) -> str:
360400
"""Async wrapper for PMC full text retrieval."""
361401
return await asyncio.to_thread(self.get_pmc_fulltext, pmid)
362402

363403
def get_full_paper_data(self, pmid: str) -> Dict[str, Any]:
364-
"""
365-
Retrieve complete paper data including metadata and full text.
366-
This is the main method for comprehensive paper retrieval.
367-
"""
404+
"""Retrieve complete paper data including metadata and full text."""
368405
try:
369406
logger.info(f"Retrieving full paper data for PMID: {pmid}")
370407

@@ -392,14 +429,14 @@ def get_full_paper_data(self, pmid: str) -> Dict[str, Any]:
392429
logger.info(f"Successfully retrieved paper data for PMID: {pmid}")
393430
return paper_data
394431

395-
except Exception as e:
432+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
396433
safe_error = mask_exception_message(e)
397434
logger.error(
398-
f"Error retrieving full paper data for PMID {pmid}: {safe_error}"
435+
f"Network error retrieving full paper data for PMID {pmid}: {safe_error}"
399436
)
400437
return {
401438
"pmid": pmid,
402-
"error": f"Failed to retrieve paper data: {str(e)}",
439+
"error": "Failed to retrieve paper data due to network error",
403440
"title": "",
404441
"abstract": "",
405442
"journal": "",
@@ -420,19 +457,33 @@ async def fetch_metadata():
420457
return await asyncio.wait_for(
421458
self.get_paper_metadata_async(pmid), timeout=6
422459
)
460+
except asyncio.TimeoutError:
461+
logger.error(f"Timeout fetching metadata for PMID {pmid}")
462+
return {}
463+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
464+
safe_error = mask_exception_message(e)
465+
logger.error(f"Network error fetching metadata for PMID {pmid}: {safe_error}")
466+
return {}
423467
except Exception as e:
424468
safe_error = mask_exception_message(e)
425-
logger.error(f"Metadata fetch error for PMID {pmid}: {safe_error}")
469+
logger.error(f"Unexpected error fetching metadata for PMID {pmid}: {safe_error}")
426470
return {}
427471

428472
async def fetch_fulltext():
429473
try:
430474
return await asyncio.wait_for(
431475
self.get_pmc_fulltext_async(pmid), timeout=8
432476
)
477+
except asyncio.TimeoutError:
478+
logger.warning(f"Timeout fetching full text for PMID {pmid}")
479+
return ""
480+
except (requests.exceptions.RequestException, PubMedRetrieverError) as e:
481+
safe_error = mask_exception_message(e)
482+
logger.warning(f"Network error fetching full text for PMID {pmid}: {safe_error}")
483+
return ""
433484
except Exception as e:
434485
safe_error = mask_exception_message(e)
435-
logger.warning(f"Full text fetch error for PMID {pmid}: {safe_error}")
486+
logger.warning(f"Unexpected error fetching full text for PMID {pmid}: {safe_error}")
436487
return ""
437488

438489
if USE_FULLTEXT:

app/utils/config.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,15 @@ def validate_gemini_key():
6464
try:
6565
genai.configure(api_key=GEMINI_API_KEY)
6666
return True
67-
except Exception as e:
68-
# Mask any potential credentials in error message
67+
except (ValueError, AttributeError) as e:
6968
safe_error = mask_exception_message(e)
7069
print(f"Gemini API key validation failed: {safe_error}")
7170
return False
71+
except Exception as e:
72+
# Catch-all for unexpected errors during configuration
73+
safe_error = mask_exception_message(e)
74+
print(f"Unexpected error validating Gemini key: {safe_error}")
75+
return False
7276

7377

7478
def validate_env_vars():
@@ -82,20 +86,20 @@ def validate_env_vars():
8286
if not GEMINI_API_KEY:
8387
missing_vars.append("GEMINI_API_KEY")
8488

85-
# Check if at least one AI model is available
89+
# At least one LLM provider must be available
8690
if not AVAILABLE_MODELS:
8791
missing_vars.append("GEMINI_API_KEY")
8892

8993
if missing_vars:
9094
print(
91-
f"Warning: The following environment variables are missing: {', '.join(missing_vars)}"
95+
f"Warning: Missing environment variables: {', '.join(missing_vars)}"
9296
)
93-
print("Please set them in your .env file or environment.")
97+
print("Set them in your .env file or environment.")
9498

9599
return len(missing_vars) == 0
96100

97101

98-
# Call validation when module is imported
102+
# Validate on import
99103
validate_env_vars()
100104

101105

@@ -209,10 +213,9 @@ def check_required_vars():
209213

210214

211215
def setup_logging():
212-
"""Setup comprehensive logging configuration with file rotation.
216+
"""Configure logging with file rotation.
213217
214-
Handles permission errors gracefully, falling back to console-only logging
215-
if file handlers cannot be created (e.g., during testing).
218+
Falls back to console-only logging if file handlers can't be created.
216219
"""
217220
import logging.handlers
218221

0 commit comments

Comments
 (0)