diff --git a/docs/how-to/add-reference-source.md b/docs/how-to/add-reference-source.md index 404d6fc..ae1e82b 100644 --- a/docs/how-to/add-reference-source.md +++ b/docs/how-to/add-reference-source.md @@ -10,6 +10,33 @@ Each reference source is a Python class that: 2. Implements `prefix()` and `fetch()` methods 3. Registers itself with the `ReferenceSourceRegistry` +## Entrez Summary Sources (Recommended for NCBI IDs) + +If your source is backed by NCBI Entrez, prefer the built-in `EntrezSummarySource` +base class. It provides shared rate limiting, email configuration, and summary parsing. + +```python +# src/linkml_reference_validator/etl/sources/my_entrez.py +"""Entrez summary source example.""" + +from linkml_reference_validator.etl.sources.entrez import EntrezSummarySource +from linkml_reference_validator.etl.sources.base import ReferenceSourceRegistry + + +@ReferenceSourceRegistry.register +class ExampleEntrezSource(EntrezSummarySource): + """Fetch summaries from an Entrez database.""" + + PREFIX = "EXAMPLE" + ENTREZ_DB = "example_db" + TITLE_FIELDS = ("title", "name") + CONTENT_FIELDS = ("summary", "description") + ID_PATTERNS = (r"^EX\d+$",) +``` + +`TITLE_FIELDS` and `CONTENT_FIELDS` are checked in order, and the first non-empty value +is used for the `ReferenceContent`. + ## Step 1: Create the Source Class Create a new file in `src/linkml_reference_validator/etl/sources/`: diff --git a/docs/how-to/repair-validation-errors.md b/docs/how-to/repair-validation-errors.md index 5bbfa17..26eb9ff 100644 --- a/docs/how-to/repair-validation-errors.md +++ b/docs/how-to/repair-validation-errors.md @@ -121,9 +121,15 @@ RECOMMENDED REMOVALS: ## Configuration File -Create `.linkml-reference-validator.yaml` for project-specific settings: +Create `.linkml-reference-validator.yaml` for project-specific settings. 
You can +include both validation and repair settings: ```yaml +validation: + reference_prefix_map: + geo: GEO + NCBIGeo: GEO + repair: # Confidence thresholds auto_fix_threshold: 0.95 diff --git a/docs/how-to/validate-entrez.md b/docs/how-to/validate-entrez.md new file mode 100644 index 0000000..20099f3 --- /dev/null +++ b/docs/how-to/validate-entrez.md @@ -0,0 +1,116 @@ +# Validating Entrez Accessions + +This guide shows how to validate supporting text against NCBI Entrez records for GEO, BioProject, and BioSample. + +## Overview + +These sources use the NCBI Entrez E-utilities `esummary` endpoint: + +- **GEO** (GSE/GDS): summaries from the `gds` database +- **BioProject** (PRJNA/PRJEB/PRJDB): summaries from the `bioproject` database +- **BioSample** (SAMN/SAME/SAMD): summaries from the `biosample` database + +The validator uses the returned summary/description fields as the content for matching. + +## Basic Usage + +### GEO (GSE or GDS) + +```bash +linkml-reference-validator validate text \ + "RNA-seq analysis of cardiac tissue" \ + GEO:GSE12345 +``` + +### BioProject + +```bash +linkml-reference-validator validate text \ + "Whole genome sequencing project for strain X" \ + BioProject:PRJNA12345 +``` + +### BioSample + +```bash +linkml-reference-validator validate text \ + "Human liver biopsy sample description" \ + BioSample:SAMN12345678 +``` + +## Accepted Identifier Formats + +You can use either prefixed or bare accessions: + +``` +GEO:GSE12345 +GDS12345 +BioProject:PRJNA12345 +PRJEB12345 +BioSample:SAMN12345678 +SAME1234567 +``` + +## Prefix Aliases and Normalization + +Prefixes are case-insensitive and can be normalized with a configuration map. This +is useful when data uses alternate prefix styles such as `geo:` or `NCBIGeo:`. 
+ +Create `.linkml-reference-validator.yaml` with a `validation` section: + +```yaml +validation: + reference_prefix_map: + geo: GEO + NCBIGeo: GEO + NCBIBioProject: BIOPROJECT + NCBIBioSample: BIOSAMPLE +``` + +You can also configure this programmatically: + +```python +from linkml_reference_validator.models import ReferenceValidationConfig + +config = ReferenceValidationConfig( + reference_prefix_map={"geo": "GEO", "NCBIGeo": "GEO"} +) +``` + +Pass the config file to CLI commands with `--config .linkml-reference-validator.yaml`. + +## Pre-caching Entrez Records + +For offline validation or to speed up repeated validations: + +```bash +linkml-reference-validator cache reference GEO:GSE12345 +linkml-reference-validator cache reference BioProject:PRJNA12345 +linkml-reference-validator cache reference BioSample:SAMN12345678 +``` + +Cached references are stored in `references_cache/` as markdown files with YAML frontmatter. + +## Rate Limiting and Email + +NCBI requires a valid contact email for Entrez API usage. Configure it in your settings: + +```python +from linkml_reference_validator.models import ReferenceValidationConfig + +config = ReferenceValidationConfig( + email="you@example.org", + rate_limit_delay=0.5, +) +``` + +## Content Availability + +Entrez summaries vary by record. If a summary field is missing, the validator will return +`content_type: unavailable` and matching may fail. 
+ +## See Also + +- [Adding a New Reference Source](add-reference-source.md) +- [Quickstart](../quickstart.md) +- [CLI Reference](../reference/cli.md) diff --git a/docs/reference/cli.md b/docs/reference/cli.md index 0331780..91e0c24 100644 --- a/docs/reference/cli.md +++ b/docs/reference/cli.md @@ -51,6 +51,7 @@ linkml-reference-validator validate text [OPTIONS] TEXT REFERENCE_ID ### Options - `--cache-dir PATH` - Directory for caching references (default: `references_cache`) +- `--config PATH` - Path to validation configuration file (.yaml) - `--verbose, -v` - Verbose output with detailed logging - `--help` - Show help message @@ -138,6 +139,7 @@ linkml-reference-validator validate data [OPTIONS] DATA_FILE - `--schema PATH, -s PATH` (required) - Path to LinkML schema file - `--target-class TEXT, -t TEXT` - Target class to validate (optional) - `--cache-dir PATH, -c PATH` - Directory for caching references (default: `references_cache`) +- `--config PATH` - Path to validation configuration file (.yaml) - `--verbose, -v` - Verbose output with detailed logging - `--help` - Show help message @@ -240,6 +242,7 @@ linkml-reference-validator repair text [OPTIONS] TEXT REFERENCE_ID ### Options - `--cache-dir PATH, -c PATH` - Directory for caching references +- `--config PATH` - Path to configuration file (.yaml) - `--verbose, -v` - Verbose output with detailed logging - `--auto-fix-threshold FLOAT, -a FLOAT` - Minimum similarity for auto-fixes (default: 0.95) - `--help` - Show help message @@ -318,7 +321,7 @@ linkml-reference-validator repair data [OPTIONS] DATA_FILE - `--dry-run / --no-dry-run, -n / -N` - Show changes without applying (default: dry-run) - `--auto-fix-threshold FLOAT, -a FLOAT` - Minimum similarity for auto-fixes (default: 0.95) - `--output PATH, -o PATH` - Output file path (default: overwrite with backup) -- `--config PATH` - Path to repair configuration file +- `--config PATH` - Path to configuration file (.yaml) - `--cache-dir PATH, -c PATH` - Directory 
for caching references - `--verbose, -v` - Verbose output with detailed logging - `--help` - Show help message @@ -412,11 +415,18 @@ Summary: --- -## Repair Configuration File +## Configuration File -Create `.linkml-reference-validator.yaml` for project-specific settings: +Create `.linkml-reference-validator.yaml` for project-specific settings. Use +the `validation` section for reference fetching behavior and `repair` for +auto-fix settings. ```yaml +validation: + reference_prefix_map: + geo: GEO + NCBIGeo: GEO + repair: # Confidence thresholds auto_fix_threshold: 0.95 @@ -471,6 +481,7 @@ linkml-reference-validator cache reference [OPTIONS] REFERENCE_ID ### Options - `--cache-dir PATH, -c PATH` - Directory for caching references (default: `references_cache`) +- `--config PATH` - Path to validation configuration file (.yaml) - `--force, -f` - Force re-fetch even if cached - `--verbose, -v` - Verbose output with detailed logging - `--help` - Show help message diff --git a/mkdocs.yml b/mkdocs.yml index 2b8f0bb..06c70ab 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -32,6 +32,7 @@ nav: - Python API: notebooks/03_python_api.ipynb - How-To Guides: - Validating OBO Files: how-to/validate-obo-files.md + - Validating Entrez Accessions: how-to/validate-entrez.md - Validating DOIs: how-to/validate-dois.md - Validating URLs: how-to/validate-urls.md - Using Local Files and URLs: how-to/use-local-files-and-urls.md diff --git a/src/linkml_reference_validator/cli/cache.py b/src/linkml_reference_validator/cli/cache.py index d0d775d..3d05b8f 100644 --- a/src/linkml_reference_validator/cli/cache.py +++ b/src/linkml_reference_validator/cli/cache.py @@ -6,9 +6,14 @@ from typing_extensions import Annotated from linkml_reference_validator.etl.reference_fetcher import ReferenceFetcher -from linkml_reference_validator.models import ReferenceValidationConfig - -from .shared import CacheDirOption, VerboseOption, ForceOption, setup_logging +from .shared import ( + CacheDirOption, + 
VerboseOption, + ForceOption, + ConfigFileOption, + setup_logging, + load_validation_config, +) logger = logging.getLogger(__name__) @@ -22,6 +27,7 @@ @cache_app.command(name="reference") def reference_command( reference_id: Annotated[str, typer.Argument(help="Reference ID (e.g., PMID:12345678 or DOI:10.1234/example)")], + config_file: ConfigFileOption = None, cache_dir: CacheDirOption = None, force: ForceOption = False, verbose: VerboseOption = False, @@ -41,7 +47,7 @@ def reference_command( """ setup_logging(verbose) - config = ReferenceValidationConfig() + config = load_validation_config(config_file) if cache_dir: config.cache_dir = cache_dir diff --git a/src/linkml_reference_validator/cli/repair.py b/src/linkml_reference_validator/cli/repair.py index aa1db84..11ad250 100644 --- a/src/linkml_reference_validator/cli/repair.py +++ b/src/linkml_reference_validator/cli/repair.py @@ -13,13 +13,16 @@ from ruamel.yaml import YAML from typing_extensions import Annotated -from linkml_reference_validator.models import ( - ReferenceValidationConfig, - RepairConfig, -) +from linkml_reference_validator.models import RepairConfig from linkml_reference_validator.validation.repairer import SupportingTextRepairer -from .shared import CacheDirOption, VerboseOption, setup_logging +from .shared import ( + CacheDirOption, + VerboseOption, + ConfigFileOption, + setup_logging, + load_validation_config, +) logger = logging.getLogger(__name__) @@ -75,13 +78,7 @@ def data_command( output: OutputOption = None, cache_dir: CacheDirOption = None, verbose: VerboseOption = False, - config_file: Annotated[ - Optional[Path], - typer.Option( - "--config", - help="Path to repair configuration file (.yaml)", - ), - ] = None, + config_file: ConfigFileOption = None, ): """Repair supporting text in a data file. 
@@ -120,7 +117,7 @@ def data_command( repair_config.dry_run = dry_run # Set up validation config - val_config = ReferenceValidationConfig() + val_config = load_validation_config(config_file) if cache_dir: val_config.cache_dir = cache_dir @@ -198,6 +195,7 @@ def text_command( cache_dir: CacheDirOption = None, verbose: VerboseOption = False, auto_fix_threshold: AutoFixThresholdOption = 0.95, + config_file: ConfigFileOption = None, ): """Attempt to repair a single supporting text quote. @@ -214,7 +212,7 @@ def text_command( """ setup_logging(verbose) - val_config = ReferenceValidationConfig() + val_config = load_validation_config(config_file) if cache_dir: val_config.cache_dir = cache_dir @@ -283,10 +281,21 @@ def _load_repair_config(config_file: Optional[Path]) -> RepairConfig: if config_data is None: return RepairConfig() + if not isinstance(config_data, dict): + return RepairConfig() + # Extract repair section if present - repair_data = config_data.get("repair", config_data) + if "repair" in config_data: + repair_data = config_data.get("repair") + if isinstance(repair_data, dict): + return RepairConfig(**repair_data) + return RepairConfig() + + repair_keys = set(RepairConfig.model_fields.keys()) + if repair_keys.intersection(config_data.keys()): + return RepairConfig(**config_data) - return RepairConfig(**repair_data) + return RepairConfig() def _extract_evidence_items( diff --git a/src/linkml_reference_validator/cli/shared.py b/src/linkml_reference_validator/cli/shared.py index c8426bb..8fb001b 100644 --- a/src/linkml_reference_validator/cli/shared.py +++ b/src/linkml_reference_validator/cli/shared.py @@ -4,8 +4,11 @@ from typing import Optional import typer +from ruamel.yaml import YAML # type: ignore from typing_extensions import Annotated +from linkml_reference_validator.models import ReferenceValidationConfig + # Common option definitions for reuse CacheDirOption = Annotated[ Optional[Path], @@ -34,6 +37,14 @@ ), ] +ConfigFileOption = Annotated[ + 
Optional[Path], + typer.Option( + "--config", + help="Path to validation configuration file (.yaml)", + ), +] + def setup_logging(verbose: bool) -> None: """Configure logging based on verbosity flag. @@ -45,3 +56,57 @@ def setup_logging(verbose: bool) -> None: if verbose: logging.basicConfig(level=logging.INFO) + + +def load_validation_config(config_file: Optional[Path]) -> ReferenceValidationConfig: + """Load validation configuration from file. + + Args: + config_file: Path to config file, or None for defaults + + Returns: + ReferenceValidationConfig instance + """ + if config_file is None: + for default_path in [ + Path(".linkml-reference-validator.yaml"), + Path(".linkml-reference-validator.yml"), + ]: + if default_path.exists(): + config_file = default_path + break + + if config_file is None: + return ReferenceValidationConfig() + + yaml = YAML(typ="safe") + with open(config_file) as f: + config_data = yaml.load(f) + + if not config_data: + return ReferenceValidationConfig() + + validation_data = _extract_validation_config_data(config_data) + if validation_data is None: + return ReferenceValidationConfig() + + return ReferenceValidationConfig(**validation_data) + + +def _extract_validation_config_data(config_data: object) -> Optional[dict]: + """Extract validation settings from a config object.""" + if not isinstance(config_data, dict): + return None + + if "validation" in config_data: + section = config_data.get("validation") + return section if isinstance(section, dict) else None + if "reference_validation" in config_data: + section = config_data.get("reference_validation") + return section if isinstance(section, dict) else None + + validation_keys = set(ReferenceValidationConfig.model_fields.keys()) + if validation_keys.intersection(config_data.keys()): + return config_data + + return None diff --git a/src/linkml_reference_validator/cli/validate.py b/src/linkml_reference_validator/cli/validate.py index 7aac087..dae05b7 100644 --- 
a/src/linkml_reference_validator/cli/validate.py +++ b/src/linkml_reference_validator/cli/validate.py @@ -10,7 +10,7 @@ from typing_extensions import Annotated from linkml_reference_validator.etl.text_extractor import TextExtractor -from linkml_reference_validator.models import ReferenceValidationConfig, ValidationReport +from linkml_reference_validator.models import ValidationReport from linkml_reference_validator.plugins.reference_validation_plugin import ( ReferenceValidationPlugin, ) @@ -18,7 +18,13 @@ SupportingTextValidator, ) -from .shared import CacheDirOption, VerboseOption, setup_logging +from .shared import ( + CacheDirOption, + VerboseOption, + ConfigFileOption, + setup_logging, + load_validation_config, +) logger = logging.getLogger(__name__) @@ -33,6 +39,7 @@ def text_command( text: Annotated[str, typer.Argument(help="Supporting text to validate")], reference_id: Annotated[str, typer.Argument(help="Reference ID (e.g., PMID:12345678 or DOI:10.1234/example)")], + config_file: ConfigFileOption = None, cache_dir: CacheDirOption = None, verbose: VerboseOption = False, ): @@ -51,7 +58,7 @@ def text_command( """ setup_logging(verbose) - config = ReferenceValidationConfig() + config = load_validation_config(config_file) if cache_dir: config.cache_dir = cache_dir @@ -115,6 +122,7 @@ def text_file_command( help="Show only summary statistics, not individual results", ), ] = False, + config_file: ConfigFileOption = None, cache_dir: CacheDirOption = None, verbose: VerboseOption = False, ): @@ -142,7 +150,7 @@ def text_file_command( typer.echo(f"Error: File not found: {file_path}", err=True) raise typer.Exit(1) - config = ReferenceValidationConfig() + config = load_validation_config(config_file) if cache_dir: config.cache_dir = cache_dir @@ -214,6 +222,7 @@ def data_command( Optional[str], typer.Option("--target-class", "-t", help="Target class to validate"), ] = None, + config_file: ConfigFileOption = None, cache_dir: CacheDirOption = None, verbose: VerboseOption 
= False, ): @@ -230,7 +239,7 @@ def data_command( """ setup_logging(verbose) - config = ReferenceValidationConfig() + config = load_validation_config(config_file) if cache_dir: config.cache_dir = cache_dir diff --git a/src/linkml_reference_validator/etl/reference_fetcher.py b/src/linkml_reference_validator/etl/reference_fetcher.py index 83f9282..8aa09ed 100644 --- a/src/linkml_reference_validator/etl/reference_fetcher.py +++ b/src/linkml_reference_validator/etl/reference_fetcher.py @@ -79,30 +79,32 @@ def fetch( >>> # ref = fetcher.fetch("PMID:12345678") >>> # ref = fetcher.fetch("file:./notes.md") """ + normalized_reference_id = self._normalize_reference_id(reference_id) + # Check memory cache - if not force_refresh and reference_id in self._cache: - return self._cache[reference_id] + if not force_refresh and normalized_reference_id in self._cache: + return self._cache[normalized_reference_id] # Check disk cache if not force_refresh: - cached = self._load_from_disk(reference_id) + cached = self._load_from_disk(normalized_reference_id) if cached: - self._cache[reference_id] = cached + self._cache[normalized_reference_id] = cached return cached # Find appropriate source using registry - source_class = ReferenceSourceRegistry.get_source(reference_id) + source_class = ReferenceSourceRegistry.get_source(normalized_reference_id) if not source_class: - logger.warning(f"No source found for reference type: {reference_id}") + logger.warning(f"No source found for reference type: {normalized_reference_id}") return None # Parse identifier and fetch - _, identifier = self._parse_reference_id(reference_id) + _, identifier = self._parse_reference_id(normalized_reference_id) source = source_class() content = source.fetch(identifier, self.config) if content: - self._cache[reference_id] = content + self._cache[normalized_reference_id] = content self._save_to_disk(content) return content @@ -129,6 +131,9 @@ def _parse_reference_id(self, reference_id: str) -> tuple[str, str]: ('file', 
'./test.md') >>> fetcher._parse_reference_id("url:https://example.com/page") ('url', 'https://example.com/page') + >>> config = ReferenceValidationConfig(reference_prefix_map={"geo": "GEO"}) + >>> ReferenceFetcher(config)._parse_reference_id("geo:GSE12345") + ('GEO', 'GSE12345') """ stripped = reference_id.strip() @@ -137,13 +142,38 @@ def _parse_reference_id(self, reference_id: str) -> tuple[str, str]: if match: prefix = match.group(1) # Preserve case for file/url, uppercase for others - if prefix.lower() not in ("file", "url"): - prefix = prefix.upper() + prefix = self._normalize_prefix(prefix) + prefix = self._apply_prefix_map(prefix) return prefix, match.group(2).strip() if reference_id.strip().isdigit(): return "PMID", reference_id.strip() return "UNKNOWN", reference_id + def _normalize_reference_id(self, reference_id: str) -> str: + """Normalize reference IDs using configured prefix aliases.""" + prefix, identifier = self._parse_reference_id(reference_id) + if prefix == "UNKNOWN": + return reference_id.strip() + return f"{prefix}:{identifier}" + + def _normalize_prefix(self, prefix: str) -> str: + """Normalize prefix casing with special handling for file/url.""" + if prefix.lower() in ("file", "url"): + return prefix.lower() + return prefix.upper() + + def _apply_prefix_map(self, prefix: str) -> str: + """Apply configured prefix aliases.""" + prefix_map = self._normalized_prefix_map() + return prefix_map.get(prefix, prefix) + + def _normalized_prefix_map(self) -> dict[str, str]: + """Return a case-normalized prefix map.""" + normalized: dict[str, str] = {} + for key, value in self.config.reference_prefix_map.items(): + normalized[self._normalize_prefix(key)] = self._normalize_prefix(value) + return normalized + def _get_cache_path(self, reference_id: str) -> Path: """Get the cache file path for a reference. 
diff --git a/src/linkml_reference_validator/etl/sources/__init__.py b/src/linkml_reference_validator/etl/sources/__init__.py index 1483397..a20e12a 100644 --- a/src/linkml_reference_validator/etl/sources/__init__.py +++ b/src/linkml_reference_validator/etl/sources/__init__.py @@ -6,7 +6,7 @@ Examples: >>> from linkml_reference_validator.etl.sources import ReferenceSourceRegistry >>> sources = ReferenceSourceRegistry.list_sources() - >>> len(sources) >= 4 + >>> len(sources) >= 7 True """ @@ -20,6 +20,11 @@ from linkml_reference_validator.etl.sources.doi import DOISource from linkml_reference_validator.etl.sources.file import FileSource from linkml_reference_validator.etl.sources.url import URLSource +from linkml_reference_validator.etl.sources.entrez import ( + GEOSource, + BioProjectSource, + BioSampleSource, +) __all__ = [ "ReferenceSource", @@ -28,4 +33,7 @@ "DOISource", "FileSource", "URLSource", + "GEOSource", + "BioProjectSource", + "BioSampleSource", ] diff --git a/src/linkml_reference_validator/etl/sources/entrez.py b/src/linkml_reference_validator/etl/sources/entrez.py new file mode 100644 index 0000000..578e3ac --- /dev/null +++ b/src/linkml_reference_validator/etl/sources/entrez.py @@ -0,0 +1,235 @@ +"""Entrez summary-based reference sources. + +Provides a shared base class for NCBI Entrez E-utilities summary endpoints. + +Examples: + >>> from linkml_reference_validator.etl.sources.entrez import GEOSource + >>> GEOSource.prefix() + 'GEO' + >>> GEOSource.can_handle("geo:GSE12345") + True +""" + +import logging +import re +import time +from typing import Any, Optional + +from Bio import Entrez # type: ignore + +from linkml_reference_validator.models import ReferenceContent, ReferenceValidationConfig +from linkml_reference_validator.etl.sources.base import ReferenceSource, ReferenceSourceRegistry + +logger = logging.getLogger(__name__) + + +class EntrezSummarySource(ReferenceSource): + """Base class for Entrez summary-based sources. 
+ + Subclasses define the Entrez database and field mappings for title/content. + + Examples: + >>> class ExampleSource(EntrezSummarySource): + ... PREFIX = "EXAMPLE" + ... ENTREZ_DB = "example_db" + ... TITLE_FIELDS = ("title",) + ... CONTENT_FIELDS = ("summary",) + >>> ExampleSource.prefix() + 'EXAMPLE' + """ + + PREFIX: str = "" + ENTREZ_DB: str = "" + TITLE_FIELDS: tuple[str, ...] = () + CONTENT_FIELDS: tuple[str, ...] = () + ID_PATTERNS: tuple[str, ...] = () + + @classmethod + def prefix(cls) -> str: + """Return the prefix this source handles. + + Examples: + >>> class ExampleSource(EntrezSummarySource): + ... PREFIX = "EXAMPLE" + ... ENTREZ_DB = "example_db" + ... TITLE_FIELDS = ("title",) + ... CONTENT_FIELDS = ("summary",) + >>> ExampleSource.prefix() + 'EXAMPLE' + """ + return cls.PREFIX + + @classmethod + def can_handle(cls, reference_id: str) -> bool: + """Check if this source can handle the given reference ID. + + Supports prefixed references and optional raw accessions. + + Examples: + >>> class ExampleSource(EntrezSummarySource): + ... PREFIX = "EXAMPLE" + ... ENTREZ_DB = "example_db" + ... TITLE_FIELDS = ("title",) + ... CONTENT_FIELDS = ("summary",) + ... ID_PATTERNS = (r"^EX\\d+$",) + >>> ExampleSource.can_handle("EXAMPLE:EX123") + True + >>> ExampleSource.can_handle("EX123") + True + """ + if super().can_handle(reference_id): + return True + if cls.ID_PATTERNS: + for pattern in cls.ID_PATTERNS: + if re.match(pattern, reference_id, re.IGNORECASE): + return True + return False + + def fetch( + self, identifier: str, config: ReferenceValidationConfig + ) -> Optional[ReferenceContent]: + """Fetch a summary record from an Entrez database. 
+ + Args: + identifier: Identifier or accession + config: Configuration including rate limiting and email + + Returns: + ReferenceContent if successful, None otherwise + """ + if not self.ENTREZ_DB: + logger.warning("EntrezSummarySource missing ENTREZ_DB configuration") + return None + + Entrez.email = config.email # type: ignore + time.sleep(config.rate_limit_delay) + + handle = None + try: + handle = Entrez.esummary(db=self.ENTREZ_DB, id=identifier) + records = Entrez.read(handle) + except Exception as exc: + logger.warning( + f"Failed to fetch Entrez summary for {self.prefix()}:{identifier}: {exc}" + ) + return None + finally: + if handle is not None: + handle.close() + + record = self._extract_record(records) + if not record: + logger.warning(f"No Entrez summary found for {self.prefix()}:{identifier}") + return None + + title = self._get_first_field_value(record, self.TITLE_FIELDS) + content = self._get_first_field_value(record, self.CONTENT_FIELDS) + content_type = "summary" if content else "unavailable" + + return ReferenceContent( + reference_id=f"{self.prefix()}:{identifier}", + title=title, + content=content, + content_type=content_type, + metadata={"entrez_db": self.ENTREZ_DB}, + ) + + def _extract_record(self, records: Any) -> Optional[dict[str, Any]]: + """Extract the first summary record from Entrez results.""" + if isinstance(records, list): + if records: + return records[0] + return None + + if isinstance(records, dict): + docset = records.get("DocumentSummarySet") + if isinstance(docset, dict): + docs = docset.get("DocumentSummary") + if isinstance(docs, list) and docs: + return docs[0] + if isinstance(docs, dict): + return docs + return records + + return None + + def _get_first_field_value( + self, record: dict[str, Any], field_names: tuple[str, ...] 
+ ) -> Optional[str]: + """Return the first non-empty value from a record for the given fields.""" + if not field_names: + return None + + normalized_keys = {key.lower(): key for key in record.keys()} + for name in field_names: + record_key = normalized_keys.get(name.lower(), name) + value = record.get(record_key) + text = self._normalize_text(value) + if text: + return text + + return None + + def _normalize_text(self, value: Any) -> Optional[str]: + """Normalize summary field values into a string.""" + if value is None: + return None + if isinstance(value, (list, tuple)): + items = [str(item) for item in value if item] + return "; ".join(items) if items else None + text = str(value).strip() + return text if text else None + + +@ReferenceSourceRegistry.register +class GEOSource(EntrezSummarySource): + """Fetch GEO series and dataset summaries from Entrez. + + Examples: + >>> GEOSource.prefix() + 'GEO' + >>> GEOSource.can_handle("geo:GSE12345") + True + """ + + PREFIX = "GEO" + ENTREZ_DB = "gds" + TITLE_FIELDS = ("title", "description", "summary") + CONTENT_FIELDS = ("summary", "description", "title") + ID_PATTERNS = (r"^GSE\d+$", r"^GDS\d+$") + + +@ReferenceSourceRegistry.register +class BioProjectSource(EntrezSummarySource): + """Fetch BioProject summaries from Entrez. + + Examples: + >>> BioProjectSource.prefix() + 'BIOPROJECT' + >>> BioProjectSource.can_handle("bioproject:PRJNA000001") + True + """ + + PREFIX = "BIOPROJECT" + ENTREZ_DB = "bioproject" + TITLE_FIELDS = ("Project_Title", "Project_Name", "title") + CONTENT_FIELDS = ("Project_Description", "Description", "title") + ID_PATTERNS = (r"^PRJ[EDN][A-Z]?\d+$",) + + +@ReferenceSourceRegistry.register +class BioSampleSource(EntrezSummarySource): + """Fetch BioSample summaries from Entrez. 
+ + Examples: + >>> BioSampleSource.prefix() + 'BIOSAMPLE' + >>> BioSampleSource.can_handle("biosample:SAMN00000001") + True + """ + + PREFIX = "BIOSAMPLE" + ENTREZ_DB = "biosample" + TITLE_FIELDS = ("Title", "title", "Description") + CONTENT_FIELDS = ("Description", "Title", "title") + ID_PATTERNS = (r"^SAM[END]\d+$",) diff --git a/src/linkml_reference_validator/models.py b/src/linkml_reference_validator/models.py index dc6896b..e1e4ae7 100644 --- a/src/linkml_reference_validator/models.py +++ b/src/linkml_reference_validator/models.py @@ -339,6 +339,11 @@ class ReferenceValidationConfig(BaseModel): ... ) >>> config.supporting_text_regex 'ex:supporting_text="([^"]*)\\[(\\S+:\\S+)\\]"' + >>> config = ReferenceValidationConfig( + ... reference_prefix_map={"geo": "GEO", "NCBIGeo": "GEO"} + ... ) + >>> config.reference_prefix_map["geo"] + 'GEO' """ cache_dir: Path = Field( @@ -372,6 +377,13 @@ class ReferenceValidationConfig(BaseModel): ge=1, description="Regex capture group number containing the reference ID", ) + reference_prefix_map: dict[str, str] = Field( + default_factory=dict, + description=( + "Optional mapping of alternate prefixes to canonical prefixes, " + "e.g. {'geo': 'GEO', 'NCBIGeo': 'GEO'}" + ), + ) def get_cache_dir(self) -> Path: """Create and return the cache directory. 
diff --git a/tests/test_reference_fetcher.py b/tests/test_reference_fetcher.py index 15c567f..b4b46a9 100644 --- a/tests/test_reference_fetcher.py +++ b/tests/test_reference_fetcher.py @@ -39,6 +39,24 @@ def test_parse_reference_id(fetcher): assert fetcher._parse_reference_id("url:https://example.com") == ("url", "https://example.com") +def test_parse_reference_id_with_prefix_map(tmp_path): + """Test parsing with configurable prefix aliases.""" + config = ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + reference_prefix_map={ + "geo": "GEO", + "NCBIGeo": "GEO", + "bioproject": "BIOPROJECT", + }, + ) + fetcher = ReferenceFetcher(config) + + assert fetcher._parse_reference_id("geo:GSE12345") == ("GEO", "GSE12345") + assert fetcher._parse_reference_id("NCBIGeo:GSE12345") == ("GEO", "GSE12345") + assert fetcher._parse_reference_id("bioproject:PRJNA12345") == ("BIOPROJECT", "PRJNA12345") + + def test_get_cache_path(fetcher): """Test cache path generation.""" path = fetcher._get_cache_path("PMID:12345678") diff --git a/tests/test_sources.py b/tests/test_sources.py index 91354ff..a09f068 100644 --- a/tests/test_sources.py +++ b/tests/test_sources.py @@ -9,6 +9,11 @@ from linkml_reference_validator.etl.sources.url import URLSource from linkml_reference_validator.etl.sources.pmid import PMIDSource from linkml_reference_validator.etl.sources.doi import DOISource +from linkml_reference_validator.etl.sources.entrez import ( + GEOSource, + BioProjectSource, + BioSampleSource, +) class TestReferenceSourceRegistry: @@ -22,6 +27,9 @@ def test_registry_has_default_sources(self): assert "DOI" in prefixes assert "file" in prefixes assert "url" in prefixes + assert "GEO" in prefixes + assert "BIOPROJECT" in prefixes + assert "BIOSAMPLE" in prefixes def test_get_source_for_pmid(self): """Should return PMIDSource for PMID references.""" @@ -296,3 +304,77 @@ def test_can_handle_doi(self, source): """Should handle DOI references.""" assert 
source.can_handle("DOI:10.1234/test") assert not source.can_handle("PMID:12345678") + + +class TestEntrezSummarySources: + """Tests for Entrez summary-based sources.""" + + @pytest.fixture + def config(self, tmp_path): + """Create test config.""" + return ReferenceValidationConfig( + cache_dir=tmp_path / "cache", + rate_limit_delay=0.0, + ) + + @pytest.mark.parametrize( + ("source_cls", "reference_id", "title_key", "content_key", "db_name"), + [ + (GEOSource, "GEO:GSE12345", "title", "summary", "gds"), + ( + BioProjectSource, + "BioProject:PRJNA000001", + "Project_Title", + "Project_Description", + "bioproject", + ), + (BioSampleSource, "biosample:SAMN00000001", "Title", "Description", "biosample"), + ], + ) + @patch("linkml_reference_validator.etl.sources.entrez.Entrez.read") + @patch("linkml_reference_validator.etl.sources.entrez.Entrez.esummary") + def test_fetch_entrez_summary( + self, + mock_esummary, + mock_read, + source_cls, + reference_id, + title_key, + content_key, + db_name, + config, + ): + """Should fetch summary records for Entrez-backed sources.""" + mock_handle = MagicMock() + mock_esummary.return_value = mock_handle + mock_read.return_value = [ + { + title_key: "Example Title", + content_key: "Example content summary.", + } + ] + + source = source_cls() + result = source.fetch(reference_id.split(":", 1)[1], config) + + assert result is not None + assert result.reference_id == f"{source.prefix()}:{reference_id.split(':', 1)[1]}" + assert result.title == "Example Title" + assert result.content == "Example content summary." 
+ assert result.content_type == "summary" + assert result.metadata["entrez_db"] == db_name + mock_esummary.assert_called_once_with(db=db_name, id=reference_id.split(":", 1)[1]) + mock_handle.close.assert_called_once() + + @pytest.mark.parametrize( + ("source", "valid_id", "invalid_id"), + [ + (GEOSource(), "geo:GSE12345", "DOI:10.1000/test"), + (BioProjectSource(), "bioproject:PRJNA12345", "PMID:123"), + (BioSampleSource(), "biosample:SAMN12345", "url:https://example.com"), + ], + ) + def test_can_handle_entrez_sources(self, source, valid_id, invalid_id): + """Should handle prefixed Entrez references and reject others.""" + assert source.can_handle(valid_id) + assert not source.can_handle(invalid_id) diff --git a/tests/test_validation_config.py b/tests/test_validation_config.py new file mode 100644 index 0000000..a3dcd36 --- /dev/null +++ b/tests/test_validation_config.py @@ -0,0 +1,38 @@ +"""Tests for validation configuration loading.""" + +from linkml_reference_validator.cli.shared import load_validation_config + + +def test_load_validation_config_from_section(tmp_path): + """Should load validation config from a named section.""" + config_file = tmp_path / ".linkml-reference-validator.yaml" + config_file.write_text( + """ +validation: + cache_dir: references_cache + reference_prefix_map: + geo: GEO + NCBIGeo: GEO +""" + ) + + config = load_validation_config(config_file) + + assert config.cache_dir.name == "references_cache" + assert config.reference_prefix_map["geo"] == "GEO" + assert config.reference_prefix_map["NCBIGeo"] == "GEO" + + +def test_load_validation_config_ignores_repair_only(tmp_path): + """Should ignore files that only define repair settings.""" + config_file = tmp_path / ".linkml-reference-validator.yaml" + config_file.write_text( + """ +repair: + auto_fix_threshold: 0.97 +""" + ) + + config = load_validation_config(config_file) + + assert config.reference_prefix_map == {}