Skip to content
6 changes: 5 additions & 1 deletion edam_mcp/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .models.segmentation import SegmentationRequest, SegmentationResponse
from .models.suggestion import SuggestionRequest, SuggestionResponse
from .models.workflow import WorkflowSummaryRequest, WorkflowSummaryResponse
from .tools import map_to_edam_concept, suggest_new_concept
from .tools import map_to_edam_concept, map_to_edam_operation, suggest_new_concept
from .tools.segment_text import segment_text
from .tools.workflow import get_workflow_summary

Expand Down Expand Up @@ -47,6 +47,10 @@ async def segment_text_tool(request: SegmentationRequest, context: Context) -> S
async def map_to_edam_concept_tool(request: MappingRequest, context: Context) -> MappingResponse:
return await map_to_edam_concept(request, context)

@mcp.tool
async def map_to_edam_operation_tool(request: MappingRequest, context: Context) -> MappingResponse:
    """MCP tool wrapper that delegates to :func:`map_to_edam_operation`."""
    return await map_to_edam_operation(request, context)

@mcp.tool
async def suggest_new_concept_tool(request: SuggestionRequest, context: Context) -> SuggestionResponse:
    """MCP tool wrapper that delegates to :func:`suggest_new_concept`."""
    return await suggest_new_concept(request, context)
Expand Down
4 changes: 2 additions & 2 deletions edam_mcp/ontology/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""EDAM ontology handling modules."""

from .loader import OntologyLoader
from .matcher import ConceptMatcher
from .matcher import ConceptMatcher, EDAMConceptType
from .suggester import ConceptSuggester

__all__ = ["OntologyLoader", "ConceptMatcher", "ConceptSuggester"]
__all__ = ["OntologyLoader", "ConceptMatcher", "EDAMConceptType", "ConceptSuggester"]
73 changes: 67 additions & 6 deletions edam_mcp/ontology/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import logging
import os
from enum import Enum

import numpy as np

Expand All @@ -13,6 +14,15 @@
logger = logging.getLogger(__name__)


# define enum for EDAM concept types
class EDAMConceptType(Enum):
    """Top-level EDAM concept branches, plus ANY to match across all branches.

    Values mirror the ``type`` strings stored on loaded ontology concepts
    (e.g. ``concept["type"] == "Operation"``), so members can be compared
    against concept metadata via ``.value``.
    """

    TOPIC = "Topic"
    OPERATION = "Operation"
    DATA = "Data"
    FORMAT = "Format"
    ANY = "Any"

class ConceptMatcher:
"""Handles semantic matching of descriptions to EDAM concepts."""

Expand All @@ -25,6 +35,7 @@ def __init__(self, ontology_loader: OntologyLoader):
self.ontology_loader = ontology_loader
self.embedding_model = None
self.concept_embeddings: dict[str, np.ndarray] = {}
self.operation_embeddings: dict[str, np.ndarray] = {}
self.use_chromadb = settings.use_chromadb
self.chroma_db = os.path.join(settings.cache_dir, "default.db")
# Don't build embeddings immediately - do it lazily when needed
Expand All @@ -50,13 +61,20 @@ def _build_embeddings(self) -> None:
logger.error("chromadb not available. Install with: pip install chromadb")
return
client = chromadb.PersistentClient(path=self.chroma_db)

# Further details at: https://docs.trychroma.com/docs/collections/configure#hnsw-index-configuration
embedding_function = SentenceTransformerEmbeddingFunction(settings.embedding_model)
collection = client.get_or_create_collection(
name="concept_embeddings",
embedding_function=embedding_function,
configuration={"hnsw": {"space": "cosine", "ef_construction": 200}},
)
operation_collection = client.get_or_create_collection(
name="operation_embeddings",
embedding_function=embedding_function,
configuration={"hnsw": {"space": "cosine", "ef_construction": 200}},
)

logger.info("Building concept embeddings and storing in ChromaDB...")
else:
logger.info("Building concept embeddings and storing in memory...")
Expand Down Expand Up @@ -101,18 +119,42 @@ def _build_embeddings(self) -> None:
}
],
)
if concept["type"] == "Operation":
operation_collection.add(
ids=[uri],
embeddings=[embedding.tolist()],
documents=[processed_text],
metadatas=[
{
"label": concept["label"],
"definition": concept.get("definition"),
"synonyms": (
", ".join(concept["synonyms"])
if isinstance(concept.get("synonyms"), list)
else concept.get("synonyms")
),
}
],
)
else:
self.concept_embeddings[uri] = embedding
if concept["type"] == "Operation":
self.operation_embeddings[uri] = embedding

if self.use_chromadb:
logger.info(f"Stored embeddings for {len(self.ontology_loader.concepts)} concepts in ChromaDB")
else:
logger.info(f"Built embeddings for {len(self.concept_embeddings)} concepts")

# Show the size of the embeddings dictionary
logger.info(f"Concept embeddings size: {len(self.concept_embeddings)}")
logger.info(f"Operation embeddings size: {len(self.operation_embeddings)}")

def match_concepts(
self,
description: str,
context: str | None = None,
concept_type: EDAMConceptType = EDAMConceptType.ANY,
max_results: int = 5,
min_confidence: float = 0.5,
) -> list[ConceptMatch]:
Expand All @@ -121,6 +163,7 @@ def match_concepts(
Args:
description: Text description to match.
context: Additional context information.
concept_type: Type of EDAM concept to match.
max_results: Maximum number of matches to return.
min_confidence: Minimum confidence threshold.

Expand All @@ -142,7 +185,7 @@ def match_concepts(
description_embedding = self.embedding_model.encode(processed_description, show_progress_bar=False)

# Calculate similarities
similarities = self._calculate_similarities(description_embedding, max_results * 2)
similarities = self._calculate_similarities(description_embedding, concept_type, max_results * 2)

# Filter and sort results
matches = []
Expand All @@ -164,11 +207,15 @@ def match_concepts(
matches.sort(key=lambda x: x.confidence, reverse=True)
return matches[:max_results]

def _calculate_similarities(self, description_embedding: np.ndarray, max_results: int) -> list[tuple[str, float]]:
def _calculate_similarities(
self, description_embedding: np.ndarray, concept_type: EDAMConceptType, max_results: int
) -> list[tuple[str, float]]:
"""Calculate cosine similarities between description and all concepts.

Args:
description_embedding: Embedding of the description.
concept_type: Type of EDAM concept to match.
max_results: Maximum number of results to return.

Returns:
List of (concept_uri, similarity) tuples.
Expand All @@ -183,7 +230,14 @@ def _calculate_similarities(self, description_embedding: np.ndarray, max_results
return []

client = chromadb.PersistentClient(path=self.chroma_db)
collection = client.get_or_create_collection(name="concept_embeddings")
if concept_type == EDAMConceptType.OPERATION:
logger.info("Using ChromaDB to query operation embeddings")
collection = client.get_or_create_collection(name="operation_embeddings")
logger.info(f"Collection has {collection.count()} items")
else:
logger.info("Using ChromaDB to query concept embeddings")
collection = client.get_or_create_collection(name="concept_embeddings")
logger.info(f"Collection has {collection.count()} items")
# Use ChromaDB's default query for similarity search
query_results = collection.query(
query_embeddings=[description_embedding],
Expand All @@ -195,9 +249,16 @@ def _calculate_similarities(self, description_embedding: np.ndarray, max_results
similarity_scores = [1.0 - d for d in distances]
similarities = list(zip(ids, similarity_scores))
else:
for uri, concept_embedding in self.concept_embeddings.items():
similarity = self._cosine_similarity(description_embedding, concept_embedding)
similarities.append((uri, similarity))
if concept_type == EDAMConceptType.OPERATION:
logger.info("Calculating similarities using in-memory operation embeddings")
for uri, concept_embedding in self.operation_embeddings.items():
similarity = self._cosine_similarity(description_embedding, concept_embedding)
similarities.append((uri, similarity))
else:
logger.info("Calculating similarities using in-memory concept embeddings")
for uri, concept_embedding in self.concept_embeddings.items():
similarity = self._cosine_similarity(description_embedding, concept_embedding)
similarities.append((uri, similarity))

similarities.sort(key=lambda x: x[1], reverse=True)
return similarities
Expand Down
4 changes: 2 additions & 2 deletions edam_mcp/tools/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""MCP tools for EDAM ontology operations."""

from .mapping import map_to_edam_concept
from .mapping import map_to_edam_concept, map_to_edam_operation
from .suggestion import suggest_new_concept

__all__ = ["map_to_edam_concept", "suggest_new_concept"]
__all__ = ["map_to_edam_concept", "map_to_edam_operation", "suggest_new_concept"]
64 changes: 63 additions & 1 deletion edam_mcp/tools/mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from fastmcp.server import Context

from ..models.mapping import MappingRequest, MappingResponse
from ..ontology import ConceptMatcher, OntologyLoader
from ..ontology import ConceptMatcher, EDAMConceptType, OntologyLoader
from ..utils.context import MockContext


Expand Down Expand Up @@ -67,6 +67,68 @@ async def map_to_edam_concept(request: MappingRequest, context: Context) -> Mapp
raise


async def map_to_edam_operation(request: MappingRequest, context: Context) -> MappingResponse:
    """Map a description to existing EDAM operations (i.e. bioinformatics data processing tasks).

    This tool takes a description (metadata, free text) and finds the most
    appropriate mappings to EDAM operations. It returns matches
    with confidence scores, indicating how well each operation matches the description.

    Args:
        request: Mapping request containing description and parameters.
        context: MCP context for logging and progress reporting.

    Returns:
        Mapping response with matched operations and confidence scores.

    Raises:
        RuntimeError: If the EDAM ontology cannot be loaded.
    """

    try:
        # Log the request (truncate long descriptions to keep logs readable)
        context.info(f"Mapping description: {request.description[:100]}...")

        # Initialize ontology components
        ontology_loader = OntologyLoader()
        if not ontology_loader.load_ontology():
            raise RuntimeError("Failed to load EDAM ontology")

        concept_matcher = ConceptMatcher(ontology_loader)

        # First try exact matches. find_exact_matches searches across all
        # concept types, so restrict the hits to Operation concepts here —
        # otherwise an exact Topic/Data/Format match would be reported by
        # the *operation* mapper with has_exact_match=True.
        exact_matches = [
            match
            for match in concept_matcher.find_exact_matches(request.description)
            if match.concept_type == EDAMConceptType.OPERATION.value
        ]

        if exact_matches:
            context.info(f"Found {len(exact_matches)} exact matches")
            return MappingResponse(
                matches=exact_matches,
                total_matches=len(exact_matches),
                has_exact_match=True,
                confidence_threshold=request.min_confidence,
            )

        # Fall back to semantic matching restricted to the Operation branch.
        context.info("Performing semantic matching...")
        matches = concept_matcher.match_concepts(
            description=request.description,
            context=request.context,
            concept_type=EDAMConceptType.OPERATION,
            max_results=request.max_results,
            min_confidence=request.min_confidence,
        )

        context.info(f"Found {len(matches)} semantic matches")

        return MappingResponse(
            matches=matches,
            total_matches=len(matches),
            has_exact_match=False,
            confidence_threshold=request.min_confidence,
        )

    except Exception as e:
        # Surface the failure to the MCP client, then re-raise so callers
        # see the original exception type.
        context.error(f"Error in concept mapping: {e}")
        raise


# Alternative function signature for direct use
async def map_description_to_concepts(
description: str,
Expand Down
80 changes: 80 additions & 0 deletions examples/basic_usage_operation_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""Basic usage example for the EDAM MCP server."""

import asyncio
import logging
import sys
from pathlib import Path

# Add the project root to the Python path so the example runs from a
# source checkout without installation.
project_root = Path(__file__).parent.parent
sys.path.insert(0, str(project_root))

# Import the package pieces used by the example; fail fast with install
# instructions when the package is not importable.
try:
    from edam_mcp.models.mapping import MappingRequest
    from edam_mcp.tools.mapping import map_to_edam_operation
    from edam_mcp.utils.context import MockContext
except ImportError as e:
    print(f"Error importing edam_mcp: {e}")
    print("Make sure you have installed the package in development mode:")
    print(" uv sync --dev")
    sys.exit(1)

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def example_operation_mapping():
    """Demonstrate mapping free-text descriptions to EDAM operation concepts."""
    print("=== EDAM Operation Mapping Example ===\n")

    # Descriptions to run through the operation mapper.
    descriptions = [
        "The Spectra package defines an efficient infrastructure for storing "
        "and handling mass spectrometry spectra and functionality to subset, "
        "process, visualize and compare spectra data. It provides different implementations "
        "(backends) to store mass spectrometry data. These comprise backends tuned for fast "
        "data access and processing and backends for very large data sets ensuring a small memory footprint.",
    ]

    for description in descriptions:
        print(f"Mapping: {description}")

        try:
            mapping_request = MappingRequest(
                description=description,
                context="bioinformatics",
                max_results=5,
                min_confidence=0.5,
            )

            response = await map_to_edam_operation(mapping_request, context=MockContext())

            if not response.matches:
                print(" No matches found")
                continue

            print(f" Found {response.total_matches} matches:")
            for match in response.matches:
                print(f" - {match.concept_label} (confidence: {match.confidence:.2f})")
                print(f" URI: {match.concept_uri}")
                print(f" Type: {match.concept_type}")
                assert match.concept_type == "Operation"

        except Exception as e:
            print(f" Error: {e}")


async def main():
    """Entry point: print a banner, then run the operation-mapping example."""
    banner = "EDAM MCP Server - Bioconductor Spectra - Mapping"
    print(banner)
    print("=" * 50)

    await example_operation_mapping()


# Run the async example via the asyncio event loop when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())