This guide explains how to add semantic vector search to ChatSeek for fuzzy sample type matching.
Current behavior: Exact label matching
- Query: "Find samples for measuring cell permeability"
- Result: ❌ No matches (doesn't recognize "PERM_Analysis")
With vector search: Semantic matching
- Query: "Find samples for measuring cell permeability"
- Result: ✅ Finds PERM_Analysis (89% similarity)
┌─────────────────────────────────────────────────┐
│ 1. Setup Phase (One-time) │
├─────────────────────────────────────────────────┤
│ • Create sample type descriptions │
│ • Generate embeddings (OpenAI/Anthropic/local) │
│ • Store in Neo4j vector index │
│ • Index: ~602 SCXP_Analysis, ~100 PERM, etc. │
└─────────────────────────────────────────────────┘
↓
┌─────────────────────────────────────────────────┐
│ 2. Query Phase (Real-time) │
├─────────────────────────────────────────────────┤
│ • Embed user's natural language query │
│ • Vector similarity search (cosine distance) │
│ • Top-K most similar sample types │
│ • Inject hints into Cypher query builder │
└─────────────────────────────────────────────────┘
Create semantic descriptions for each node type:
# chatseek/graphrag/sample_descriptions.py

# Natural-language descriptions, one per node label. The vector search
# matches user queries against these strings — not the label names — so
# keep them descriptive and domain-specific.
SAMPLE_TYPE_DESCRIPTIONS = {
    # Sample types
    "NHP_Sample": "Non-human primate biological specimen from animal subject",
    "CEX_Sample": "Cell expression assay sample from flow cytometry or immunoassay",
    "DNA_Sample": "Genomic DNA extracted for sequencing or genotyping analysis",
    "SEQ_Sample": "RNA or DNA sample prepared for high-throughput sequencing",
    "RNA_Sample": "Total RNA or mRNA sample for gene expression analysis",
    # Analysis types
    "SCXP_Analysis": "Single-cell expression profiling analysis with cell type identification",
    "PERM_Analysis": "Permeability assay analysis measuring barrier function and transport",
    "FLOW_Analysis": "Flow cytometry analysis for cell population characterization",
    # Data types
    "SEQ_Data": "Raw sequencing data files (FASTQ, BAM) from NGS platforms",
    "SCXP_Data": "Single-cell RNA-seq count matrices and metadata",
    "PERM_Data": "Permeability coefficient measurements and transport kinetics",
    # Model types
    "ML_Model": "Machine learning model for prediction or classification",
    "STAT_Model": "Statistical model for hypothesis testing and inference",
}

# chatseek/setup/create_vector_index.py
from chatseek.core.database import get_driver
from chatseek.utils.llm import get_llm
from chatseek.graphrag.sample_descriptions import SAMPLE_TYPE_DESCRIPTIONS
def setup_vector_index():
"""One-time setup for vector search."""
driver = get_driver()
llm = get_llm()
with driver.session() as session:
# 1. Create vector index (1536 dimensions for OpenAI ada-002)
session.run("""
CREATE VECTOR INDEX sample_type_embeddings IF NOT EXISTS
FOR (n:SampleTypeEmbedding)
ON n.embedding
OPTIONS {indexConfig: {
`vector.dimensions`: 1536,
`vector.similarity_function`: 'cosine'
}}
""")
print("✓ Vector index created")
# 2. Generate and store embeddings
for sample_type, description in SAMPLE_TYPE_DESCRIPTIONS.items():
# Generate embedding (implementation depends on your LLM)
embedding = llm.embed(description) # Returns list of 1536 floats
# Store in Neo4j
session.run("""
MERGE (e:SampleTypeEmbedding {sample_type: $type})
SET e.description = $desc,
e.embedding = $emb,
e.updated_at = datetime()
""", type=sample_type, desc=description, emb=embedding)
print(f" • Embedded: {sample_type}")
print(f"✓ Indexed {len(SAMPLE_TYPE_DESCRIPTIONS)} sample types")
driver.close()
if __name__ == "__main__":
setup_vector_index()Run once:
python chatseek/setup/create_vector_index.py# chatseek/graphrag/semantic_search.py
class SemanticSearchEnhancer:
    """Enhance queries with vector similarity search."""

    def __init__(self, driver, llm):
        # driver: Neo4j driver; llm: object exposing .embed(text) -> list[float]
        self.driver = driver
        self.llm = llm

    def find_relevant_sample_types(
        self,
        query: str,
        top_k: int = 3,
        min_score: float = 0.7
    ) -> list[tuple[str, float]]:
        """
        Find sample types semantically similar to query.

        Args:
            query: Natural language query
            top_k: Number of results to return
            min_score: Minimum similarity score (0-1)

        Returns:
            List of (sample_type, similarity_score) tuples
        """
        # Embed the query once, then let Neo4j rank the stored description
        # embeddings by cosine similarity against it.
        query_vector = self.llm.embed(query)

        with self.driver.session() as session:
            records = session.run("""
                CALL db.index.vector.queryNodes(
                    'sample_type_embeddings',
                    $top_k,
                    $query_emb
                )
                YIELD node, score
                WHERE score >= $min_score
                RETURN node.sample_type as type,
                       node.description as desc,
                       score
                ORDER BY score DESC
            """, top_k=top_k, query_emb=query_vector, min_score=min_score)
            # Materialise while the session is still open.
            return [(record["type"], record["score"]) for record in records]

# chatseek/graphrag/query_engine.py (enhanced)
class QueryEngine:
    """Natural-language query engine: extract -> (semantic hints) -> Cypher -> execute."""

    def __init__(self, driver, llm, use_semantic_search=True):
        self.driver = driver
        self.llm = llm
        self.extractor = EntityExtractor(llm)
        self.builder = QueryBuilder()
        # NEW: semantic enhancer is optional so callers can opt out.
        if use_semantic_search:
            self.semantic_search = SemanticSearchEnhancer(driver, llm)
        else:
            self.semantic_search = None

    def query(self, natural_language: str, verbose=False):
        """Answer a natural-language question against the graph.

        Returns a dict with results, the generated Cypher, and any
        semantic hints that were attached to the intent.
        """
        if verbose:
            print(f"🔍 Processing: {natural_language}\n")

        # Step 1: Extract entities (existing)
        intent = self.extractor.extract(natural_language)
        if verbose:
            print(f"1️⃣ Extracted entities: {intent.entities}")

        # Step 2: Semantic enhancement (NEW)
        if self.semantic_search:
            semantic_hints = self.semantic_search.find_relevant_sample_types(
                natural_language, top_k=3, min_score=0.7
            )
            if semantic_hints and verbose:
                print(f"2️⃣ Semantic matches:")
                for sample_type, score in semantic_hints:
                    print(f" • {sample_type} ({score:.2%} similarity)")
            # Attach hints so the builder can fall back to them.
            intent.semantic_hints = semantic_hints

        # Step 3: Build query (enhanced with semantic hints)
        cypher_query = self.builder.build_from_intent(intent)
        if verbose:
            print(f"3️⃣ Generated Cypher:\n{cypher_query[:200]}...\n")

        # Step 4: Execute
        results = self._execute_query(cypher_query)
        return {
            "success": True,
            "results": results,
            "cypher_query": cypher_query,
            "semantic_hints": getattr(intent, 'semantic_hints', [])
        }

# chatseek/graphrag/query_builder.py (enhancement)
class QueryBuilder:
    """Build Cypher queries from intents, with a semantic-hint fallback."""

    def build_from_intent(self, intent: Intent) -> str:
        """Build Cypher query from extracted intent."""
        # Fall back to semantic hints only when extraction found no entities.
        hints = getattr(intent, 'semantic_hints', None)
        if not intent.entities and hints:
            # Use top semantic match as label filter
            top_match, score = hints[0]
            if intent.intent_type == "find_samples":
                return self.build_samples_by_type(top_match)
            if intent.intent_type == "show_provenance":
                return self.build_provenance_for_analysis("", top_match)
        # Original logic...
        return self._build_original_query(intent)

    def build_samples_by_type(self, sample_type: str) -> str:
        """Find all samples of a specific type (from semantic match)."""
        return f"""
        MATCH (s:{sample_type})
        RETURN s.uid, s.name, s.id, labels(s) as sample_type
        ORDER BY s.uid
        LIMIT 100
        """

engine = QueryEngine(driver, llm, use_semantic_search=True)
# Without semantic search: fails (no exact "permeability" label)
# With semantic search: finds PERM_Analysis
result = engine.query(
"Show me samples used for measuring cell permeability",
verbose=True
)
# Output:
# 🔍 Processing: Show me samples used for measuring cell permeability
# 1️⃣ Extracted entities: []
# 2️⃣ Semantic matches:
# • PERM_Analysis (89.2% similarity)
# • PERM_Data (76.5% similarity)
# 3️⃣ Generated Cypher:
# MATCH (s:PERM_Analysis) ...
# ✅ Found 142 results

result = engine.query("Find gene expression data", verbose=True)
# Semantic matches:
# • SCXP_Analysis (92.1% similarity) - single-cell
# • SEQ_Sample (87.3% similarity) - bulk RNA-seq
# • RNA_Sample (85.6% similarity) - RNA extracts

# Study name is exact, sample type is semantic
result = engine.query("Find permeability samples in GBM Study")
# Extracts: study="GBM" (exact)
# Semantic: PERM_Analysis (fuzzy match on "permeability")
# Query: MATCH (study:Study {name: 'GBM'})-[:HAS_ASSAY]->(a)-[:OUTPUT]->(s:PERM_Analysis)

Option 1: OpenAI (recommended)
import openai
def embed(text: str) -> list[float]:
response = openai.embeddings.create(
model="text-embedding-ada-002",
input=text
)
return response.data[0].embedding # 1536 dimensionsOption 2: Local (sentence-transformers)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
def embed(text: str) -> list[float]:
return model.encode(text).tolist() # 384 dimensionsOption 3: Anthropic (via prompt)
# Note: Anthropic doesn't have native embeddings API yet
# Use OpenAI or local model for embedding generation

# Adjust similarity threshold (0.0 - 1.0)
semantic_hints = searcher.find_relevant_sample_types(
query,
top_k=3, # Return top 3 matches
min_score=0.70 # Require 70%+ similarity
)
# Lower min_score = more fuzzy (may get false positives)
# Higher min_score = more strict (may miss valid matches)

- Latency: +100-200ms per query (embedding generation + vector search)
- Cache embeddings: Store query embeddings for repeated searches
- Index size: ~1KB per sample type description
- Memory: Neo4j vector index size ≈ dimensions × count × 4 bytes
- Example: 1536 dims × 50 types × 4 bytes = 307KB
- Fuzzy matching: "immune cells" → CEX_Sample
- Synonym handling: "gene expression" → SCXP_Analysis + SEQ_Sample
- Typo tolerance: "permability" → PERM_Analysis (0.85 similarity)
- User-friendly: Don't need to know exact label names
- Setup cost: Need to write/maintain sample type descriptions
- Latency: Adds 100-200ms per query
- False positives: Similar descriptions may not be biologically related
- Embedding cost: OpenAI charges ~$0.0001 per 1K tokens
When you add new node labels to Neo4j:

1. Add a description to SAMPLE_TYPE_DESCRIPTIONS
2. Re-run the embedding script: python chatseek/setup/create_vector_index.py
3. Verify with a test query
# Update existing descriptions and re-embed
session.run("""
MATCH (e:SampleTypeEmbedding {sample_type: 'PERM_Analysis'})
SET e.description = $new_desc,
e.embedding = $new_emb,
e.updated_at = datetime()
""", new_desc="...", new_emb=llm.embed("..."))# Log semantic match quality
if verbose:
print(f"Semantic confidence: {score:.2%}")
if score < 0.75:
print("⚠️ Low confidence - verify results")- Hybrid search: Combine exact keyword matching + semantic similarity
- Property-level search: Embed sample property descriptions for field-level matching
- Query expansion: Use semantic search to suggest alternative queries
- Feedback loop: Learn from user corrections to improve embeddings