Skip to content

Commit ea821bf

Browse files
NoopDog and claude authored
feat: add variable-level search with intent detection (#197)
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent fb98c1d commit ea821bf

File tree

12 files changed

+1192
-75
lines changed

12 files changed

+1192
-75
lines changed

app/components/Chat/chat.tsx

Lines changed: 110 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,21 @@ interface Study {
5050
title: string;
5151
}
5252

53+
/**
 * One entry of a variable-level search result, as carried in
 * `SearchResponse.variables`.
 *
 * In the variable results table of this component, `concept`, `variableName`
 * and `description` are shown as plain text; `studyUrl` is linked with
 * `studyTitle` as its label (falling back to `studyId` when the title is
 * empty); and `dbGapUrl` is linked with `phvId` as its label. `datasetId` and
 * `tableName` are part of the payload but are not rendered in the visible
 * table markup.
 */
interface Variable {
54+
concept: string;
55+
datasetId: string;
56+
dbGapUrl: string;
57+
description: string;
58+
phvId: string;
59+
studyId: string;
60+
studyTitle: string;
61+
studyUrl: string;
62+
tableName: string;
63+
variableName: string;
64+
}
65+
5366
interface SearchResponse {
67+
intent: "auto" | "study" | "variable";
5468
message: string | null;
5569
query: {
5670
mentions: Mention[];
@@ -63,6 +77,8 @@ interface SearchResponse {
6377
totalMs: number;
6478
};
6579
totalStudies: number;
80+
totalVariables: number;
81+
variables: Variable[];
6682
}
6783

6884
interface UserMessage {
@@ -299,46 +315,100 @@ function AssistantResponse({
299315
</SectionRow>
300316
)}
301317

302-
<ResultCount>
303-
Found {response.totalStudies}{" "}
304-
{response.totalStudies === 1 ? "study" : "studies"} in {totalSeconds}s
305-
</ResultCount>
306-
307-
{response.studies.length > 0 && (
308-
<StudyTable>
309-
<Table size="small">
310-
<TableHead>
311-
<TableRow>
312-
<TableCell>Title</TableCell>
313-
<TableCell>dbGaP Id</TableCell>
314-
<TableCell>Platform</TableCell>
315-
<TableCell>Focus / Disease</TableCell>
316-
<TableCell>Data Type</TableCell>
317-
<TableCell>Participants</TableCell>
318-
<TableCell>Study Design</TableCell>
319-
<TableCell>Consent Code</TableCell>
320-
</TableRow>
321-
</TableHead>
322-
<TableBody>
323-
{response.studies.map((study, i) => (
324-
<TableRow key={i}>
325-
<TableCell>{study.title}</TableCell>
326-
<TableCell>{study.dbGapId}</TableCell>
327-
<TableCell>{study.platforms.join(", ")}</TableCell>
328-
<TableCell>{study.focus}</TableCell>
329-
<TableCell>{study.dataTypes.join(", ")}</TableCell>
330-
<TableCell>
331-
{study.participantCount != null
332-
? study.participantCount.toLocaleString()
333-
: "—"}
334-
</TableCell>
335-
<TableCell>{study.studyDesigns.join(", ")}</TableCell>
336-
<TableCell>{study.consentCodes.join(", ")}</TableCell>
337-
</TableRow>
338-
))}
339-
</TableBody>
340-
</Table>
341-
</StudyTable>
318+
{response.intent === "auto" ? null : response.intent === "variable" ? (
319+
<>
320+
<ResultCount>
321+
Found {response.totalVariables}{" "}
322+
{response.totalVariables === 1 ? "variable" : "variables"} in{" "}
323+
{totalSeconds}s
324+
</ResultCount>
325+
{response.variables.length > 0 && (
326+
<StudyTable>
327+
<Table size="small">
328+
<TableHead>
329+
<TableRow>
330+
<TableCell>Concept</TableCell>
331+
<TableCell>Variable Name</TableCell>
332+
<TableCell>Description</TableCell>
333+
<TableCell>Study</TableCell>
334+
<TableCell>dbGaP</TableCell>
335+
</TableRow>
336+
</TableHead>
337+
<TableBody>
338+
{response.variables.map((v, i) => (
339+
<TableRow key={i}>
340+
<TableCell>{v.concept}</TableCell>
341+
<TableCell>{v.variableName}</TableCell>
342+
<TableCell>{v.description}</TableCell>
343+
<TableCell>
344+
<a
345+
href={v.studyUrl}
346+
rel="noopener noreferrer"
347+
target="_blank"
348+
>
349+
{v.studyTitle || v.studyId}
350+
</a>
351+
</TableCell>
352+
<TableCell>
353+
<a
354+
href={v.dbGapUrl}
355+
rel="noopener noreferrer"
356+
target="_blank"
357+
>
358+
{v.phvId}
359+
</a>
360+
</TableCell>
361+
</TableRow>
362+
))}
363+
</TableBody>
364+
</Table>
365+
</StudyTable>
366+
)}
367+
</>
368+
) : (
369+
<>
370+
<ResultCount>
371+
Found {response.totalStudies}{" "}
372+
{response.totalStudies === 1 ? "study" : "studies"} in{" "}
373+
{totalSeconds}s
374+
</ResultCount>
375+
{response.studies.length > 0 && (
376+
<StudyTable>
377+
<Table size="small">
378+
<TableHead>
379+
<TableRow>
380+
<TableCell>Title</TableCell>
381+
<TableCell>dbGaP Id</TableCell>
382+
<TableCell>Platform</TableCell>
383+
<TableCell>Focus / Disease</TableCell>
384+
<TableCell>Data Type</TableCell>
385+
<TableCell>Participants</TableCell>
386+
<TableCell>Study Design</TableCell>
387+
<TableCell>Consent Code</TableCell>
388+
</TableRow>
389+
</TableHead>
390+
<TableBody>
391+
{response.studies.map((study, i) => (
392+
<TableRow key={i}>
393+
<TableCell>{study.title}</TableCell>
394+
<TableCell>{study.dbGapId}</TableCell>
395+
<TableCell>{study.platforms.join(", ")}</TableCell>
396+
<TableCell>{study.focus}</TableCell>
397+
<TableCell>{study.dataTypes.join(", ")}</TableCell>
398+
<TableCell>
399+
{study.participantCount != null
400+
? study.participantCount.toLocaleString()
401+
: "—"}
402+
</TableCell>
403+
<TableCell>{study.studyDesigns.join(", ")}</TableCell>
404+
<TableCell>{study.consentCodes.join(", ")}</TableCell>
405+
</TableRow>
406+
))}
407+
</TableBody>
408+
</Table>
409+
</StudyTable>
410+
)}
411+
</>
342412
)}
343413
</AssistantBubble>
344414
);

backend/Makefile

Lines changed: 59 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@
55
# Prod: aws sso login --profile ncpi-prod-deployer
66
#
77
# Quick start:
8-
# make deploy-dev # build + push to dev
9-
# make deploy-prod # build + push to prod
8+
# make start # start local API server (rebuilds index if needed)
9+
# make restart # stop + start
10+
# make stop # stop running server
1011

1112
# Image settings
1213
IMAGE_NAME = ncpi-search-api
@@ -21,23 +22,68 @@ PROD_AWS_PROFILE = ncpi-prod-deployer
2122
PROD_ECR_REPO = 701973344956.dkr.ecr.us-east-1.amazonaws.com/5wu-ncpi-dataset-catalog
2223

2324
AWS_REGION = us-east-1
25+
PID_FILE = .server.pid
2426

25-
.PHONY: help build push-dev push-prod deploy-dev deploy-prod
27+
.PHONY: help build push-dev push-prod deploy-dev deploy-prod \
28+
start stop restart db-reload evals
2629

2730
# Default target
2831
help:
29-
@echo "NCPI Search API - Docker Build & Deploy"
32+
@echo "NCPI Search API"
3033
@echo ""
31-
@echo "Available targets:"
32-
@echo " help - Show this help message"
33-
@echo " build - Build Docker image locally"
34-
@echo " push-dev - Tag and push image to dev ECR"
35-
@echo " push-prod - Tag and push image to prod ECR"
36-
@echo " deploy-dev - Build + push to dev (all-in-one)"
37-
@echo " deploy-prod - Build + push to prod (all-in-one)"
34+
@echo "Local development:"
35+
@echo " start - Start local API server on :8000 (background)"
36+
@echo " stop - Stop running local server"
37+
@echo " restart - Stop + start"
38+
@echo " db-reload - Rebuild DuckDB index and restart server"
39+
@echo " evals - Run extract agent evals"
3840
@echo ""
39-
@echo "Quick start:"
40-
@echo " make deploy-dev"
41+
@echo "Docker deploy:"
42+
@echo " build - Build Docker image locally"
43+
@echo " deploy-dev - Build + push to dev ECR"
44+
@echo " deploy-prod - Build + push to prod ECR"
45+
46+
# --- Local development -------------------------------------------------
47+
48+
# Start API server in background
# Depends on `stop`, so a previously started server is shut down first.
# Runs uvicorn from .venv with stdout/stderr redirected to .server.log and
# writes the background PID to $(PID_FILE). After a 2s pause, `kill -0` checks
# that the PID is still alive; if it is not, the log is printed, the PID file
# is removed and the target exits 1.
49+
start: stop
50+
@echo "Starting API server on http://localhost:8000 ..."
51+
@.venv/bin/python -m uvicorn concept_search.api:app \
52+
--host 0.0.0.0 --port 8000 > .server.log 2>&1 & echo $$! > $(PID_FILE)
53+
@sleep 2
54+
@if kill -0 $$(cat $(PID_FILE)) 2>/dev/null; then \
55+
echo "Server running (PID $$(cat $(PID_FILE))), log: backend/.server.log"; \
56+
else \
57+
echo "Server failed to start. Check backend/.server.log"; \
58+
cat .server.log; \
59+
rm -f $(PID_FILE); \
60+
exit 1; \
61+
fi
62+
63+
# Stop running server
# If $(PID_FILE) exists, kills the PID it records and removes the file.
# Then, as a fallback, kills every process that lsof reports on port 8000;
# note this is not limited to servers started by this Makefile.
# All kill failures are swallowed (`|| true`), so the target succeeds even
# when nothing is running.
63+
stop:
64+
@if [ -f $(PID_FILE) ]; then \
65+
kill $$(cat $(PID_FILE)) 2>/dev/null && echo "Server stopped." || true; \
66+
rm -f $(PID_FILE); \
67+
fi
68+
@lsof -ti :8000 | xargs kill 2>/dev/null || true
70+
71+
# Stop + start
# Has no recipe of its own; it only sequences the `stop` and `start` targets.
# `start` already lists `stop` as a prerequisite, so naming `stop` here is
# redundant but harmless (make builds a given target at most once per run).
71+
restart: stop start
73+
74+
# Rebuild DuckDB index and restart server
# Stops the server, deletes ../catalog/concept-search.duckdb, then calls
# concept_search.index.get_index() with the .venv interpreter and prints the
# resulting study count. Presumably get_index() recreates the database file
# when it is missing -- confirm in concept_search.index. Finally re-invokes
# make to run the `start` target.
74+
db-reload: stop
75+
@echo "Deleting cached DuckDB index..."
76+
@rm -f ../catalog/concept-search.duckdb
77+
@echo "Rebuilding index (this may take a moment)..."
78+
@.venv/bin/python -c "from concept_search.index import get_index; idx = get_index(); print(f'Index rebuilt: {idx.store.study_count} studies')"
79+
@$(MAKE) start
81+
82+
# Run extract agent evals
# Executes the concept_search.eval_extract module with the .venv interpreter.
# The command has no `@` prefix, so make echoes it before running it.
82+
evals:
83+
.venv/bin/python -m concept_search.eval_extract
85+
86+
# --- Docker deploy ------------------------------------------------------
4187

4288
# Build Docker image
4389
build:

backend/concept_search/EXTRACT_PROMPT.md

Lines changed: 49 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,37 @@
1-
You are a query parser for the NCPI Dataset Catalog. Your job is to extract **mentions** from a researcher's natural-language query. A mention is a phrase that refers to a filterable property of a dataset.
1+
You are a query parser for the NCPI Dataset Catalog. Your job is to extract searchable **mentions** from a researcher's natural-language query. The catalog supports two search modes: finding **datasets/studies** and finding **measured variables**. You determine which mode the user intends, then extract the relevant facet mentions either way.
22

33
## Your Job
44

5-
Identify each distinct mention in the query, assign it to a facet, and extract the text. For small facets (platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry), resolve the values directly from the known lists below. For other facets (focus, measurement, consentCode), just extract the text — a separate agent will resolve the canonical values.
5+
1. Determine the query **intent**: is the user searching for studies/datasets, or for specific measured variables?
6+
2. Extract mentions from the query regardless of intent — the same facets apply to both modes. Assign each mention to a facet and extract the text. For small facets (platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry), resolve the values directly from the known lists below. For other facets (focus, measurement, consentCode), just extract the text — a separate agent will resolve the canonical values.
7+
8+
## Query Intent
9+
10+
Set the `intent` field to one of:
11+
12+
| Intent | When to Use | Examples |
13+
| ------------ | --------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
14+
| `"study"` | User wants to find studies or datasets | "diabetes datasets on AnVIL", "cancer studies with WGS", "cohorts released after 2024" |
15+
| `"variable"` | User wants to find specific measured variables | "what variables measure chocolate consumption?", "which phenotype variables capture BMI?", "what is measured for blood pressure?" |
16+
| `"auto"` | You cannot determine intent from context — set `message` to ask | "blood pressure" (could be studies about BP or variables measuring BP) |
17+
18+
**Signals for `"variable"` intent:**
19+
20+
- "variable(s)", "what is measured", "what measures", "which measurements"
21+
- "columns", "fields", "phenotype variables", "what data is collected"
22+
- Questions of the form "what variables..." or "which variables..."
23+
24+
**Signals for `"study"` intent:**
25+
26+
- "study/studies", "dataset(s)", "cohort(s)", "trial(s)"
27+
- Platform references ("on AnVIL", "in BDC")
28+
- Study-level facets (consent codes, study designs, demographics, platforms)
29+
30+
**Default behavior:**
31+
32+
- If the query mentions platforms, consent codes, study designs, demographics, or other study-level facets → default to `"study"`
33+
- If the query specifically asks about what is measured or what variables exist → default to `"variable"`
34+
- If intent is truly ambiguous, set `intent: "auto"` and add a `message`: "Are you looking for studies about [X], or for variables that measure [X]?"
635

736
## Facets
837

@@ -58,16 +87,23 @@ For these facets, extract the user's text and leave `values` empty. A resolve ag
5887

5988
## Instructions
6089

61-
1. Read the query and identify each distinct filterable mention.
62-
2. Assign each mention to a facet.
63-
3. For platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry: set `values` to the matching known value(s).
64-
4. For focus, measurement, consentCode: set `text` to the relevant phrase, leave `values` empty.
65-
5. Correct obvious typos in your text output (e.g., "systollic" → "systolic").
66-
6. Expand abbreviations (e.g., "SBP" → "systolic blood pressure", "BMI" → "body mass index").
67-
7. For small facets, ONLY when the user explicitly says "or" (e.g., "WGS or WXS"), create **one mention** with both values in the `values` list. The OR is expressed by having multiple values in a single mention.
68-
8. For other facets, ONLY when the user explicitly says "or", create **one mention** with the combined text.
69-
9. When the user says "and" between items of the same facet (e.g., "AnVIL and BDC", "heart disease and diabetes"), always create **separate mentions** — one per item. "And" means the user wants studies matching BOTH, not either. Similarly, create separate mentions for "but not", "excluding", etc. A separate agent handles the boolean logic.
70-
10. Do NOT invent values for focus, measurement, or consentCode — leave `values` empty for those.
90+
1. Determine the query **intent** (`"study"`, `"variable"`, or `"auto"`) — see "Query Intent" above.
91+
2. Read the query and identify each distinct filterable mention.
92+
3. Assign each mention to a facet.
93+
4. For platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry: set `values` to the matching known value(s).
94+
5. For focus, measurement, consentCode: set `text` to the relevant phrase, leave `values` empty.
95+
6. Correct obvious typos in your text output (e.g., "systollic" → "systolic").
96+
7. Expand abbreviations (e.g., "SBP" → "systolic blood pressure", "BMI" → "body mass index").
97+
8. For small facets, ONLY when the user explicitly says "or" (e.g., "WGS or WXS"), create **one mention** with both values in the `values` list. The OR is expressed by having multiple values in a single mention.
98+
9. For other facets, ONLY when the user explicitly says "or", create **one mention** with the combined text.
99+
10. When the user says "and" between items of the same facet (e.g., "AnVIL and BDC", "heart disease and diabetes"), always create **separate mentions** — one per item. "And" means the user wants results (studies or variables) matching BOTH, not either. Similarly, create separate mentions for "but not", "excluding", etc. A separate agent handles the boolean logic.
100+
11. Do NOT invent values for focus, measurement, or consentCode — leave `values` empty for those.
101+
102+
### Variable intent examples
103+
104+
- "what variables measure chocolate consumption?" → `intent: "variable"`, mention: `{facet: "measurement", text: "chocolate consumption"}`
105+
- "which variables capture blood pressure?" → `intent: "variable"`, mention: `{facet: "measurement", text: "blood pressure"}`
106+
- "what phenotype variables exist for BMI?" → `intent: "variable"`, mention: `{facet: "measurement", text: "body mass index"}`
71107

72108
## When to Set `message`
73109

@@ -76,5 +112,6 @@ If the query is too vague, ambiguous, or contains no searchable concepts, set `m
76112
- **No searchable terms:** "I couldn't identify any searchable terms. Try specifying a disease (e.g., diabetes), measurement (e.g., blood pressure), or data type (e.g., WGS)."
77113
- **Ambiguous term:** "I'm not sure what 'the blood one' refers to. Did you mean a measurement like blood pressure or blood glucose, or a disease like a blood disorder?"
78114
- **Partially vague:** Extract what you can and set `message` for the unclear part. E.g., for "diabetes studies with that thing" → extract focus="diabetes", message="I couldn't identify what 'that thing' refers to. Could you be more specific?"
115+
- **Ambiguous intent:** When a query could be either a study search or a variable search, set `intent: "auto"` and set `message` to: "Are you looking for studies about [X], or for variables that measure [X]?"
79116

80117
Leave `message` as null when the query is clear.

0 commit comments

Comments
 (0)