NIH-NCPI
diff --git a/‎app/components/Chat/chat.tsx‎
Lines changed: 110 additions & 40 deletions b/‎app/components/Chat/chat.tsx‎
Lines changed: 110 additions & 40 deletions
diff --git a/‎backend/Makefile‎
Lines changed: 59 additions & 13 deletions b/‎backend/Makefile‎
Lines changed: 59 additions & 13 deletions
diff --git a/‎backend/concept_search/EXTRACT_PROMPT.md‎
Lines changed: 49 additions & 12 deletions b/‎backend/concept_search/EXTRACT_PROMPT.md‎
Lines changed: 49 additions & 12 deletions
@@ -50,7 +50,21 @@ interface Study {
   title: string;
 }
 
+interface Variable { // One entry of SearchResponse.variables; rendered as a row of the variable results table.
+  concept: string; // Shown in the "Concept" column.
+  datasetId: string;
+  dbGapUrl: string; // href of the link in the "dbGaP" column.
+  description: string; // Shown in the "Description" column.
+  phvId: string; // Link text in the "dbGaP" column.
+  studyId: string; // Fallback link text in the "Study" column when studyTitle is empty.
+  studyTitle: string; // Preferred link text in the "Study" column.
+  studyUrl: string; // href of the link in the "Study" column.
+  tableName: string;
+  variableName: string; // Shown in the "Variable Name" column.
+}
+
 interface SearchResponse {
+  intent: "auto" | "study" | "variable";
   message: string | null;
   query: {
     mentions: Mention[];
@@ -63,6 +77,8 @@ interface SearchResponse {
     totalMs: number;
   };
   totalStudies: number;
+  totalVariables: number;
+  variables: Variable[];
 }
 
 interface UserMessage {
@@ -299,46 +315,100 @@ function AssistantResponse({
         </SectionRow>
       )}
 
-      <ResultCount>
-        Found {response.totalStudies}{" "}
-        {response.totalStudies === 1 ? "study" : "studies"} in {totalSeconds}s
-      </ResultCount>
-
-      {response.studies.length > 0 && (
-        <StudyTable>
-          <Table size="small">
-            <TableHead>
-              <TableRow>
-                <TableCell>Title</TableCell>
-                <TableCell>dbGaP Id</TableCell>
-                <TableCell>Platform</TableCell>
-                <TableCell>Focus / Disease</TableCell>
-                <TableCell>Data Type</TableCell>
-                <TableCell>Participants</TableCell>
-                <TableCell>Study Design</TableCell>
-                <TableCell>Consent Code</TableCell>
-              </TableRow>
-            </TableHead>
-            <TableBody>
-              {response.studies.map((study, i) => (
-                <TableRow key={i}>
-                  <TableCell>{study.title}</TableCell>
-                  <TableCell>{study.dbGapId}</TableCell>
-                  <TableCell>{study.platforms.join(", ")}</TableCell>
-                  <TableCell>{study.focus}</TableCell>
-                  <TableCell>{study.dataTypes.join(", ")}</TableCell>
-                  <TableCell>
-                    {study.participantCount != null
-                      ? study.participantCount.toLocaleString()
-                      : "—"}
-                  </TableCell>
-                  <TableCell>{study.studyDesigns.join(", ")}</TableCell>
-                  <TableCell>{study.consentCodes.join(", ")}</TableCell>
-                </TableRow>
-              ))}
-            </TableBody>
-          </Table>
-        </StudyTable>
+      {response.intent === "auto" ? null : response.intent === "variable" ? (
+        <>
+          <ResultCount>
+            Found {response.totalVariables}{" "}
+            {response.totalVariables === 1 ? "variable" : "variables"} in{" "}
+            {totalSeconds}s
+          </ResultCount>
+          {response.variables.length > 0 && (
+            <StudyTable>
+              <Table size="small">
+                <TableHead>
+                  <TableRow>
+                    <TableCell>Concept</TableCell>
+                    <TableCell>Variable Name</TableCell>
+                    <TableCell>Description</TableCell>
+                    <TableCell>Study</TableCell>
+                    <TableCell>dbGaP</TableCell>
+                  </TableRow>
+                </TableHead>
+                <TableBody>
+                  {response.variables.map((v, i) => (
+                    <TableRow key={i}>
+                      <TableCell>{v.concept}</TableCell>
+                      <TableCell>{v.variableName}</TableCell>
+                      <TableCell>{v.description}</TableCell>
+                      <TableCell>
+                        <a
+                          href={v.studyUrl}
+                          rel="noopener noreferrer"
+                          target="_blank"
+                        >
+                          {v.studyTitle || v.studyId}
+                        </a>
+                      </TableCell>
+                      <TableCell>
+                        <a
+                          href={v.dbGapUrl}
+                          rel="noopener noreferrer"
+                          target="_blank"
+                        >
+                          {v.phvId}
+                        </a>
+                      </TableCell>
+                    </TableRow>
+                  ))}
+                </TableBody>
+              </Table>
+            </StudyTable>
+          )}
+        </>
+      ) : (
+        <>
+          <ResultCount>
+            Found {response.totalStudies}{" "}
+            {response.totalStudies === 1 ? "study" : "studies"} in{" "}
+            {totalSeconds}s
+          </ResultCount>
+          {response.studies.length > 0 && (
+            <StudyTable>
+              <Table size="small">
+                <TableHead>
+                  <TableRow>
+                    <TableCell>Title</TableCell>
+                    <TableCell>dbGaP Id</TableCell>
+                    <TableCell>Platform</TableCell>
+                    <TableCell>Focus / Disease</TableCell>
+                    <TableCell>Data Type</TableCell>
+                    <TableCell>Participants</TableCell>
+                    <TableCell>Study Design</TableCell>
+                    <TableCell>Consent Code</TableCell>
+                  </TableRow>
+                </TableHead>
+                <TableBody>
+                  {response.studies.map((study, i) => (
+                    <TableRow key={i}>
+                      <TableCell>{study.title}</TableCell>
+                      <TableCell>{study.dbGapId}</TableCell>
+                      <TableCell>{study.platforms.join(", ")}</TableCell>
+                      <TableCell>{study.focus}</TableCell>
+                      <TableCell>{study.dataTypes.join(", ")}</TableCell>
+                      <TableCell>
+                        {study.participantCount != null
+                          ? study.participantCount.toLocaleString()
+                          : "—"}
+                      </TableCell>
+                      <TableCell>{study.studyDesigns.join(", ")}</TableCell>
+                      <TableCell>{study.consentCodes.join(", ")}</TableCell>
+                    </TableRow>
+                  ))}
+                </TableBody>
+              </Table>
+            </StudyTable>
+          )}
+        </>
       )}
     </AssistantBubble>
   );
 
@@ -5,8 +5,9 @@
 #   Prod: aws sso login --profile ncpi-prod-deployer
 #
 # Quick start:
-#   make deploy-dev     # build + push to dev
-#   make deploy-prod    # build + push to prod
+#   make start          # start local API server (rebuilds index if needed)
+#   make restart        # stop + start
+#   make stop           # stop running server
 
 # Image settings
 IMAGE_NAME = ncpi-search-api
@@ -21,23 +22,68 @@ PROD_AWS_PROFILE = ncpi-prod-deployer
 PROD_ECR_REPO = 701973344956.dkr.ecr.us-east-1.amazonaws.com/5wu-ncpi-dataset-catalog
 
 AWS_REGION = us-east-1
+PID_FILE = .server.pid
 
-.PHONY: help build push-dev push-prod deploy-dev deploy-prod
+.PHONY: help build push-dev push-prod deploy-dev deploy-prod \
+        start stop restart db-reload evals
 
 # Default target
 help:
-	@echo "NCPI Search API - Docker Build & Deploy"
+	@echo "NCPI Search API"
 	@echo ""
-	@echo "Available targets:"
-	@echo "  help          - Show this help message"
-	@echo "  build         - Build Docker image locally"
-	@echo "  push-dev      - Tag and push image to dev ECR"
-	@echo "  push-prod     - Tag and push image to prod ECR"
-	@echo "  deploy-dev    - Build + push to dev (all-in-one)"
-	@echo "  deploy-prod   - Build + push to prod (all-in-one)"
+	@echo "Local development:"
+	@echo "  start       - Start local API server on :8000 (background)"
+	@echo "  stop        - Stop running local server"
+	@echo "  restart     - Stop + start"
+	@echo "  db-reload   - Rebuild DuckDB index and restart server"
+	@echo "  evals       - Run extract agent evals"
 	@echo ""
-	@echo "Quick start:"
-	@echo "  make deploy-dev"
+	@echo "Docker deploy:"
+	@echo "  build       - Build Docker image locally"
+	@echo "  deploy-dev  - Build + push to dev ECR"
+	@echo "  deploy-prod - Build + push to prod ECR"
+
+# --- Local development -------------------------------------------------
+
+# Start API server in background: runs stop first, launches uvicorn with output redirected to .server.log, records its PID in $(PID_FILE), then after 2s checks the process is alive (on failure: prints the log, removes the PID file, exits 1)
+start: stop
+	@echo "Starting API server on http://localhost:8000 ..."
+	@.venv/bin/python -m uvicorn concept_search.api:app \
+		--host 0.0.0.0 --port 8000 > .server.log 2>&1 & echo $$! > $(PID_FILE)
+	@sleep 2
+	@if kill -0 $$(cat $(PID_FILE)) 2>/dev/null; then \
+		echo "Server running (PID $$(cat $(PID_FILE))), log: backend/.server.log"; \
+	else \
+		echo "Server failed to start. Check backend/.server.log"; \
+		cat .server.log; \
+		rm -f $(PID_FILE); \
+		exit 1; \
+	fi
+
+# Stop running server: kill the PID recorded in $(PID_FILE) if present, then kill only whatever is still LISTENing on tcp:8000 (never client processes that merely hold a connection to that port); always succeeds
+stop:
+	@if [ -f $(PID_FILE) ]; then \
+		kill $$(cat $(PID_FILE)) 2>/dev/null && echo "Server stopped." || true; \
+		rm -f $(PID_FILE); \
+	fi
+	@lsof -ti tcp:8000 -sTCP:LISTEN | xargs kill 2>/dev/null || true
+
+# Stop + start (start already lists stop as a prerequisite, so stop runs once)
+restart: stop start
+
+# Rebuild DuckDB index and restart server: stops the server, deletes ../catalog/concept-search.duckdb, calls concept_search.index.get_index() (expected to rebuild the missing index) and prints the study count, then runs start via a sub-make
+db-reload: stop
+	@echo "Deleting cached DuckDB index..."
+	@rm -f ../catalog/concept-search.duckdb
+	@echo "Rebuilding index (this may take a moment)..."
+	@.venv/bin/python -c "from concept_search.index import get_index; idx = get_index(); print(f'Index rebuilt: {idx.store.study_count} studies')"
+	@$(MAKE) start
+
+# Run extract agent evals (python -m concept_search.eval_extract using the local .venv interpreter; the command itself is echoed)
+evals:
+	.venv/bin/python -m concept_search.eval_extract
+
+# --- Docker deploy ------------------------------------------------------
 
 # Build Docker image
 build:
 
@@ -1,8 +1,37 @@
-You are a query parser for the NCPI Dataset Catalog. Your job is to extract **mentions** from a researcher's natural-language query. A mention is a phrase that refers to a filterable property of a dataset.
+You are a query parser for the NCPI Dataset Catalog. Your job is to extract searchable **mentions** from a researcher's natural-language query. The catalog supports two search modes: finding **datasets/studies** and finding **measured variables**. You determine which mode the user intends, then extract the relevant facet mentions either way.
 
 ## Your Job
 
-Identify each distinct mention in the query, assign it to a facet, and extract the text. For small facets (platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry), resolve the values directly from the known lists below. For other facets (focus, measurement, consentCode), just extract the text — a separate agent will resolve the canonical values.
+1. Determine the query **intent**: is the user searching for studies/datasets, or for specific measured variables?
+2. Extract mentions from the query regardless of intent — the same facets apply to both modes. Assign each mention to a facet and extract the text. For small facets (platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry), resolve the values directly from the known lists below. For other facets (focus, measurement, consentCode), just extract the text — a separate agent will resolve the canonical values.
+
+## Query Intent
+
+Set the `intent` field to one of:
+
+| Intent       | When to Use                                                     | Examples                                                                                                                          |
+| ------------ | --------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- |
+| `"study"`    | User wants to find studies or datasets                          | "diabetes datasets on AnVIL", "cancer studies with WGS", "cohorts released after 2024"                                            |
+| `"variable"` | User wants to find specific measured variables                  | "what variables measure chocolate consumption?", "which phenotype variables capture BMI?", "what is measured for blood pressure?" |
+| `"auto"`     | You cannot determine intent from context — set `message` to ask | "blood pressure" (could be studies about BP or variables measuring BP)                                                            |
+
+**Signals for `"variable"` intent:**
+
+- "variable(s)", "what is measured", "what measures", "which measurements"
+- "columns", "fields", "phenotype variables", "what data is collected"
+- Questions of the form "what variables..." or "which variables..."
+
+**Signals for `"study"` intent:**
+
+- "study/studies", "dataset(s)", "cohort(s)", "trial(s)"
+- Platform references ("on AnVIL", "in BDC")
+- Study-level facets (consent codes, study designs, demographics, platforms)
+
+**Default behavior:**
+
+- If the query mentions platforms, consent codes, study designs, demographics, or other study-level facets → default to `"study"`
+- If the query specifically asks about what is measured or what variables exist → default to `"variable"`
+- If intent is truly ambiguous, set `intent: "auto"` and add a `message`: "Are you looking for studies about [X], or for variables that measure [X]?"
 
 ## Facets
 
@@ -58,16 +87,23 @@ For these facets, extract the user's text and leave `values` empty. A resolve ag
 
 ## Instructions
 
-1. Read the query and identify each distinct filterable mention.
-2. Assign each mention to a facet.
-3. For platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry: set `values` to the matching known value(s).
-4. For focus, measurement, consentCode: set `text` to the relevant phrase, leave `values` empty.
-5. Correct obvious typos in your text output (e.g., "systollic" → "systolic").
-6. Expand abbreviations (e.g., "SBP" → "systolic blood pressure", "BMI" → "body mass index").
-7. For small facets, ONLY when the user explicitly says "or" (e.g., "WGS or WXS"), create **one mention** with both values in the `values` list. The OR is expressed by having multiple values in a single mention.
-8. For other facets, ONLY when the user explicitly says "or", create **one mention** with the combined text.
-9. When the user says "and" between items of the same facet (e.g., "AnVIL and BDC", "heart disease and diabetes"), always create **separate mentions** — one per item. "And" means the user wants studies matching BOTH, not either. Similarly, create separate mentions for "but not", "excluding", etc. A separate agent handles the boolean logic.
-10. Do NOT invent values for focus, measurement, or consentCode — leave `values` empty for those.
+1. Determine the query **intent** (`"study"`, `"variable"`, or `"auto"`) — see "Query Intent" above.
+2. Read the query and identify each distinct filterable mention.
+3. Assign each mention to a facet.
+4. For platform, dataType, studyDesign, sex, raceEthnicity, computedAncestry: set `values` to the matching known value(s).
+5. For focus, measurement, consentCode: set `text` to the relevant phrase, leave `values` empty.
+6. Correct obvious typos in your text output (e.g., "systollic" → "systolic").
+7. Expand abbreviations (e.g., "SBP" → "systolic blood pressure", "BMI" → "body mass index").
+8. For small facets, ONLY when the user explicitly says "or" (e.g., "WGS or WXS"), create **one mention** with both values in the `values` list. The OR is expressed by having multiple values in a single mention.
+9. For other facets, ONLY when the user explicitly says "or", create **one mention** with the combined text.
+10. When the user says "and" between items of the same facet (e.g., "AnVIL and BDC", "heart disease and diabetes"), always create **separate mentions** — one per item. "And" means the user wants results (studies or variables) matching BOTH, not either. Similarly, create separate mentions for "but not", "excluding", etc. A separate agent handles the boolean logic.
+11. Do NOT invent values for focus, measurement, or consentCode — leave `values` empty for those.
+
+### Variable intent examples
+
+- "what variables measure chocolate consumption?" → `intent: "variable"`, mention: `{facet: "measurement", text: "chocolate consumption"}`
+- "which variables capture blood pressure?" → `intent: "variable"`, mention: `{facet: "measurement", text: "blood pressure"}`
+- "what phenotype variables exist for BMI?" → `intent: "variable"`, mention: `{facet: "measurement", text: "body mass index"}`
 
 ## When to Set `message`
 
@@ -76,5 +112,6 @@ If the query is too vague, ambiguous, or contains no searchable concepts, set `m
 - **No searchable terms:** "I couldn't identify any searchable terms. Try specifying a disease (e.g., diabetes), measurement (e.g., blood pressure), or data type (e.g., WGS)."
 - **Ambiguous term:** "I'm not sure what 'the blood one' refers to. Did you mean a measurement like blood pressure or blood glucose, or a disease like a blood disorder?"
 - **Partially vague:** Extract what you can and set `message` for the unclear part. E.g., for "diabetes studies with that thing" → extract focus="diabetes", message="I couldn't identify what 'that thing' refers to. Could you be more specific?"
+- **Ambiguous intent:** When a query could be either a study search or a variable search, set `intent: "auto"` and `message`: "Are you looking for studies about [X], or for variables that measure [X]?"
 
 Leave `message` as null when the query is clear.