jjackson
diff --git a/‎.claude/AGENTS.md‎
Lines changed: 79 additions & 1 deletion b/‎.claude/AGENTS.md‎
Lines changed: 79 additions & 1 deletion
diff --git a/‎.claude/launch.json‎
Lines changed: 6 additions & 0 deletions b/‎.claude/launch.json‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎commcare_connect/audit/analysis_config.py‎
Lines changed: 3 additions & 1 deletion b/‎commcare_connect/audit/analysis_config.py‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎commcare_connect/audit/data_access.py‎
Lines changed: 113 additions & 23 deletions b/‎commcare_connect/audit/data_access.py‎
Lines changed: 113 additions & 23 deletions
@@ -95,6 +95,40 @@ Structured audits of FLW visits with AI-powered reviews.
 - **AI review:** `audit/ai_review.py` runs validation agents on individual visits
 - **Uses:** `AnalysisPipeline` for visit data filtering
 
+#### Audit API Contracts (used by workflow templates)
+
+**Create async** `POST /audit/api/audit/create-async/`
+```json
+{ "opportunities": [{"id": 1, "name": "..."}], "criteria": {
+    "audit_type": "date_range|last_n_per_opp",
+    "start_date": "YYYY-MM-DD", "end_date": "YYYY-MM-DD",
+    "count_per_opp": 10, "sample_percentage": 100,
+    "related_fields": [{"image_path": "...", "filter_by_image": true}]
+  }, "workflow_run_id": 123 }
+```
+Response: `{"success": true, "task_id": "..."}`. Task result has `{"sessions": [{"id", "title", "visits", "images"}]}`.
+
+**Bulk data** `GET /audit/api/<session_id>/bulk-data/`
+Response: `{"assessments": [{id, visit_id, blob_id, question_id, opportunity_id, filename, result, notes, status, image_url, visit_date, entity_name, username, related_fields, ai_result, ai_notes}], ...}`
+Note: `opportunity_id` = `session.opportunity_id` (same for all assessments in a session). `status` = `"pass"|"fail"|"pending"`.
+
+**Save progress** `POST /audit/api/<session_id>/save/`
+FormData: `visit_results` = JSON string of `{visit_id: {assessments: {blob_id: {question_id, result, notes, ai_result, ai_notes}}}}`
+
+**Complete** `POST /audit/api/<session_id>/complete/`
+FormData: `overall_result` (`"pass"|"fail"`), `notes`, `kpi_notes` (can be `""`), `visit_results` (same shape as save).
+
+**AI Review** `POST /audit/api/<session_id>/ai-review/`
+JSON body (NOT FormData): `{"assessments": [{"visit_id", "blob_id", "reading"}], "agent_id": "scale_validation", "opportunity_id": <int>}`
+Response: `{"results": [{"visit_id", "blob_id", "ai_result": "match|no_match|error", "ai_notes": "..."}]}`
+Note: `opportunity_id` is **required**. Use `a.opportunity_id` from the assessment object (not `selected_opps[0].id`).
+
+**Opp search** `GET /audit/api/opportunities/search/?q=<query>`
+Response: `{"opportunities": [{"id", "name"}]}`
+
+**Workflow sessions** `GET /audit/api/workflow/<workflow_run_id>/sessions/`
+Response: `{"sessions": [{"id", ...}]}` — fallback for session_id discovery after async creation.
+
 ### `tasks/` — Task Management
 
 > See also: [`commcare_connect/tasks/README.md`](../commcare_connect/tasks/README.md) for data model details and testing guidance.
@@ -116,10 +150,54 @@ Data-driven workflows with custom React UIs and pipeline integration.
 - **DataAccess:** `WorkflowDataAccess`, `PipelineDataAccess` (both extend `BaseDataAccess`) in `workflow/data_access.py`
 - **Proxy models:** `WorkflowDefinitionRecord`, `WorkflowRenderCodeRecord`, `WorkflowRunRecord`, `WorkflowChatHistoryRecord`, `PipelineDefinitionRecord` (experiment=`"workflow"` / `"pipeline"`)
 - **Key views:** Workflow list (`/workflow/`), definition view, run view
-- **Templates:** Predefined workflow templates in `workflow/templates/` (audit_with_ai_review, performance_review, ocs_outreach)
+- **Templates:** Predefined workflow templates in `workflow/templates/` (audit_with_ai_review, bulk_image_audit, mbw_monitoring_v2, performance_review, ocs_outreach)
 - **Render code:** React components stored as LabsRecords, rendered dynamically in workflow runner
 - **Cross-app:** Can create audit sessions and tasks from workflow actions
 
+#### Workflow Template Anatomy
+
+Each template is a Python file in `workflow/templates/` that exports three module-level names — two dicts (`DEFINITION`, `TEMPLATE`) and one JSX source string (`RENDER_CODE`):
+
+```python
+DEFINITION = {
+    "name": str, "description": str, "version": 1,
+    "templateType": str,         # must match TEMPLATE["key"]
+    "statuses": [...],           # list of {id, label, color}
+    "config": {...},             # e.g. {"showSummaryCards": True}
+    "pipeline_sources": [],
+}
+
+RENDER_CODE = """function WorkflowUI({ definition, instance, workers,
+    pipelines, links, actions, onUpdateState }) {
+    // Full React JSX component — Babel standalone transpiles in-browser, no build step
+    // Inner components defined as const arrows INSIDE WorkflowUI to close over parent state
+    // Phase router at bottom: {phase === 'foo' && <FooPhase />}
+}"""
+
+TEMPLATE = {
+    "key": str,           # e.g. "bulk_image_audit" — unique, used for lookup
+    "name": str,
+    "description": str,
+    "icon": str,          # Font Awesome class e.g. "fa-images"
+    "color": str,         # Tailwind color e.g. "blue"
+    "definition": DEFINITION,
+    "render_code": RENDER_CODE,
+    "pipeline_schema": None,  # or dict for single pipeline; use "pipeline_schemas" list for multi
+}
+```
+
+**Registration:** `__init__.py` auto-discovers templates via `pkgutil.iter_modules`. It also has explicit re-exports at the bottom — **add new templates to both the `from . import` line and `__all__`**.
+
+**JSX-in-Python rules:**
+- Cannot use `"""` inside `RENDER_CODE` (Python string delimiter conflict)
+- Inner components must be defined BEFORE they are used (no hoisting)
+- State for child components is lifted into the outer `WorkflowUI` so it persists across re-renders
+- `onUpdateState(patch)` PATCH-merges into `run.data.state` on the server
+- Workflow props: `{ definition, instance, workers, pipelines, links, actions, onUpdateState }`
+- `actions.createAudit(payload)` → `POST /audit/api/audit/create-async/`
+- `actions.streamAuditProgress(task_id, onProgress, onComplete, onError)` → SSE stream
+- `actions.cancelAudit(task_id)` → cancel endpoint
+
 ### `ai/` — AI Agent Integration
 
 > See also: [`commcare_connect/ai/README.md`](../commcare_connect/ai/README.md) for data model details and testing guidance.
 
@@ -25,6 +25,12 @@
       "runtimeExecutable": "celery",
       "runtimeArgs": ["-A", "config.celery_app", "worker", "-l", "info"],
       "port": 0
+    },
+    {
+      "name": "celery-beat",
+      "runtimeExecutable": "celery",
+      "runtimeArgs": ["-A", "config.celery_app", "beat"],
+      "port": 0
     }
   ]
 }
@@ -68,11 +68,13 @@ def extract_images_with_question_ids(visit_data: dict) -> list[dict]:
 
     # Extract visit-level metadata
     username = visit_data.get("username") or ""
-    visit_date = visit_data.get("visit_date") or ""
     entity_name = visit_data.get("entity_name") or "No Entity"
 
     # Build filename->path map in a SINGLE traversal (O(m) where m=tree size)
     form_data = form_json.get("form", form_json)
+
+    # Use form.meta.timeEnd for actual submission time; fall back to visit_date (date only)
+    visit_date = form_data.get("meta", {}).get("timeEnd") or visit_data.get("visit_date") or ""
     filename_map = _build_filename_map(form_data)
 
     # Now each lookup is O(1) instead of O(m)
 
@@ -98,12 +98,14 @@ def from_dict(cls, data: dict) -> "AuditCriteria":
                 {
                     "image_path": rf.get("image_path") or rf.get("imagePath", ""),
                     "field_path": rf.get("field_path") or rf.get("fieldPath", ""),
+                    "hq_url_path": rf.get("hq_url_path") or rf.get("hqUrlPath", ""),
                     "label": rf.get("label", ""),
                     "filter_by_image": rf.get("filter_by_image") or rf.get("filterByImage", False),
                     "filter_by_field": rf.get("filter_by_field") or rf.get("filterByField", False),
                 }
                 for rf in related_fields_raw
-                if (rf.get("image_path") or rf.get("imagePath")) and (rf.get("field_path") or rf.get("fieldPath"))
+                # Require image_path; field_path is optional (image-only filter rules are valid)
+                if rf.get("image_path") or rf.get("imagePath")
             ]
 
         return cls(
@@ -175,10 +177,17 @@ def filter_visits_for_audit(
     if criteria.selected_flw_user_ids and "username" in df.columns:
         df = df[df["username"].isin(criteria.selected_flw_user_ids)]
 
-    # Apply sample percentage
+    # Apply sample percentage — same percentage per FLW (min 1 visit each) so every FLW is represented, then shuffle
     if criteria.sample_percentage < 100 and len(df) > 0:
-        sample_size = max(1, int(len(df) * criteria.sample_percentage / 100))
-        df = df.sample(n=min(sample_size, len(df)), random_state=42)
+        if "username" in df.columns:
+            groups = []
+            for _, grp in df.groupby("username", dropna=False):
+                n = max(1, int(len(grp) * criteria.sample_percentage / 100))
+                groups.append(grp.sample(n=min(n, len(grp)), random_state=42))
+            df = pd.concat(groups).sample(frac=1, random_state=42)
+        else:
+            sample_size = max(1, int(len(df) * criteria.sample_percentage / 100))
+            df = df.sample(n=min(sample_size, len(df)), random_state=42)
 
     if return_visits:
         return df.to_dict("records")
@@ -737,6 +746,89 @@ def extract_images_for_visits(
             if str(vid) not in result:
                 result[str(vid)] = []
 
+        # Build visit lookup once — shared by enrichment and fallback sections below
+        visit_dict_by_id = {str(v.get("id", "")): v for v in visit_dicts}
+
+        # Fetch cc_domain for building CommCareHQ attachment URLs (cached, ~1 API call per hour)
+        cc_domain = None
+        try:
+            from commcare_connect.workflow.templates.mbw_monitoring.data_fetchers import fetch_opportunity_metadata
+
+            meta = fetch_opportunity_metadata(self.access_token, opp_id)
+            cc_domain = meta.get("cc_domain")
+        except Exception as e:
+            # Intentionally broad: cc_domain is optional for URL construction; any failure
+            # (network, missing key, unexpected format) should degrade gracefully, not block audit.
+            logger.debug(f"[ImageExtract] Could not fetch cc_domain for hq_url construction: {e}")
+
+        # Enrich Connect blob images with xform_id and build hq_url
+        hq_base = settings.COMMCARE_HQ_URL.rstrip("/")
+        for visit_id_str, images in result.items():
+            visit_data = visit_dict_by_id.get(visit_id_str, {})
+            form_json = visit_data.get("form_json", {})
+            xform_id = form_json.get("id") or ""
+            for img in images:
+                img["xform_id"] = xform_id
+                if cc_domain and xform_id and img.get("name") and not img.get("hq_url"):
+                    img["hq_url"] = f"{hq_base}/a/{cc_domain}/api/form/attachment/{xform_id}/{img['name']}"
+
+        # Fallback: for each image rule whose image type a visit lacks (no Connect blob),
+        # build a CommCareHQ URL image entry from form_json using related_fields rules.
+        # Strategy 1: use hq_url_path (pre-computed URL stored in form JSON)
+        # Strategy 2: extract filename from image_path, build HQ attachment URL
+        #   (used when hq_url_path is empty — e.g. dynamic image type discovery
+        #   can't resolve DataBindOnly XForm paths from the HQ app definition API)
+        if related_fields:
+            import hashlib
+
+            image_rules = [r for r in related_fields if r.get("image_path")]
+            if image_rules:
+                for visit_id_str, images in result.items():
+                    visit_data = visit_dict_by_id.get(visit_id_str, {})
+                    form_json = visit_data.get("form_json", {})
+                    form_data = form_json.get("form", form_json)
+                    xform_id = form_json.get("id") or ""
+                    username = visit_data.get("username") or ""
+                    # Use form.meta.timeEnd for actual submission time; fall back to visit_date (date only)
+                    visit_date = form_data.get("meta", {}).get("timeEnd") or visit_data.get("visit_date") or ""
+                    entity_name = visit_data.get("entity_name") or "No Entity"
+                    for rule in image_rules:
+                        hq_url_path = rule.get("hq_url_path", "")
+                        image_path = rule.get("image_path", "")
+
+                        # Skip if this image type is already present (e.g. from Connect blob)
+                        if any(img.get("question_id") == image_path for img in images):
+                            continue
+
+                        # Strategy 1: pre-computed URL field in form JSON
+                        hq_url = None
+                        if hq_url_path:
+                            extracted = self._extract_field_value(form_data, hq_url_path)
+                            if extracted and isinstance(extracted, str) and extracted.startswith("http"):
+                                hq_url = extracted
+
+                        # Strategy 2: build URL from filename stored at image_path
+                        if not hq_url and cc_domain and xform_id and image_path:
+                            filename = self._extract_field_value(form_data, image_path)
+                            if filename and isinstance(filename, str) and not filename.startswith("http"):
+                                hq_url = f"{hq_base}/a/{cc_domain}/api/form/attachment/{xform_id}/{filename}"
+
+                        if hq_url:
+                            blob_id = "hq_" + hashlib.sha256(hq_url.encode()).hexdigest()[:16]
+                            name = hq_url_path.split("/")[-1] if hq_url_path else image_path.split("/")[-1]
+                            images.append(
+                                {
+                                    "blob_id": blob_id,
+                                    "hq_url": hq_url,
+                                    "xform_id": xform_id,
+                                    "name": name,
+                                    "question_id": image_path,
+                                    "username": username,
+                                    "visit_date": visit_date,
+                                    "entity_name": entity_name,
+                                }
+                            )
+
         # Add related field values if rules provided
         if related_fields:
             if progress_callback:
@@ -773,25 +865,23 @@ def _filter_visits_by_related_fields(
         if not filter_rules:
             return visit_images
 
+        image_filter_paths = [r.get("image_path", "") for r in filter_rules if r.get("filter_by_image")]
+        field_filter_rules = [r for r in filter_rules if r.get("filter_by_field")]
+
         filtered_result = {}
         for visit_id, images in visit_images.items():
             include_visit = True
 
-            for rule in filter_rules:
-                image_path = rule.get("image_path", "")
-                field_path = rule.get("field_path", "")
-                filter_by_image = rule.get("filter_by_image", False)
-                filter_by_field = rule.get("filter_by_field", False)
+            # OR logic: include visit if it has ANY of the required image types
+            if image_filter_paths:
+                question_ids = {img.get("question_id") for img in images}
+                if not any(p in question_ids for p in image_filter_paths):
+                    include_visit = False
 
-                # Check if this visit has the required image
-                if filter_by_image:
-                    has_matching_image = any(img.get("question_id") == image_path for img in images)
-                    if not has_matching_image:
-                        include_visit = False
-                        break
-
-                # Check if this visit has the required field value
-                if filter_by_field:
+            # AND logic: visit must satisfy every field filter rule
+            if include_visit:
+                for rule in field_filter_rules:
+                    field_path = rule.get("field_path", "")
                     has_field_value = False
                     for img in images:
                         for rf in img.get("related_fields", []):
@@ -1136,7 +1226,7 @@ def create_audit_creation_job(
         opportunities: list[dict],
     ) -> dict:
         """Create an audit creation job record for tracking async creation."""
-        from datetime import datetime
+        from datetime import datetime, timezone
 
         data = {
             "task_id": task_id,
@@ -1154,8 +1244,8 @@ def create_audit_creation_job(
             },
             "result": None,
             "error": None,
-            "created_at": datetime.now().isoformat(),
-            "updated_at": datetime.now().isoformat(),
+            "created_at": datetime.now(timezone.utc).isoformat(),
+            "updated_at": datetime.now(timezone.utc).isoformat(),
         }
 
         record = self.labs_api.create_record(
@@ -1237,7 +1327,7 @@ def update_audit_creation_job(
         error: str | None = None,
     ) -> dict | None:
         """Update an audit creation job record."""
-        from datetime import datetime
+        from datetime import datetime, timezone
 
         from commcare_connect.labs.models import LocalLabsRecord
 
@@ -1267,7 +1357,7 @@ def update_audit_creation_job(
             data["result"] = result
         if error is not None:
             data["error"] = error
-        data["updated_at"] = datetime.now().isoformat()
+        data["updated_at"] = datetime.now(timezone.utc).isoformat()
 
         # Save
         updated = self.labs_api.update_record(
Original file line number	Diff line number	Diff line change
`@@ -25,6 +25,12 @@`
`25`	`25`	`"runtimeExecutable": "celery",`
`26`	`26`	`"runtimeArgs": ["-A", "config.celery_app", "worker", "-l", "info"],`
`27`	`27`	`"port": 0`
	`28`	`+ },`
	`29`	`+ {`
	`30`	`+ "name": "celery-beat",`
	`31`	`+ "runtimeExecutable": "celery",`
	`32`	`+ "runtimeArgs": ["-A", "config.celery_app", "beat"],`
	`33`	`+ "port": 0`
`28`	`34`	`}`
`29`	`35`	`]`
`30`	`36`	`}`