Skip to content

Commit 2e0428c

Browse files
authored
Merge pull request #34 from neurostuff/enh/use_different_machine_sizes
[ENH] use different machine sizes
2 parents 6e24cc5 + 69f4e86 commit 2e0428c

File tree

8 files changed

+344
-40
lines changed

8 files changed

+344
-40
lines changed

Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,12 @@
11
FROM python:3.13-slim
22

3+
ARG COMPOSE_RUNNER_VERSION
4+
ENV COMPOSE_RUNNER_VERSION=${COMPOSE_RUNNER_VERSION}
5+
LABEL org.opencontainers.image.title="compose-runner ecs task"
6+
LABEL org.opencontainers.image.version=${COMPOSE_RUNNER_VERSION}
7+
8+
RUN test -n "$COMPOSE_RUNNER_VERSION" || (echo "COMPOSE_RUNNER_VERSION build arg is required" && exit 1)
9+
310
RUN apt-get update && apt-get install -y \
411
git \
512
&& rm -rf /var/lib/apt/lists/*

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ The deployed architecture works like this:
4444
Pass `-c resultsBucketName=<bucket>` to use an existing S3 bucket, or omit it
4545
to let the stack create and retain a dedicated bucket. Additional knobs:
4646

47-
- `-c stateMachineTimeoutSeconds=7200` to control the max wall clock per run
47+
- `-c stateMachineTimeoutSeconds=32400` to control the max wall clock per run
4848
- `-c submitTimeoutSeconds` / `-c statusTimeoutSeconds` / `-c pollTimeoutSeconds`
4949
to tune Lambda timeouts
5050
- `-c taskEphemeralStorageGiB` if the default 21 GiB scratch volume is insufficient

compose_runner/aws_lambda/run_handler.py

Lines changed: 71 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import logging
55
import os
66
import uuid
7+
import urllib.error
8+
import urllib.request
79
from typing import Any, Dict, Optional
810

911
import boto3
@@ -22,20 +24,84 @@
2224
NSC_KEY_ENV = "NSC_KEY"
2325
NV_KEY_ENV = "NV_KEY"
2426

27+
DEFAULT_TASK_SIZE = "standard"
28+
2529

2630
def _log(job_id: str, message: str, **details: Any) -> None:
    """Emit one structured JSON log line for this job.

    Extra keyword arguments are merged into the logged payload alongside
    the job id and message.
    """
    # One JSON object per line keeps downstream ingestion/filtering simple.
    record: Dict[str, Any] = {"job_id": job_id, "message": message}
    record.update(details)
    logger.info(json.dumps(record))
3034

3135

36+
def _compose_api_base_url(environment: str) -> str:
37+
env = (environment or "production").lower()
38+
if env == "staging":
39+
return "https://synth.neurostore.xyz/api"
40+
if env == "local":
41+
return "http://localhost:81/api"
42+
return "https://compose.neurosynth.org/api"
43+
44+
45+
def _fetch_meta_analysis(meta_analysis_id: str, environment: str) -> Optional[Dict[str, Any]]:
    """Fetch the nested meta-analysis document from the compose API.

    Returns the parsed JSON document, or ``None`` on any fetch/parse
    failure so callers can fall back to default behavior instead of
    failing the submission.
    """
    base_url = _compose_api_base_url(environment).rstrip("/")
    url = f"{base_url}/meta-analyses/{meta_analysis_id}?nested=true"
    request = urllib.request.Request(url, headers={"User-Agent": "compose-runner/submit"})
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            return json.load(response)
    # URLError already covers HTTPError, so listing both was redundant.
    # TimeoutError is added because a socket read timeout during the body
    # read (json.load) surfaces as TimeoutError, not URLError, and this
    # helper must never raise.
    except (urllib.error.URLError, TimeoutError, json.JSONDecodeError) as exc:
        logger.warning("Failed to fetch meta-analysis %s: %s", meta_analysis_id, exc)
        return None
55+
56+
57+
def _requires_large_task(specification: Dict[str, Any]) -> bool:
58+
if not isinstance(specification, dict):
59+
return False
60+
corrector = specification.get("corrector")
61+
if not isinstance(corrector, dict):
62+
return False
63+
if corrector.get("type") != "FWECorrector":
64+
return False
65+
args = corrector.get("args")
66+
if not isinstance(args, dict):
67+
return False
68+
method = args.get("method")
69+
if method is None:
70+
kwargs = args.get("**kwargs")
71+
if isinstance(kwargs, dict):
72+
method = kwargs.get("method")
73+
if isinstance(method, str) and method.lower() == "montecarlo":
74+
return True
75+
return False
76+
77+
78+
def _select_task_size(meta_analysis_id: str, environment: str, artifact_prefix: str) -> str:
    """Choose the ECS task size ("large" or DEFAULT_TASK_SIZE) for a run.

    Fetches the meta-analysis spec from the compose API and upgrades to
    the large task for montecarlo FWE correction. Every failure path —
    fetch failure or spec-evaluation error — falls back to the default
    size so submission is never blocked by this heuristic.
    """
    document = _fetch_meta_analysis(meta_analysis_id, environment)
    if not document:
        return DEFAULT_TASK_SIZE
    spec = document.get("specification")
    try:
        if not _requires_large_task(spec):
            return DEFAULT_TASK_SIZE
        _log(
            artifact_prefix,
            "workflow.task_size_selected",
            task_size="large",
            reason="montecarlo_fwe",
        )
        return "large"
    except Exception as exc:  # noqa: BLE001 - sizing is strictly best-effort
        logger.warning("Failed to evaluate specification for %s: %s", meta_analysis_id, exc)
        return DEFAULT_TASK_SIZE
95+
96+
3297
def _job_input(
3398
payload: Dict[str, Any],
3499
artifact_prefix: str,
35100
bucket: Optional[str],
36101
prefix: Optional[str],
37102
nsc_key: Optional[str],
38103
nv_key: Optional[str],
104+
task_size: str,
39105
) -> Dict[str, Any]:
40106
no_upload_flag = bool(payload.get("no_upload", False))
41107
doc: Dict[str, Any] = {
@@ -44,6 +110,7 @@ def _job_input(
44110
"environment": payload.get("environment", "production"),
45111
"no_upload": "true" if no_upload_flag else "false",
46112
"results": {"bucket": bucket or "", "prefix": prefix or ""},
113+
"task_size": task_size,
47114
}
48115
n_cores = payload.get("n_cores")
49116
doc["n_cores"] = str(n_cores) if n_cores is not None else ""
@@ -76,7 +143,10 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
76143
nsc_key = payload.get("nsc_key") or os.environ.get(NSC_KEY_ENV)
77144
nv_key = payload.get("nv_key") or os.environ.get(NV_KEY_ENV)
78145

79-
job_input = _job_input(payload, artifact_prefix, bucket, prefix, nsc_key, nv_key)
146+
environment = payload.get("environment", "production")
147+
task_size = _select_task_size(payload["meta_analysis_id"], environment, artifact_prefix)
148+
149+
job_input = _job_input(payload, artifact_prefix, bucket, prefix, nsc_key, nv_key, task_size)
80150
params = {
81151
"stateMachineArn": os.environ[STATE_MACHINE_ARN_ENV],
82152
"name": artifact_prefix,

compose_runner/ecs_task.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ def main() -> None:
9393
nv_key = os.environ.get(NV_KEY_ENV) or None
9494
no_upload = _bool_from_env(os.environ.get(NO_UPLOAD_ENV))
9595
n_cores = _resolve_n_cores(os.environ.get(N_CORES_ENV))
96+
compose_runner_version = os.environ.get("COMPOSE_RUNNER_VERSION", "unknown")
9697

9798
bucket = os.environ.get(RESULTS_BUCKET_ENV)
9899
prefix = os.environ.get(RESULTS_PREFIX_ENV)
@@ -106,6 +107,7 @@ def main() -> None:
106107
meta_analysis_id=meta_analysis_id,
107108
environment=environment,
108109
no_upload=no_upload,
110+
compose_runner_version=compose_runner_version,
109111
)
110112
try:
111113
url, _ = run_compose(
@@ -125,6 +127,7 @@ def main() -> None:
125127
"result_url": url,
126128
"artifacts_bucket": bucket,
127129
"artifacts_prefix": prefix,
130+
"compose_runner_version": compose_runner_version,
128131
}
129132

130133
if bucket:
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
interactions:
2+
- request:
3+
method: GET
4+
uri: https://synth.neurostore.xyz/api/meta-analyses/ZPSvyvhZAopz?nested=true
5+
body: null
6+
headers:
7+
Accept:
8+
- '*/*'
9+
Accept-Encoding:
10+
- gzip, deflate
11+
Connection:
12+
- keep-alive
13+
User-Agent:
14+
- python-requests/2.32.4
15+
response:
16+
status:
17+
code: 200
18+
message: OK
19+
headers:
20+
Server:
21+
- nginx/1.21.6
22+
Date:
23+
- Tue, 21 Oct 2025 14:08:45 GMT
24+
Content-Type:
25+
- application/json
26+
Transfer-Encoding:
27+
- chunked
28+
Connection:
29+
- keep-alive
30+
Vary:
31+
- Accept-Encoding
32+
Content-Encoding:
33+
- gzip
34+
Strict-Transport-Security:
35+
- max-age=31536000
36+
body:
37+
string: '{"id": "ZPSvyvhZAopz", "created_at": "2025-10-21T04:57:40.236536+00:00",
38+
"updated_at": null, "user": "github|12564882", "username": "James Kent", "name":
39+
"Untitled MKDADensity Meta Analysis: included", "description": "MKDADensity
40+
meta analysis with FWECorrector", "provenance": null, "specification": {"id":
41+
"zQdMa4uAaYYU", "created_at": "2025-10-21T04:57:39.888528+00:00", "updated_at":
42+
null, "user": "github|12564882", "username": "James Kent", "type": "CBMA",
43+
"estimator": {"type": "MKDADensity", "args": {"null_method": "approximate",
44+
"n_iters": 5000, "**kwargs": {}, "kernel__r": 10, "kernel__value": 1}}, "database_studyset":
45+
null, "filter": "included", "corrector": {"type": "FWECorrector", "args":
46+
{"voxel_thresh": 0.001, "n_iters": 5000, "vfwe_only": false, "method": "montecarlo"}},
47+
"conditions": [true], "weights": [1.0]}, "neurostore_analysis": {"id": "8S5xRedCGRkz",
48+
"created_at": "2025-10-21T04:57:40.255480+00:00", "updated_at": null, "neurostore_id":
49+
null, "exception": null, "traceback": null, "status": "PENDING"}, "studyset":
50+
{"id": "9jPvdkuRufUP", "created_at": "2025-10-21T04:57:40.008456+00:00", "updated_at":
51+
null, "user": "github|12564882", "username": "James Kent", "snapshot": null,
52+
"neurostore_id": "3EmvH2LELwR2", "version": null, "url": "https://neurostore.org/api/studysets/3EmvH2LELwR2"},
53+
"annotation": {"id": "YVLt6DRFKdd5", "created_at": "2025-10-21T04:57:40.121637+00:00",
54+
"updated_at": null, "user": "github|12564882", "username": "James Kent", "snapshot":
55+
null, "neurostore_id": "TebrRstj8ofh", "studyset": "3EmvH2LELwR2", "url":
56+
"https://neurostore.org/api/annotations/TebrRstj8ofh"}, "project": "D2cTfoxNfpLy",
57+
"cached_studyset": "9jPvdkuRufUP", "cached_annotation": "YVLt6DRFKdd5", "run_key":
58+
"PDeDnh_8MXc88xoVJySz3w", "results": [], "neurostore_url": null}'
59+
http_version: HTTP/1.1
60+
version: 1
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
interactions:
2+
- request:
3+
body: null
4+
headers:
5+
Connection:
6+
- close
7+
Host:
8+
- synth.neurostore.xyz
9+
User-Agent:
10+
- compose-runner/submit
11+
method: GET
12+
uri: https://synth.neurostore.xyz/api/meta-analyses/VtFZJFniCKvG?nested=true
13+
response:
14+
body:
15+
string: '{"id": "VtFZJFniCKvG", "created_at": "2025-10-21T14:10:35.309383+00:00",
16+
"updated_at": null, "user": "github|12564882", "username": "James Kent", "name":
17+
"Untitled MKDADensity Meta Analysis: included (1)", "description": "MKDADensity
18+
meta analysis with FDRCorrector", "provenance": null, "specification": {"id":
19+
"DtVzKEKGaXLu", "created_at": "2025-10-21T14:10:34.564365+00:00", "updated_at":
20+
null, "user": "github|12564882", "username": "James Kent", "type": "CBMA",
21+
"estimator": {"type": "MKDADensity", "args": {"null_method": "approximate",
22+
"n_iters": 5000, "**kwargs": {}, "kernel__r": 10, "kernel__value": 1}}, "database_studyset":
23+
null, "filter": "included", "corrector": {"type": "FDRCorrector", "args":
24+
{"method": "indep", "alpha": 0.05}}, "conditions": [true], "weights": [1.0]},
25+
"neurostore_analysis": {"id": "564c8kRnJVT4", "created_at": "2025-10-21T14:10:35.325173+00:00",
26+
"updated_at": null, "neurostore_id": null, "exception": null, "traceback":
27+
null, "status": "PENDING"}, "studyset": {"id": "FA3BDBdGRZ5d", "created_at":
28+
"2025-10-21T14:10:34.821625+00:00", "updated_at": null, "user": "github|12564882",
29+
"username": "James Kent", "snapshot": null, "neurostore_id": "3EmvH2LELwR2",
30+
"version": null, "url": "https://neurostore.org/api/studysets/3EmvH2LELwR2"},
31+
"annotation": {"id": "XELVYV7ftp7e", "created_at": "2025-10-21T14:10:35.183354+00:00",
32+
"updated_at": null, "user": "github|12564882", "username": "James Kent", "snapshot":
33+
null, "neurostore_id": "TebrRstj8ofh", "studyset": "3EmvH2LELwR2", "url":
34+
"https://neurostore.org/api/annotations/TebrRstj8ofh"}, "project": "D2cTfoxNfpLy",
35+
"cached_studyset": "FA3BDBdGRZ5d", "cached_annotation": "XELVYV7ftp7e", "run_key":
36+
"V_jTcP2zfNlWD4KhwKKcJw", "results": [], "neurostore_url": null}'
37+
headers:
38+
Connection:
39+
- close
40+
Content-Length:
41+
- '1750'
42+
Content-Type:
43+
- application/json
44+
Date:
45+
- Tue, 21 Oct 2025 14:14:50 GMT
46+
Server:
47+
- nginx/1.21.6
48+
Strict-Transport-Security:
49+
- max-age=31536000
50+
Vary:
51+
- Accept-Encoding
52+
status:
53+
code: 200
54+
message: OK
55+
version: 1

compose_runner/tests/test_lambda_handlers.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from datetime import datetime, timezone
55
from typing import Any, Dict
66

7+
import pytest
8+
79
from compose_runner.aws_lambda import log_poll_handler, results_handler, run_handler, status_handler
810

911

@@ -23,6 +25,28 @@ def _make_http_event(payload: Dict[str, Any]) -> Dict[str, Any]:
2325
}
2426

2527

28+
def test_requires_large_task_detection():
    """A montecarlo FWECorrector spec must route to the large task."""
    specification = {
        "corrector": {"type": "FWECorrector", "args": {"method": "montecarlo"}}
    }
    assert run_handler._requires_large_task(specification)
31+
32+
33+
def test_requires_large_task_false_when_method_differs():
    """Non-montecarlo FWE methods stay on the standard task size."""
    specification = {
        "corrector": {"type": "FWECorrector", "args": {"method": "bonferroni"}}
    }
    assert run_handler._requires_large_task(specification) is False
36+
37+
38+
@pytest.mark.vcr(record_mode="once")
def test_select_task_size_uses_large_for_montecarlo():
    """Recorded staging response with a montecarlo FWE spec selects 'large'."""
    selected = run_handler._select_task_size("ZPSvyvhZAopz", "staging", "artifact-test")
    assert selected == "large"
42+
43+
44+
@pytest.mark.vcr(record_mode="once")
def test_select_task_size_uses_standard_for_fdr():
    """Recorded staging response with an FDRCorrector keeps the default size."""
    selected = run_handler._select_task_size("VtFZJFniCKvG", "staging", "artifact-test")
    assert selected == "standard"
48+
49+
2650
def test_run_handler_http_success(monkeypatch, tmp_path):
2751
captured = {}
2852

@@ -36,6 +60,7 @@ class ExecutionAlreadyExists(Exception):
3660
...
3761

3862
monkeypatch.setattr(run_handler, "_SFN_CLIENT", FakeSFN())
63+
monkeypatch.setattr(run_handler, "_select_task_size", lambda *args: "standard")
3964
monkeypatch.setenv("STATE_MACHINE_ARN", "arn:aws:states:state-machine")
4065
monkeypatch.setenv("RESULTS_BUCKET", "bucket")
4166
monkeypatch.setenv("RESULTS_PREFIX", "prefix")
@@ -63,6 +88,32 @@ class ExecutionAlreadyExists(Exception):
6388
assert input_doc["results"]["prefix"] == "prefix"
6489
assert input_doc["nsc_key"] == "nsc"
6590
assert input_doc["nv_key"] == "nv"
91+
assert input_doc["task_size"] == "standard"
92+
93+
94+
def test_run_handler_http_uses_large_task(monkeypatch):
    """When task-size selection says 'large', the SFN input carries it."""
    recorded_call = {}

    class FakeSFN:
        # Mimics the boto3 Step Functions client surface used by the handler.
        def start_execution(self, **kwargs):
            recorded_call.update(kwargs)
            return {"executionArn": "arn:aws:states:us-east-1:123:execution:state-machine:run-456"}

        class exceptions:
            class ExecutionAlreadyExists(Exception):
                ...

    monkeypatch.setattr(run_handler, "_SFN_CLIENT", FakeSFN())
    # Force the size heuristic so the test does not hit the network.
    monkeypatch.setattr(run_handler, "_select_task_size", lambda *args: "large")
    monkeypatch.setenv("STATE_MACHINE_ARN", "arn:aws:states:state-machine")
    monkeypatch.setenv("RESULTS_BUCKET", "bucket")
    monkeypatch.setenv("RESULTS_PREFIX", "prefix")

    event = _make_http_event({"meta_analysis_id": "abc123"})
    response = run_handler.handler(event, DummyContext())

    assert response["statusCode"] == 202
    input_doc = json.loads(recorded_call["input"])
    assert input_doc["task_size"] == "large"
66117

67118

68119
def test_run_handler_missing_meta_analysis(monkeypatch):

0 commit comments

Comments
 (0)