Skip to content

Commit 83ba7ce

Browse files
authored
Merge pull request #32 from neurostuff/ref/use_ecs
[REF] use ecs
2 parents a9764a1 + 6196678 commit 83ba7ce

File tree

11 files changed

+795
-279
lines changed

11 files changed

+795
-279
lines changed

.github/workflows/deploy.yml

Lines changed: 47 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ jobs:
4141
working-directory: infra/cdk
4242
env:
4343
RESULTS_PREFIX: compose-runner/results
44-
RUN_MEMORY_SIZE: 3008
45-
RUN_TIMEOUT_SECONDS: 900
44+
TASK_CPU: 4096
45+
TASK_MEMORY_MIB: 30720
46+
STATE_MACHINE_TIMEOUT_SECONDS: 7200
4647
run: |
4748
source .venv/bin/activate
4849
VERSION=${GITHUB_REF_NAME}
@@ -51,29 +52,57 @@ jobs:
5152
--outputs-file cdk-outputs.json \
5253
-c composeRunnerVersion=${VERSION} \
5354
-c resultsPrefix=${RESULTS_PREFIX} \
54-
-c runMemorySize=${RUN_MEMORY_SIZE} \
55-
-c runTimeoutSeconds=${RUN_TIMEOUT_SECONDS}
55+
-c taskCpu=${TASK_CPU} \
56+
-c taskMemoryMiB=${TASK_MEMORY_MIB} \
57+
-c stateMachineTimeoutSeconds=${STATE_MACHINE_TIMEOUT_SECONDS}
5658
57-
- name: Smoke test run endpoint
59+
- name: Smoke test submission and status endpoints
5860
working-directory: infra/cdk
5961
run: |
60-
RUN_URL=$(jq -r '.ComposeRunnerStack.ComposeRunnerFunctionUrl' cdk-outputs.json)
61-
if [ -z "$RUN_URL" ] || [ "$RUN_URL" = "null" ]; then
62-
echo "Run Function URL not found in outputs"
62+
SUBMIT_URL=$(jq -r '.ComposeRunnerStack.ComposeRunnerSubmitFunctionUrl' cdk-outputs.json)
63+
STATUS_URL=$(jq -r '.ComposeRunnerStack.ComposeRunnerStatusFunctionUrl' cdk-outputs.json)
64+
if [ -z "$SUBMIT_URL" ] || [ "$SUBMIT_URL" = "null" ]; then
65+
echo "Submit Function URL not found in outputs"
6366
exit 1
6467
fi
68+
if [ -z "$STATUS_URL" ] || [ "$STATUS_URL" = "null" ]; then
69+
echo "Status Function URL not found in outputs"
70+
exit 1
71+
fi
72+
6573
body='{"meta_analysis_id": "pFGy6g3LRo9x", "environment": "production", "no_upload": true}'
66-
response=$(curl -s -w "\n%{http_code}" -X POST "$RUN_URL" -H "Content-Type: application/json" -d "$body")
67-
http_code=$(echo "$response" | tail -n1)
68-
json_body=$(echo "$response" | head -n1)
69-
echo "$json_body" > smoke_run.json
70-
echo "Status code: $http_code"
71-
if [ "$http_code" != "200" ]; then
72-
echo "Run endpoint failed: $json_body"
74+
response=$(curl -s -w "\n%{http_code}" -X POST "$SUBMIT_URL" -H "Content-Type: application/json" -d "$body")
75+
submit_code=$(echo "$response" | tail -n1)
76+
submit_json=$(echo "$response" | head -n1)
77+
echo "$submit_json" > smoke_submit.json
78+
echo "Submit status code: $submit_code"
79+
if [ "$submit_code" != "202" ]; then
80+
echo "Submit endpoint failed: $submit_json"
81+
exit 1
82+
fi
83+
job_id=$(jq -r '.job_id' smoke_submit.json)
84+
artifact_prefix=$(jq -r '.artifact_prefix' smoke_submit.json)
85+
if [ -z "$job_id" ] || [ "$job_id" = "null" ]; then
86+
echo "Submit response missing job_id: $submit_json"
87+
exit 1
88+
fi
89+
if [ -z "$artifact_prefix" ] || [ "$artifact_prefix" = "null" ]; then
90+
echo "Submit response missing artifact_prefix: $submit_json"
91+
exit 1
92+
fi
93+
94+
status_body=$(printf '{"job_id":"%s"}' "$job_id")
95+
status_response=$(curl -s -w "\n%{http_code}" -X POST "$STATUS_URL" -H "Content-Type: application/json" -d "$status_body")
96+
status_code=$(echo "$status_response" | tail -n1)
97+
status_json=$(echo "$status_response" | head -n1)
98+
echo "$status_json" > smoke_status.json
99+
echo "Status status code: $status_code"
100+
if [ "$status_code" != "200" ]; then
101+
echo "Status endpoint failed: $status_json"
73102
exit 1
74103
fi
75-
status=$(jq -r '.status' smoke_run.json)
76-
if [ "$status" != "SUCCEEDED" ]; then
77-
echo "Run endpoint returned non-success status: $json_body"
104+
status_value=$(jq -r '.status' smoke_status.json)
105+
if [ "$status_value" = "null" ] || [ -z "$status_value" ]; then
106+
echo "Status response missing status: $status_json"
78107
exit 1
79108
fi

Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ RUN hatch dep show requirements > requirements.txt && pip install -r requirement
1414

1515
COPY . .
1616

17-
# install the package (more likely to change, leverage caching!)
18-
RUN pip install .
17+
# install the package with AWS extras so the ECS task has boto3, etc.
18+
RUN pip install '.[aws]'
1919

2020
ENTRYPOINT ["compose-run"]

README.md

Lines changed: 32 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,24 @@
33
Python package to execute meta-analyses created using neurosynth compose and NiMARE
44
as the meta-analysis execution engine.
55

6-
## AWS Lambda Deployment
6+
## AWS Deployment
77

8-
This repository includes an AWS CDK application for provisioning the Lambda-based
9-
execution environment and log polling function.
8+
This repository includes an AWS CDK application that turns compose-runner into a
9+
serverless batch pipeline using Step Functions, AWS Lambda, and ECS Fargate.
10+
The deployed architecture works like this:
11+
12+
- `ComposeRunnerSubmit` (Lambda Function URL) accepts HTTP requests, validates
13+
the meta-analysis payload, and starts a Step Functions execution. The response
14+
is immediate and returns both a durable `job_id` (the execution ARN) and the
15+
`artifact_prefix` used for S3 and log correlation.
16+
- A Standard state machine runs a single Fargate task (`compose_runner.ecs_task`)
17+
and waits for completion. The container downloads inputs, executes the
18+
meta-analysis on up to 4 vCPU / 30 GiB of memory, uploads artifacts to S3, and
19+
writes `metadata.json` into the same prefix.
20+
- `ComposeRunnerStatus` (Lambda Function URL) wraps `DescribeExecution`, merges
21+
metadata from S3, and exposes a simple status endpoint suitable for polling.
22+
- `ComposeRunnerLogPoller` streams the ECS CloudWatch Logs for a given `artifact_prefix`,
23+
while `ComposeRunnerResultsFetcher` returns presigned URLs for stored artifacts.
1024

1125
1. Create a virtual environment and install the CDK dependencies:
1226
```bash
@@ -19,21 +33,24 @@ execution environment and log polling function.
1933
```bash
2034
cdk bootstrap
2135
```
22-
3. Deploy the stack (supplying the compose-runner version you want baked into the Lambda image):
36+
3. Deploy the stack (supplying the compose-runner version you want baked into the images):
2337
```bash
2438
cdk deploy \
2539
-c composeRunnerVersion=$(hatch version) \
2640
-c resultsPrefix=compose-runner/results \
27-
-c runMemorySize=3008 \
28-
-c runTimeoutSeconds=900
41+
-c taskCpu=4096 \
42+
-c taskMemoryMiB=30720
2943
```
30-
The deployment output includes HTTPS endpoints for submitting runs (`ComposeRunnerFunctionUrl`), polling logs (`ComposeRunnerLogPollerFunctionUrl`), and fetching presigned S3 URLs (`ComposeRunnerResultsFunctionUrl`).
31-
Omit `resultsBucketName` to let the stack create a managed bucket, or pass an
32-
existing bucket name via `-c resultsBucketName=<bucket>`.
44+
Pass `-c resultsBucketName=<bucket>` to use an existing S3 bucket, or omit it
45+
to let the stack create and retain a dedicated bucket. Additional knobs:
46+
47+
- `-c stateMachineTimeoutSeconds=7200` to control the max wall clock per run
48+
- `-c submitTimeoutSeconds` / `-c statusTimeoutSeconds` / `-c pollTimeoutSeconds`
49+
to tune Lambda timeouts
50+
- `-c taskEphemeralStorageGiB` if the default 21 GiB scratch volume is insufficient
3351

34-
The deployment builds the Lambda container image from `aws_lambda/Dockerfile`,
35-
creates two functions (`ComposeRunnerFunction` and `ComposeRunnerLogPoller`),
36-
and provisions the S3 bucket used to store generated artifacts (including
37-
`meta_results.pkl`). The log poller function expects clients to call it with a
38-
job ID (the run Lambda invocation request ID) and returns filtered CloudWatch Logs
39-
entries for that job.
52+
The deployment builds both the Lambda image (`aws_lambda/Dockerfile`) and the
53+
Fargate task image (`Dockerfile`), provisions the Step Functions state machine,
54+
and configures a public VPC so each task has outbound internet access.
55+
The CloudFormation outputs list the HTTPS endpoints for submission, status,
56+
logs, and artifact retrieval, alongside the Step Functions ARN.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
from __future__ import annotations
2+
3+
import base64
4+
import json
5+
from dataclasses import dataclass
6+
from typing import Any, Dict, Optional
7+
8+
9+
def is_http_event(event: Any) -> bool:
    """Tell whether *event* came through a Lambda Function URL / HTTP integration.

    HTTP-style invocations always carry a ``requestContext`` key; direct
    invocations (e.g. from Step Functions or the console) do not.
    """
    if not isinstance(event, dict):
        return False
    return "requestContext" in event
11+
12+
13+
def _decode_body(event: Dict[str, Any]) -> Optional[str]:
14+
body = event.get("body")
15+
if not body:
16+
return None
17+
if event.get("isBase64Encoded"):
18+
return base64.b64decode(body).decode("utf-8")
19+
return body
20+
21+
22+
def extract_payload(event: Dict[str, Any]) -> Dict[str, Any]:
    """Normalize a Lambda invocation event into its JSON payload.

    Direct (non-HTTP) invocations already *are* the payload and pass through
    untouched. HTTP invocations have their body decoded and parsed as JSON;
    a request with no body yields an empty dict.
    """
    if not is_http_event(event):
        return event
    text = _decode_body(event)
    return json.loads(text) if text else {}
29+
30+
31+
def http_response(body: Dict[str, Any], status_code: int = 200) -> Dict[str, Any]:
    """Wrap *body* in the JSON response envelope expected by Function URLs."""
    response: Dict[str, Any] = {"statusCode": status_code}
    response["headers"] = {"Content-Type": "application/json"}
    response["body"] = json.dumps(body)
    return response
37+
38+
39+
@dataclass(frozen=True)
class LambdaRequest:
    """A parsed Lambda invocation: the raw event plus its decoded payload.

    Bundles the original event, the normalized JSON payload, and whether the
    invocation arrived over HTTP, so handlers can validate input and shape
    their response (HTTP envelope vs. bare dict) through one object.
    """

    # The event exactly as Lambda delivered it.
    raw_event: Any
    # JSON payload extracted from the event (empty dict for bodyless HTTP calls).
    payload: Dict[str, Any]
    # True when the event came via a Function URL / HTTP integration.
    is_http: bool

    @classmethod
    def parse(cls, event: Any) -> "LambdaRequest":
        """Build a request from a raw invocation event."""
        return cls(
            raw_event=event,
            payload=extract_payload(event),
            is_http=is_http_event(event),
        )

    def respond(self, body: Dict[str, Any], status_code: int = 200) -> Dict[str, Any]:
        """Return *body* wrapped for HTTP callers, or as-is for direct callers."""
        if not self.is_http:
            return body
        return http_response(body, status_code)

    def bad_request(self, message: str, status_code: int = 400) -> Dict[str, Any]:
        """Shorthand for a FAILED response carrying an error *message*."""
        error_body = {"status": "FAILED", "error": message}
        return self.respond(error_body, status_code=status_code)

    def get(self, key: str, default: Any = None) -> Any:
        """Look up *key* in the decoded payload, like ``dict.get``."""
        return self.payload.get(key, default)
60+

compose_runner/aws_lambda/log_poll_handler.py

Lines changed: 15 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -2,52 +2,30 @@
22

33
import os
44
import time
5-
import base64
6-
import json
75
from typing import Any, Dict, List
86

97
import boto3
108

9+
from compose_runner.aws_lambda.common import LambdaRequest
10+
1111
_LOGS_CLIENT = boto3.client("logs", region_name=os.environ.get("AWS_REGION", "us-east-1"))
1212

1313
LOG_GROUP_ENV = "RUNNER_LOG_GROUP"
1414
DEFAULT_LOOKBACK_MS_ENV = "DEFAULT_LOOKBACK_MS"
1515

16-
def _is_http_event(event: Any) -> bool:
17-
return isinstance(event, dict) and "requestContext" in event
18-
19-
20-
def _extract_payload(event: Dict[str, Any]) -> Dict[str, Any]:
21-
if not _is_http_event(event):
22-
return event
23-
body = event.get("body")
24-
if not body:
25-
return {}
26-
if event.get("isBase64Encoded"):
27-
body = base64.b64decode(body).decode("utf-8")
28-
return json.loads(body)
29-
30-
31-
def _http_response(body: Dict[str, Any], status_code: int = 200) -> Dict[str, Any]:
32-
return {
33-
"statusCode": status_code,
34-
"headers": {"Content-Type": "application/json"},
35-
"body": json.dumps(body),
36-
}
37-
3816

3917
def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
40-
raw_event = event
41-
event = _extract_payload(event)
42-
job_id = event.get("job_id")
43-
if not job_id:
44-
message = "Request payload must include 'job_id'."
45-
if _is_http_event(raw_event):
46-
return _http_response({"status": "FAILED", "error": message}, status_code=400)
18+
request = LambdaRequest.parse(event)
19+
payload = request.payload
20+
artifact_prefix = payload.get("artifact_prefix")
21+
if not artifact_prefix:
22+
message = "Request payload must include 'artifact_prefix'."
23+
if request.is_http:
24+
return request.bad_request(message, status_code=400)
4725
raise KeyError(message)
48-
next_token = event.get("next_token")
49-
start_time = event.get("start_time")
50-
end_time = event.get("end_time")
26+
next_token = payload.get("next_token")
27+
start_time = payload.get("start_time")
28+
end_time = payload.get("end_time")
5129

5230
log_group = os.environ[LOG_GROUP_ENV]
5331
lookback_ms = int(os.environ.get(DEFAULT_LOOKBACK_MS_ENV, "3600000"))
@@ -60,7 +38,7 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
6038

6139
params: Dict[str, Any] = {
6240
"logGroupName": log_group,
63-
"filterPattern": f'"{job_id}"',
41+
"filterPattern": f'"{artifact_prefix}"',
6442
"startTime": int(start_time),
6543
}
6644
if end_time is not None:
@@ -75,10 +53,8 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
7553
]
7654

7755
body = {
78-
"job_id": job_id,
56+
"artifact_prefix": artifact_prefix,
7957
"events": events,
8058
"next_token": response.get("nextToken"),
8159
}
82-
if _is_http_event(raw_event):
83-
return _http_response(body)
84-
return body
60+
return request.respond(body)
Lines changed: 13 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
from __future__ import annotations
22

33
import os
4-
import base64
5-
import json
64
from datetime import datetime, timezone
75
from typing import Any, Dict, List
86

97
import boto3
108

9+
from compose_runner.aws_lambda.common import LambdaRequest
10+
1111
_S3 = boto3.client("s3", region_name=os.environ.get("AWS_REGION", "us-east-1"))
1212

1313
RESULTS_BUCKET_ENV = "RESULTS_BUCKET"
@@ -21,44 +21,21 @@ def _serialize_dt(value: datetime) -> str:
2121
return value.astimezone(timezone.utc).isoformat()
2222

2323

24-
def _is_http_event(event: Any) -> bool:
25-
return isinstance(event, dict) and "requestContext" in event
26-
27-
28-
def _extract_payload(event: Dict[str, Any]) -> Dict[str, Any]:
29-
if not _is_http_event(event):
30-
return event
31-
body = event.get("body")
32-
if not body:
33-
return {}
34-
if event.get("isBase64Encoded"):
35-
body = base64.b64decode(body).decode("utf-8")
36-
return json.loads(body)
37-
38-
39-
def _http_response(body: Dict[str, Any], status_code: int = 200) -> Dict[str, Any]:
40-
return {
41-
"statusCode": status_code,
42-
"headers": {"Content-Type": "application/json"},
43-
"body": json.dumps(body),
44-
}
45-
46-
4724
def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
48-
raw_event = event
49-
event = _extract_payload(event)
25+
request = LambdaRequest.parse(event)
26+
payload = request.payload
5027
bucket = os.environ[RESULTS_BUCKET_ENV]
5128
prefix = os.environ.get(RESULTS_PREFIX_ENV)
5229

53-
job_id = event.get("job_id")
54-
if not job_id:
55-
message = "Request payload must include 'job_id'."
56-
if _is_http_event(raw_event):
57-
return _http_response({"status": "FAILED", "error": message}, status_code=400)
30+
artifact_prefix = payload.get("artifact_prefix")
31+
if not artifact_prefix:
32+
message = "Request payload must include 'artifact_prefix'."
33+
if request.is_http:
34+
return request.bad_request(message, status_code=400)
5835
raise KeyError(message)
59-
expires_in = int(event.get("expires_in", DEFAULT_EXPIRES_IN))
36+
expires_in = int(payload.get("expires_in", DEFAULT_EXPIRES_IN))
6037

61-
key_prefix = f"{prefix.rstrip('/')}/{job_id}" if prefix else job_id
38+
key_prefix = f"{prefix.rstrip('/')}/{artifact_prefix}" if prefix else artifact_prefix
6239

6340
response = _S3.list_objects_v2(Bucket=bucket, Prefix=key_prefix)
6441
contents = response.get("Contents", [])
@@ -84,11 +61,9 @@ def handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
8461
)
8562

8663
body = {
87-
"job_id": job_id,
64+
"artifact_prefix": artifact_prefix,
8865
"artifacts": artifacts,
8966
"bucket": bucket,
9067
"prefix": key_prefix,
9168
}
92-
if _is_http_event(raw_event):
93-
return _http_response(body)
94-
return body
69+
return request.respond(body)

0 commit comments

Comments
 (0)