microsoft · ultmaster · Dec 16, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 15, 2025
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -3,6 +3,9 @@ permissions:
   contents: read
 on:
   workflow_dispatch:
+  schedule:
+    # 19:00 UTC every Monday and Thursday (3 AM UTC+8 on Tuesday and Friday)
+    - cron: '0 19 * * 0,3'
-    - cron: '0 19 * * 0,3'
+    - cron: '0 19 * * 1,4'
-    - cron: '0 19 * * 0,3'
+    - cron: '0 19 * * 1,4'
 
 jobs:
   benchmark:
@@ -25,7 +28,7 @@ jobs:
             runner:
               - self-hosted
               - 1ES.Pool=agl-runner-cpu
-            timeout: 60
+            timeout: 45
             args: >-
               --mode batch
               --total-tasks 4096
@@ -40,7 +43,7 @@ jobs:
             runner:
               - self-hosted
               - 1ES.Pool=agl-runner-cpu
-            timeout: 60
+            timeout: 45
             args: >-
               --mode batch
               --total-tasks 10000
@@ -70,25 +73,25 @@ jobs:
             runner:
               - self-hosted
               - 1ES.Pool=agl-runner-cpu
-            timeout: 60
+            timeout: 120
             args: >-
               --mode batch
-              --total-tasks 100000
+              --total-tasks 50000
               --batch-size 8192
               --n-runners 256
               --max-rounds 6
               --sleep-seconds 0.1
           - id: scenario-long-queues
             display: Long rollout queues
             kind: scenario
-            store_workers: 32
+            store_workers: 48
             runner:
               - self-hosted
               - 1ES.Pool=agl-runner-cpu
-            timeout: 60
+            timeout: 120
             args: >-
               --mode batch_partial
-              --total-tasks 100000
+              --total-tasks 50000
               --batch-size 1024
               --n-runners 256
               --remaining-tasks 4096
@@ -97,14 +100,14 @@ jobs:
           - id: scenario-high-concurrency
             display: High-throughput concurrent requests
             kind: scenario
-            store_workers: 32
+            store_workers: 96
             runner:
               - self-hosted
               - 1ES.Pool=agl-runner-cpu
-            timeout: 60
+            timeout: 120
             args: >-
               --mode single
-              --total-tasks 100000
+              --total-tasks 50000
               --concurrency 2048
               --n-runners 256
               --max-rounds 2
@@ -172,6 +175,7 @@ jobs:
       STORE_URL: http://localhost:4747
       STORE_API_URL: http://localhost:4747/v1/agl
       PROM_URL: http://localhost:9090
+      GITHUB_ACTIONS_TIMEOUT_MINUTES: ${{ matrix.workload.timeout }}
       WORKLOAD_KIND: ${{ matrix.workload.kind }}
       WORKLOAD_ID: ${{ matrix.workload.id }}
       BACKEND_ID: ${{ matrix.backend.id }}
@@ -338,27 +342,27 @@ jobs:
             runner: ubuntu-latest
         workload:
           - id: high-insert
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 2048
             type: insert
           - id: medium-insert
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 128
             type: insert
           - id: low-insert
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 4
             type: insert
           - id: high-dequeue
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 2048
             type: dequeue
           - id: medium-dequeue
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 128
             type: dequeue
           - id: low-dequeue
-            total_tasks: 100000
+            total_tasks: 50000
             concurrency: 4
             type: dequeue
     env:

diff --git a/docker/compose.store.yml b/docker/compose.store.yml
@@ -9,6 +9,11 @@ services:
 
     command: agl store --host 0.0.0.0 --port 4747
 
+    ulimits:
+      nofile:
+        soft: 65535
+        hard: 65535
+
     develop:
       watch:
         # Sync the working directory with the `/app` directory in the container

diff --git a/tests/benchmark/benchmark_store.py b/tests/benchmark/benchmark_store.py
@@ -8,6 +8,7 @@
 import random
 import sys
 import threading
+import time
 from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, cast
 
 from rich.console import Console
@@ -19,7 +20,9 @@
 
 console = Console()
 
-MAX_RUNTIME_SECONDS = 30 * 60
+# Reserve 10 min of the job timeout for env setup; unset/empty falls back to 30, and the budget is at least 1 min.
+MAX_RUNTIME_SECONDS = max(int(os.getenv("GITHUB_ACTIONS_TIMEOUT_MINUTES") or "30") - 10, 1) * 60
+MAX_STALE_SECONDS = 300  # No-progress limit: a warning is printed past half of it, RuntimeError past all of it.
 
 
 def _abort_due_to_timeout() -> None:
@@ -157,7 +160,7 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
 
             pending = {rollout_id: task_name for rollout_id, task_name in batch_rollouts}
             completed_ids: Set[str] = set()
-            completed_ids_last_updated: int = 0
+            completed_ids_last_updated: float = time.perf_counter()
             while len(completed_ids) < len(batch_rollouts):
                 finished_rollouts = await store.wait_for_rollouts(
                     rollout_ids=[rollout_id for rollout_id, _ in batch_rollouts],
@@ -177,13 +180,17 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):
 
                 # Check and warn for stale rollouts
                 if complete_ids_updated:
-                    completed_ids_last_updated = 0
+                    completed_ids_last_updated = time.perf_counter()
                 else:
-                    completed_ids_last_updated += 1
-                    if completed_ids_last_updated >= 10:
+                    if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS / 2:
                         unfinished_ids = set(rollout_id for rollout_id, _ in batch_rollouts) - completed_ids
                         print(f"Stale rollouts: {unfinished_ids}")
-                        completed_ids_last_updated = 0
+                    if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS:
+                        current_workers = await store.query_workers()
+                        console.print(f"Stalled. Current worker status shown below:")
+                        for worker in current_workers:
+                            console.print(f"  Worker: {worker}", width=1024)  # Avoid wrapping
+                        raise RuntimeError("Rollout progress has stalled for too long")
 
                 await asyncio.sleep(5.0)