Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 20 additions & 16 deletions .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ permissions:
contents: read
on:
workflow_dispatch:
schedule:
# Every Monday and Thursday at 3 AM UTC+8 (i.e. 19:00 UTC on Sunday and Wednesday)
- cron: '0 19 * * 0,3'
Copy link

Copilot AI Dec 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

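The cron schedule '0 19 * * 0,3' runs on Sundays (0) and Wednesdays (3), not Mondays and Thursdays as stated in the comment. To run on Mondays and Thursdays at 3 AM UTC+8 (7 PM UTC previous day), use '0 19 * * 1,4'. (Reviewer note: GitHub Actions evaluates cron expressions in UTC, and 19:00 UTC on Sunday/Wednesday is already 03:00 UTC+8 on Monday/Thursday, so the original expression matches the comment; '0 19 * * 1,4' would instead fire at 03:00 UTC+8 on Tuesday and Friday.)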

Suggested change
- cron: '0 19 * * 0,3'
- cron: '0 19 * * 1,4'

Copilot uses AI. Check for mistakes.

jobs:
benchmark:
Expand All @@ -25,7 +28,7 @@ jobs:
runner:
- self-hosted
- 1ES.Pool=agl-runner-cpu
timeout: 60
timeout: 45
args: >-
--mode batch
--total-tasks 4096
Expand All @@ -40,7 +43,7 @@ jobs:
runner:
- self-hosted
- 1ES.Pool=agl-runner-cpu
timeout: 60
timeout: 45
args: >-
--mode batch
--total-tasks 10000
Expand Down Expand Up @@ -70,25 +73,25 @@ jobs:
runner:
- self-hosted
- 1ES.Pool=agl-runner-cpu
timeout: 60
timeout: 120
args: >-
--mode batch
--total-tasks 100000
--total-tasks 50000
--batch-size 8192
--n-runners 256
--max-rounds 6
--sleep-seconds 0.1
- id: scenario-long-queues
display: Long rollout queues
kind: scenario
store_workers: 32
store_workers: 48
runner:
- self-hosted
- 1ES.Pool=agl-runner-cpu
timeout: 60
timeout: 120
args: >-
--mode batch_partial
--total-tasks 100000
--total-tasks 50000
--batch-size 1024
--n-runners 256
--remaining-tasks 4096
Expand All @@ -97,14 +100,14 @@ jobs:
- id: scenario-high-concurrency
display: High-throughput concurrent requests
kind: scenario
store_workers: 32
store_workers: 96
runner:
- self-hosted
- 1ES.Pool=agl-runner-cpu
timeout: 60
timeout: 120
args: >-
--mode single
--total-tasks 100000
--total-tasks 50000
--concurrency 2048
--n-runners 256
--max-rounds 2
Expand Down Expand Up @@ -172,6 +175,7 @@ jobs:
STORE_URL: http://localhost:4747
STORE_API_URL: http://localhost:4747/v1/agl
PROM_URL: http://localhost:9090
GITHUB_ACTIONS_TIMEOUT_MINUTES: ${{ matrix.workload.timeout }}
WORKLOAD_KIND: ${{ matrix.workload.kind }}
WORKLOAD_ID: ${{ matrix.workload.id }}
BACKEND_ID: ${{ matrix.backend.id }}
Expand Down Expand Up @@ -338,27 +342,27 @@ jobs:
runner: ubuntu-latest
workload:
- id: high-insert
total_tasks: 100000
total_tasks: 50000
concurrency: 2048
type: insert
- id: medium-insert
total_tasks: 100000
total_tasks: 50000
concurrency: 128
type: insert
- id: low-insert
total_tasks: 100000
total_tasks: 50000
concurrency: 4
type: insert
- id: high-dequeue
total_tasks: 100000
total_tasks: 50000
concurrency: 2048
type: dequeue
- id: medium-dequeue
total_tasks: 100000
total_tasks: 50000
concurrency: 128
type: dequeue
- id: low-dequeue
total_tasks: 100000
total_tasks: 50000
concurrency: 4
type: dequeue
env:
Expand Down
5 changes: 5 additions & 0 deletions docker/compose.store.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,11 @@ services:

command: agl store --host 0.0.0.0 --port 4747

ulimits:
nofile:
soft: 65535
hard: 65535

develop:
watch:
# Sync the working directory with the `/app` directory in the container
Expand Down
19 changes: 13 additions & 6 deletions tests/benchmark/benchmark_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import random
import sys
import threading
import time
from typing import Any, Dict, List, Literal, Optional, Sequence, Set, Tuple, cast

from rich.console import Console
Expand All @@ -19,7 +20,9 @@

console = Console()

MAX_RUNTIME_SECONDS = 30 * 60
# Reserve 10 minutes of the job timeout for environment setup.
MAX_RUNTIME_SECONDS = (int(os.getenv("GITHUB_ACTIONS_TIMEOUT_MINUTES", "30")) - 10) * 60
MAX_STALE_SECONDS = 300


def _abort_due_to_timeout() -> None:
Expand Down Expand Up @@ -157,7 +160,7 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):

pending = {rollout_id: task_name for rollout_id, task_name in batch_rollouts}
completed_ids: Set[str] = set()
completed_ids_last_updated: int = 0
completed_ids_last_updated: float = time.perf_counter()
while len(completed_ids) < len(batch_rollouts):
finished_rollouts = await store.wait_for_rollouts(
rollout_ids=[rollout_id for rollout_id, _ in batch_rollouts],
Expand All @@ -177,13 +180,17 @@ async def algorithm_batch(self, total_tasks: int, batch_size: int):

# Check and warn for stale rollouts
if complete_ids_updated:
completed_ids_last_updated = 0
completed_ids_last_updated = time.perf_counter()
else:
completed_ids_last_updated += 1
if completed_ids_last_updated >= 10:
if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS / 2:
unfinished_ids = set(rollout_id for rollout_id, _ in batch_rollouts) - completed_ids
print(f"Stale rollouts: {unfinished_ids}")
completed_ids_last_updated = 0
if time.perf_counter() - completed_ids_last_updated > MAX_STALE_SECONDS:
current_workers = await store.query_workers()
console.print(f"Stalled. Current worker status shown below:")
for worker in current_workers:
console.print(f" Worker: {worker}", width=1024) # Avoid wrapping
raise RuntimeError("Rollout progress has stalled for too long")

await asyncio.sleep(5.0)

Expand Down