Skip to content

Commit e92afd7

Browse files
committed
Add autoscaling host agent module to terraform blueprint
1 parent a4b3bc8 commit e92afd7

File tree

5 files changed

+153
-1
lines changed

5 files changed

+153
-1
lines changed

deploy/terraform/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ This sample configuration provisions the minimum AWS infrastructure required to
3838

3939
The bootstrap script installs Docker, pulls the Nimbus images, and writes the `.env` using the outputs produced by this Terraform stack.
4040

41+
## Host agent autoscaling
42+
43+
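This stack also creates an EC2 Auto Scaling group that runs the Nimbus host agent. Set `agent_ami` in `terraform.tfvars` (required), and optionally override `agent_instance_type` (default `t3.large`) and `agent_desired_capacity` (default `2`) to scale eval capacity on demand. The `bootstrap/bootstrap-agent.sh` script installs Nimbus, primes Firecracker assets, and registers the agent service via systemd. The script writes `/opt/nimbus/agent.env` with placeholder values, so set the agent ID, control plane URL, and token there before relying on the agent.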
44+
4145
## File layout
4246

4347
```
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/usr/bin/env bash
# Bootstrap a Nimbus host agent on a fresh EC2 instance.
#
# Intended to run as launch-template user data. Steps:
#   1. Install Docker and git, then enable and start Docker.
#   2. Clone Nimbus into /opt/nimbus and hand the tree to ec2-user.
#   3. Write a placeholder /opt/nimbus/agent.env.
#   4. As ec2-user: install uv, build the virtualenv, prepare Firecracker assets.
#   5. Install, enable, and start the nimbus-agent systemd unit.
set -euo pipefail

LOG="/var/log/bootstrap-nimbus-agent.log"
# Mirror everything this script prints (stdout and stderr) into the log file.
exec > >(tee -a "$LOG") 2>&1

echo "[Nimbus agent bootstrap] Installing dependencies..."

sudo yum update -y
sudo yum install -y docker git
sudo systemctl enable docker
sudo systemctl start docker

echo "[Nimbus agent bootstrap] Fetching Nimbus..."
cd /opt
if [ ! -d "nimbus" ]; then
  sudo git clone https://github.com/evalops/nimbus.git
fi
cd nimbus

sudo chown -R ec2-user:ec2-user /opt/nimbus

# Placeholder configuration: NIMBUS_AGENT_ID, NIMBUS_CONTROL_PLANE_URL and
# NIMBUS_CONTROL_PLANE_TOKEN must be filled in with real values.
cat <<'ENV' > /opt/nimbus/agent.env
NIMBUS_AGENT_ID=
NIMBUS_CONTROL_PLANE_URL=https://nimbus.example.com
NIMBUS_CONTROL_PLANE_TOKEN=
NIMBUS_KERNEL_IMAGE=/opt/nimbus/artifacts/kernel
NIMBUS_ROOTFS_IMAGE=/opt/nimbus/artifacts/rootfs.ext4
ENV
# The file carries the control-plane token, so keep it private to ec2-user
# (systemd reads EnvironmentFile as root and is unaffected).
sudo chown ec2-user:ec2-user /opt/nimbus/agent.env
sudo chmod 600 /opt/nimbus/agent.env

# Everything below runs in an ec2-user login shell. The heredoc delimiter is
# quoted, so variables such as $HOME are expanded by that inner shell.
sudo su - ec2-user <<'EOS'
set -euo pipefail

# Install uv for ec2-user itself: a login shell does not inherit the caller's
# PATH, so a copy installed under another user's home would not be found here.
if ! command -v uv >/dev/null 2>&1; then
  curl -LsSf https://astral.sh/uv/install.sh | sh
fi
export PATH="$HOME/.local/bin:$PATH"

cd /opt/nimbus
uv venv .venv
source .venv/bin/activate
uv pip install -e .
mkdir -p artifacts
python scripts/setup_firecracker_assets.py artifacts

# Wants= pulls network-online.target into the boot transaction; After= alone
# only orders against it if something else already requested it.
cat <<'SERVICE' > nimbus-agent.service
[Unit]
Description=Nimbus Host Agent
Wants=network-online.target
After=network-online.target docker.service
Requires=docker.service

[Service]
Type=simple
EnvironmentFile=/opt/nimbus/agent.env
WorkingDirectory=/opt/nimbus
ExecStart=/opt/nimbus/.venv/bin/python -m nimbus.host_agent
Restart=on-failure

[Install]
WantedBy=multi-user.target
SERVICE

sudo mv nimbus-agent.service /etc/systemd/system/nimbus-agent.service
sudo systemctl daemon-reload
sudo systemctl enable nimbus-agent.service
sudo systemctl start nimbus-agent.service
EOS

echo "[Nimbus agent bootstrap] Completed"

deploy/terraform/main.tf

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,30 @@ resource "aws_security_group" "control_plane" {
8787
}
8888
}
8989

90+
# Security group for Nimbus host agents: inbound SSH from the admin CIDR only,
# unrestricted outbound traffic.
resource "aws_security_group" "host_agent" {
  vpc_id      = aws_vpc.nimbus.id
  name        = "nimbus-host-agent"
  description = "Access for host agents"

  tags = {
    Name = "nimbus-host-agent"
  }

  # SSH (tcp/22) from var.admin_cidr.
  ingress {
    protocol    = "tcp"
    from_port   = 22
    to_port     = 22
    cidr_blocks = [var.admin_cidr]
  }

  # Protocol "-1" with ports 0-0 means all protocols and all ports.
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }
}
113+
90114
resource "aws_instance" "control_plane" {
91115
ami = var.control_plane_ami
92116
instance_type = var.control_plane_instance_type
@@ -140,6 +164,43 @@ resource "aws_lb_listener" "https" {
140164
}
141165
}
142166

167+
# Launch template describing a single Nimbus host agent instance. It is
# referenced by the host_agents Auto Scaling group.
resource "aws_launch_template" "host_agent" {
  name_prefix   = "nimbus-agent"
  image_id      = var.agent_ami
  instance_type = var.agent_instance_type
  key_name      = var.ssh_key_name

  # The host_agent security group is created inside aws_vpc.nimbus, so it has
  # to be attached by ID. security_group_names only resolves groups in
  # EC2-Classic or the default VPC and fails for a custom VPC.
  vpc_security_group_ids = [aws_security_group.host_agent.id]

  # Launch templates expect user data to be base64-encoded.
  user_data = base64encode(file("${path.module}/bootstrap/bootstrap-agent.sh"))

  tag_specifications {
    resource_type = "instance"
    tags = {
      Name = "nimbus-host-agent"
    }
  }
}
182+
183+
# Auto Scaling group that keeps var.agent_desired_capacity host agents running
# across the public subnets, with headroom for one extra instance.
resource "aws_autoscaling_group" "host_agents" {
  name             = "nimbus-host-agents"
  desired_capacity = var.agent_desired_capacity
  max_size         = var.agent_desired_capacity + 1

  # Desired capacity must not be below min_size. Clamping keeps the usual
  # floor of 1 while still allowing agent_desired_capacity = 0 to scale the
  # fleet down to nothing.
  min_size = min(1, var.agent_desired_capacity)

  vpc_zone_identifier       = [for subnet in aws_subnet.public : subnet.id]
  health_check_grace_period = 60
  health_check_type         = "EC2"

  launch_template {
    id      = aws_launch_template.host_agent.id
    version = "$Latest"
  }

  tag {
    key                 = "Name"
    value               = "nimbus-host-agent"
    propagate_at_launch = true
  }
}
203+
143204
# Public IP address of the control plane EC2 instance.
output "control_plane_public_ip" {
  value = aws_instance.control_plane.public_ip
}

deploy/terraform/variables.tf

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,3 +48,20 @@ variable "admin_cidr" {
4848
description = "CIDR range allowed to SSH"
4949
default = "0.0.0.0/0"
5050
}
51+
52+
# Required: no default is provided, so this must be set by the caller.
variable "agent_ami" {
  description = "AMI for host agent autoscaling group"
  type        = string
}
56+
57+
# Optional: instance size for each host agent.
variable "agent_instance_type" {
  description = "EC2 instance type for host agents"
  type        = string
  default     = "t3.large"
}
62+
63+
# Optional: number of host agents the Auto Scaling group should keep running.
variable "agent_desired_capacity" {
  description = "Desired host agent count"
  type        = number
  default     = 2
}

docs/onboarding.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ New to Nimbus? This quick-start itinerary walks you from zero to your first succ
6868

6969
## 5. Production hardening checklist
7070

71-
- ✅ Provision Redis/Postgres from your infra provider (RDS, CloudSQL, etc.) and point `.env`.
7271
- ✅ Stand up the control plane via Terraform or Helm (templates in `deploy/`).
72+
- ✅ Scale host agents with the Terraform autoscaling group (see `deploy/terraform`).
7373
- ✅ Configure TLS (terminate at a reverse proxy or load balancer).
7474
- ✅ Schedule nightly `nimbus.cli.report overview` runs and archive the JSON for auditing.
7575
- ✅ Enable OpenTelemetry export to your tracing backend.

0 commit comments

Comments
 (0)