Skip to content

Commit e82dcaf

Browse files
author
Jonathan Alvarez Delgado
committed
safety(pulumi): deploy cold (desired_count 0), suspend autoscaling, disable schedules
1 parent 70562a6 commit e82dcaf

File tree

4 files changed

+36
-18
lines changed

4 files changed

+36
-18
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,6 @@ venv*
5050

5151
# Pulumi local artefacts (outputs, notes, analysis)
5252
infra/pulumi/pulumi-*.txt
53-
infra/pulumi/preview-output-*.txt
53+
infra/pulumi/preview-*.txt
5454
infra/pulumi/analysis.md
5555
infra/pulumi/infrastructure-inventory.md

infra/pulumi/__main__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -780,7 +780,9 @@ def main():
780780
"help",
781781
], # Default; again overridden per schedule
782782
"environment": [
783-
{"name": "DJANGO_SETTINGS_MODULE", "value": "settings"}
783+
{"name": "DJANGO_SETTINGS_MODULE", "value": "settings_local_stage"},
784+
{"name": "BOOTSTRAP_SAFE", "value": "true"},
785+
{"name": "NETAPP_STORAGE_ROOT", "value": "/tmp/storage"},
784786
],
785787
"logConfiguration": {
786788
"logDriver": "awslogs",
@@ -936,7 +938,7 @@ def main():
936938
{"containerOverrides": [{"name": "cron", "command": command}]}
937939
),
938940
),
939-
state="ENABLED",
941+
state=task_config.get("state", "DISABLED"),
940942
opts=pulumi.ResourceOptions(
941943
parent=schedule_group,
942944
depends_on=[cron_task_definition, scheduler_role],

infra/pulumi/config.stage.yaml

Lines changed: 19 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,17 @@ resources:
2020
addons-server:
2121
name: atn-stage-addons-server
2222
image_tag_mutability: MUTABLE
23-
force_delete: true # Stage only: allows pulumi destroy with images present
23+
force_delete: false # Repo now holds CI-built images; protect from accidental destroy
2424
scan_on_push: true
2525
encryption_type: AES256
26-
# Lifecycle policy keep last 50 tagged images (here any tag), expire untagged after 7 days
27-
# This catches SHA tags, stage-latest, and any future tag patterns
26+
# Lifecycle policy: keep the last 50 tagged images (stage-/sha- prefixes); expire untagged images after 7 days
27+
# Matches tags prefixed with stage- (e.g., stage-latest) and sha- (commit SHAs)
2828
lifecycle_policy: |
2929
{
3030
"rules": [
3131
{
3232
"rulePriority": 1,
33-
"description": "Keep last 50 tagged images (any tag)",
33+
"description": "Keep last 50 tagged images (stage-/sha- prefixes)",
3434
"selection": {
3535
"tagStatus": "tagged",
3636
"tagPrefixList": ["stage-", "sha-"],
@@ -150,10 +150,7 @@ resources:
150150
# Image: 768512802988.dkr.ecr.us-west-2.amazonaws.com/atn-stage-addons-server:stage-latest
151151
tb:fargate:FargateClusterWithLogging:
152152
web:
153-
# desired_count intentionally omitted: autoscaling owns the count.
154-
# tb_pulumi sets ignore_changes on desired_count when not specified,
155-
# preventing Pulumi from fighting the autoscaler. min_capacity in
156-
# the autoscaling config acts as the effective baseline
153+
desired_count: 0 # Start cold; scale up manually after validation
157154
assign_public_ip: false
158155
internal: false # Public-facing ALB
159156
enable_container_insights: true
@@ -196,6 +193,8 @@ resources:
196193
environment:
197194
- name: DJANGO_SETTINGS_MODULE
198195
value: settings_local_stage
196+
- name: BOOTSTRAP_SAFE
197+
value: 'true'
199198
- name: UWSGI_PROCESSES
200199
value: '4'
201200
- name: UWSGI_THREADS
@@ -211,7 +210,7 @@ resources:
211210
# t3a.large has 8GB RAM - needed for addons-linter memory requirements
212211
# Multiple queue groups for different workloads
213212
worker:
214-
# desired_count omitted: autoscaling owns the count (see web comment above)
213+
desired_count: 0 # Start cold; scale up manually after validation
215214
assign_public_ip: false
216215
internal: true
217216
build_load_balancer: false # Workers don't need ALB
@@ -234,6 +233,8 @@ resources:
234233
environment:
235234
- name: DJANGO_SETTINGS_MODULE
236235
value: settings_local_stage
236+
- name: BOOTSTRAP_SAFE
237+
value: 'true'
237238
- name: CELERY_CONCURRENCY
238239
value: '4'
239240
- name: CELERY_QUEUES
@@ -249,7 +250,7 @@ resources:
249250
# Separate ALB endpoint for version checking API (versioncheck.addons.thunderbird.net)
250251
# Lightweight service - c7a.medium equivalent
251252
versioncheck:
252-
# desired_count omitted: autoscaling owns the count (see web comment)
253+
desired_count: 0 # Start cold; scale up manually after validation
253254
assign_public_ip: false
254255
internal: false
255256
enable_container_insights: true
@@ -292,6 +293,8 @@ resources:
292293
environment:
293294
- name: DJANGO_SETTINGS_MODULE
294295
value: settings_local_stage
296+
- name: BOOTSTRAP_SAFE
297+
value: 'true'
295298
- name: UWSGI_PROCESSES
296299
value: '4'
297300
- name: UWSGI_THREADS
@@ -326,23 +329,26 @@ resources:
326329
web:
327330
cpu_threshold: 70
328331
ram_threshold: 70
329-
min_capacity: 2
332+
min_capacity: 0 # Start at 0; scale up manually after validation
330333
max_capacity: 8
331334
cooldown: 300
335+
suspend: true # Suspended until services are validated
332336

333337
worker:
334338
cpu_threshold: 70
335339
ram_threshold: 80 # Higher: addons-linter memory spikes are normal
336-
min_capacity: 2
340+
min_capacity: 0
337341
max_capacity: 6
338342
cooldown: 300
343+
suspend: true
339344

340345
versioncheck:
341346
cpu_threshold: 70
342347
ram_threshold: 70
343-
min_capacity: 1
348+
min_capacity: 0
344349
max_capacity: 4
345350
cooldown: 300
351+
suspend: true
346352

347353
# =============================================================================
348354
# ElastiCache - Memcached (intended to replace current Memcached setup)

settings_local_stage.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,19 @@ def get_secret(secret_name, region_name="us-west-2"):
3434
raise Exception(f"Failed to retrieve secret {secret_name}: {e}")
3535

3636

37+
# -----------------------------------------------------------------------------
38+
# Bootstrap safety toggle
39+
# -----------------------------------------------------------------------------
40+
# When BOOTSTRAP_SAFE is true we deliberately use RO database credentials so
41+
# that even if something accidentally starts, MySQL itself enforces read-only
42+
# access. The atn/stage/mysql_ro secret must exist: get_secret() raises otherwise.
43+
BOOTSTRAP_SAFE = env.bool("BOOTSTRAP_SAFE", default=False)
44+
MYSQL_SECRET_NAME = "atn/stage/mysql_ro" if BOOTSTRAP_SAFE else "atn/stage/mysql"
45+
46+
3747
# Retrieve secrets from AWS Secrets Manager
3848
_email_url_secret = get_secret('atn/stage/email_url')
39-
_mysql_secret = get_secret('atn/stage/mysql')
49+
_mysql_secret = get_secret(MYSQL_SECRET_NAME)
4050
_inbound_email_secret = get_secret('atn/stage/inbound_email')
4151
_django_secret = get_secret('atn/stage/django_secret_key')
4252
_celery_broker_secret = get_secret('atn/stage/celery_broker')
@@ -288,7 +298,7 @@ def get_secret(secret_name, region_name="us-west-2"):
288298

289299
ES_DEFAULT_NUM_SHARDS = 10
290300

291-
READ_ONLY = env.bool('READ_ONLY', default=False)
301+
READ_ONLY = env.bool("READ_ONLY", default=BOOTSTRAP_SAFE)
292302

293303
# TODO: Github user ?
294304
GITHUB_API_USER = ''

0 commit comments

Comments
 (0)