Skip to content

Commit 627f420

Browse files
committed
Prod DB backups: script + lifecycle + restore drill tooling
1 parent d14c8ab commit 627f420

File tree

4 files changed

+309
-0
lines changed

4 files changed

+309
-0
lines changed

DEPLOYMENT.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -164,3 +164,27 @@ curl -s https://live.cleanapp.io/api/v3/reports/health | jq
164164
sudo docker exec cleanapp_db mysql -u server -p"$PASS" cleanapp \
165165
-e "SELECT COUNT(*) as connections FROM information_schema.processlist;"
166166
```
167+
168+
## Database Backups (Prod)
169+
170+
CleanApp stores full MySQL backups in GCS:
171+
- Prod bucket: `gs://cleanapp_mysql_backup_prod`
172+
- Current object key (versioned): `gs://cleanapp_mysql_backup_prod/current/cleanapp_all.sql.gz`
173+
- Weekly pins (kept ~30 weeks): `gs://cleanapp_mysql_backup_prod/weekly/<ISO_WEEK>/cleanapp_all.sql.gz`
174+
175+
### Restore (One-Liner)
176+
177+
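Restore the current backup into the running prod DB container. This replays the full `--all-databases` dump against the live server and overwrites existing data, so run it only for a real recovery: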
178+
179+
```bash
180+
PASS="$(gcloud secrets versions access latest --secret=MYSQL_ROOT_PASSWORD_PROD)"
181+
gsutil cat gs://cleanapp_mysql_backup_prod/current/cleanapp_all.sql.gz | gunzip -c | sudo docker exec -i -e MYSQL_PWD="$PASS" cleanapp_db mysql -uroot
182+
```
183+
184+
### Restore Drill (Recommended)
185+
186+
To prove backups are restorable, run a restore drill into a scratch MySQL container on the prod VM:
187+
188+
```bash
189+
HOST=deployer@34.122.15.16 ./platform_blueprint/ops/db_backup/restore_drill_prod_vm.sh
190+
```
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/usr/bin/env bash
2+
# CleanApp MySQL full backup -> GCS (prod/dev)
3+
# - Streams mysqldump -> gzip -> gsutil (no large local temp files)
4+
# - Writes metadata JSON alongside backup
5+
# - Weekly pin (Sundays UTC): copies current object to weekly/<ISO_WEEK>/
6+
set -euo pipefail
7+
8+
# Target environment; filled in by parse_args from -e/--env.
ENV=""

#######################################
# Parse command-line flags.
# Globals:   ENV (written)
# Arguments: the script's argv ("$@")
# Outputs:   usage text on stderr for bad input
# Returns:   0 on success; exits 2 on an unknown flag or a flag missing its value
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -e|--env)
        # Without this guard a bare "-e" trips `set -u` on "$2" and the script
        # dies with an "unbound variable" error instead of the usage message.
        if [[ $# -lt 2 ]]; then
          echo "Usage: $0 -e <dev|prod>" >&2
          exit 2
        fi
        ENV="$2"
        shift 2
        ;;
      *)
        echo "Usage: $0 -e <dev|prod>" >&2
        exit 2
        ;;
    esac
  done
}

parse_args "$@"
18+
19+
# Validate the requested environment: it must be present and one of dev|prod.
# Both failure modes exit with status 2.
case "${ENV}" in
  dev|prod)
    ;;
  "")
    echo "Usage: $0 -e <dev|prod>" >&2
    exit 2
    ;;
  *)
    echo "Invalid env: ${ENV} (expected dev|prod)" >&2
    exit 2
    ;;
esac
28+
29+
# Print the arguments on stdout, prefixed with the current UTC timestamp
# (YYYY-MM-DDTHH:MM:SSZ).
log() {
  printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

# Exit with status 1 (after logging an ERROR line to stderr) unless the
# command named by $1 can be found.
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    log "ERROR missing command: $1" >&2
    exit 1
  fi
}
31+
32+
# Client tools this script calls directly.
for tool in gcloud gsutil gzip; do
  need_cmd "${tool}"
done

# docker is always invoked through sudo, so sudo must work without a prompt.
sudo -n true 2>/dev/null || {
  log "ERROR sudo requires a password; cannot run docker exec" >&2
  exit 1
}
40+
41+
# Derived names: the secret name uses the upper-cased env, the bucket and
# object keys use the env as given.
SECRET_SUFFIX="$(tr '[:lower:]' '[:upper:]' <<<"${ENV}")"
BUCKET="gs://cleanapp_mysql_backup_${ENV}"
CURRENT_KEY="${BUCKET}/current/cleanapp_all.sql.gz"
CURRENT_META_KEY="${BUCKET}/current/cleanapp_all.metadata.json"

log "INFO backup start env=${ENV} bucket=${BUCKET}"

# Read the MySQL root password from Secret Manager; gcloud's own stderr is
# discarded and replaced by the single ERROR line below.
if ! MYSQL_ROOT_PASSWORD="$(gcloud secrets versions access latest \
  --secret="MYSQL_ROOT_PASSWORD_${SECRET_SUFFIX}" 2>/dev/null)"; then
  log "ERROR failed to read MySQL root password from Secret Manager" >&2
  exit 1
fi
52+
53+
# Verify the DB container is running. The container list is captured first
# instead of being piped straight into `grep -q`: grep -q exits on the first
# match, and under `set -o pipefail` a resulting SIGPIPE in `docker ps` would
# fail the whole pipeline and wrongly report the container as not running.
# A failing `docker ps` yields an empty list and therefore the same error.
running_containers="$(sudo docker ps --format '{{.Names}}' || true)"
if ! grep -qx cleanapp_db <<<"${running_containers}"; then
  log "ERROR cleanapp_db container not running" >&2
  exit 1
fi

# Compressor command as an array; pigz is preferred when installed, otherwise
# gzip. Both run at level 1.
COMPRESS=(gzip -1)
if command -v pigz >/dev/null 2>&1; then
  COMPRESS=(pigz -1)
fi
63+
64+
# Row counts go into the metadata so the restore drill can compare them with
# the restored data (it requires an exact match). They are sampled immediately
# BEFORE the dump starts: mysqldump --single-transaction exports a snapshot
# taken when the dump begins, so counting after a long dump would include rows
# written while it ran and make the drill fail on a healthy backup. A small
# race remains for writes between these queries and the start of the snapshot.
log "INFO capturing row counts"
reports_count="$(sudo docker exec -e MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" -i cleanapp_db sh -lc 'mysql -uroot -N -e "SELECT COUNT(*) FROM cleanapp.reports" 2>/dev/null' | tr -d '\r' | tail -n 1 || true)"
analysis_count="$(sudo docker exec -e MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" -i cleanapp_db sh -lc 'mysql -uroot -N -e "SELECT COUNT(*) FROM cleanapp.report_analysis" 2>/dev/null' | tr -d '\r' | tail -n 1 || true)"
# Fall back to 0 when a count is empty or not a plain integer, so the metadata
# JSON built from these values is always valid.
[[ "${reports_count}" =~ ^[0-9]+$ ]] || reports_count=0
[[ "${analysis_count}" =~ ^[0-9]+$ ]] || analysis_count=0
counts_json="{\"reports\":${reports_count},\"report_analysis\":${analysis_count}}"

started_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
started_epoch="$(date +%s)"

# Stream mysqldump -> compressor -> GCS without a local temp file.
# NOTE(review): the stream is written directly to CURRENT_KEY. If mysqldump
# dies mid-stream the compressor still sees a normal EOF, so a truncated object
# may be uploaded before pipefail aborts the script; presumably bucket
# versioning retains the previous copy -- confirm.
log "INFO mysqldump stream start"
sudo docker exec -e MYSQL_PWD="${MYSQL_ROOT_PASSWORD}" -i cleanapp_db sh -lc \
  'exec mysqldump -uroot \
    --all-databases \
    --single-transaction \
    --quick \
    --lock-tables=false \
    --routines --events --triggers \
    --hex-blob \
    --set-gtid-purged=OFF' \
  | "${COMPRESS[@]}" \
  | gsutil -q -o GSUtil:parallel_composite_upload_threshold=150M cp - "${CURRENT_KEY}"

finished_epoch="$(date +%s)"
finished_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
duration_s=$((finished_epoch - started_epoch))

# First column of the first `gsutil ls -l` line is the object size in bytes.
size_bytes="$(gsutil ls -l "${CURRENT_KEY}" | awk 'NR==1{print $1}')"
[[ "${size_bytes}" =~ ^[0-9]+$ ]] || size_bytes=0
93+
94+
# Stage the metadata JSON inside a private directory created by mktemp. A
# guessable /tmp file name could be pre-created or symlinked by another local
# user; a fresh mktemp directory cannot. The file keeps its .json name, and the
# EXIT trap removes the directory on failure paths as well.
meta_dir="$(mktemp -d)"
trap 'rm -rf -- "${meta_dir}"' EXIT
meta_tmp="${meta_dir}/cleanapp_all.metadata.json"

cat >"${meta_tmp}" <<META
{
  "env": "${ENV}",
  "object": "${CURRENT_KEY}",
  "started_utc": "${started_ts}",
  "finished_utc": "${finished_ts}",
  "duration_seconds": ${duration_s},
  "size_bytes": ${size_bytes},
  "row_counts": ${counts_json}
}
META

gsutil -q cp "${meta_tmp}" "${CURRENT_META_KEY}"
rm -rf -- "${meta_dir}" || true

log "INFO backup uploaded object=${CURRENT_KEY} size_bytes=${size_bytes} duration_s=${duration_s}"
111+
112+
# Weekly pin: when today is Sunday in UTC (`date +%u` prints 7), copy the
# fresh backup and its metadata under weekly/<ISO year>-W<ISO week>/.
day_of_week="$(date -u +%u)"
if [[ "${day_of_week}" == "7" ]]; then
  iso_week="$(date -u +%G-W%V)"
  pin_prefix="${BUCKET}/weekly/${iso_week}"
  pin_key="${pin_prefix}/cleanapp_all.sql.gz"
  pin_meta_key="${pin_prefix}/cleanapp_all.metadata.json"

  log "INFO weekly pin start week=${iso_week}"
  gsutil -q cp "${CURRENT_KEY}" "${pin_key}"
  gsutil -q cp "${CURRENT_META_KEY}" "${pin_meta_key}"
  log "INFO weekly pin done weekly_object=${pin_key}"
fi

log "INFO backup complete env=${ENV}"
123+
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env bash
2+
# Installs the prod DB backup script + cron schedule on the prod VM.
3+
#
4+
# Also sets bucket lifecycle:
5+
# - keep 7 noncurrent versions under current/ (daily versions)
6+
# - keep weekly/ objects for 210 days (~30 weeks)
7+
set -euo pipefail
8+
9+
# ssh destination and target environment; both can be overridden through the
# caller's environment.
: "${HOST:=deployer@34.122.15.16}"
: "${ENV_NAME:=prod}"

# Only the two known environments are accepted.
case "${ENV_NAME}" in
  prod|dev)
    ;;
  *)
    echo "ENV_NAME must be prod|dev" >&2
    exit 2
    ;;
esac

# Absolute path of the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
18+
19+
# Render the bucket lifecycle policy into a temp file that is removed when the
# script exits.
tmp_lifecycle="$(mktemp)"
trap 'rm -f "$tmp_lifecycle"' EXIT

# Rule 1: under current/, delete noncurrent versions once 7 newer versions exist.
# Rule 2: under weekly/, delete objects older than 210 days.
cat >"$tmp_lifecycle" <<'LIFECYCLE_JSON'
{
  "rule": [
    {
      "action": {"type": "Delete"},
      "condition": {
        "isLive": false,
        "numNewerVersions": 7,
        "matchesPrefix": ["current/"]
      }
    },
    {
      "action": {"type": "Delete"},
      "condition": {
        "age": 210,
        "matchesPrefix": ["weekly/"]
      }
    }
  ]
}
LIFECYCLE_JSON
43+
44+
echo "== install backup script on VM =="
ssh "$HOST" "set -euo pipefail; mkdir -p /home/deployer/backups; true"
scp "$SCRIPT_DIR/backup.sh" "$HOST:/home/deployer/backup.sh"
ssh "$HOST" "set -euo pipefail; chmod +x /home/deployer/backup.sh"

echo "== ensure cron (daily 03:30 UTC) =="
# Idempotent install: drop any existing line that mentions the backup script,
# append the fresh entry, and stream the result into `crontab -`. Streaming
# avoids staging the crontab in a fixed, predictable /tmp path on the VM.
# grep -F matches the script path literally; `|| true` covers the case where
# there is no crontab yet or every line was filtered out.
# NOTE(review): cron evaluates "30 3" in the VM's local timezone; the "UTC"
# label assumes the VM clock is set to UTC -- confirm.
cron_line="30 3 * * * /home/deployer/backup.sh -e ${ENV_NAME} >> /home/deployer/backups/backup.log 2>&1"
ssh "$HOST" "set -euo pipefail; { crontab -l 2>/dev/null | grep -vF '/home/deployer/backup.sh' || true; echo '${cron_line}'; } | crontab -; crontab -l"

echo "== set bucket lifecycle =="
bucket="gs://cleanapp_mysql_backup_${ENV_NAME}"
gsutil lifecycle set "$tmp_lifecycle" "$bucket"
# Echo the policy back so the operator can see what is now in effect.
gsutil lifecycle get "$bucket"
56+
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
#!/usr/bin/env bash
2+
# Restore drill: restore the latest backup object into a scratch MySQL container and
3+
# compare row counts against the backup metadata.
4+
#
5+
# WARNING: This can take a long time and consume significant disk IO/space.
6+
set -euo pipefail
7+
8+
# ssh destination and environment; both can be overridden through the caller's
# environment.
: "${HOST:=deployer@34.122.15.16}"
: "${ENV_NAME:=prod}"

case "${ENV_NAME}" in
  prod|dev)
    ;;
  *)
    echo "ENV_NAME must be prod|dev" >&2
    exit 2
    ;;
esac

# Objects the drill reads: the current dump and its metadata sidecar.
bucket="gs://cleanapp_mysql_backup_${ENV_NAME}"
current_prefix="${bucket}/current"
obj_sql="${current_prefix}/cleanapp_all.sql.gz"
obj_meta="${current_prefix}/cleanapp_all.metadata.json"
19+
20+
ssh "$HOST" "ENV_NAME='${ENV_NAME}' OBJ_SQL='${obj_sql}' OBJ_META='${obj_meta}' bash -s" <<'REMOTE'
21+
set -euo pipefail
22+
23+
# Exit with status 1 and "missing <cmd>" on stderr unless command $1 can be found.
need() {
  if command -v "$1" >/dev/null 2>&1; then
    return 0
  fi
  echo "missing $1" >&2
  exit 1
}
24+
# Tools the drill needs on the VM.
for dep in gsutil sudo docker python3; do
  need "${dep}"
done

# docker is run through sudo, which must not prompt for a password.
sudo -n true 2>/dev/null || {
  echo "sudo requires password on VM; cannot run restore drill" >&2
  exit 1
}

# Inputs are passed in as environment variables on the ssh command line.
env_name="${ENV_NAME}"
obj_sql="${OBJ_SQL}"
obj_meta="${OBJ_META}"

echo "== restore drill: env=${env_name} obj=${obj_sql} =="
39+
40+
# Both the dump and its metadata must exist before any container is created.
gsutil -q stat "${obj_sql}" || {
  echo "backup object missing: ${obj_sql}" >&2
  exit 2
}
gsutil -q stat "${obj_meta}" || {
  echo "metadata object missing: ${obj_meta}" >&2
  exit 2
}

# Expected row counts come from the metadata JSON ("row_counts" object);
# a key that is absent reads as 0.
expected="$(gsutil cat "${obj_meta}")"
meta_count() {
  python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("row_counts",{}).get(sys.argv[1],0))' "$1" <<<"$expected"
}
exp_reports="$(meta_count reports)"
exp_analysis="$(meta_count report_analysis)"
echo "expected counts: reports=${exp_reports} report_analysis=${exp_analysis}"
53+
54+
# Timestamped names keep each drill's container and volume distinct.
ts="$(date -u +%Y%m%dT%H%M%SZ)"
name="cleanapp_db_restore_drill_${ts}"
vol="eko_mysql_restore_drill_${ts}"

# Throwaway root password for the scratch instance (32 hex characters).
root_pw="$(python3 -c 'import secrets; print(secrets.token_hex(16))')"

# Remove the scratch container and its volume. Errors are ignored so this is
# safe to run when either resource was never created.
cleanup() {
  echo "== cleanup =="
  sudo docker rm -f "${name}" >/dev/null 2>&1 || true
  sudo docker volume rm "${vol}" >/dev/null 2>&1 || true
}
# Installed BEFORE anything is created: if `docker run` fails (image pull
# error, port already bound, ...) `set -e` aborts the script, and the volume
# created just before would otherwise be left behind.
trap cleanup EXIT

echo "== create scratch mysql container =="
sudo docker volume create "${vol}" >/dev/null
# NOTE(review): nothing in this script connects through the published port
# 3307 (all access is via `docker exec`), and a fixed host port prevents two
# drills from running at once -- consider dropping the -p mapping.
sudo docker run -d --name "${name}" \
  -e MYSQL_ROOT_PASSWORD="${root_pw}" \
  -p 127.0.0.1:3307:3306 \
  -v "${vol}":/var/lib/mysql \
  mysql:8.0 \
  --default-authentication-plugin=mysql_native_password \
  --innodb_flush_log_at_trx_commit=2 \
  --sync_binlog=0 >/dev/null
77+
78+
echo "== wait for mysql ready =="
# Poll up to 120 times, 2 seconds apart. The probe connects over TCP rather
# than the default unix socket: while the official mysql image initialises a
# fresh data directory, its entrypoint runs a temporary bootstrap server that
# is reachable only through the socket and is shut down again before the real
# server starts. A socket probe can succeed against that temporary server and
# let the restore begin just before it disappears.
ready=0
for ((attempt = 1; attempt <= 120; attempt++)); do
  if sudo docker exec -e MYSQL_PWD="${root_pw}" "${name}" \
    mysql -uroot -h127.0.0.1 --protocol=TCP -e "SELECT 1" >/dev/null 2>&1; then
    ready=1
    break
  fi
  sleep 2
done
if ((ready == 0)); then
  echo "mysql did not become ready" >&2
  exit 1
fi
89+
90+
echo "== stream restore (this can take a long time) =="
# Download -> decompress -> feed the SQL to the scratch server, all streamed.
gsutil cat "${obj_sql}" \
  | gunzip -c \
  | sudo docker exec -i -e MYSQL_PWD="${root_pw}" "${name}" mysql -uroot

echo "== verify counts =="
# Print COUNT(*) for the table named by $1 as seen in the scratch server.
restored_count() {
  sudo docker exec -e MYSQL_PWD="${root_pw}" "${name}" \
    mysql -uroot -N -e "SELECT COUNT(*) FROM $1" 2>/dev/null \
    | tr -d '\r' \
    | tail -n 1
}
got_reports="$(restored_count cleanapp.reports)"
got_analysis="$(restored_count cleanapp.report_analysis)"

echo "restored counts: reports=${got_reports} report_analysis=${got_analysis}"

# The drill passes only when both restored counts equal the metadata values.
if [[ "${got_reports}" == "${exp_reports}" && "${got_analysis}" == "${exp_analysis}" ]]; then
  echo "OK restore drill: counts match metadata"
else
  echo "ERROR restore drill mismatch vs metadata" >&2
  exit 3
fi
105+
REMOTE
106+

0 commit comments

Comments
 (0)