Skip to content

Commit 9e73b5e

Browse files
authored
Prod DB backups: scripted + lifecycle + restore drill (#115)
* Prod DB backups: script + lifecycle + restore drill tooling * Fix DB backup/restore scripts to avoid host process-list secret exposure * Harden backup/restore docs to avoid passwords in process list
1 parent d14c8ab commit 9e73b5e

File tree

4 files changed

+338
-2
lines changed

4 files changed

+338
-2
lines changed

DEPLOYMENT.md

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,9 @@ PASS=$(grep MYSQL_APP_PASSWORD /home/deployer/.env | cut -d= -f2)
151151

152152
**Never commit .env files or print passwords in logs.**
153153

154+
Also avoid passing passwords on the command line (they can show up in `ps` output). Prefer piping
155+
secrets via stdin into a short-lived file *inside* the container when you need to run `mysql`.
156+
154157
## Quick Health Check
155158

156159
```bash
@@ -161,6 +164,39 @@ sudo docker ps --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
161164
curl -s https://live.cleanapp.io/api/v3/reports/health | jq
162165

163166
# Check database connections
164-
sudo docker exec cleanapp_db mysql -u server -p"$PASS" cleanapp \
165-
-e "SELECT COUNT(*) as connections FROM information_schema.processlist;"
167+
PASS="$(gcloud secrets versions access latest --secret=MYSQL_APP_PASSWORD_PROD)"
168+
printf '%s' "$PASS" | sudo docker exec -i cleanapp_db sh -lc '
169+
set -eu  # NOTE(review): 'pipefail' is not POSIX sh and may fail if the container's /bin/sh is dash; this snippet has no pipeline, so -eu suffices
170+
pwfile="$(mktemp)"
171+
cat >"$pwfile"
172+
chmod 600 "$pwfile"
173+
MYSQL_PWD="$(cat "$pwfile")" mysql -u server cleanapp -e "SELECT COUNT(*) as connections FROM information_schema.processlist;"
174+
rm -f "$pwfile"
175+
'
176+
```
177+
178+
## Database Backups (Prod)
179+
180+
CleanApp stores full MySQL backups in GCS:
181+
- Prod bucket: `gs://cleanapp_mysql_backup_prod`
182+
- Current object key (versioned): `gs://cleanapp_mysql_backup_prod/current/cleanapp_all.sql.gz`
183+
- Weekly pins (kept ~30 weeks): `gs://cleanapp_mysql_backup_prod/weekly/<ISO_WEEK>/cleanapp_all.sql.gz`
184+
185+
### Restore (One-Liner)
186+
187+
Restore the current backup into the running prod DB container:
188+
189+
```bash
190+
PASS="$(gcloud secrets versions access latest --secret=MYSQL_ROOT_PASSWORD_PROD)"
191+
printf '%s' "$PASS" | sudo docker exec -i cleanapp_db sh -lc 'umask 077 && cat > /tmp/.restore_pw' && \
192+
gsutil cat gs://cleanapp_mysql_backup_prod/current/cleanapp_all.sql.gz | gunzip -c | sudo docker exec -i cleanapp_db sh -lc 'MYSQL_PWD="$(cat /tmp/.restore_pw)" exec mysql -uroot' && \
193+
sudo docker exec cleanapp_db sh -lc 'rm -f /tmp/.restore_pw'
194+
```
195+
196+
### Restore Drill (Recommended)
197+
198+
To prove backups are restorable, run a restore drill into a scratch MySQL container on the prod VM:
199+
200+
```bash
201+
HOST=deployer@34.122.15.16 ./platform_blueprint/ops/db_backup/restore_drill_prod_vm.sh
166202
```
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
#!/usr/bin/env bash
# CleanApp MySQL full backup -> GCS (prod/dev)
# - Streams mysqldump -> gzip -> gsutil (no large local temp files)
# - Writes metadata JSON alongside backup
# - Weekly pin (Sundays UTC): copies current object to weekly/<ISO_WEEK>/
#
# Usage: backup.sh -e <dev|prod>
# Requires: gcloud (Secret Manager access), gsutil, gzip, passwordless sudo,
#           and a running `cleanapp_db` docker container.
set -euo pipefail

ENV=""
while [[ $# -gt 0 ]]; do
  case "$1" in
    -e|--env)
      # ${2:?...} gives a clear error instead of set -u's cryptic one when the value is missing.
      ENV="${2:?-e/--env requires a value}"; shift 2;;
    *)
      echo "Usage: $0 -e <dev|prod>" >&2
      exit 2;;
  esac
done

if [[ -z "${ENV}" ]]; then
  echo "Usage: $0 -e <dev|prod>" >&2
  exit 2
fi

case "${ENV}" in
  dev|prod) ;;
  *) echo "Invalid env: ${ENV} (expected dev|prod)" >&2; exit 2;;
esac

# Timestamped logging helper; diagnostics are redirected to stderr at call sites.
log() { echo "$(date -u +%Y-%m-%dT%H:%M:%SZ) $*"; }
need_cmd() { command -v "$1" >/dev/null 2>&1 || { log "ERROR missing command: $1" >&2; exit 1; }; }

need_cmd gcloud
need_cmd gsutil
need_cmd gzip

# Cron has no TTY: sudo must be passwordless or docker exec cannot run.
if ! sudo -n true 2>/dev/null; then
  log "ERROR sudo requires a password; cannot run docker exec" >&2
  exit 1
fi

SECRET_SUFFIX="$(echo "${ENV}" | tr '[:lower:]' '[:upper:]')"
BUCKET="gs://cleanapp_mysql_backup_${ENV}"
CURRENT_KEY="${BUCKET}/current/cleanapp_all.sql.gz"
CURRENT_META_KEY="${BUCKET}/current/cleanapp_all.metadata.json"

log "INFO backup start env=${ENV} bucket=${BUCKET}"

MYSQL_ROOT_PASSWORD="$(gcloud secrets versions access latest --secret="MYSQL_ROOT_PASSWORD_${SECRET_SUFFIX}" 2>/dev/null)" || {
  log "ERROR failed to read MySQL root password from Secret Manager" >&2
  exit 1
}

if ! sudo docker ps --format '{{.Names}}' | grep -qx cleanapp_db; then
  log "ERROR cleanapp_db container not running" >&2
  exit 1
fi

# Never pass secrets via `docker exec -e ...` (those end up visible in host `ps` output).
# Instead, write the secret into a short-lived file inside the container and reference it
# from inside the container process.
pwfile="/tmp/cleanapp_mysql_backup_pw.$$.$RANDOM"
# Host-side temp file for the metadata JSON; mktemp avoids predictable /tmp names.
meta_tmp="$(mktemp)"
cleanup() {
  sudo docker exec cleanapp_db sh -lc "rm -f '${pwfile}'" >/dev/null 2>&1 || true
  rm -f -- "${meta_tmp}" 2>/dev/null || true
}
trap cleanup EXIT

# umask 077 before the redirection: creating the file and chmod-ing it afterward
# leaves a brief window where it is readable by others.
printf '%s' "${MYSQL_ROOT_PASSWORD}" | sudo docker exec -i cleanapp_db sh -lc \
  "umask 077 && cat > '${pwfile}'" >/dev/null

# Prefer pigz (parallel gzip) when available; -1 favors throughput over ratio.
if command -v pigz >/dev/null 2>&1; then
  COMPRESS=(pigz -1)
else
  COMPRESS=(gzip -1)
fi

started_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
started_epoch="$(date +%s)"

log "INFO mysqldump stream start"
# --single-transaction: consistent InnoDB snapshot without global locks.
# --set-gtid-purged=OFF: keep the dump restorable into non-GTID targets.
# MYSQL_PWD is resolved inside the container so the password never hits host argv.
sudo docker exec -i cleanapp_db sh -lc \
  "MYSQL_PWD=\"\$(cat '${pwfile}')\" exec mysqldump -uroot \
    --all-databases \
    --single-transaction \
    --quick \
    --lock-tables=false \
    --routines --events --triggers \
    --hex-blob \
    --set-gtid-purged=OFF" \
  | "${COMPRESS[@]}" \
  | gsutil -q -o GSUtil:parallel_composite_upload_threshold=150M cp - "${CURRENT_KEY}"

finished_epoch="$(date +%s)"
finished_ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
duration_s=$((finished_epoch - started_epoch))

# First field of the first `gsutil ls -l` line is the object size in bytes.
size_bytes="$(gsutil ls -l "${CURRENT_KEY}" | awk 'NR==1{print $1}')"
size_bytes="${size_bytes:-0}"

log "INFO capturing row counts"
# Best-effort counts for the restore drill to compare against; default to 0 on error.
reports_count="$(sudo docker exec -i cleanapp_db sh -lc "MYSQL_PWD=\"\$(cat '${pwfile}')\" mysql -uroot -N -e \"SELECT COUNT(*) FROM cleanapp.reports\" 2>/dev/null" | tr -d '\r' | tail -n 1 || true)"
analysis_count="$(sudo docker exec -i cleanapp_db sh -lc "MYSQL_PWD=\"\$(cat '${pwfile}')\" mysql -uroot -N -e \"SELECT COUNT(*) FROM cleanapp.report_analysis\" 2>/dev/null" | tr -d '\r' | tail -n 1 || true)"
reports_count="${reports_count:-0}"
analysis_count="${analysis_count:-0}"
counts_json="{\"reports\":${reports_count},\"report_analysis\":${analysis_count}}"

cat >"${meta_tmp}" <<META
{
  "env": "${ENV}",
  "object": "${CURRENT_KEY}",
  "started_utc": "${started_ts}",
  "finished_utc": "${finished_ts}",
  "duration_seconds": ${duration_s},
  "size_bytes": ${size_bytes},
  "row_counts": ${counts_json}
}
META

gsutil -q cp "${meta_tmp}" "${CURRENT_META_KEY}"

log "INFO backup uploaded object=${CURRENT_KEY} size_bytes=${size_bytes} duration_s=${duration_s}"

# Sundays (ISO weekday 7) UTC: pin the current backup under weekly/<ISO_WEEK>/.
if [[ "$(date -u +%u)" == "7" ]]; then
  week="$(date -u +%G-W%V)"
  weekly_key="${BUCKET}/weekly/${week}/cleanapp_all.sql.gz"
  weekly_meta_key="${BUCKET}/weekly/${week}/cleanapp_all.metadata.json"
  log "INFO weekly pin start week=${week}"
  gsutil -q cp "${CURRENT_KEY}" "${weekly_key}"
  gsutil -q cp "${CURRENT_META_KEY}" "${weekly_meta_key}"
  log "INFO weekly pin done weekly_object=${weekly_key}"
fi

log "INFO backup complete env=${ENV}"
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
#!/usr/bin/env bash
# Installs the prod DB backup script + cron schedule on the prod VM.
#
# Also sets bucket lifecycle:
#  - keep 7 noncurrent versions under current/ (daily versions)
#  - keep weekly/ objects for 210 days (~30 weeks)
#
# Env vars: HOST (ssh target, default prod VM), ENV_NAME (prod|dev).
set -euo pipefail

HOST="${HOST:-deployer@34.122.15.16}"
ENV_NAME="${ENV_NAME:-prod}"

if [[ "${ENV_NAME}" != "prod" && "${ENV_NAME}" != "dev" ]]; then
  echo "ENV_NAME must be prod|dev" >&2
  exit 2
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Local temp file for the lifecycle JSON; removed on any exit path.
tmp_lifecycle="$(mktemp)"
trap 'rm -f "$tmp_lifecycle"' EXIT

cat >"$tmp_lifecycle" <<EOF
{
  "rule": [
    {
      "action": {"type": "Delete"},
      "condition": {
        "isLive": false,
        "numNewerVersions": 7,
        "matchesPrefix": ["current/"]
      }
    },
    {
      "action": {"type": "Delete"},
      "condition": {
        "age": 210,
        "matchesPrefix": ["weekly/"]
      }
    }
  ]
}
EOF

echo "== install backup script on VM =="
ssh "$HOST" "set -euo pipefail; mkdir -p /home/deployer/backups"
scp "$SCRIPT_DIR/backup.sh" "$HOST:/home/deployer/backup.sh"
ssh "$HOST" "set -euo pipefail; chmod +x /home/deployer/backup.sh"

echo "== ensure cron (daily 03:30 UTC) =="
# Idempotent crontab rebuild: strip any existing backup.sh entry, append the fresh
# one, and feed the result straight to `crontab -` via stdin -- no predictable
# /tmp/cron.new staging file left on the VM.
ssh "$HOST" "set -euo pipefail; { crontab -l 2>/dev/null | grep -v '/home/deployer/backup.sh' || true; echo '30 3 * * * /home/deployer/backup.sh -e ${ENV_NAME} >> /home/deployer/backups/backup.log 2>&1'; } | crontab -; crontab -l"

echo "== set bucket lifecycle =="
bucket="gs://cleanapp_mysql_backup_${ENV_NAME}"
gsutil lifecycle set "$tmp_lifecycle" "$bucket"
gsutil lifecycle get "$bucket"
56+
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
#!/usr/bin/env bash
# Restore drill: restore the latest backup object into a scratch MySQL container and
# compare row counts against the backup metadata.
#
# WARNING: This can take a long time and consume significant disk IO/space.
#
# Env vars: HOST (ssh target, default prod VM), ENV_NAME (prod|dev).
set -euo pipefail

HOST="${HOST:-deployer@34.122.15.16}"
ENV_NAME="${ENV_NAME:-prod}"

if [[ "${ENV_NAME}" != "prod" && "${ENV_NAME}" != "dev" ]]; then
  echo "ENV_NAME must be prod|dev" >&2
  exit 2
fi

bucket="gs://cleanapp_mysql_backup_${ENV_NAME}"
obj_sql="${bucket}/current/cleanapp_all.sql.gz"
obj_meta="${bucket}/current/cleanapp_all.metadata.json"

# Everything below runs on the VM; the quoted heredoc delimiter prevents local expansion.
ssh "$HOST" "ENV_NAME='${ENV_NAME}' OBJ_SQL='${obj_sql}' OBJ_META='${obj_meta}' bash -s" <<'REMOTE'
set -euo pipefail

need() { command -v "$1" >/dev/null 2>&1 || { echo "missing $1" >&2; exit 1; }; }
need gsutil
need sudo
need docker
need python3

if ! sudo -n true 2>/dev/null; then
  echo "sudo requires password on VM; cannot run restore drill" >&2
  exit 1
fi

env_name="${ENV_NAME}"
obj_sql="${OBJ_SQL}"
obj_meta="${OBJ_META}"

echo "== restore drill: env=${env_name} obj=${obj_sql} =="

if ! gsutil -q stat "${obj_sql}"; then
  echo "backup object missing: ${obj_sql}" >&2
  exit 2
fi
if ! gsutil -q stat "${obj_meta}"; then
  echo "metadata object missing: ${obj_meta}" >&2
  exit 2
fi

# Expected row counts come from the metadata JSON written at backup time.
expected="$(gsutil cat "${obj_meta}")"
exp_reports="$(python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("row_counts",{}).get("reports",0))' <<<"$expected")"
exp_analysis="$(python3 -c 'import json,sys; d=json.load(sys.stdin); print(d.get("row_counts",{}).get("report_analysis",0))' <<<"$expected")"
echo "expected counts: reports=${exp_reports} report_analysis=${exp_analysis}"

ts="$(date -u +%Y%m%dT%H%M%SZ)"
name="cleanapp_db_restore_drill_${ts}"
vol="eko_mysql_restore_drill_${ts}"

# Throwaway root password, used only by the scratch container.
root_pw="$(python3 -c 'import secrets; print(secrets.token_hex(16))')"
envfile="/tmp/${name}.env"

echo "== create scratch mysql container =="
sudo docker volume create "${vol}" >/dev/null
umask 077
printf 'MYSQL_ROOT_PASSWORD=%s\n' "${root_pw}" >"${envfile}"

cleanup() {
  echo "== cleanup =="
  sudo docker rm -f "${name}" >/dev/null 2>&1 || true
  sudo docker volume rm "${vol}" >/dev/null 2>&1 || true
  rm -f "${envfile}" >/dev/null 2>&1 || true
}
# Register cleanup BEFORE docker run so a failed container start still removes
# the volume and the env file.
trap cleanup EXIT

# Relaxed durability flags: this is a scratch restore, speed > crash safety.
sudo docker run -d --name "${name}" --env-file "${envfile}" \
  -p 127.0.0.1:3307:3306 \
  -v "${vol}":/var/lib/mysql \
  mysql:8.0 \
  --default-authentication-plugin=mysql_native_password \
  --innodb_flush_log_at_trx_commit=2 \
  --sync_binlog=0 >/dev/null

echo "== wait for mysql ready =="
for i in $(seq 1 120); do
  # Use container env var to avoid putting secrets in host-visible process args.
  if sudo docker exec "${name}" sh -lc 'mysql -uroot -p"$MYSQL_ROOT_PASSWORD" -e "SELECT 1" >/dev/null 2>&1'; then
    break
  fi
  sleep 2
  if [[ "$i" -eq 120 ]]; then
    echo "mysql did not become ready" >&2
    exit 1
  fi
done

echo "== stream restore (this can take a long time) =="
gsutil cat "${obj_sql}" | gunzip -c | sudo docker exec -i "${name}" sh -lc 'mysql -uroot -p"$MYSQL_ROOT_PASSWORD"'

echo "== verify counts =="
# FIX: the -e query must not be backslash-escaped inside the single-quoted sh -lc
# string -- \" reaches the inner shell literally, so the SQL word-splits into
# several mysql arguments and the query breaks.
got_reports="$(sudo docker exec "${name}" sh -lc 'mysql -uroot -p"$MYSQL_ROOT_PASSWORD" -N -e "SELECT COUNT(*) FROM cleanapp.reports" 2>/dev/null' | tr -d '\r' | tail -n 1)"
got_analysis="$(sudo docker exec "${name}" sh -lc 'mysql -uroot -p"$MYSQL_ROOT_PASSWORD" -N -e "SELECT COUNT(*) FROM cleanapp.report_analysis" 2>/dev/null' | tr -d '\r' | tail -n 1)"

echo "restored counts: reports=${got_reports} report_analysis=${got_analysis}"

if [[ "${got_reports}" != "${exp_reports}" || "${got_analysis}" != "${exp_analysis}" ]]; then
  echo "ERROR restore drill mismatch vs metadata" >&2
  exit 3
fi

echo "OK restore drill: counts match metadata"
REMOTE

0 commit comments

Comments
 (0)