From c65921fd4215bdc4895bd30d78d200c52708e7a7 Mon Sep 17 00:00:00 2001 From: Mohammed Mohsin Date: Wed, 6 May 2026 14:50:52 +0000 Subject: [PATCH 1/2] chore(scripts/migration): resumable Mixpanel raw-event export Streams data.mixpanel.com/api/2.0/export one UTC day at a time, pacing under the 60 queries/hour rate limit, into gzipped JSONL chunks for handoff to PostHog managed migration (or cold archive of the historical Mixpanel record before decommission). Designed for unattended overnight runs: - flock guard against concurrent invocations - atomic per-day write (.tmp -> rename only after successful gzip) - resume drives off disk state, not the manifest, so any crash mode (Ctrl+C, kill -9, OOM, VM reboot) leaves at most one in-flight day to redo - 5-attempt exponential backoff on 5xx/timeout, infinite backoff on 429 (does not consume retry budget), hard abort on 401/403 - 0-byte sentinel distinguishes legitimately empty days from unfetched ones - optional GCS mirror per chunk via gsutil Auth needs a Mixpanel service account (Org Settings -> Service Accounts) with Consumer role on the target project. README documents the full operational flow. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/migration/mixpanel_export.sh | 249 +++++++++++++++++++++++++++ 1 file changed, 249 insertions(+) create mode 100755 scripts/migration/mixpanel_export.sh diff --git a/scripts/migration/mixpanel_export.sh b/scripts/migration/mixpanel_export.sh new file mode 100755 index 00000000000..478dfdf5e56 --- /dev/null +++ b/scripts/migration/mixpanel_export.sh @@ -0,0 +1,249 @@ +#!/usr/bin/env bash +# +# Mixpanel raw-event export, resumable. +# +# Streams /api/2.0/export one UTC day at a time into gzipped JSONL files. +# Re-running the script skips days already on disk; it is safe to interrupt +# (Ctrl+C, kill, VM reboot) and re-launch — at most one in-flight day is +# redone. See scripts/migration/README.md for the full plan. 
+# +# Required env: +# MP_SERVICE_USER — Mixpanel service account username (..mp-service-account) +# MP_SERVICE_SECRET — service account secret +# MP_PROJECT_ID — Mixpanel project id (e.g. 3314908 for Based Hardware) +# +# Optional env: +# MP_START — first UTC day, YYYY-MM-DD (default: 2024-03-01) +# MP_END — last UTC day inclusive (default: yesterday UTC) +# MP_OUT — output directory (default: $HOME/mp-export) +# MP_GCS_BUCKET — if set, gsutil-mirror each chunk to gs://$BUCKET/mixpanel// +# MP_MIN_DISK_GB — refuse to start if free space below this (default: 20) +# MP_PACE_SECONDS — target seconds between request starts (default: 65; ≈55 req/hr) + +set -uo pipefail + +: "${MP_SERVICE_USER:?MP_SERVICE_USER not set}" +: "${MP_SERVICE_SECRET:?MP_SERVICE_SECRET not set}" +: "${MP_PROJECT_ID:?MP_PROJECT_ID not set}" + +MP_START="${MP_START:-2024-03-01}" +MP_END="${MP_END:-$(date -u -d 'yesterday' +%Y-%m-%d)}" +MP_OUT="${MP_OUT:-$HOME/mp-export}" +MP_GCS_BUCKET="${MP_GCS_BUCKET:-}" +MP_MIN_DISK_GB="${MP_MIN_DISK_GB:-20}" +MP_PACE_SECONDS="${MP_PACE_SECONDS:-65}" + +CHUNKS="$MP_OUT/chunks" +MANIFEST="$MP_OUT/manifest.jsonl" +FAILURES="$MP_OUT/failures.jsonl" +RUN_LOG="$MP_OUT/run.log" +LOCK="$MP_OUT/.lock" + +mkdir -p "$CHUNKS" +touch "$MANIFEST" "$FAILURES" "$RUN_LOG" + +# Single-instance guard. flock holds for the lifetime of FD 9. +exec 9>"$LOCK" +if ! 
flock -n 9; then + echo "another instance is running on $LOCK; abort" >&2 + exit 1 +fi + +log() { + printf '%s %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*" | tee -a "$RUN_LOG" >&2 +} + +require_cmd() { + command -v "$1" >/dev/null 2>&1 || { log "FATAL: missing required command: $1"; exit 1; } +} +require_cmd curl +require_cmd gzip +require_cmd jq +require_cmd flock +require_cmd sha256sum + +# ---- Pre-flight: disk space ---- +free_gb=$(df -P "$MP_OUT" | awk 'NR==2 {print int($4/1024/1024)}') +if (( free_gb < MP_MIN_DISK_GB )); then + log "FATAL: only ${free_gb}GB free in $MP_OUT, need ${MP_MIN_DISK_GB}GB" + exit 1 +fi + +# ---- Pre-flight: auth probe ---- +probe_day=$(date -u -d 'yesterday' +%Y-%m-%d) +log "auth probe day=$probe_day project=$MP_PROJECT_ID" +probe_status=$(curl -sG -o /dev/null -w '%{http_code}' --max-time 60 \ + -u "$MP_SERVICE_USER:$MP_SERVICE_SECRET" \ + 'https://data.mixpanel.com/api/2.0/export' \ + --data-urlencode "from_date=$probe_day" \ + --data-urlencode "to_date=$probe_day" \ + --data-urlencode "project_id=$MP_PROJECT_ID" \ + --data-urlencode 'limit=1' || true) +if [[ "$probe_status" != "200" ]]; then + log "FATAL: auth probe returned HTTP $probe_status" + exit 1 +fi +log "auth probe OK" + +# ---- Decide what to fetch ---- +# Source of truth is the disk: a day is "done" iff its file exists and is a +# valid gzip (or a 0-byte sentinel for legitimately empty days). The manifest +# is informational. This makes resume robust against any partial-write or +# crash-after-rename-before-manifest race. +is_day_done() { + local d=$1 + local f="$CHUNKS/events-$d.jsonl.gz" + [[ -f "$f" ]] || return 1 + [[ ! 
-s "$f" ]] && return 0 + gzip -t "$f" 2>/dev/null +} + +all_dates=() +d="$MP_START" +while [[ "$d" < "$MP_END" || "$d" == "$MP_END" ]]; do + all_dates+=("$d") + d=$(date -u -d "$d + 1 day" +%Y-%m-%d) +done + +todo=() +done_count=0 +for d in "${all_dates[@]}"; do + if is_day_done "$d"; then + done_count=$((done_count + 1)) + else + todo+=("$d") + fi +done + +est_hours=$(( (${#todo[@]} * MP_PACE_SECONDS + 3599) / 3600 )) +log "plan: ${#all_dates[@]} total days, $done_count already done, ${#todo[@]} to fetch" +log "estimated wallclock: ~${est_hours}h at pace ${MP_PACE_SECONDS}s/req" + +if (( ${#todo[@]} == 0 )); then + log "nothing to fetch; exiting" + exit 0 +fi + +# ---- Fetch one day with retries ---- +fetch_one() { + local d=$1 + local f="$CHUNKS/events-$d.jsonl.gz" + local raw="$f.raw.tmp" + local gz="$f.gz.tmp" + local attempts=0 + local backoff=60 + local http_status="000" + + while (( attempts < 5 )); do + attempts=$((attempts + 1)) + rm -f "$raw" "$gz" + local start_epoch + start_epoch=$(date +%s) + + http_status=$(curl -sG -o "$raw" -w '%{http_code}' --max-time 1800 \ + -u "$MP_SERVICE_USER:$MP_SERVICE_SECRET" \ + 'https://data.mixpanel.com/api/2.0/export' \ + --data-urlencode "from_date=$d" \ + --data-urlencode "to_date=$d" \ + --data-urlencode "project_id=$MP_PROJECT_ID" \ + 2>>"$RUN_LOG" || echo "000") + + case "$http_status" in + 200) + if [[ ! -s "$raw" ]]; then + # Legitimate empty day: write 0-byte sentinel. + : > "$f" + rm -f "$raw" + local elapsed=$(( $(date +%s) - start_epoch )) + printf '{"date":"%s","bytes":0,"sha256":"","lines":0,"http_status":200,"empty":true,"attempts":%d,"elapsed_s":%d,"finished_at":"%s"}\n' \ + "$d" "$attempts" "$elapsed" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$MANIFEST" + log "OK $d empty attempts=$attempts elapsed=${elapsed}s" + return 0 + fi + if ! 
gzip < "$raw" > "$gz"; then + log "FAIL $d gzip error (attempt $attempts/5)" + rm -f "$raw" "$gz" + sleep "$backoff"; backoff=$((backoff * 2)) + continue + fi + rm -f "$raw" + mv "$gz" "$f" + local bytes + bytes=$(stat -c %s "$f") + local sha + sha=$(sha256sum "$f" | awk '{print $1}') + local lines + lines=$(zcat "$f" | wc -l) + local elapsed=$(( $(date +%s) - start_epoch )) + printf '{"date":"%s","bytes":%d,"sha256":"%s","lines":%d,"http_status":200,"empty":false,"attempts":%d,"elapsed_s":%d,"finished_at":"%s"}\n' \ + "$d" "$bytes" "$sha" "$lines" "$attempts" "$elapsed" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$MANIFEST" + log "OK $d lines=$lines bytes=$bytes attempts=$attempts elapsed=${elapsed}s" + return 0 + ;; + 401|403) + log "FATAL: HTTP $http_status on $d — service account creds rejected" + rm -f "$raw" "$gz" + exit 2 + ;; + 429) + log "429 $d — backing off ${backoff}s (does not count against retry budget)" + rm -f "$raw" "$gz" + sleep "$backoff" + backoff=$((backoff * 2)) + attempts=$((attempts - 1)) + ;; + *) + log "FAIL $d HTTP $http_status (attempt $attempts/5), sleeping ${backoff}s" + rm -f "$raw" "$gz" + sleep "$backoff" + backoff=$((backoff * 2)) + ;; + esac + done + + log "GIVEUP $d after 5 attempts (last_status=$http_status)" + printf '{"date":"%s","last_status":"%s","finished_at":"%s"}\n' \ + "$d" "$http_status" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$FAILURES" + rm -f "$raw" "$gz" + return 1 +} + +# ---- Main loop with rate-limit pacing ---- +fetched=0 +failed=0 +trap 'log "interrupted; safe to re-run to resume"; exit 130' INT TERM + +for d in "${todo[@]}"; do + cycle_start=$(date +%s) + + if fetch_one "$d"; then + fetched=$((fetched + 1)) + if [[ -n "$MP_GCS_BUCKET" ]]; then + f="$CHUNKS/events-$d.jsonl.gz" + if [[ -f "$f" ]]; then + gsutil -q cp "$f" "gs://$MP_GCS_BUCKET/mixpanel/$MP_PROJECT_ID/$(basename "$f")" \ + || log "WARN gcs upload failed for $d (file is on disk locally)" + fi + fi + else + failed=$((failed + 1)) + fi + + cycle_elapsed=$(( 
$(date +%s) - cycle_start )) + remaining=$(( MP_PACE_SECONDS - cycle_elapsed )) + if (( remaining > 0 )); then + sleep "$remaining" + fi +done + +# ---- Summary ---- +total_lines=$(jq -s 'map(select(.lines != null) | .lines) | add // 0' "$MANIFEST") +total_bytes=$(jq -s 'map(select(.bytes != null) | .bytes) | add // 0' "$MANIFEST") +fail_recorded=$(wc -l < "$FAILURES" | tr -d ' ') + +log "DONE fetched_this_run=$fetched failed_this_run=$failed manifest_lines=$total_lines manifest_bytes=$total_bytes total_failures=$fail_recorded" + +if (( failed > 0 )); then + log "$failed days failed this run; re-run to retry (resume is automatic)" + exit 1 +fi From 0a6f8e62eea741f72e756196fb234d262a5e75c1 Mon Sep 17 00:00:00 2001 From: Mohammed Mohsin Date: Wed, 6 May 2026 14:51:00 +0000 Subject: [PATCH 2/2] docs(scripts/migration): operational README for mixpanel_export.sh Documents env vars, output layout, resume semantics, failure-handling matrix, run command (tmux), and post-run verification queries (coverage gaps, manifest totals vs Mixpanel insights). Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/migration/README.md | 144 ++++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 scripts/migration/README.md diff --git a/scripts/migration/README.md b/scripts/migration/README.md new file mode 100644 index 00000000000..e5e67fbb501 --- /dev/null +++ b/scripts/migration/README.md @@ -0,0 +1,144 @@ +# Mixpanel → archive migration tooling + +One-shot operational scripts for exporting raw Mixpanel events as gzipped +JSONL chunks. Output is suitable for handoff to PostHog managed migration +(via S3 / GCS interop) or for cold-storage of the historical Mixpanel record +once the project is decommissioned. + +This directory does not get imported by any service; it exists for +reproducibility and audit trail. 
+ +--- + +## `mixpanel_export.sh` + +Streams `data.mixpanel.com/api/2.0/export` one UTC day at a time, writes each +day to `$MP_OUT/chunks/events-YYYY-MM-DD.jsonl.gz`, paces requests under the +documented 60 queries/hour rate limit, and is safe to interrupt and resume. + +### Required env + +| Var | Value | +|---|---| +| `MP_SERVICE_USER` | Mixpanel service account username (`<user>.<org>.mp-service-account`) | +| `MP_SERVICE_SECRET` | Service account secret (shown once at creation) | +| `MP_PROJECT_ID` | Project id — `3314908` for Based Hardware | + +Service account is created at **Mixpanel → Organization Settings → Service +Accounts → New**, with **Consumer** role on the target project. Read-only is +sufficient for `/export`. + +### Optional env + +| Var | Default | Notes | +|---|---|---| +| `MP_START` | `2024-03-01` | First UTC day, inclusive | +| `MP_END` | yesterday UTC | Last UTC day, inclusive | +| `MP_OUT` | `$HOME/mp-export` | Output directory; persistent disk | +| `MP_GCS_BUCKET` | unset | If set, `gsutil cp` each chunk to `gs://$MP_GCS_BUCKET/mixpanel/$MP_PROJECT_ID/` after it lands | +| `MP_MIN_DISK_GB` | `20` | Refuse to start if free space is below this | +| `MP_PACE_SECONDS` | `65` | Target cycle time per request (≈55 req/hr at 65s) | + +### Layout produced + +``` +$MP_OUT/ +├── chunks/events-YYYY-MM-DD.jsonl.gz one file per UTC day; 0-byte file = empty day +├── manifest.jsonl append-only log: {date, bytes, sha256, lines, ...} +├── failures.jsonl days that exhausted 5 retries; auto-retried on next run +├── run.log full timestamped log +└── .lock flock guard against concurrent invocations +``` + +### Resume semantics + +Source of truth is the disk, not the manifest. A day is "done" iff its +gzip file exists and either: + +* is 0 bytes (sentinel for legitimately empty days), or +* passes `gzip -t` integrity check. + +Re-running the script: + +1. Acquires the flock — refuses to start if another instance holds it. +2. Iterates `[MP_START, MP_END]` and skips days that pass `is_day_done`. +3. 
Re-fetches everything else, including any `failures.jsonl` entries from + prior runs (they are not "done" on disk, so they get picked up + automatically). + +This is robust against: + +* Ctrl+C / `kill` / OOM / VM reboot mid-stream — the in-flight day's `.tmp` + is discarded; final file appears only after `mv`. +* Crash between `mv` and manifest append — file is "done" on disk, but + manifest is missing the entry. Next run sees the file and skips correctly; + manifest gets a coverage gap (acceptable, run.log is the audit trail). +* Truncated / corrupted final files — `gzip -t` catches them, day is + redownloaded. + +### Failure handling + +| HTTP / signal | Action | +|---|---| +| `200` non-empty | Save chunk, append manifest entry, advance | +| `200` empty body | Save 0-byte sentinel, manifest entry with `empty=true` | +| `429` | Sleep with exponential backoff, retry same day, does **not** count against 5-attempt budget | +| `5xx`, timeout, connection reset | Exponential backoff (60s, 120s, 240s, 480s, 960s), max 5 attempts | +| `401`, `403` | Hard abort (`exit 2`) — creds are bad, no point continuing | +| Disk full / `gzip` error | Per-day failure, retried; if persistent the run fails loudly | + +Days that exhaust retries are logged to `failures.jsonl` and the run continues +to the next day. Re-running the script picks them up automatically. + +### Running it + +Inside a named tmux session so it survives SSH disconnects: + +```bash +export MP_SERVICE_USER='posthog-migration.xxx.mp-service-account' +export MP_SERVICE_SECRET='...' 
+export MP_PROJECT_ID=3314908 +# optionally: export MP_GCS_BUCKET=omi-mixpanel-archive + +tmux new -d -s mp-export 'bash scripts/migration/mixpanel_export.sh' +``` + +Read-only attach to watch progress: + +```bash +tmux attach -r -t mp-export +``` + +Or check progress without attaching: + +```bash +wc -l "$HOME/mp-export/manifest.jsonl" # days completed +tail -n 5 "$HOME/mp-export/run.log" # recent activity +ls "$HOME/mp-export/chunks" | wc -l # files on disk +du -sh "$HOME/mp-export/chunks" # disk usage +``` + +### Wallclock estimate + +For Mar 2024 → today (≈793 days) at the default 65s pace: **~14h** unattended. +Failures and retries extend it. Resume on next run is exact — no work is +redone. + +### Verification after the run + +```bash +# Total events imported per manifest: +jq -s 'map(.lines // 0) | add' "$HOME/mp-export/manifest.jsonl" + +# Compare against Mixpanel insights total for the same window: +# (run from Mixpanel UI or MCP Run-Query / $any_event total math) + +# Spot-check a random chunk: +zcat "$HOME/mp-export/chunks/events-2026-04-01.jsonl.gz" | wc -l +zcat "$HOME/mp-export/chunks/events-2026-04-01.jsonl.gz" | head -1 | jq + +# Find missing days (gaps in coverage): +ls "$HOME/mp-export/chunks" | sed 's/events-//;s/.jsonl.gz//' | sort > /tmp/got +seq -f '%g' 0 792 | while read i; do date -u -d "2024-03-01 + $i day" +%F; done > /tmp/want +diff /tmp/want /tmp/got | head +```