diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 0000000..cf0a90d --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,60 @@ +name: End-to-End Test + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + workflow_dispatch: + +jobs: + end-to-end-tests: + runs-on: ubuntu-latest + timeout-minutes: 45 # Increased to accommodate recording rules wait time + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Move Docker data to larger disk + run: | + echo "=== Disk space ===" + df -h / /mnt | grep -E '(Filesystem|/dev)' + echo "" + echo "=== Stopping Docker ===" + sudo systemctl stop docker.socket docker.service containerd.service + echo "" + echo "=== Preparing /mnt for Docker data ===" + sudo mkdir -p /mnt/docker-data/docker /mnt/docker-data/containerd + echo "" + echo "=== Bind mounting to /mnt ===" + sudo mount --bind /mnt/docker-data/docker /var/lib/docker + sudo mount --bind /mnt/docker-data/containerd /var/lib/containerd + echo "" + echo "=== Starting Docker ===" + sudo systemctl start containerd.service docker.service + echo "" + echo "=== Verifying Docker works ===" + docker ps + + - name: Install Nix + uses: cachix/install-nix-action@v27 + with: + nix_path: nixpkgs=channel:nixos-unstable + + - name: Show environment info + run: | + echo "=== System Info ===" + uname -a + echo "=== Docker Info ===" + docker version + echo "=== Available space ===" + df -h | head -5 + echo "=== Nix version ===" + nix --version + + - name: Run test-1-setup.sh (15-20 min) + run: nix develop --command ./test-1-setup.sh + + - name: Run test-2-teardown.sh (10-15 min) + run: nix develop --command ./test-2-teardown.sh diff --git a/monitoring/README.md b/monitoring/README.md index 5ff75a2..f854ebc 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -66,9 +66,23 @@ You can find the dashboard under `Home > Dashboards > grafana > CloudNativePG`. > streaming is unavailable, but all other Grafana features work normally > when accessed via port-forward. +## CloudNativePG Grafana Dashboard + +[CloudNativePG provides a default dashboard](https://cloudnative-pg.io/docs/devel/quickstart#grafana-dashboard) for Grafana in the dedicated [`grafana-dashboards` repository](https://github.com/cloudnative-pg/grafana-dashboards). The CNPG Playground monitoring `setup.sh` automatically installs the CNPG dashboard into Grafana. You can also download the file [grafana-dashboard.json](https://github.com/cloudnative-pg/grafana-dashboards/blob/main/charts/cluster/grafana-dashboard.json) and manually import it via the GUI (menu: Dashboards > New > Import). + +### Dependencies + +The CNPG Playground monitoring `setup.sh` also installs and configures the dependencies of this dashboard: + +1. `node-exporter`: Node-level metrics (CPU, memory, disk, network at the host level) +2. `kube-state-metrics`: Kubernetes object metrics (pods, deployments, resource requests/limits) +3. Kubelet/cAdvisor metrics (via `/metrics/cadvisor`): Container-level metrics (CPU, memory, network, disk I/O) +4. 
Canonical **Kubernetes recording rules from [kube-prometheus](https://github.com/prometheus-operator/kube-prometheus)**, which pre-compute common aggregations used by the CloudNativePG dashboard, such as `node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate`, `node_namespace_pod_container:container_memory_working_set_bytes`, and `namespace_cpu:kube_pod_container_resource_requests:sum` + ## PodMonitor To enable Prometheus to scrape metrics from your PostgreSQL pods, you must create a `PodMonitor` resource as described in the [documentation](https://cloudnative-pg.io/documentation/current/monitoring/#creating-a-podmonitor). - + + If a monitoring stack is running, `demo/setup.sh` will automatically create PodMonitors. diff --git a/monitoring/kube-state-metrics/deployment.yaml b/monitoring/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..98480d5 --- /dev/null +++ b/monitoring/kube-state-metrics/deployment.yaml @@ -0,0 +1,93 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: ["*"] + resources: ["*"] + verbs: ["list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: kube-system +--- +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app.kubernetes.io/name: kube-state-metrics +spec: + clusterIP: None + ports: + - name: http-metrics + port: 8080 + targetPort: http-metrics + - name: telemetry + port: 8081 + targetPort: telemetry + selector: + app.kubernetes.io/name: kube-state-metrics +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app.kubernetes.io/name: kube-state-metrics +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics + template: + metadata: + labels: + app.kubernetes.io/name: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + nodeSelector: + node-role.kubernetes.io/infra: "" + containers: + - name: kube-state-metrics + image: registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.13.0 + ports: + - name: http-metrics + containerPort: 8080 + - name: telemetry + containerPort: 8081 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-state-metrics + namespace: kube-system + labels: + app.kubernetes.io/name: kube-state-metrics +spec: + endpoints: + - port: http-metrics + interval: 30s + honorLabels: true + - port: telemetry + interval: 30s + selector: + matchLabels: + app.kubernetes.io/name: kube-state-metrics diff --git a/monitoring/kube-state-metrics/kustomization.yaml b/monitoring/kube-state-metrics/kustomization.yaml new file mode 100644 index 0000000..3535738 --- /dev/null +++ b/monitoring/kube-state-metrics/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +namespace: kube-system +resources: + - deployment.yaml + diff --git a/monitoring/node-exporter/kustomization.yaml b/monitoring/node-exporter/kustomization.yaml new file mode 100644 index 0000000..bc27ffa --- /dev/null +++ b/monitoring/node-exporter/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: 
kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - node-exporter.yaml + - servicemonitor.yaml + + diff --git a/monitoring/node-exporter/node-exporter.yaml b/monitoring/node-exporter/node-exporter.yaml new file mode 100644 index 0000000..9cd0d42 --- /dev/null +++ b/monitoring/node-exporter/node-exporter.yaml @@ -0,0 +1,73 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: kube-system + labels: + app.kubernetes.io/name: node-exporter +spec: + selector: + matchLabels: + app.kubernetes.io/name: node-exporter + template: + metadata: + labels: + app.kubernetes.io/name: node-exporter + spec: + serviceAccountName: node-exporter + hostNetwork: true + hostPID: true + containers: + - name: node-exporter + image: quay.io/prometheus/node-exporter:v1.8.2 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + ports: + - containerPort: 9100 + name: metrics + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + readOnly: true + tolerations: + - operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / +--- +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: kube-system + labels: + app.kubernetes.io/name: node-exporter +spec: + clusterIP: None + ports: + - name: metrics + port: 9100 + targetPort: metrics + selector: + app.kubernetes.io/name: node-exporter diff --git a/monitoring/node-exporter/servicemonitor.yaml b/monitoring/node-exporter/servicemonitor.yaml new file mode 100644 index 0000000..f8eb324 --- /dev/null +++ b/monitoring/node-exporter/servicemonitor.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: node-exporter + namespace: kube-system + labels: + app.kubernetes.io/name: node-exporter +spec: + endpoints: + - port: metrics + interval: 30s + selector: + matchLabels: + app.kubernetes.io/name: node-exporter diff --git a/monitoring/prometheus-instance/deploy_prometheus.yaml b/monitoring/prometheus-instance/deploy_prometheus.yaml index d948cb4..34e5d01 100644 --- a/monitoring/prometheus-instance/deploy_prometheus.yaml +++ b/monitoring/prometheus-instance/deploy_prometheus.yaml @@ -12,6 +12,7 @@ rules: resources: - nodes - nodes/metrics + - nodes/proxy - services - endpoints - pods @@ -44,7 +45,7 @@ roleRef: subjects: - kind: ServiceAccount name: prometheus - namespace: default + namespace: prometheus-operator --- apiVersion: monitoring.coreos.com/v1 @@ -55,5 +56,9 @@ spec: serviceAccountName: prometheus podMonitorSelector: {} podMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + serviceMonitorNamespaceSelector: {} + ruleSelector: {} + ruleNamespaceSelector: {} nodeSelector: node-role.kubernetes.io/infra: "" diff --git a/monitoring/prometheus-instance/kustomization.yaml b/monitoring/prometheus-instance/kustomization.yaml index 831e040..8c30ad8 100644 --- a/monitoring/prometheus-instance/kustomization.yaml +++ b/monitoring/prometheus-instance/kustomization.yaml @@ -2,4 +2,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - deploy_prometheus.yaml + # Fetch upstream recording rules from kube-prometheus + - 
https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/v0.16.0/manifests/kubernetesControlPlane-prometheusRule.yaml namespace: prometheus-operator diff --git a/monitoring/prometheus-instance/servicemonitor-kubelet.yaml b/monitoring/prometheus-instance/servicemonitor-kubelet.yaml new file mode 100644 index 0000000..452d62b --- /dev/null +++ b/monitoring/prometheus-instance/servicemonitor-kubelet.yaml @@ -0,0 +1,55 @@ +apiVersion: v1 +kind: Service +metadata: + name: kubelet + namespace: kube-system + labels: + app.kubernetes.io/name: kubelet +spec: + clusterIP: None + ports: + - name: https-metrics + port: 10250 +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kubelet + namespace: kube-system + labels: + app.kubernetes.io/name: kubelet +spec: + endpoints: + - port: https-metrics + interval: 30s + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true + relabelings: + - targetLabel: metrics_path + replacement: /metrics + - port: https-metrics + interval: 30s + path: /metrics/cadvisor + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true + honorLabels: true + relabelings: + - targetLabel: metrics_path + replacement: /metrics/cadvisor + - port: https-metrics + interval: 30s + path: /metrics/probes + scheme: https + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + tlsConfig: + insecureSkipVerify: true + relabelings: + - targetLabel: metrics_path + replacement: /metrics/probes + selector: + matchLabels: + app.kubernetes.io/name: kubelet diff --git a/monitoring/prometheus-operator/kustomization.yaml b/monitoring/prometheus-operator/kustomization.yaml index 6660bcd..8688e62 100644 --- a/monitoring/prometheus-operator/kustomization.yaml +++ b/monitoring/prometheus-operator/kustomization.yaml @@ -1,5 +1,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - https://github.com/prometheus-operator/prometheus-operator/ + - https://github.com/prometheus-operator/prometheus-operator/releases/download/v0.87.1/bundle.yaml namespace: prometheus-operator diff --git a/monitoring/setup.sh b/monitoring/setup.sh index 120961c..e9ed038 100755 --- a/monitoring/setup.sh +++ b/monitoring/setup.sh @@ -41,12 +41,24 @@ for region in "${REGIONS[@]}"; do K8S_CLUSTER_NAME=$(get_cluster_name "${region}") CONTEXT_NAME=$(get_cluster_context "${region}") -# Deploy the Prometheus operator in the playground Kubernetes clusters + echo "๐Ÿ”ฅ Deploying Prometheus operator..." kubectl --context ${CONTEXT_NAME} create ns prometheus-operator || true kubectl kustomize ${GIT_REPO_ROOT}/monitoring/prometheus-operator | \ kubectl --context ${CONTEXT_NAME} apply --force-conflicts --server-side -f - + + echo "โณ Waiting for Prometheus CRDs to be ready..." + kubectl --context ${CONTEXT_NAME} wait --for condition=established --timeout=60s \ + crd/servicemonitors.monitoring.coreos.com crd/prometheusrules.monitoring.coreos.com || true -# We make sure that monitoring workloads are deployed in the infrastructure node. + echo "๐Ÿ“Š Deploying kube-state-metrics..." + kubectl kustomize ${GIT_REPO_ROOT}/monitoring/kube-state-metrics | \ + kubectl --context ${CONTEXT_NAME} apply --force-conflicts --server-side -f - + + echo "๐Ÿ–ฅ๏ธ Deploying node-exporter..." 
+ kubectl kustomize ${GIT_REPO_ROOT}/monitoring/node-exporter | \ + kubectl --context ${CONTEXT_NAME} apply --force-conflicts --server-side -f - + + echo "๐Ÿ”ง Deploying Prometheus instance and recording rules via kustomize..." kubectl kustomize ${GIT_REPO_ROOT}/monitoring/prometheus-instance | \ kubectl --context=${CONTEXT_NAME} apply --force-conflicts --server-side -f - kubectl --context=${CONTEXT_NAME} -n prometheus-operator \ @@ -54,13 +66,34 @@ for region in "${REGIONS[@]}"; do --type='merge' \ --patch='{"spec":{"template":{"spec":{"tolerations":[{"key":"node-role.kubernetes.io/infra","operator":"Exists","effect":"NoSchedule"}],"nodeSelector":{"node-role.kubernetes.io/infra":""}}}}}' + echo "๐Ÿ”ง Setting up kubelet metrics scraping..." + kubectl --context=${CONTEXT_NAME} apply -f ${GIT_REPO_ROOT}/monitoring/prometheus-instance/servicemonitor-kubelet.yaml + NODE_IPS=$(kubectl --context ${CONTEXT_NAME} get nodes -o jsonpath='{range .items[*]}{.status.addresses[?(@.type=="InternalIP")].address}{"\n"}{end}') + cat </dev/null | \ + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true -f - 2>/dev/null || true + + # Delete Grafana operator + echo "๐Ÿ“ˆ Removing Grafana operator..." + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true \ + -f https://github.com/grafana/grafana-operator/releases/latest/download/kustomize-cluster_scoped.yaml || true + + echo "๐Ÿ”ง Removing kubelet monitoring resources..." + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true \ + -f ${GIT_REPO_ROOT}/monitoring/prometheus-instance/servicemonitor-kubelet.yaml 2>/dev/null || true + + echo "๐Ÿ”ฅ Removing Prometheus resources..." + kubectl kustomize ${GIT_REPO_ROOT}/monitoring/prometheus-instance 2>/dev/null | \ + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true -f - 2>/dev/null || true + kubectl kustomize ${GIT_REPO_ROOT}/monitoring/prometheus-operator 2>/dev/null | \ + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true -f - 2>/dev/null || true + + echo "๐Ÿ–ฅ๏ธ Removing node-exporter..." + kubectl kustomize ${GIT_REPO_ROOT}/monitoring/node-exporter 2>/dev/null | \ + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true -f - 2>/dev/null || true + + echo "๐Ÿ“Š Removing kube-state-metrics..." + kubectl kustomize ${GIT_REPO_ROOT}/monitoring/kube-state-metrics 2>/dev/null | \ + kubectl --context ${CONTEXT_NAME} delete --ignore-not-found=true -f - 2>/dev/null || true + + echo "๐Ÿ—‘๏ธ Removing prometheus-operator namespace..." + kubectl --context ${CONTEXT_NAME} delete namespace prometheus-operator --ignore-not-found=true || true + + echo "โœ… Monitoring teardown complete for region: ${region}" + echo +done + +echo "โœ… Monitoring cleanup complete!" 
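After `monitoring/setup.sh` completes, the new scrape targets and the kube-prometheus recording rules can be verified directly against the Prometheus HTTP API. The following is a minimal verification sketch, not part of the scripts in this patch: it assumes `scripts/common.sh` provides `get_cluster_context` (as the test scripts below do), that the Prometheus pod is `prometheus-prometheus-0` in the `prometheus-operator` namespace, and that job names follow the Service names defined in the manifests above.

```bash
# Illustrative verification sketch; reuses the port-forward conventions from the test scripts.
source "$(git rev-parse --show-toplevel)/scripts/common.sh"
CTX=$(get_cluster_context eu)

kubectl --context "${CTX}" -n prometheus-operator \
  port-forward prometheus-prometheus-0 9090:9090 &
sleep 3

# Active scrape targets: kubelet, node-exporter and kube-state-metrics should report health "up".
curl -s 'http://localhost:9090/api/v1/targets?state=active' \
  | jq -r '.data.activeTargets[] | "\(.labels.job) \(.health)"' | sort | uniq -c

# Recording rule groups loaded from the kube-prometheus PrometheusRule manifest.
curl -s 'http://localhost:9090/api/v1/rules?type=record' \
  | jq -r '.data.groups[].name'

pkill -f "kubectl port-forward" || true
```

The same checks are automated, with waits and pass/fail accounting, in `test-1-setup.sh` below.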
+ diff --git a/test-1-setup.sh b/test-1-setup.sh new file mode 100755 index 0000000..41446f7 --- /dev/null +++ b/test-1-setup.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +# End-to-end smoke test for cnpg-playground - Part 1: Setup & Validation +# Usage: ./test-1-setup.sh OR nix develop -c ./test-1-setup.sh +set -euo pipefail + +ROOT="$(git rev-parse --show-toplevel)" +source "${ROOT}/scripts/common.sh" +export KUBECONFIG="${KUBE_CONFIG_PATH}" +PASSED=0 FAILED=0 + +pass() { echo "โœ… $*"; PASSED=$((PASSED+1)); } +fail() { echo "โŒ $*"; FAILED=$((FAILED+1)); return 1; } +log() { echo "๐Ÿงช $*"; } + +for cmd in kind kubectl curl jq docker; do command -v "$cmd" >/dev/null || { fail "Missing: $cmd"; exit 1; }; done + +cleanup() { pkill -f "kubectl port-forward" || true; } +trap cleanup EXIT + +# Setup: infrastructure โ†’ monitoring โ†’ PostgreSQL +log "Setting up infrastructure (eu, us)... start at $(date +%H:%M:%S)" +if "${ROOT}/scripts/setup.sh"; then pass "Infrastructure setup finish at $(date +%H:%M:%S)"; else fail "Infrastructure setup"; exit 1; fi + +# Test infrastructure health (MinIO API + Kubernetes API) +log "Testing infrastructure health..." +for region in eu us; do + CTX=$(get_cluster_context "${region}") + + # Test Kubernetes API + kubectl --context "${CTX}" get nodes && \ + pass "${region}: Kubernetes API responsive" || fail "${region}: Kubernetes API failed" + + # Test MinIO API + MINIO_PORT=$((MINIO_BASE_PORT + $([ "$region" = "us" ] && echo 1 || echo 0))) + MINIO_STATUS=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:${MINIO_PORT}/minio/health/live || echo "000") + [ "${MINIO_STATUS}" = "200" ] && \ + pass "${region}: MinIO API responsive" || fail "${region}: MinIO API failed (HTTP ${MINIO_STATUS})" +done + +log "Setting up monitoring stack... start at $(date +%H:%M:%S)" +if "${ROOT}/monitoring/setup.sh"; then pass "Monitoring setup finish at $(date +%H:%M:%S)"; else fail "Monitoring setup"; exit 1; fi + +# Allow time for pod creation and initial metrics scraping +log "Waiting 60 seconds for pod creation and initial metrics scrape... start wait at $(date +%H:%M:%S)" +sleep 60 + +# Test monitoring health (Prometheus + Grafana + node/system metrics) +log "Testing monitoring stack health..." +for region in eu us; do + CTX=$(get_cluster_context "${region}") + + # Wait for Prometheus pod + kubectl --context "${CTX}" wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus \ + -n prometheus-operator --timeout=90s && \ + pass "${region}: Prometheus pod ready" || fail "${region}: Prometheus pod not ready" + + # Wait for Grafana pod + kubectl --context "${CTX}" wait --for=condition=Ready pod -l app=grafana \ + -n grafana --timeout=90s && \ + pass "${region}: Grafana pod ready" || fail "${region}: Grafana pod not ready" +done + +for region in eu us; do + # Test Prometheus HTTP and metrics + CTX=$(get_cluster_context "${region}") + PORT=9090; [ "$region" = "us" ] && PORT=9091 + kubectl port-forward -n prometheus-operator prometheus-prometheus-0 ${PORT}:9090 --context "${CTX}" & + sleep 3 + + [ "$(curl -s http://localhost:${PORT}/-/ready)" = "Prometheus Server is Ready." 
] && \ + pass "${region}: Prometheus HTTP API" || fail "${region}: Prometheus HTTP failed" + + # Test node/system metrics (from kubelet/node-exporter) + NODE_METRICS=$(curl -s "http://localhost:${PORT}/api/v1/query?query=up%7Bjob%3D%22kubelet%22%7D" | jq -r '.data.result | length' || echo "0") + [ "${NODE_METRICS}" -ge 1 ] && \ + pass "${region}: Kubelet metrics available (${NODE_METRICS})" || fail "${region}: Kubelet metrics missing" + + CONTAINER_METRICS=$(curl -s "http://localhost:${PORT}/api/v1/query?query=container_cpu_usage_seconds_total" | jq -r '.data.result | length' || echo "0") + [ "${CONTAINER_METRICS}" -ge 1 ] && \ + pass "${region}: Container metrics available (${CONTAINER_METRICS})" || fail "${region}: Container metrics missing" + + # Test Grafana dashboard + GPORT=3000; [ "$region" = "us" ] && GPORT=3001 + kubectl port-forward -n grafana service/grafana-service ${GPORT}:3000 --context "${CTX}" & + sleep 3 + + DASHBOARD_COUNT=$(curl -s -u admin:admin "http://localhost:${GPORT}/api/search?query=cloudnativepg" | jq 'length' 2>/dev/null || echo "0") + [ "${DASHBOARD_COUNT}" -ge 1 ] && \ + pass "${region}: CloudNativePG dashboard exists" || fail "${region}: CloudNativePG dashboard missing" +done + +cleanup + +log "Deploying PostgreSQL clusters... start at $(date +%H:%M:%S)" +if LEGACY=true "${ROOT}/demo/setup.sh"; then pass "PostgreSQL setup finish at $(date +%H:%M:%S)"; else fail "PostgreSQL setup"; exit 1; fi + +# Test PostgreSQL health and metrics +log "Testing PostgreSQL clusters..." +for region in eu us; do + CTX=$(get_cluster_context "${region}") + log "Testing region: ${region}" + + # PostgreSQL readiness + kubectl --context "${CTX}" wait --for=condition=Ready pod -l cnpg.io/cluster=pg-${region} --timeout=30s && \ + pass "${region}: PostgreSQL ready" || fail "${region}: PostgreSQL not ready" + + # SQL query test + SQL=$(kubectl --context "${CTX}" exec pg-${region}-1 -- psql -U postgres -tAc "SELECT 1" | tr -d '\r' || echo "") + [ "${SQL}" = "1" ] && pass "${region}: SQL query OK" || fail "${region}: SQL query failed" +done + +# Test PostgreSQL metrics in Prometheus (needs time to scrape) +log "Waiting 60 seconds for PostgreSQL metrics to be scraped... start wait at $(date +%H:%M:%S)" +sleep 60 # Increased from 45s to ensure both regions have time to scrape + +cleanup # Clean any lingering port-forwards + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + PORT=9090; [ "$region" = "us" ] && PORT=9091 + kubectl port-forward -n prometheus-operator prometheus-prometheus-0 ${PORT}:9090 --context "${CTX}" & + sleep 3 + + PGMETRICS=$(curl -s "http://localhost:${PORT}/api/v1/query?query=cnpg_collector_up" | jq -r '.data.result | length' || echo "0") + [ "${PGMETRICS}" -ge 1 ] && \ + pass "${region}: PostgreSQL metrics (${PGMETRICS})" || fail "${region}: PostgreSQL metrics missing" +done + +cleanup + +# Test recording rules (needs evaluation time and historical data) +# Recording rules need ~5 minutes of metrics data for irate() to work +log "Waiting 3.5 minutes for recording rules to evaluate (data + buffer)... 
start wait at $(date +%H:%M:%S)" +sleep 210 # 3.5 minutes wait for sufficient time series data + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + PORT=9090; [ "$region" = "us" ] && PORT=9091 + + kubectl port-forward -n prometheus-operator prometheus-prometheus-0 ${PORT}:9090 --context "${CTX}" & + sleep 3 + + # Check if recording rule metrics exist + RULE_METRICS=$(curl -s "http://localhost:${PORT}/api/v1/query" --data-urlencode "query=node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate{pod=~\"pg-${region}-.*\",namespace=\"default\",node!=\"\"}" | jq -r '.data.result | length' || echo "0") + [ "${RULE_METRICS}" -ge 1 ] && \ + pass "${region}: Recording rule metrics (${RULE_METRICS})" || fail "${region}: Recording rule metrics missing" +done + +cleanup + +# Summary +log "==========================================" +log "Results: ${PASSED} passed, ${FAILED} failed" +[ ${FAILED} -eq 0 ] && { log "โœ… All tests PASSED"; exit 0; } || { log "โŒ FAILED"; exit 1; } + diff --git a/test-2-teardown.sh b/test-2-teardown.sh new file mode 100755 index 0000000..e7a9349 --- /dev/null +++ b/test-2-teardown.sh @@ -0,0 +1,94 @@ +#!/usr/bin/env bash +# End-to-end smoke test for cnpg-playground - Part 2: Teardown & Recreation +# Usage: ./test-2-teardown.sh OR nix develop -c ./test-2-teardown.sh +set -euo pipefail + +ROOT="$(git rev-parse --show-toplevel)" +source "${ROOT}/scripts/common.sh" +export KUBECONFIG="${KUBE_CONFIG_PATH}" +PASSED=0 FAILED=0 + +pass() { echo "โœ… $*"; PASSED=$((PASSED+1)); } +fail() { echo "โŒ $*"; FAILED=$((FAILED+1)); return 1; } +log() { echo "๐Ÿงช $*"; } + +for cmd in kind kubectl curl jq docker; do command -v "$cmd" >/dev/null || { fail "Missing: $cmd"; exit 1; }; done + +cleanup() { pkill -f "kubectl port-forward" || true; } +trap cleanup EXIT + +# Pre-flight check: Verify database is running before teardown tests +log "Pre-flight check: Verifying database is running..." +for region in eu us; do + CTX=$(get_cluster_context "${region}") + kubectl --context "${CTX}" wait --for=condition=Ready pod -l cnpg.io/cluster=pg-${region} --timeout=10s && \ + pass "${region}: PostgreSQL running" || { fail "${region}: PostgreSQL not ready. Run test-1-setup.sh first."; exit 1; } +done + +# Test teardown and recreation of PostgreSQL layer +log "Tearing down PostgreSQL clusters..." +"${ROOT}/demo/teardown.sh" && pass "PostgreSQL teardown" || fail "PostgreSQL teardown" +sleep 10 # extra time for pods to shut down (otherwise, the following test occasionally fails) + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + PGPODS=$(kubectl --context "${CTX}" get pods -l cnpg.io/cluster=pg-${region} --no-headers | wc -l || echo "0") + [ "${PGPODS}" -eq 0 ] && pass "${region}: PostgreSQL removed" || fail "${region}: PostgreSQL still exists" +done + +log "Recreating PostgreSQL clusters..." +if LEGACY=true "${ROOT}/demo/setup.sh"; then pass "PostgreSQL re-setup"; else fail "PostgreSQL re-setup"; exit 1; fi + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + kubectl --context "${CTX}" wait --for=condition=Ready pod -l cnpg.io/cluster=pg-${region} --timeout=30s && \ + pass "${region}: PostgreSQL ready" || fail "${region}: PostgreSQL not ready" +done + +# Test teardown and recreation of monitoring layer +log "Tearing down monitoring stack..." 
+"${ROOT}/monitoring/teardown.sh" && pass "Monitoring teardown" || fail "Monitoring teardown" + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + for ns in prometheus-operator grafana; do + ! kubectl --context "${CTX}" get namespace "${ns}" && \ + pass "${region}: ${ns} removed" || fail "${region}: ${ns} still exists" + done +done + +log "Recreating monitoring stack..." +if "${ROOT}/monitoring/setup.sh"; then pass "Monitoring re-setup"; else fail "Monitoring re-setup"; exit 1; fi + +for region in eu us; do + CTX=$(get_cluster_context "${region}") + kubectl --context "${CTX}" wait --for=condition=Ready pod -l app.kubernetes.io/name=prometheus \ + -n prometheus-operator --timeout=90s && \ + pass "${region}: Prometheus ready" || fail "${region}: Prometheus not ready" + + kubectl --context "${CTX}" wait --for=condition=Ready pod -l app=grafana \ + -n grafana --timeout=90s && \ + pass "${region}: Grafana ready" || fail "${region}: Grafana not ready" +done + +# Test infrastructure teardown +log "Tearing down infrastructure..." +"${ROOT}/scripts/teardown.sh" && pass "Infrastructure teardown" || fail "Infrastructure teardown" +for region in eu us; do + CLUSTER_NAME=$(get_cluster_name "${region}") + ! kind get clusters | grep -qx "${CLUSTER_NAME}" && \ + pass "${region}: Cluster removed" || fail "${region}: Cluster still exists" + + # Verify MinIO containers are removed + MINIO_NAME="${MINIO_BASE_NAME}-${region}" + ! docker ps -a --format '{{.Names}}' | grep -qx "${MINIO_NAME}" && \ + pass "${region}: MinIO container removed" || fail "${region}: MinIO container still exists" +done + +cleanup + +# Summary +log "==========================================" +log "Results: ${PASSED} passed, ${FAILED} failed" +[ ${FAILED} -eq 0 ] && { log "โœ… All tests PASSED"; exit 0; } || { log "โŒ FAILED"; exit 1; } +
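The README change above suggests importing `grafana-dashboard.json` manually through the GUI. The same JSON can also be pushed through Grafana's HTTP API once the service is port-forwarded. This is an illustrative sketch only: it assumes the `grafana` namespace, the `grafana-service` Service, and the default `admin:admin` credentials that `test-1-setup.sh` relies on, and it derives the raw-file URL from the repository path linked in `monitoring/README.md`.

```bash
# Hedged sketch: import the CNPG dashboard via the Grafana HTTP API instead of the GUI.
source "$(git rev-parse --show-toplevel)/scripts/common.sh"
CTX=$(get_cluster_context eu)

kubectl --context "${CTX}" -n grafana \
  port-forward service/grafana-service 3000:3000 &
sleep 3

# Fetch the dashboard JSON, null its id so Grafana treats it as new, and POST it.
curl -s https://raw.githubusercontent.com/cloudnative-pg/grafana-dashboards/main/charts/cluster/grafana-dashboard.json \
  | jq '{dashboard: (. + {id: null}), overwrite: true}' \
  | curl -s -u admin:admin -H 'Content-Type: application/json' \
      -X POST --data-binary @- http://localhost:3000/api/dashboards/db

pkill -f "kubectl port-forward" || true
```

If the import succeeds, the dashboard shows up in the `api/search?query=cloudnativepg` check that `test-1-setup.sh` performs.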