Commit cba9722
promtorture: kind deployment and prometheus inspection
Some scripts to deploy kube-prometheus in kind, cut down to scrape just promtorture with a single replica. Adds resource limits and GOMEMLIMIT, plus scripts to query the TSDB, run promtool, etc.
1 parent dc1d719 commit cba9722

File tree

13 files changed: +86485 −24 lines

testcases/promtorture/.cache/kube-prometheus.yaml

Lines changed: 86240 additions & 0 deletions
Large diffs are not rendered by default.

testcases/promtorture/.dockerignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+scripts
+promtorture
+LICENSE
+Dockerfile
+.cache
+promtool

testcases/promtorture/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 promtorture
+promtool
+.cache

testcases/promtorture/scripts/config

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+#!/bin/bash
+kind_cluster_name=promtorture
+kubectl_context=("--context" "kind-${kind_cluster_name}")
+kubectl_bin="kubectl"
+kubectl=("${kubectl_bin}" "${kubectl_context[@]}")
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+#!/bin/bash
+#
+# Grab promtool from the prometheus release bundle
+#
+# Can't be bothered getting latest with GH API
+
+set -e -u -o pipefail
+
+prom_version=2.53.1
+prom_target=linux-amd64
+
+curl -Lf1 --output-dir .cache -O https://github.com/prometheus/prometheus/releases/download/v${prom_version}/prometheus-${prom_version}.${prom_target}.tar.gz
+tar xf ./.cache/prometheus-${prom_version}.${prom_target}.tar.gz --strip-components=1 --wildcards '*/promtool'
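Once fetched, promtool is handy for local sanity checks; for example (an illustrative use, assuming the promtorture service is reachable on localhost:8080, e.g. via kubectl port-forward):

  # lint promtorture's exposition format against Prometheus best practices
  curl -s http://localhost:8080/metrics | ./promtool check metrics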

testcases/promtorture/scripts/kind-create.sh

old mode 100644
new mode 100755
Lines changed: 63 additions & 8 deletions
@@ -1,8 +1,10 @@
 #!/bin/bash
 
-kind_cluster_name=promtorture
+set -e -u -o pipefail -x
 
-if ! grep -q "${kind_cluster_name} " <<< $(kind get clusters); then
+source scripts/config
+
+if ! grep -q "${kind_cluster_name}" <<< $(kind get clusters); then
   kind create cluster --name "${kind_cluster_name}"
 fi
 
@@ -13,14 +15,66 @@ fi
 
 # because kubectl apply still doesn't know how to wait for CRDs
 # before applying the rest...
-yq '[.items[] | select(.kind == "CustomResourceDefinition")]' .cache/kube-prometheus.yaml | kubectl apply -f -
+yq '.|select(.kind == "CustomResourceDefinition")' .cache/kube-prometheus.yaml | "${kubectl[@]}" apply --server-side -f -
 
-# Sleeps suck, but we need to wait for the CRDs to be created and don't want to
-# overcomplicate this script by looping through kubectl api-resources and checking
-# for the CRDs we need.
-sleep 1
+"${kubectl[@]}" wait --for=condition=established --timeout=60s crd --all
 
-# this'll re-apply the CRDs but that's harmless
-kubectl apply -f .cache/kube-prometheus.yaml
+# Apply everything else.
+#
+# ServiceMonitors are omitted deliberately because for this test we only want to see
+# metrics for explicitly named targets.
+#
+yq '.|select((.kind != "CustomResourceDefinition") and (.kind != "ServiceMonitor"))' .cache/kube-prometheus.yaml | "${kubectl[@]}" apply --server-side -f -
+
+# Scale down to 1 replica and don't deploy alertmanager
+"${kubectl[@]}" patch -n monitoring prometheus k8s --type merge --patch-file /dev/stdin <<'__END__'
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+spec:
+  alerting:
+    alertmanagers: []
+  replicas: 1
+  containers: # strategically merged by the operator with the default spec; see kubectl explain prometheus.spec.containers
+    - name: config-reloader
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        allowPrivilegeEscalation: false
+        privileged: false
+        readOnlyRootFilesystem: true
+        capabilities:
+          drop:
+            - ALL
+    - name: prometheus
+      securityContext:
+        runAsNonRoot: true
+        runAsUser: 1000
+        allowPrivilegeEscalation: false
+        privileged: false
+        readOnlyRootFilesystem: true
+        capabilities:
+          drop:
+            - ALL
+      env:
+        # Set GOMEMLIMIT to a high proportion of the container's memory
+        # limit, but not equal to it, so there's room for other processes,
+        # runtime overhead, error margins in different usage computation
+        # methods etc. We can refine this over time. I'm starting with 90%
+        # of the pod RAM limit since we also have a config reloader container
+        # etc. See https://pkg.go.dev/runtime
+        - name: GOMEMLIMIT
+          value: 450MiB
+      resources:
+        limits:
+          cpu: 1000m
+          memory: 500Mi
+        requests:
+          cpu: 200m
+          memory: 500Mi
+__END__
+
+"${kubectl[@]}" wait --for=condition=available --timeout=300s deployment --all -n monitoring
+"${kubectl[@]}" wait --for=condition=ready --timeout=300s pod --all -n monitoring
 
 # vim: et ts=2 sw=2 sts=2 ft=bash ai
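For the record, the 450MiB figure is just 90% of the 500Mi container limit; a quick illustrative sketch of the arithmetic (this helper is not part of the commit):

  #!/bin/bash
  # derive GOMEMLIMIT as ~90% of a container memory limit given in Mi,
  # leaving headroom for runtime overhead and the config-reloader sidecar
  limit_mi=500
  gomemlimit=$(( limit_mi * 90 / 100 ))MiB   # 500 * 90 / 100 = 450
  echo "GOMEMLIMIT=${gomemlimit}"            # -> GOMEMLIMIT=450MiB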

testcases/promtorture/scripts/kind-deploy.sh

old mode 100644
new mode 100755
Lines changed: 18 additions & 16 deletions
@@ -1,6 +1,8 @@
 #!/bin/bash
+#
+set -e -u -o pipefail -x
 
-kind_cluster_name=promtorture
+source scripts/config
 
 targets=1
 info_metrics_labels=0
@@ -29,9 +31,8 @@ docker buildx build -t promtorture .
 
 kind load docker-image promtorture --name promtorture
 
-kubectl=("kubectl", "--context", "kind-${kind_cluster_name}")
 
-"${kubectl[@]}" apply -f /dev/stdin <<__END__
+"${kubectl[@]}" apply --server-side -f /dev/stdin <<__END__
 apiVersion: apps/v1
 kind: Deployment
 metadata:
@@ -46,23 +47,23 @@ spec:
   template:
     metadata:
       labels:
-      app: promtorture
+        app: promtorture
     spec:
       containers:
       - name: promtorture
-       image: promtorture
-       imagePullPolicy: Never
-       ports:
-       - containerPort: 8080
-         name: metrics
-       args:
-       - "--port=8080"
-       - "--targets=${targets}"
-       - "--info-metrics-labels=${info_metrics_labels}"
-       - "--gauge-metrics=${gauge_metrics}"
+        image: promtorture
+        imagePullPolicy: Never
+        ports:
+          - containerPort: 8080
+            name: metrics
+        args:
+          - "--port=8080"
+          - "--targets=${targets}"
+          - "--info-metrics-labels=${info_metrics_labels}"
+          - "--gauge-metrics=${gauge_metrics}"
 __END__
 
-"${kubectl[@]}" apply -f /dev/stdin <<__END__
+"${kubectl[@]}" apply --server-side -f /dev/stdin <<__END__
 apiVersion: v1
 kind: Service
 metadata:
@@ -77,11 +78,12 @@ spec:
     name: metrics
 __END__
 
-"${kubectl[@]}" apply -f /dev/stdin <<__END__
+"${kubectl[@]}" apply --server-side -f /dev/stdin <<__END__
 kind: PodMonitor
 apiVersion: monitoring.coreos.com/v1
 metadata:
   name: promtorture
+  namespace: monitoring
   labels:
     app: promtorture
 spec:
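After a deploy it's worth confirming the operator actually picked the PodMonitor up, since it now has to live in the monitoring namespace; a quick check (the scrape-pool name follows prometheus-operator's podMonitor/<namespace>/<name>/<index> convention):

  # the PodMonitor must exist where Prometheus is watching
  kubectl --context kind-promtorture -n monitoring get podmonitor promtorture
  # then inspect the live scrape target via the meta helper in this commit
  scripts/meta -m target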

testcases/promtorture/scripts/meta

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+#!/bin/bash
+
+set -e -u -o pipefail
+
+function usage() {
+  echo "Usage: meta -m target|targetmeta|tsdb|walreplay|config|flags|runtime"
+  exit 1
+}
+
+# what to fetch
+meta=
+while getopts "m:h" opt; do
+  case ${opt} in
+    m)
+      meta=$OPTARG
+      ;;
+    \?|h|:)
+      usage
+      ;;
+  esac
+done
+
+if [ -z "${meta}" ]; then
+  usage
+fi
+
+function apiQuery() {
+  local endpoint
+  endpoint=$1
+  shift
+  curl -sL --socks5-hostname localhost:1081 -G "${@}" 'http://prometheus-k8s.monitoring.svc.cluster.local:9090'"${endpoint}" | yq --prettyPrint .
+}
+
+case "${meta}" in
+  target)
+    apiQuery "/api/v1/targets" --data 'scrapePool=podMonitor/monitoring/promtorture/0'
+    ;;
+  targetmeta)
+    apiQuery "/api/v1/targets/metadata" --data 'match_target={job="monitoring/promtorture"}'
+    ;;
+  tsdb)
+    apiQuery "/api/v1/status/tsdb"
+    ;;
+  walreplay)
+    apiQuery "/api/v1/status/walreplay"
+    ;;
+  config)
+    apiQuery "/api/v1/status/config"
+    ;;
+  flags)
+    apiQuery "/api/v1/status/flags"
+    ;;
+  runtime)
+    apiQuery "/api/v1/status/runtimeinfo"
+    ;;
+  *)
+    echo 1>&2 "Unrecognised option: ${meta}"
+    usage
+    ;;
+esac
+
+# vim: set ft=sh et ai sw=2 ts=2 sts=2:
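apiQuery assumes a SOCKS5 proxy on localhost:1081 that can resolve in-cluster service DNS names; the commit doesn't show where that proxy comes from. One way to provide it (an assumption, not part of this change; the serjs/go-socks5-proxy image listens on 1080):

  # run a throwaway SOCKS5 proxy pod and forward local port 1081 to it
  kubectl --context kind-promtorture -n monitoring run socks5 --image=serjs/go-socks5-proxy
  kubectl --context kind-promtorture -n monitoring wait --for=condition=ready pod/socks5
  kubectl --context kind-promtorture -n monitoring port-forward pod/socks5 1081:1080 &

  # then e.g. dump head/cardinality stats:
  scripts/meta -m tsdb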
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+set -e -u -o pipefail
+
+source scripts/config
+
+exec "${kubectl_bin}" stern "${kubectl_context[@]}" -n monitoring -l app.kubernetes.io/instance=k8s,app.kubernetes.io/name=prometheus "$@"
Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+#!/bin/bash
+#
+# Delete the prom pods, forcing their ephemeral storage to be deleted
+# and TSDB heads to be pruned
+#
+
+set -e -u -o pipefail
+
+source scripts/config
+
+if ! "${kubectl[@]}" delete pod -n monitoring -l app.kubernetes.io/instance=k8s,app.kubernetes.io/name=prometheus --grace-period=5 --timeout=10s
+then
+  "${kubectl[@]}" delete pod -n monitoring -l app.kubernetes.io/instance=k8s,app.kubernetes.io/name=prometheus --force
+fi
+sleep 1
+echo 1>&2 "waiting for statefulset"
+"${kubectl[@]}" wait --v 1 -n monitoring statefulset/prometheus-k8s --for=jsonpath='{.status.availableReplicas}=1' --timeout=30s
+echo 1>&2 "waiting for pod ready"
+"${kubectl[@]}" wait --v 1 -n monitoring --for=condition=ready pod -l app.kubernetes.io/instance=k8s,app.kubernetes.io/name=prometheus --timeout=30s
