diff --git a/framework/observability/compose/conf/process-exporter.yaml b/framework/observability/compose/conf/process-exporter.yaml
new file mode 100644
index 000000000..0fc360f38
--- /dev/null
+++ b/framework/observability/compose/conf/process-exporter.yaml
@@ -0,0 +1,4 @@
+process_names:
+  - name: "{{.Comm}}|{{ index .Cgroups 0 }}"
+    cmdline:
+    - '.+'
\ No newline at end of file
diff --git a/framework/observability/compose/conf/prometheus.yml b/framework/observability/compose/conf/prometheus.yml
index 290aefe2b..71bf47a8e 100644
--- a/framework/observability/compose/conf/prometheus.yml
+++ b/framework/observability/compose/conf/prometheus.yml
@@ -20,6 +20,14 @@ scrape_configs:
     static_configs:
       - targets:
           - cadvisor:8080
+    metric_relabel_configs:
+      # Extract hex ID from common cgroup/runtime formats:
+      # /docker/<id>, docker-<id>.scope, containerd://<id>, cri-containerd-<id>.scope, etc.
+      - source_labels: [ id ]
+        regex: '.*(?:/docker/|docker-|containerd://|containerd-|cri-containerd-|crio-)([a-f0-9]{12,64})(?:\.scope)?.*'
+        target_label: container_id
+        replacement: '$1'
+        action: replace
   - job_name: 'postgres_exporter_0'
     static_configs:
       - targets: ['postgres_exporter_0:9187']
@@ -35,3 +43,30 @@ scrape_configs:
   - job_name: 'postgres_exporter_4'
     static_configs:
       - targets: ['postgres_exporter_4:9187']
+  - job_name: 'process_exporter'
+    static_configs:
+      - targets: ['process-exporter:9256']
+    metric_relabel_configs:
+      # Extract container id from groupname so we can match it to the container
+      - source_labels: [groupname]
+        regex: "^[^|]+\\|.*/([a-f0-9]{12,64})$"
+        target_label: container_id
+        replacement: "$1"
+        action: replace
+
+      # We detect the special '/../..' tail and set container_id=host.
+      - source_labels: [groupname]
+        regex: "^[^|]+\\|/\\.\\./\\.\\.$"
+        target_label: container_id
+        replacement: "host"
+        action: replace
+
+      - source_labels: [groupname]
+        regex: "^([^|]+)\\|.*"
+        target_label: groupname
+        replacement: "$1"
+        action: replace
+  - job_name: container-sd
+    file_sd_configs:
+      - files: [ "/etc/prometheus/targets/merged.json" ]
+        refresh_interval: 15s
\ No newline at end of file
diff --git a/framework/observability/compose/docker-compose.yaml b/framework/observability/compose/docker-compose.yaml
index ae915b3cf..cda3573cb 100644
--- a/framework/observability/compose/docker-compose.yaml
+++ b/framework/observability/compose/docker-compose.yaml
@@ -45,6 +45,7 @@ services:
     volumes:
       - /var/run/docker.sock:/var/run/docker.sock
       - ./conf/prometheus.yml:/etc/prometheus/prometheus.yml
+      - sd-targets:/etc/prometheus/targets
     ports:
       - '9099:9090'

@@ -135,6 +136,35 @@
       - '9304:9187'
     restart: unless-stopped

+  process-exporter:
+    image: ncabatoff/process-exporter:latest
+    container_name: process-exporter
+    pid: host
+    privileged: true
+    volumes:
+      - /proc:/host/proc:ro
+      - ./conf/process-exporter.yaml:/config.yaml:ro
+    command:
+      - "--procfs=/host/proc"
+      - "--config.path=/config.yaml"
+    ports:
+      - "9256:9256"
+
+  sd-bridge:
+    image: alpine:3.20
+    command: [ "/bin/sh","-c","apk add --no-cache bash curl jq docker-cli && exec bash scripts/sd-bridge.sh" ]
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock:ro
+      - sd-targets:/out
+      - ./scripts:/scripts:ro
+    environment:
+      LABEL_MATCH: "prom_sd=true"
+      DISCOVERY_PATH: "/discovery"
+      DISCOVERY_PORT: "6688"
+      DISCOVERY_SCHEME: "http"
+      OUT: "/out/merged.json"
+      SLEEP: "15"
+
 volumes:
   loki_data:
   grafana_data:
@@ -142,6 +172,7 @@
   grafana_logs:
   grafana_plugins:
   tempo_data:
+  sd-targets:

 networks:
   default:
diff --git a/framework/observability/compose/scripts/sd-bridge.sh b/framework/observability/compose/scripts/sd-bridge.sh
new file mode 100644
index 000000000..938d0b637
--- /dev/null
+++ b/framework/observability/compose/scripts/sd-bridge.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+# sd-bridge.sh (simplified)
+# Discover Docker containers by label, pull each container's discovery JSON,
+# add labels, merge + de-duplicate, and write a single file_sd JSON.
+
+set -Eeuo pipefail
+
+# --- Config (env) ---
+LABEL_MATCH="${LABEL_MATCH:-framework=ctf}"
+DEFAULT_PATH="${DISCOVERY_PATH:-/discovery}"
+DEFAULT_PORT="${DISCOVERY_PORT:-6688}"
+DEFAULT_SCHEME="${DISCOVERY_SCHEME:-http}"
+PREFER_NETWORK="${NETWORK_NAME:-}"
+OUT="${OUT:-/out/merged.json}"
+SLEEP="${SLEEP:-15}"
+REQUEST_TIMEOUT="${REQUEST_TIMEOUT:-5}"
+REWRITE_TO_IP="${REWRITE_TO_IP:-0}" # set to 1 to replace host with container IP
+
+# --- Helpers ---
+log(){ printf '[sd-bridge] %s\n' "$*" >&2; }
+get_ip(){
+  local cid="$1" net="$2"
+  if [[ -n "$net" ]]; then
+    docker inspect "$cid" | jq -r --arg n "$net" '.[0].NetworkSettings.Networks[$n].IPAddress // empty'
+  else
+    docker inspect "$cid" | jq -r '.[0].NetworkSettings.Networks | to_entries[0].value.IPAddress // empty'
+  fi
+}
+get_label(){ docker inspect "$1" | jq -r --arg k "$2" '.[0].Config.Labels[$k] // empty'; }
+get_name(){ docker inspect "$1" | jq -r '.[0].Name | ltrimstr("/")'; }
+merge_and_dedupe(){
+  jq -s '
+    add // []
+    | map({targets: (.targets // []), labels: (.labels // {})})
+    | group_by(.labels)
+    | map({labels: (.[0].labels), targets: ([.[].targets[]] | unique | sort)})
+  '
+}
+atomic_write(){ local p="$1" t="$1.tmp"; cat >"$t" && mv "$t" "$p"; }
+
+# --- Init ---
+mkdir -p "$(dirname "$OUT")"
+echo '[]' | atomic_write "$OUT"
+
+# --- Main loop ---
+while true; do
+  mapfile -t cids < <(docker ps -q --filter "label=$LABEL_MATCH" || true)
+  if (( ${#cids[@]} == 0 )); then
+    echo '[]' | atomic_write "$OUT"
+    log "no matching containers; wrote empty array"
+    sleep "$SLEEP"; continue
+  fi
+
+  files=()
+  for cid in "${cids[@]}"; do
+    ip="$(get_ip "$cid" "$PREFER_NETWORK")"
+    [[ -z "$ip" ]] && { log "skip ${cid:0:12}: no IP"; continue; }
+    name="$(get_name "$cid")"
+
+    # Per-container overrides (optional)
+    path="$(get_label "$cid" prom_sd_path)"; path="${path:-$DEFAULT_PATH}"
+    port="$(get_label "$cid" prom_sd_port)"; port="${port:-$DEFAULT_PORT}"
+    scheme="$(get_label "$cid" prom_sd_scheme)"; scheme="${scheme:-$DEFAULT_SCHEME}"
+
+    url="${scheme}://${ip}:${port}${path}"
+    f="$(mktemp)"; files+=("$f")
+    if curl -fsSL --max-time "$REQUEST_TIMEOUT" "$url" | jq '.' > "$f" 2>/dev/null; then
+      # Add labels (and optionally rewrite host -> container IP)
+      if [[ "$REWRITE_TO_IP" == "1" ]]; then
+        jq --arg ip "$ip" --arg name "$name" '
+          map(
+            .targets |= map($ip + ":" + (split(":")[1])) |
+            .labels = ((.labels // {}) + {
+              container_name: $name,
+              scrape_path: (.labels.__metrics_path__ // "")
+            })
+          )
+        ' "$f" > "$f.tmp" && mv "$f.tmp" "$f"
+      else
+        jq --arg name "$name" '
+          map(
+            .labels = ((.labels // {}) + {
+              container_name: $name,
+              scrape_path: (.labels.__metrics_path__ // "")
+            })
+          )
+        ' "$f" > "$f.tmp" && mv "$f.tmp" "$f"
+      fi
+      log "ok ${url}"
+    else
+      log "fail ${url}; using []"
+      echo '[]' > "$f"
+    fi
+  done
+
+  if (( ${#files[@]} > 0 )); then
+    cat "${files[@]}" | merge_and_dedupe | atomic_write "$OUT"
+    rm -f "${files[@]}"
+    log "merged ${#files[@]} lists into $(wc -c < "$OUT") bytes"
+  else
+    echo '[]' | atomic_write "$OUT"
+  fi
+
+  sleep "$SLEEP"
+done
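
Usage sketch (illustrative, not applied by this patch; the service name and image below are placeholders): containers opt in to discovery via Docker labels. sd-bridge.sh selects containers matching LABEL_MATCH (set to prom_sd=true in this compose file) and reads optional prom_sd_path / prom_sd_port / prom_sd_scheme label overrides, so a participating service would be labelled roughly like:

    services:
      some-node:
        image: example/node:latest       # placeholder image
        labels:
          prom_sd: "true"                # matched by LABEL_MATCH="prom_sd=true"
          prom_sd_port: "6688"           # optional, defaults to DISCOVERY_PORT
          prom_sd_path: "/discovery"     # optional, defaults to DISCOVERY_PATH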
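
The bridge further assumes each selected container serves a Prometheus file_sd-style JSON array at its discovery endpoint (by default http://<container-ip>:6688/discovery); that is the shape the jq steps expect when they attach container_name and scrape_path labels and merge everything into /etc/prometheus/targets/merged.json for the container-sd job. A minimal sketch of such a payload, with hypothetical target and label values:

    [
      {
        "targets": ["172.18.0.7:6688"],
        "labels": { "job": "node", "__metrics_path__": "/metrics" }
      }
    ]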