From 22fbed38fe24659881fdf0a9cbca5a0d156996d5 Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Mon, 17 Nov 2025 17:26:02 +0100 Subject: [PATCH 1/6] Adding important metrics to telegraf --- telegraf/telegraf.conf | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-) diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf index da99b6f..af09033 100644 --- a/telegraf/telegraf.conf +++ b/telegraf/telegraf.conf @@ -21,16 +21,21 @@ data_format = "influx" [[inputs.socket_listener]] - service_address = "tcp://:8094" - data_format = "influx" + service_address = "tcp://:8094" + data_format = "influx" [[inputs.cpu]] percpu = true totalcpu = true +[[inputs.mem]] + [[inputs.system]] -[[inputs.mem]] +[[inputs.swap]] + +[[inputs.kernel]] +[[inputs.interrupts]] [[inputs.disk]] ignore_fs = ["tmpfs", "devtmpfs"] @@ -39,9 +44,28 @@ [[inputs.diskio]] [[inputs.filecount]] - directories = ["/mnt", "/mnt/rollup", "/mnt/rollup/state", "/mnt/rollup/accessory", "/mnt/rollup/ledger", "/mnt/rollup/preferred_sequencer"] + directories = ["/mnt", "/mnt/rollup", "/mnt/rollup/state", "/mnt/rollup/accessory", "/mnt/rollup/ledger", "/mnt/rollup/preferred_sequencer"] [[inputs.net]] - ignore_protocol_stats = true + ignore_protocol_stats = true + +[[inputs.netstat]] [[inputs.processes]] + +[[inputs.procstat]] + pattern = "rollup" + pid_finder = "native" + # Collect all available metrics + fieldpass = ["*"] + + +#[[inputs.http_response]] +# interval = "2s" +# urls = ["http://127.0.0.1:12346/sequencer/ready"] + +#[[inputs.prometheus]] +# urls = ["http://127.0.0.1:9845/metrics"] +# metric_version = 1 +# [inputs.prometheus.tags] +# source = "sov-rollup" \ No newline at end of file From a96be753f1d54e8e28e7284f8abcf3a1d61f22a5 Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Mon, 17 Nov 2025 17:35:02 +0100 Subject: [PATCH 2/6] Add internal stat --- telegraf/telegraf.conf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/telegraf/telegraf.conf
b/telegraf/telegraf.conf index af09033..3f3ab7e 100644 --- a/telegraf/telegraf.conf +++ b/telegraf/telegraf.conf @@ -60,6 +60,8 @@ fieldpass = ["*"] +[[inputs.internal]] + #[[inputs.http_response]] # interval = "2s" # urls = ["http://127.0.0.1:12346/sequencer/ready"] From 78b806c490d0504f94aed2de3d512868f1e3a45b Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Tue, 18 Nov 2025 12:09:26 +0100 Subject: [PATCH 3/6] Unify telegraf.conf --- telegraf/telegraf.conf | 82 +++++++++++++++++++++++++++--------------- 1 file changed, 53 insertions(+), 29 deletions(-) diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf index 3f3ab7e..cd03818 100644 --- a/telegraf/telegraf.conf +++ b/telegraf/telegraf.conf @@ -1,21 +1,30 @@ +[global_tags] + # This will be used for checking eligibility for alerting + # Currently there will be 3: sov-dev, sov-int-staging, sov-testnet. Only sov-testnet will have alerts + environment = "sov-dev" + # Optional, can be helpful in alerts: primary-mock-da, secondary-celestia + # role = "mock-da" [agent] - interval = "5s" + interval = "30s" round_interval = true - debug = true - quiet = false - metric_batch_size = 1000 - metric_buffer_limit = 10000 - flush_interval = "5s" + metric_batch_size = 5000 + metric_buffer_limit = 50000 + collection_jitter = "0s" + flush_interval = "30s" + flush_jitter = "0s" + precision = "0s" + skip_processors_after_aggregators = true hostname = "dev-docker" - precision = "" [[outputs.influxdb_v2]] + # Adjust for staging/testnet environment urls = ["http://influxdb:8086"] organization = "sovereign" bucket = "sov-rollup" token = "sovereign" insecure_skip_verify = true +# Collectors [[inputs.socket_listener]] service_address = "udp://:8094" data_format = "influx" @@ -24,50 +33,65 @@ service_address = "tcp://:8094" data_format = "influx" +# Basic [[inputs.cpu]] percpu = true totalcpu = true - + collect_cpu_time = false + report_active = false + core_tags = false [[inputs.mem]] -[[inputs.system]] - -[[inputs.swap]] -
-[[inputs.kernel]] -[[inputs.interrupts]] - +# Storage [[inputs.disk]] - ignore_fs = ["tmpfs", "devtmpfs"] + ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + # Adjust per server mount_points = ["/", "/mnt/rollup"] - [[inputs.diskio]] - [[inputs.filecount]] - directories = ["/mnt", "/mnt/rollup", "/mnt/rollup/state", "/mnt/rollup/accessory", "/mnt/rollup/ledger", "/mnt/rollup/preferred_sequencer"] + recursive = true + size = "100B" + # Adjust per server + directories = [ + "/mnt", + "/mnt/rollup/**", + ] + +# System +[[inputs.system]] +[[inputs.swap]] +[[inputs.kernel]] +[[inputs.interrupts]] +[[inputs.processes]] +# Network [[inputs.net]] + interfaces = ["*"] ignore_protocol_stats = true - [[inputs.netstat]] -[[inputs.processes]] - +# Misc [[inputs.procstat]] pattern = "rollup" pid_finder = "native" # Collect all available metrics fieldpass = ["*"] - - +# Telegraf self-monitoring [[inputs.internal]] +[[inputs.http_response]] + interval = "2s" + urls = ["http://127.0.0.1:12346/sequencer/ready"] -#[[inputs.http_response]] -# interval = "2s" -# urls = ["http://127.0.0.1:12346/sequencer/ready"] - +#[[inputs.postgresql]] +# address = "host=localhost user=rollup password=${POSTGRES_PASSWORD} sslmode=disable" +# Rollup, legacy #[[inputs.prometheus]] # urls = ["http://127.0.0.1:9845/metrics"] # metric_version = 1 # [inputs.prometheus.tags] -# source = "sov-rollup" \ No newline at end of file +# source = "sov-rollup" +# Alloy +#[[inputs.prometheus]] +# urls = ["http://127.0.0.1:12345/metrics"] +# metric_version = 2 +# [inputs.prometheus.tags] +# source = "alloy" \ No newline at end of file From 75925a6dbdb0d1a9ff90623e1d6ed01c9452e025 Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Tue, 18 Nov 2025 12:16:27 +0100 Subject: [PATCH 4/6] Bump telegraf version and fix config --- docker-compose.yml | 2 +- telegraf/telegraf.conf | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index
3218222..ba748ec 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -86,7 +86,7 @@ services: timeout: 30s retries: 30 telegraf: - image: telegraf:1.27 + image: telegraf:1.34 depends_on: - influxdb volumes: diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf index cd03818..dff5a4d 100644 --- a/telegraf/telegraf.conf +++ b/telegraf/telegraf.conf @@ -13,7 +13,6 @@ flush_interval = "30s" flush_jitter = "0s" precision = "0s" - skip_processors_after_aggregators = true hostname = "dev-docker" [[outputs.influxdb_v2]] @@ -51,10 +50,7 @@ recursive = true size = "100B" # Adjust per server - directories = [ - "/mnt", - "/mnt/rollup/**", - ] + directories = [ "/mnt", "/mnt/rollup/**" ] # System [[inputs.system]] From e8d8b82e2452ecbcb1b76e5b8aee59e8e77c44ff Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Tue, 18 Nov 2025 12:32:02 +0100 Subject: [PATCH 5/6] Adding start of only alloy --- Makefile | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/Makefile b/Makefile index f62f606..bc090de 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,34 @@ start: echo "View logs with: make logs"; \ exit 1 +start-alloy-only: + @echo "Starting Grafana Alloy only..." + @$(DOCKER_COMPOSE) up grafana-alloy -d --build --force-recreate + @echo "Waiting for Grafana Alloy to become healthy..." + @timeout=60; \ + while [ $$timeout -gt 0 ]; do \ + status=$$(docker compose ps grafana-alloy --format json | jq -r '.Health // "starting"'); \ + if [ "$$status" = "healthy" ]; then \ + echo ""; \ + echo "✅ Grafana Alloy is healthy!"; \ + echo ""; \ + echo "🚀 Grafana Alloy is ready:"; \ + echo " - gRPC endpoint: localhost:$${ALLOY_GRPC_PORT:-4317}"; \ + echo " - HTTP endpoint: localhost:$${ALLOY_HTTP_PORT:-4318}"; \ + echo ""; \ + exit 0; \ + else \ + printf "\r⏳ Waiting for Grafana Alloy... 
($$timeout seconds remaining) "; \ + sleep 1; \ + timeout=$$((timeout - 1)); \ + fi; \ + done; \ + echo ""; \ + echo "⚠️ Timeout waiting for Grafana Alloy to become healthy"; \ + echo "Check service status with: docker compose ps grafana-alloy"; \ + echo "View logs with: docker compose logs grafana-alloy"; \ + exit 1 + ## Stop all services stop: @echo "Shutting down services..." From 529ec8100bb781a579ea515688cea5ad263caaf2 Mon Sep 17 00:00:00 2001 From: Nikolai Golub Date: Tue, 18 Nov 2025 12:39:22 +0100 Subject: [PATCH 6/6] Add helpers for agents only --- Makefile | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/Makefile b/Makefile index bc090de..d724121 100644 --- a/Makefile +++ b/Makefile @@ -63,6 +63,36 @@ start-alloy-only: echo "View logs with: docker compose logs grafana-alloy"; \ exit 1 +start-telegraf-only: + @echo "Starting Telegraf only..." + @$(DOCKER_COMPOSE) up telegraf -d --build --force-recreate + @echo "Waiting for Telegraf to become healthy..." + @timeout=60; \ + while [ $$timeout -gt 0 ]; do \ + status=$$(docker compose ps telegraf --format json | jq -r '.Health // "starting"'); \ + if [ "$$status" = "healthy" ]; then \ + echo ""; \ + echo "✅ Telegraf is healthy!"; \ + echo ""; \ + echo "🚀 Telegraf is ready:"; \ + echo " - UDP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \ + echo " - TCP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \ + echo ""; \ + exit 0; \ + else \ + printf "\r⏳ Waiting for Telegraf... ($$timeout seconds remaining) "; \ + sleep 1; \ + timeout=$$((timeout - 1)); \ + fi; \ + done; \ + echo ""; \ + echo "⚠️ Timeout waiting for Telegraf to become healthy"; \ + echo "Check service status with: docker compose ps telegraf"; \ + echo "View logs with: docker compose logs telegraf"; \ + exit 1 + +start-agents-only: start-telegraf-only start-alloy-only + ## Stop all services stop: @echo "Shutting down services..."