diff --git a/Makefile b/Makefile
index f62f606..d724121 100644
--- a/Makefile
+++ b/Makefile
@@ -35,6 +35,68 @@ start:
 	echo "View logs with: make logs"; \
 	exit 1
 
+## Start only Grafana Alloy; poll its health once a second, exit 1 after 60 tries
+start-alloy-only:
+	@echo "Starting Grafana Alloy only..."
+	@$(DOCKER_COMPOSE) up grafana-alloy -d --build --force-recreate
+	@echo "Waiting for Grafana Alloy to become healthy..."
+	@timeout=60; \
+	while [ $$timeout -gt 0 ]; do \
+		status=$$($(DOCKER_COMPOSE) ps grafana-alloy --format json | jq -r '.Health // "starting"'); \
+		if [ "$$status" = "healthy" ]; then \
+			echo ""; \
+			echo "✅ Grafana Alloy is healthy!"; \
+			echo ""; \
+			echo "🚀 Grafana Alloy is ready:"; \
+			echo " - gRPC endpoint: localhost:$${ALLOY_GRPC_PORT:-4317}"; \
+			echo " - HTTP endpoint: localhost:$${ALLOY_HTTP_PORT:-4318}"; \
+			echo ""; \
+			exit 0; \
+		else \
+			printf "\r⏳ Waiting for Grafana Alloy... ($$timeout seconds remaining) "; \
+			sleep 1; \
+			timeout=$$((timeout - 1)); \
+		fi; \
+	done; \
+	echo ""; \
+	echo "⚠️ Timeout waiting for Grafana Alloy to become healthy"; \
+	echo "Check service status with: docker compose ps grafana-alloy"; \
+	echo "View logs with: docker compose logs grafana-alloy"; \
+	exit 1
+
+## Start only Telegraf; poll its health once a second, exit 1 after 60 tries
+start-telegraf-only:
+	@echo "Starting Telegraf only..."
+	@$(DOCKER_COMPOSE) up telegraf -d --build --force-recreate
+	@echo "Waiting for Telegraf to become healthy..."
+	@timeout=60; \
+	while [ $$timeout -gt 0 ]; do \
+		status=$$($(DOCKER_COMPOSE) ps telegraf --format json | jq -r '.Health // "starting"'); \
+		if [ "$$status" = "healthy" ]; then \
+			echo ""; \
+			echo "✅ Telegraf is healthy!"; \
+			echo ""; \
+			echo "🚀 Telegraf is ready:"; \
+			echo " - UDP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \
+			echo " - TCP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \
+			echo ""; \
+			exit 0; \
+		else \
+			printf "\r⏳ Waiting for Telegraf... ($$timeout seconds remaining) "; \
+			sleep 1; \
+			timeout=$$((timeout - 1)); \
+		fi; \
+	done; \
+	echo ""; \
+	echo "⚠️ Timeout waiting for Telegraf to become healthy"; \
+	echo "Check service status with: docker compose ps telegraf"; \
+	echo "View logs with: docker compose logs telegraf"; \
+	exit 1
+
+## Start only the metrics agents (Telegraf and Grafana Alloy)
+start-agents-only: start-telegraf-only start-alloy-only
+.PHONY: start-alloy-only start-telegraf-only start-agents-only
+
 ## Stop all services
 stop:
 	@echo "Shutting down services..."
diff --git a/docker-compose.yml b/docker-compose.yml
index 3218222..ba748ec 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -86,7 +86,7 @@ services:
       timeout: 30s
       retries: 30
   telegraf:
-    image: telegraf:1.27
+    image: telegraf:1.34
     depends_on:
       - influxdb
     volumes:
diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf
index da99b6f..dff5a4d 100644
--- a/telegraf/telegraf.conf
+++ b/telegraf/telegraf.conf
@@ -1,47 +1,94 @@
+[global_tags]
+  # This will be used for checking eligibility for alerting
+  # Currently there will be 3: sov-dev, sov-int-staging, sov-testnet. Only sov-testnet will have alerts
+  environment = "sov-dev"
+  # Optional, can be helpful in alerts: primary-mock-da, secondary-celestia
+  # role = "mock-da"
 [agent]
-  interval = "5s"
+  interval = "30s"
   round_interval = true
-  debug = true
-  quiet = false
-  metric_batch_size = 1000
-  metric_buffer_limit = 10000
-  flush_interval = "5s"
+  metric_batch_size = 5000
+  metric_buffer_limit = 50000
+  collection_jitter = "0s"
+  flush_interval = "30s"
+  flush_jitter = "0s"
+  precision = "0s"
   hostname = "dev-docker"
-  precision = ""
 
 [[outputs.influxdb_v2]]
+  # Adjust for staging/testnet environment
   urls = ["http://influxdb:8086"]
   organization = "sovereign"
   bucket = "sov-rollup"
   token = "sovereign"
   insecure_skip_verify = true
 
+# Collectors
 [[inputs.socket_listener]]
   service_address = "udp://:8094"
   data_format = "influx"
 
 [[inputs.socket_listener]]
-    service_address = "tcp://:8094"
-    data_format = "influx"
+  service_address = "tcp://:8094"
+  data_format = "influx"
 
+# Basic
 [[inputs.cpu]]
   percpu = true
   totalcpu = true
-
-[[inputs.system]]
-
+  collect_cpu_time = false
+  report_active = false
+  core_tags = false
 [[inputs.mem]]
 
+# Storage
 [[inputs.disk]]
-  ignore_fs = ["tmpfs", "devtmpfs"]
+  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
+  # Adjust per server
   mount_points = ["/", "/mnt/rollup"]
-
 [[inputs.diskio]]
-
 [[inputs.filecount]]
-  directories = ["/mnt", "/mnt/rollup", "/mnt/rollup/state", "/mnt/rollup/accessory", "/mnt/rollup/ledger", "/mnt/rollup/preferred_sequencer"]
+  recursive = true
+  size = "100B"
+  # Adjust per server
+  directories = [ "/mnt", "/mnt/rollup/**" ]
 
+# System
+[[inputs.system]]
+[[inputs.swap]]
+[[inputs.kernel]]
+[[inputs.interrupts]]
+[[inputs.processes]]
+
+# Network
 [[inputs.net]]
-    ignore_protocol_stats = true
+  interfaces = ["*"]
+  ignore_protocol_stats = true
+[[inputs.netstat]]
 
-[[inputs.processes]]
+# Misc
+[[inputs.procstat]]
+  pattern = "rollup"
+  pid_finder = "native"
+  # Collect all available metrics
+  fieldpass = ["*"]
+# Telegraf self-monitoring
+[[inputs.internal]]
+[[inputs.http_response]]
+  interval = "2s"
+  urls = ["http://127.0.0.1:12346/sequencer/ready"]
+
+#[[inputs.postgresql]]
+#  address = "host=localhost user=rollup password=hunter2 sslmode=disable"
+# Rollup, legacy
+#[[inputs.prometheus]]
+#  urls = ["http://127.0.0.1:9845/metrics"]
+#  metric_version = 1
+#  [inputs.prometheus.tags]
+#    source = "sov-rollup"
+# Alloy
+#[[inputs.prometheus]]
+#  urls = ["http://127.0.0.1:12345/metrics"]
+#  metric_version = 2
+#  [inputs.prometheus.tags]
+#    source = "alloy"
\ No newline at end of file