Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,64 @@ start:
echo "View logs with: make logs"; \
exit 1

## Start only Grafana Alloy and wait for it to become healthy
# Polls the container health up to 60 times, one second apart, and exits
# non-zero if the service never reports "healthy".
# The health poll goes through $(DOCKER_COMPOSE), the same command that starts
# the service, so both steps address the same compose project.
# Requires `jq` on the PATH to read the health field from the JSON output.
.PHONY: start-alloy-only
start-alloy-only:
	@echo "Starting Grafana Alloy only..."
	@$(DOCKER_COMPOSE) up -d --build --force-recreate grafana-alloy
	@echo "Waiting for Grafana Alloy to become healthy..."
	@timeout=60; \
	while [ $$timeout -gt 0 ]; do \
		status=$$($(DOCKER_COMPOSE) ps grafana-alloy --format json | jq -r '.Health // "starting"'); \
		if [ "$$status" = "healthy" ]; then \
			echo ""; \
			echo "✅ Grafana Alloy is healthy!"; \
			echo ""; \
			echo "🚀 Grafana Alloy is ready:"; \
			echo " - gRPC endpoint: localhost:$${ALLOY_GRPC_PORT:-4317}"; \
			echo " - HTTP endpoint: localhost:$${ALLOY_HTTP_PORT:-4318}"; \
			echo ""; \
			exit 0; \
		else \
			printf "\r⏳ Waiting for Grafana Alloy... ($$timeout seconds remaining) "; \
			sleep 1; \
			timeout=$$((timeout - 1)); \
		fi; \
	done; \
	echo ""; \
	echo "⚠️ Timeout waiting for Grafana Alloy to become healthy"; \
	echo "Check service status with: docker compose ps grafana-alloy"; \
	echo "View logs with: docker compose logs grafana-alloy"; \
	exit 1

## Start only Telegraf and wait for it to become healthy
# Polls the container health up to 60 times, one second apart, and exits
# non-zero if the service never reports "healthy".
# The health poll goes through $(DOCKER_COMPOSE), the same command that starts
# the service, so both steps address the same compose project.
# Requires `jq` on the PATH to read the health field from the JSON output.
.PHONY: start-telegraf-only
start-telegraf-only:
	@echo "Starting Telegraf only..."
	@$(DOCKER_COMPOSE) up -d --build --force-recreate telegraf
	@echo "Waiting for Telegraf to become healthy..."
	@timeout=60; \
	while [ $$timeout -gt 0 ]; do \
		status=$$($(DOCKER_COMPOSE) ps telegraf --format json | jq -r '.Health // "starting"'); \
		if [ "$$status" = "healthy" ]; then \
			echo ""; \
			echo "✅ Telegraf is healthy!"; \
			echo ""; \
			echo "🚀 Telegraf is ready:"; \
			echo " - UDP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \
			echo " - TCP endpoint: localhost:$${TELEGRAF_PORT:-8094}"; \
			echo ""; \
			exit 0; \
		else \
			printf "\r⏳ Waiting for Telegraf... ($$timeout seconds remaining) "; \
			sleep 1; \
			timeout=$$((timeout - 1)); \
		fi; \
	done; \
	echo ""; \
	echo "⚠️ Timeout waiting for Telegraf to become healthy"; \
	echo "Check service status with: docker compose ps telegraf"; \
	echo "View logs with: docker compose logs telegraf"; \
	exit 1

## Start only the metrics agents (Telegraf and Grafana Alloy)
# Aggregate target with no recipe of its own: it just pulls in both agent targets.
.PHONY: start-agents-only
start-agents-only: start-telegraf-only start-alloy-only

## Stop all services
stop:
@echo "Shutting down services..."
Expand Down
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ services:
timeout: 30s
retries: 30
telegraf:
image: telegraf:1.27
image: telegraf:1.34
depends_on:
- influxdb
volumes:
Expand Down
82 changes: 64 additions & 18 deletions telegraf/telegraf.conf
Original file line number Diff line number Diff line change
@@ -1,47 +1,93 @@
[global_tags]
# This will be used for checking eligibility for alerting
# Currently there will be 3: sov-dev, sov-int-staging, sov-testnet. Only sov-testnet will have alerts
environment = "sov-dev"
# Optional, can be helpful in alerts: primary-mock-da, secondary-celestia
# role = "mock-da"
[agent]
interval = "5s"
interval = "30s"
round_interval = true
debug = true
quiet = false
metric_batch_size = 1000
metric_buffer_limit = 10000
flush_interval = "5s"
metric_batch_size = 5000
metric_buffer_limit = 50000
collection_jitter = "0s"
flush_interval = "30s"
flush_jitter = "0s"
precision = "0s"
hostname = "dev-docker"
precision = ""

[[outputs.influxdb_v2]]
# Adjust for staging/testnet environment
urls = ["http://influxdb:8086"]
organization = "sovereign"
bucket = "sov-rollup"
token = "sovereign"
insecure_skip_verify = true

# Collectors
[[inputs.socket_listener]]
service_address = "udp://:8094"
data_format = "influx"

[[inputs.socket_listener]]
service_address = "tcp://:8094"
data_format = "influx"
service_address = "tcp://:8094"
data_format = "influx"

# Basic
[[inputs.cpu]]
percpu = true
totalcpu = true

[[inputs.system]]

collect_cpu_time = false
report_active = false
core_tags = false
[[inputs.mem]]

# Storage
[[inputs.disk]]
ignore_fs = ["tmpfs", "devtmpfs"]
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
# Adjust per server
mount_points = ["/", "/mnt/rollup"]

[[inputs.diskio]]

[[inputs.filecount]]
directories = ["/mnt", "/mnt/rollup", "/mnt/rollup/state", "/mnt/rollup/accessory", "/mnt/rollup/ledger", "/mnt/rollup/preferred_sequencer"]
recursive = true
size = "100B"
# Adjust per server
directories = [ "/mnt", "/mnt/rollup/**" ]

# System
[[inputs.system]]
[[inputs.swap]]
[[inputs.kernel]]
[[inputs.interrupts]]
[[inputs.processes]]

# Network
[[inputs.net]]
ignore_protocol_stats = true
interfaces = ["*"]
ignore_protocol_stats = true
[[inputs.netstat]]

[[inputs.processes]]
# Misc
[[inputs.procstat]]
pattern = "rollup"
pid_finder = "native"
# Collect all available metrics
fieldpass = ["*"]
# Telegraf self-monitoring
[[inputs.internal]]
[[inputs.http_response]]
interval = "2s"
urls = ["http://127.0.0.1:12346/sequencer/ready"]

#[[inputs.postgresql]]
# address = "host=localhost user=rollup password=hunter2 sslmode=disable"
# Rollup, legacy
#[[inputs.prometheus]]
# urls = ["http://127.0.0.1:9845/metrics"]
# metric_version = 1
# [inputs.prometheus.tags]
# source = "sov-rollup"
# Alloy
#[[inputs.prometheus]]
# urls = ["http://127.0.0.1:12345/metrics"]
# metric_version = 2
# [inputs.prometheus.tags]
# source = "alloy"