Skip to content

Commit df61586

Browse files
committed
[telemetry] support prometheus sink
1 parent 5524f4d commit df61586

24 files changed

+2429
-419
lines changed

Cargo.lock

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/aptos-telemetry-service/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ gcp-bigquery-client = { workspace = true }
3535
jsonwebtoken = { workspace = true }
3636
once_cell = { workspace = true }
3737
prometheus = { workspace = true }
38+
prost = { workspace = true }
3839
rand = { workspace = true }
3940
rand_core = { workspace = true }
4041
reqwest = { workspace = true }
@@ -43,6 +44,7 @@ reqwest-retry = { workspace = true }
4344
serde = { workspace = true }
4445
serde_json = { workspace = true }
4546
serde_yaml = { workspace = true }
47+
snap = "1.1"
4648
thiserror = { workspace = true }
4749
tokio = { workspace = true }
4850
tracing = { workspace = true }

crates/aptos-telemetry-service/e2e-test/cleanup.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,26 @@ else
4646
echo -e "${YELLOW}No node process found${NC}"
4747
fi
4848
fi
49+
50+
# Stop telemetry service
51+
if [ -f "$TEST_DIR/telemetry.pid" ]; then
52+
TELEMETRY_PID=$(cat "$TEST_DIR/telemetry.pid")
53+
if ps -p $TELEMETRY_PID > /dev/null 2>&1; then
54+
kill $TELEMETRY_PID
55+
echo -e "${GREEN}✓ Telemetry service stopped (PID: $TELEMETRY_PID)${NC}"
56+
else
57+
echo -e "${YELLOW}Telemetry process not running${NC}"
58+
fi
59+
rm "$TEST_DIR/telemetry.pid"
60+
else
61+
# Try to find and kill any telemetry service process on port 8082
62+
if lsof -Pi :8082 -sTCP:LISTEN -t >/dev/null 2>&1; then
63+
# Kill only the process LISTENING on 8082 (same filter as the check above); a bare -i:8082 also matches clients connected to that port
kill $(lsof -Pi :8082 -sTCP:LISTEN -t) 2>/dev/null || true
64+
echo -e "${GREEN}✓ Stopped process on port 8082${NC}"
65+
else
66+
echo -e "${YELLOW}No telemetry service process found${NC}"
67+
fi
68+
fi
4969
echo ""
5070

5171
# Ask about data removal

crates/aptos-telemetry-service/e2e-test/docker-compose.yaml

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
11
# Docker Compose setup for E2E testing of Aptos Telemetry Service
2-
# Runs VictoriaMetrics and Loki locally for telemetry data ingestion testing
2+
#
3+
# Architecture:
4+
# - VictoriaMetrics: Primary metrics backend (accepts Prometheus text format via /api/v1/import/prometheus)
5+
# - Prometheus: Secondary metrics store (accepts remote write via /api/v1/write)
6+
# - Loki: Log aggregation backend
7+
# - Grafana: Visualization (queries all backends)
38
version: '3.8'
49

510
services:
6-
# VictoriaMetrics - Time-series database for metrics
11+
# VictoriaMetrics - Primary metrics backend for telemetry ingestion
12+
# Accepts Prometheus text format via /api/v1/import/prometheus (simpler than protobuf)
713
victoria-metrics:
814
image: victoriametrics/victoria-metrics:latest
915
container_name: telemetry-victoria-metrics
@@ -23,6 +29,28 @@ services:
2329
timeout: 5s
2430
retries: 3
2531

32+
# Prometheus - Secondary metrics backend for remote write
33+
prometheus:
34+
image: prom/prometheus:latest
35+
container_name: telemetry-prometheus
36+
ports:
37+
- "9090:9090" # HTTP API for queries and remote write
38+
volumes:
39+
- ./prometheus.yaml:/etc/prometheus/prometheus.yml
40+
- prometheus-data:/prometheus
41+
command:
42+
- '--config.file=/etc/prometheus/prometheus.yml'
43+
- '--storage.tsdb.path=/prometheus'
44+
- '--storage.tsdb.retention.time=30d'
45+
- '--web.enable-remote-write-receiver' # Enable receiving remote writes
46+
- '--web.enable-lifecycle' # Enable /-/reload endpoint
47+
restart: unless-stopped
48+
healthcheck:
49+
test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
50+
interval: 10s
51+
timeout: 5s
52+
retries: 3
53+
2654
# Loki - Log aggregation system
2755
loki:
2856
image: grafana/loki:3.0.0
@@ -56,11 +84,14 @@ services:
5684
restart: unless-stopped
5785
depends_on:
5886
- victoria-metrics
87+
- prometheus
5988
- loki
6089

6190
volumes:
6291
victoria-data:
6392
driver: local
93+
prometheus-data:
94+
driver: local
6495
loki-data:
6596
driver: local
6697
grafana-data:

crates/aptos-telemetry-service/e2e-test/grafana-datasources.yaml

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,16 @@
11
# Grafana datasources for E2E testing
2-
# Auto-configures VictoriaMetrics and Loki datasources
2+
#
3+
# Architecture:
4+
# - VictoriaMetrics (default): Receives telemetry metrics via text format import
5+
# - Prometheus: Secondary metrics backend (remote-write receiver; any scrape targets are defined in prometheus.yaml), provides alternative PromQL endpoint
6+
# - Loki: Receives telemetry logs
7+
#
8+
# Both VictoriaMetrics and Prometheus support PromQL, so queries work on either.
39

410
apiVersion: 1
511

612
datasources:
7-
# VictoriaMetrics (Prometheus-compatible) datasource
13+
# VictoriaMetrics - Primary metrics backend (receives telemetry via push)
814
- name: VictoriaMetrics
915
type: prometheus
1016
access: proxy
@@ -15,7 +21,19 @@ datasources:
1521
timeInterval: 10s
1622
editable: true
1723

18-
# Loki datasource
24+
# Prometheus - Secondary metrics backend (remote-write receiver; scrape targets, if any, come from prometheus.yaml)
25+
# Demonstrates multi-backend querying with same PromQL syntax
26+
- name: Prometheus
27+
type: prometheus
28+
access: proxy
29+
url: http://prometheus:9090
30+
isDefault: false
31+
jsonData:
32+
httpMethod: POST
33+
timeInterval: 15s
34+
editable: true
35+
36+
# Loki datasource - log aggregation
1937
- name: Loki
2038
type: loki
2139
access: proxy

0 commit comments

Comments
 (0)