Add Prometheus remote-write monitoring (service owner, Discord alert IDs, mev-boost metrics).

Notes for whoever applies this patch:
- prometheus/run.sh renders prometheus/prometheus.yml from prometheus.yml.example at container
  start, so the rendered file is git-ignored and the script must be executable (mode 100755).
- Only the Prometheus service uses run.sh as its entrypoint; Grafana keeps its image entrypoint.
- Context-line whitespace was reconstructed; use `git apply --ignore-whitespace` if a hunk is
  rejected. The blob hashes on the `index` lines were not recomputed.

diff --git a/.env.sample.holesky b/.env.sample.holesky
index e66281fa..85ebeb35 100644
--- a/.env.sample.holesky
+++ b/.env.sample.holesky
@@ -176,10 +176,10 @@ CHARON_EXECUTION_CLIENT_RPC_ENDPOINT=http://${EL}:8545
 # Loki log aggregation server addresses. Disable loki log aggregation by setting an empty address.
 #CHARON_LOKI_ADDRESSES=
 
-# Charon Cluster Name. Mandatory to send logs with Promtail.
+# Charon Cluster Name. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_NAME=
 
-# Charon Cluster Peer. Mandatory to send logs with Promtail.
+# Charon Cluster Peer. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_PEER=
 
 # Nickname to identify this charon node on monitoring (max 32 characters).
@@ -250,6 +250,18 @@ LIDODVEXIT_EXIT_EPOCH=256
 # See available tags https://github.com/prometheus/prometheus/releases.
 #PROMETHEUS_VERSION=
 
+# Prometheus remote write token used to authenticate against the external Prometheus endpoint.
+#PROM_REMOTE_WRITE_TOKEN=
+
+# Prometheus service owner used to uniquely identify the user from whom metrics are pushed.
+#SERVICE_OWNER=charon_user
+
+# To get alerted by Obol Agent monitoring on Discord, specify your Discord ID(s) below.
+# Enable developer mode in Discord via User Settings > Advanced.
+# Then right-click a user's profile picture or name and select Copy ID to get the unique 18-digit number that represents their account.
+# Separate multiple Discord IDs with commas (e.g. `ALERT_DISCORD_IDS=123456789098765432,098765432123456789`).
+#ALERT_DISCORD_IDS=""
+
 # Uncomment these if you have log exporting with Promtail
 # and want to disable log export on a particular container.
 #EL_NETHERMIND_PROMTAIL_MONITORED=false
diff --git a/.env.sample.hoodi b/.env.sample.hoodi
index 15233cc2..346e52b5 100644
--- a/.env.sample.hoodi
+++ b/.env.sample.hoodi
@@ -176,10 +176,10 @@ CHARON_EXECUTION_CLIENT_RPC_ENDPOINT=http://${EL}:8545
 # Loki log aggregation server addresses. Disable loki log aggregation by setting an empty address.
 #CHARON_LOKI_ADDRESSES=
 
-# Charon Cluster Name. Mandatory to send logs with Promtail.
+# Charon Cluster Name. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_NAME=
 
-# Charon Cluster Peer. Mandatory to send logs with Promtail.
+# Charon Cluster Peer. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_PEER=
 
 # Nickname to identify this charon node on monitoring (max 32 characters).
@@ -256,6 +256,18 @@ LIDODVEXIT_EXIT_EPOCH=256
 # See available tags https://github.com/prometheus/prometheus/releases.
 #PROMETHEUS_VERSION=
 
+# Prometheus remote write token used to authenticate against the external Prometheus endpoint.
+#PROM_REMOTE_WRITE_TOKEN=
+
+# Prometheus service owner used to uniquely identify the user from whom metrics are pushed.
+#SERVICE_OWNER=charon_user
+
+# To get alerted by Obol Agent monitoring on Discord, specify your Discord ID(s) below.
+# Enable developer mode in Discord via User Settings > Advanced.
+# Then right-click a user's profile picture or name and select Copy ID to get the unique 18-digit number that represents their account.
+# Separate multiple Discord IDs with commas (e.g. `ALERT_DISCORD_IDS=123456789098765432,098765432123456789`).
+#ALERT_DISCORD_IDS=""
+
 # Uncomment these if you have log exporting with Promtail
 # and want to disable log export on a particular container.
 #EL_NETHERMIND_PROMTAIL_MONITORED=false
diff --git a/.env.sample.mainnet b/.env.sample.mainnet
index 15673f81..76342f2a 100644
--- a/.env.sample.mainnet
+++ b/.env.sample.mainnet
@@ -176,10 +176,10 @@ CHARON_EXECUTION_CLIENT_RPC_ENDPOINT=http://${EL}:8545
 # Loki log aggregation server addresses. Disable loki log aggregation by setting an empty address.
 #CHARON_LOKI_ADDRESSES=
 
-# Charon Cluster Name. Mandatory to send logs with Promtail.
+# Charon Cluster Name. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_NAME=
 
-# Charon Cluster Peer. Mandatory to send logs with Promtail.
+# Charon Cluster Peer. Mandatory to send logs with Promtail and metrics with Prometheus.
 #CLUSTER_PEER=
 
 # Nickname to identify this charon node on monitoring (max 32 characters).
@@ -256,6 +256,18 @@ LIDODVEXIT_EXIT_EPOCH=194048
 # See available tags https://github.com/prometheus/prometheus/releases.
 #PROMETHEUS_VERSION=
 
+# Prometheus remote write token used to authenticate against the external Prometheus endpoint.
+#PROM_REMOTE_WRITE_TOKEN=
+
+# Prometheus service owner used to uniquely identify the user from whom metrics are pushed.
+#SERVICE_OWNER=charon_user
+
+# To get alerted by Obol Agent monitoring on Discord, specify your Discord ID(s) below.
+# Enable developer mode in Discord via User Settings > Advanced.
+# Then right-click a user's profile picture or name and select Copy ID to get the unique 18-digit number that represents their account.
+# Separate multiple Discord IDs with commas (e.g. `ALERT_DISCORD_IDS=123456789098765432,098765432123456789`).
+#ALERT_DISCORD_IDS=""
+
 # Uncomment these if you have log exporting with Promtail
 # and want to disable log export on a particular container.
 #EL_NETHERMIND_PROMTAIL_MONITORED=false
diff --git a/.gitignore b/.gitignore
index b9c601f4..d76459fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,3 +16,4 @@ teku/validator/
 # Teku directory for storing logs
 teku/logs/
 commit-boost/config.toml
+prometheus/prometheus.yml
diff --git a/compose-mev.yml b/compose-mev.yml
index 5e1f1b2c..944fa3aa 100644
--- a/compose-mev.yml
+++ b/compose-mev.yml
@@ -39,6 +39,8 @@ services:
       -request-timeout-getheader=${MEV_TIMEOUT_GETHEADER:-950}
       -request-timeout-getpayload=${MEV_TIMEOUT_GETPAYLOAD:-4000}
       -request-timeout-regval=${MEV_TIMEOUT_REGVAL:-3000}
+      -metrics
+      -metrics-addr=0.0.0.0:18551
     labels:
       - "promtail-monitored=${MEV_MEV_BOOST_PROMTAIL_MONITORED:-true}"
     networks: [dvnode]
diff --git a/compose-monitoring.yml b/compose-monitoring.yml
index 7623b121..ddf02012 100644
--- a/compose-monitoring.yml
+++ b/compose-monitoring.yml
@@ -8,9 +8,17 @@ services:
     image: prom/prometheus:${PROMETHEUS_VERSION:-v2.53.5}
     user: ":"
     networks: [dvnode]
+    # Values consumed by prometheus/run.sh when it renders prometheus.yml from prometheus.yml.example.
+    environment:
+      PROM_REMOTE_WRITE_TOKEN: "${PROM_REMOTE_WRITE_TOKEN:-}"
+      SERVICE_OWNER: "${SERVICE_OWNER:-}"
+      ALERT_DISCORD_IDS: "${ALERT_DISCORD_IDS:-}"
+      CLUSTER_NAME: "${CLUSTER_NAME:-}"
+      CLUSTER_PEER: "${CLUSTER_PEER:-}"
     volumes:
-      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus:/etc/prometheus
       - ./data/prometheus:/prometheus
+    entrypoint: /etc/prometheus/run.sh
     restart: unless-stopped
 
   grafana:
diff --git a/docker-compose.yml b/docker-compose.yml
index bb609f75..bfb9c454 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -84,7 +84,7 @@ services:
   #  \___|_| |_|\__,_|_|  \___/|_| |_|
 
   charon:
-    image: obolnetwork/charon:${CHARON_VERSION:-v1.8.0}
+    image: obolnetwork/charon:${CHARON_VERSION:-v1.8.1}
     environment:
       - CHARON_BEACON_NODE_ENDPOINTS=${CHARON_BEACON_NODE_ENDPOINTS:-http://lighthouse:5052}
       - CHARON_BEACON_NODE_HEADERS=${CHARON_BEACON_NODE_HEADERS:-}
@@ -160,6 +160,8 @@ services:
       -request-timeout-getheader=${MEVBOOST_TIMEOUT_GETHEADER:-900}
       -request-timeout-getpayload=${MEVBOOST_TIMEOUT_GETPAYLOAD:-4000}
       -request-timeout-regval=${MEVBOOST_TIMEOUT_REGVAL:-3000}
+      -metrics
+      -metrics-addr=0.0.0.0:18551
     labels:
       - "promtail-monitored=${MEV_BOOST_PROMTAIL_MONITORED:-true}"
     networks: [dvnode]
@@ -177,9 +179,17 @@ services:
     profiles: [""]
     user: ":"
     networks: [dvnode]
+    # Values consumed by prometheus/run.sh when it renders prometheus.yml from prometheus.yml.example.
+    environment:
+      PROM_REMOTE_WRITE_TOKEN: "${PROM_REMOTE_WRITE_TOKEN:-}"
+      SERVICE_OWNER: "${SERVICE_OWNER:-}"
+      ALERT_DISCORD_IDS: "${ALERT_DISCORD_IDS:-}"
+      CLUSTER_NAME: "${CLUSTER_NAME:-}"
+      CLUSTER_PEER: "${CLUSTER_PEER:-}"
     volumes:
-      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
+      - ./prometheus:/etc/prometheus
       - ./data/prometheus:/prometheus
+    entrypoint: /etc/prometheus/run.sh
     restart: unless-stopped
 
   grafana:
diff --git a/prometheus/prometheus.yml b/prometheus/prometheus.yml.example
similarity index 55%
rename from prometheus/prometheus.yml
rename to prometheus/prometheus.yml.example
index a5866c81..375aa9d1 100644
--- a/prometheus/prometheus.yml
+++ b/prometheus/prometheus.yml.example
@@ -1,6 +1,8 @@
 global:
   scrape_interval: 30s # Set the scrape interval to every 30 seconds.
   evaluation_interval: 30s # Evaluate rules every 30 seconds.
+  external_labels:
+    service_owner: "$SERVICE_OWNER" # Filled in by run.sh with the operator name you want to be identified by; it helps us route alerts and metrics to your notification channels.
 
 remote_write:
   - url: https://vm.monitoring.gcp.obol.tech/write
@@ -8,7 +10,7 @@ remote_write:
       credentials: $PROM_REMOTE_WRITE_TOKEN
     write_relabel_configs:
       - source_labels: [job]
-        regex: "charon|nethermind|lighthouse|lodestar"
-        action: keep # Keeps charon metrics and drop metrics from other containers.
+        regex: "charon|mev-boost"
+        action: keep # Keeps charon and mev-boost metrics and drops metrics from other containers.
 
 scrape_configs:
@@ -21,9 +23,20 @@ scrape_configs:
   - job_name: "charon"
     static_configs:
       - targets: ["charon:3620"]
+    relabel_configs:
+      - target_label: alert_discord_ids
+        replacement: "$ALERT_DISCORD_IDS"
+  - job_name: "mev-boost"
+    static_configs:
+      - targets: ["mev-mevboost:18551", "mev-boost:18551"]
+    relabel_configs:
+      - target_label: cluster_name
+        replacement: "$CLUSTER_NAME"
+      - target_label: cluster_peer
+        replacement: "$CLUSTER_PEER"
   - job_name: "lodestar"
     static_configs:
       - targets: [ "lodestar:5064" ]
   - job_name: "validator-ejector"
     static_configs:
       - targets: [ "validator-ejector:8989" ]
diff --git a/prometheus/run.sh b/prometheus/run.sh
new file mode 100755
index 00000000..2aa077b5
--- /dev/null
+++ b/prometheus/run.sh
@@ -0,0 +1,46 @@
+#!/bin/sh
+#
+# Entrypoint for the Prometheus container: renders
+# /etc/prometheus/prometheus.yml from prometheus.yml.example by replacing the
+# $VARIABLE placeholders with values from the environment, then starts
+# Prometheus with the rendered file.
+
+# Stop if rendering fails rather than starting Prometheus with a stale config.
+set -e
+
+# Default the owner to "<cluster name>-<cluster peer>", or "unknown" when
+# either part is missing.
+if [ -z "${SERVICE_OWNER:-}" ]; then
+  if [ -n "${CLUSTER_NAME:-}" ] && [ -n "${CLUSTER_PEER:-}" ]; then
+    SERVICE_OWNER="${CLUSTER_NAME}-${CLUSTER_PEER}"
+  else
+    SERVICE_OWNER="unknown"
+  fi
+  export SERVICE_OWNER
+fi
+
+if [ -z "${PROM_REMOTE_WRITE_TOKEN:-}" ]; then
+  echo "\$PROM_REMOTE_WRITE_TOKEN variable is empty" >&2
+  exit 1
+fi
+
+# Escape backslash, "&" and the "|" delimiter, which are special in a sed
+# replacement, so that values are inserted literally.
+escape() {
+  printf '%s' "$1" | sed -e 's/[\\&|]/\\&/g'
+}
+
+sed -e "s|\$PROM_REMOTE_WRITE_TOKEN|$(escape "${PROM_REMOTE_WRITE_TOKEN}")|g" \
+    -e "s|\$SERVICE_OWNER|$(escape "${SERVICE_OWNER}")|g" \
+    -e "s|\$ALERT_DISCORD_IDS|$(escape "${ALERT_DISCORD_IDS:-}")|g" \
+    -e "s|\$CLUSTER_NAME|$(escape "${CLUSTER_NAME:-}")|g" \
+    -e "s|\$CLUSTER_PEER|$(escape "${CLUSTER_PEER:-}")|g" \
+    /etc/prometheus/prometheus.yml.example > /etc/prometheus/prometheus.yml
+
+# exec makes Prometheus PID 1 so it receives container stop signals directly.
+# Overriding the image entrypoint drops the image's default arguments, so the
+# storage path is passed explicitly to keep data in the mounted /prometheus.
+exec /bin/prometheus \
+  --config.file=/etc/prometheus/prometheus.yml \
+  --storage.tsdb.path=/prometheus \
+  "$@"