feat(profiling): Integrate profiling into tap-agent service

neithanmo · neithanmo · commit ae552a06d8bf · 2025-04-27T16:59:10.000-06:00
diff --git a/contrib/docker-compose.dev.yml b/contrib/docker-compose.dev.yml
@@ -32,7 +32,6 @@ services:
         condition: service_healthy
     volumes:
       - ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent
-      - ./tap-agent/start.sh:/usr/local/bin/start.sh
       - ./tap-agent:/opt/config:ro
       - ./local-network/.env:/opt/.env:ro
       - ./local-network/contracts.json:/opt/contracts.json:ro
@@ -44,6 +43,8 @@ services:
     ports:
       # to expose the metrics port
       - "7300:7300"
+    networks:
+      - local-network
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
       interval: 5s
diff --git a/contrib/docker-compose.prof.yml b/contrib/docker-compose.prof.yml
@@ -3,12 +3,11 @@ services:
     image: indexer-base:latest
     build:
       context: ..
-      # dockerfile: contrib/indexer-service/Dockerfile
     container_name: indexer-service
     volumes:
       - ./local-network/contracts.json:/opt/contracts.json:ro
       - ./local-network/.env:/opt/.env:ro
-      - ./profiling/output:/opt/profiling:rw
+      - ./profiling:/opt/profiling:rw
       - ./indexer-service/config.toml:/opt/config/config.toml
       - ./indexer-service/start-perf.sh:/usr/local/bin/start-perf.sh
       - ../migrations:/opt/migrations:ro
@@ -22,6 +21,48 @@ services:
       - "7601:7601"
     networks:
       - local-network
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:7601/"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+      start_period: 10s
+    cap_add:
+      - SYS_ADMIN
+    privileged: true
+    security_opt:
+      - seccomp:unconfined
+
+  tap-agent:
+    image: indexer-base:latest # Pre-built base image with dependencies
+    container_name: tap-agent
+    # depends_on:
+    #   indexer-service:
+    #     condition: service_healthy
+    volumes:
+      - ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent
+      - ./tap-agent/start-perf.sh:/usr/local/bin/start-perf.sh
+      - ./tap-agent:/opt/config:ro
+      - ./profiling:/opt/profiling:rw
+      - ./local-network/.env:/opt/.env:ro
+      - ./local-network/contracts.json:/opt/contracts.json:ro
+      - ../migrations:/opt/migrations:ro
+    entrypoint: ["/usr/local/bin/start-perf.sh"]
+    environment:
+      - RUST_BACKTRACE=1
+      - RUST_LOG=debug
+      - PROFILER=${PROFILER:-flamegraph}
+    ports:
+      # to expose the metrics port
+      - "7300:7300"
+    networks:
+      - local-network
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
+      interval: 5s
+      timeout: 3s
+      retries: 10
+      start_period: 10s
     cap_add:
       - SYS_ADMIN
     privileged: true
diff --git a/contrib/docker-compose.yml b/contrib/docker-compose.yml
@@ -44,6 +44,8 @@ services:
     ports:
       # to expose the metrics port
       - "7300:7300"
+    networks:
+      - local-network
     healthcheck:
       test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
       interval: 5s
diff --git a/contrib/indexer-service/start-perf.sh b/contrib/indexer-service/start-perf.sh
@@ -13,12 +13,6 @@ VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
 # Override with test values taken from test-assets/src/lib.rs
 ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
 
-# Wait for postgres to be ready
-until pg_isready -h postgres -U postgres -d indexer_components_1; do
-    stdbuf -oL echo "Waiting for postgres..."
-    sleep 2
-done
-
 # Get network subgraph deployment ID
 NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
     -H 'content-type: application/json' \
@@ -67,8 +61,9 @@ export RUST_BACKTRACE=full
 export RUST_LOG="${RUST_LOG:-trace}"
 
 # Create output directory if it doesn't exist
-mkdir -p /opt/profiling
+mkdir -p /opt/profiling/indexer-service
 chmod 777 /opt/profiling
+chmod 777 /opt/profiling/indexer-service
 
 stdbuf -oL echo "📁 DEBUG: Profiling output directory: $(ls -la /opt/profiling)"
 
@@ -88,15 +83,15 @@ strace)
     # -e trace=all: trace all system calls
     # -s 256: show up to 256 characters per string
     # -o: output file
-    exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/indexer-service.strace.log /usr/local/bin/indexer-service-rs --config /opt/config.toml
+    exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/indexer-service/strace.log /usr/local/bin/indexer-service-rs --config /opt/config.toml
     ;;
 valgrind)
     stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
 
     # Start with Massif memory profiler
     stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
     exec valgrind --tool=massif \
-        --massif-out-file=/opt/profiling/indexer-service.massif.out \
+        --massif-out-file=/opt/profiling/indexer-service/massif.out \
         --time-unit=B \
         --detailed-freq=10 \
         --max-snapshots=100 \
@@ -113,7 +108,7 @@ valgrind)
 callgrind)
     stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
     exec valgrind --tool=callgrind \
-        --callgrind-out-file=/opt/profiling/indexer-service.callgrind.out \
+        --callgrind-out-file=/opt/profiling/indexer-service/callgrind.out \
         --cache-sim=yes \
         --branch-sim=yes \
         /usr/local/bin/indexer-service-rs --config /opt/config.toml
diff --git a/contrib/tap-agent/Dockerfile b/contrib/tap-agent/Dockerfile
@@ -16,7 +16,27 @@ FROM debian:bookworm-slim
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
     openssl ca-certificates postgresql-client curl jq iproute2 \
+    git linux-perf \
+    strace valgrind procps \
+    bpftrace linux-headers-generic \
     && rm -rf /var/lib/apt/lists/*
+
+
+# Clone FlameGraph repository (shallow: only the scripts are needed, not the git history)
+RUN git clone --depth 1 https://github.com/brendangregg/FlameGraph.git /opt/FlameGraph && \
+    chmod +x /opt/FlameGraph/*.pl
+
+# Create the profiling output directory, world-writable (mode 777)
+RUN mkdir -p /opt/profiling && chmod 777 /opt/profiling
+
+# Copy the start scripts and the config template into the image
+COPY contrib/tap-agent/start.sh /opt/config/start.sh
+COPY contrib/tap-agent/start-perf.sh /usr/local/bin/start-perf.sh
+COPY contrib/tap-agent/config.toml /opt/config/config.toml
+
+RUN chmod +x /opt/config/start.sh \
+    /usr/local/bin/start-perf.sh
+
 COPY --from=build /root/target/release/indexer-tap-agent /usr/local/bin/indexer-tap-agent
 
 ENTRYPOINT [ "/usr/local/bin/indexer-tap-agent" ]
diff --git a/contrib/tap-agent/config.toml b/contrib/tap-agent/config.toml
@@ -3,37 +3,36 @@ indexer_address = "INDEXER_ADDRESS_PLACEHOLDER"
 operator_mnemonic = "INDEXER_MNEMONIC_PLACEHOLDER"
 
 [database]
-postgres_url = "postgresql://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1"
+postgres_url = "postgres://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1"
 
 [graph_node]
-query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER"
-status_url = "http://graph-node:GRAPH_NODE_STATUS_PORT_PLACEHOLDER/graphql"
+query_url = "http://graph-node:8000"
+status_url = "http://graph-node:8030/graphql"
 
 [subgraphs.network]
-query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/graph-network"
-recently_closed_allocation_buffer_secs = 60
-syncing_interval_secs = 30
+query_url = "http://graph-node:8000/subgraphs/name/graph-network"
+deployment_id = "NETWORK_DEPLOYMENT_PLACEHOLDER"
 
 [subgraphs.escrow]
-query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/semiotic/tap"
-syncing_interval_secs = 30
+query_url = "http://graph-node:8000/subgraphs/name/semiotic/tap"
+deployment_id = "ESCROW_DEPLOYMENT_PLACEHOLDER"
 
 [blockchain]
 chain_id = 1337
 receipts_verifier_address = "VERIFIER_ADDRESS_PLACEHOLDER"
 
 [service]
-host_and_port = "0.0.0.0:INDEXER_SERVICE_PORT_PLACEHOLDER"
-url_prefix = "/"
-serve_network_subgraph = false
-serve_escrow_subgraph = false
+host_and_port = "0.0.0.0:7601"
+free_query_auth_token = "freestuff"
+
+[service.tap]
+max_receipt_value_grt = "0.001"
 
 [tap]
 max_amount_willing_to_lose_grt = 1000
 
 [tap.rav_request]
 # Set a lower timestamp buffer threshold
-# for testing purposes
 timestamp_buffer_secs = 30
 # The trigger value divisor is used to calculate the trigger value for the RAV request.
 # using the formula:
@@ -44,4 +43,7 @@ timestamp_buffer_secs = 30
 trigger_value_divisor = 500_000
 
 [tap.sender_aggregator_endpoints]
-"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:TAP_AGGREGATOR_PORT_PLACEHOLDER"
+"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:7610"
+
+[horizon]
+enabled = false
diff --git a/contrib/tap-agent/start-perf.sh b/contrib/tap-agent/start-perf.sh
@@ -0,0 +1,164 @@
+#!/bin/bash
+set -eu
+
+# Source environment variables from .env file
+if [ -f /opt/.env ]; then
+    stdbuf -oL echo "Sourcing environment variables from .env file"
+    . /opt/.env
+fi
+
+# Extract TAPVerifier address from contracts.json
+VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
+ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
+
+# Block until pg_isready reports the postgres server as ready, polling every 2 seconds
+while ! pg_isready -h postgres -U postgres -d indexer_components_1; do
+    stdbuf -oL echo "Waiting for postgres..."
+    sleep 2
+done
+# Log DNS resolution for each dependency; a failed lookup is only reported, it does not stop the script
+stdbuf -oL echo "Checking if required services are available..."
+for peer in postgres graph-node tap-aggregator; do
+    if ! getent hosts "$peer" >/dev/null 2>&1; then
+        stdbuf -oL echo "❌ Cannot resolve $peer hostname"
+    else
+        PEER_IP=$(getent hosts "$peer" | awk '{ print $1 }')
+        stdbuf -oL echo "✅ $peer resolves to $PEER_IP"
+    fi
+done
+
+# Get network subgraph deployment ID, polling graph-node up to MAX_ATTEMPTS times (5s apart)
+stdbuf -oL echo "Getting network subgraph deployment ID..."
+MAX_ATTEMPTS=30
+ATTEMPT=0
+NETWORK_DEPLOYMENT=""
+# Retry while the ID is empty or the literal "null" that jq -r prints for a missing field
+while { [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; } && [ "$ATTEMPT" -lt "$MAX_ATTEMPTS" ]; do
+    NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
+        -H 'content-type: application/json' \
+        -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null || true)
+    # `|| true`: under `set -e` a jq failure (e.g. non-JSON reply) must lead to a retry, not abort the script
+    if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
+        ATTEMPT=$((ATTEMPT + 1))
+        stdbuf -oL echo "Waiting for network subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
+        sleep 5
+    fi
+done
+
+if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
+    stdbuf -oL echo "ERROR: Failed to get network subgraph deployment ID after $MAX_ATTEMPTS attempts"
+    exit 1
+fi
+
+stdbuf -oL echo "Network subgraph deployment ID: $NETWORK_DEPLOYMENT"
+
+# Get escrow subgraph deployment ID, polling graph-node up to MAX_ATTEMPTS times (5s apart)
+stdbuf -oL echo "Getting escrow subgraph deployment ID..."
+MAX_ATTEMPTS=30
+ATTEMPT=0
+ESCROW_DEPLOYMENT=""
+# Retry while the ID is empty or the literal "null" that jq -r prints for a missing field
+while { [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; } && [ "$ATTEMPT" -lt "$MAX_ATTEMPTS" ]; do
+    ESCROW_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/semiotic/tap" \
+        -H 'content-type: application/json' \
+        -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null || true)
+    # `|| true`: under `set -e` a jq failure (e.g. non-JSON reply) must lead to a retry, not abort the script
+    if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
+        ATTEMPT=$((ATTEMPT + 1))
+        stdbuf -oL echo "Waiting for escrow subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
+        sleep 5
+    fi
+done
+
+if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
+    stdbuf -oL echo "ERROR: Failed to get escrow subgraph deployment ID after $MAX_ATTEMPTS attempts"
+    exit 1
+fi
+
+stdbuf -oL echo "Escrow subgraph deployment ID: $ESCROW_DEPLOYMENT"
+
+# Copy the config template to /opt/config.toml so the placeholders can be rewritten in place with sed -i
+cp /opt/config/config.toml /opt/config.toml
+
+# Replace the placeholders with actual values (with set -u, every variable below must be defined, e.g. by /opt/.env)
+sed -i "s/NETWORK_DEPLOYMENT_PLACEHOLDER/$NETWORK_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/ESCROW_DEPLOYMENT_PLACEHOLDER/$ESCROW_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/VERIFIER_ADDRESS_PLACEHOLDER/$VERIFIER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_ADDRESS_PLACEHOLDER/$RECEIVER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_MNEMONIC_PLACEHOLDER/$INDEXER_MNEMONIC/g" /opt/config.toml
+sed -i "s/ACCOUNT0_ADDRESS_PLACEHOLDER/$ACCOUNT0_ADDRESS/g" /opt/config.toml
+sed -i "s/TAP_AGGREGATOR_PORT_PLACEHOLDER/$TAP_AGGREGATOR/g" /opt/config.toml
+sed -i "s/POSTGRES_PORT_PLACEHOLDER/$POSTGRES/g" /opt/config.toml
+sed -i "s/GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/$GRAPH_NODE_GRAPHQL/g" /opt/config.toml
+sed -i "s/GRAPH_NODE_STATUS_PORT_PLACEHOLDER/$GRAPH_NODE_STATUS/g" /opt/config.toml
+sed -i "s/INDEXER_SERVICE_PORT_PLACEHOLDER/$INDEXER_SERVICE/g" /opt/config.toml
+# NOTE(review): the dump below includes the substituted operator mnemonic in the container log
+stdbuf -oL echo "Starting tap-agent with config:"
+cat /opt/config.toml
+
+# Select the profiling tool from the PROFILER environment variable
+# Defaults to "none" (no profiling) when PROFILER is unset or empty
+PROFILER="${PROFILER:-none}"
+stdbuf -oL echo "🔍 DEBUG: Profiling with: $PROFILER"
+
+# Run agent with full backtraces; a RUST_LOG set in the environment wins, otherwise fall back to debug
+stdbuf -oL echo "Starting tap-agent..."
+export RUST_BACKTRACE=full
+export RUST_LOG="${RUST_LOG:-debug}"
+
+# Create the output directory if it doesn't exist and make both levels world-writable
+mkdir -p /opt/profiling/tap-agent
+chmod 777 /opt/profiling
+chmod 777 /opt/profiling/tap-agent
+
+case "$PROFILER" in
+flamegraph)
+    stdbuf -oL echo "🔥 Starting with profiler..."
+    # No external profiler wraps the binary in this mode; exec replaces this shell with the agent (foreground).
+    # NOTE(review): presumably the flamegraph is produced by profiling built into the agent — confirm.
+    stdbuf -oL echo "🚀 Starting service..."
+    exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+    ;;
+strace)
+    stdbuf -oL echo "🔍 Starting with strace..."
+    # -f: follow child processes
+    # -tt: print timestamps with microsecond precision
+    # -T: show time spent in each syscall
+    # -e trace=all: trace all system calls
+    # -s 256: show up to 256 characters per string
+    # -o: output file
+    exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/tap-agent/strace.log /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+    ;;
+valgrind)
+    stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
+
+    # Run the agent under the Massif heap profiler; output goes to tap-agent/massif.out
+    stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
+    exec valgrind --tool=massif \
+        --massif-out-file=/opt/profiling/tap-agent/massif.out \
+        --time-unit=B \
+        --detailed-freq=10 \
+        --max-snapshots=100 \
+        --threshold=0.5 \
+        /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+    ;;
+# Use `sudo callgrind_annotate tap-agent/callgrind.out` (in the profiling dir) for a human-friendly report.
+# Ideally you should also set `debug = true` and `force-frame-pointers = true` under
+# [profile.release.package."*"] in the Cargo.toml
+callgrind)
+    stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
+    exec valgrind --tool=callgrind \
+        --callgrind-out-file=/opt/profiling/tap-agent/callgrind.out \
+        --cache-sim=yes \
+        --branch-sim=yes \
+        /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+    ;;
+none)
+    stdbuf -oL echo "🔍 Starting without profiling..."
+    exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+    ;;
+*) # reject unknown values; without this arm the script would exit 0 without ever starting the agent
+    stdbuf -oL echo "❌ Unknown PROFILER '$PROFILER' (expected: flamegraph, strace, valgrind, callgrind, none)" >&2
+    exit 1
+    ;;
+esac