Skip to content

Commit ae552a0

Browse files
committed
feat(profiling): Integrate profiling into tap-agent service
1 parent 1a4df78 commit ae552a0

File tree

7 files changed

+252
-27
lines changed

7 files changed

+252
-27
lines changed

contrib/docker-compose.dev.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ services:
3232
condition: service_healthy
3333
volumes:
3434
- ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent
35-
- ./tap-agent/start.sh:/usr/local/bin/start.sh
3635
- ./tap-agent:/opt/config:ro
3736
- ./local-network/.env:/opt/.env:ro
3837
- ./local-network/contracts.json:/opt/contracts.json:ro
@@ -44,6 +43,8 @@ services:
4443
ports:
4544
# to expose the metrics port
4645
- "7300:7300"
46+
networks:
47+
- local-network
4748
healthcheck:
4849
test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
4950
interval: 5s

contrib/docker-compose.prof.yml

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@ services:
33
image: indexer-base:latest
44
build:
55
context: ..
6-
# dockerfile: contrib/indexer-service/Dockerfile
76
container_name: indexer-service
87
volumes:
98
- ./local-network/contracts.json:/opt/contracts.json:ro
109
- ./local-network/.env:/opt/.env:ro
11-
- ./profiling/output:/opt/profiling:rw
10+
- ./profiling:/opt/profiling:rw
1211
- ./indexer-service/config.toml:/opt/config/config.toml
1312
- ./indexer-service/start-perf.sh:/usr/local/bin/start-perf.sh
1413
- ../migrations:/opt/migrations:ro
@@ -22,6 +21,48 @@ services:
2221
- "7601:7601"
2322
networks:
2423
- local-network
24+
healthcheck:
25+
test: ["CMD", "curl", "-f", "http://localhost:7601/"]
26+
interval: 5s
27+
timeout: 3s
28+
retries: 10
29+
start_period: 10s
30+
cap_add:
31+
- SYS_ADMIN
32+
privileged: true
33+
security_opt:
34+
- seccomp:unconfined
35+
36+
tap-agent:
37+
image: indexer-base:latest # Pre-built base image with dependencies
38+
container_name: tap-agent
39+
# depends_on:
40+
# indexer-service:
41+
# condition: service_healthy
42+
volumes:
43+
- ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent
44+
- ./tap-agent/start-perf.sh:/usr/local/bin/start-perf.sh
45+
- ./tap-agent:/opt/config:ro
46+
- ./profiling:/opt/profiling:rw
47+
- ./local-network/.env:/opt/.env:ro
48+
- ./local-network/contracts.json:/opt/contracts.json:ro
49+
- ../migrations:/opt/migrations:ro
50+
entrypoint: ["/usr/local/bin/start-perf.sh"]
51+
environment:
52+
- RUST_BACKTRACE=1
53+
- RUST_LOG=debug
54+
- PROFILER=${PROFILER:-flamegraph}
55+
ports:
56+
# to expose the metrics port
57+
- "7300:7300"
58+
networks:
59+
- local-network
60+
healthcheck:
61+
test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
62+
interval: 5s
63+
timeout: 3s
64+
retries: 10
65+
start_period: 10s
2566
cap_add:
2667
- SYS_ADMIN
2768
privileged: true

contrib/docker-compose.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ services:
4444
ports:
4545
# to expose the metrics port
4646
- "7300:7300"
47+
networks:
48+
- local-network
4749
healthcheck:
4850
test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"]
4951
interval: 5s

contrib/indexer-service/start-perf.sh

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,6 @@ VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
1313
# Override with test values taken from test-assets/src/lib.rs
1414
ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
1515

16-
# Wait for postgres to be ready
17-
until pg_isready -h postgres -U postgres -d indexer_components_1; do
18-
stdbuf -oL echo "Waiting for postgres..."
19-
sleep 2
20-
done
21-
2216
# Get network subgraph deployment ID
2317
NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
2418
-H 'content-type: application/json' \
@@ -67,8 +61,9 @@ export RUST_BACKTRACE=full
6761
export RUST_LOG="${RUST_LOG:-trace}"
6862

6963
# Create output directory if it doesn't exist
70-
mkdir -p /opt/profiling
64+
mkdir -p /opt/profiling/indexer-service
7165
chmod 777 /opt/profiling
66+
chmod 777 /opt/profiling/indexer-service
7267

7368
stdbuf -oL echo "📁 DEBUG: Profiling output directory: $(ls -la /opt/profiling)"
7469

@@ -88,15 +83,15 @@ strace)
8883
# -e trace=all: trace all system calls
8984
# -s 256: show up to 256 characters per string
9085
# -o: output file
91-
exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/indexer-service.strace.log /usr/local/bin/indexer-service-rs --config /opt/config.toml
86+
exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/indexer-service/strace.log /usr/local/bin/indexer-service-rs --config /opt/config.toml
9287
;;
9388
valgrind)
9489
stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
9590

9691
# Start with Massif memory profiler
9792
stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
9893
exec valgrind --tool=massif \
99-
--massif-out-file=/opt/profiling/indexer-service.massif.out \
94+
--massif-out-file=/opt/profiling/indexer-service/massif.out \
10095
--time-unit=B \
10196
--detailed-freq=10 \
10297
--max-snapshots=100 \
@@ -113,7 +108,7 @@ valgrind)
113108
callgrind)
114109
stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
115110
exec valgrind --tool=callgrind \
116-
--callgrind-out-file=/opt/profiling/indexer-service.callgrind.out \
111+
--callgrind-out-file=/opt/profiling/indexer-service/callgrind.out \
117112
--cache-sim=yes \
118113
--branch-sim=yes \
119114
/usr/local/bin/indexer-service-rs --config /opt/config.toml

contrib/tap-agent/Dockerfile

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,27 @@ FROM debian:bookworm-slim
1616

1717
RUN apt-get update && apt-get install -y --no-install-recommends \
1818
openssl ca-certificates postgresql-client curl jq iproute2 \
19+
git linux-perf \
20+
strace valgrind procps \
21+
bpftrace linux-headers-generic \
1922
&& rm -rf /var/lib/apt/lists/*
23+
24+
25+
# Clone FlameGraph repository
26+
RUN git clone https://github.com/brendangregg/FlameGraph.git /opt/FlameGraph && \
27+
chmod +x /opt/FlameGraph/*.pl
28+
29+
# Create profiling directory with proper permissions
30+
RUN mkdir -p /opt/profiling && chmod 777 /opt/profiling
31+
32+
# Copy our start script into the image
33+
COPY contrib/tap-agent/start.sh /opt/config/start.sh
34+
COPY contrib/tap-agent/start-perf.sh /usr/local/bin/start-perf.sh
35+
COPY contrib/tap-agent/config.toml /opt/config/config.toml
36+
37+
RUN chmod +x /opt/config/start.sh
38+
RUN chmod +x /usr/local/bin/start-perf.sh
39+
2040
COPY --from=build /root/target/release/indexer-tap-agent /usr/local/bin/indexer-tap-agent
2141

2242
ENTRYPOINT [ "/usr/local/bin/indexer-tap-agent" ]

contrib/tap-agent/config.toml

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,37 +3,36 @@ indexer_address = "INDEXER_ADDRESS_PLACEHOLDER"
33
operator_mnemonic = "INDEXER_MNEMONIC_PLACEHOLDER"
44

55
[database]
6-
postgres_url = "postgresql://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1"
6+
postgres_url = "postgres://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1"
77

88
[graph_node]
9-
query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER"
10-
status_url = "http://graph-node:GRAPH_NODE_STATUS_PORT_PLACEHOLDER/graphql"
9+
query_url = "http://graph-node:8000"
10+
status_url = "http://graph-node:8030/graphql"
1111

1212
[subgraphs.network]
13-
query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/graph-network"
14-
recently_closed_allocation_buffer_secs = 60
15-
syncing_interval_secs = 30
13+
query_url = "http://graph-node:8000/subgraphs/name/graph-network"
14+
deployment_id = "NETWORK_DEPLOYMENT_PLACEHOLDER"
1615

1716
[subgraphs.escrow]
18-
query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/semiotic/tap"
19-
syncing_interval_secs = 30
17+
query_url = "http://graph-node:8000/subgraphs/name/semiotic/tap"
18+
deployment_id = "ESCROW_DEPLOYMENT_PLACEHOLDER"
2019

2120
[blockchain]
2221
chain_id = 1337
2322
receipts_verifier_address = "VERIFIER_ADDRESS_PLACEHOLDER"
2423

2524
[service]
26-
host_and_port = "0.0.0.0:INDEXER_SERVICE_PORT_PLACEHOLDER"
27-
url_prefix = "/"
28-
serve_network_subgraph = false
29-
serve_escrow_subgraph = false
25+
host_and_port = "0.0.0.0:7601"
26+
free_query_auth_token = "freestuff"
27+
28+
[service.tap]
29+
max_receipt_value_grt = "0.001"
3030

3131
[tap]
3232
max_amount_willing_to_lose_grt = 1000
3333

3434
[tap.rav_request]
3535
# Set a lower timestamp buffer threshold
36-
# for testing purposes
3736
timestamp_buffer_secs = 30
3837
# The trigger value divisor is used to calculate the trigger value for the RAV request.
3938
# using the formula:
@@ -44,4 +43,7 @@ timestamp_buffer_secs = 30
4443
trigger_value_divisor = 500_000
4544

4645
[tap.sender_aggregator_endpoints]
47-
"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:TAP_AGGREGATOR_PORT_PLACEHOLDER"
46+
"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:7610"
47+
48+
[horizon]
49+
enabled = false

contrib/tap-agent/start-perf.sh

Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
#!/bin/bash
2+
set -eu
3+
4+
# Source environment variables from .env file
5+
if [ -f /opt/.env ]; then
6+
stdbuf -oL echo "Sourcing environment variables from .env file"
7+
. /opt/.env
8+
fi
9+
10+
# Extract TAPVerifier address from contracts.json
11+
VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
12+
ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
13+
14+
# Wait for postgres to be ready
15+
until pg_isready -h postgres -U postgres -d indexer_components_1; do
16+
stdbuf -oL echo "Waiting for postgres..."
17+
sleep 2
18+
done
19+
20+
stdbuf -oL echo "Checking if required services are available..."
21+
for service in postgres graph-node tap-aggregator; do
22+
if getent hosts $service >/dev/null 2>&1; then
23+
IP=$(getent hosts $service | awk '{ print $1 }')
24+
stdbuf -oL echo "$service resolves to $IP"
25+
else
26+
stdbuf -oL echo "❌ Cannot resolve $service hostname"
27+
fi
28+
done
29+
30+
# Get network subgraph deployment ID with retries
31+
stdbuf -oL echo "Getting network subgraph deployment ID..."
32+
MAX_ATTEMPTS=30
33+
ATTEMPT=0
34+
NETWORK_DEPLOYMENT=""
35+
36+
while [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ] && [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
37+
NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
38+
-H 'content-type: application/json' \
39+
-d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null)
40+
41+
if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
42+
ATTEMPT=$((ATTEMPT + 1))
43+
echo "Waiting for network subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
44+
sleep 5
45+
fi
46+
done
47+
48+
if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
49+
echo "ERROR: Failed to get network subgraph deployment ID after $MAX_ATTEMPTS attempts"
50+
exit 1
51+
fi
52+
53+
stdbuf -oL echo "Network subgraph deployment ID: $NETWORK_DEPLOYMENT"
54+
55+
# Get escrow subgraph deployment ID with retries
56+
stdbuf -oL echo "Getting escrow subgraph deployment ID..."
57+
MAX_ATTEMPTS=30
58+
ATTEMPT=0
59+
ESCROW_DEPLOYMENT=""
60+
61+
while [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ] && [ $ATTEMPT -lt $MAX_ATTEMPTS ]; do
62+
ESCROW_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/semiotic/tap" \
63+
-H 'content-type: application/json' \
64+
-d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null)
65+
66+
if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
67+
ATTEMPT=$((ATTEMPT + 1))
68+
echo "Waiting for escrow subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
69+
sleep 5
70+
fi
71+
done
72+
73+
if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
74+
stdbuf -oL echo "ERROR: Failed to get escrow subgraph deployment ID after $MAX_ATTEMPTS attempts"
75+
exit 1
76+
fi
77+
78+
stdbuf -oL echo "Escrow subgraph deployment ID: $ESCROW_DEPLOYMENT"
79+
80+
# Copy the config template
81+
cp /opt/config/config.toml /opt/config.toml
82+
83+
# Replace the placeholders with actual values
84+
sed -i "s/NETWORK_DEPLOYMENT_PLACEHOLDER/$NETWORK_DEPLOYMENT/g" /opt/config.toml
85+
sed -i "s/ESCROW_DEPLOYMENT_PLACEHOLDER/$ESCROW_DEPLOYMENT/g" /opt/config.toml
86+
sed -i "s/VERIFIER_ADDRESS_PLACEHOLDER/$VERIFIER_ADDRESS/g" /opt/config.toml
87+
sed -i "s/INDEXER_ADDRESS_PLACEHOLDER/$RECEIVER_ADDRESS/g" /opt/config.toml
88+
sed -i "s/INDEXER_MNEMONIC_PLACEHOLDER/$INDEXER_MNEMONIC/g" /opt/config.toml
89+
sed -i "s/ACCOUNT0_ADDRESS_PLACEHOLDER/$ACCOUNT0_ADDRESS/g" /opt/config.toml
90+
sed -i "s/TAP_AGGREGATOR_PORT_PLACEHOLDER/$TAP_AGGREGATOR/g" /opt/config.toml
91+
sed -i "s/POSTGRES_PORT_PLACEHOLDER/$POSTGRES/g" /opt/config.toml
92+
sed -i "s/GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/$GRAPH_NODE_GRAPHQL/g" /opt/config.toml
93+
sed -i "s/GRAPH_NODE_STATUS_PORT_PLACEHOLDER/$GRAPH_NODE_STATUS/g" /opt/config.toml
94+
sed -i "s/INDEXER_SERVICE_PORT_PLACEHOLDER/$INDEXER_SERVICE/g" /opt/config.toml
95+
96+
stdbuf -oL echo "Starting tap-agent with config:"
97+
cat /opt/config.toml
98+
99+
# Set profiling tool based on environment variable
100+
# Default is no profiling
101+
PROFILER="${PROFILER:-none}"
102+
stdbuf -oL echo "🔍 DEBUG: Profiling with: $PROFILER"
103+
104+
# Run agent with enhanced logging
105+
stdbuf -oL echo "Starting tap-agent..."
106+
export RUST_BACKTRACE=full
107+
export RUST_LOG=debug
108+
109+
# Create output directory if it doesn't exist
110+
mkdir -p /opt/profiling/tap-agent
111+
chmod 777 /opt/profiling
112+
chmod 777 /opt/profiling/tap-agent
113+
114+
case "$PROFILER" in
115+
flamegraph)
116+
stdbuf -oL echo "🔥 Starting with profiler..."
117+
118+
# Replace this shell with the service process (runs in the foreground via exec)
119+
stdbuf -oL echo "🚀 Starting service..."
120+
exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
121+
;;
122+
strace)
123+
stdbuf -oL echo "🔍 Starting with strace..."
124+
# -f: follow child processes
125+
# -tt: print timestamps with microsecond precision
126+
# -T: show time spent in each syscall
127+
# -e trace=all: trace all system calls
128+
# -s 256: show up to 256 characters per string
129+
# -o: output file
130+
exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/tap-agent/strace.log /usr/local/bin/indexer-tap-agent --config /opt/config.toml
131+
;;
132+
valgrind)
133+
stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
134+
135+
# Start with Massif memory profiler
136+
stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
137+
exec valgrind --tool=massif \
138+
--massif-out-file=/opt/profiling/tap-agent/massif.out \
139+
--time-unit=B \
140+
--detailed-freq=10 \
141+
--max-snapshots=100 \
142+
--threshold=0.5 \
143+
/usr/local/bin/indexer-tap-agent --config /opt/config.toml
144+
;;
145+
# Use sudo callgrind_annotate /opt/profiling/tap-agent/callgrind.out
146+
# for a human-friendly report of callgrind output
147+
# Ideally you should set:
148+
# [profile.release.package."*"]
149+
# debug = true
150+
# force-frame-pointers = true
151+
# in the Cargo.toml
152+
callgrind)
153+
stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
154+
exec valgrind --tool=callgrind \
155+
--callgrind-out-file=/opt/profiling/tap-agent/callgrind.out \
156+
--cache-sim=yes \
157+
--branch-sim=yes \
158+
/usr/local/bin/indexer-tap-agent --config /opt/config.toml
159+
;;
160+
none)
161+
stdbuf -oL echo "🔍 Starting without profiling..."
162+
exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
163+
;;
164+
esac

0 commit comments

Comments
 (0)