diff --git a/.gitignore b/.gitignore index ad69aef7e..333840cbf 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,8 @@ crates/dips/node_modules/ crates/dips/generated/ crates/dips/npm-debug.log* contrib/local-network +contrib/profiling/indexer-service +contrib/profiling/tap-agent node_modules target/ *.code-workspace diff --git a/Cargo.lock b/Cargo.lock index 87166c314..e1b66576b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -45,6 +45,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", + "getrandom 0.2.16", "once_cell", "version_check", "zerocopy 0.7.35", @@ -59,6 +60,15 @@ dependencies = [ "memchr", ] +[[package]] +name = "aligned-vec" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc890384c8602f339876ded803c97ad529f3842aba97f6392b3dba0dd171769b" +dependencies = [ + "equator", +] + [[package]] name = "allocator-api2" version = "0.2.21" @@ -2443,6 +2453,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" +[[package]] +name = "debugid" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef552e6f588e446098f6ba40d89ac146c8c7b64aade83c051ee00bb5d2bc18d" +dependencies = [ + "uuid", +] + [[package]] name = "der" version = "0.7.10" @@ -2712,6 +2731,26 @@ dependencies = [ "log", ] +[[package]] +name = "equator" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4711b213838dfee0117e3be6ac926007d7f433d7bbe33595975d4190cb07e6fc" +dependencies = [ + "equator-macro", +] + +[[package]] +name = "equator-macro" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44f23cf4b44bfce11a86ace86f8a73ffdec849c9fd00a386a53d278bd9e81fb3" +dependencies = [ + 
"proc-macro2", + "quote", + "syn 2.0.101", +] + [[package]] name = "equivalent" version = "1.0.2" @@ -2814,6 +2853,18 @@ dependencies = [ "version_check", ] +[[package]] +name = "findshlibs" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40b9e59cd0f7e0806cca4be089683ecb6434e602038df21fe6bf6711b2f07f64" +dependencies = [ + "cc", + "lazy_static", + "libc", + "winapi", +] + [[package]] name = "firestorm" version = "0.5.1" @@ -3368,6 +3419,12 @@ version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d231dfb89cfffdbc30e7fc41579ed6066ad03abda9e567ccafae602b97ec5024" +[[package]] +name = "hermit-abi" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbd780fe5cc30f81464441920d82ac8740e2e46b29a6fad543ddd075229ce37e" + [[package]] name = "hex" version = "0.4.3" @@ -3993,6 +4050,7 @@ dependencies = [ "insta", "itertools 0.14.0", "pin-project 1.1.10", + "profiler", "prometheus 0.13.4", "prost", "reqwest 0.12.15", @@ -4048,6 +4106,7 @@ dependencies = [ "insta", "itertools 0.14.0", "jsonrpsee", + "profiler", "prometheus 0.13.4", "ractor", "rand 0.9.1", @@ -4106,6 +4165,24 @@ dependencies = [ "serde", ] +[[package]] +name = "inferno" +version = "0.11.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "232929e1d75fe899576a3d5c7416ad0d88dbfbb3c3d6aa00873a7408a50ddb88" +dependencies = [ + "ahash 0.8.11", + "indexmap 2.9.0", + "is-terminal", + "itoa", + "log", + "num-format", + "once_cell", + "quick-xml", + "rgb", + "str_stack", +] + [[package]] name = "inlinable_string" version = "0.1.15" @@ -4189,6 +4266,17 @@ version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +[[package]] +name = "is-terminal" +version = "0.4.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" +dependencies = [ + "hermit-abi 0.5.0", + "libc", + "windows-sys 0.59.0", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.1" @@ -4626,6 +4714,15 @@ version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" +[[package]] +name = "memmap2" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd3f7eed9d3848f8b98834af67102b720745c4ec028fcd0aa0239277e7de374f" +dependencies = [ + "libc", +] + [[package]] name = "memoffset" version = "0.7.1" @@ -4952,6 +5049,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-format" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a652d9771a63711fd3c3deb670acfbe5c30a4072e664d7a3bf5a9e1056ac72c3" +dependencies = [ + "arrayvec 0.7.6", + "itoa", +] + [[package]] name = "num-integer" version = "0.1.46" @@ -5000,7 +5107,7 @@ version = "1.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" dependencies = [ - "hermit-abi", + "hermit-abi 0.3.9", "libc", ] @@ -5427,6 +5534,30 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" +[[package]] +name = "pprof" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebbe2f8898beba44815fdc9e5a4ae9c929e21c5dc29b0c774a15555f7f58d6d0" +dependencies = [ + "aligned-vec", + "backtrace", + "cfg-if", + "findshlibs", + "inferno", + "libc", + "log", + "nix", + "once_cell", + "parking_lot", + "protobuf 2.28.0", + "protobuf-codegen-pure", + "smallvec", + 
"symbolic-demangle", + "tempfile", + "thiserror 1.0.69", +] + [[package]] name = "ppv-lite86" version = "0.2.21" @@ -5554,6 +5685,17 @@ dependencies = [ "yansi", ] +[[package]] +name = "profiler" +version = "0.1.0" +dependencies = [ + "chrono", + "pprof", + "thiserror 1.0.69", + "timer", + "tracing", +] + [[package]] name = "prometheus" version = "0.13.4" @@ -5696,6 +5838,25 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "protobuf-codegen" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "033460afb75cf755fcfc16dfaed20b86468082a2ea24e05ac35ab4a099a017d6" +dependencies = [ + "protobuf 2.28.0", +] + +[[package]] +name = "protobuf-codegen-pure" +version = "2.28.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95a29399fc94bcd3eeaa951c715f7bea69409b2445356b00519740bcd6ddd865" +dependencies = [ + "protobuf 2.28.0", + "protobuf-codegen", +] + [[package]] name = "protobuf-support" version = "3.7.2" @@ -5762,6 +5923,15 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" +[[package]] +name = "quick-xml" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f50b1c63b38611e7d4d7f68b82d3ad0cc71a2ad2e7f61fc10f1328d917c93cd" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.7" @@ -6187,6 +6357,15 @@ dependencies = [ "subtle", ] +[[package]] +name = "rgb" +version = "0.8.50" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57397d16646700483b67d2dd6511d79318f9d057fdbd21a4066aeac8b41d310a" +dependencies = [ + "bytemuck", +] + [[package]] name = "ring" version = "0.17.14" @@ -7350,6 +7529,12 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4af28eeb7c18ac2dbdb255d40bee63f203120e1db6b0024b177746ebec7049c1" +[[package]] +name = 
"str_stack" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9091b6114800a5f2141aee1d1b9d6ca3592ac062dc5decb3764ec5895a47b4eb" + [[package]] name = "stringprep" version = "0.1.5" @@ -7417,6 +7602,28 @@ version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" +[[package]] +name = "symbolic-common" +version = "12.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23eae23242dffa2e8e66c0e20f4ca1e28391f64e361db1e921a209c9bc70ec3a" +dependencies = [ + "debugid", + "memmap2", + "stable_deref_trait", + "uuid", +] + +[[package]] +name = "symbolic-demangle" +version = "12.15.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "153faacda0d58dc1eb3e8bbd5dab998041e95bd7f4ab2caeeadc89410617f144" +dependencies = [ + "rustc-demangle", + "symbolic-common", +] + [[package]] name = "syn" version = "1.0.109" @@ -7824,6 +8031,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "timer" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "31d42176308937165701f50638db1c31586f183f1aab416268216577aec7306b" +dependencies = [ + "chrono", +] + [[package]] name = "tiny-keccak" version = "2.0.2" diff --git a/Cargo.toml b/Cargo.toml index 5d22d4303..e5b4cd4ce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,7 @@ members = [ "crates/dips", "crates/indexer-receipt", "crates/monitor", + "crates/profiler", "crates/query", "crates/service", "crates/tap-agent", @@ -15,6 +16,13 @@ members = [ ] resolver = "2" +# This is to enable debug symbols +# which are useful during profiling +# with callgrind +# [profile.release] +# debug = true +# force-frame-pointers = true + [profile.dev.package."*"] opt-level = 3 diff --git a/contrib/PROFILING.md b/contrib/PROFILING.md new file mode 100644 index 000000000..bf90c665e --- /dev/null +++ 
b/contrib/PROFILING.md @@ -0,0 +1,145 @@ +# Profiling Tools + +This document explains the profiling infrastructure set up for our indexer-service and tap-agent services. The profiling setup enables developers to diagnose performance issues, memory leaks, and analyze runtime behavior in both development and production environments. + +## Overview + +Our project includes an integrated profiling system for the indexer services. The system supports multiple profiling methods through: + +1. A custom `profiler` library (included in the workspace) +2. Docker-based profiling environments +3. Various third-party profiling tools + +## Available Profiling Methods + +### Built-in Profiler (pprof-based Flamegraphs) + +A Rust library that uses [pprof](https://crates.io/crates/pprof) to continuously profile the application and generate flamegraphs at specified intervals. +This solution was particularly suitable because tools like `perf`, while powerful, often pose configuration challenges or require specific capabilities (like CAP_SYS_ADMIN) that complicate their deployment within standard Docker containers. 
+ +- **Configuration**: Set in code with the `setup_profiling` function +- **Activation**: Enabled via the `profiling` feature flag +- **Output**: Flamegraphs (SVG) and protobuf profiles in `/opt/profiling/{service-name}/` + +### External Profiling Tools + +The profiling environment also supports the following tools: + +| Tool | Description | Output | +| ------------- | ---------------------------------------- | --------------------------------------------- | +| **strace** | Traces system calls with detailed timing | `/opt/profiling/{service-name}/strace.log` | +| **valgrind** | Memory profiling with Massif | `/opt/profiling/{service-name}/massif.out` | +| **callgrind** | CPU profiling (part of valgrind) | `/opt/profiling/{service-name}/callgrind.out` | + +## How to Use + +### Prerequisites + +Run the setup command first to prepare the testing environment: + +```bash +just setup +``` + +### Profiling Commands + +Use the following commands to profile specific services: + +```bash +# Profile with flamegraph (default) +just profile-flamegraph + +# Profile with valgrind +just profile-valgrind + +# Profile with strace +just profile-strace + +# Profile with callgrind +just profile-callgrind + +# Stop profiling (gracefully terminate to generate output) +just stop-profiling + +# Restore normal service without profiling +just profile-restore +``` + +### Viewing Results + +Profiling data is stored in: + +- `contrib/profiling/indexer-service/` +- `contrib/profiling/tap-agent/` + +#### Visualization Tools + +- **Flamegraphs**: Open the SVG files in any web browser +- **Callgrind**: Use `callgrind_annotate` or KCachegrind for visualization: + + ```bash + callgrind_annotate contrib/profiling/tap-agent/callgrind.out + ``` + +- **Massif**: Use `ms_print` to view memory profiling results: + + ```bash + ms_print contrib/profiling/tap-agent/massif.out + ``` + +- **Protobuf Profiles**: View with Go pprof tools: + +```bash +# Install Go pprof tools if needed +go install 
github.com/google/pprof@latest + +# View interactive web UI (most user-friendly) +pprof -http=:8080 contrib/profiling/indexer-service/profile-*.pb + +# Or generate a flamegraph from protobuf data +pprof -flamegraph contrib/profiling/indexer-service/profile-*.pb > custom_flamegraph.svg +``` + +## Implementation Details + +### Profiler Integration + +The profiler library is conditionally compiled using the `profiling` feature flag: + +```rust +#[cfg(feature = "profiling")] +if let Err(e) = profiler::setup_profiling( + "/opt/profiling/indexer-service".to_string(), + 150, // sampling frequency (Hz) + 120, // interval between reports (seconds) + Some("Indexer Service".to_string()), +) { + tracing::error!("Failed to setup profiling: {e}"); +} else { + tracing::info!("Profiling setup complete."); +} +``` + +### Docker Environment + +The profiling infrastructure uses a custom Docker image with all necessary tools pre-installed. The container runs with elevated privileges to support profiling: + +```yaml +cap_add: + - SYS_ADMIN +privileged: true +security_opt: + - seccomp:unconfined +``` + +## Notes + +- The flamegraph profiling is enabled whenever using any of the profiling commands through the Justfile, as the binaries are compiled with the `profiling` feature flag. +- For production use, prefer the built-in profiler over the external tools to minimize performance impact. 
+- When using callgrind, consider enabling debug information (in your Cargo.toml) and frame pointers (a rustc flag, e.g. `RUSTFLAGS="-C force-frame-pointers=yes"`) for better output: + + ```toml + [profile.release] + debug = true + # force-frame-pointers is not a Cargo profile key; pass it via RUSTFLAGS instead + ``` diff --git a/contrib/base/Dockerfile b/contrib/base/Dockerfile index 581160a84..88d9f0051 100644 --- a/contrib/base/Dockerfile +++ b/contrib/base/Dockerfile @@ -2,5 +2,15 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ openssl ca-certificates protobuf-compiler postgresql-client curl \ - jq \ + jq git linux-perf \ + strace valgrind procps \ + bpftrace linux-headers-generic \ && rm -rf /var/lib/apt/lists/* + + +# Clone FlameGraph repository +RUN git clone https://github.com/brendangregg/FlameGraph.git /opt/FlameGraph && \ + chmod +x /opt/FlameGraph/*.pl + +# Create profiling directory with proper permissions +RUN mkdir -p /opt/profiling && chmod 777 /opt/profiling diff --git a/contrib/docker-compose.dev.yml b/contrib/docker-compose.dev.yml index f162aa258..05de611d4 100644 --- a/contrib/docker-compose.dev.yml +++ b/contrib/docker-compose.dev.yml @@ -32,7 +32,6 @@ services: condition: service_healthy volumes: - ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent - - ./tap-agent/start.sh:/usr/local/bin/start.sh - ./tap-agent:/opt/config:ro - ./local-network/.env:/opt/.env:ro - ./local-network/contracts.json:/opt/contracts.json:ro @@ -44,6 +43,8 @@ services: ports: # to expose the metrics port - "7300:7300" + networks: + - local-network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"] interval: 5s diff --git a/contrib/docker-compose.prof.yml b/contrib/docker-compose.prof.yml new file mode 100644 index 000000000..5ff8b8f2a --- /dev/null +++ b/contrib/docker-compose.prof.yml @@ -0,0 +1,75 @@ +services: + indexer-service: + image: indexer-base:latest + build: + context: .. 
+ container_name: indexer-service + volumes: + - ./local-network/contracts.json:/opt/contracts.json:ro + - ./local-network/.env:/opt/.env:ro + - ./profiling:/opt/profiling:rw + - ./indexer-service/config.toml:/opt/config/config.toml + - ./indexer-service/start-perf.sh:/usr/local/bin/start-perf.sh + - ../migrations:/opt/migrations:ro + - ../target/release/indexer-service-rs:/usr/local/bin/indexer-service-rs + entrypoint: ["/usr/local/bin/start-perf.sh"] + environment: + - RUST_BACKTRACE=1 + - RUST_LOG=debug + - PROFILER=${PROFILER:-flamegraph} # Default to flamegraph if not specified + ports: + - "7601:7601" + networks: + - local-network + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:7601/"] + interval: 5s + timeout: 3s + retries: 10 + start_period: 10s + cap_add: + - SYS_ADMIN + privileged: true + security_opt: + - seccomp:unconfined + + tap-agent: + image: indexer-base:latest # Pre-built base image with dependencies + container_name: tap-agent + # depends_on: + # indexer-service: + # condition: service_healthy + volumes: + - ../target/release/indexer-tap-agent:/usr/local/bin/indexer-tap-agent + - ./tap-agent/start-perf.sh:/usr/local/bin/start-perf.sh + - ./tap-agent:/opt/config:ro + - ./profiling:/opt/profiling:rw + - ./local-network/.env:/opt/.env:ro + - ./local-network/contracts.json:/opt/contracts.json:ro + - ../migrations:/opt/migrations:ro + entrypoint: ["/usr/local/bin/start-perf.sh"] + environment: + - RUST_BACKTRACE=1 + - RUST_LOG=debug + - PROFILER=${PROFILER:-flamegraph} + ports: + # to expose the metrics port + - "7300:7300" + networks: + - local-network + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"] + interval: 5s + timeout: 3s + retries: 10 + start_period: 10s + cap_add: + - SYS_ADMIN + privileged: true + security_opt: + - seccomp:unconfined + +networks: + local-network: + external: true + name: local-network_default diff --git a/contrib/docker-compose.yml b/contrib/docker-compose.yml index 
e7b6d9733..f7e90057a 100644 --- a/contrib/docker-compose.yml +++ b/contrib/docker-compose.yml @@ -44,6 +44,8 @@ services: ports: # to expose the metrics port - "7300:7300" + networks: + - local-network healthcheck: test: ["CMD", "curl", "-f", "http://localhost:7300/metrics"] interval: 5s diff --git a/contrib/indexer-service/Dockerfile b/contrib/indexer-service/Dockerfile index 5baf05d86..4e3eccd3b 100644 --- a/contrib/indexer-service/Dockerfile +++ b/contrib/indexer-service/Dockerfile @@ -6,27 +6,26 @@ COPY ../../ . # the prepared files in the `.sqlx` directory. # ENV SQLX_OFFLINE=true - RUN apt-get update && apt-get install -y --no-install-recommends \ protobuf-compiler && rm -rf /var/lib/apt/lists/* - RUN cargo build --release --bin indexer-service-rs ######################################################################################## - FROM debian:bookworm-slim - RUN apt-get update && apt-get install -y --no-install-recommends \ openssl ca-certificates protobuf-compiler postgresql-client curl \ jq \ && rm -rf /var/lib/apt/lists/* + COPY --from=build /root/target/release/indexer-service-rs /usr/local/bin/indexer-service-rs # Copy our start script into the image COPY contrib/indexer-service/start.sh /usr/local/bin/start.sh COPY contrib/indexer-service/config.toml /opt/config/config.toml +COPY contrib/indexer-service/start-perf.sh /usr/local/bin/start-perf.sh RUN chmod +x /usr/local/bin/start.sh +RUN chmod +x /usr/local/bin/start-perf.sh -ENTRYPOINT [ "/usr/local/bin/start.sh" ] +ENTRYPOINT [ "/usr/local/bin/start-perf.sh" ]
diff --git a/contrib/indexer-service/start-perf.sh b/contrib/indexer-service/start-perf.sh
new file mode 100755
index 000000000..ed94d03da
--- /dev/null
+++ b/contrib/indexer-service/start-perf.sh
@@ -0,0 +1,142 @@
+#!/bin/bash
+# Entrypoint for the indexer-service profiling container.
+#
+# Renders /opt/config.toml from the template at /opt/config/config.toml, then
+# execs indexer-service-rs, optionally wrapped in the tool named by $PROFILER
+# (flamegraph | strace | valgrind | callgrind | none).
+set -eu
+
+# Source environment variables if available and echo them for debugging.
+# The dump stays inside the guard: a bare `cat` of a missing file would abort
+# the script under `set -e`, defeating the existence check.
+if [ -f "/opt/.env" ]; then
+  source /opt/.env
+  cat /opt/.env
+fi
+
+# Extract TAPVerifier address from contracts.json
+VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
+
+# Override with test values taken from test-assets/src/lib.rs
+# NOTE(review): ALLOCATION_ID is not referenced anywhere else in this script.
+ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
+
+# Get network subgraph deployment ID
+NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
+  -H 'content-type: application/json' \
+  -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null)
+stdbuf -oL echo "Graph-network subgraph deployment ID: $NETWORK_DEPLOYMENT"
+
+# Get escrow subgraph deployment ID
+ESCROW_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/semiotic/tap" \
+  -H 'content-type: application/json' \
+  -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null)
+
+stdbuf -oL echo "Escrow subgraph deployment ID: $ESCROW_DEPLOYMENT"
+stdbuf -oL echo "Using test Network subgraph deployment ID: $NETWORK_DEPLOYMENT"
+stdbuf -oL echo "Using test Verifier address: $VERIFIER_ADDRESS"
+stdbuf -oL echo "Using test Indexer address: $RECEIVER_ADDRESS"
+stdbuf -oL echo "Using TAPVerifier address from contracts.json: $VERIFIER_ADDRESS"
+stdbuf -oL echo "Using test Account0 address: $ACCOUNT0_ADDRESS"
+
+# Create/copy config file
+cp /opt/config/config.toml /opt/config.toml
+
+# Replace the placeholders with actual values
+sed -i "s/NETWORK_DEPLOYMENT_PLACEHOLDER/$NETWORK_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/ESCROW_DEPLOYMENT_PLACEHOLDER/$ESCROW_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/VERIFIER_ADDRESS_PLACEHOLDER/$VERIFIER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_ADDRESS_PLACEHOLDER/$RECEIVER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_MNEMONIC_PLACEHOLDER/$INDEXER_MNEMONIC/g" /opt/config.toml
+sed -i "s/ACCOUNT0_ADDRESS_PLACEHOLDER/$ACCOUNT0_ADDRESS/g" /opt/config.toml
+sed -i "s/POSTGRES_PORT_PLACEHOLDER/$POSTGRES/g" /opt/config.toml
+
+stdbuf -oL echo "Starting indexer-service with config:"
+cat /opt/config.toml
+
+# Run basic connectivity tests
+stdbuf -oL echo "Testing graph-node endpoints..."
+curl -s "http://graph-node:8000" >/dev/null && stdbuf -oL echo "Query endpoint OK" || stdbuf -oL echo "Query endpoint FAILED"
+curl -s "http://graph-node:8030/graphql" >/dev/null && stdbuf -oL echo "Status endpoint OK" || stdbuf -oL echo "Status endpoint FAILED"
+
+# Set profiling tool based on environment variable
+# Default is no profiling
+PROFILER="${PROFILER:-none}"
+stdbuf -oL echo "🔍 DEBUG: Profiling with: $PROFILER"
+
+# Set environment variables for the service
+export RUST_BACKTRACE=full
+export RUST_LOG="${RUST_LOG:-trace}"
+
+# Create output directory if it doesn't exist
+mkdir -p /opt/profiling/indexer-service
+chmod 777 /opt/profiling
+chmod 777 /opt/profiling/indexer-service
+
+stdbuf -oL echo "📁 DEBUG: Profiling output directory: $(ls -la /opt/profiling)"
+
+case "$PROFILER" in
+flamegraph)
+  stdbuf -oL echo "🔥 Starting with profiler..."
+
+  # No external wrapper here: per contrib/PROFILING.md the flamegraphs come from
+  # the pprof-based profiler built into the binary (`profiling` feature). exec
+  # replaces this shell, so the service runs in the foreground.
+  stdbuf -oL echo "🚀 Starting service..."
+  exec /usr/local/bin/indexer-service-rs --config /opt/config.toml
+  ;;
+strace)
+  stdbuf -oL echo "🔍 Starting with strace..."
+  # -f: follow child processes
+  # -tt: print timestamps with microsecond precision
+  # -T: show time spent in each syscall
+  # -e trace=all: trace all system calls
+  # -s 256: show up to 256 characters per string
+  # -o: output file
+  exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/indexer-service/strace.log /usr/local/bin/indexer-service-rs --config /opt/config.toml
+  ;;
+valgrind)
+  stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
+
+  # Start with Massif memory profiler
+  stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
+  exec valgrind --tool=massif \
+    --massif-out-file=/opt/profiling/indexer-service/massif.out \
+    --time-unit=B \
+    --detailed-freq=10 \
+    --max-snapshots=100 \
+    --threshold=0.5 \
+    /usr/local/bin/indexer-service-rs --config /opt/config.toml
+  ;;
+# Use `callgrind_annotate /opt/profiling/indexer-service/callgrind.out`
+# for a human-friendly report of the callgrind output.
+# Ideally you should set:
+# [profile.release.package."*"]
+# debug = true
+# in the Cargo.toml and build with RUSTFLAGS="-C force-frame-pointers=yes"
+# (frame pointers are a rustc codegen flag, not a Cargo profile key).
+callgrind)
+  stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
+  exec valgrind --tool=callgrind \
+    --callgrind-out-file=/opt/profiling/indexer-service/callgrind.out \
+    --cache-sim=yes \
+    --branch-sim=yes \
+    --collect-jumps=yes \
+    --collect-systime=yes \
+    --collect-bus=yes \
+    --dump-instr=yes \
+    --dump-line=yes \
+    --compress-strings=no \
+    /usr/local/bin/indexer-service-rs --config /opt/config.toml
+  ;;
+none)
+  stdbuf -oL echo "🔍 Starting without profiling..."
+  exec /usr/local/bin/indexer-service-rs --config /opt/config.toml
+  ;;
+*)
+  # Fail loudly: without this branch an unrecognised PROFILER value matched
+  # nothing and the script exited 0 without ever starting the service.
+  echo "❌ Unknown PROFILER '$PROFILER' (expected: flamegraph, strace, valgrind, callgrind, none)" >&2
+  exit 1
+  ;;
+esac
diff --git a/contrib/tap-agent/Dockerfile b/contrib/tap-agent/Dockerfile index 8c2337d89..a7693d8ca 100644 --- a/contrib/tap-agent/Dockerfile +++ b/contrib/tap-agent/Dockerfile @@ -17,6 +17,18 @@ FROM debian:bookworm-slim RUN apt-get update && apt-get install -y --no-install-recommends \ openssl ca-certificates postgresql-client curl jq iproute2 \ && rm -rf /var/lib/apt/lists/* + +# Create profiling directory with proper permissions +RUN mkdir -p /opt/profiling && chmod 777 /opt/profiling + +# Copy our start script into the image +COPY contrib/tap-agent/start.sh /opt/config/start.sh +COPY contrib/tap-agent/start-perf.sh /usr/local/bin/start-perf.sh +COPY contrib/tap-agent/config.toml /opt/config/config.toml + +RUN chmod +x /opt/config/start.sh +RUN chmod +x /usr/local/bin/start-perf.sh + COPY --from=build /root/target/release/indexer-tap-agent /usr/local/bin/indexer-tap-agent ENTRYPOINT [ "/usr/local/bin/indexer-tap-agent" ] diff --git
a/contrib/tap-agent/config.toml b/contrib/tap-agent/config.toml index 5893cb657..ca4b45138 100644 --- a/contrib/tap-agent/config.toml +++ b/contrib/tap-agent/config.toml @@ -3,37 +3,36 @@ indexer_address = "INDEXER_ADDRESS_PLACEHOLDER" operator_mnemonic = "INDEXER_MNEMONIC_PLACEHOLDER" [database] -postgres_url = "postgresql://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1" +postgres_url = "postgres://postgres@postgres:POSTGRES_PORT_PLACEHOLDER/indexer_components_1" [graph_node] -query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER" -status_url = "http://graph-node:GRAPH_NODE_STATUS_PORT_PLACEHOLDER/graphql" +query_url = "http://graph-node:8000" +status_url = "http://graph-node:8030/graphql" [subgraphs.network] -query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/graph-network" -recently_closed_allocation_buffer_secs = 60 -syncing_interval_secs = 30 +query_url = "http://graph-node:8000/subgraphs/name/graph-network" +deployment_id = "NETWORK_DEPLOYMENT_PLACEHOLDER" [subgraphs.escrow] -query_url = "http://graph-node:GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/subgraphs/name/semiotic/tap" -syncing_interval_secs = 30 +query_url = "http://graph-node:8000/subgraphs/name/semiotic/tap" +deployment_id = "ESCROW_DEPLOYMENT_PLACEHOLDER" [blockchain] chain_id = 1337 receipts_verifier_address = "VERIFIER_ADDRESS_PLACEHOLDER" [service] -host_and_port = "0.0.0.0:INDEXER_SERVICE_PORT_PLACEHOLDER" -url_prefix = "/" -serve_network_subgraph = false -serve_escrow_subgraph = false +host_and_port = "0.0.0.0:7601" +free_query_auth_token = "freestuff" + +[service.tap] +max_receipt_value_grt = "0.001" [tap] max_amount_willing_to_lose_grt = 1000 [tap.rav_request] # Set a lower timestamp buffer threshold -# for testing purposes timestamp_buffer_secs = 30 # The trigger value divisor is used to calculate the trigger value for the RAV request. 
# using the formula: @@ -44,4 +43,7 @@ timestamp_buffer_secs = 30 trigger_value_divisor = 500_000 [tap.sender_aggregator_endpoints] -"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:TAP_AGGREGATOR_PORT_PLACEHOLDER" +"ACCOUNT0_ADDRESS_PLACEHOLDER" = "http://tap-aggregator:7610" + +[horizon] +enabled = false
diff --git a/contrib/tap-agent/start-perf.sh b/contrib/tap-agent/start-perf.sh
new file mode 100755
index 000000000..247796286
--- /dev/null
+++ b/contrib/tap-agent/start-perf.sh
@@ -0,0 +1,197 @@
+#!/bin/bash
+# Entrypoint for the tap-agent profiling container.
+#
+# Waits for postgres and for both subgraph deployments, renders
+# /opt/config.toml from the template at /opt/config/config.toml, then execs
+# indexer-tap-agent, optionally wrapped in the tool named by $PROFILER
+# (flamegraph | strace | valgrind | callgrind | none).
+set -eu
+
+# Source environment variables from .env file
+if [ -f /opt/.env ]; then
+  stdbuf -oL echo "Sourcing environment variables from .env file"
+  . /opt/.env
+fi
+
+# Extract TAPVerifier address from contracts.json
+VERIFIER_ADDRESS=$(jq -r '."1337".TAPVerifier.address' /opt/contracts.json)
+# NOTE(review): ALLOCATION_ID is not referenced anywhere else in this script.
+ALLOCATION_ID="0xfa44c72b753a66591f241c7dc04e8178c30e13af" # ALLOCATION_ID_0
+
+# Wait for postgres to be ready
+until pg_isready -h postgres -U postgres -d indexer_components_1; do
+  stdbuf -oL echo "Waiting for postgres..."
+  sleep 2
+done
+
+stdbuf -oL echo "Checking if required services are available..."
+for service in postgres graph-node tap-aggregator; do
+  if getent hosts "$service" >/dev/null 2>&1; then
+    IP=$(getent hosts "$service" | awk '{ print $1 }')
+    stdbuf -oL echo "✅ $service resolves to $IP"
+  else
+    stdbuf -oL echo "❌ Cannot resolve $service hostname"
+  fi
+done
+
+# Get network subgraph deployment ID with retries
+stdbuf -oL echo "Getting network subgraph deployment ID..."
+MAX_ATTEMPTS=30
+ATTEMPT=0
+NETWORK_DEPLOYMENT=""
+
+# Retry while the ID is still empty/"null" AND attempts remain. The braces only
+# spell out the grouping the shell already applies to `A || B && C`.
+while { [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; } && [ "$ATTEMPT" -lt "$MAX_ATTEMPTS" ]; do
+  # `|| true`: a jq failure (e.g. a non-JSON reply while graph-node is still
+  # starting) must count as "not ready yet"; otherwise `set -e` would abort
+  # the script in the middle of the retry loop.
+  NETWORK_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/graph-network" \
+    -H 'content-type: application/json' \
+    -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null) || true
+
+  if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
+    ATTEMPT=$((ATTEMPT + 1))
+    echo "Waiting for network subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
+    sleep 5
+  fi
+done
+
+if [ -z "$NETWORK_DEPLOYMENT" ] || [ "$NETWORK_DEPLOYMENT" = "null" ]; then
+  echo "ERROR: Failed to get network subgraph deployment ID after $MAX_ATTEMPTS attempts"
+  exit 1
+fi
+
+stdbuf -oL echo "Network subgraph deployment ID: $NETWORK_DEPLOYMENT"
+
+# Get escrow subgraph deployment ID with retries
+stdbuf -oL echo "Getting escrow subgraph deployment ID..."
+MAX_ATTEMPTS=30
+ATTEMPT=0
+ESCROW_DEPLOYMENT=""
+
+while { [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; } && [ "$ATTEMPT" -lt "$MAX_ATTEMPTS" ]; do
+  # Same `|| true` guard as for the network subgraph query above.
+  ESCROW_DEPLOYMENT=$(curl -s "http://graph-node:8000/subgraphs/name/semiotic/tap" \
+    -H 'content-type: application/json' \
+    -d '{"query": "{ _meta { deployment } }"}' | jq -r '.data._meta.deployment' 2>/dev/null) || true
+
+  if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
+    ATTEMPT=$((ATTEMPT + 1))
+    echo "Waiting for escrow subgraph to be deployed... Attempt $ATTEMPT/$MAX_ATTEMPTS"
+    sleep 5
+  fi
+done
+
+if [ -z "$ESCROW_DEPLOYMENT" ] || [ "$ESCROW_DEPLOYMENT" = "null" ]; then
+  stdbuf -oL echo "ERROR: Failed to get escrow subgraph deployment ID after $MAX_ATTEMPTS attempts"
+  exit 1
+fi
+
+stdbuf -oL echo "Escrow subgraph deployment ID: $ESCROW_DEPLOYMENT"
+
+# Copy the config template
+cp /opt/config/config.toml /opt/config.toml
+
+# Replace the placeholders with actual values
+# NOTE(review): the config.toml template in this change hard-codes the
+# tap-aggregator, graph-node and indexer-service ports, so the four *_PORT
+# substitutions other than POSTGRES no longer match anything; under `set -u`
+# their variables must still be defined in /opt/.env.
+sed -i "s/NETWORK_DEPLOYMENT_PLACEHOLDER/$NETWORK_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/ESCROW_DEPLOYMENT_PLACEHOLDER/$ESCROW_DEPLOYMENT/g" /opt/config.toml
+sed -i "s/VERIFIER_ADDRESS_PLACEHOLDER/$VERIFIER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_ADDRESS_PLACEHOLDER/$RECEIVER_ADDRESS/g" /opt/config.toml
+sed -i "s/INDEXER_MNEMONIC_PLACEHOLDER/$INDEXER_MNEMONIC/g" /opt/config.toml
+sed -i "s/ACCOUNT0_ADDRESS_PLACEHOLDER/$ACCOUNT0_ADDRESS/g" /opt/config.toml
+sed -i "s/TAP_AGGREGATOR_PORT_PLACEHOLDER/$TAP_AGGREGATOR/g" /opt/config.toml
+sed -i "s/POSTGRES_PORT_PLACEHOLDER/$POSTGRES/g" /opt/config.toml
+sed -i "s/GRAPH_NODE_GRAPHQL_PORT_PLACEHOLDER/$GRAPH_NODE_GRAPHQL/g" /opt/config.toml
+sed -i "s/GRAPH_NODE_STATUS_PORT_PLACEHOLDER/$GRAPH_NODE_STATUS/g" /opt/config.toml
+sed -i "s/INDEXER_SERVICE_PORT_PLACEHOLDER/$INDEXER_SERVICE/g" /opt/config.toml
+
+stdbuf -oL echo "Starting tap-agent with config:"
+cat /opt/config.toml
+
+# Set profiling tool based on environment variable
+# Default is no profiling
+PROFILER="${PROFILER:-none}"
+stdbuf -oL echo "🔍 DEBUG: Profiling with: $PROFILER"
+
+# Run agent with enhanced logging
+stdbuf -oL echo "Starting tap-agent..."
+export RUST_BACKTRACE=full
+# Honour a RUST_LOG supplied by the environment (as the indexer-service
+# start-perf.sh does) and only fall back to debug when it is unset.
+export RUST_LOG="${RUST_LOG:-debug}"
+
+# Create output directory if it doesn't exist
+mkdir -p /opt/profiling/tap-agent
+chmod 777 /opt/profiling
+chmod 777 /opt/profiling/tap-agent
+
+case "$PROFILER" in
+flamegraph)
+  stdbuf -oL echo "🔥 Starting with profiler..."
+
+  # No external wrapper here: per contrib/PROFILING.md the flamegraphs come from
+  # the pprof-based profiler built into the binary (`profiling` feature). exec
+  # replaces this shell, so the service runs in the foreground.
+  stdbuf -oL echo "🚀 Starting service..."
+  exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+  ;;
+strace)
+  stdbuf -oL echo "🔍 Starting with strace..."
+  # -f: follow child processes
+  # -tt: print timestamps with microsecond precision
+  # -T: show time spent in each syscall
+  # -e trace=all: trace all system calls
+  # -s 256: show up to 256 characters per string
+  # -o: output file
+  exec strace -f -tt -T -e trace=all -s 256 -o /opt/profiling/tap-agent/strace.log /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+  ;;
+valgrind)
+  stdbuf -oL echo "🔍 Starting with Valgrind profiling..."
+
+  # Start with Massif memory profiler
+  stdbuf -oL echo "🔄 Starting Valgrind Massif memory profiling..."
+  exec valgrind --tool=massif \
+    --massif-out-file=/opt/profiling/tap-agent/massif.out \
+    --time-unit=B \
+    --detailed-freq=10 \
+    --max-snapshots=100 \
+    --threshold=0.5 \
+    /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+  ;;
+# Use `callgrind_annotate /opt/profiling/tap-agent/callgrind.out`
+# or the KCachegrind viewer for a human-friendly report.
+# Ideally you should set:
+# [profile.release.package."*"]
+# debug = true
+# in the Cargo.toml and build with RUSTFLAGS="-C force-frame-pointers=yes"
+# (frame pointers are a rustc codegen flag, not a Cargo profile key).
+callgrind)
+  stdbuf -oL echo "🔍 Starting with Callgrind CPU profiling..."
+  exec valgrind --tool=callgrind \
+    --callgrind-out-file=/opt/profiling/tap-agent/callgrind.out \
+    --cache-sim=yes \
+    --branch-sim=yes \
+    --collect-jumps=yes \
+    --collect-systime=yes \
+    --collect-bus=yes \
+    --dump-instr=yes \
+    --dump-line=yes \
+    --compress-strings=no \
+    /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+  ;;
+none)
+  stdbuf -oL echo "🔍 Starting without profiling..."
+  exec /usr/local/bin/indexer-tap-agent --config /opt/config.toml
+  ;;
+*)
+  # Fail loudly: without this branch an unrecognised PROFILER value matched
+  # nothing and the script exited 0 without ever starting the agent.
+  echo "❌ Unknown PROFILER '$PROFILER' (expected: flamegraph, strace, valgrind, callgrind, none)" >&2
+  exit 1
+  ;;
+esac
diff --git a/crates/profiler/Cargo.toml b/crates/profiler/Cargo.toml new file mode 100644 index 000000000..3077dc33c --- /dev/null +++ b/crates/profiler/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "profiler" +version = "0.1.0" +edition = "2021" + + +[features] +default = ["flamegraph", "protobuf-codec"] +flamegraph = ["pprof/flamegraph"] +protobuf-codec = ["pprof/protobuf-codec"] + +[dependencies] +tracing.workspace = true +thiserror.workspace = true + +pprof = { version = "0.14", default-features = false } +timer = { version = "0.2" } +chrono = { version = "0.4" } diff --git a/crates/profiler/src/error.rs b/crates/profiler/src/error.rs new file mode 100644 index 000000000..3b893cd7e --- /dev/null +++ b/crates/profiler/src/error.rs @@ -0,0 +1,25 @@ +// Copyright 2023-, Edge & Node, GraphOps, and Semiotic Labs. +// SPDX-License-Identifier: Apache-2.0 + +use thiserror::Error; + +#[derive(Error, Debug)] +pub enum ProfilerError { + #[error("IO Error: {0}")] + IoError(#[from] std::io::Error), + + #[error("Failed to create flamegraph: {0}")] + FlamegraphCreationError(String), + + #[error("Failed to generate protobuf: {0}")] + ProtobufError(String), + + #[error("Failed to generate profile report: {0}")] + ReportError(String), + + #[error("Failed to serialize profile: {0}")] + SerializationError(String), + + #[error("System time error: {0}")] + TimeError(#[from] std::time::SystemTimeError), +} diff --git a/crates/profiler/src/lib.rs b/crates/profiler/src/lib.rs new file mode 100644 index 000000000..d26fcd3a2 --- /dev/null +++ b/crates/profiler/src/lib.rs @@ -0,0 +1,219 @@ +// Copyright 2023-, Edge & Node, GraphOps, and Semiotic Labs.
+// SPDX-License-Identifier: Apache-2.0 + +use std::fs::{self, File}; +use std::io::Write; +use std::path::{Path, PathBuf}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::thread; +use std::time::{Duration, SystemTime}; + +use chrono::{DateTime, Utc}; +use pprof::protos::Message; + +mod error; +pub use error::ProfilerError; + +// Time to wait before starting profiling (in seconds) +const WAIT_TIME: u64 = 10; + +/// Save a flamegraph to the specified path +fn save_flamegraph( + report: &pprof::Report, + path: &Path, + options: &mut pprof::flamegraph::Options, +) -> Result<(), ProfilerError> { + let file = File::create(path)?; + + report + .flamegraph_with_options(file, options) + .map_err(|e| ProfilerError::FlamegraphCreationError(e.to_string()))?; + + tracing::info!("✅ Generated flamegraph: {:?}", path); + Ok(()) +} + +/// Save a protobuf profile to the specified path +fn save_protobuf(profile: &pprof::protos::Profile, path: &Path) -> Result<(), ProfilerError> { + // Try write_to_bytes first + match profile.write_to_bytes() { + Ok(bytes) => { + let mut file = File::create(path)?; + file.write_all(&bytes)?; + } + Err(e) => { + // Alternative approach: try direct file writing + tracing::info!( + "⚠️ Failed to serialize profile: {}, trying direct writer", + e + ); + + let mut file = File::create(path)?; + profile + .write_to_writer(&mut file) + .map_err(|e| ProfilerError::SerializationError(e.to_string()))?; + } + } + + tracing::info!("✅ Generated protobuf profile: {:?}", path); + Ok(()) +} + +/// Generate a unique filename with timestamp and counter +fn generate_filename( + base_path: &str, + prefix: &str, + extension: &str, + counter: u64, +) -> Result { + // Convert SystemTime to DateTime + let system_time = SystemTime::now(); + let datetime: DateTime = system_time.into(); + + // Format the datetime (YYYY-MM-DD-HH_MM_SS) + let formatted_time = datetime.format("%Y-%m-%d-%H_%M_%S").to_string(); + + let filename = format!("{}-{}-{}.{}", 
prefix, formatted_time, counter, extension); + Ok(Path::new(base_path).join(filename)) +} + +/// Process a single profiling report +fn process_profiling_report( + guard: &pprof::ProfilerGuard<'_>, + path: &str, + counter: u64, + options: &mut pprof::flamegraph::Options, +) -> Result<(), ProfilerError> { + let report = guard + .report() + .build() + .map_err(|e| ProfilerError::ReportError(e.to_string()))?; + + // Generate flamegraph + let flamegraph_path = generate_filename(path, "flamegraph", "svg", counter)?; + if let Err(e) = save_flamegraph(&report, &flamegraph_path, options) { + tracing::error!("Failed to save flamegraph: {}", e); + // Continue execution to try saving protobuf + } + + // Generate protobuf profile + match report.pprof() { + Ok(profile) => { + let proto_path = generate_filename(path, "profile", "pb", counter)?; + if let Err(e) = save_protobuf(&profile, &proto_path) { + tracing::error!("Failed to save protobuf: {}", e); + } + } + Err(e) => { + tracing::error!("Failed to generate pprof profile: {}", e); + } + } + + Ok(()) +} + +fn setup(path: String, frequency: i32, interval: u64, name: String) -> Result<(), ProfilerError> { + // Ensure the profiling directory exists + let profile_dir = Path::new(&path); + if !profile_dir.exists() { + fs::create_dir_all(profile_dir)?; + } + + // Create a background thread for continuous profiling + let path_clone = path.clone(); + thread::spawn(move || { + // Wait a bit for the application to start + thread::sleep(Duration::from_secs(WAIT_TIME)); + tracing::info!("🔍 Starting continuous profiling..."); + + // Counter for tracking report generation + let counter = Arc::new(AtomicU64::new(0)); + + // Create a separate thread for continuous data collection + thread::spawn(move || { + // Start continuous profiling + let guard = match pprof::ProfilerGuardBuilder::default() + .frequency(frequency) + .blocklist(&["libc", "libgcc", "pthread", "vdso"]) + .build() + { + Ok(guard) => guard, + Err(e) => { + 
tracing::error!("Failed to initialize profiler: {}", e); + return; + } + }; + + tracing::info!("📊 Continuous profiling active"); + let mut options = pprof::flamegraph::Options::default(); + options.title = name; + + // Create a timer thread to periodically save reports + thread::spawn(move || { + loop { + // Sleep for `interval` seconds before saving reports + thread::sleep(Duration::from_secs(interval)); + + let current_counter = counter.fetch_add(1, Ordering::Relaxed); + + tracing::info!("💾 Saving profiling snapshot #{}...", current_counter); + + if let Err(e) = + process_profiling_report(&guard, &path_clone, current_counter, &mut options) + { + tracing::error!("Error processing profiling report: {}", e); + } + } + }); + + // Keep profiling thread alive + loop { + thread::sleep(Duration::from_secs(3600)); + } + }); + }); + + Ok(()) +} + +/// Sets up continuous CPU profiling with flamegraph and protobuf output. +/// +/// # Arguments +/// +/// * `path` - Directory where profiling data will be stored +/// * `frequency` - Sampling frequency in Hz +/// * `interval` - Time between saving reports in seconds +/// * `name` - Optional service name for labeling profiles +/// +/// # Errors +/// +/// Returns `ProfilerError` if directory creation fails +/// +/// # Examples +/// +/// ``` +/// # fn main() -> Result<(), Box> { +/// profiler::setup_profiling( +/// "/opt/profiling/my-service".to_string(), +/// 150, +/// 120, +/// Some("My Service".to_string()), +/// )?; +/// # Ok(()) +/// # } +/// ``` +pub fn setup_profiling( + path: String, + frequency: i32, + interval: u64, + name: Option, +) -> Result<(), ProfilerError> { + tracing::info!("🔍 Setting up profiling..."); + setup( + path, + frequency, + interval, + name.unwrap_or("Service".to_string()), + ) +} diff --git a/crates/service/Cargo.toml b/crates/service/Cargo.toml index 9f90b5f13..4d23da0d5 100644 --- a/crates/service/Cargo.toml +++ b/crates/service/Cargo.toml @@ -14,6 +14,7 @@ indexer-config = { path = "../config" } 
indexer-dips = { path = "../dips" } indexer-query = { path = "../query" } indexer-receipt = { path = "../indexer-receipt" } +profiler = { path = "../profiler", optional = true } anyhow = { workspace = true } prometheus = { workspace = true } reqwest = { workspace = true } @@ -79,3 +80,7 @@ wiremock.workspace = true [build-dependencies] build-info-build.workspace = true + +[features] +default = [] +profiling = ["profiler"] diff --git a/crates/service/src/main.rs b/crates/service/src/main.rs index 9214c770b..dcb8a284a 100644 --- a/crates/service/src/main.rs +++ b/crates/service/src/main.rs @@ -10,10 +10,27 @@ use tracing_subscriber::{EnvFilter, FmtSubscriber}; #[tokio::main] async fn main() -> ExitCode { init_tracing(); + + #[cfg(all(feature = "profiling", not(test)))] + if let Err(e) = profiler::setup_profiling( + "/opt/profiling/indexer-service".to_string(), + 150, + 120, + Some("Indexer Service".to_string()), + ) { + // If profiling fails, log the error + // but continue running the application + // as profiling is just for development. 
+        tracing::error!("Failed to setup profiling: {e}");
+    } else {
+        tracing::info!("Profiling setup complete.");
+    }
+
     if let Err(e) = run().await {
         tracing::error!("Indexer service error: {e}");
         return ExitCode::from(1);
     }
+
     ExitCode::SUCCESS
 }
 
diff --git a/crates/tap-agent/Cargo.toml b/crates/tap-agent/Cargo.toml
index 2d98cb9e8..05b2fb555 100644
--- a/crates/tap-agent/Cargo.toml
+++ b/crates/tap-agent/Cargo.toml
@@ -9,7 +9,9 @@ name = "indexer-tap-agent"
 path = "src/main.rs"
 
 [features]
+default = []
 test = ["dep:test-assets", "dep:rand"]
+profiling = ["dep:profiler"]
 
 [dependencies]
 indexer-monitor = { path = "../monitor" }
@@ -18,6 +20,7 @@ indexer-allocation = { path = "../allocation" }
 indexer-config = { path = "../config" }
 indexer-query = { path = "../query" }
 indexer-receipt = { path = "../indexer-receipt" }
+profiler = { path = "../profiler", optional = true }
 anyhow.workspace = true
 async-trait.workspace = true
 sqlx.workspace = true
@@ -50,6 +53,7 @@ rand = { workspace = true, optional = true }
 itertools.workspace = true
 educe.workspace = true
 
+
 [dev-dependencies]
 # Release-please breaks with cyclical dependencies if dev-dependencies
 # import the current crate. For testing we import the current crate with the `test`
diff --git a/crates/tap-agent/src/main.rs b/crates/tap-agent/src/main.rs
index 21a91eb6a..b7ede51d7 100644
--- a/crates/tap-agent/src/main.rs
+++ b/crates/tap-agent/src/main.rs
@@ -10,4 +10,21 @@ async fn main() -> anyhow::Result<()> {
     // Parse basic configurations, also initializes logging.
     // initialize LazyLock'd config
     _ = &*CONFIG;
+
+    // Profiling is set up only after CONFIG is loaded: per the comment above,
+    // that is what initializes logging, and the messages below need a subscriber.
+    #[cfg(all(feature = "profiling", not(test)))]
+    if let Err(e) = profiler::setup_profiling(
+        "/opt/profiling/tap-agent".to_string(),
+        150,
+        120,
+        Some("Tap-agent service".to_string()),
+    ) {
+        // If profiling fails, log the error
+        // but continue running the application
+        // as profiling is just for development.
+        tracing::error!("Failed to setup profiling: {e}");
+    } else {
+        tracing::info!("Profiling setup complete.");
+    }
 
diff --git a/justfile b/justfile
index 0ce8989a1..a1417e120 100644
--- a/justfile
+++ b/justfile
@@ -69,6 +69,44 @@ down:
     @cd contrib/local-network && docker compose down
     docker rm -f indexer-service tap-agent gateway block-oracle indexer-agent graph-node redpanda tap-aggregator tap-escrow-manager 2>/dev/null || true
 
+
+# Profiling commands
+# -----------------------------
+
+# Profile indexer-service and tap-agent with flamegraph
+profile-flamegraph:
+    @mkdir -p contrib/profiling/output
+    ./prof-reload.sh flamegraph
+
+# Profile indexer-service and tap-agent with valgrind
+profile-valgrind:
+    @mkdir -p contrib/profiling/output
+    ./prof-reload.sh valgrind
+
+# Profile indexer-service and tap-agent with strace
+profile-strace:
+    @mkdir -p contrib/profiling/output
+    ./prof-reload.sh strace
+
+# Profile indexer-service and tap-agent with callgrind
+profile-callgrind:
+    @mkdir -p contrib/profiling/output
+    ./prof-reload.sh callgrind
+
+# Stop the running indexer-service and tap-agent containers (useful after profiling)
+# `docker compose stop` sends SIGTERM first, giving the profiled processes a chance to shut down cleanly.
+stop-profiling:
+    @echo "🛑 Stopping the indexer-service container (allowing profiling data generation)..."
+    cd contrib && docker compose -f docker-compose.prof.yml stop indexer-service tap-agent
+    @echo "✅ Service stop signal sent. Check profiling output directory."
+
+# Restore normal service (without profiling)
+profile-restore:
+    @echo "🔄 Restoring normal service..."
+    cd contrib && docker compose -f docker-compose.prof.yml up -d --force-recreate indexer-service tap-agent
+    @echo "✅ Normal service restored"
+
+
 test-local:
     @cd integration-tests && ./fund_escrow.sh
     @cd integration-tests && cargo run
diff --git a/prof-reload.sh b/prof-reload.sh
new file mode 100755
index 000000000..3c0b09dfa
--- /dev/null
+++ b/prof-reload.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+set -e
+
+# TODO: This file might be redundant; we may be able to use the
+# dev-reload script instead.
+
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+PROFILER=${1:-flamegraph}
+echo -e "${BLUE}Using profiler: ${GREEN}${PROFILER}${NC}"
+
+# Ensure profiling output directories exist
+mkdir -p contrib/profiling/indexer-service
+mkdir -p contrib/profiling/tap-agent
+
+# 1. Compile the binaries locally
+# and use profiling feature flag
+# for flamegraph
+echo -e "${BLUE}Compiling Rust code...${NC}"
+
+# 2. Check if compilation succeeded. The build runs inside the `if` because
+# `set -e` would abort the script before a separate `$?` check could run.
+if ! RUSTFLAGS='-C force-frame-pointers=yes' CARGO_PROFILE_RELEASE_DEBUG=true cargo build --release --features "profiling"; then
+    echo -e "${YELLOW}Compilation failed. Not restarting containers.${NC}"
+    exit 1
+fi
+
+echo -e "${GREEN}Compilation successful!${NC}"
+
+echo -e "${BLUE}Restarting indexer-service with profiling...${NC}"
+cd contrib
+
+# Stop the existing service and remove container
+# to avoid conflicts. probably not needed and a restart could
+# be enough.
+docker compose -f docker-compose.prof.yml stop indexer-service tap-agent
+docker rm -f indexer-service tap-agent 2>/dev/null || true
+
+export PROFILER="$PROFILER"
+docker compose -f docker-compose.prof.yml up -d indexer-service tap-agent
+
+echo -e "${GREEN}Done! Containers restarted with profiling.${NC}"