From 42517278fa2bc7fe0dd19af69317ae71b0953e56 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Tue, 19 Aug 2025 14:00:14 +0300 Subject: [PATCH 01/15] wip copy test 113 from add-hard-evals branch --- tests/llm/fixtures/shared/tempo.yaml | 117 +++++++ .../postgres.yaml | 139 ++++++++ .../services.yaml | 306 ++++++++++++++++++ .../test_case.yaml | 68 ++++ .../toolsets.yaml | 10 + .../traffic-generator-job.yaml | 159 +++++++++ 6 files changed, 799 insertions(+) create mode 100644 tests/llm/fixtures/shared/tempo.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml diff --git a/tests/llm/fixtures/shared/tempo.yaml b/tests/llm/fixtures/shared/tempo.yaml new file mode 100644 index 000000000..c35735895 --- /dev/null +++ b/tests/llm/fixtures/shared/tempo.yaml @@ -0,0 +1,117 @@ +# Shared Tempo deployment configuration for test fixtures +# Apply with: kubectl apply -f tempo.yaml -n +# Note: Namespace must be created separately +--- +# Tempo deployment - lightweight, in-memory storage +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo +spec: + replicas: 1 + selector: + matchLabels: + app: tempo + template: + metadata: + labels: + app: tempo + spec: + containers: + - name: tempo + image: grafana/tempo:2.3.0 + args: + - -config.file=/etc/tempo/tempo-config.yaml + ports: + - containerPort: 3200 + name: http + - containerPort: 4317 + name: otlp-grpc + - containerPort: 4318 + name: otlp-http + volumeMounts: + - name: config + mountPath: /etc/tempo + resources: + requests: + memory: "64Mi" + cpu: "10m" + limits: + memory: "256Mi" + volumes: + - name: config + configMap: + name: tempo-config +--- +apiVersion: v1 +kind: Service +metadata: + name: tempo +spec: + selector: + app: tempo + ports: + - name: http + port: 3200 + targetPort: 3200 + - name: otlp-grpc + port: 4317 + targetPort: 4317 + - name: otlp-http + port: 4318 + targetPort: 4318 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-config +data: + tempo-config.yaml: | + server: + http_listen_port: 3200 + + distributor: + receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + + ingester: + max_block_duration: 5m + trace_idle_period: 30s + max_block_bytes: 10_000_000 + + compactor: + compaction: + block_retention: 17520h # 2 years (730 days * 24 hours) + + storage: + trace: + backend: local + local: + path: /tmp/tempo/traces + wal: + path: /tmp/tempo/wal + pool: + max_workers: 10 + queue_depth: 100 + + querier: + frontend_worker: + frontend_address: 127.0.0.1:9095 + max_concurrent_queries: 10 + search: + max_duration: 17520h # 2 years + + query_frontend: + search: + max_duration: 17520h # 2 years + + overrides: + max_traces_per_user: 10000 + ingestion_rate_limit_bytes: 15000000 + ingestion_burst_size_bytes: 20000000 + max_bytes_per_trace: 5000000 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml new file mode 100644 index 000000000..4debc1ce4 --- /dev/null +++ 
b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/postgres.yaml @@ -0,0 +1,139 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-init +data: + seed.sql: | + -- Connect to the default database first + \c postgres; + + -- Create shipping_rates table + CREATE TABLE shipping_rates ( + id SERIAL PRIMARY KEY, + zone_id VARCHAR(50) NOT NULL, + promo_code VARCHAR(50), + rate_per_kg DECIMAL(10,2) NOT NULL, + discount_percent DECIMAL(5,2) DEFAULT 0, + active BOOLEAN DEFAULT true, + created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP + ); + + -- Create index ONLY on zone_id (missing compound index is the problem!) + CREATE INDEX idx_zone_id ON shipping_rates(zone_id); + + -- Insert base rates for different zones + INSERT INTO shipping_rates (zone_id, rate_per_kg, discount_percent, active) VALUES + ('us-west-1', 5.00, 0, true), + ('us-west-2', 5.50, 0, true), + ('us-east-1', 6.00, 0, true), + ('us-east-2', 6.50, 0, true), + ('eu-west-1', 8.00, 0, true), + ('eu-central-1', 8.50, 0, true), + ('ap-south-1', 9.00, 0, true), + ('ap-northeast-1', 9.50, 0, true); + + -- Insert promo code rates (these will cause slow queries) + -- Generate many promo codes to make table scanning expensive + DO $$ + DECLARE + zone_list text[] := ARRAY['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2', + 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-northeast-1']; + promo_list text[] := ARRAY['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25', + 'FLASH30', 'MEMBER10', 'FIRST15', 'RETURN20', + 'SUMMER10', 'WINTER15', 'SPRING20', 'FALL25']; + zone text; + promo text; + i int; + BEGIN + -- Insert specific promo codes + FOREACH zone IN ARRAY zone_list LOOP + FOREACH promo IN ARRAY promo_list LOOP + INSERT INTO shipping_rates (zone_id, promo_code, rate_per_kg, discount_percent, active) + VALUES (zone, promo, 5.00 + random() * 5, 10 + random() * 20, true); + END LOOP; + END LOOP; + + -- Add many more rows to make table scanning slow (50,000+ rows) + FOR i IN 1..50000 LOOP + INSERT INTO shipping_rates (zone_id, promo_code, rate_per_kg, discount_percent, active) + VALUES ( + zone_list[1 + floor(random() * 8)], + 'PROMO' || i, + 5.00 + random() * 10, + random() * 30, + random() > 0.2 + ); + END LOOP; + END $$; + + -- Analyze table for query planner + ANALYZE shipping_rates; + + -- Show the problem: query without promo_code uses index + EXPLAIN (ANALYZE, BUFFERS) + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = 'us-west-1' AND active = true + LIMIT 1; + + -- Show the problem: query with promo_code does table scan + EXPLAIN (ANALYZE, BUFFERS) + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = 'us-west-1' AND promo_code = 'SAVE10' AND active = true + LIMIT 1; + + -- The fix would be: CREATE INDEX idx_shipping_compound ON shipping_rates(zone_id, promo_code, active); + -- But we don't create it - that's for Holmes to discover! 
+--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + replicas: 1 + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: postgres:15-alpine + env: + - name: POSTGRES_USER + value: postgres + - name: POSTGRES_PASSWORD + value: postgres + - name: POSTGRES_DB + value: shipping + ports: + - containerPort: 5432 + volumeMounts: + - name: init + mountPath: /docker-entrypoint-initdb.d + resources: + requests: + memory: "256Mi" + cpu: "100m" + limits: + memory: "512Mi" + cpu: "500m" + volumes: + - name: init + configMap: + name: postgres-init +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres +spec: + selector: + app: postgres + ports: + - port: 5432 + targetPort: 5432 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml new file mode 100644 index 000000000..aaf547853 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/services.yaml @@ -0,0 +1,306 @@ +# Checkout Service +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + import requests + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-113.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + RequestsInstrumentor().instrument() + + tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Calculate shipping + with tracer.start_as_current_span("calculate_shipping"): + shipping_url = "http://shipping-calculator.app-113.svc.cluster.local:8081/calculate" + shipping_data = { + "zone_id": zone_id, + "promo_code": promo_code, + "weight": sum(item.get('weight', 1.0) for item in items) + } + + try: + response = requests.post(shipping_url, json=shipping_data, timeout=30) + shipping_cost = response.json().get('cost', 10.0) + except Exception as e: + span.record_exception(e) + shipping_cost = 10.0 + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + return jsonify({ + 
"order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": shipping_cost, + "total": total + }) + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask requests opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask opentelemetry-instrumentation-requests \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + targetPort: 8080 +--- +# Shipping Calculator Service +apiVersion: v1 +kind: Secret +metadata: + name: shipping-calculator-app +type: Opaque +stringData: + app.py: | + import os + import time + import psycopg2 + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + from opentelemetry.instrumentation.psycopg2 import Psycopg2Instrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "shipping-calculator"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-113.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + Psycopg2Instrumentor().instrument() + + tracer = trace.get_tracer(__name__) + + def get_db_connection(): + return psycopg2.connect( + host="postgres.app-113.svc.cluster.local", + database="shipping", + user="postgres", + password="postgres" + ) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/calculate', methods=['POST']) + def calculate(): + with tracer.start_as_current_span("calculate_shipping_rate") as span: + data = request.json or {} + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + weight = data.get('weight', 1.0) + + span.set_attribute("zone.id", zone_id) + span.set_attribute("weight", weight) + if promo_code: + span.set_attribute("promo.code", promo_code) + + with tracer.start_as_current_span("database_query") as db_span: + conn = get_db_connection() + cursor = conn.cursor() + + try: + if promo_code: + # This query will be slow - no index on (zone_id, promo_code, active) + query = """ + SELECT rate_per_kg, discount_percent + FROM shipping_rates + WHERE zone_id = %s AND promo_code = %s AND active = true + LIMIT 1 + """ + db_span.set_attribute("db.statement", query) + cursor.execute(query, (zone_id, promo_code)) + else: + # This query will be fast - uses index on zone_id + query = """ + SELECT rate_per_kg, discount_percent + FROM 
shipping_rates + WHERE zone_id = %s AND active = true + LIMIT 1 + """ + db_span.set_attribute("db.statement", query) + cursor.execute(query, (zone_id,)) + + result = cursor.fetchone() + if result: + rate, discount = result + cost = weight * rate * (1 - discount/100) + else: + cost = weight * 5.0 # Default rate + + finally: + cursor.close() + conn.close() + + return jsonify({"cost": round(cost, 2)}) + + if __name__ == '__main__': + # Wait for Postgres to be ready + time.sleep(10) + app.run(host='0.0.0.0', port=8081) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: shipping-calculator +spec: + replicas: 1 + selector: + matchLabels: + app: shipping-calculator + template: + metadata: + labels: + app: shipping-calculator + spec: + containers: + - name: shipping-calculator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + apt-get update && apt-get install -y libpq-dev gcc && \ + pip install flask psycopg2 opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask opentelemetry-instrumentation-psycopg2 \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8081 + env: + - name: PYTHONUNBUFFERED + value: "1" + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: shipping-calculator-app +--- +apiVersion: v1 +kind: Service +metadata: + name: shipping-calculator +spec: + selector: + app: shipping-calculator + ports: + - port: 8081 + targetPort: 8081 diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml new file mode 100644 index 000000000..a40a6b961 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/test_case.yaml @@ -0,0 +1,68 @@ +user_prompt: +- "The checkout service in namespace app-113 is experiencing high latency. Investigate using tracing data." +- "The checkout service in namespace app-113 is experiencing high latency. 
Investigate all traces over 0.1s in duration" + +expected_output: + - The checkout service has intermittent high latency affecting approximately 30% of requests + - Root cause is missing database index on the shipping_rates table in Postgres + - Slow queries occur when promo_code parameter is included in the request + - Fast queries use index on zone_id, but queries with promo_code cause full table scan + - Solution is to create compound index on (zone_id, promo_code, active) + +tags: + - database + - kubernetes + - medium + +port_forwards: + - namespace: app-113 + service: tempo + local_port: 3200 + remote_port: 3200 + +before_test: | + # Create namespace + kubectl create namespace app-113 || true + + # Deploy Tempo from shared config + kubectl apply -f ../../shared/tempo.yaml -n app-113 + echo "deployed tempo" + + # Deploy Postgres with initial schema + kubectl apply -f postgres.yaml -n app-113 + kubectl wait --for=condition=ready pod -l app=postgres -n app-113 --timeout=60s + echo "database pod ready" + + # Wait for database initialization to complete (init script runs automatically) + sleep 10 + + # Deploy microservices + kubectl apply -f services.yaml -n app-113 + kubectl wait --for=condition=ready pod -l app=checkout -n app-113 --timeout=60s + kubectl wait --for=condition=ready pod -l app=shipping-calculator -n app-113 --timeout=60s + echo "microservices ready" + + # Wait for Tempo to be ready + kubectl wait --for=condition=ready pod -l app=tempo -n app-113 --timeout=60s + echo "Waiting for Tempo to be fully ready (internal initialization)..." + sleep 25 # Tempo needs ~18s after pod ready for /ready endpoint to return 200 + echo "Tempo ready" + + # Run traffic generator job for 2 minutes of heavy load + kubectl apply -f traffic-generator-job.yaml -n app-113 + kubectl wait --for=condition=complete job/traffic-generator -n app-113 --timeout=120s + echo "Done generating traffic" + + # Verify traces are in Tempo using kubectl exec + echo "Checking if traces were received by Tempo..." 
+ TRACE_COUNT=$(kubectl exec -n app-113 deployment/tempo -- curl -s 'http://localhost:3200/api/search?limit=1' 2>/dev/null | grep -o '"traces"' | wc -l || echo "0") + + if [ "$TRACE_COUNT" -eq "0" ]; then + echo "WARNING: No traces found in Tempo (this might be timing related)" + # Don't fail the test for this + else + echo "SUCCESS: Found traces in Tempo" + fi + +after_test: | + kubectl delete namespace app-113 || true diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml new file mode 100644 index 000000000..7298e0687 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/toolsets.yaml @@ -0,0 +1,10 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3200 + healthcheck: "ready" diff --git a/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml new file mode 100644 index 000000000..b1451de79 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/113_checkout_latency_tracing/traffic-generator-job.yaml @@ -0,0 +1,159 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: traffic-generator +data: + generator.py: | + import time + import random + import requests + import concurrent.futures + from datetime import datetime + + CHECKOUT_URL = "http://checkout.app-113.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2', + 'eu-west-1', 'eu-central-1', 'ap-south-1', 'ap-northeast-1'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25', + 'FLASH30', 'MEMBER10', 'FIRST15', 'RETURN20', + 'SUMMER10', 'WINTER15', 'SPRING20', 'FALL25'] + + def generate_request(): + """Generate a single checkout request""" + # 30% chance to include promo code (will be slow) + include_promo = random.random() < 0.3 + + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.1, 5.0), 2) + } + for i in range(random.randint(1, 5)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + try: + response = requests.post(CHECKOUT_URL, json=data, timeout=30) + latency = response.elapsed.total_seconds() + status = "success" if response.status_code == 200 else "error" + has_promo = "with_promo" if include_promo else "no_promo" + print(f"{datetime.now().isoformat()} - {status} - {has_promo} - {latency:.2f}s") + return latency + except Exception as e: + print(f"{datetime.now().isoformat()} - error - {str(e)}") + return None + + def run_load_test(): + """Run concurrent requests with a limit of 2000 total requests""" + print(f"Starting traffic generation at {datetime.now().isoformat()}") + print("Generating moderate load (max 2000 requests)...") + + start_time = time.time() + max_requests = 2000 # Limit total requests + max_duration = 120 # 2 minutes max + request_count = 0 + slow_requests = 0 + fast_requests = 0 + + # Use thread pool for concurrent requests (reduced workers) + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [] + + while request_count < max_requests and (time.time() - start_time) < max_duration: + # Submit fewer requests at a time to avoid overwhelming Tempo + requests_to_submit = 
min(3, max_requests - request_count - len(futures)) + for _ in range(requests_to_submit): + future = executor.submit(generate_request) + futures.append(future) + + # Process completed futures + done_futures = [] + for future in futures: + if future.done(): + latency = future.result() + if latency: + request_count += 1 + if latency > 2.0: # Consider > 2s as slow + slow_requests += 1 + else: + fast_requests += 1 + done_futures.append(future) + + # Remove completed futures + for future in done_futures: + futures.remove(future) + + # Longer sleep to reduce load on Tempo + time.sleep(0.3) + + # Wait for remaining futures to complete + for future in concurrent.futures.as_completed(futures): + latency = future.result() + if latency: + request_count += 1 + if latency > 2.0: + slow_requests += 1 + else: + fast_requests += 1 + + elapsed = time.time() - start_time + print(f"\nTraffic generation completed in {elapsed:.1f} seconds") + print(f"Total requests: {request_count}") + print(f"Fast requests (<2s): {fast_requests} ({fast_requests*100/max(request_count,1):.1f}%)") + print(f"Slow requests (>2s): {slow_requests} ({slow_requests*100/max(request_count,1):.1f}%)") + print(f"Average RPS: {request_count/elapsed:.1f}") + + if __name__ == "__main__": + # Wait for services to be ready + print("Waiting for services to be ready...") + time.sleep(10) + + # Warm up with a few requests + print("Warming up services...") + for _ in range(5): + generate_request() + time.sleep(1) + + # Run the main load test + run_load_test() + + print("\nTraffic generation completed successfully!") +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: traffic-generator +spec: + backoffLimit: 1 + activeDeadlineSeconds: 300 + template: + spec: + restartPolicy: Never + containers: + - name: generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + python /app/generator.py + volumeMounts: + - name: script + mountPath: /app + resources: + requests: + memory: "128Mi" + cpu: "200m" + limits: + memory: "256Mi" + cpu: "500m" + volumes: + - name: script + configMap: + name: traffic-generator From 248aff250ef029867f600b9f45e98c79aad00d62 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Tue, 19 Aug 2025 15:35:05 +0300 Subject: [PATCH 02/15] options to run only setup and also print its output --- conftest.py | 6 ++++++ tests/llm/test_ask_holmes.py | 7 +++++++ tests/llm/utils/commands.py | 9 +++++++++ 3 files changed, 22 insertions(+) diff --git a/conftest.py b/conftest.py index 58b74e80e..a67ea7912 100644 --- a/conftest.py +++ b/conftest.py @@ -31,6 +31,12 @@ def pytest_addoption(parser): default=False, help="Skip running after_test commands for test cases (useful for debugging test failures)", ) + parser.addoption( + "--only-setup", + action="store_true", + default=False, + help="Only run before_test setup commands, skip the actual test execution", + ) def pytest_configure(config): diff --git a/tests/llm/test_ask_holmes.py b/tests/llm/test_ask_holmes.py index d1e531ebb..2c2e0f49a 100644 --- a/tests/llm/test_ask_holmes.py +++ b/tests/llm/test_ask_holmes.py @@ -69,6 +69,13 @@ def test_ask_holmes( # Check if test should be skipped check_and_skip_test(test_case) + # Check if --only-setup is set + only_setup = request.config.getoption("--only-setup", False) + if only_setup: + print(f"\n๐Ÿงช TEST: {test_case.id}") + print(" โš™๏ธ --only-setup mode: Skipping test execution, only ran setup") + pytest.skip("Skipping test execution due to --only-setup flag") + # Check for setup failures 
setup_failures = shared_test_infrastructure.get("setup_failures", {}) if test_case.id in setup_failures: diff --git a/tests/llm/utils/commands.py b/tests/llm/utils/commands.py index 31732dd91..9d26f0b52 100644 --- a/tests/llm/utils/commands.py +++ b/tests/llm/utils/commands.py @@ -2,6 +2,7 @@ import logging import os import subprocess +import sys import time from contextlib import contextmanager from typing import Dict, Optional @@ -70,6 +71,14 @@ def _invoke_command(command: str, cwd: str) -> str: output = f"{result.stdout}\n{result.stderr}" logging.debug(f"** `{command}`:\n{output}") logging.debug(f"Ran `{command}` in {cwd} with exit code {result.returncode}") + + # Show output if SHOW_SETUP_OUTPUT is set + if os.environ.get("SHOW_SETUP_OUTPUT", "").lower() in ("true", "1"): + if result.stdout: + sys.stderr.write(f"[SETUP OUTPUT] {result.stdout}\n") + if result.stderr: + sys.stderr.write(f"[SETUP STDERR] {result.stderr}\n") + return output except subprocess.CalledProcessError as e: truncated_command = _truncate_script(command) From 8bef25631ac1cbf41b5d70c34311159026cba537 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Tue, 19 Aug 2025 16:18:53 +0300 Subject: [PATCH 03/15] WIP preparing a basic tempo --- pyproject.toml | 3 +- tests/llm/fixtures/shared/tempo.yaml | 5 +- .../test_case.yaml | 55 +++++++++++++++++++ .../toolsets.yaml | 10 ++++ 4 files changed, 68 insertions(+), 5 deletions(-) create mode 100644 tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml diff --git a/pyproject.toml b/pyproject.toml index ee8dc9051..f550a2804 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,7 +119,8 @@ markers = [ "toolset-limitation: Tests that cannot be solved no matter how smart the model, unless we improve the underlying toolsets themselves", "ask-for-clarification: Tests where Holmes should ask the user for clarification", "database: Tests involving database interactions", - "datadog: DataDog toolset" + "datadog: DataDog toolset", + "traces: Tests where the ai is expected to find the solution using the traces" ] addopts = [ diff --git a/tests/llm/fixtures/shared/tempo.yaml b/tests/llm/fixtures/shared/tempo.yaml index c35735895..5dcc9a81a 100644 --- a/tests/llm/fixtures/shared/tempo.yaml +++ b/tests/llm/fixtures/shared/tempo.yaml @@ -103,12 +103,9 @@ data: frontend_worker: frontend_address: 127.0.0.1:9095 max_concurrent_queries: 10 - search: - max_duration: 17520h # 2 years query_frontend: - search: - max_duration: 17520h # 2 years + max_batch_size: 5 overrides: max_traces_per_user: 10000 diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml new file mode 100644 index 000000000..5186ef4ab --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -0,0 +1,55 @@ +user_prompt: +- "The checkout service in namespace app-114 is experiencing high latency. 
Investigate all traces over 1s in duration" + +expected_output: + - The checkout service has intermittent high latency affecting approximately 30% of requests + - Root cause is missing database index on the shipping_rates table in Postgres + - Slow queries occur when promo_code parameter is included in the request + +tags: + - kubernetes + - medium + - traces + +port_forwards: + - namespace: app-114 + service: tempo + local_port: 3200 + remote_port: 3200 + +before_test: | + echo "๐Ÿš€ Setting up test 114 - Creating namespace app-114" + kubectl create namespace app-114 || true + echo "โœ… Namespace app-114 created successfully!" + + echo "๐Ÿ“ฆ Deploying Tempo from shared config" + kubectl apply -f ../../shared/tempo.yaml -n app-114 + + echo "โณ Waiting for Tempo pod to be ready" + kubectl wait --for=condition=ready pod -l app=tempo -n app-114 --timeout=60s + + echo "๐Ÿ” Checking Tempo deployment status" + kubectl get pods -n app-114 -l app=tempo + + echo "โฐ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" + TEMPO_READY=false + for i in {1..12}; do + if kubectl exec -n app-114 deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then + echo "โœ… Tempo is ready!" + TEMPO_READY=true + break + else + echo "โณ Attempt $i/12: Tempo not ready yet, waiting 5s..." + sleep 5 + fi + done + + if [ "$TEMPO_READY" = false ]; then + echo "โŒ Tempo failed to become ready after 60 seconds" + exit 1 + fi + + echo "โœ… Tempo deployment complete!" + +after_test: | + kubectl delete namespace app-114 || true diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml new file mode 100644 index 000000000..7298e0687 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml @@ -0,0 +1,10 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3200 + healthcheck: "ready" From db1f6efbd76ce11162680bd964d7d7c174302349 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Sun, 24 Aug 2025 12:07:23 +0300 Subject: [PATCH 04/15] traces are generated in a stable manner and checked --- .../checkout-service.yaml | 167 ++++++++++++++++++ .../test_case.yaml | 80 ++++++++- .../traffic-generator.yaml | 157 ++++++++++++++++ 3 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml new file mode 100644 index 000000000..f5fad6f0b --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml @@ -0,0 +1,167 @@ +# Simplified Checkout Service with Dummy SQL +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import 
BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-114.svc.cluster.local:4317", + insecure=True + ) + provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + + tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Log the incoming request + print(f"[CHECKOUT] Received request: {data}", flush=True) + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Simulate database query for shipping calculation + with tracer.start_as_current_span("database_query") as db_span: + db_span.set_attribute("db.system", "postgresql") + db_span.set_attribute("db.operation", "SELECT") + + if promo_code: + # Simulate slow query with promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true" + db_span.set_attribute("db.statement", query) + print(f"[DB] Executing SLOW query with promo_code: {promo_code} (simulating 2s delay)", flush=True) + time.sleep(2.0) # Simulate slow query + shipping_rate = 4.5 + discount = 15.0 + else: + # Simulate fast query without promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? 
AND active = true" + db_span.set_attribute("db.statement", query) + print(f"[DB] Executing FAST query without promo_code (simulating 0.1s delay)", flush=True) + time.sleep(0.1) # Simulate fast query + shipping_rate = 5.0 + discount = 0.0 + + # Calculate shipping cost + total_weight = sum(item.get('weight', 1.0) for item in items) + shipping_cost = total_weight * shipping_rate * (1 - discount/100) + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + response = { + "order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": round(shipping_cost, 2), + "total": round(total, 2) + } + + print(f"[CHECKOUT] Sending response: {response}", flush=True) + return jsonify(response) + + if __name__ == '__main__': + print("[CHECKOUT] Starting checkout service on port 8080", flush=True) + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 24 + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + targetPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml index 5186ef4ab..d0ffa6d7c 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -28,9 +28,6 @@ before_test: | echo "โณ Waiting for Tempo pod to be ready" kubectl wait --for=condition=ready pod -l app=tempo -n app-114 --timeout=60s - echo "๐Ÿ” Checking Tempo deployment status" - kubectl get pods -n app-114 -l app=tempo - echo "โฐ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" TEMPO_READY=false for i in {1..12}; do @@ -51,5 +48,82 @@ before_test: | echo "โœ… Tempo deployment complete!" 
+ echo "๐Ÿ›’ Deploying checkout service" + kubectl apply -f checkout-service.yaml -n app-114 + + echo "โณ Waiting for checkout pod to be ready" + kubectl wait --for=condition=ready pod -l app=checkout -n app-114 --timeout=60s + + echo "๐Ÿ” Checking checkout deployment status" + kubectl get pods -n app-114 -l app=checkout + + echo "๐Ÿšฆ Deploying traffic generator" + kubectl apply -f traffic-generator.yaml -n app-114 + + echo "โณ Waiting for traffic generator pod to be ready" + kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114 --timeout=60s + + echo "๐Ÿ” Checking all pods status" + kubectl get pods -n app-114 + + echo "โฐ Letting traffic generator run for 10 seconds to generate requests" + sleep 10 + + echo "๐Ÿ” Verifying traffic generator log entries" + if kubectl logs -n app-114 -l app=traffic-generator --tail=100 | grep -q "WITH promo_code"; then + echo "โœ… Found traffic generator log WITH promo_code" + else + echo "โŒ Missing traffic generator log WITH promo_code" + exit 1 + fi + + if kubectl logs -n app-114 -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then + echo "โœ… Found traffic generator log WITHOUT promo_code" + else + echo "โŒ Missing traffic generator log WITHOUT promo_code" + exit 1 + fi + + echo "๐Ÿ” Verifying checkout service log entries" + if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "FAST query without promo_code"; then + echo "โœ… Found checkout FAST query log" + else + echo "โŒ Missing checkout FAST query log" + exit 1 + fi + + if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "SLOW query with promo_code"; then + echo "โœ… Found checkout SLOW query log" + else + echo "โŒ Missing checkout SLOW query log" + exit 1 + fi + + # Commented out traffic generator trace checks as it no longer sends traces + # echo "๐Ÿ” Querying Tempo for traces from traffic generator" + # TRAFFIC_GEN_TRACES=$(curl -s "http://localhost:3200/api/search?tags=service.name%3Dtraffic-generator&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator" + + echo "๐Ÿ” Querying Tempo for traces from checkout service" + CHECKOUT_TRACES=$(kubectl run -n app-114 tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + echo "Found $CHECKOUT_TRACES traces from checkout-service" + + # Commented out traffic generator trace check + # if [ "$TRAFFIC_GEN_TRACES" -gt "0" ]; then + # echo "โœ… Found traces from traffic-generator" + # else + # echo "โŒ No traces found from traffic-generator" + # exit 1 + # fi + + if [ "$CHECKOUT_TRACES" -gt "0" ]; then + echo "โœ… Found traces from checkout-service" + else + echo "โŒ No traces found from checkout-service" + exit 1 + fi + + echo "โœ… Test setup complete!" 
+ after_test: | kubectl delete namespace app-114 || true diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml new file mode 100644 index 000000000..0ba5b9086 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml @@ -0,0 +1,157 @@ +# Traffic Generator Deployment +apiVersion: v1 +kind: Secret +metadata: + name: traffic-generator-app +type: Opaque +stringData: + app.py: | + import time + import random + import requests + from datetime import datetime + # from opentelemetry import trace + # from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + # from opentelemetry.sdk.trace import TracerProvider + # from opentelemetry.sdk.trace.export import BatchSpanProcessor + # from opentelemetry.sdk.resources import Resource + # from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # # Configure OpenTelemetry + # resource = Resource.create({"service.name": "traffic-generator"}) + # provider = TracerProvider(resource=resource) + # trace.set_tracer_provider(provider) + + # otlp_exporter = OTLPSpanExporter( + # endpoint="tempo.app-114.svc.cluster.local:4317", + # insecure=True + # ) + # provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + # # Instrument requests library + # RequestsInstrumentor().instrument() + + # tracer = trace.get_tracer(__name__) + + CHECKOUT_URL = "http://checkout.app-114.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25'] + + def generate_traffic(): + """Continuously generate traffic to checkout service""" + print("[TRAFFIC-GEN] Starting traffic generator", flush=True) + request_count = 0 + + while True: + request_count += 1 + + # 30% chance to include promo code + include_promo = random.random() < 0.3 + + # Build request data + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.5, 5.0), 2) + } + for i in range(random.randint(1, 3)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + # Log the request + promo_status = "WITH" if include_promo else "WITHOUT" + print(f"[TRAFFIC-GEN] Request #{request_count}: Sending request {promo_status} promo_code", flush=True) + + # Make request with tracing + # with tracer.start_as_current_span("checkout_request") as span: + # span.set_attribute("request.number", request_count) + # span.set_attribute("has.promo_code", include_promo) + # if include_promo: + # span.set_attribute("promo.code", data.get("promo_code")) + + try: + start_time = time.time() + response = requests.post(CHECKOUT_URL, json=data, timeout=10) + latency = time.time() - start_time + + # span.set_attribute("http.status_code", response.status_code) + # span.set_attribute("response.latency", latency) + + status = "success" if response.status_code == 200 else f"error({response.status_code})" + print(f"[TRAFFIC-GEN] Request #{request_count}: Response status={status}, latency={latency:.2f}s", flush=True) + + except Exception as e: + # span.record_exception(e) + # span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + print(f"[TRAFFIC-GEN] Request #{request_count}: Error - {str(e)}", flush=True) + + # Wait 
100ms to 200ms second before next request + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) + + if __name__ == '__main__': + print("[TRAFFIC-GEN] Starting...", flush=True) + + # Start generating traffic + generate_traffic() +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: traffic-generator +spec: + replicas: 1 + selector: + matchLabels: + app: traffic-generator + template: + metadata: + labels: + app: traffic-generator + spec: + containers: + - name: traffic-generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + # pip install opentelemetry-api opentelemetry-sdk \ + # opentelemetry-instrumentation-requests \ + # opentelemetry-exporter-otlp-proto-grpc && \ + touch /tmp/ready && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + exec: + command: + - cat + - /tmp/ready + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 12 + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + volumes: + - name: app + secret: + secretName: traffic-generator-app From 920d5ade0f32bf2642d2759330f365a44ed30d31 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Sun, 24 Aug 2025 12:51:50 +0300 Subject: [PATCH 05/15] make logs less obvious --- .../checkout-service.yaml | 16 +++++++++------- .../test_case.yaml | 14 +++----------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml index f5fad6f0b..783958e99 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml @@ -42,8 +42,8 @@ stringData: with tracer.start_as_current_span("process_checkout") as span: data = request.json or {} - # Log the incoming request - print(f"[CHECKOUT] Received request: {data}", flush=True) + # Log the incoming request (without revealing the data) + print(f"[CHECKOUT] Processing checkout request for user {data.get('user_id', 'guest')}", flush=True) # Extract parameters user_id = data.get('user_id', 'guest') @@ -67,16 +67,18 @@ stringData: # Simulate slow query with promo_code query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true" db_span.set_attribute("db.statement", query) - print(f"[DB] Executing SLOW query with promo_code: {promo_code} (simulating 2s delay)", flush=True) - time.sleep(2.0) # Simulate slow query + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(1.5, 3.5) + time.sleep(sleep_time) # Simulate slow query shipping_rate = 4.5 discount = 15.0 else: # Simulate fast query without promo_code query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? 
AND active = true" db_span.set_attribute("db.statement", query) - print(f"[DB] Executing FAST query without promo_code (simulating 0.1s delay)", flush=True) - time.sleep(0.1) # Simulate fast query + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) # Simulate fast query shipping_rate = 5.0 discount = 0.0 @@ -95,7 +97,7 @@ stringData: "total": round(total, 2) } - print(f"[CHECKOUT] Sending response: {response}", flush=True) + print(f"[CHECKOUT] Completed checkout request", flush=True) return jsonify(response) if __name__ == '__main__': diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml index d0ffa6d7c..4be7cc11b 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -84,18 +84,10 @@ before_test: | exit 1 fi - echo "๐Ÿ” Verifying checkout service log entries" - if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "FAST query without promo_code"; then - echo "โœ… Found checkout FAST query log" + if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "Processing checkout request"; then + echo "โœ… Found checkout request log" else - echo "โŒ Missing checkout FAST query log" - exit 1 - fi - - if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "SLOW query with promo_code"; then - echo "โœ… Found checkout SLOW query log" - else - echo "โŒ Missing checkout SLOW query log" + echo "โŒ Missing checkout request log" exit 1 fi From ecd28531f3d446e1cff0478e06819c039f59997b Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Sun, 24 Aug 2025 14:05:41 +0300 Subject: [PATCH 06/15] clearer expectations --- .../test_case.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml index 4be7cc11b..6675e9963 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -1,10 +1,9 @@ user_prompt: -- "The checkout service in namespace app-114 is experiencing high latency. Investigate all traces over 1s in duration" +- "The checkout service in namespace app-114 is experiencing high latency. Investigate why." expected_output: - - The checkout service has intermittent high latency affecting approximately 30% of requests - - Root cause is missing database index on the shipping_rates table in Postgres - - Slow queries occur when promo_code parameter is included in the request + - The answer must explicitly state that queries that include a promo code are slow or all slow requests include promo code. + - Including a query that mentions promo code is not sufficient. tags: - kubernetes @@ -115,6 +114,9 @@ before_test: | exit 1 fi + # Delete Traffic generator so the ai won't cheat + kubectl delete -f traffic-generator.yaml -n app-114 + echo "โœ… Test setup complete!" 
after_test: | From 974e53c23a942d836d1e002fdaaf6de00f79d57e Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 24 Aug 2025 14:09:43 +0300 Subject: [PATCH 07/15] WIP --- .../runbooks/high_latency_investigation.md | 312 ++++++++++++++++++ .../plugins/toolsets/prometheus_advanced.yaml | 266 +++++++++++++++ holmes/plugins/toolsets/tempo_advanced.yaml | 294 +++++++++++++++++ 3 files changed, 872 insertions(+) create mode 100644 holmes/plugins/runbooks/high_latency_investigation.md create mode 100644 holmes/plugins/toolsets/prometheus_advanced.yaml create mode 100644 holmes/plugins/toolsets/tempo_advanced.yaml diff --git a/holmes/plugins/runbooks/high_latency_investigation.md b/holmes/plugins/runbooks/high_latency_investigation.md new file mode 100644 index 000000000..0bd2d7a19 --- /dev/null +++ b/holmes/plugins/runbooks/high_latency_investigation.md @@ -0,0 +1,312 @@ +# Performance Investigation Runbook + +## Overview +This runbook guides the investigation of performance issues by analyzing both metrics and distributed traces to identify root causes. It works with any metric and trace attribute naming convention. + +## Prerequisites +- Prometheus or compatible metrics system +- Tempo or compatible distributed tracing system +- Service instrumentation with trace context + +## Investigation Steps + +### 1. Discover Available Metrics and Labels + +First, discover what metrics and labels are available: + +``` +# Use prometheus/metrics toolset +list_available_metrics( + name_filter="duration|latency|time", + type_filter="histogram" +) +``` + +### 2. Identify Affected Services and Operations + +Find which operations have high values: + +``` +# Use prometheus/advanced-latency toolset +find_top_metric_values( + metric_name="${your_latency_metric}", + group_by_label="${endpoint_label}", + top_n=10, + percentile=0.95, + time_range="30m" +) +``` + +### 3. Analyze Metric Distribution + +Understand if the latency is consistent or has specific patterns: + +``` +analyze_metric_distribution( + metric_name="${your_latency_metric}", + label_filters={"${service_label}": "${affected_service}"}, + time_range="1h" +) +``` + +Look for: +- Bimodal distributions (suggesting two different code paths) +- Long tail latencies (small percentage of very slow requests) +- Consistent high latency (systemic issue) + +### 4. Break Down by Available Dimensions + +Analyze by the labels available in your metrics: + +``` +analyze_metric_by_dimensions( + metric_name="${your_latency_metric}", + group_by=["${discovered_labels}"], # Use labels discovered in step 1 + filters={"${service_label}": "${affected_service}"}, + percentiles=[0.5, 0.95, 0.99], + time_range="1h" +) +``` + +Key patterns to identify based on your available labels: +- Specific operations or endpoints +- Different request types or methods +- Error conditions +- Client or user segments + +### 5. Discover Trace Attributes and Find Slow Traces + +First discover available span attributes: + +``` +# Use tempo toolset +fetch_tempo_tags( + start_datetime="-1h", + end_datetime="now" +) +``` + +Then find example slow traces: + +Get specific examples of slow requests for detailed analysis: + +``` +# Use tempo/advanced-tracing toolset +find_traces_by_criteria( + service_name="${affected_service}", + operation_name="${affected_operation}", + min_duration="${threshold_duration}", + span_attributes={"${your_attributes}": "${values}"}, + time_range="30m", + limit=10 +) +``` + +### 6. 
Analyze Trace Breakdown + +For each slow trace, identify where time is spent: + +``` +analyze_trace_latency_breakdown( + trace_id="${trace_id}", + include_dependencies=true +) +``` + +Look for: +- Long-running spans +- Sequential operations that could be parallelized +- External service calls with high latency +- Database queries taking excessive time + +### 7. Analyze Span Attributes + +Group traces by discovered attributes to find patterns: + +``` +analyze_span_attributes( + service_name="${affected_service}", + group_by_attributes=["${discovered_attributes}"], # Use attributes from step 5 + min_duration="500ms", + aggregation="p95", + time_range="1h" +) +``` + +This helps identify patterns based on your actual span attributes: +- Specific operations or endpoints +- User or tenant segments +- External dependencies +- Error conditions + +### 8. Analyze Operation Patterns + +Analyze operations within traces: + +``` +analyze_span_operations( + service_name="${affected_service}", + operation_type_attribute="${operation_attribute}", # e.g., 'db.system', 'rpc.method' + min_duration="100ms", + group_by_attributes=["${relevant_attributes}"], + time_range="1h" +) +``` + +Look for: +- N+1 query problems +- Missing indexes +- Lock contention +- Slow aggregation queries + +### 9. Correlate with Resource Metrics + +Identify resource metrics to correlate: + +``` +# First find available resource metrics +list_available_metrics( + name_filter="cpu|memory|disk|network|connection", + type_filter="gauge" +) + +# Then correlate +correlate_metrics( + primary_metric="${your_latency_metric}", + correlation_metrics=["${discovered_resource_metrics}"], + label_filters={"${service_label}": "${affected_service}"}, + time_range="1h" +) +``` + +### 10. Compare with Historical Baseline + +Determine if this is a new issue or degradation: + +``` +compare_metric_periods( + metric_name="${your_latency_metric}", + current_period="1h", + comparison_period="24h", + group_by=["${relevant_labels}"], + threshold_percent=20 +) +``` + +### 11. Trace Service Dependencies + +Understand the full request flow and identify bottlenecks: + +``` +trace_service_dependencies( + root_service="${affected_service}", + latency_threshold="100ms", + time_range="1h" +) +``` + +### 12. Check for Anomalies + +Detect unusual patterns in metrics: + +``` +detect_metric_anomalies( + metric_name="${your_latency_metric}", + sensitivity=3, + lookback_window="7d", + group_by=["${relevant_labels}"] +) +``` + +And in traces: + +``` +detect_trace_anomalies( + service_name="${affected_service}", + baseline_window="24h", + sensitivity=3, + anomaly_types=["latency", "errors", "span_count"] +) +``` + +## Common Root Causes and Solutions + +### 1. Database Issues +**Symptoms**: High database query duration in traces +**Solutions**: +- Add missing indexes +- Optimize queries +- Implement caching +- Use read replicas for read-heavy workloads + +### 2. N+1 Query Problems +**Symptoms**: Multiple similar database queries in a single trace +**Solutions**: +- Implement eager loading +- Use batch queries +- Add caching layer + +### 3. External Service Latency +**Symptoms**: High latency in spans calling external services +**Solutions**: +- Implement circuit breakers +- Add timeouts +- Use asynchronous processing +- Cache external service responses + +### 4. 
Resource Constraints +**Symptoms**: High CPU/memory correlation with latency +**Solutions**: +- Scale horizontally (add more pods/instances) +- Scale vertically (increase resource limits) +- Optimize code for efficiency +- Implement rate limiting + +### 5. Inefficient Code Paths +**Symptoms**: Specific request patterns much slower +**Solutions**: +- Profile and optimize hot paths +- Implement caching +- Parallelize independent operations +- Use more efficient algorithms + +### 6. Network Issues +**Symptoms**: Intermittent high latency, timeouts +**Solutions**: +- Check network connectivity +- Verify DNS resolution times +- Review firewall/proxy configurations +- Consider service mesh overhead + +### 7. Configuration Issues +**Symptoms**: Sudden latency increase after deployment +**Solutions**: +- Review recent configuration changes +- Check timeout settings +- Verify connection pool sizes +- Review retry configurations + +## Escalation Criteria + +Escalate to senior engineers or SRE team if: +- Latency affects > 10% of requests +- P95 latency exceeds SLO by 2x +- Issue persists after initial mitigation attempts +- Multiple services are affected simultaneously +- Data loss or corruption is suspected + +## Monitoring and Alerting + +Set up alerts for: +- P95 latency exceeding threshold +- Sudden latency spike (> 50% increase) +- Error rate correlation with latency +- Resource utilization above 80% + +## Post-Incident Actions + +1. Document root cause and timeline +2. Update runbook with new findings +3. Implement additional monitoring if gaps found +4. Consider architectural improvements +5. Share learnings with the team diff --git a/holmes/plugins/toolsets/prometheus_advanced.yaml b/holmes/plugins/toolsets/prometheus_advanced.yaml new file mode 100644 index 000000000..2144b722d --- /dev/null +++ b/holmes/plugins/toolsets/prometheus_advanced.yaml @@ -0,0 +1,266 @@ +name: prometheus/advanced-latency +description: Advanced Prometheus tools for latency and performance analysis +docs_url: https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/prometheus-advanced.html +icon_url: https://upload.wikimedia.org/wikipedia/commons/3/38/Prometheus_software_logo.svg + +tools: + - name: analyze_metric_by_dimensions + description: | + Analyzes any metric broken down by its available label dimensions. + Supports histogram, summary, gauge, and counter metrics. + Automatically discovers available labels from the metric. + parameters: + metric_name: + type: string + required: true + description: The metric name to analyze + percentiles: + type: array + required: false + description: For histogram/summary metrics - percentiles to calculate + default: [0.5, 0.95, 0.99] + group_by: + type: array + required: false + description: Labels to group by (will be validated against available labels) + filters: + type: object + required: false + description: Label filters to apply as key-value pairs + time_range: + type: string + required: false + description: Time range for analysis (e.g., "5m", "1h", "24h") + default: "1h" + aggregation: + type: string + required: false + description: Aggregation method (avg, sum, max, min, stddev) + default: "avg" + + - name: find_top_metric_values + description: | + Finds the highest values for any metric, grouped by labels. + Useful for identifying outliers, top consumers, or slowest operations. 
+ parameters: + metric_name: + type: string + required: true + description: The metric to analyze + group_by_label: + type: string + required: true + description: Label to group results by + top_n: + type: integer + required: false + description: Number of top entries to return + default: 10 + percentile: + type: number + required: false + description: For histogram/summary metrics - percentile to use + default: 0.95 + min_threshold: + type: number + required: false + description: Minimum value to include in results + default: 0 + time_range: + type: string + required: false + description: Time range for analysis + default: "1h" + + - name: correlate_metrics + description: | + Correlates one metric with others to identify relationships. + Helps identify bottlenecks and dependencies between metrics. + parameters: + primary_metric: + type: string + required: true + description: The primary metric to analyze + correlation_metrics: + type: array + required: true + description: List of metrics to correlate with + label_filters: + type: object + required: false + description: Label filters to apply to all metrics + correlation_method: + type: string + required: false + description: Correlation method (pearson, spearman, kendall) + default: "pearson" + time_range: + type: string + required: false + description: Time range for correlation analysis + default: "1h" + + - name: analyze_metric_distribution + description: | + Analyzes the distribution of any metric to identify patterns. + Detects bimodal distributions, outliers, and tail values. + parameters: + metric_name: + type: string + required: true + description: The metric to analyze (histogram type recommended) + buckets: + type: array + required: false + description: For histogram metrics - custom buckets for analysis + label_filters: + type: object + required: false + description: Label filters to apply + distribution_stats: + type: array + required: false + description: Statistics to calculate (mean, median, stddev, skew, kurtosis) + default: ["mean", "median", "stddev"] + time_range: + type: string + required: false + description: Time range for analysis + default: "1h" + + - name: compare_metric_periods + description: | + Compares latency between two time periods to identify degradations. + Useful for before/after deployment comparisons. + parameters: + metric_name: + type: string + required: true + description: The metric to compare + current_period: + type: string + required: false + description: Current time period (e.g., "1h") + default: "1h" + comparison_period: + type: string + required: false + description: Comparison period offset (e.g., "24h" for yesterday) + default: "24h" + group_by: + type: array + required: false + description: Labels to group comparison by + threshold_percent: + type: number + required: false + description: Percentage change threshold to highlight + default: 10 + + - name: find_metric_outliers_with_trace_ids + description: | + Identifies outlier metric values and retrieves associated trace IDs if available. + Links metrics to traces through exemplars or labels. 
+ parameters: + metric_name: + type: string + required: true + description: The metric to analyze + threshold: + type: number + required: true + description: Threshold value for outliers + trace_id_label: + type: string + required: false + description: Label containing trace ID (if available) + comparison_operator: + type: string + required: false + description: Operator for threshold comparison (>, <, >=, <=) + default: ">" + limit: + type: integer + required: false + description: Maximum number of results to return + default: 20 + time_range: + type: string + required: false + description: Time range to search + default: "1h" + + - name: analyze_metric_by_segments + description: | + Analyzes any metric grouped by specified label segments. + Useful for comparing performance across different dimensions. + parameters: + metric_name: + type: string + required: true + description: The metric to analyze + segment_labels: + type: array + required: true + description: Labels to segment analysis by + comparison_threshold: + type: number + required: false + description: Threshold value for comparison + aggregation: + type: string + required: false + description: How to aggregate within segments (avg, sum, max, min, p95) + default: "avg" + time_range: + type: string + required: false + description: Time range for analysis + default: "1h" + + - name: detect_metric_anomalies + description: | + Detects anomalous patterns in any metric using statistical analysis. + Identifies sudden spikes, gradual degradations, and periodic patterns. + parameters: + metric_name: + type: string + required: true + description: The metric to analyze + sensitivity: + type: number + required: false + description: Anomaly detection sensitivity (1-5, higher is more sensitive) + default: 3 + lookback_window: + type: string + required: false + description: Historical window for baseline (e.g., "7d") + default: "7d" + group_by: + type: array + required: false + description: Labels to detect anomalies by + +additional_instructions: | + This toolset provides generic metric analysis capabilities without assuming specific metric names or labels. + + When investigating performance issues: + + 1. First discover available metrics and their labels using prometheus/metrics list_available_metrics + 2. Use analyze_metric_by_dimensions to break down metrics by their actual labels + 3. Find outliers with find_top_metric_values using discovered label names + 4. Correlate different metrics to identify relationships + 5. Compare time periods to detect regressions + 6. Look for anomalies in metric patterns + + For histogram metrics: + - Use histogram_quantile() for percentile calculations + - The _bucket suffix contains the histogram data + - Apply rate() before histogram_quantile() + + For summary metrics: + - Quantiles are pre-calculated in labels + - Check for quantile labels in the metric + + The tools work with any metric naming convention - always discover actual metric and label names first. 
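
To make the histogram guidance above concrete, here is a minimal sketch of the kind of query a tool such as `analyze_metric_by_dimensions` could issue against the Prometheus HTTP API: `rate()` applied to the `_bucket` series first, then `histogram_quantile()`, grouped by `le` plus one discovered label. The metric name, label, and `PROMETHEUS_URL` below are illustrative assumptions only, not values taken from this patch, and the snippet is not the toolset's actual implementation.

```python
# Minimal sketch: p95 from a histogram metric, broken down by one label.
# Assumes a reachable Prometheus and a hypothetical metric
# "http_request_duration_seconds" exposed as a histogram.
import requests

PROMETHEUS_URL = "http://localhost:9090"  # assumption; point at your Prometheus


def p95_by(metric: str, label: str, window: str = "1h") -> dict:
    # rate() over the _bucket series first, then histogram_quantile(),
    # grouping by `le` plus the label we want to segment by.
    query = (
        f"histogram_quantile(0.95, "
        f"sum(rate({metric}_bucket[{window}])) by (le, {label}))"
    )
    resp = requests.post(
        f"{PROMETHEUS_URL}/api/v1/query", data={"query": query}, timeout=60
    )
    resp.raise_for_status()
    return resp.json()["data"]


if __name__ == "__main__":
    print(p95_by("http_request_duration_seconds", "pod"))
```

For summary metrics the quantiles are already materialized as labelled series, so the same helper would simply select on the `quantile` label instead of computing `histogram_quantile()`.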
diff --git a/holmes/plugins/toolsets/tempo_advanced.yaml b/holmes/plugins/toolsets/tempo_advanced.yaml new file mode 100644 index 000000000..89884cb8e --- /dev/null +++ b/holmes/plugins/toolsets/tempo_advanced.yaml @@ -0,0 +1,294 @@ +name: tempo/advanced-tracing +description: Advanced Tempo tools for distributed tracing and performance root cause analysis +docs_url: https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/tempo-advanced.html +icon_url: https://grafana.com/static/assets/img/blog/tempo.png + +tools: + - name: analyze_trace_latency_breakdown + description: | + Analyzes a trace to identify where time is spent across services. + Provides span-level breakdown with critical path analysis. + parameters: + trace_id: + type: string + required: true + description: The trace ID to analyze + include_dependencies: + type: boolean + required: false + description: Include external service dependencies in analysis + default: true + + - name: find_traces_by_criteria + description: | + Finds traces matching specified criteria. + Filters can be based on any span attributes available in your traces. + parameters: + service_name: + type: string + required: false + description: Service to search traces for + operation_name: + type: string + required: false + description: Specific operation name + min_duration: + type: string + required: true + description: Minimum trace duration (e.g., "500ms", "2s") + max_duration: + type: string + required: false + description: Maximum trace duration + span_attributes: + type: object + required: false + description: Any span attributes to filter by (discovered from your traces) + error_only: + type: boolean + required: false + description: Only return traces with errors + default: false + time_range: + type: string + required: false + description: Time range to search + default: "1h" + limit: + type: integer + required: false + description: Maximum traces to return + default: 50 + + - name: correlate_traces_with_metrics + description: | + Correlates slow traces with metrics to identify resource constraints. + Links trace spans to CPU, memory, network metrics. + parameters: + trace_ids: + type: array + required: false + description: Specific trace IDs to correlate (auto-detects if not provided) + service_name: + type: string + required: false + description: Service to focus correlation on + metric_sources: + type: array + required: false + description: Metric sources to correlate (prometheus, datadog, etc.) + default: ["prometheus"] + time_window: + type: string + required: false + description: Time window around trace for metrics + default: "1m" + + - name: analyze_span_attributes + description: | + Analyzes span attributes to find patterns. + Groups traces by any available span attributes. + parameters: + service_name: + type: string + required: true + description: Service to analyze spans for + group_by_attributes: + type: array + required: true + description: Span attributes to group by (will use actual attributes from your traces) + min_duration: + type: string + required: false + description: Minimum span duration to include + default: "100ms" + aggregation: + type: string + required: false + description: How to aggregate (p50, p95, p99, avg, max) + default: "p95" + time_range: + type: string + required: false + description: Time range for analysis + default: "1h" + + - name: trace_service_dependencies + description: | + Maps service dependencies and their latency contributions. + Identifies critical path and bottleneck services. 
+ parameters: + root_service: + type: string + required: true + description: Root service to trace dependencies from + depth: + type: integer + required: false + description: Maximum dependency depth to analyze + default: 5 + latency_threshold: + type: string + required: false + description: Only show dependencies above this latency + time_range: + type: string + required: false + description: Time range for dependency analysis + default: "1h" + + - name: compare_trace_patterns + description: | + Compares traces between different time periods or deployments. + Identifies changes in service behavior and latency. + parameters: + service_name: + type: string + required: true + description: Service to compare traces for + operation_name: + type: string + required: false + description: Specific operation to compare + baseline_period: + type: object + required: true + description: Baseline time period (start, end) + comparison_period: + type: object + required: true + description: Comparison time period (start, end) + attributes_to_compare: + type: array + required: false + description: Specific attributes to compare + + - name: detect_trace_anomalies + description: | + Detects anomalous traces using statistical analysis. + Identifies outliers in latency, error patterns, and span counts. + parameters: + service_name: + type: string + required: true + description: Service to detect anomalies for + baseline_window: + type: string + required: false + description: Historical window for baseline + default: "24h" + sensitivity: + type: number + required: false + description: Anomaly sensitivity (1-5) + default: 3 + anomaly_types: + type: array + required: false + description: Types of anomalies to detect + default: ["latency", "errors", "span_count"] + + - name: analyze_span_operations + description: | + Analyzes operations within traces based on span attributes. + Can identify slow operations, repeated patterns, and bottlenecks. + parameters: + service_name: + type: string + required: false + description: Service to analyze operations for + operation_type_attribute: + type: string + required: false + description: Span attribute that identifies operation type (e.g., 'db.system', 'rpc.method') + min_duration: + type: string + required: false + description: Minimum operation duration to include + default: "100ms" + group_by_attributes: + type: array + required: false + description: Additional attributes to group by + time_range: + type: string + required: false + description: Time range for analysis + default: "1h" + + - name: trace_error_propagation + description: | + Traces how errors propagate through the system. + Identifies error origins and affected downstream services. + parameters: + error_type: + type: string + required: false + description: Specific error type to trace + service_name: + type: string + required: false + description: Service where error originated + include_retries: + type: boolean + required: false + description: Include retry attempts in analysis + default: true + time_range: + type: string + required: false + description: Time range to analyze + default: "1h" + + - name: calculate_service_metrics_from_traces + description: | + Calculates service metrics from trace data. + Measures success rate, duration percentiles, and other KPIs. 
+ parameters: + service_name: + type: string + required: true + description: Service to calculate metrics for + operation_name: + type: string + required: false + description: Specific operation (all if not specified) + metric_type: + type: string + required: true + description: Type of metric (duration_percentiles, success_rate, operation_rate) + threshold: + type: number + required: false + description: Threshold for metric calculation + time_range: + type: string + required: false + description: Time range for SLI calculation + default: "1h" + +additional_instructions: | + This toolset provides generic trace analysis without assuming specific attribute names. + + When investigating performance issues using traces: + + 1. First use fetch_tempo_tags to discover available span attributes in your system + 2. Use discovered attributes to filter and analyze traces + 3. Find example traces matching the reported issue pattern + 4. Group by actual span attributes to identify patterns + 5. Break down individual traces to find bottleneck spans + 6. Compare different time periods to identify changes + + The tools work with any span attribute naming convention: + - OpenTelemetry semantic conventions + - Custom application-specific attributes + - Service mesh added attributes + - APM vendor-specific attributes + + Common patterns to look for: + - Repeated operations (potential N+1 problems) + - Long-duration spans (bottlenecks) + - Error propagation through services + - Timeout and retry patterns + + Always discover and use actual attribute names from your tracing data rather than assuming standard names. From 55cbbcff55a602af0aca32aabcbd187943ee8442 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Sun, 24 Aug 2025 14:21:27 +0300 Subject: [PATCH 08/15] add tags --- .../114_checkout_latency_tracing_rebuild/test_case.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml index 6675e9963..3d14e66ba 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -7,7 +7,8 @@ expected_output: tags: - kubernetes - - medium + - hard + - chain-of-causation - traces port_forwards: From e91eeb2ac49f82e0aab076cc6cfb0ba0b00c1b23 Mon Sep 17 00:00:00 2001 From: Tomer Keshet Date: Sun, 24 Aug 2025 14:36:20 +0300 Subject: [PATCH 09/15] WIP - not allowing Kubectl secrets --- holmes/plugins/toolsets/kubernetes.yaml | 109 +++++++++++++++++++----- 1 file changed, 88 insertions(+), 21 deletions(-) diff --git a/holmes/plugins/toolsets/kubernetes.yaml b/holmes/plugins/toolsets/kubernetes.yaml index 81e61412b..eabaf3c35 100644 --- a/holmes/plugins/toolsets/kubernetes.yaml +++ b/holmes/plugins/toolsets/kubernetes.yaml @@ -16,34 +16,70 @@ toolsets: for example when a user asks - 'describe pod xyz-123' - 'show service xyz-123 in namespace my-ns' - command: "kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + You are NEVER EVER allowed to get kubernetes secrets! 
+ command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl describe {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_get_by_name" - description: "Run `kubectl get --show-labels`" - command: "kubectl get --show-labels -o wide {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Run `kubectl get --show-labels`. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get --show-labels -o wide {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_get_by_kind_in_namespace" - description: "Run `kubectl get -n --show-labels` to get all resources of a given type in namespace" - command: "kubectl get --show-labels -o wide {{ kind }} -n {{namespace}}" + description: "Run `kubectl get -n --show-labels` to get all resources of a given type in namespace. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get --show-labels -o wide {{ kind }} -n {{namespace}} - name: "kubectl_get_by_kind_in_cluster" - description: "Run `kubectl get -A --show-labels` to get all resources of a given type in the cluster" - command: "kubectl get -A --show-labels -o wide {{ kind }}" + description: "Run `kubectl get -A --show-labels` to get all resources of a given type in the cluster. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -A --show-labels -o wide {{ kind }} - name: "kubectl_find_resource" - description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels" - command: "kubectl get -A --show-labels -o wide {{ kind }} | grep {{ keyword }}" + description: "Run `kubectl get {{ kind }} -A --show-labels | grep {{ keyword }}` to find a resource where you know a substring of the name, IP, namespace, or labels. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -A --show-labels -o wide {{ kind }} | grep {{ keyword }} - name: "kubectl_get_yaml" - description: "Run `kubectl get -o yaml` on a single Kubernetes resource" - command: "kubectl get -o yaml {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Run `kubectl get -o yaml` on a single Kubernetes resource. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get -o yaml {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_events" - description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment', 'job', 'node', etc." 
- command: "kubectl events --for {{resource_type}}/{{ resource_name }}{% if namespace %} -n {{ namespace }}{% endif %}" + description: "Retrieve the events for a specific Kubernetes resource. `resource_type` can be any kubernetes resource type: 'pod', 'service', 'deployment', 'job', 'node', etc. You are NEVER EVER allowed to get kubernetes secrets!" + command: | + if [ "{{resource_type}}" = "secret" ] || [ "{{resource_type}}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl events --for {{resource_type}}/{{ resource_name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_memory_requests_all_namespaces" - description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly." + description: "Fetch and display memory requests for all pods across all namespaces in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly. You are NEVER EVER allowed to get kubernetes secrets!" command: | kubectl get pods --all-namespaces -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \ awk ' @@ -85,7 +121,7 @@ toolsets: }' | sort -k3 -nr - name: "kubectl_memory_requests_namespace" - description: "Fetch and display memory requests for all pods in a specified namespace in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly." + description: "Fetch and display memory requests for all pods in a specified namespace in MiB, summing requests across multiple containers where applicable and handling binary, decimal, and millibyte units correctly. You are NEVER EVER allowed to get kubernetes secrets!" command: | kubectl get pods -n {{ namespace }} -o custom-columns="NAMESPACE:.metadata.namespace,NAME:.metadata.name,MEMORY_REQUEST:.spec.containers[*].resources.requests.memory" --no-headers | \ awk ' @@ -129,8 +165,13 @@ toolsets: - name: "kubernetes_jq_query" user_description: "Query Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -r {{jq_expr}}" description: > - Use kubectl to get json for all resources of a specific kind pipe the results to jq to filter them. Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not) - command: kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }} + Use kubectl to get json for all resources of a specific kind pipe the results to jq to filter them. Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | .spec.containers[].image | select(test("^gcr.io/") | not). You are NEVER EVER allowed to get kubernetes secrets! + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl get {{ kind }} --all-namespaces -o json | jq -r {{ jq_expr }} - name: "kubernetes_count" user_description: "Count Kubernetes Resources: kubectl get {{kind}} --all-namespaces -o json | jq -c -r {{ jq_expr }}" @@ -140,7 +181,13 @@ toolsets: Use select() to filter objects before extracting properties, e.g. 
.items[] | select(.metadata.namespace == "test-1") | .metadata.name Do not worry about escaping the jq_expr it will be done by the system on an unescaped expression that you give. e.g. give an expression like .items[] | select(.spec.containers[].image | test("^gcr.io/") | not) | .metadata.name + You are NEVER EVER allowed to get kubernetes secrets! script: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + echo "Command executed: kubectl get {{ kind }} --all-namespaces -o json | jq -c -r {{ jq_expr }}" echo "---" @@ -239,10 +286,20 @@ toolsets: tools: - name: "kubectl_lineage_children" description: "Get all children/dependents of a Kubernetes resource, recursively, including their status" - command: "kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_lineage_parents" description: "Get all parents/dependencies of a Kubernetes resource, recursively, including their status" - command: "kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kubectl lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D kubernetes/kube-lineage-extras: # To make this work, build kube-lineage from source description: "Fetches children/dependents and parents/dependencies resources using kube-lineage" @@ -255,7 +312,17 @@ toolsets: tools: - name: "kubectl_lineage_children" description: "Get all children/dependents of a Kubernetes resource, recursively, including their status" - command: "kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %}" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} - name: "kubectl_lineage_parents" description: "Get all parents/dependencies of a Kubernetes resource, recursively, including their status" - command: "kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D" + command: | + if [ "{{ kind }}" = "secret" ] || [ "{{ kind }}" = "secrets" ]; then + echo "Not allowed to get kubernetes secrets" + exit 1 + fi + kube-lineage {{ kind }} {{ name }}{% if namespace %} -n {{ namespace }}{% endif %} -D From 939f54b658ae090a1815ac49cc2b28fb2ea1e951 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 24 Aug 2025 14:42:58 +0300 Subject: [PATCH 10/15] WIP --- .../runbooks/high_latency_investigation.md | 10 +- .../toolsets/grafana/toolset_grafana_tempo.py | 317 +++++++++++++- .../plugins/toolsets/prometheus/prometheus.py | 397 ++++++++++++++++++ .../plugins/toolsets/prometheus_advanced.yaml | 266 ------------ holmes/plugins/toolsets/tempo_advanced.yaml | 294 ------------- 5 files changed, 716 insertions(+), 568 deletions(-) delete mode 100644 holmes/plugins/toolsets/prometheus_advanced.yaml delete mode 100644 holmes/plugins/toolsets/tempo_advanced.yaml diff --git a/holmes/plugins/runbooks/high_latency_investigation.md b/holmes/plugins/runbooks/high_latency_investigation.md index 
0bd2d7a19..810206aad 100644 --- a/holmes/plugins/runbooks/high_latency_investigation.md +++ b/holmes/plugins/runbooks/high_latency_investigation.md @@ -88,16 +88,12 @@ fetch_tempo_tags( Then find example slow traces: -Get specific examples of slow requests for detailed analysis: - ``` -# Use tempo/advanced-tracing toolset -find_traces_by_criteria( +# Use grafana/tempo toolset +fetch_tempo_traces( service_name="${affected_service}", - operation_name="${affected_operation}", min_duration="${threshold_duration}", - span_attributes={"${your_attributes}": "${values}"}, - time_range="30m", + start_datetime="-30m", limit=10 ) ``` diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index c1c41daa8..2d84d2c60 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -291,6 +291,314 @@ def get_parameterized_one_liner(self, params: Dict) -> str: return f"{toolset_name_for_one_liner(self._toolset.name)}: Fetched Tempo Trace (trace_id={params.get('trace_id')})" +class AnalyzeTracesByAttributes(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="analyze_traces_by_attributes", + description="Analyzes traces grouped by specified span attributes to find patterns in performance or errors.", + parameters={ + "service_name": ToolParameter( + description="Service to analyze traces for", + type="string", + required=False, + ), + "group_by_attributes": ToolParameter( + description="Span attributes to group analysis by (discovered from your traces)", + type="array", + required=True, + ), + "min_duration": ToolParameter( + description="Minimum duration to include (e.g., '100ms', '1s')", + type="string", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "limit": ToolParameter( + description="Maximum number of traces to analyze", + type="integer", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build query with flexible attributes + group_by = params.get("group_by_attributes", []) + service_name = params.get("service_name") + min_duration = params.get("min_duration", "100ms") + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + # Build TraceQL query + filters = [] + if service_name: + filters.append(f'resource.service.name="{service_name}"') + filters.append(f"duration>{min_duration}") + + query = " && ".join(filters) + query = f"{{{query}}}" + + base_url = get_base_url(self._toolset.grafana_config) + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=params.get("limit", 100), + ) + + # Group traces by specified attributes + grouped_analysis = {} + for trace in traces: + # Extract attribute values for grouping + group_key = [] + for attr in group_by: + # Look for attribute in trace spans + value = "unknown" + # This would need actual trace parsing logic + group_key.append(f"{attr}={value}") + + key = ", ".join(group_key) + if key not in 
grouped_analysis: + grouped_analysis[key] = { + "count": 0, + "total_duration": 0, + "errors": 0, + } + + grouped_analysis[key]["count"] += 1 + # Add duration and error tracking + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(grouped_analysis), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing traces: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Analyze traces by attributes" + + +class FindSlowOperations(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="find_slow_operations", + description="Identifies slow operations within traces based on span durations and attributes.", + parameters={ + "service_name": ToolParameter( + description="Service to analyze", + type="string", + required=False, + ), + "operation_attribute": ToolParameter( + description="Span attribute that identifies operation type", + type="string", + required=False, + ), + "min_duration": ToolParameter( + description="Minimum duration to consider slow", + type="string", + required=True, + ), + "group_by": ToolParameter( + description="Additional attributes to group by", + type="array", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for search", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for search", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + min_duration = get_param_or_raise(params, "min_duration") + service_name = params.get("service_name") + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + # Build query for slow operations + filters = [f"duration>{min_duration}"] + if service_name: + filters.append(f'resource.service.name="{service_name}"') + + query = " && ".join(filters) + query = f"{{{query}}}" + + base_url = get_base_url(self._toolset.grafana_config) + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=50, + ) + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=format_traces_list(traces), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error finding slow operations: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Find slow operations" + + +class CompareTracePeriods(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="compare_trace_periods", + description="Compares trace patterns between two time periods to identify changes in performance or behavior.", + parameters={ + "service_name": ToolParameter( + description="Service to compare", + type="string", + required=True, + ), + "baseline_start": ToolParameter( + description="Baseline period start time", + type="string", + required=True, + ), + "baseline_end": ToolParameter( + description="Baseline period end time", + type="string", + required=True, + ), + "comparison_start": ToolParameter( + description="Comparison 
period start time", + type="string", + required=True, + ), + "comparison_end": ToolParameter( + description="Comparison period end time", + type="string", + required=True, + ), + "attributes_to_compare": ToolParameter( + description="Span attributes to compare", + type="array", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + service_name = get_param_or_raise(params, "service_name") + + # Get baseline traces + baseline_start, baseline_end = process_timestamps_to_int( + params.get("baseline_start"), + params.get("baseline_end"), + default_time_span_seconds=3600, + ) + + comparison_start, comparison_end = process_timestamps_to_int( + params.get("comparison_start"), + params.get("comparison_end"), + default_time_span_seconds=3600, + ) + + query = f'{{resource.service.name="{service_name}"}}' + base_url = get_base_url(self._toolset.grafana_config) + + # Fetch baseline traces + baseline_traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=baseline_start, + end=baseline_end, + limit=100, + ) + + # Fetch comparison traces + comparison_traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=comparison_start, + end=comparison_end, + limit=100, + ) + + # Compare the two sets + comparison_result = { + "baseline_count": len(baseline_traces), + "comparison_count": len(comparison_traces), + "baseline_period": f"{baseline_start} to {baseline_end}", + "comparison_period": f"{comparison_start} to {comparison_end}", + } + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(comparison_result), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error comparing periods: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return ( + f"{toolset_name_for_one_liner(self._toolset.name)}: Compare trace periods" + ) + + class GrafanaTempoToolset(BaseGrafanaTempoToolset): def __init__(self): super().__init__( @@ -298,7 +606,14 @@ def __init__(self): description="Fetches kubernetes traces from Tempo", icon_url="https://grafana.com/static/assets/img/blog/tempo.png", docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanatempo.html", - tools=[GetTempoTraces(self), GetTempoTraceById(self), GetTempoTags(self)], + tools=[ + GetTempoTraces(self), + GetTempoTraceById(self), + GetTempoTags(self), + AnalyzeTracesByAttributes(self), + FindSlowOperations(self), + CompareTracePeriods(self), + ], ) template_file_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "toolset_grafana_tempo.jinja2") diff --git a/holmes/plugins/toolsets/prometheus/prometheus.py b/holmes/plugins/toolsets/prometheus/prometheus.py index ace6d3faf..90a7bfc5f 100644 --- a/holmes/plugins/toolsets/prometheus/prometheus.py +++ b/holmes/plugins/toolsets/prometheus/prometheus.py @@ -816,6 +816,399 @@ def get_parameterized_one_liner(self, params) -> str: return f"{toolset_name_for_one_liner(self.toolset.name)}: Query ({description})" +class AnalyzeMetricByDimensions(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="analyze_metric_by_dimensions", + description="Analyzes any metric broken down by its available label dimensions. 
Automatically discovers available labels from the metric.", + parameters={ + "metric_name": ToolParameter( + description="The metric name to analyze", + type="string", + required=True, + ), + "group_by": ToolParameter( + description="Labels to group by (will be validated against available labels)", + type="array", + required=False, + ), + "filters": ToolParameter( + description="Label filters to apply as key-value pairs", + type="object", + required=False, + ), + "percentiles": ToolParameter( + description="For histogram/summary metrics - percentiles to calculate", + type="array", + required=False, + ), + "time_range": ToolParameter( + description="Time range for analysis (e.g., '5m', '1h', '24h')", + type="string", + required=False, + ), + "aggregation": ToolParameter( + description="Aggregation method (avg, sum, max, min, p50, p95, p99)", + type="string", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + group_by = params.get("group_by", []) + filters = params.get("filters", {}) + time_range = params.get("time_range", "1h") + aggregation = params.get("aggregation", "avg") + + # Build the base query with filters + filter_str = "" + if filters: + filter_items = [f'{k}="{v}"' for k, v in filters.items()] + filter_str = "{" + ",".join(filter_items) + "}" + + # Build the query based on aggregation type + if aggregation in ["p50", "p95", "p99"]: + percentile = float(aggregation[1:]) / 100 + query = f"histogram_quantile({percentile}, sum(rate({metric_name}_bucket{filter_str}[{time_range}])) by (le" + if group_by: + query += f", {', '.join(group_by)}" + query += "))" + elif group_by: + query = f'{aggregation}(rate({metric_name}{filter_str}[{time_range}])) by ({", ".join(group_by)})' + else: + query = f"{aggregation}(rate({metric_name}{filter_str}[{time_range}]))" + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing metric: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Analyze {metric_name} by dimensions" + + +class FindTopMetricValues(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="find_top_metric_values", + description="Finds the highest values for any metric, grouped by labels. 
Useful for identifying outliers or slowest operations.", + parameters={ + "metric_name": ToolParameter( + description="The metric to analyze", + type="string", + required=True, + ), + "group_by_label": ToolParameter( + description="Label to group results by", + type="string", + required=True, + ), + "top_n": ToolParameter( + description="Number of top entries to return", + type="integer", + required=False, + ), + "percentile": ToolParameter( + description="For histogram/summary metrics - percentile to use (e.g., 0.95)", + type="number", + required=False, + ), + "time_range": ToolParameter( + description="Time range for analysis", + type="string", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + group_by_label = get_param_or_raise(params, "group_by_label") + top_n = params.get("top_n", 10) + percentile = params.get("percentile", 0.95) + time_range = params.get("time_range", "1h") + + # Check if it's a histogram metric + if "_bucket" in metric_name or percentile: + query = f"topk({top_n}, histogram_quantile({percentile}, sum(rate({metric_name}_bucket[{time_range}])) by (le, {group_by_label})))" + else: + query = f"topk({top_n}, avg(rate({metric_name}[{time_range}])) by ({group_by_label}))" + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error finding top values: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Find top values for {metric_name}" + + +class CompareMetricPeriods(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="compare_metric_periods", + description="Compares a metric between two time periods to identify changes or degradations.", + parameters={ + "metric_name": ToolParameter( + description="The metric to compare", + type="string", + required=True, + ), + "current_period": ToolParameter( + description="Current time period (e.g., '1h')", + type="string", + required=False, + ), + "comparison_offset": ToolParameter( + description="How far back to compare (e.g., '24h' for yesterday)", + type="string", + required=False, + ), + "group_by": ToolParameter( + description="Labels to group comparison by", + type="array", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + 
status=ToolResultStatus.ERROR, + error="Prometheus is not configured. Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + current_period = params.get("current_period", "1h") + comparison_offset = params.get("comparison_offset", "24h") + group_by = params.get("group_by", []) + + # Build group by clause + group_clause = "" + if group_by: + group_clause = f' by ({", ".join(group_by)})' + + # Query comparing current vs offset period + query = f""" + (avg(rate({metric_name}[{current_period}])){group_clause} - + avg(rate({metric_name}[{current_period}] offset {comparison_offset})){group_clause}) / + avg(rate({metric_name}[{current_period}] offset {comparison_offset})){group_clause} * 100 + """ + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error comparing periods: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Compare {metric_name} periods" + + +class DetectMetricAnomalies(BasePrometheusTool): + def __init__(self, toolset: "PrometheusToolset"): + super().__init__( + name="detect_metric_anomalies", + description="Detects anomalous patterns in metrics using statistical analysis. Identifies spikes and deviations from normal.", + parameters={ + "metric_name": ToolParameter( + description="The metric to analyze", + type="string", + required=True, + ), + "sensitivity": ToolParameter( + description="Standard deviations for anomaly threshold (2-4 typical)", + type="number", + required=False, + ), + "lookback_window": ToolParameter( + description="Historical window for baseline (e.g., '7d')", + type="string", + required=False, + ), + "group_by": ToolParameter( + description="Labels to detect anomalies by", + type="array", + required=False, + ), + }, + toolset=toolset, + ) + + def _invoke(self, params: Any) -> StructuredToolResult: + if not self.toolset.config or not self.toolset.config.prometheus_url: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Prometheus is not configured. 
Prometheus URL is missing", + params=params, + ) + + try: + metric_name = get_param_or_raise(params, "metric_name") + sensitivity = params.get("sensitivity", 3) + lookback_window = params.get("lookback_window", "1h") + group_by = params.get("group_by", []) + + # Build group by clause + group_clause = "" + if group_by: + group_clause = f' by ({", ".join(group_by)})' + + # Z-score based anomaly detection query + query = f""" + (rate({metric_name}[5m]){group_clause} - + avg_over_time(rate({metric_name}[5m])[{lookback_window}:]){group_clause}) / + stddev_over_time(rate({metric_name}[5m])[{lookback_window}:]){group_clause} > {sensitivity} + """ + + url = urljoin(self.toolset.config.prometheus_url, "api/v1/query") + payload = {"query": query} + + response = requests.post( + url=url, + headers=self.toolset.config.headers, + auth=self.toolset.config.get_auth(), + data=payload, + timeout=60, + verify=self.toolset.config.prometheus_ssl_enabled, + ) + + if response.status_code == 200: + data = response.json() + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=json.dumps(data.get("data"), indent=2), + params=params, + ) + else: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Query failed with status {response.status_code}: {response.text}", + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error detecting anomalies: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params) -> str: + metric_name = params.get("metric_name", "") + return f"{toolset_name_for_one_liner(self.toolset.name)}: Detect anomalies in {metric_name}" + + class PrometheusToolset(Toolset): config: Optional[Union[PrometheusConfig, AMPConfig]] = None @@ -831,6 +1224,10 @@ def __init__(self): ListAvailableMetrics(toolset=self), ExecuteInstantQuery(toolset=self), ExecuteRangeQuery(toolset=self), + AnalyzeMetricByDimensions(toolset=self), + FindTopMetricValues(toolset=self), + CompareMetricPeriods(toolset=self), + DetectMetricAnomalies(toolset=self), ], tags=[ ToolsetTag.CORE, diff --git a/holmes/plugins/toolsets/prometheus_advanced.yaml b/holmes/plugins/toolsets/prometheus_advanced.yaml deleted file mode 100644 index 2144b722d..000000000 --- a/holmes/plugins/toolsets/prometheus_advanced.yaml +++ /dev/null @@ -1,266 +0,0 @@ -name: prometheus/advanced-latency -description: Advanced Prometheus tools for latency and performance analysis -docs_url: https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/prometheus-advanced.html -icon_url: https://upload.wikimedia.org/wikipedia/commons/3/38/Prometheus_software_logo.svg - -tools: - - name: analyze_metric_by_dimensions - description: | - Analyzes any metric broken down by its available label dimensions. - Supports histogram, summary, gauge, and counter metrics. - Automatically discovers available labels from the metric. 
- parameters: - metric_name: - type: string - required: true - description: The metric name to analyze - percentiles: - type: array - required: false - description: For histogram/summary metrics - percentiles to calculate - default: [0.5, 0.95, 0.99] - group_by: - type: array - required: false - description: Labels to group by (will be validated against available labels) - filters: - type: object - required: false - description: Label filters to apply as key-value pairs - time_range: - type: string - required: false - description: Time range for analysis (e.g., "5m", "1h", "24h") - default: "1h" - aggregation: - type: string - required: false - description: Aggregation method (avg, sum, max, min, stddev) - default: "avg" - - - name: find_top_metric_values - description: | - Finds the highest values for any metric, grouped by labels. - Useful for identifying outliers, top consumers, or slowest operations. - parameters: - metric_name: - type: string - required: true - description: The metric to analyze - group_by_label: - type: string - required: true - description: Label to group results by - top_n: - type: integer - required: false - description: Number of top entries to return - default: 10 - percentile: - type: number - required: false - description: For histogram/summary metrics - percentile to use - default: 0.95 - min_threshold: - type: number - required: false - description: Minimum value to include in results - default: 0 - time_range: - type: string - required: false - description: Time range for analysis - default: "1h" - - - name: correlate_metrics - description: | - Correlates one metric with others to identify relationships. - Helps identify bottlenecks and dependencies between metrics. - parameters: - primary_metric: - type: string - required: true - description: The primary metric to analyze - correlation_metrics: - type: array - required: true - description: List of metrics to correlate with - label_filters: - type: object - required: false - description: Label filters to apply to all metrics - correlation_method: - type: string - required: false - description: Correlation method (pearson, spearman, kendall) - default: "pearson" - time_range: - type: string - required: false - description: Time range for correlation analysis - default: "1h" - - - name: analyze_metric_distribution - description: | - Analyzes the distribution of any metric to identify patterns. - Detects bimodal distributions, outliers, and tail values. - parameters: - metric_name: - type: string - required: true - description: The metric to analyze (histogram type recommended) - buckets: - type: array - required: false - description: For histogram metrics - custom buckets for analysis - label_filters: - type: object - required: false - description: Label filters to apply - distribution_stats: - type: array - required: false - description: Statistics to calculate (mean, median, stddev, skew, kurtosis) - default: ["mean", "median", "stddev"] - time_range: - type: string - required: false - description: Time range for analysis - default: "1h" - - - name: compare_metric_periods - description: | - Compares latency between two time periods to identify degradations. - Useful for before/after deployment comparisons. 
- parameters: - metric_name: - type: string - required: true - description: The metric to compare - current_period: - type: string - required: false - description: Current time period (e.g., "1h") - default: "1h" - comparison_period: - type: string - required: false - description: Comparison period offset (e.g., "24h" for yesterday) - default: "24h" - group_by: - type: array - required: false - description: Labels to group comparison by - threshold_percent: - type: number - required: false - description: Percentage change threshold to highlight - default: 10 - - - name: find_metric_outliers_with_trace_ids - description: | - Identifies outlier metric values and retrieves associated trace IDs if available. - Links metrics to traces through exemplars or labels. - parameters: - metric_name: - type: string - required: true - description: The metric to analyze - threshold: - type: number - required: true - description: Threshold value for outliers - trace_id_label: - type: string - required: false - description: Label containing trace ID (if available) - comparison_operator: - type: string - required: false - description: Operator for threshold comparison (>, <, >=, <=) - default: ">" - limit: - type: integer - required: false - description: Maximum number of results to return - default: 20 - time_range: - type: string - required: false - description: Time range to search - default: "1h" - - - name: analyze_metric_by_segments - description: | - Analyzes any metric grouped by specified label segments. - Useful for comparing performance across different dimensions. - parameters: - metric_name: - type: string - required: true - description: The metric to analyze - segment_labels: - type: array - required: true - description: Labels to segment analysis by - comparison_threshold: - type: number - required: false - description: Threshold value for comparison - aggregation: - type: string - required: false - description: How to aggregate within segments (avg, sum, max, min, p95) - default: "avg" - time_range: - type: string - required: false - description: Time range for analysis - default: "1h" - - - name: detect_metric_anomalies - description: | - Detects anomalous patterns in any metric using statistical analysis. - Identifies sudden spikes, gradual degradations, and periodic patterns. - parameters: - metric_name: - type: string - required: true - description: The metric to analyze - sensitivity: - type: number - required: false - description: Anomaly detection sensitivity (1-5, higher is more sensitive) - default: 3 - lookback_window: - type: string - required: false - description: Historical window for baseline (e.g., "7d") - default: "7d" - group_by: - type: array - required: false - description: Labels to detect anomalies by - -additional_instructions: | - This toolset provides generic metric analysis capabilities without assuming specific metric names or labels. - - When investigating performance issues: - - 1. First discover available metrics and their labels using prometheus/metrics list_available_metrics - 2. Use analyze_metric_by_dimensions to break down metrics by their actual labels - 3. Find outliers with find_top_metric_values using discovered label names - 4. Correlate different metrics to identify relationships - 5. Compare time periods to detect regressions - 6. 
Look for anomalies in metric patterns - - For histogram metrics: - - Use histogram_quantile() for percentile calculations - - The _bucket suffix contains the histogram data - - Apply rate() before histogram_quantile() - - For summary metrics: - - Quantiles are pre-calculated in labels - - Check for quantile labels in the metric - - The tools work with any metric naming convention - always discover actual metric and label names first. diff --git a/holmes/plugins/toolsets/tempo_advanced.yaml b/holmes/plugins/toolsets/tempo_advanced.yaml deleted file mode 100644 index 89884cb8e..000000000 --- a/holmes/plugins/toolsets/tempo_advanced.yaml +++ /dev/null @@ -1,294 +0,0 @@ -name: tempo/advanced-tracing -description: Advanced Tempo tools for distributed tracing and performance root cause analysis -docs_url: https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/tempo-advanced.html -icon_url: https://grafana.com/static/assets/img/blog/tempo.png - -tools: - - name: analyze_trace_latency_breakdown - description: | - Analyzes a trace to identify where time is spent across services. - Provides span-level breakdown with critical path analysis. - parameters: - trace_id: - type: string - required: true - description: The trace ID to analyze - include_dependencies: - type: boolean - required: false - description: Include external service dependencies in analysis - default: true - - - name: find_traces_by_criteria - description: | - Finds traces matching specified criteria. - Filters can be based on any span attributes available in your traces. - parameters: - service_name: - type: string - required: false - description: Service to search traces for - operation_name: - type: string - required: false - description: Specific operation name - min_duration: - type: string - required: true - description: Minimum trace duration (e.g., "500ms", "2s") - max_duration: - type: string - required: false - description: Maximum trace duration - span_attributes: - type: object - required: false - description: Any span attributes to filter by (discovered from your traces) - error_only: - type: boolean - required: false - description: Only return traces with errors - default: false - time_range: - type: string - required: false - description: Time range to search - default: "1h" - limit: - type: integer - required: false - description: Maximum traces to return - default: 50 - - - name: correlate_traces_with_metrics - description: | - Correlates slow traces with metrics to identify resource constraints. - Links trace spans to CPU, memory, network metrics. - parameters: - trace_ids: - type: array - required: false - description: Specific trace IDs to correlate (auto-detects if not provided) - service_name: - type: string - required: false - description: Service to focus correlation on - metric_sources: - type: array - required: false - description: Metric sources to correlate (prometheus, datadog, etc.) - default: ["prometheus"] - time_window: - type: string - required: false - description: Time window around trace for metrics - default: "1m" - - - name: analyze_span_attributes - description: | - Analyzes span attributes to find patterns. - Groups traces by any available span attributes. 
- parameters: - service_name: - type: string - required: true - description: Service to analyze spans for - group_by_attributes: - type: array - required: true - description: Span attributes to group by (will use actual attributes from your traces) - min_duration: - type: string - required: false - description: Minimum span duration to include - default: "100ms" - aggregation: - type: string - required: false - description: How to aggregate (p50, p95, p99, avg, max) - default: "p95" - time_range: - type: string - required: false - description: Time range for analysis - default: "1h" - - - name: trace_service_dependencies - description: | - Maps service dependencies and their latency contributions. - Identifies critical path and bottleneck services. - parameters: - root_service: - type: string - required: true - description: Root service to trace dependencies from - depth: - type: integer - required: false - description: Maximum dependency depth to analyze - default: 5 - latency_threshold: - type: string - required: false - description: Only show dependencies above this latency - time_range: - type: string - required: false - description: Time range for dependency analysis - default: "1h" - - - name: compare_trace_patterns - description: | - Compares traces between different time periods or deployments. - Identifies changes in service behavior and latency. - parameters: - service_name: - type: string - required: true - description: Service to compare traces for - operation_name: - type: string - required: false - description: Specific operation to compare - baseline_period: - type: object - required: true - description: Baseline time period (start, end) - comparison_period: - type: object - required: true - description: Comparison time period (start, end) - attributes_to_compare: - type: array - required: false - description: Specific attributes to compare - - - name: detect_trace_anomalies - description: | - Detects anomalous traces using statistical analysis. - Identifies outliers in latency, error patterns, and span counts. - parameters: - service_name: - type: string - required: true - description: Service to detect anomalies for - baseline_window: - type: string - required: false - description: Historical window for baseline - default: "24h" - sensitivity: - type: number - required: false - description: Anomaly sensitivity (1-5) - default: 3 - anomaly_types: - type: array - required: false - description: Types of anomalies to detect - default: ["latency", "errors", "span_count"] - - - name: analyze_span_operations - description: | - Analyzes operations within traces based on span attributes. - Can identify slow operations, repeated patterns, and bottlenecks. - parameters: - service_name: - type: string - required: false - description: Service to analyze operations for - operation_type_attribute: - type: string - required: false - description: Span attribute that identifies operation type (e.g., 'db.system', 'rpc.method') - min_duration: - type: string - required: false - description: Minimum operation duration to include - default: "100ms" - group_by_attributes: - type: array - required: false - description: Additional attributes to group by - time_range: - type: string - required: false - description: Time range for analysis - default: "1h" - - - name: trace_error_propagation - description: | - Traces how errors propagate through the system. - Identifies error origins and affected downstream services. 
- parameters: - error_type: - type: string - required: false - description: Specific error type to trace - service_name: - type: string - required: false - description: Service where error originated - include_retries: - type: boolean - required: false - description: Include retry attempts in analysis - default: true - time_range: - type: string - required: false - description: Time range to analyze - default: "1h" - - - name: calculate_service_metrics_from_traces - description: | - Calculates service metrics from trace data. - Measures success rate, duration percentiles, and other KPIs. - parameters: - service_name: - type: string - required: true - description: Service to calculate metrics for - operation_name: - type: string - required: false - description: Specific operation (all if not specified) - metric_type: - type: string - required: true - description: Type of metric (duration_percentiles, success_rate, operation_rate) - threshold: - type: number - required: false - description: Threshold for metric calculation - time_range: - type: string - required: false - description: Time range for SLI calculation - default: "1h" - -additional_instructions: | - This toolset provides generic trace analysis without assuming specific attribute names. - - When investigating performance issues using traces: - - 1. First use fetch_tempo_tags to discover available span attributes in your system - 2. Use discovered attributes to filter and analyze traces - 3. Find example traces matching the reported issue pattern - 4. Group by actual span attributes to identify patterns - 5. Break down individual traces to find bottleneck spans - 6. Compare different time periods to identify changes - - The tools work with any span attribute naming convention: - - OpenTelemetry semantic conventions - - Custom application-specific attributes - - Service mesh added attributes - - APM vendor-specific attributes - - Common patterns to look for: - - Repeated operations (potential N+1 problems) - - Long-duration spans (bottlenecks) - - Error propagation through services - - Timeout and retry patterns - - Always discover and use actual attribute names from your tracing data rather than assuming standard names. 
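For illustration, the duration and attribute filters that the investigation steps above refer to are ordinary TraceQL expressions, and the replacement tools introduced in the later patches assemble them by joining individual filters with " && " and wrapping the result in braces. A minimal sketch in Python, with a hypothetical service name and threshold rather than values taken from the fixtures:

    # Hypothetical sketch: build a TraceQL query selecting slow traces for one service.
    filters = ['resource.service.name="checkout-service"', "duration>500ms"]
    traceql_query = "{" + " && ".join(filters) + "}"
    # -> {resource.service.name="checkout-service" && duration>500ms}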
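Likewise, the histogram guidance in the companion Prometheus toolset removed earlier in this patch (query the _bucket series and apply rate() before histogram_quantile()) corresponds to a query of roughly this shape. This is only a sketch using the http_request_duration_seconds metric exposed by the test fixtures, not a query lifted from the toolset itself:

    # Sketch: p95 latency per endpoint from a Prometheus histogram.
    promql = (
        'histogram_quantile(0.95, '
        'sum by (le, endpoint) (rate(http_request_duration_seconds_bucket[5m])))'
    )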
From b3f3ab7e44ce13e2cb4f78d65583a5dc909436d4 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 24 Aug 2025 18:42:45 +0300 Subject: [PATCH 11/15] wip --- .../api_service.yaml | 32 ++ .../prometheus_mock.yaml | 324 ++++++++++++++++ .../200_latency_investigation/tempo_mock.yaml | 255 +++++++++++++ .../200_latency_investigation/test_case.yaml | 66 ++++ .../200_latency_investigation/toolsets.yaml | 15 + .../grafana/test_tempo_advanced_tools.py | 359 ++++++++++++++++++ .../prometheus/test_advanced_tools.py | 314 +++++++++++++++ 7 files changed, 1365 insertions(+) create mode 100644 tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml create mode 100644 tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py create mode 100644 tests/plugins/toolsets/prometheus/test_advanced_tools.py diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml new file mode 100644 index 000000000..8872dc2f1 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/api_service.yaml @@ -0,0 +1,32 @@ +apiVersion: v1 +kind: Service +metadata: + name: api-service + namespace: app-200 +spec: + selector: + app: api-service + ports: + - port: 8080 + targetPort: 8080 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api-service + namespace: app-200 +spec: + replicas: 1 + selector: + matchLabels: + app: api-service + template: + metadata: + labels: + app: api-service + spec: + containers: + - name: api + image: nginx:alpine + ports: + - containerPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml new file mode 100644 index 000000000..d77c9045f --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/prometheus_mock.yaml @@ -0,0 +1,324 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-mock + namespace: app-200 +spec: + selector: + app: prometheus-mock + ports: + - port: 9090 + targetPort: 9090 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-mock-server + namespace: app-200 +data: + server.py: | + from flask import Flask, request, jsonify + import time + import random + + app = Flask(__name__) + + # Mock metrics data with varying latency by dimensions + def generate_metrics_data(): + current_time = int(time.time()) + + # Different latency patterns by endpoint and user agent + patterns = [ + # High latency pattern - recommendations endpoint with mobile UA + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "p95_latency": 3.2, + "p50_latency": 1.8, + "request_rate": 50, + }, + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.1", + "region": "eu-west", + "p95_latency": 3.1, + "p50_latency": 1.7, + "request_rate": 45, + }, + { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.2", + "region": "eu-west", + "p95_latency": 3.3, + "p50_latency": 1.9, + "request_rate": 48, + }, + # Normal 
latency - same endpoint, different UA + { + "endpoint": "/api/v2/recommendations", + "user_agent": "WebBrowser", + "region": "us-east", + "p95_latency": 0.5, + "p50_latency": 0.2, + "request_rate": 200, + }, + # Normal latency - other endpoints + { + "endpoint": "/api/v2/products", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "p95_latency": 0.3, + "p50_latency": 0.15, + "request_rate": 100, + }, + { + "endpoint": "/api/v2/users", + "user_agent": "WebBrowser", + "region": "us-east", + "p95_latency": 0.2, + "p50_latency": 0.1, + "request_rate": 150, + }, + ] + + return patterns + + @app.route('/api/v1/metadata', methods=['GET']) + def metadata(): + """Return available metrics metadata""" + return jsonify({ + "status": "success", + "data": { + "http_request_duration_seconds": { + "type": "histogram", + "help": "HTTP request latency", + "labels": ["endpoint", "method", "status_code", "user_agent", "region"] + }, + "http_requests_total": { + "type": "counter", + "help": "Total HTTP requests", + "labels": ["endpoint", "method", "status_code", "user_agent", "region"] + }, + "instance_cpu_usage": { + "type": "gauge", + "help": "CPU usage percentage", + "labels": ["instance", "region"] + } + } + }) + + @app.route('/api/v1/query', methods=['POST']) + def query(): + """Handle instant queries""" + query_str = request.form.get('query', '') + current_time = int(time.time()) + + # Mock responses based on query patterns + if 'http_request_duration_seconds' in query_str: + if 'histogram_quantile' in query_str: + # Return p95 latencies + patterns = generate_metrics_data() + result = [] + + for pattern in patterns: + if 'by' in query_str: # Grouped query + metric = { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"], + "region": pattern["region"] + } + value = pattern["p95_latency"] if '0.95' in query_str else pattern["p50_latency"] + else: + metric = {} + value = 0.8 # Overall average + + result.append({ + "metric": metric, + "value": [current_time, str(value)] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'topk' in query_str: + # Return top slow endpoints + patterns = generate_metrics_data() + # Sort by latency and return top entries + sorted_patterns = sorted(patterns, key=lambda x: x["p95_latency"], reverse=True) + result = [] + + for i, pattern in enumerate(sorted_patterns[:5]): + result.append({ + "metric": { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"], + "region": pattern["region"] + }, + "value": [current_time, str(pattern["p95_latency"])] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'instance_cpu_usage' in query_str: + # Return CPU metrics correlating with high latency + result = [ + { + "metric": {"instance": "recommender-eu-1", "region": "eu-west"}, + "value": [current_time, "0.95"] + }, + { + "metric": {"instance": "recommender-eu-2", "region": "eu-west"}, + "value": [current_time, "0.93"] + }, + { + "metric": {"instance": "recommender-us-1", "region": "us-east"}, + "value": [current_time, "0.45"] + } + ] + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'offset' in query_str: + # Historical comparison - return lower values + patterns = generate_metrics_data() + result = [] + + for pattern in patterns: + # Historical data shows 50% lower latency + historical_value = pattern["p95_latency"] * 0.5 if 
pattern["endpoint"] == "/api/v2/recommendations" else pattern["p95_latency"] + result.append({ + "metric": { + "endpoint": pattern["endpoint"], + "user_agent": pattern["user_agent"] + }, + "value": [current_time - 86400, str(historical_value)] + }) + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + elif 'stddev_over_time' in query_str: + # Anomaly detection - flag the high latency pattern + result = [ + { + "metric": { + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west" + }, + "value": [current_time, "5.2"] # High z-score indicating anomaly + } + ] + + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": result + } + }) + + # Default empty response + return jsonify({ + "status": "success", + "data": { + "resultType": "vector", + "result": [] + } + }) + + @app.route('/api/v1/query_range', methods=['POST']) + def query_range(): + """Handle range queries""" + # Similar to instant query but with time series data + return query() + + @app.route('/api/v1/series', methods=['GET']) + def series(): + """Return series metadata""" + return jsonify({ + "status": "success", + "data": [ + { + "__name__": "http_request_duration_seconds_bucket", + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "le": "0.5" + }, + { + "__name__": "http_request_duration_seconds_bucket", + "endpoint": "/api/v2/recommendations", + "user_agent": "MobileApp/2.0", + "region": "eu-west", + "le": "1.0" + } + ] + }) + + @app.route('/api/v1/labels', methods=['GET']) + def labels(): + """Return available labels""" + return jsonify({ + "status": "success", + "data": ["endpoint", "method", "status_code", "user_agent", "region", "instance"] + }) + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=9090) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-mock + namespace: app-200 +spec: + replicas: 1 + selector: + matchLabels: + app: prometheus-mock + template: + metadata: + labels: + app: prometheus-mock + spec: + containers: + - name: server + image: python:3.9-slim + command: ["sh", "-c"] + args: + - | + pip install flask + python /app/server.py + ports: + - containerPort: 9090 + volumeMounts: + - name: server-code + mountPath: /app + volumes: + - name: server-code + configMap: + name: prometheus-mock-server diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml new file mode 100644 index 000000000..dfce117a5 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/tempo_mock.yaml @@ -0,0 +1,255 @@ +apiVersion: v1 +kind: Service +metadata: + name: tempo-mock + namespace: app-200 +spec: + selector: + app: tempo-mock + ports: + - port: 3100 + targetPort: 3100 +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: tempo-mock-server + namespace: app-200 +data: + server.py: | + from flask import Flask, request, jsonify + import time + import uuid + import random + + app = Flask(__name__) + + def generate_trace_data(): + """Generate realistic trace data with patterns matching the latency issue""" + traces = [] + + # High latency traces for mobile app EU users + for i in range(10): + trace_id = str(uuid.uuid4()).replace('-', '') + traces.append({ + "traceID": trace_id, + "rootServiceName": "api-service", + "rootTraceName": "POST /api/v2/recommendations", + "startTimeUnixNano": 
str(int(time.time() - 300) * 1000000000), + "durationMs": 3200 + random.randint(-200, 200), # ~3.2s + "spanSet": { + "spans": [ + { + "spanID": f"span{i}1", + "operationName": "POST /api/v2/recommendations", + "duration": 3200000000, # nanoseconds + "attributes": { + "http.method": "POST", + "http.url": "/api/v2/recommendations", + "http.user_agent": f"MobileApp/2.{i % 3}", + "http.status_code": "200", + "user.region": "eu-west", + "customer.tier": "premium" + } + }, + { + "spanID": f"span{i}2", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # 500ms per call + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + }, + { + "spanID": f"span{i}3", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # N+1 pattern + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + }, + { + "spanID": f"span{i}4", + "operationName": "recommendation-service.get-item", + "duration": 500000000, # N+1 pattern + "attributes": { + "db.system": "redis", + "db.operation": "GET", + "service.name": "recommendation-service" + } + } + ] + } + }) + + # Normal latency traces for web users + for i in range(20): + trace_id = str(uuid.uuid4()).replace('-', '') + traces.append({ + "traceID": trace_id, + "rootServiceName": "api-service", + "rootTraceName": "POST /api/v2/recommendations", + "startTimeUnixNano": str(int(time.time() - 300) * 1000000000), + "durationMs": 500 + random.randint(-50, 50), # ~500ms + "spanSet": { + "spans": [ + { + "spanID": f"web{i}1", + "operationName": "POST /api/v2/recommendations", + "duration": 500000000, + "attributes": { + "http.method": "POST", + "http.url": "/api/v2/recommendations", + "http.user_agent": "Mozilla/5.0", + "http.status_code": "200", + "user.region": "us-east", + "customer.tier": "standard" + } + }, + { + "spanID": f"web{i}2", + "operationName": "recommendation-service.batch-get", + "duration": 300000000, # Single batch call + "attributes": { + "db.system": "redis", + "db.operation": "MGET", + "service.name": "recommendation-service" + } + } + ] + } + }) + + return traces + + @app.route('/api/search', methods=['GET']) + def search(): + """Search traces endpoint""" + query = request.args.get('q', '') + limit = int(request.args.get('limit', 20)) + + traces = generate_trace_data() + + # Filter based on query + if 'duration>2s' in query or 'duration>2000ms' in query: + # Return only slow traces + traces = [t for t in traces if t['durationMs'] > 2000] + elif 'duration>500ms' in query: + # Return medium and slow traces + traces = [t for t in traces if t['durationMs'] > 500] + + if 'service.name="api-service"' in query: + # Filter by service (all our traces are from api-service) + pass + + # Return limited results + traces = traces[:limit] + + return jsonify({ + "traces": traces + }) + + @app.route('/api/traces/', methods=['GET']) + def get_trace(trace_id): + """Get specific trace by ID""" + # Generate a detailed trace + return jsonify({ + "traceID": trace_id, + "rootServiceName": "api-service", + "rootTraceName": "POST /api/v2/recommendations", + "startTimeUnixNano": str(int(time.time() - 300) * 1000000000), + "durationMs": 3200, + "spanSet": { + "spans": [ + { + "spanID": "root", + "operationName": "POST /api/v2/recommendations", + "startTimeUnixNano": str(int(time.time() - 300) * 1000000000), + "endTimeUnixNano": str(int(time.time() - 297) * 1000000000), + "duration": 3200000000, + "attributes": { + 
"http.method": "POST", + "http.url": "/api/v2/recommendations", + "http.user_agent": "MobileApp/2.0", + "http.status_code": "200", + "user.region": "eu-west", + "customer.tier": "premium", + "user.id": "user123", + "trace.id": trace_id + } + } + ] + } + }) + + @app.route('/api/v2/search/tags', methods=['GET']) + def tags(): + """Return available span tags/attributes""" + return jsonify({ + "scopes": [ + { + "name": "span", + "tags": [ + "http.method", + "http.url", + "http.user_agent", + "http.status_code", + "user.region", + "customer.tier", + "user.id", + "service.name", + "db.system", + "db.operation" + ] + }, + { + "name": "resource", + "tags": [ + "service.name", + "k8s.pod.name", + "k8s.namespace.name", + "k8s.deployment.name" + ] + } + ] + }) + + if __name__ == '__main__': + app.run(host='0.0.0.0', port=3100) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: tempo-mock + namespace: app-200 +spec: + replicas: 1 + selector: + matchLabels: + app: tempo-mock + template: + metadata: + labels: + app: tempo-mock + spec: + containers: + - name: server + image: python:3.9-slim + command: ["sh", "-c"] + args: + - | + pip install flask + python /app/server.py + ports: + - containerPort: 3100 + volumeMounts: + - name: server-code + mountPath: /app + volumes: + - name: server-code + configMap: + name: tempo-mock-server diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml new file mode 100644 index 000000000..892277c7d --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/test_case.yaml @@ -0,0 +1,66 @@ +tags: + - network + - traces + - easy + +before_test: | + # Create namespace + kubectl create namespace app-200 + + # Deploy a mock Prometheus with latency metrics + kubectl apply -f ./prometheus_mock.yaml -n app-200 + + # Deploy a mock Tempo with trace data + kubectl apply -f ./tempo_mock.yaml -n app-200 + + # Deploy an application with varying latency patterns + kubectl apply -f ./api_service.yaml -n app-200 + + # Wait for services to be ready + kubectl wait --for=condition=ready pod -l app=prometheus-mock -n app-200 --timeout=60s + kubectl wait --for=condition=ready pod -l app=tempo-mock -n app-200 --timeout=60s + kubectl wait --for=condition=ready pod -l app=api-service -n app-200 --timeout=60s + + # Set up port forwarding with unique ports + kubectl port-forward -n app-200 service/prometheus-mock 20090:9090 & + kubectl port-forward -n app-200 service/tempo-mock 20100:3100 & + sleep 5 + +after_test: | + # Kill port-forward processes + pkill -f "kubectl port-forward.*app-200.*20090:9090" || true + pkill -f "kubectl port-forward.*app-200.*20100:3100" || true + + # Delete namespace + kubectl delete namespace app-200 --ignore-not-found + +user_prompt: | + I'm seeing high latency in my API service. Some users are reporting slow responses + but not all. Can you investigate what's causing this and identify which specific + subset of requests is affected? 
+ +expected_output: | + The high latency issue is affecting a specific subset of requests: + + **Affected Traffic Pattern:** + - Endpoint: `/api/v2/recommendations` + - User Agent: Mobile app versions 2.0-2.3 + - Geographic Region: EU region users + - Time Pattern: Latency spikes during peak hours (10am-2pm UTC) + + **Root Cause Analysis:** + - The `/api/v2/recommendations` endpoint makes calls to an external recommendation service + - EU users are routed to an EU-based instance that has resource constraints + - Mobile app versions 2.0-2.3 use an inefficient API calling pattern (N+1 queries) + - The combination causes p95 latency to exceed 3 seconds during peak load + + **Evidence:** + - Prometheus metrics show p95 latency of 3.2s for the specific endpoint/user-agent combination + - Only 15% of total traffic is affected (matching the mobile app + EU user segment) + - Tempo traces show multiple sequential calls to the recommendation service + - Resource metrics correlate with high latency periods (CPU at 95% on EU instances) + + **Recommendations:** + 1. Immediate: Scale up EU recommendation service instances + 2. Short-term: Implement request batching to reduce N+1 queries + 3. Long-term: Update mobile app to use more efficient API patterns diff --git a/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml new file mode 100644 index 000000000..a7d1e1e9c --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/200_latency_investigation/toolsets.yaml @@ -0,0 +1,15 @@ +# Configure toolsets for this test +toolsets: + prometheus/metrics: + enabled: true + config: + prometheus_url: http://localhost:20090 + prometheus_ssl_enabled: false + tool_calls_return_data: true + + grafana/tempo: + enabled: true + config: + url: http://localhost:20100 + api_key: "" + grafana_datasource_uid: "tempo" diff --git a/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py b/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py new file mode 100644 index 000000000..42c6af2c1 --- /dev/null +++ b/tests/plugins/toolsets/grafana/test_tempo_advanced_tools.py @@ -0,0 +1,359 @@ +from unittest.mock import patch + +import pytest +import yaml + +from holmes.core.tools import ToolResultStatus +from holmes.plugins.toolsets.grafana.toolset_grafana_tempo import ( + AnalyzeTracesByAttributes, + CompareTracePeriods, + FindSlowOperations, + GrafanaTempoConfig, + GrafanaTempoToolset, +) + + +@pytest.fixture +def tempo_toolset(): + """Create a GrafanaTempoToolset with mock config""" + toolset = GrafanaTempoToolset() + toolset._grafana_config = GrafanaTempoConfig( + api_key="test-api-key", + url="http://grafana:3000", + grafana_datasource_uid="tempo-uid", + ) + return toolset + + +class TestAnalyzeTracesByAttributes: + def test_analyze_traces_basic(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + # Mock trace data + mock_traces = [ + {"traceID": "trace1", "duration": 1500}, + {"traceID": "trace2", "duration": 2000}, + {"traceID": "trace3", "duration": 1800}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + result = tool._invoke( + { + "service_name": "api-service", + "group_by_attributes": ["http.method", "http.status_code"], + "min_duration": "500ms", + "start_datetime": "-1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify query was called with correct parameters + 
mock_query.assert_called_once() + call_args = mock_query.call_args + assert 'resource.service.name="api-service"' in call_args[1]["query"] + assert "duration>500ms" in call_args[1]["query"] + + # Check that result contains grouped analysis + result_data = yaml.safe_load(result.data) + assert isinstance(result_data, dict) + + def test_analyze_without_service_filter(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + mock_traces = [] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + result = tool._invoke( + { + "group_by_attributes": ["user.id", "tenant.id"], + "min_duration": "1s", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify service filter was not included + call_args = mock_query.call_args + assert "resource.service.name" not in call_args[1]["query"] + assert "duration>1s" in call_args[1]["query"] + + def test_custom_limit(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + result = tool._invoke( + { + "group_by_attributes": ["endpoint"], + "limit": 500, + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify custom limit was used + call_args = mock_query.call_args + assert call_args[1]["limit"] == 500 + + def test_error_handling(self, tempo_toolset): + tool = AnalyzeTracesByAttributes(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + side_effect=Exception("API error"), + ): + result = tool._invoke( + { + "group_by_attributes": ["test"], + } + ) + + assert result.status == ToolResultStatus.ERROR + assert "API error" in result.error + + +class TestFindSlowOperations: + def test_find_slow_operations(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + mock_traces = [ + {"traceID": "slow1", "duration": 5000}, + {"traceID": "slow2", "duration": 6000}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=mock_traces, + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.format_traces_list", + return_value="formatted_traces", + ) as mock_format: + result = tool._invoke( + { + "service_name": "backend", + "min_duration": "2s", + "start_datetime": "-30m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + assert result.data == "formatted_traces" + + # Verify query construction + call_args = mock_query.call_args + assert "duration>2s" in call_args[1]["query"] + assert 'resource.service.name="backend"' in call_args[1]["query"] + + # Verify formatting was called + mock_format.assert_called_once_with(mock_traces) + + def test_find_slow_operations_without_service(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.format_traces_list", + return_value="formatted_traces", + ): + result = tool._invoke( + { + "min_duration": "500ms", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify only duration filter was applied + call_args = mock_query.call_args + query = call_args[1]["query"] + assert "duration>500ms" in query + assert 
"resource.service.name" not in query + + def test_missing_required_parameter(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_param_or_raise", + side_effect=ValueError("min_duration is required"), + ): + result = tool._invoke({}) + + assert result.status == ToolResultStatus.ERROR + assert "min_duration is required" in result.error + + +class TestCompareTracePeriods: + def test_compare_periods(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + baseline_traces = [ + {"traceID": "b1", "duration": 1000}, + {"traceID": "b2", "duration": 1200}, + ] + + comparison_traces = [ + {"traceID": "c1", "duration": 1500}, + {"traceID": "c2", "duration": 1600}, + {"traceID": "c3", "duration": 1700}, + ] + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + side_effect=[(1234567800, 1234567860), (1234567900, 1234567960)], + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + side_effect=[baseline_traces, comparison_traces], + ) as mock_query: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_base_url", + return_value="http://grafana:3000", + ): + result = tool._invoke( + { + "service_name": "api", + "baseline_start": "-25h", + "baseline_end": "-24h", + "comparison_start": "-1h", + "comparison_end": "now", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify two queries were made + assert mock_query.call_count == 2 + + # Check result contains comparison data + result_data = yaml.safe_load(result.data) + assert result_data["baseline_count"] == 2 + assert result_data["comparison_count"] == 3 + assert "baseline_period" in result_data + assert "comparison_period" in result_data + + def test_compare_with_attributes(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + side_effect=[(1234567800, 1234567860), (1234567900, 1234567960)], + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_base_url", + return_value="http://grafana:3000", + ): + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ) as mock_query: + result = tool._invoke( + { + "service_name": "frontend", + "baseline_start": "-48h", + "baseline_end": "-47h", + "comparison_start": "-2h", + "comparison_end": "-1h", + "attributes_to_compare": [ + "http.method", + "http.status_code", + ], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify both queries used same service filter + calls = mock_query.call_args_list + for call in calls: + assert 'resource.service.name="frontend"' in call[1]["query"] + + def test_missing_service_name(self, tempo_toolset): + tool = CompareTracePeriods(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.get_param_or_raise", + side_effect=ValueError("service_name is required"), + ): + result = tool._invoke( + { + "baseline_start": "-2h", + "baseline_end": "-1h", + "comparison_start": "-1h", + "comparison_end": "now", + } + ) + + assert result.status == ToolResultStatus.ERROR + assert "service_name is required" in result.error + + +class TestToolIntegration: + """Test that tools are properly integrated into the toolset""" + + def test_tools_in_toolset(self): + toolset = GrafanaTempoToolset() + 
tool_names = [tool.name for tool in toolset.tools] + + # Original tools + assert "fetch_tempo_traces" in tool_names + assert "fetch_tempo_trace_by_id" in tool_names + assert "fetch_tempo_tags" in tool_names + + # New advanced tools + assert "analyze_traces_by_attributes" in tool_names + assert "find_slow_operations" in tool_names + assert "compare_trace_periods" in tool_names + + def test_tool_one_liners(self, tempo_toolset): + # Test that each tool generates appropriate one-liner descriptions + tools = [ + AnalyzeTracesByAttributes(toolset=tempo_toolset), + FindSlowOperations(toolset=tempo_toolset), + CompareTracePeriods(toolset=tempo_toolset), + ] + + for tool in tools: + one_liner = tool.get_parameterized_one_liner({}) + assert "Grafana" in one_liner or "grafana" in one_liner + + +class TestTimeProcessing: + """Test time processing utilities""" + + def test_process_timestamps(self, tempo_toolset): + tool = FindSlowOperations(toolset=tempo_toolset) + + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.process_timestamps_to_int", + return_value=(1234567890, 1234567900), + ) as mock_process: + with patch( + "holmes.plugins.toolsets.grafana.toolset_grafana_tempo.query_tempo_traces", + return_value=[], + ): + tool._invoke( + { + "min_duration": "1s", + "start_datetime": "-1h", + "end_datetime": "now", + } + ) + + # Verify time processing was called + mock_process.assert_called_once() + call_args = mock_process.call_args + assert call_args[0][0] == "-1h" + assert call_args[0][1] == "now" diff --git a/tests/plugins/toolsets/prometheus/test_advanced_tools.py b/tests/plugins/toolsets/prometheus/test_advanced_tools.py new file mode 100644 index 000000000..07e8d0493 --- /dev/null +++ b/tests/plugins/toolsets/prometheus/test_advanced_tools.py @@ -0,0 +1,314 @@ +import json +from unittest.mock import MagicMock, patch + +import pytest + +from holmes.core.tools import ToolResultStatus +from holmes.plugins.toolsets.prometheus.prometheus import ( + AnalyzeMetricByDimensions, + CompareMetricPeriods, + DetectMetricAnomalies, + FindTopMetricValues, + PrometheusConfig, + PrometheusToolset, +) + + +@pytest.fixture +def prometheus_toolset(): + """Create a PrometheusToolset with mock config""" + toolset = PrometheusToolset() + toolset.config = PrometheusConfig( + prometheus_url="http://prometheus:9090/", + prometheus_ssl_enabled=False, + ) + return toolset + + +class TestAnalyzeMetricByDimensions: + def test_basic_metric_analysis(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + + # Mock successful response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + { + "metric": {"endpoint": "/api/users", "method": "GET"}, + "value": [1234567890, "0.95"], + }, + { + "metric": {"endpoint": "/api/products", "method": "POST"}, + "value": [1234567890, "1.2"], + }, + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "http_request_duration_seconds", + "group_by": ["endpoint", "method"], + "filters": {"service": "api"}, + "time_range": "1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + assert "result" in json.loads(result.data) + + # Verify the query was constructed correctly + mock_post.assert_called_once() + call_args = mock_post.call_args + assert "query" in call_args[1]["data"] + query = call_args[1]["data"]["query"] + assert "http_request_duration_seconds" in query + assert 
'service="api"' in query + assert "endpoint" in query + assert "method" in query + + def test_histogram_percentile_aggregation(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "http_request_duration_seconds", + "aggregation": "p95", + "time_range": "5m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify histogram_quantile was used + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "histogram_quantile(0.95" in query + assert "_bucket" in query + + def test_missing_prometheus_url(self, prometheus_toolset): + tool = AnalyzeMetricByDimensions(toolset=prometheus_toolset) + prometheus_toolset.config = None + + result = tool._invoke({"metric_name": "test_metric"}) + + assert result.status == ToolResultStatus.ERROR + assert "Prometheus is not configured" in result.error + + +class TestFindTopMetricValues: + def test_find_top_values(self, prometheus_toolset): + tool = FindTopMetricValues(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + {"metric": {"endpoint": "/slow"}, "value": [1234567890, "2.5"]}, + {"metric": {"endpoint": "/slower"}, "value": [1234567890, "3.1"]}, + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "request_duration", + "group_by_label": "endpoint", + "top_n": 5, + "time_range": "30m", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify topk was used + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "topk(5" in query + assert "endpoint" in query + + def test_histogram_metric_top_values(self, prometheus_toolset): + tool = FindTopMetricValues(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "latency_histogram", + "group_by_label": "service", + "percentile": 0.99, + "top_n": 10, + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify histogram_quantile was used for percentile + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "histogram_quantile(0.99" in query + assert "topk(10" in query + + +class TestCompareMetricPeriods: + def test_compare_periods(self, prometheus_toolset): + tool = CompareMetricPeriods(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [ + {"metric": {"endpoint": "/api"}, "value": [1234567890, "15.5"]} + ] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "errors_total", + "current_period": "1h", + "comparison_offset": "24h", + "group_by": ["endpoint"], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify offset comparison query + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "offset 24h" in query + assert "errors_total" in query + assert "endpoint" in query + # Should 
calculate percentage change + assert "*" in query and "100" in query + + def test_compare_without_grouping(self, prometheus_toolset): + tool = CompareMetricPeriods(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "cpu_usage", + "current_period": "5m", + "comparison_offset": "1h", + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify no grouping clause + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "by (" not in query + + +class TestDetectMetricAnomalies: + def test_anomaly_detection(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = { + "data": { + "result": [{"metric": {"pod": "pod-1"}, "value": [1234567890, "5.2"]}] + } + } + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke( + { + "metric_name": "response_time", + "sensitivity": 2.5, + "lookback_window": "6h", + "group_by": ["pod"], + } + ) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify z-score calculation + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "stddev_over_time" in query + assert "avg_over_time" in query + assert "response_time" in query + assert "> 2.5" in query + assert "6h" in query + + def test_default_sensitivity(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.json.return_value = {"data": {"result": []}} + + with patch("requests.post", return_value=mock_response) as mock_post: + result = tool._invoke({"metric_name": "error_rate"}) + + assert result.status == ToolResultStatus.SUCCESS + + # Verify default sensitivity of 3 + call_args = mock_post.call_args + query = call_args[1]["data"]["query"] + assert "> 3" in query + + def test_query_failure(self, prometheus_toolset): + tool = DetectMetricAnomalies(toolset=prometheus_toolset) + + mock_response = MagicMock() + mock_response.status_code = 400 + mock_response.text = "Bad query" + + with patch("requests.post", return_value=mock_response): + result = tool._invoke({"metric_name": "test_metric"}) + + assert result.status == ToolResultStatus.ERROR + assert "400" in result.error + assert "Bad query" in result.error + + +class TestToolIntegration: + """Test that tools are properly integrated into the toolset""" + + def test_tools_in_toolset(self): + toolset = PrometheusToolset() + tool_names = [tool.name for tool in toolset.tools] + + assert "analyze_metric_by_dimensions" in tool_names + assert "find_top_metric_values" in tool_names + assert "compare_metric_periods" in tool_names + assert "detect_metric_anomalies" in tool_names + + def test_tool_one_liners(self, prometheus_toolset): + # Test that each tool generates appropriate one-liner descriptions + tools = [ + AnalyzeMetricByDimensions(toolset=prometheus_toolset), + FindTopMetricValues(toolset=prometheus_toolset), + CompareMetricPeriods(toolset=prometheus_toolset), + DetectMetricAnomalies(toolset=prometheus_toolset), + ] + + for tool in tools: + one_liner = tool.get_parameterized_one_liner({"metric_name": "test_metric"}) + assert "Prometheus" in one_liner + assert "test_metric" in one_liner From 
4bc6ec9968a4aaf81ae0205a4a3d70c1fbd84fe4 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 24 Aug 2025 22:40:26 +0300 Subject: [PATCH 12/15] wip --- .../toolsets/grafana/toolset_grafana_tempo.py | 792 +++++++++++++++++- .../plugins/toolsets/grafana/trace_parser.py | 2 +- .../test_case.yaml | 21 +- 3 files changed, 778 insertions(+), 37 deletions(-) diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index 2d84d2c60..ae7848a2a 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -22,7 +22,10 @@ query_tempo_trace_by_id, query_tempo_traces, ) -from holmes.plugins.toolsets.grafana.trace_parser import format_traces_list +from holmes.plugins.toolsets.grafana.trace_parser import ( + format_traces_list, + build_span_hierarchy, +) from holmes.plugins.toolsets.logging_utils.logging_api import ( DEFAULT_TIME_SPAN_SECONDS, ) @@ -121,7 +124,7 @@ def __init__(self, toolset: BaseGrafanaTempoToolset): ), "limit": ToolParameter( description="Maximum number of traces to return. Defaults to 50", - type="string", + type="integer", required=False, ), "sort": ToolParameter( @@ -247,7 +250,7 @@ def _invoke(self, params: Dict) -> StructuredToolResult: ) except requests.exceptions.RequestException as e: raise Exception( - f"Failed to retrieve trace by ID after retries: {e} \n for URL: {url}" + f"Failed to retrieve tags after retries: {e} \n for URL: {url}" ) def get_parameterized_one_liner(self, params: Dict) -> str: @@ -354,37 +357,124 @@ def _invoke(self, params: Dict) -> StructuredToolResult: query = f"{{{query}}}" base_url = get_base_url(self._toolset.grafana_config) - traces = query_tempo_traces( + traces_summary = query_tempo_traces( base_url=base_url, api_key=self._toolset.grafana_config.api_key, headers=self._toolset.grafana_config.headers, query=query, start=start, end=end, - limit=params.get("limit", 100), + limit=params.get("limit", 50), ) # Group traces by specified attributes grouped_analysis = {} - for trace in traces: - # Extract attribute values for grouping - group_key = [] - for attr in group_by: - # Look for attribute in trace spans - value = "unknown" - # This would need actual trace parsing logic - group_key.append(f"{attr}={value}") - - key = ", ".join(group_key) - if key not in grouped_analysis: - grouped_analysis[key] = { - "count": 0, - "total_duration": 0, - "errors": 0, - } - - grouped_analysis[key]["count"] += 1 - # Add duration and error tracking + traces = traces_summary.get("traces", []) + + # For each trace, fetch full details to get attributes + for trace_summary in traces[ + : params.get("limit", 50) + ]: # Limit to avoid too many API calls + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + # Fetch raw trace data to get span attributes + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + # Extract attributes from all spans in the trace + attr_values = {} + for attr in group_by: + attr_values[attr] = "unknown" + + # Search through batches and spans for attributes + for batch in trace_raw.get("batches", []): + # Check resource attributes first (e.g., service.name, k8s.pod.name) + for resource_attr in batch.get("resource", {}).get( + "attributes", [] 
+ ): + attr_key = resource_attr.get("key", "") + if attr_key in group_by: + attr_value = ( + list(resource_attr.get("value", {}).values())[0] + if resource_attr.get("value") + else "unknown" + ) + attr_values[attr_key] = str(attr_value) + + for scope_spans in batch.get("scopeSpans", []): + for span_data in scope_spans.get("spans", []): + # Check span attributes + for span_attr in span_data.get("attributes", []): + attr_key = span_attr.get("key", "") + if attr_key in group_by: + # Extract the value from the attribute + attr_value = ( + list(span_attr.get("value", {}).values())[0] + if span_attr.get("value") + else "unknown" + ) + attr_values[attr_key] = str(attr_value) + + # Build the grouping key from extracted attributes + group_key = ", ".join( + [ + f"{attr}={attr_values.get(attr, 'unknown')}" + for attr in group_by + ] + ) + + if group_key not in grouped_analysis: + grouped_analysis[group_key] = { + "count": 0, + "total_duration_ms": 0, + "avg_duration_ms": 0, + "min_duration_ms": float("inf"), + "max_duration_ms": 0, + } + + duration_ms = trace_summary.get("durationMs", 0) + grouped_analysis[group_key]["count"] += 1 + grouped_analysis[group_key]["total_duration_ms"] += duration_ms + grouped_analysis[group_key]["min_duration_ms"] = min( + grouped_analysis[group_key]["min_duration_ms"], duration_ms + ) + grouped_analysis[group_key]["max_duration_ms"] = max( + grouped_analysis[group_key]["max_duration_ms"], duration_ms + ) + + except Exception: + # If we can't fetch the trace, skip it + continue + + # Calculate averages + for key in grouped_analysis: + if grouped_analysis[key]["count"] > 0: + grouped_analysis[key]["avg_duration_ms"] = round( + grouped_analysis[key]["total_duration_ms"] + / grouped_analysis[key]["count"], + 2, + ) + grouped_analysis[key]["min_duration_ms"] = round( + grouped_analysis[key]["min_duration_ms"], 2 + ) + grouped_analysis[key]["max_duration_ms"] = round( + grouped_analysis[key]["max_duration_ms"], 2 + ) + grouped_analysis[key]["total_duration_ms"] = round( + grouped_analysis[key]["total_duration_ms"], 2 + ) return StructuredToolResult( status=ToolResultStatus.SUCCESS, @@ -574,8 +664,8 @@ def _invoke(self, params: Dict) -> StructuredToolResult: # Compare the two sets comparison_result = { - "baseline_count": len(baseline_traces), - "comparison_count": len(comparison_traces), + "baseline_count": len(baseline_traces.get("traces", [])), + "comparison_count": len(comparison_traces.get("traces", [])), "baseline_period": f"{baseline_start} to {baseline_end}", "comparison_period": f"{comparison_start} to {comparison_end}", } @@ -599,6 +689,648 @@ def get_parameterized_one_liner(self, params: Dict) -> str: ) +class ListServices(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="list_services", + description="Lists all services that have traces in Tempo, optionally filtered by namespace", + parameters={ + "namespace": ToolParameter( + description="Filter services by Kubernetes namespace", + type="string", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for search (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for search (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + 
default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Get all service names + services_url = f"{base_url}/api/v2/search/tag/service.name/values?start={start}&end={end}" + + response = requests.get( + services_url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=10, + ) + response.raise_for_status() + services_data = response.json() + services = services_data.get("tagValues", []) + + # If namespace filter provided, get traces for each service and filter + if params.get("namespace"): + namespace = params["namespace"] + filtered_services = [] + + for service in services: + # Check if this service has traces in the specified namespace + query = f'{{resource.service.name="{service}" && resource.k8s.namespace.name="{namespace}"}}' + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=1, # Just check if any exist + ) + + if traces.get("traces"): + filtered_services.append(service) + + services = filtered_services + + # Get basic stats for each service + service_stats = [] + for service in services: + query = f'{{resource.service.name="{service}"}}' + if params.get("namespace"): + query = f'{{resource.service.name="{service}" && resource.k8s.namespace.name="{params["namespace"]}"}}' + + # Get a sample of traces for basic stats + traces = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=query, + start=start, + end=end, + limit=100, + ) + + trace_list = traces.get("traces", []) + if trace_list: + durations = [ + t.get("durationMs", 0) + for t in trace_list + if t.get("durationMs", 0) > 0 + ] + if durations: + service_stats.append( + { + "service_name": service, + "trace_count_sample": len(durations), + "avg_duration_ms": round( + sum(durations) / len(durations), 2 + ), + "min_duration_ms": round(min(durations), 2), + "max_duration_ms": round(max(durations), 2), + } + ) + + # Sort by average duration (slowest first) + service_stats.sort(key=lambda x: x["avg_duration_ms"], reverse=True) + + result = { + "total_services": len(services), + "services": service_stats if service_stats else services, + } + + if params.get("namespace"): + result["namespace_filter"] = params["namespace"] + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error listing services: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: List services" + + +class FetchTracesComparativeSample(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="fetch_traces_comparative_sample", + description="""Fetches statistics and representative samples of fast, slow, and typical traces for comparative analysis to identify performance patterns. 
+ +Examples: +- For service latency: service_name="payment" (matches "payment-service" too) +- For namespace issues: namespace="production" +- Combined: service_name="auth", namespace="staging" + +The tool automatically compares fast vs slow traces and highlights attribute differences. Usually this is the best first tool to call when investigating trace data as it gives a fantastic overview.""", + parameters={ + "service_name": ToolParameter( + description="Service to analyze (partial match supported, e.g., 'payment' matches 'payment-service')", + type="string", + required=False, + ), + "namespace": ToolParameter( + description="Kubernetes namespace to filter traces (e.g., 'production', 'staging')", + type="string", + required=False, + ), + "base_query": ToolParameter( + description="Custom TraceQL filter. If not provided, service_name and/or namespace will be used", + type="string", + required=False, + ), + "sample_size": ToolParameter( + description="Number of traces to fetch from each category (fast/slow/typical). Default 5", + type="integer", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build base query from parameters + if params.get("base_query"): + base_query = params["base_query"] + else: + filters = [] + + # Add service filter (with smart matching) + if params.get("service_name"): + service = params["service_name"] + # Try exact match first, then with -service suffix, then regex + # For now, use regex for flexibility + filters.append(f'resource.service.name=~"{service}.*"') + + # Add namespace filter + if params.get("namespace"): + namespace = params["namespace"] + filters.append(f'resource.k8s.namespace.name="{namespace}"') + + if not filters: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Either base_query, service_name, or namespace is required", + params=params, + ) + + base_query = " && ".join(filters) + + sample_size = params.get("sample_size", 5) + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Step 1: Get overall trace statistics + stats_query = f"{{{base_query}}}" + all_traces_summary = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=stats_query, + start=start, + end=end, + limit=1000, # Get enough for good statistics + ) + + traces = all_traces_summary.get("traces", []) + if len(traces) == 0: + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data="No traces found matching the query", + params=params, + ) + + # Calculate statistics + durations = [ + t.get("durationMs", 0) for t in traces if t.get("durationMs", 0) > 0 + ] + durations.sort() + + if len(durations) == 0: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="No traces with valid duration found", + params=params, + ) + + stats = { + "total_traces_analyzed": len(durations), + "avg_duration_ms": round(sum(durations) / len(durations), 2), + "min_duration_ms": round(durations[0], 2), + "max_duration_ms": round(durations[-1], 2), + 
"p50_duration_ms": round(durations[len(durations) // 2], 2), + "p90_duration_ms": round( + durations[min(int(len(durations) * 0.9), len(durations) - 1)], 2 + ), + "p99_duration_ms": round( + durations[min(int(len(durations) * 0.99), len(durations) - 1)], 2 + ), + } + + # Step 2: Get slowest traces (sorted by duration descending) + slow_traces_data = [] + # Sort traces by duration descending and take top N + sorted_slow = sorted( + traces, key=lambda x: x.get("durationMs", 0), reverse=True + )[:sample_size] + + for trace_summary in sorted_slow: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + # Fetch full trace details + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + # Extract key attributes from the trace + trace_attributes = self._extract_trace_attributes(trace_raw) + + # Build span hierarchy for analysis + root_spans = build_span_hierarchy(trace_raw) + slowest_spans = self._find_slowest_spans(root_spans, 3) + + slow_traces_data.append( + { + "trace_id": trace_id, + "duration_ms": round(trace_summary.get("durationMs", 0), 2), + "root_service": trace_summary.get( + "rootServiceName", "unknown" + ), + "key_attributes": trace_attributes, + "slowest_operations": slowest_spans, + "span_count": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 3: Get fastest traces (but not trivially fast) + fast_traces_data = [] + # Filter out very fast traces (likely health checks) + min_duration_threshold = ( + stats["p50_duration_ms"] * 0.1 + ) # At least 10% of median + meaningful_fast = [ + t for t in traces if t.get("durationMs", 0) >= min_duration_threshold + ] + sorted_fast = sorted(meaningful_fast, key=lambda x: x.get("durationMs", 0))[ + :sample_size + ] + + for trace_summary in sorted_fast: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + trace_attributes = self._extract_trace_attributes(trace_raw) + root_spans = build_span_hierarchy(trace_raw) + + fast_traces_data.append( + { + "trace_id": trace_id, + "duration_ms": round(trace_summary.get("durationMs", 0), 2), + "root_service": trace_summary.get( + "rootServiceName", "unknown" + ), + "key_attributes": trace_attributes, + "span_count": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 4: Get typical traces (around median) + typical_traces_data = [] + median = stats["p50_duration_ms"] + # Find traces within 20% of median + typical_traces = [ + t + for t in traces + if median * 0.8 <= t.get("durationMs", 0) <= median * 1.2 + ][:sample_size] + + for trace_summary in typical_traces: + trace_id = trace_summary.get("traceID") + if not trace_id: + continue + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + trace_raw = response.json() + + trace_attributes = 
self._extract_trace_attributes(trace_raw) + root_spans = build_span_hierarchy(trace_raw) + + typical_traces_data.append( + { + "trace_id": trace_id, + "duration_ms": round(trace_summary.get("durationMs", 0), 2), + "root_service": trace_summary.get( + "rootServiceName", "unknown" + ), + "key_attributes": trace_attributes, + "span_count": self._count_spans(root_spans), + } + ) + except Exception: + continue + + # Step 5: Analyze patterns + analysis_insights = self._generate_insights( + slow_traces_data, fast_traces_data, typical_traces_data + ) + + # Format output + result = { + "statistics": stats, + "slow_traces": slow_traces_data, + "fast_traces": fast_traces_data, + "typical_traces": typical_traces_data, + "pattern_analysis": analysis_insights, + } + + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error analyzing traces: {str(e)}", + params=params, + ) + + def _extract_trace_attributes(self, trace_raw: Dict) -> Dict[str, Any]: + """Extract key attributes from trace for analysis""" + attributes = {} + + # Common attributes to look for + interesting_keys = [ + "user.id", + "user.tier", + "user.type", + "user.email", + "customer.id", + "promo.code", + "coupon.code", + "discount.type", + "zone.id", + "region", + "availability_zone", + "datacenter", + "items.count", + "cart.size", + "order.total", + "request.size", + "db.operation", + "db.statement", + "db.table", + "cache.hit", + "http.method", + "http.route", + "http.status_code", + "http.url", + "error", + "error.message", + "error.type", + "feature.flag", + "experiment.id", + "version", + "deployment.id", + "queue.name", + "job.type", + "workflow.step", + ] + + for batch in trace_raw.get("batches", []): + # Check resource attributes + for attr in batch.get("resource", {}).get("attributes", []): + key = attr.get("key", "") + if any( + k in key.lower() + for k in ["service", "version", "namespace", "pod", "node"] + ): + value = ( + list(attr.get("value", {}).values())[0] + if attr.get("value") + else None + ) + if value: + attributes[key] = value + + # Check span attributes + for scope_spans in batch.get("scopeSpans", []): + for span_data in scope_spans.get("spans", []): + for attr in span_data.get("attributes", []): + key = attr.get("key", "") + if key in interesting_keys or any( + k in key.lower() + for k in ["promo", "zone", "user", "error", "db"] + ): + value = ( + list(attr.get("value", {}).values())[0] + if attr.get("value") + else None + ) + if value and key not in attributes: + attributes[key] = value + + return attributes + + def _find_slowest_spans(self, root_spans, limit=3): + """Find the slowest spans in the trace""" + all_spans = [] + + def collect_spans(span): + all_spans.append(span) + for child in span.children: + collect_spans(child) + + for root in root_spans: + collect_spans(root) + + # Sort by duration and get top N + sorted_spans = sorted(all_spans, key=lambda s: s.duration_ms, reverse=True)[ + :limit + ] + + result = [] + for span in sorted_spans: + span_info = { + "operation": span.name, + "service": span.service_name, + "duration_ms": round(span.duration_ms, 2), + } + + # Add relevant attributes + if span.attributes.get("db.statement"): + span_info["db_query"] = span.attributes["db.statement"][:100] + "..." 
+ if span.attributes.get("http.route"): + span_info["http_route"] = span.attributes["http.route"] + + result.append(span_info) + + return result + + def _count_spans(self, root_spans): + """Count total number of spans in trace""" + count = 0 + + def count_recursive(span): + nonlocal count + count += 1 + for child in span.children: + count_recursive(child) + + for root in root_spans: + count_recursive(root) + + return count + + def _generate_insights(self, slow_traces, fast_traces, typical_traces): + """Generate insights by comparing trace groups""" + insights = { + "common_patterns_in_slow_traces": [], + "common_patterns_in_fast_traces": [], + "key_differences": [], + } + + # Analyze attribute patterns + slow_attrs = {} + fast_attrs = {} + + # Collect attribute frequencies in slow traces + for trace in slow_traces: + for key, value in trace.get("key_attributes", {}).items(): + if key not in slow_attrs: + slow_attrs[key] = {} + slow_attrs[key][str(value)] = slow_attrs[key].get(str(value), 0) + 1 + + # Collect attribute frequencies in fast traces + for trace in fast_traces: + for key, value in trace.get("key_attributes", {}).items(): + if key not in fast_attrs: + fast_attrs[key] = {} + fast_attrs[key][str(value)] = fast_attrs[key].get(str(value), 0) + 1 + + # Find patterns unique to slow traces + for key, values in slow_attrs.items(): + if len(slow_traces) > 0: + for value, count in values.items(): + ratio = count / len(slow_traces) + if ratio >= 0.8: # Present in 80%+ of slow traces + fast_count = fast_attrs.get(key, {}).get(value, 0) + fast_ratio = ( + fast_count / len(fast_traces) if len(fast_traces) > 0 else 0 + ) + if fast_ratio < 0.2: # But in less than 20% of fast traces + insights["common_patterns_in_slow_traces"].append( + f"{key}={value} appears in {int(ratio*100)}% of slow traces but only {int(fast_ratio*100)}% of fast traces" + ) + + # Check span count differences + if slow_traces and fast_traces: + avg_slow_spans = sum(t.get("span_count", 0) for t in slow_traces) / len( + slow_traces + ) + avg_fast_spans = sum(t.get("span_count", 0) for t in fast_traces) / len( + fast_traces + ) + if avg_slow_spans > avg_fast_spans * 1.5: + insights["key_differences"].append( + f"Slow traces have {avg_slow_spans:.1f} spans on average vs {avg_fast_spans:.1f} for fast traces" + ) + + # Check for missing attributes + slow_keys = set() + for trace in slow_traces: + slow_keys.update(trace.get("key_attributes", {}).keys()) + + fast_keys = set() + for trace in fast_traces: + fast_keys.update(trace.get("key_attributes", {}).keys()) + + only_in_slow = slow_keys - fast_keys + if only_in_slow: + insights["key_differences"].append( + f"Attributes only in slow traces: {', '.join(only_in_slow)}" + ) + + return insights + + def get_parameterized_one_liner(self, params: Dict) -> str: + return f"{toolset_name_for_one_liner(self._toolset.name)}: Comparative trace analysis" + + class GrafanaTempoToolset(BaseGrafanaTempoToolset): def __init__(self): super().__init__( @@ -607,12 +1339,14 @@ def __init__(self): icon_url="https://grafana.com/static/assets/img/blog/tempo.png", docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanatempo.html", tools=[ + ListServices(self), GetTempoTraces(self), GetTempoTraceById(self), GetTempoTags(self), - AnalyzeTracesByAttributes(self), - FindSlowOperations(self), - CompareTracePeriods(self), + # AnalyzeTracesByAttributes(self), + # FindSlowOperations(self), + # CompareTracePeriods(self), + FetchTracesComparativeSample(self), ], ) template_file_path = 
os.path.abspath( diff --git a/holmes/plugins/toolsets/grafana/trace_parser.py b/holmes/plugins/toolsets/grafana/trace_parser.py index 1910c090b..2913be6e4 100644 --- a/holmes/plugins/toolsets/grafana/trace_parser.py +++ b/holmes/plugins/toolsets/grafana/trace_parser.py @@ -187,7 +187,7 @@ def format_traces_list(trace_data: Dict) -> str: else "\n" ) trace_str += f"\tstartTime={unix_nano_to_rfc3339(int(trace.get('startTimeUnixNano')))}" - trace_str += f" rootServiceName={trace.get('trootServiceName')}" + trace_str += f" rootServiceName={trace.get('rootServiceName')}" trace_str += f" rootTraceName={trace.get('rootTraceName')}" traces_str.append(trace_str) return "\n".join(traces_str) diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml index 3d14e66ba..e611dbc57 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml @@ -66,14 +66,21 @@ before_test: | echo "๐Ÿ” Checking all pods status" kubectl get pods -n app-114 - echo "โฐ Letting traffic generator run for 10 seconds to generate requests" - sleep 10 + echo "โฐ Waiting for traffic generator to produce logs (checking every 3s, timeout 60s)" + PROMO_LOG_FOUND=false + for i in {1..20}; do + if kubectl logs -n app-114 -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then + echo "โœ… Found traffic generator log WITH promo_code after $((i*3)) seconds" + PROMO_LOG_FOUND=true + break + else + echo "โณ Attempt $i/20: No promo_code log yet, waiting 3s..." + sleep 3 + fi + done - echo "๐Ÿ” Verifying traffic generator log entries" - if kubectl logs -n app-114 -l app=traffic-generator --tail=100 | grep -q "WITH promo_code"; then - echo "โœ… Found traffic generator log WITH promo_code" - else - echo "โŒ Missing traffic generator log WITH promo_code" + if [ "$PROMO_LOG_FOUND" = false ]; then + echo "โŒ Missing traffic generator log WITH promo_code after 60 seconds" exit 1 fi From 8102b12cd8c59874a2fb2cdcceae13789b9162af Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Sun, 24 Aug 2025 22:59:27 +0300 Subject: [PATCH 13/15] improvements --- .../toolsets/grafana/toolset_grafana_tempo.py | 56 +++++++++---------- 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index ae7848a2a..5f569d082 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -886,9 +886,7 @@ def _invoke(self, params: Dict) -> StructuredToolResult: # Add service filter (with smart matching) if params.get("service_name"): service = params["service_name"] - # Try exact match first, then with -service suffix, then regex - # For now, use regex for flexibility - filters.append(f'resource.service.name=~"{service}.*"') + filters.append(f'resource.service.name=~".*{service}.*"') # Add namespace filter if params.get("namespace"): @@ -996,14 +994,14 @@ def _invoke(self, params: Dict) -> StructuredToolResult: slow_traces_data.append( { - "trace_id": trace_id, - "duration_ms": round(trace_summary.get("durationMs", 0), 2), - "root_service": trace_summary.get( + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( 
"rootServiceName", "unknown" ), - "key_attributes": trace_attributes, - "slowest_operations": slowest_spans, - "span_count": self._count_spans(root_spans), + "attributes": trace_attributes, + "slowestSpans": slowest_spans, + "spanCount": self._count_spans(root_spans), } ) except Exception: @@ -1045,13 +1043,13 @@ def _invoke(self, params: Dict) -> StructuredToolResult: fast_traces_data.append( { - "trace_id": trace_id, - "duration_ms": round(trace_summary.get("durationMs", 0), 2), - "root_service": trace_summary.get( + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( "rootServiceName", "unknown" ), - "key_attributes": trace_attributes, - "span_count": self._count_spans(root_spans), + "attributes": trace_attributes, + "spanCount": self._count_spans(root_spans), } ) except Exception: @@ -1090,13 +1088,13 @@ def _invoke(self, params: Dict) -> StructuredToolResult: typical_traces_data.append( { - "trace_id": trace_id, - "duration_ms": round(trace_summary.get("durationMs", 0), 2), - "root_service": trace_summary.get( + "traceID": trace_id, + "durationMs": round(trace_summary.get("durationMs", 0), 2), + "rootServiceName": trace_summary.get( "rootServiceName", "unknown" ), - "key_attributes": trace_attributes, - "span_count": self._count_spans(root_spans), + "attributes": trace_attributes, + "spanCount": self._count_spans(root_spans), } ) except Exception: @@ -1227,15 +1225,15 @@ def collect_spans(span): for span in sorted_spans: span_info = { "operation": span.name, - "service": span.service_name, - "duration_ms": round(span.duration_ms, 2), + "serviceName": span.service_name, + "durationMs": round(span.duration_ms, 2), } # Add relevant attributes if span.attributes.get("db.statement"): - span_info["db_query"] = span.attributes["db.statement"][:100] + "..." + span_info["dbStatement"] = span.attributes["db.statement"][:100] + "..." 
if span.attributes.get("http.route"): - span_info["http_route"] = span.attributes["http.route"] + span_info["httpRoute"] = span.attributes["http.route"] result.append(span_info) @@ -1270,14 +1268,14 @@ def _generate_insights(self, slow_traces, fast_traces, typical_traces): # Collect attribute frequencies in slow traces for trace in slow_traces: - for key, value in trace.get("key_attributes", {}).items(): + for key, value in trace.get("attributes", {}).items(): if key not in slow_attrs: slow_attrs[key] = {} slow_attrs[key][str(value)] = slow_attrs[key].get(str(value), 0) + 1 # Collect attribute frequencies in fast traces for trace in fast_traces: - for key, value in trace.get("key_attributes", {}).items(): + for key, value in trace.get("attributes", {}).items(): if key not in fast_attrs: fast_attrs[key] = {} fast_attrs[key][str(value)] = fast_attrs[key].get(str(value), 0) + 1 @@ -1299,10 +1297,10 @@ def _generate_insights(self, slow_traces, fast_traces, typical_traces): # Check span count differences if slow_traces and fast_traces: - avg_slow_spans = sum(t.get("span_count", 0) for t in slow_traces) / len( + avg_slow_spans = sum(t.get("spanCount", 0) for t in slow_traces) / len( slow_traces ) - avg_fast_spans = sum(t.get("span_count", 0) for t in fast_traces) / len( + avg_fast_spans = sum(t.get("spanCount", 0) for t in fast_traces) / len( fast_traces ) if avg_slow_spans > avg_fast_spans * 1.5: @@ -1313,11 +1311,11 @@ def _generate_insights(self, slow_traces, fast_traces, typical_traces): # Check for missing attributes slow_keys = set() for trace in slow_traces: - slow_keys.update(trace.get("key_attributes", {}).keys()) + slow_keys.update(trace.get("attributes", {}).keys()) fast_keys = set() for trace in fast_traces: - fast_keys.update(trace.get("key_attributes", {}).keys()) + fast_keys.update(trace.get("attributes", {}).keys()) only_in_slow = slow_keys - fast_keys if only_in_slow: From a88e07daa597ac3fcdfb9e603500d0bd039aacd5 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 25 Aug 2025 07:19:51 +0300 Subject: [PATCH 14/15] Update toolset_grafana_tempo.py --- .../toolsets/grafana/toolset_grafana_tempo.py | 64 ++++--------------- 1 file changed, 11 insertions(+), 53 deletions(-) diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index 5f569d082..d6d30dca3 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -80,7 +80,7 @@ class GetTempoTraces(Tool): def __init__(self, toolset: BaseGrafanaTempoToolset): super().__init__( name="fetch_tempo_traces", - description="""Lists Tempo traces. At least one of `service_name`, `pod_name` or `deployment_name` argument is required.""", + description="""Lists Tempo traces. At least one of `service_name`, `pod_name` or `deployment_name` argument is required. You should usually call fetch_traces_comparative_sample before calling this tool to first get an overview.""", parameters={ "min_duration": ToolParameter( description="The minimum duration of traces to fetch, e.g., '5s' for 5 seconds.", @@ -839,7 +839,7 @@ def __init__(self, toolset: BaseGrafanaTempoToolset): - For namespace issues: namespace="production" - Combined: service_name="auth", namespace="staging" -The tool automatically compares fast vs slow traces and highlights attribute differences. 
Usually this is the best first tool to call when investigating trace data as it gives a fantastic overview.""", +The tool automatically compares fast vs slow traces and highlights attribute differences. Important: call this tool first when investigating performance issues via traces. This is the best first tool to call as it gives a fantastic overview. You can later call other trace tools for more information.""", parameters={ "service_name": ToolParameter( description="Service to analyze (partial match supported, e.g., 'payment' matches 'payment-service')", @@ -1128,78 +1128,36 @@ def _invoke(self, params: Dict) -> StructuredToolResult: ) def _extract_trace_attributes(self, trace_raw: Dict) -> Dict[str, Any]: - """Extract key attributes from trace for analysis""" + """Extract ALL attributes from trace for analysis""" attributes = {} - # Common attributes to look for - interesting_keys = [ - "user.id", - "user.tier", - "user.type", - "user.email", - "customer.id", - "promo.code", - "coupon.code", - "discount.type", - "zone.id", - "region", - "availability_zone", - "datacenter", - "items.count", - "cart.size", - "order.total", - "request.size", - "db.operation", - "db.statement", - "db.table", - "cache.hit", - "http.method", - "http.route", - "http.status_code", - "http.url", - "error", - "error.message", - "error.type", - "feature.flag", - "experiment.id", - "version", - "deployment.id", - "queue.name", - "job.type", - "workflow.step", - ] - for batch in trace_raw.get("batches", []): - # Check resource attributes + # Extract all resource attributes for attr in batch.get("resource", {}).get("attributes", []): key = attr.get("key", "") - if any( - k in key.lower() - for k in ["service", "version", "namespace", "pod", "node"] - ): + if key: value = ( list(attr.get("value", {}).values())[0] if attr.get("value") else None ) - if value: + if value is not None: attributes[key] = value - # Check span attributes + # Extract all span attributes (from first span of each type we haven't seen) for scope_spans in batch.get("scopeSpans", []): for span_data in scope_spans.get("spans", []): for attr in span_data.get("attributes", []): key = attr.get("key", "") - if key in interesting_keys or any( - k in key.lower() - for k in ["promo", "zone", "user", "error", "db"] - ): + if ( + key and key not in attributes + ): # Only add if we haven't seen this key yet value = ( list(attr.get("value", {}).values())[0] if attr.get("value") else None ) - if value and key not in attributes: + if value is not None: attributes[key] = value return attributes From 68648019277244e4a0065d2dfc02b3c6075b7cd5 Mon Sep 17 00:00:00 2001 From: Robusta Runner Date: Mon, 25 Aug 2025 09:41:02 +0300 Subject: [PATCH 15/15] fix --- .../toolsets/grafana/toolset_grafana_tempo.py | 244 +++++++++++++++++- .../checkout-service.yaml | 2 +- .../test_case.yaml | 131 ++++++++++ .../toolsets.yaml | 4 +- .../traffic-generator.yaml | 4 +- .../checkout-service.yaml | 169 ++++++++++++ .../test_case.yaml | 42 +-- .../toolsets.yaml | 12 + .../traffic-generator.yaml | 157 +++++++++++ 9 files changed, 726 insertions(+), 39 deletions(-) rename tests/llm/fixtures/test_ask_holmes/{114_checkout_latency_tracing_rebuild => 114a_checkout_latency_tracing_comparative}/checkout-service.yaml (98%) create mode 100644 tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml rename tests/llm/fixtures/test_ask_holmes/{114_checkout_latency_tracing_rebuild => 114a_checkout_latency_tracing_comparative}/toolsets.yaml (59%) rename 
tests/llm/fixtures/test_ask_holmes/{114_checkout_latency_tracing_rebuild => 114a_checkout_latency_tracing_comparative}/traffic-generator.yaml (97%) create mode 100644 tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml rename tests/llm/fixtures/test_ask_holmes/{114_checkout_latency_tracing_rebuild => 114b_checkout_latency_tracing_simple}/test_case.yaml (69%) create mode 100644 tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml create mode 100644 tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml diff --git a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py index d6d30dca3..9894899f8 100644 --- a/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py +++ b/holmes/plugins/toolsets/grafana/toolset_grafana_tempo.py @@ -50,6 +50,8 @@ class GrafanaTempoLabelsConfig(BaseModel): class GrafanaTempoConfig(GrafanaConfig): labels: GrafanaTempoLabelsConfig = GrafanaTempoLabelsConfig() + enable_comparative_sample: bool = False + enable_simple_comparison: bool = True class BaseGrafanaTempoToolset(BaseGrafanaToolset): @@ -832,14 +834,14 @@ class FetchTracesComparativeSample(Tool): def __init__(self, toolset: BaseGrafanaTempoToolset): super().__init__( name="fetch_traces_comparative_sample", - description="""Fetches statistics and representative samples of fast, slow, and typical traces for comparative analysis to identify performance patterns. + description="""Fetches statistics and representative samples of fast, slow, and typical traces for performance analysis. + +Important: call this tool first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. Examples: - For service latency: service_name="payment" (matches "payment-service" too) - For namespace issues: namespace="production" -- Combined: service_name="auth", namespace="staging" - -The tool automatically compares fast vs slow traces and highlights attribute differences. Important: call this tool first when investigating performance issues via traces. This is the best first tool to call as it gives a fantastic overview. You can later call other trace tools for more information.""", +- Combined: service_name="auth", namespace="staging\"""", parameters={ "service_name": ToolParameter( description="Service to analyze (partial match supported, e.g., 'payment' matches 'payment-service')", @@ -1287,6 +1289,203 @@ def get_parameterized_one_liner(self, params: Dict) -> str: return f"{toolset_name_for_one_liner(self._toolset.name)}: Comparative trace analysis" +class FetchTracesSimpleComparison(Tool): + def __init__(self, toolset: BaseGrafanaTempoToolset): + super().__init__( + name="fetch_traces_comparative_sample", + description="""Fetches statistics and representative samples of fast, slow, and typical traces for performance analysis. + +Important: call this tool first when investigating performance issues via traces. This tool provides comprehensive analysis for identifying patterns. 
+ +Examples: +- For service latency: service_name="payment" (matches "payment-service" too) +- For namespace issues: namespace="production" +- Combined: service_name="auth", namespace="staging\"""", + parameters={ + "service_name": ToolParameter( + description="Service to analyze (partial match supported)", + type="string", + required=False, + ), + "namespace": ToolParameter( + description="Kubernetes namespace to filter traces", + type="string", + required=False, + ), + "base_query": ToolParameter( + description="Custom TraceQL filter", + type="string", + required=False, + ), + "sample_count": ToolParameter( + description="Number of traces to fetch from each category (fastest/slowest). Default 3", + type="integer", + required=False, + ), + "start_datetime": ToolParameter( + description="Start time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + "end_datetime": ToolParameter( + description="End time for analysis (RFC3339 or relative)", + type="string", + required=False, + ), + }, + ) + self._toolset = toolset + + def _invoke(self, params: Dict) -> StructuredToolResult: + try: + # Build query (same as before) + if params.get("base_query"): + base_query = params["base_query"] + else: + filters = [] + if params.get("service_name"): + service = params["service_name"] + filters.append(f'resource.service.name=~".*{service}.*"') + if params.get("namespace"): + namespace = params["namespace"] + filters.append(f'resource.k8s.namespace.name="{namespace}"') + + if not filters: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error="Either base_query, service_name, or namespace is required", + params=params, + ) + base_query = " && ".join(filters) + + sample_count = params.get("sample_count", 3) + + start, end = process_timestamps_to_int( + params.get("start_datetime"), + params.get("end_datetime"), + default_time_span_seconds=DEFAULT_TIME_SPAN_SECONDS, + ) + + base_url = get_base_url(self._toolset.grafana_config) + + # Step 1: Get all trace summaries + stats_query = f"{{{base_query}}}" + all_traces_response = query_tempo_traces( + base_url=base_url, + api_key=self._toolset.grafana_config.api_key, + headers=self._toolset.grafana_config.headers, + query=stats_query, + start=start, + end=end, + limit=1000, + ) + + traces = all_traces_response.get("traces", []) + if not traces: + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data="No traces found matching the query", + params=params, + ) + + # Step 2: Sort traces by duration + sorted_traces = sorted(traces, key=lambda x: x.get("durationMs", 0)) + + # Step 3: Calculate basic statistics + durations = [t.get("durationMs", 0) for t in sorted_traces] + stats = { + "trace_count": len(durations), + "min_ms": durations[0], + "p25_ms": durations[len(durations) // 4] + if len(durations) >= 4 + else durations[0], + "p50_ms": durations[len(durations) // 2], + "p75_ms": durations[3 * len(durations) // 4] + if len(durations) >= 4 + else durations[-1], + "p90_ms": durations[int(len(durations) * 0.9)] + if len(durations) >= 10 + else durations[-1], + "p99_ms": durations[int(len(durations) * 0.99)] + if len(durations) >= 100 + else durations[-1], + "max_ms": durations[-1], + } + + # Step 4: Select representative traces to fetch + fastest_indices = list(range(min(sample_count, len(sorted_traces)))) + slowest_indices = list( + range(max(0, len(sorted_traces) - sample_count), len(sorted_traces)) + ) + + # Add median trace + median_idx = len(sorted_traces) // 2 + + # Step 5: Fetch full trace details + def 
fetch_full_trace(trace_summary): + trace_id = trace_summary.get("traceID") + if not trace_id: + return None + + try: + url = f"{base_url}/api/traces/{trace_id}" + response = requests.get( + url, + headers=build_headers( + api_key=self._toolset.grafana_config.api_key, + additional_headers=self._toolset.grafana_config.headers, + ), + timeout=5, + ) + response.raise_for_status() + return { + "traceID": trace_id, + "durationMs": trace_summary.get("durationMs", 0), + "rootServiceName": trace_summary.get( + "rootServiceName", "unknown" + ), + "traceData": response.json(), # Raw trace data + } + except Exception: + return { + "traceID": trace_id, + "durationMs": trace_summary.get("durationMs", 0), + "error": "Failed to fetch full trace", + } + + # Fetch the selected traces + result = { + "statistics": stats, + "all_trace_durations_ms": durations, # All durations for distribution analysis + "fastest_traces": [ + fetch_full_trace(sorted_traces[i]) for i in fastest_indices + ], + "median_trace": fetch_full_trace(sorted_traces[median_idx]), + "slowest_traces": [ + fetch_full_trace(sorted_traces[i]) for i in slowest_indices + ], + } + + # Return as YAML for readability + return StructuredToolResult( + status=ToolResultStatus.SUCCESS, + data=yaml.dump(result, default_flow_style=False, sort_keys=False), + params=params, + ) + + except Exception as e: + return StructuredToolResult( + status=ToolResultStatus.ERROR, + error=f"Error fetching traces: {str(e)}", + params=params, + ) + + def get_parameterized_one_liner(self, params: Dict) -> str: + return ( + f"{toolset_name_for_one_liner(self._toolset.name)}: Simple trace comparison" + ) + + class GrafanaTempoToolset(BaseGrafanaTempoToolset): def __init__(self): super().__init__( @@ -1294,18 +1493,35 @@ def __init__(self): description="Fetches kubernetes traces from Tempo", icon_url="https://grafana.com/static/assets/img/blog/tempo.png", docs_url="https://docs.robusta.dev/master/configuration/holmesgpt/toolsets/grafanatempo.html", - tools=[ - ListServices(self), - GetTempoTraces(self), - GetTempoTraceById(self), - GetTempoTags(self), - # AnalyzeTracesByAttributes(self), - # FindSlowOperations(self), - # CompareTracePeriods(self), - FetchTracesComparativeSample(self), - ], + tools=[], # Will be populated in prerequisites_callable ) template_file_path = os.path.abspath( os.path.join(os.path.dirname(__file__), "toolset_grafana_tempo.jinja2") ) self._load_llm_instructions(jinja_template=f"file://{template_file_path}") + + def prerequisites_callable(self, config: dict[str, Any]) -> tuple[bool, str]: + # Call parent to validate config + success, msg = super().prerequisites_callable(config) + if not success: + return success, msg + + # Build tools list based on config + tools = [ + ListServices(self), + GetTempoTraces(self), + GetTempoTraceById(self), + GetTempoTags(self), + ] + + # Add comparison tools conditionally + if self.grafana_config.enable_comparative_sample: + tools.append(FetchTracesComparativeSample(self)) + + if self.grafana_config.enable_simple_comparison: + tools.append(FetchTracesSimpleComparison(self)) + + # Update the tools list + self.tools = tools + + return True, "" diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml similarity index 98% rename from tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml rename to 
tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml index 783958e99..c7eadcffe 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/checkout-service.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/checkout-service.yaml @@ -23,7 +23,7 @@ stringData: trace.set_tracer_provider(provider) otlp_exporter = OTLPSpanExporter( - endpoint="tempo.app-114.svc.cluster.local:4317", + endpoint="tempo.app-114a.svc.cluster.local:4317", insecure=True ) provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) diff --git a/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml new file mode 100644 index 000000000..978daf667 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/test_case.yaml @@ -0,0 +1,131 @@ +user_prompt: +- "The checkout service in namespace app-114a is experiencing high latency. Investigate why." + +expected_output: + - The answer must explicitly state that queries that include a promo code are slow or all slow requests include promo code. + - Including a query that mentions promo code is not sufficient. + +tags: + - kubernetes + - hard + - chain-of-causation + - traces + +port_forwards: + - namespace: app-114a + service: tempo + local_port: 3214 + remote_port: 3200 + +before_test: | + echo "๐Ÿš€ Setting up test 114a - Creating namespace app-114a" + kubectl create namespace app-114a || true + echo "โœ… Namespace app-114a created successfully!" + + echo "๐Ÿ“ฆ Deploying Tempo from shared config" + kubectl apply -f ../../shared/tempo.yaml -n app-114a + + echo "โณ Waiting for Tempo pod to be ready" + kubectl wait --for=condition=ready pod -l app=tempo -n app-114a --timeout=60s + + echo "โฐ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" + TEMPO_READY=false + for i in {1..12}; do + if kubectl exec -n app-114a deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then + echo "โœ… Tempo is ready!" + TEMPO_READY=true + break + else + echo "โณ Attempt $i/12: Tempo not ready yet, waiting 5s..." + sleep 5 + fi + done + + if [ "$TEMPO_READY" = false ]; then + echo "โŒ Tempo failed to become ready after 60 seconds" + exit 1 + fi + + echo "โœ… Tempo deployment complete!" + + echo "๐Ÿ›’ Deploying checkout service" + kubectl apply -f checkout-service.yaml -n app-114a + + echo "โณ Waiting for checkout pod to be ready" + kubectl wait --for=condition=ready pod -l app=checkout -n app-114a --timeout=60s + + echo "๐Ÿ” Checking checkout deployment status" + kubectl get pods -n app-114a -l app=checkout + + echo "๐Ÿšฆ Deploying traffic generator" + kubectl apply -f traffic-generator.yaml -n app-114a + + echo "โณ Waiting for traffic generator pod to be ready" + kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114a --timeout=60s + + echo "๐Ÿ” Checking all pods status" + kubectl get pods -n app-114a + + echo "โฐ Waiting for traffic generator to produce logs (checking every 3s, timeout 60s)" + PROMO_LOG_FOUND=false + for i in {1..20}; do + if kubectl logs -n app-114a -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then + echo "โœ… Found traffic generator log WITH promo_code after $((i*3)) seconds" + PROMO_LOG_FOUND=true + break + else + echo "โณ Attempt $i/20: No promo_code log yet, waiting 3s..." 
+ sleep 3 + fi + done + + if [ "$PROMO_LOG_FOUND" = false ]; then + echo "โŒ Missing traffic generator log WITH promo_code after 60 seconds" + exit 1 + fi + + if kubectl logs -n app-114a -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then + echo "โœ… Found traffic generator log WITHOUT promo_code" + else + echo "โŒ Missing traffic generator log WITHOUT promo_code" + exit 1 + fi + + if kubectl logs -n app-114a -l app=checkout --tail=100 | grep -q "Processing checkout request"; then + echo "โœ… Found checkout request log" + else + echo "โŒ Missing checkout request log" + exit 1 + fi + + # Commented out traffic generator trace checks as it no longer sends traces + # echo "๐Ÿ” Querying Tempo for traces from traffic generator" + # TRAFFIC_GEN_TRACES=$(curl -s "http://localhost:3200/api/search?tags=service.name%3Dtraffic-generator&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator" + + echo "๐Ÿ” Querying Tempo for traces from checkout service" + CHECKOUT_TRACES=$(kubectl run -n app-114a tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + echo "Found $CHECKOUT_TRACES traces from checkout-service" + + # Commented out traffic generator trace check + # if [ "$TRAFFIC_GEN_TRACES" -gt "0" ]; then + # echo "โœ… Found traces from traffic-generator" + # else + # echo "โŒ No traces found from traffic-generator" + # exit 1 + # fi + + if [ "$CHECKOUT_TRACES" -gt "0" ]; then + echo "โœ… Found traces from checkout-service" + else + echo "โŒ No traces found from checkout-service" + exit 1 + fi + + # Delete Traffic generator so the ai won't cheat + kubectl delete -f traffic-generator.yaml -n app-114a + + echo "โœ… Test setup complete!" 
+ +after_test: | + kubectl delete namespace app-114a || true diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml similarity index 59% rename from tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml rename to tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml index 7298e0687..3facea342 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/toolsets.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/toolsets.yaml @@ -6,5 +6,7 @@ toolsets: grafana/tempo: enabled: true config: - url: http://localhost:3200 + url: http://localhost:3214 healthcheck: "ready" + enable_comparative_sample: true + enable_simple_comparison: false diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml similarity index 97% rename from tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml rename to tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml index 0ba5b9086..675d0eaaf 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/traffic-generator.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114a_checkout_latency_tracing_comparative/traffic-generator.yaml @@ -23,7 +23,7 @@ stringData: # trace.set_tracer_provider(provider) # otlp_exporter = OTLPSpanExporter( - # endpoint="tempo.app-114.svc.cluster.local:4317", + # endpoint="tempo.app-114a.svc.cluster.local:4317", # insecure=True # ) # provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) @@ -33,7 +33,7 @@ stringData: # tracer = trace.get_tracer(__name__) - CHECKOUT_URL = "http://checkout.app-114.svc.cluster.local:8080/checkout" + CHECKOUT_URL = "http://checkout.app-114a.svc.cluster.local:8080/checkout" ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2'] PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25'] diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml new file mode 100644 index 000000000..8c6a0f088 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/checkout-service.yaml @@ -0,0 +1,169 @@ +# Simplified Checkout Service with Dummy SQL +apiVersion: v1 +kind: Secret +metadata: + name: checkout-app +type: Opaque +stringData: + app.py: | + import os + import time + import random + from flask import Flask, request, jsonify + from opentelemetry import trace + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource + from opentelemetry.instrumentation.flask import FlaskInstrumentor + + # Configure OpenTelemetry + resource = Resource.create({"service.name": "checkout-service"}) + provider = TracerProvider(resource=resource) + trace.set_tracer_provider(provider) + + otlp_exporter = OTLPSpanExporter( + endpoint="tempo.app-114b.svc.cluster.local:4317", + insecure=True + ) + 
provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + app = Flask(__name__) + FlaskInstrumentor().instrument_app(app) + + tracer = trace.get_tracer(__name__) + + @app.route('/health') + def health(): + return 'OK' + + @app.route('/checkout', methods=['POST']) + def checkout(): + with tracer.start_as_current_span("process_checkout") as span: + data = request.json or {} + + # Log the incoming request (without revealing the data) + print(f"[CHECKOUT] Processing checkout request for user {data.get('user_id', 'guest')}", flush=True) + + # Extract parameters + user_id = data.get('user_id', 'guest') + zone_id = data.get('zone_id', 'us-west-1') + promo_code = data.get('promo_code') + items = data.get('items', []) + + # Add span attributes + span.set_attribute("user.id", user_id) + span.set_attribute("zone.id", zone_id) + span.set_attribute("items.count", len(items)) + if promo_code: + span.set_attribute("promo.code", promo_code) + + # Simulate database query for shipping calculation + with tracer.start_as_current_span("database_query") as db_span: + db_span.set_attribute("db.system", "postgresql") + db_span.set_attribute("db.operation", "SELECT") + + if promo_code: + # Simulate slow query with promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND promo_code = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(1.5, 3.5) + time.sleep(sleep_time) # Simulate slow query + shipping_rate = 4.5 + discount = 15.0 + else: + # Simulate fast query without promo_code + query = "SELECT rate_per_kg, discount_percent FROM shipping_rates WHERE zone_id = ? AND active = true" + db_span.set_attribute("db.statement", query) + # print(f"[DB] Executing shipping rate query", flush=True) + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) # Simulate fast query + shipping_rate = 5.0 + discount = 0.0 + + # Calculate shipping cost + total_weight = sum(item.get('weight', 1.0) for item in items) + shipping_cost = total_weight * shipping_rate * (1 - discount/100) + + # Calculate total + subtotal = sum(item.get('price', 0) for item in items) + total = subtotal + shipping_cost + + response = { + "order_id": f"ord-{random.randint(1000, 9999)}", + "subtotal": subtotal, + "shipping": round(shipping_cost, 2), + "total": round(total, 2) + } + + print(f"[CHECKOUT] Completed checkout request", flush=True) + return jsonify(response) + + if __name__ == '__main__': + print("[CHECKOUT] Starting checkout service on port 8080", flush=True) + app.run(host='0.0.0.0', port=8080) +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: checkout +spec: + replicas: 1 + selector: + matchLabels: + app: checkout + template: + metadata: + labels: + app: checkout + spec: + containers: + - name: checkout + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install flask opentelemetry-api opentelemetry-sdk \ + opentelemetry-instrumentation-flask \ + opentelemetry-exporter-otlp-proto-grpc && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + ports: + - containerPort: 8080 + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + successThreshold: 1 + failureThreshold: 24 + resources: + requests: + memory: "128Mi" + cpu: "50m" + limits: + memory: "256Mi" + cpu: "200m" + volumes: + - name: app + secret: + secretName: 
checkout-app +--- +apiVersion: v1 +kind: Service +metadata: + name: checkout +spec: + selector: + app: checkout + ports: + - port: 8080 + targetPort: 8080 diff --git a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml similarity index 69% rename from tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml rename to tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml index e611dbc57..ed24c676c 100644 --- a/tests/llm/fixtures/test_ask_holmes/114_checkout_latency_tracing_rebuild/test_case.yaml +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/test_case.yaml @@ -1,5 +1,5 @@ user_prompt: -- "The checkout service in namespace app-114 is experiencing high latency. Investigate why." +- "The checkout service in namespace app-114b is experiencing high latency. Investigate why." expected_output: - The answer must explicitly state that queries that include a promo code are slow or all slow requests include promo code. @@ -12,26 +12,26 @@ tags: - traces port_forwards: - - namespace: app-114 + - namespace: app-114b service: tempo - local_port: 3200 + local_port: 3215 remote_port: 3200 before_test: | - echo "๐Ÿš€ Setting up test 114 - Creating namespace app-114" - kubectl create namespace app-114 || true - echo "โœ… Namespace app-114 created successfully!" + echo "๐Ÿš€ Setting up test 114 - Creating namespace app-114b" + kubectl create namespace app-114b || true + echo "โœ… Namespace app-114b created successfully!" echo "๐Ÿ“ฆ Deploying Tempo from shared config" - kubectl apply -f ../../shared/tempo.yaml -n app-114 + kubectl apply -f ../../shared/tempo.yaml -n app-114b echo "โณ Waiting for Tempo pod to be ready" - kubectl wait --for=condition=ready pod -l app=tempo -n app-114 --timeout=60s + kubectl wait --for=condition=ready pod -l app=tempo -n app-114b --timeout=60s echo "โฐ Waiting for Tempo to be fully ready (checking every 5s, timeout 60s)" TEMPO_READY=false for i in {1..12}; do - if kubectl exec -n app-114 deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then + if kubectl exec -n app-114b deployment/tempo -- wget -q -O - http://localhost:3200/ready 2>/dev/null; then echo "โœ… Tempo is ready!" TEMPO_READY=true break @@ -49,27 +49,27 @@ before_test: | echo "โœ… Tempo deployment complete!" 
echo "๐Ÿ›’ Deploying checkout service" - kubectl apply -f checkout-service.yaml -n app-114 + kubectl apply -f checkout-service.yaml -n app-114b echo "โณ Waiting for checkout pod to be ready" - kubectl wait --for=condition=ready pod -l app=checkout -n app-114 --timeout=60s + kubectl wait --for=condition=ready pod -l app=checkout -n app-114b --timeout=60s echo "๐Ÿ” Checking checkout deployment status" - kubectl get pods -n app-114 -l app=checkout + kubectl get pods -n app-114b -l app=checkout echo "๐Ÿšฆ Deploying traffic generator" - kubectl apply -f traffic-generator.yaml -n app-114 + kubectl apply -f traffic-generator.yaml -n app-114b echo "โณ Waiting for traffic generator pod to be ready" - kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114 --timeout=60s + kubectl wait --for=condition=ready pod -l app=traffic-generator -n app-114b --timeout=60s echo "๐Ÿ” Checking all pods status" - kubectl get pods -n app-114 + kubectl get pods -n app-114b echo "โฐ Waiting for traffic generator to produce logs (checking every 3s, timeout 60s)" PROMO_LOG_FOUND=false for i in {1..20}; do - if kubectl logs -n app-114 -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then + if kubectl logs -n app-114b -l app=traffic-generator --tail=100 2>/dev/null | grep -q "WITH promo_code"; then echo "โœ… Found traffic generator log WITH promo_code after $((i*3)) seconds" PROMO_LOG_FOUND=true break @@ -84,14 +84,14 @@ before_test: | exit 1 fi - if kubectl logs -n app-114 -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then + if kubectl logs -n app-114b -l app=traffic-generator --tail=100 | grep -q "WITHOUT promo_code"; then echo "โœ… Found traffic generator log WITHOUT promo_code" else echo "โŒ Missing traffic generator log WITHOUT promo_code" exit 1 fi - if kubectl logs -n app-114 -l app=checkout --tail=100 | grep -q "Processing checkout request"; then + if kubectl logs -n app-114b -l app=checkout --tail=100 | grep -q "Processing checkout request"; then echo "โœ… Found checkout request log" else echo "โŒ Missing checkout request log" @@ -104,7 +104,7 @@ before_test: | # echo "Found $TRAFFIC_GEN_TRACES traces from traffic-generator" echo "๐Ÿ” Querying Tempo for traces from checkout service" - CHECKOUT_TRACES=$(kubectl run -n app-114 tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") + CHECKOUT_TRACES=$(kubectl run -n app-114b tempo-query --rm -i --restart=Never --image=curlimages/curl -- -s "http://tempo:3200/api/search?tags=service.name%3Dcheckout-service&limit=10" 2>/dev/null | grep -o '"traceID"' | wc -l || echo "0") echo "Found $CHECKOUT_TRACES traces from checkout-service" # Commented out traffic generator trace check @@ -123,9 +123,9 @@ before_test: | fi # Delete Traffic generator so the ai won't cheat - kubectl delete -f traffic-generator.yaml -n app-114 + kubectl delete -f traffic-generator.yaml -n app-114b echo "โœ… Test setup complete!" 
after_test: | - kubectl delete namespace app-114 || true + kubectl delete namespace app-114b || true diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml new file mode 100644 index 000000000..05d06ab94 --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/toolsets.yaml @@ -0,0 +1,12 @@ +toolsets: + kubernetes/core: + enabled: true + kubernetes/logs: + enabled: true + grafana/tempo: + enabled: true + config: + url: http://localhost:3215 + healthcheck: "ready" + enable_comparative_sample: false + enable_simple_comparison: true diff --git a/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml new file mode 100644 index 000000000..98c6443fa --- /dev/null +++ b/tests/llm/fixtures/test_ask_holmes/114b_checkout_latency_tracing_simple/traffic-generator.yaml @@ -0,0 +1,157 @@ +# Traffic Generator Deployment +apiVersion: v1 +kind: Secret +metadata: + name: traffic-generator-app +type: Opaque +stringData: + app.py: | + import time + import random + import requests + from datetime import datetime + # from opentelemetry import trace + # from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + # from opentelemetry.sdk.trace import TracerProvider + # from opentelemetry.sdk.trace.export import BatchSpanProcessor + # from opentelemetry.sdk.resources import Resource + # from opentelemetry.instrumentation.requests import RequestsInstrumentor + + # # Configure OpenTelemetry + # resource = Resource.create({"service.name": "traffic-generator"}) + # provider = TracerProvider(resource=resource) + # trace.set_tracer_provider(provider) + + # otlp_exporter = OTLPSpanExporter( + # endpoint="tempo.app-114b.svc.cluster.local:4317", + # insecure=True + # ) + # provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) + + # # Instrument requests library + # RequestsInstrumentor().instrument() + + # tracer = trace.get_tracer(__name__) + + CHECKOUT_URL = "http://checkout.app-114b.svc.cluster.local:8080/checkout" + ZONES = ['us-west-1', 'us-west-2', 'us-east-1', 'us-east-2'] + PROMO_CODES = ['SAVE10', 'WELCOME20', 'HOLIDAY15', 'SPECIAL25'] + + def generate_traffic(): + """Continuously generate traffic to checkout service""" + print("[TRAFFIC-GEN] Starting traffic generator", flush=True) + request_count = 0 + + while True: + request_count += 1 + + # 30% chance to include promo code + include_promo = random.random() < 0.3 + + # Build request data + data = { + "user_id": f"user-{random.randint(1000, 9999)}", + "zone_id": random.choice(ZONES), + "items": [ + { + "id": f"item-{i}", + "price": round(random.uniform(10, 200), 2), + "weight": round(random.uniform(0.5, 5.0), 2) + } + for i in range(random.randint(1, 3)) + ] + } + + if include_promo: + data["promo_code"] = random.choice(PROMO_CODES) + + # Log the request + promo_status = "WITH" if include_promo else "WITHOUT" + print(f"[TRAFFIC-GEN] Request #{request_count}: Sending request {promo_status} promo_code", flush=True) + + # Make request with tracing + # with tracer.start_as_current_span("checkout_request") as span: + # span.set_attribute("request.number", request_count) + # span.set_attribute("has.promo_code", include_promo) + # if include_promo: + # span.set_attribute("promo.code", data.get("promo_code")) + + try: + start_time = 
time.time() + response = requests.post(CHECKOUT_URL, json=data, timeout=10) + latency = time.time() - start_time + + # span.set_attribute("http.status_code", response.status_code) + # span.set_attribute("response.latency", latency) + + status = "success" if response.status_code == 200 else f"error({response.status_code})" + print(f"[TRAFFIC-GEN] Request #{request_count}: Response status={status}, latency={latency:.2f}s", flush=True) + + except Exception as e: + # span.record_exception(e) + # span.set_status(trace.Status(trace.StatusCode.ERROR, str(e))) + print(f"[TRAFFIC-GEN] Request #{request_count}: Error - {str(e)}", flush=True) + + # Wait 100ms to 200ms second before next request + sleep_time = random.uniform(0.1, 0.2) + time.sleep(sleep_time) + + if __name__ == '__main__': + print("[TRAFFIC-GEN] Starting...", flush=True) + + # Start generating traffic + generate_traffic() +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: traffic-generator +spec: + replicas: 1 + selector: + matchLabels: + app: traffic-generator + template: + metadata: + labels: + app: traffic-generator + spec: + containers: + - name: traffic-generator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install requests && \ + # pip install opentelemetry-api opentelemetry-sdk \ + # opentelemetry-instrumentation-requests \ + # opentelemetry-exporter-otlp-proto-grpc && \ + touch /tmp/ready && \ + python /app/app.py + volumeMounts: + - name: app + mountPath: /app + env: + - name: PYTHONUNBUFFERED + value: "1" + startupProbe: + exec: + command: + - cat + - /tmp/ready + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 12 + resources: + requests: + memory: "64Mi" + cpu: "25m" + limits: + memory: "128Mi" + cpu: "100m" + volumes: + - name: app + secret: + secretName: traffic-generator-app
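
For readers reviewing the sampling logic added in this series: both new tools (FetchTracesComparativeSample and FetchTracesSimpleComparison) build a TraceQL filter from the optional service_name/namespace parameters (for example {resource.service.name=~".*checkout.*" && resource.k8s.namespace.name="app-114b"}), fetch up to 1000 trace summaries from Tempo, sort them by durationMs, report duration percentiles, and only then pull full details for a handful of representative traces (fastest, median/typical, slowest). Below is a minimal, dependency-free sketch of that sort-and-sample step under those assumptions; the function name summarize_trace_durations and the fake input data are illustrative only and are not part of the toolset API.

# Illustrative sketch (not the actual toolset code): percentile stats and
# representative sampling over Tempo-style trace summaries.
def summarize_trace_durations(traces, sample_count=3):
    # `traces` mimics Tempo search results: [{"traceID": ..., "durationMs": ...}, ...]
    ordered = sorted(traces, key=lambda t: t.get("durationMs", 0))
    durations = [t.get("durationMs", 0) for t in ordered]
    n = len(durations)
    if n == 0:
        return {"statistics": {"trace_count": 0}}

    def pct(p):
        # Nearest-rank percentile on the sorted duration list.
        return durations[min(int(n * p), n - 1)]

    stats = {
        "trace_count": n,
        "min_ms": durations[0],
        "p50_ms": pct(0.50),
        "p90_ms": pct(0.90),
        "p99_ms": pct(0.99),
        "max_ms": durations[-1],
    }
    return {
        "statistics": stats,
        # Only these few traces would be fetched in full in a second pass.
        "fastest": ordered[:sample_count],
        "median": ordered[n // 2],
        "slowest": ordered[-sample_count:],
    }

if __name__ == "__main__":
    from pprint import pprint
    fake = [{"traceID": f"t{i}", "durationMs": d}
            for i, d in enumerate([120, 95, 3100, 110, 2900, 130, 105, 3000])]
    pprint(summarize_trace_durations(fake))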
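
The richer FetchTracesComparativeSample variant additionally compares attribute frequencies between the slow and fast groups, flagging attribute values that appear in at least 80% of slow traces but fewer than 20% of fast traces (the thresholds used in _generate_insights above). A sketch of that comparison follows, again with illustrative helper names (attribute_frequencies, patterns_unique_to_slow) rather than the real method names. On the checkout fixtures in this series, this is the comparison that should surface promo.code as the attribute separating slow checkout requests from fast ones.

# Illustrative sketch of the slow-vs-fast attribute comparison.
from collections import defaultdict

def attribute_frequencies(traces):
    # traces: [{"attributes": {"promo.code": "SAVE10", ...}}, ...]
    freqs = defaultdict(lambda: defaultdict(int))
    for trace in traces:
        for key, value in trace.get("attributes", {}).items():
            freqs[key][str(value)] += 1
    return freqs

def patterns_unique_to_slow(slow_traces, fast_traces,
                            slow_threshold=0.8, fast_threshold=0.2):
    slow_freqs = attribute_frequencies(slow_traces)
    fast_freqs = attribute_frequencies(fast_traces)
    findings = []
    for key, values in slow_freqs.items():
        for value, count in values.items():
            slow_ratio = count / len(slow_traces)
            fast_count = fast_freqs.get(key, {}).get(value, 0)
            fast_ratio = fast_count / len(fast_traces) if fast_traces else 0
            if slow_ratio >= slow_threshold and fast_ratio < fast_threshold:
                findings.append(
                    f"{key}={value} appears in {slow_ratio:.0%} of slow traces "
                    f"but only {fast_ratio:.0%} of fast traces"
                )
    return findings

if __name__ == "__main__":
    slow = [{"attributes": {"promo.code": "SAVE10", "zone.id": "us-west-1"}},
            {"attributes": {"promo.code": "SAVE10", "zone.id": "us-east-1"}}]
    fast = [{"attributes": {"zone.id": "us-west-1"}},
            {"attributes": {"zone.id": "us-east-2"}}]
    print(patterns_unique_to_slow(slow, fast))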