Merge pull request #22 from volcengine/feat/support-agent-server-telemetry

innsd · web-flow · commit f80ef9506909 · 2025-12-12T13:16:48.000+08:00
feat: support agent server telemetry
diff --git a/agentkit/apps/agent_server_app/agent_server_app.py b/agentkit/apps/agent_server_app/agent_server_app.py
@@ -19,6 +19,7 @@
 from fastapi import Request
 from fastapi import HTTPException
 from fastapi.responses import StreamingResponse
+from opentelemetry import trace
 from google.adk.agents.base_agent import BaseAgent
 from google.adk.artifacts.in_memory_artifact_service import (
     InMemoryArtifactService,
@@ -41,6 +42,8 @@
 from veadk.memory.short_term_memory import ShortTermMemory
 
 from agentkit.apps.base_app import BaseAgentkitApp
+from agentkit.apps.agent_server_app.telemetry import telemetry
+from agentkit.apps.agent_server_app.middleware import AgentkitTelemetryHTTPMiddleware
 
 
 class AgentKitAgentLoader(BaseAgentLoader):
@@ -87,7 +90,13 @@ def __init__(
 
         self.app = self.server.get_fast_api_app()
 
+        # Attach ASGI middleware for unified telemetry across all routes
+        self.app.add_middleware(AgentkitTelemetryHTTPMiddleware)
+
         async def _invoke_compat(request: Request):
+            # Use current request span from middleware for telemetry
+            span = trace.get_current_span()
+
             # Extract headers (fallback keys supported)
             headers = request.headers
             user_id = (
@@ -126,6 +135,14 @@ async def _invoke_compat(request: Request):
                         text = ""
             content = types.UserContent(parts=[types.Part(text=text or "")])
 
+            # trace request attributes on current span
+            telemetry.trace_agent_server(
+                func_name="_invoke_compat",
+                span=span,
+                headers=dict(headers),
+                text=text or "",
+            )
+
             # Ensure session exists
             session = await self.server.session_service.get_session(
                 app_name=app_name, user_id=user_id, session_id=session_id
@@ -154,8 +171,11 @@ async def event_generator():
                                 )
                                 + "\n\n"
                             )
+                    # finish span on successful end of stream handled by middleware
+                    pass
                 except Exception as e:
                     yield f'data: {{"error": "{str(e)}"}}\n\n'
+                    telemetry.trace_agent_server_finish(func_result="", exception=e)
 
             return StreamingResponse(
                 event_generator(),
diff --git a/agentkit/apps/agent_server_app/middleware.py b/agentkit/apps/agent_server_app/middleware.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable
+
+from opentelemetry import trace
+from opentelemetry import context as context_api
+
+from agentkit.apps.agent_server_app.telemetry import telemetry
+
+# Lower-cased names of request headers that carry credentials. They are
+# dropped before the remaining headers are attached to the request span.
+_EXCLUDED_HEADERS = {
+    "authorization",
+    "proxy-authorization",
+    "cookie",
+    "x-api-key",
+    "token",
+}
+
+
+class AgentkitTelemetryHTTPMiddleware:
+    """ASGI middleware that opens one telemetry span per HTTP request.
+
+    The span is made the current span while the wrapped app runs, so
+    downstream code can reach it through ``trace.get_current_span()``. It is
+    finished when the last response body message is sent or when the wrapped
+    app raises.
+    """
+
+    def __init__(self, app: Callable):
+        """Store the wrapped ASGI application."""
+        self.app = app
+
+    async def __call__(self, scope, receive, send):
+        """Handle one ASGI connection; only ``http`` scopes are traced."""
+        if scope["type"] != "http":
+            return await self.app(scope, receive, send)
+
+        method = scope.get("method", "")
+        path = scope.get("path", "")
+
+        # ASGI delivers headers as (bytes, bytes) pairs. Decode them and drop
+        # the excluded ones before anything is serialized onto the span.
+        headers = {}
+        for raw_key, raw_value in scope.get("headers", []):
+            key = raw_key.decode("latin-1")
+            if key.lower() not in _EXCLUDED_HEADERS:
+                headers[key] = raw_value.decode("latin-1")
+
+        span = telemetry.tracer.start_span(name="agent_server_request")
+        # Keep the token so the previous context can be restored afterwards;
+        # attaching without detaching leaks the span context past the request.
+        token = context_api.attach(trace.set_span_in_context(span))
+        try:
+            # Currently unable to retrieve user_id and session_id from headers; keep logic for future use
+            user_id = headers.get("user_id") or headers.get("x-user-id") or ""
+            session_id = headers.get("session_id") or headers.get("x-session-id") or ""
+            headers["user_id"] = user_id
+            headers["session_id"] = session_id
+            telemetry.trace_agent_server(
+                func_name=f"{method} {path}",
+                span=span,
+                headers=headers,
+                text="",  # do not consume body in middleware
+            )
+
+            async def send_wrapper(message):
+                """Forward ``message`` and finish the span on the last body chunk."""
+                try:
+                    if message.get("type") == "http.response.body":
+                        more_body = message.get("more_body", False)
+                        if not more_body:
+                            telemetry.trace_agent_server_finish(
+                                path=path, func_result="", exception=None
+                            )
+                    elif message.get("type") == "http.response.start":
+                        # could record status code if needed
+                        pass
+                finally:
+                    # Always forward the message, even if telemetry failed.
+                    await send(message)
+
+            try:
+                await self.app(scope, receive, send_wrapper)
+            except Exception as e:
+                telemetry.trace_agent_server_finish(
+                    path=path, func_result="", exception=e
+                )
+                raise
+        finally:
+            # The app may exit without a final body message and without an
+            # Exception (e.g. cancellation); make sure the span is still ended.
+            if span.is_recording():
+                span.end()
+            context_api.detach(token)
diff --git a/agentkit/apps/agent_server_app/telemetry.py b/agentkit/apps/agent_server_app/telemetry.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import time
+from typing import Optional
+
+from opentelemetry import trace
+from opentelemetry.trace import get_tracer
+from opentelemetry.metrics import get_meter
+from opentelemetry.trace.span import Span
+
+from agentkit.apps.utils import safe_serialize_to_json_string
+
+_INVOKE_PATH = ["/run_sse", "/run", "/invoke"]  # request paths whose latency is recorded in the histogram
+
+_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS = [  # advisory bucket boundaries for the latency histogram (unit "s"); each value doubles the previous
+    0.01,
+    0.02,
+    0.04,
+    0.08,
+    0.16,
+    0.32,
+    0.64,
+    1.28,
+    2.56,
+    5.12,
+    10.24,
+    20.48,
+    40.96,
+    81.92,
+    163.84,
+]
+
+logger = logging.getLogger("agentkit." + __name__)
+
+
+class Telemetry:
+    """Tracing and metrics helpers for the agent server app.
+
+    Holds the tracer, the meter and the latency histogram used to annotate
+    and finish request spans.
+    """
+
+    def __init__(self):
+        """Create the tracer, the meter and the operation latency histogram."""
+        self.tracer = get_tracer("agentkit.agent_server_app")
+        self.meter = get_meter("agentkit.agent_server_app")
+        self.latency_histogram = self.meter.create_histogram(
+            name="agentkit_runtime_operation_latency",
+            description="operation latency",
+            unit="s",
+            explicit_bucket_boundaries_advisory=_GEN_AI_CLIENT_OPERATION_DURATION_BUCKETS,
+        )
+
+    def trace_agent_server(
+        self,
+        func_name: str,
+        span: Span,
+        headers: dict,
+        text: str,
+    ) -> None:
+        """Set the request attributes on ``span``.
+
+        Args:
+            func_name: Value stored as ``gen_ai.func_name``.
+            span: Span that receives the attributes.
+            headers: Request headers; serialized to JSON and also searched
+                for the session and user ids.
+            text: Request input. Accepted but not recorded at the moment.
+        """
+        span.set_attribute(key="gen_ai.system", value="agentkit")
+        span.set_attribute(key="gen_ai.func_name", value=func_name)
+
+        span.set_attribute(
+            key="gen_ai.request.headers",
+            value=safe_serialize_to_json_string(headers),
+        )
+
+        # The ids are only attached when a non-empty value is present.
+        session_id = headers.get("session_id") or headers.get("x-session-id") or ""
+        if session_id:
+            span.set_attribute(key="gen_ai.session.id", value=session_id)
+        user_id = headers.get("user_id") or headers.get("x-user-id") or ""
+        if user_id:
+            span.set_attribute(key="gen_ai.user.id", value=user_id)
+
+        # Currently unable to retrieve input
+        # span.set_attribute(
+        #     key="gen_ai.input", value=safe_serialize_to_json_string(text)
+        # )
+
+        span.set_attribute(key="gen_ai.span.kind", value="agent_server")
+        span.set_attribute(key="gen_ai.operation.name", value="invoke_agent")
+        span.set_attribute(key="gen_ai.operation.type", value="agent_server")
+
+    def trace_agent_server_finish(
+        self,
+        path: str = "",
+        func_result: str = "",
+        exception: Optional[Exception] = None,
+    ) -> None:
+        """Finish the current span, recording any error and the latency.
+
+        Does nothing when there is no current span or it is no longer
+        recording, so calling this twice for one request is harmless.
+
+        Args:
+            path: Request path. Latency is only recorded when it is one of
+                ``_INVOKE_PATH``. Defaults to ``""`` so callers that do not
+                know the path can still finish the span.
+            func_result: Handler output. Accepted but not recorded at the moment.
+            exception: Error raised while handling the request, if any.
+        """
+        span = trace.get_current_span()
+        if span and span.is_recording():
+            # Currently unable to retrieve output
+            # span.set_attribute(key="gen_ai.output", value=func_result)
+
+            attributes = {
+                "gen_ai_operation_name": "invoke_agent",
+                "gen_ai_operation_type": "agent_server",
+            }
+            if exception:
+                self.handle_exception(span, exception)
+                attributes["error_type"] = exception.__class__.__name__
+
+            # only record invoke request latency metrics
+            if hasattr(span, "start_time") and self.latency_histogram and path in _INVOKE_PATH:
+                # start_time and time_ns() are in nanoseconds; convert to seconds.
+                duration = (time.time_ns() - span.start_time) / 1e9  # type: ignore
+                self.latency_histogram.record(duration, attributes)
+            span.end()
+
+    @staticmethod
+    def handle_exception(span: trace.Span, exception: Exception) -> None:
+        """Mark ``span`` as failed and record ``exception`` on it."""
+        status = trace.Status(
+            status_code=trace.StatusCode.ERROR,
+            description=f"{type(exception).__name__}: {exception}",
+        )
+        span.set_status(status)
+        span.record_exception(exception)
+
+
+telemetry = Telemetry()