Skip to content

Commit 774a466

Browse files
OpenTelemetry monitoring (#64)
* fix: hanging docker stop
* perf: enhance telemetry
1 parent 7c48310 commit 774a466

File tree

3 files changed

+47
-3
lines changed

3 files changed

+47
-3
lines changed

main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
if telemetry.ENABLE_TELEMETRY:
1313
print("WARNING: Running telemetry.", flush=True)
14+
telemetry.setting_app_name(app_name)
1415
telemetry.setting_otlp(app, app_name=app_name, endpoint=OTLP_GRPC_ENDPOINT)
1516
app.add_middleware(telemetry.PrometheusMiddleware, app_name=app_name)
1617
app.add_route("/metrics", telemetry.metrics)

start

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,6 @@ docker run -d --rm \
3333
-p $DEBUG_PORT:$DEBUG_PORT \
3434
-e ENABLE_TELEMETRY=$ENABLE_TELEMETRY \
3535
--network dev-setup_default \
36-
--log-driver=loki \
37-
--log-opt loki-url="http://$HOST_IP:3100/loki/api/v1/push" \
3836
refinery-authorizer-dev $CMD > /dev/null 2>&1
3937
echo -ne '\t\t\t [done]\n'
4038

telemetry.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR
2323
from starlette.types import ASGIApp
2424

25+
26+
APP_NAME = os.getenv("APP_NAME")
2527
ENABLE_TELEMETRY = os.getenv("ENABLE_TELEMETRY", "false") == "true"
2628

2729
INFO = Gauge("fastapi_app_info", "FastAPI application information.", ["app_name"])
@@ -50,6 +52,41 @@
5052
"Gauge of requests by method and path currently being processed",
5153
["method", "path", "app_name"],
5254
)
55+
TASKS_IN_PROGRESS = Gauge(
56+
"cognition_tasks_in_progress",
57+
"Indicates if the task master thread is running (1) or not (0)",
58+
["task_name", "app_name"],
59+
)
60+
TASKS_PROCESSED = Counter(
61+
"cognition_task_processed_total",
62+
"Total items processed by the task",
63+
["task_name", "app_name"],
64+
)
65+
TASKS_ERRORS = Counter(
66+
"cognition_task_errors_total",
67+
"Total errors encountered by the task",
68+
["task_name", "app_name"],
69+
)
70+
WEBSOCKET_EXTERNAL_SUCCESS = Counter(
71+
"cognition_websocket_external_success_total",
72+
"Total successful external websocket connections",
73+
["app_name", "org_id", "project_id"],
74+
)
75+
WEBSOCKET_EXTERNAL_FAILURE = Counter(
76+
"cognition_websocket_external_failure_total",
77+
"Total failed external websocket connections",
78+
["app_name", "org_id", "project_id"],
79+
)
80+
WEBSOCKET_INTERNAL_SUCCESS = Counter(
81+
"cognition_websocket_internal_success_total",
82+
"Total successful internal websocket connections",
83+
["app_name", "org_id", "project_id"],
84+
)
85+
WEBSOCKET_INTERNAL_FAILURE = Counter(
86+
"cognition_websocket_internal_failure_total",
87+
"Total failed internal websocket connections",
88+
["app_name", "org_id", "project_id"],
89+
)
5390

5491

5592
class PrometheusMiddleware(BaseHTTPMiddleware):
@@ -122,12 +159,20 @@ def metrics(request: Request) -> Response:
122159
)
123160

124161

162+
def setting_app_name(app_name: str) -> None:
    """Store ``app_name`` in the module-level ``APP_NAME`` if none is set yet.

    ``APP_NAME`` is initialised from the ``APP_NAME`` environment variable
    when this module is imported. A non-empty value from the environment is
    kept; ``app_name`` is only used as a fallback.

    Args:
        app_name: Name to use when the environment did not provide one.
    """
    global APP_NAME
    # os.getenv returns "" (not None) for a variable that is set but empty,
    # e.g. `docker run -e APP_NAME=$APP_NAME` with the shell variable unset.
    # Test for falsiness so an empty value still falls back to `app_name`.
    if not APP_NAME:
        APP_NAME = app_name
166+
167+
125168
def setting_otlp(
126169
app: ASGIApp, app_name: str, endpoint: str, log_correlation: bool = True
127170
) -> None:
128171
# Setting OpenTelemetry
129172
# set the service name to show in traces
130-
resource = Resource.create(attributes={"service.name": app_name})
173+
resource = Resource.create(
174+
attributes={"service.name": app_name, "compose_service": app_name}
175+
)
131176

132177
# set the tracer provider
133178
tracer = TracerProvider(resource=resource)

0 commit comments

Comments
 (0)