From 1c8d0622c1d80759c162fd4b2778cd73b553e297 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 23 Sep 2025 08:59:48 +0200 Subject: [PATCH 1/9] perf: add telemetry --- app.py | 10 ++- requirements.txt | 152 +++++++++++++++++++++++++---------- requirements/requirements.in | 9 ++- util/telemetry.py | 139 ++++++++++++++++++++++++++++++++ 4 files changed, 266 insertions(+), 44 deletions(-) create mode 100644 util/telemetry.py diff --git a/app.py b/app.py index ad21e86f..f18c6c43 100644 --- a/app.py +++ b/app.py @@ -1,4 +1,5 @@ import logging +import os from fastapi import FastAPI from starlette.middleware import Middleware @@ -62,7 +63,7 @@ PREFIX_TASK_EXECUTION, PREFIX_PLAYGROUND, ) -from util import security, clean_up +from util import security, clean_up, telemetry from middleware import log_storage from submodules.model import session from controller.sums_table import manager as sums_table_manager @@ -70,9 +71,14 @@ logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger(__name__) +OTLP_GRPC_ENDPOINT = os.getenv("OTLP_GRPC_ENDPOINT", "tempo:4317") + init_config() migrate_kratos_users() fastapi_app = FastAPI() +telemetry.setting_otlp( + fastapi_app, app_name="refinery-gateway", endpoint=OTLP_GRPC_ENDPOINT +) fastapi_app.include_router( @@ -141,6 +147,8 @@ fastapi_app.middleware("http")(handle_db_session) +fastapi_app.add_middleware(telemetry.PrometheusMiddleware, app_name="refinery-gateway") +fastapi_app.add_route("/metrics", telemetry.metrics) middleware = [Middleware(DatabaseSessionHandler)] app = Starlette(routes=routes, middleware=middleware) diff --git a/requirements.txt b/requirements.txt index ab3d54b1..bada940f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,29 +8,31 @@ alembic==1.7.1 # via -r requirements/requirements.in annotated-types==0.7.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # pydantic anyio==4.9.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # httpx # openai # starlette argon2-cffi==25.1.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # minio argon2-cffi-bindings==21.2.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # argon2-cffi +asgiref==3.9.1 + # via opentelemetry-instrumentation-asgi blis==0.7.11 # via thinc boto3==1.39.6 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt botocore==1.39.10 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # boto3 # s3transfer catalogue==2.0.10 @@ -40,25 +42,25 @@ catalogue==2.0.10 # thinc certifi==2025.7.14 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # httpcore # httpx # minio # requests cffi==1.17.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # argon2-cffi-bindings charset-normalizer==3.4.2 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # requests click==8.2.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # typer # uvicorn -cloudpathlib==0.21.1 +cloudpathlib==0.22.0 # via weasel confection==0.1.5 # via @@ -76,10 +78,14 @@ docker==7.1.0 et-xmlfile==2.0.0 # via openpyxl fastapi==0.116.1 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt +googleapis-common-protos==1.70.0 + # via opentelemetry-exporter-otlp-proto-grpc +grpcio==1.75.0 + # via opentelemetry-exporter-otlp-proto-grpc h11==0.16.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # httpcore # uvicorn httpcore==1.0.9 @@ -88,15 +94,17 @@ httpx==0.28.1 # via openai idna==3.10 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # anyio # httpx # requests +importlib-metadata==8.7.0 + # via opentelemetry-api jinja2==3.1.6 # via spacy jmespath==1.0.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # boto3 # botocore langcodes==3.5.0 @@ -105,9 +113,9 @@ language-data==1.3.0 # via langcodes mako==1.3.10 # via alembic -marisa-trie==1.2.1 +marisa-trie==1.3.1 # via language-data -markdown-it-py==3.0.0 +markdown-it-py==4.0.0 # via rich markupsafe==3.0.2 # via @@ -116,7 +124,7 @@ markupsafe==3.0.2 mdurl==0.1.2 # via markdown-it-py minio==7.2.15 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt murmurhash==1.0.13 # via # preshed @@ -124,7 +132,7 @@ murmurhash==1.0.13 # thinc numpy==1.23.4 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # blis # pandas # spacy @@ -133,31 +141,82 @@ openai==1.31.0 # via -r requirements/requirements.in openpyxl==3.0.10 # via -r requirements/requirements.in +opentelemetry-api==1.37.0 + # via + # -r requirements/requirements.in + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-instrumentation + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-logging + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp-proto-common==1.37.0 + # via opentelemetry-exporter-otlp-proto-grpc +opentelemetry-exporter-otlp-proto-grpc==1.37.0 + # via -r requirements/requirements.in +opentelemetry-instrumentation==0.58b0 + # via + # -r requirements/requirements.in + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-logging +opentelemetry-instrumentation-asgi==0.58b0 + # via opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-fastapi==0.58b0 + # via -r requirements/requirements.in +opentelemetry-instrumentation-logging==0.58b0 + # via -r requirements/requirements.in +opentelemetry-proto==1.37.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc +opentelemetry-sdk==1.37.0 + # via + # -r requirements/requirements.in + # opentelemetry-exporter-otlp-proto-grpc +opentelemetry-semantic-conventions==0.58b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-fastapi + # opentelemetry-sdk +opentelemetry-util-http==0.58b0 + # via + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-fastapi packaging==25.0 # via + # opentelemetry-instrumentation # spacy # thinc # weasel pandas==1.5.1 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt preshed==3.0.10 # via # spacy # thinc +prometheus-client==0.23.1 + # via -r requirements/requirements.in +protobuf==6.32.1 + # via + # googleapis-common-protos + # opentelemetry-proto psycopg2-binary==2.9.9 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt pycparser==2.22 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # cffi pycryptodome==3.23.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # minio # rncryptor pydantic==2.7.4 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # -r requirements/requirements.in # confection # fastapi @@ -167,7 +226,7 @@ pydantic==2.7.4 # weasel pydantic-core==2.18.4 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # pydantic pygments==2.19.2 # via rich @@ -177,38 +236,38 @@ pyminizip==0.2.6 # via -r requirements/requirements.in python-dateutil==2.9.0.post0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # botocore # pandas pytz==2025.2 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # pandas requests==2.32.4 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # docker # spacy # weasel -rich==14.0.0 +rich==14.1.0 # via typer rncryptor==3.3.0 # via -r requirements/requirements.in s3transfer==0.13.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # boto3 shellingham==1.5.4 # via typer six==1.17.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # python-dateutil -smart-open==7.3.0.post1 +smart-open==7.3.1 # via weasel sniffio==1.3.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # anyio # openai spacy[ja]==3.7.5 @@ -219,7 +278,7 @@ spacy-loggers==1.0.5 # via spacy sqlalchemy==1.4.42 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # alembic srsly==2.5.1 # via @@ -229,9 +288,9 @@ srsly==2.5.1 # weasel starlette==0.47.2 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # fastapi -sudachidict-core==20250515 +sudachidict-core==20250825 # via spacy sudachipy==0.6.10 # via @@ -243,30 +302,35 @@ tqdm==4.67.1 # via # openai # spacy -typer==0.16.0 +typer==0.19.1 # via # spacy # weasel typing-extensions==4.14.1 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # anyio # fastapi + # grpcio # minio # openai + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-sdk + # opentelemetry-semantic-conventions # pydantic # pydantic-core # starlette # typer urllib3==2.5.0 # via - # -r requirements/common-requirements.txt + # -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt # botocore # docker # minio # requests uvicorn==0.35.0 - # via -r requirements/common-requirements.txt + # via -r /Users/andhrelja/Projects/refinery-gateway/requirements/common-requirements.txt wasabi==1.1.3 # via # spacy @@ -274,8 +338,12 @@ wasabi==1.1.3 # weasel weasel==0.4.1 # via spacy -wrapt==1.17.2 - # via smart-open +wrapt==1.17.3 + # via + # opentelemetry-instrumentation + # smart-open +zipp==3.23.0 + # via importlib-metadata # The following packages are considered to be unsafe in a requirements file: # setuptools diff --git a/requirements/requirements.in b/requirements/requirements.in index d16840f2..d8e47458 100644 --- a/requirements/requirements.in +++ b/requirements/requirements.in @@ -10,4 +10,11 @@ spacy[ja]==3.7.5 pyminizip==0.2.6 rncryptor==3.3.0 pydantic==2.7.4 -openai==1.31.0 \ No newline at end of file +openai==1.31.0 +opentelemetry-api==1.37.0 +opentelemetry-sdk==1.37.0 +opentelemetry-exporter-otlp-proto-grpc==1.37.0 +opentelemetry-instrumentation==0.58b0 +opentelemetry-instrumentation-fastapi==0.58b0 +opentelemetry-instrumentation-logging==0.58b0 +prometheus-client==0.23.1 \ No newline at end of file diff --git a/util/telemetry.py b/util/telemetry.py new file mode 100644 index 00000000..d06a95ff --- /dev/null +++ b/util/telemetry.py @@ -0,0 +1,139 @@ +import time +from typing import Tuple + +from opentelemetry import trace +from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter +from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor +from opentelemetry.instrumentation.logging import LoggingInstrumentor +from opentelemetry.sdk.resources import Resource +from opentelemetry.sdk.trace import TracerProvider +from opentelemetry.sdk.trace.export import BatchSpanProcessor +from prometheus_client import REGISTRY, Counter, Gauge, Histogram +from prometheus_client.openmetrics.exposition import ( + CONTENT_TYPE_LATEST, + generate_latest, +) +from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint +from starlette.requests import Request +from starlette.responses import Response +from starlette.routing import Match +from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR +from starlette.types import ASGIApp + +INFO = Gauge("fastapi_app_info", "FastAPI application information.", ["app_name"]) +REQUESTS = Counter( + "fastapi_requests_total", + "Total count of requests by method and path.", + ["method", "path", "app_name"], +) +RESPONSES = Counter( + "fastapi_responses_total", + "Total count of responses by method, path and status codes.", + ["method", "path", "status_code", "app_name"], +) +REQUESTS_PROCESSING_TIME = Histogram( + "fastapi_requests_duration_seconds", + "Histogram of requests processing time by path (in seconds)", + ["method", "path", "app_name"], +) +EXCEPTIONS = Counter( + "fastapi_exceptions_total", + "Total count of exceptions raised by path and exception type", + ["method", "path", "exception_type", "app_name"], +) +REQUESTS_IN_PROGRESS = Gauge( + "fastapi_requests_in_progress", + "Gauge of requests by method and path currently being processed", + ["method", "path", "app_name"], +) + + +class PrometheusMiddleware(BaseHTTPMiddleware): + def __init__(self, app: ASGIApp, app_name: str = "fastapi-app") -> None: + super().__init__(app) + self.app_name = app_name + INFO.labels(app_name=self.app_name).inc() + + async def dispatch( + self, request: Request, call_next: RequestResponseEndpoint + ) -> Response: + method = request.method + path, is_handled_path = self.get_path(request) + + if not is_handled_path: + return await call_next(request) + + REQUESTS_IN_PROGRESS.labels( + method=method, path=path, app_name=self.app_name + ).inc() + REQUESTS.labels(method=method, path=path, app_name=self.app_name).inc() + before_time = time.perf_counter() + try: + response = await call_next(request) + except BaseException as e: + status_code = HTTP_500_INTERNAL_SERVER_ERROR + EXCEPTIONS.labels( + method=method, + path=path, + exception_type=type(e).__name__, + app_name=self.app_name, + ).inc() + raise e from None + else: + status_code = response.status_code + after_time = time.perf_counter() + # retrieve trace id for exemplar + span = trace.get_current_span() + trace_id = trace.format_trace_id(span.get_span_context().trace_id) + + REQUESTS_PROCESSING_TIME.labels( + method=method, path=path, app_name=self.app_name + ).observe(after_time - before_time, exemplar={"TraceID": trace_id}) + finally: + RESPONSES.labels( + method=method, + path=path, + status_code=status_code, + app_name=self.app_name, + ).inc() + REQUESTS_IN_PROGRESS.labels( + method=method, path=path, app_name=self.app_name + ).dec() + + return response + + @staticmethod + def get_path(request: Request) -> Tuple[str, bool]: + for route in request.app.routes: + match, child_scope = route.matches(request.scope) + if match == Match.FULL: + return route.path, True + + return request.url.path, False + + +def metrics(request: Request) -> Response: + return Response( + generate_latest(REGISTRY), headers={"Content-Type": CONTENT_TYPE_LATEST} + ) + + +def setting_otlp( + app: ASGIApp, app_name: str, endpoint: str, log_correlation: bool = True +) -> None: + # Setting OpenTelemetry + # set the service name to show in traces + resource = Resource.create(attributes={"service.name": app_name}) + + # set the tracer provider + tracer = TracerProvider(resource=resource) + trace.set_tracer_provider(tracer) + + tracer.add_span_processor( + BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint, insecure=True)) + ) + + if log_correlation: + LoggingInstrumentor().instrument(set_logging_format=True) + + FastAPIInstrumentor.instrument_app(app, tracer_provider=tracer) From 2ed26986986a5262f15d614ee0e6eb97e5a71f57 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 23 Sep 2025 21:20:26 +0200 Subject: [PATCH 2/9] chore: update submodules --- submodules/model | 2 +- util/telemetry.py | 139 ---------------------------------------------- 2 files changed, 1 insertion(+), 140 deletions(-) delete mode 100644 util/telemetry.py diff --git a/submodules/model b/submodules/model index cb5bab59..721d5499 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit cb5bab5995cd6d423880e01696bf712a214b1a98 +Subproject commit 721d54999b715bcbb41636079116d93b05c41f1b diff --git a/util/telemetry.py b/util/telemetry.py deleted file mode 100644 index d06a95ff..00000000 --- a/util/telemetry.py +++ /dev/null @@ -1,139 +0,0 @@ -import time -from typing import Tuple - -from opentelemetry import trace -from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter -from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor -from opentelemetry.instrumentation.logging import LoggingInstrumentor -from opentelemetry.sdk.resources import Resource -from opentelemetry.sdk.trace import TracerProvider -from opentelemetry.sdk.trace.export import BatchSpanProcessor -from prometheus_client import REGISTRY, Counter, Gauge, Histogram -from prometheus_client.openmetrics.exposition import ( - CONTENT_TYPE_LATEST, - generate_latest, -) -from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint -from starlette.requests import Request -from starlette.responses import Response -from starlette.routing import Match -from starlette.status import HTTP_500_INTERNAL_SERVER_ERROR -from starlette.types import ASGIApp - -INFO = Gauge("fastapi_app_info", "FastAPI application information.", ["app_name"]) -REQUESTS = Counter( - "fastapi_requests_total", - "Total count of requests by method and path.", - ["method", "path", "app_name"], -) -RESPONSES = Counter( - "fastapi_responses_total", - "Total count of responses by method, path and status codes.", - ["method", "path", "status_code", "app_name"], -) -REQUESTS_PROCESSING_TIME = Histogram( - "fastapi_requests_duration_seconds", - "Histogram of requests processing time by path (in seconds)", - ["method", "path", "app_name"], -) -EXCEPTIONS = Counter( - "fastapi_exceptions_total", - "Total count of exceptions raised by path and exception type", - ["method", "path", "exception_type", "app_name"], -) -REQUESTS_IN_PROGRESS = Gauge( - "fastapi_requests_in_progress", - "Gauge of requests by method and path currently being processed", - ["method", "path", "app_name"], -) - - -class PrometheusMiddleware(BaseHTTPMiddleware): - def __init__(self, app: ASGIApp, app_name: str = "fastapi-app") -> None: - super().__init__(app) - self.app_name = app_name - INFO.labels(app_name=self.app_name).inc() - - async def dispatch( - self, request: Request, call_next: RequestResponseEndpoint - ) -> Response: - method = request.method - path, is_handled_path = self.get_path(request) - - if not is_handled_path: - return await call_next(request) - - REQUESTS_IN_PROGRESS.labels( - method=method, path=path, app_name=self.app_name - ).inc() - REQUESTS.labels(method=method, path=path, app_name=self.app_name).inc() - before_time = time.perf_counter() - try: - response = await call_next(request) - except BaseException as e: - status_code = HTTP_500_INTERNAL_SERVER_ERROR - EXCEPTIONS.labels( - method=method, - path=path, - exception_type=type(e).__name__, - app_name=self.app_name, - ).inc() - raise e from None - else: - status_code = response.status_code - after_time = time.perf_counter() - # retrieve trace id for exemplar - span = trace.get_current_span() - trace_id = trace.format_trace_id(span.get_span_context().trace_id) - - REQUESTS_PROCESSING_TIME.labels( - method=method, path=path, app_name=self.app_name - ).observe(after_time - before_time, exemplar={"TraceID": trace_id}) - finally: - RESPONSES.labels( - method=method, - path=path, - status_code=status_code, - app_name=self.app_name, - ).inc() - REQUESTS_IN_PROGRESS.labels( - method=method, path=path, app_name=self.app_name - ).dec() - - return response - - @staticmethod - def get_path(request: Request) -> Tuple[str, bool]: - for route in request.app.routes: - match, child_scope = route.matches(request.scope) - if match == Match.FULL: - return route.path, True - - return request.url.path, False - - -def metrics(request: Request) -> Response: - return Response( - generate_latest(REGISTRY), headers={"Content-Type": CONTENT_TYPE_LATEST} - ) - - -def setting_otlp( - app: ASGIApp, app_name: str, endpoint: str, log_correlation: bool = True -) -> None: - # Setting OpenTelemetry - # set the service name to show in traces - resource = Resource.create(attributes={"service.name": app_name}) - - # set the tracer provider - tracer = TracerProvider(resource=resource) - trace.set_tracer_provider(tracer) - - tracer.add_span_processor( - BatchSpanProcessor(OTLPSpanExporter(endpoint=endpoint, insecure=True)) - ) - - if log_correlation: - LoggingInstrumentor().instrument(set_logging_format=True) - - FastAPIInstrumentor.instrument_app(app, tracer_provider=tracer) From 70cd18d81ef74c6489ebd7fad98e48072441ea67 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 23 Sep 2025 21:20:34 +0200 Subject: [PATCH 3/9] perf: add telemetry --- app.py | 16 ++++++++++++++-- start | 2 ++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index f18c6c43..905d1b0d 100644 --- a/app.py +++ b/app.py @@ -63,9 +63,9 @@ PREFIX_TASK_EXECUTION, PREFIX_PLAYGROUND, ) -from util import security, clean_up, telemetry +from util import security, clean_up from middleware import log_storage -from submodules.model import session +from submodules.model import session, telemetry from controller.sums_table import manager as sums_table_manager logging.basicConfig(level=logging.DEBUG) @@ -122,7 +122,15 @@ fastapi_app.include_router( playground_router, prefix=PREFIX_PLAYGROUND, tags=["playground"] ) + + fastapi_app_internal = FastAPI() +telemetry.setting_otlp( + fastapi_app_internal, + app_name="refinery-gateway-internal", + endpoint=OTLP_GRPC_ENDPOINT, +) + fastapi_app_internal.include_router( task_execution_router, prefix=PREFIX_TASK_EXECUTION, tags=["task-execution"] ) @@ -149,6 +157,10 @@ fastapi_app.middleware("http")(handle_db_session) fastapi_app.add_middleware(telemetry.PrometheusMiddleware, app_name="refinery-gateway") fastapi_app.add_route("/metrics", telemetry.metrics) +fastapi_app_internal.add_route("/metrics", telemetry.metrics) +fastapi_app_internal.add_middleware( + telemetry.PrometheusMiddleware, app_name="refinery-gateway-internal" +) middleware = [Middleware(DatabaseSessionHandler)] app = Starlette(routes=routes, middleware=middleware) diff --git a/start b/start index f89f56f8..2bc8b2fb 100755 --- a/start +++ b/start @@ -126,6 +126,8 @@ docker run -d --rm \ --mount type=bind,source="$(pwd)"/,target=/app \ -v /var/run/docker.sock:/var/run/docker.sock \ --network dev-setup_default \ +--log-driver=loki \ +--log-opt loki-url="http://$HOST_IP:3100/loki/api/v1/push" \ graphql-dev $CMD > /dev/null 2>&1 echo -ne '\t\t\t [done]\n' From b893dfdaba49a262401a081773aff4105c9ddce9 Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 24 Sep 2025 23:56:59 +0200 Subject: [PATCH 4/9] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 721d5499..74c62671 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 721d54999b715bcbb41636079116d93b05c41f1b +Subproject commit 74c6267117ea5fc56823c5cd270ce8ea2622fc43 From d5a6db18096377aa3a0e6343dea6a97ce34f755f Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Thu, 25 Sep 2025 01:34:23 +0200 Subject: [PATCH 5/9] fix: telemetry --- app.py | 45 ++++++++++++++++++++++++++++----------------- start | 13 ++++++++----- 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/app.py b/app.py index 905d1b0d..22d9331e 100644 --- a/app.py +++ b/app.py @@ -75,11 +75,9 @@ init_config() migrate_kratos_users() -fastapi_app = FastAPI() -telemetry.setting_otlp( - fastapi_app, app_name="refinery-gateway", endpoint=OTLP_GRPC_ENDPOINT -) +app_name = "refinery-gateway" +fastapi_app = FastAPI(title=app_name) fastapi_app.include_router( org_router, prefix=PREFIX_ORGANIZATION, tags=["organization"] @@ -123,13 +121,8 @@ playground_router, prefix=PREFIX_PLAYGROUND, tags=["playground"] ) - -fastapi_app_internal = FastAPI() -telemetry.setting_otlp( - fastapi_app_internal, - app_name="refinery-gateway-internal", - endpoint=OTLP_GRPC_ENDPOINT, -) +app_name_internal = app_name + "-i" +fastapi_app_internal = FastAPI(title=app_name_internal) fastapi_app_internal.include_router( task_execution_router, prefix=PREFIX_TASK_EXECUTION, tags=["task-execution"] @@ -155,12 +148,30 @@ fastapi_app.middleware("http")(handle_db_session) -fastapi_app.add_middleware(telemetry.PrometheusMiddleware, app_name="refinery-gateway") -fastapi_app.add_route("/metrics", telemetry.metrics) -fastapi_app_internal.add_route("/metrics", telemetry.metrics) -fastapi_app_internal.add_middleware( - telemetry.PrometheusMiddleware, app_name="refinery-gateway-internal" -) + +if telemetry.ENABLE_TELEMETRY: + print("WARNING: Running telemetry.", flush=True) + telemetry.setting_otlp(fastapi_app, app_name=app_name, endpoint=OTLP_GRPC_ENDPOINT) + fastapi_app.add_middleware(telemetry.PrometheusMiddleware, app_name=app_name) + fastapi_app.add_route("/metrics", telemetry.metrics) + + # -------- internal -------- + app_name += "-i" + telemetry.setting_otlp( + fastapi_app_internal, app_name=app_name_internal, endpoint=OTLP_GRPC_ENDPOINT + ) + fastapi_app_internal.add_middleware( + telemetry.PrometheusMiddleware, app_name=app_name_internal + ) + fastapi_app_internal.add_route("/metrics", telemetry.metrics) + + # Filter out /metrics + logging.getLogger("uvicorn.access").addFilter( + lambda record: not any( + item in record.getMessage() + for item in ("GET /api/metrics", "GET /internal/api/metrics") + ) + ) middleware = [Middleware(DatabaseSessionHandler)] app = Starlette(routes=routes, middleware=middleware) diff --git a/start b/start index 2bc8b2fb..36bb6a29 100755 --- a/start +++ b/start @@ -2,11 +2,13 @@ DEBUG_MODE=false DEBUG_PORT=15670 +ENABLE_TELEMETRY=false -while getopts d flag +while getopts dg flag do case "${flag}" in d) DEBUG_MODE=true;; + g) ENABLE_TELEMETRY=true;; esac done @@ -58,13 +60,13 @@ fi MINIO_ENDPOINT="http://$HOST_IP:7053" -INFERENCE_DIR=${DEV_SETUP_DIR}inference/ -LOG_DIR=${DEV_SETUP_DIR}logs/ +INFERENCE_DIR=${DEV_SETUP_DIR}etc/inference/ +LOG_DIR=${DEV_SETUP_DIR}etc/logs/ -CONFIG_DIR=${PWD%/*}/dev-setup/config/ +CONFIG_DIR=${PWD%/*}/dev-setup/etc/config/ if [ ! -d "$CONFIG_DIR" ] then - CONFIG_DIR=${PWD%/*/*}/dev-setup/config/ + CONFIG_DIR=${PWD%/*/*}/dev-setup/etc/config/ if [ ! -d "$CONFIG_DIR" ] then # to include volume for local development, use the dev-setup inference folder: @@ -120,6 +122,7 @@ docker run -d --rm \ -e KERN_S3_ENDPOINT=${MINIO_ENDPOINT} \ -e SMTP_HOST=mailhog \ -e SMTP_PORT=1025 \ +-e ENABLE_TELEMETRY=$ENABLE_TELEMETRY \ -v "$INFERENCE_DIR":/inference \ -v "$LOG_DIR":/logs \ -v "$CONFIG_DIR":/config \ From 0ac1b428d9e32d1fdbbb976f1e77bafe37f0dc7a Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Wed, 15 Oct 2025 19:17:27 +0200 Subject: [PATCH 6/9] fix: hanging docker stop --- start | 2 -- 1 file changed, 2 deletions(-) diff --git a/start b/start index 36bb6a29..d4b5b1a5 100755 --- a/start +++ b/start @@ -129,8 +129,6 @@ docker run -d --rm \ --mount type=bind,source="$(pwd)"/,target=/app \ -v /var/run/docker.sock:/var/run/docker.sock \ --network dev-setup_default \ ---log-driver=loki \ ---log-opt loki-url="http://$HOST_IP:3100/loki/api/v1/push" \ graphql-dev $CMD > /dev/null 2>&1 echo -ne '\t\t\t [done]\n' From 6d41ead0029d775fd94a6ae6c3a1c7b2f937f83e Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 21 Oct 2025 14:32:33 +0200 Subject: [PATCH 7/9] chore: update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 7fecf20e..687078b4 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 7fecf20e1485e804c8faa1b8b0373abe3d34b1fd +Subproject commit 687078b4b48a86324d968b2116044845aad646b6 From b4de01d5fda95094685bde85cf948835826591bd Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 21 Oct 2025 14:32:41 +0200 Subject: [PATCH 8/9] perf: enhance monitoring --- app.py | 1 + 1 file changed, 1 insertion(+) diff --git a/app.py b/app.py index 22d9331e..a5998a7d 100644 --- a/app.py +++ b/app.py @@ -151,6 +151,7 @@ if telemetry.ENABLE_TELEMETRY: print("WARNING: Running telemetry.", flush=True) + telemetry.setting_app_name(app_name) telemetry.setting_otlp(fastapi_app, app_name=app_name, endpoint=OTLP_GRPC_ENDPOINT) fastapi_app.add_middleware(telemetry.PrometheusMiddleware, app_name=app_name) fastapi_app.add_route("/metrics", telemetry.metrics) From 88ac2629932e8c08a49a3a092507356305bfcb1e Mon Sep 17 00:00:00 2001 From: andhreljaKern Date: Tue, 21 Oct 2025 16:12:04 +0200 Subject: [PATCH 9/9] chore(opentelemetry): update submodules --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 687078b4..fa52e172 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 687078b4b48a86324d968b2116044845aad646b6 +Subproject commit fa52e1725d0691979895644d63c0b61728ea771b