Skip to content

Commit 9e4a3a3

Browse files
authored
feat: add distributed tracing, prometheus and grafana (#4)
Adds the OpenTelemetry SDK for distributed tracing and ties it into logging by extending the log configuration with the trace ID and whether the trace was sampled. The docker-compose file has grown considerably, since the whole Jaeger ecosystem was imported; it needs some time to load Cassandra and come up. Locally, one can still use `make serve` and avoid the tracing ecosystem. * chore(ci): do not upload to codecov on test
1 parent 1c50f4c commit 9e4a3a3

File tree

15 files changed

+800
-755
lines changed

15 files changed

+800
-755
lines changed

.github/workflows/test.yml

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,3 @@ jobs:
4848

4949
- name: Run tests
5050
run: make test
51-
52-
- name: Upload coverage reports to Codecov
53-
if: inputs.upload-coverage
54-
uses: codecov/codecov-action@v3
55-
env:
56-
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,48 @@ The Docker setup mounts the local `./models` directory to `/models` inside the c
169169

170170
If no models are found when starting the container, you'll be prompted to download the small model automatically.
171171

172+
## Distributed Tracing
173+
174+
Babeltron supports distributed tracing with OpenTelemetry and Jaeger. The application is configured to send traces to the OpenTelemetry Collector, which forwards them to Jaeger.
175+
176+
### Configuration
177+
178+
Tracing can be configured using the following environment variables:
179+
180+
- `OTLP_MODE`: The OpenTelemetry protocol mode (`otlp-grpc` or `otlp-http`)
181+
- `OTEL_SERVICE_NAME`: The name of the service in traces (default: `babeltron`)
182+
- `OTLP_GRPC_ENDPOINT`: The endpoint for the OpenTelemetry Collector using gRPC (default: `otel-collector:4317`)
183+
- `OTLP_HTTP_ENDPOINT`: The endpoint for the OpenTelemetry Collector using HTTP (default: `http://otel-collector:4318/v1/traces`)
184+
185+
### Accessing Jaeger UI
186+
187+
When running with Docker Compose, you can access the Jaeger UI at:
188+
189+
```
190+
http://localhost:16686
191+
```
192+
193+
### Tracing Features
194+
195+
The distributed tracing implementation provides insights into:
196+
197+
- Request flow through the API
198+
- Detailed timing of translation steps:
199+
- Tokenization
200+
- Model inference
201+
- Decoding
202+
- Error details and context
203+
- Cross-service communication
204+
205+
### Disabling Tracing
206+
207+
To disable tracing, set the `OTLP_GRPC_ENDPOINT` environment variable to `disabled`:
208+
209+
```yaml
210+
environment:
211+
- OTLP_GRPC_ENDPOINT=disabled
212+
```
213+
172214
## Contributing
173215
174216
Install pre-commit hooks with `make pre-commit-install` and refer to the [CONTRIBUTING.md](docs/CONTRIBUTING.md) file for more information.

babeltron/app/main.py

Lines changed: 9 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,10 @@
1-
import os
2-
from contextlib import asynccontextmanager
31
from importlib.metadata import version
4-
from typing import AsyncIterator
52

63
from fastapi import FastAPI, Response
74
from fastapi.middleware.cors import CORSMiddleware
8-
from fastapi_cache import FastAPICache
9-
from fastapi_cache.backends.inmemory import InMemoryBackend
10-
from fastapi_cache.backends.redis import RedisBackend
11-
from redis import asyncio as aioredis
125

136
from babeltron.app.monitoring import PrometheusMiddleware, metrics_endpoint
7+
from babeltron.app.tracing import setup_jaeger
148
from babeltron.app.utils import include_routers
159

1610
try:
@@ -19,23 +13,6 @@
1913
__version__ = "0.1.0-dev"
2014

2115

22-
@asynccontextmanager
23-
async def lifespan(_: FastAPI) -> AsyncIterator[None]:
24-
cache_url = os.environ.get("CACHE_URL", "")
25-
26-
if cache_url.startswith("in-memory"):
27-
FastAPICache.init(InMemoryBackend(), prefix="babeltron")
28-
print("Using in-memory cache")
29-
elif cache_url.startswith("redis"):
30-
redis = aioredis.from_url(cache_url)
31-
FastAPICache.init(RedisBackend(redis), prefix="babeltron")
32-
print("Using Redis cache")
33-
else:
34-
print("No cache_url provided, not using cache")
35-
36-
yield
37-
38-
3916
app = FastAPI(
4017
title="Babeltron Translation API",
4118
description="API for machine translation using NLLB models",
@@ -52,7 +29,6 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]:
5229
docs_url="/docs",
5330
redoc_url="/redoc",
5431
openapi_url="/openapi.json",
55-
lifespan=lifespan,
5632
)
5733

5834
# Configure CORS
@@ -64,6 +40,9 @@ async def lifespan(_: FastAPI) -> AsyncIterator[None]:
6440
allow_headers=["*"], # Allows all headers
6541
)
6642

43+
# Set up Jaeger tracing
44+
setup_jaeger(app)
45+
6746
# Include all routers
6847
include_routers(app)
6948

@@ -80,4 +59,8 @@ async def metrics():
8059
if __name__ == "__main__":
8160
import uvicorn
8261

83-
uvicorn.run(app, host="0.0.0.0", port=8000)
62+
log_config = uvicorn.config.LOGGING_CONFIG
63+
log_config["formatters"]["access"][
64+
"fmt"
65+
] = "%(asctime)s %(levelname)s [%(name)s] [%(filename)s:%(lineno)d] [trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] - %(message)s"
66+
uvicorn.run(app, host="0.0.0.0", port=8000, log_config=log_config)

babeltron/app/routers/healthcheck.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ class HealthResponse(BaseModel):
1616
version: Optional[str] = None
1717

1818

19-
@router.get("/healthcheck", summary="Healthcheck")
2019
@router.get(
2120
"/healthz",
2221
summary="Check API health",
@@ -34,7 +33,6 @@ class ReadinessResponse(BaseModel):
3433
error: Optional[str] = None
3534

3635

37-
@router.get("/readiness", summary="Readiness Probe")
3836
@router.get(
3937
"/readyz",
4038
summary="Check API Readiness",

babeltron/app/routers/translate.py

Lines changed: 66 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,41 @@
1+
import logging
12
import os
3+
import time
24

35
import torch
46
from fastapi import APIRouter, HTTPException, status
5-
from fastapi_cache.decorator import cache
7+
from opentelemetry import trace
68
from pydantic import BaseModel, Field
79
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
810

911
from babeltron.app.monitoring import track_dynamic_translation_metrics
10-
from babeltron.app.utils import ORJsonCoder, cache_key_builder, get_model_path
12+
from babeltron.app.utils import get_model_path
1113

1214
router = APIRouter(tags=["Translation"])
1315

1416
MODEL_COMPRESSION_ENABLED = os.environ.get(
1517
"MODEL_COMPRESSION_ENABLED", "true"
1618
).lower() in ("true", "1", "yes")
17-
CACHE_TTL_SECONDS = int(os.environ.get("CACHE_TTL_SECONDS", "3600"))
1819

1920
try:
2021
MODEL_PATH = get_model_path()
21-
print(f"Loading model from: {MODEL_PATH}")
22+
logging.info(f"Loading model from: {MODEL_PATH}")
2223
model = M2M100ForConditionalGeneration.from_pretrained(MODEL_PATH)
2324

2425
# Apply FP16 compression if enabled and supported
2526
if MODEL_COMPRESSION_ENABLED and torch.cuda.is_available():
26-
print("Applying FP16 model compression")
27+
logging.info("Applying FP16 model compression")
2728
model = model.half() # Convert to FP16 precision
2829
model = model.to("cuda") # Move to GPU
2930
elif MODEL_COMPRESSION_ENABLED:
30-
print("FP16 compression enabled but GPU not available, using CPU")
31+
logging.info("FP16 compression enabled but GPU not available, using CPU")
3132
else:
32-
print("Model compression disabled")
33+
logging.info("Model compression disabled")
3334

3435
tokenizer = M2M100Tokenizer.from_pretrained(MODEL_PATH)
35-
print("Model loaded successfully")
36+
logging.info("Model loaded successfully")
3637
except Exception as e:
37-
print(f"Error loading model: {e}")
38+
logging.error(f"Error loading model: {e}")
3839
model = None
3940
tokenizer = None
4041

@@ -77,31 +78,75 @@ class TranslationResponse(BaseModel):
7778
response_description="The translated text in the target language",
7879
status_code=status.HTTP_200_OK,
7980
)
80-
@cache(expire=CACHE_TTL_SECONDS, key_builder=cache_key_builder, coder=ORJsonCoder)
8181
@track_dynamic_translation_metrics()
8282
async def translate(request: TranslationRequest):
83+
# Get current span from context
84+
current_span = trace.get_current_span()
85+
# Add request attributes to the current span
86+
current_span.set_attribute("src_lang", request.src_lang)
87+
current_span.set_attribute("tgt_lang", request.tgt_lang)
88+
current_span.set_attribute("text_length", len(request.text))
89+
90+
logging.info(f"Translating text from {request.src_lang} to {request.tgt_lang}")
91+
8392
if model is None or tokenizer is None:
93+
current_span.set_attribute("error", "model_not_loaded")
94+
logging.error("Translation model not loaded")
8495
raise HTTPException(
8596
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
8697
detail="Translation model not loaded. Please check server logs.",
8798
)
8899

89100
try:
90-
tokenizer.src_lang = request.src_lang
91-
encoded_text = tokenizer(request.text, return_tensors="pt")
101+
tracer = trace.get_tracer(__name__)
102+
103+
with tracer.start_as_current_span("tokenization") as tokenize_span:
104+
start_time = time.time()
105+
tokenizer.src_lang = request.src_lang
106+
encoded_text = tokenizer(request.text, return_tensors="pt")
107+
tokenize_span.set_attribute(
108+
"token_count", encoded_text["input_ids"].shape[1]
109+
)
110+
tokenize_span.set_attribute(
111+
"duration_ms", (time.time() - start_time) * 1000
112+
)
92113

93-
# Move input to GPU if model is on GPU
94114
if torch.cuda.is_available() and next(model.parameters()).is_cuda:
95-
encoded_text = {k: v.to("cuda") for k, v in encoded_text.items()}
96-
97-
generated_tokens = model.generate(
98-
**encoded_text, forced_bos_token_id=tokenizer.get_lang_id(request.tgt_lang)
99-
)
100-
translation = tokenizer.batch_decode(
101-
generated_tokens, skip_special_tokens=True
102-
)[0]
115+
with tracer.start_as_current_span("move_to_gpu") as gpu_span:
116+
start_time = time.time()
117+
encoded_text = {k: v.to("cuda") for k, v in encoded_text.items()}
118+
gpu_span.set_attribute("duration_ms", (time.time() - start_time) * 1000)
119+
120+
with tracer.start_as_current_span("model_inference") as inference_span:
121+
start_time = time.time()
122+
generated_tokens = model.generate(
123+
**encoded_text,
124+
forced_bos_token_id=tokenizer.get_lang_id(request.tgt_lang),
125+
)
126+
inference_time = time.time() - start_time
127+
inference_span.set_attribute("inference_time_seconds", inference_time)
128+
inference_span.set_attribute(
129+
"output_token_count", generated_tokens.shape[1]
130+
)
131+
inference_span.set_attribute("duration_ms", inference_time * 1000)
132+
133+
with tracer.start_as_current_span("decoding") as decode_span:
134+
start_time = time.time()
135+
translation = tokenizer.batch_decode(
136+
generated_tokens, skip_special_tokens=True
137+
)[0]
138+
decode_span.set_attribute("duration_ms", (time.time() - start_time) * 1000)
139+
140+
current_span.set_attribute("translation_length", len(translation))
141+
142+
logging.info(f"Translation completed: {len(translation)} characters")
103143
return {"translation": translation}
104144
except Exception as e:
145+
current_span.record_exception(e)
146+
current_span.set_attribute("error", str(e))
147+
current_span.set_attribute("error_type", type(e).__name__)
148+
149+
logging.error(f"Error translating text: {e}", exc_info=True)
105150
raise HTTPException(
106151
status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
107152
detail=f"Error during translation: {str(e)}",

babeltron/app/tracing.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import logging
2+
import os
3+
4+
from fastapi import FastAPI
5+
from opentelemetry import trace
6+
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
7+
OTLPSpanExporter as OTLPSpanExporterGRPC,
8+
)
9+
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
10+
OTLPSpanExporter as OTLPSpanExporterHTTP,
11+
)
12+
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
13+
from opentelemetry.instrumentation.logging import LoggingInstrumentor
14+
from opentelemetry.sdk.trace import TracerProvider
15+
from opentelemetry.sdk.trace.export import BatchSpanProcessor
16+
17+
# Check if we're in a test environment
18+
IN_TEST = os.environ.get("PYTEST_CURRENT_TEST") is not None
19+
20+
OTLP_MODE = os.environ.get("OTLP_MODE", "otlp-grpc")
21+
OTLP_GRPC_ENDPOINT = os.environ.get("OTLP_GRPC_ENDPOINT", "otel-collector:4317")
22+
OTLP_HTTP_ENDPOINT = os.environ.get(
23+
"OTLP_HTTP_ENDPOINT", "http://otel-collector:4318/v1/traces"
24+
)
25+
26+
27+
def setup_jaeger(app: FastAPI, log_correlation: bool = True) -> None:
    """Configure OpenTelemetry tracing for *app* and export spans via OTLP.

    Sets a global ``TracerProvider``, attaches a ``BatchSpanProcessor`` with
    either the gRPC or HTTP OTLP exporter (chosen by ``OTLP_MODE``), and
    instruments the FastAPI app. Health/metrics/docs URLs are excluded from
    tracing.

    Args:
        app: The FastAPI application to instrument.
        log_correlation: When True, instrument the logging module so log
            records carry the current trace/span IDs.

    Returns:
        None. Setup is skipped entirely under pytest (``IN_TEST``) or when
        ``OTLP_GRPC_ENDPOINT`` is set to ``disabled``.
    """
    # Skip setup if we're in a test environment
    if IN_TEST:
        logging.info("Skipping OpenTelemetry setup in test environment")
        return

    # Check if tracing is disabled
    if OTLP_GRPC_ENDPOINT.lower() == "disabled":
        logging.info("OpenTelemetry tracing is disabled")
        return

    tracer = TracerProvider()
    trace.set_tracer_provider(tracer)

    # Pick the exporter and remember which endpoint it actually targets, so
    # the log line below reports the real destination.
    if OTLP_MODE == "otlp-http":
        endpoint = OTLP_HTTP_ENDPOINT
        exporter = OTLPSpanExporterHTTP(endpoint=endpoint)
    else:
        # gRPC is the default; any unrecognized OTLP_MODE falls back to it
        # (same behavior as the previous duplicated if/else branches).
        endpoint = OTLP_GRPC_ENDPOINT
        exporter = OTLPSpanExporterGRPC(endpoint=endpoint, insecure=True)

    tracer.add_span_processor(BatchSpanProcessor(exporter))

    if log_correlation:
        LoggingInstrumentor().instrument(set_logging_format=True)

    FastAPIInstrumentor.instrument_app(
        app,
        tracer_provider=tracer,
        excluded_urls="/metrics,/healthz,/readyz,/docs,/redoc,/openapi.json",
    )

    # Fix: previously always logged the gRPC endpoint, even in otlp-http mode.
    logging.info(f"OpenTelemetry tracing enabled with endpoint: {endpoint}")

0 commit comments

Comments
 (0)