24 changes: 13 additions & 11 deletions .github/workflows/release.yml
@@ -67,14 +67,16 @@ jobs:
type=sha,format=short
${{ inputs.image_tag != '' && inputs.image_tag || 'latest' }}

- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=gha
cache-to: type=gha,mode=max
build-args: |
BUILDKIT_INLINE_CACHE=1
# TODO: fix memory usage in the worker; it currently cannot handle
# large image sizes
# - name: Build and push Docker image
# uses: docker/build-push-action@v5
# with:
# context: .
# push: true
# tags: ${{ steps.meta.outputs.tags }}
# labels: ${{ steps.meta.outputs.labels }}
# cache-from: type=gha
# cache-to: type=gha,mode=max
# build-args: |
# BUILDKIT_INLINE_CACHE=1
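Until the worker memory issue is fixed, one hedged workaround (the runner label below is hypothetical; larger-runner names depend on the organization's GitHub Actions configuration) is to re-enable the step on a bigger runner so BuildKit has enough memory for large images:

```yaml
jobs:
  release:
    # Hypothetical larger-runner label; actual names are org-specific.
    runs-on: ubuntu-latest-8-cores
    steps:
      - name: Build and push Docker image
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          tags: ${{ steps.meta.outputs.tags }}
          labels: ${{ steps.meta.outputs.labels }}
          cache-from: type=gha
          cache-to: type=gha,mode=max
```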
2 changes: 1 addition & 1 deletion Makefile
@@ -1,4 +1,4 @@
.PHONY: check-poetry install test lint format help system-deps coverage coverage-html download-model download-model-m2m-small download-model-m2m-medium download-model-m2m-large download-model-nllb download-model-nllb-small download-model-nllb-medium download-model-nllb-large serve serve-prod docker-build docker-run docker compose-up docker compose-down pre-commit-install pre-commit-run docker-build-with-model docker-up docker-down
.PHONY: check-poetry install test lint format help system-deps coverage coverage-html download-model download-model-m2m-small download-model-m2m-medium download-model-m2m-large download-model-nllb download-model-nllb-small download-model-nllb-medium download-model-nllb-large serve serve-prod docker-build docker-run docker compose-up compose-down pre-commit-install pre-commit-run docker-build-with-model docker-up docker-down

# Define model path variable with default value, can be overridden by environment
MODEL_PATH ?= ./models
102 changes: 102 additions & 0 deletions README.md
@@ -17,6 +17,10 @@ translation accessible through straightforward API endpoints.
- Supports two powerful translation models:
- **M2M100**: Supports 100+ languages
- **NLLB (No Language Left Behind)**: Supports 200+ languages, including many low-resource languages
- **Language Detection**: Automatically detects the language of text using the Lingua language detector, which is highly accurate even for short text snippets
- Lingua prioritizes quality over quantity, focusing on accurate detection rather than supporting every possible language
- Supports 75 different languages with high precision, even though the translation models accept 200+ languages
- Particularly effective for short texts and informal language, making it ideal for real-world applications

## Requirements

@@ -167,6 +171,41 @@ curl -X POST "http://localhost:8000/api/v1/translate" \
# Response:
# {"translation":"Hola, ¿cómo estás?"}

# Translate text with automatic language detection
curl -X POST "http://localhost:8000/api/v1/translate" \
-H "Content-Type: application/json" \
-d '{
"text": "Bonjour, comment ça va?",
"src_lang": "auto",
"tgt_lang": "en"
}'

# Response:
# {
# "translation": "Hello, how are you?",
# "model_type": "m2m100",
# "architecture": "cpu_compiled",
# "detected_lang": "fr",
# "detection_confidence": 0.98
# }

# You can also omit the source language entirely for automatic detection
curl -X POST "http://localhost:8000/api/v1/translate" \
-H "Content-Type: application/json" \
-d '{
"text": "Hola, ¿cómo estás?",
"tgt_lang": "en"
}'

# Response includes the detected language and confidence score
# {
# "translation": "Hello, how are you?",
# "model_type": "m2m100",
# "architecture": "cpu_compiled",
# "detected_lang": "es",
# "detection_confidence": 0.95
# }

# Translate text with NLLB model using FLORES-200 language codes
curl -X POST "http://localhost:8000/api/v1/translate" \
-H "Content-Type: application/json" \
@@ -403,6 +442,69 @@ For a detailed technical overview of the system architecture, including diagrams

![Babeltron Overview](docs/images/overview.png)

## Downloading Models

Before using Babeltron, you need to download at least one model:

### Translation Models

```bash
# Download the default M2M100 small model (418M parameters)
make download-model-m2m-small

# Download the M2M100 medium model (1.2B parameters)
make download-model-m2m-medium

# Download the M2M100 large model (12B parameters)
make download-model-m2m-large

# Download the NLLB small model (600M parameters)
make download-model-nllb-small

# Download the NLLB large model (3.3B parameters)
make download-model-nllb-large
```

### Language Detection Model

```bash
# Download the XLM-RoBERTa model for language detection
make download-model-xlm-roberta
```

## API Usage

### Translation

```bash
curl -X POST "http://localhost:8000/api/v1/translate" \
  -H "Content-Type: application/json" \
  -d '{"text": "Hello, how are you?", "src_lang": "en", "tgt_lang": "fr"}'
```

Response:
```json
{
  "translation": "Bonjour, comment ça va ?"
}
```

### Language Detection

```bash
curl -X POST "http://localhost:8000/api/v1/detect" \
-H "Content-Type: application/json" \
-d '{"text": "Hello, how are you?"}'
```

Response:
```json
{
"language": "en",
"confidence": 0.98
}
```
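The curl calls above translate to a small Python client. The helper names below are ours, not part of Babeltron; only the endpoint paths and payload fields come from the examples above:

```python
import json
import urllib.request

BASE_URL = "http://localhost:8000/api/v1"  # adjust to your deployment

def build_translate_request(text: str, tgt_lang: str, src_lang: str = "auto"):
    """Build the (url, payload) pair for the translate endpoint.

    src_lang defaults to "auto" so the server runs language detection.
    """
    payload = {"text": text, "src_lang": src_lang, "tgt_lang": tgt_lang}
    return f"{BASE_URL}/translate", payload

def post_json(url: str, payload: dict) -> dict:
    """POST a JSON payload and decode the JSON response."""
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)

# Example (requires a running server):
# url, payload = build_translate_request("Bonjour, comment ça va?", "en")
# print(post_json(url, payload)["translation"])
```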

## License

MIT License
2 changes: 2 additions & 0 deletions babeltron/app/config.py
@@ -4,9 +4,11 @@
class ModelType:
M2M100 = "m2m100"
NLLB = "nllb"
LINGUA = "lingua"


BABELTRON_MODEL_TYPE = os.getenv("BABELTRON_MODEL_TYPE", ModelType.M2M100)
DETECTION_MODEL_TYPE = os.getenv("DETECTION_MODEL_TYPE", ModelType.LINGUA)
MODEL_PATH = os.getenv("MODEL_PATH", "./models")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
IN_TEST = os.environ.get("PYTEST_CURRENT_TEST") is not None
3 changes: 2 additions & 1 deletion babeltron/app/main.py
@@ -12,7 +12,7 @@
)
from babeltron.app.middlewares.auth import BasicAuthMiddleware
from babeltron.app.monitoring import PrometheusMiddleware, metrics_endpoint
from babeltron.app.routers import healthcheck, translate
from babeltron.app.routers import detect, healthcheck, translate
from babeltron.version import __version__

# Configure logging
@@ -54,6 +54,7 @@
# Include routers
app.include_router(translate.router, prefix="/api/v1")
app.include_router(healthcheck.router, prefix="/api/v1")
app.include_router(detect.router, prefix="/api/v1")


@app.get("/", include_in_schema=False)
24 changes: 16 additions & 8 deletions babeltron/app/models/__init__.py
@@ -4,14 +4,22 @@
This package contains the translation model implementations.
"""

from babeltron.app.models.base import TranslationModelBase
from babeltron.app.models.factory import get_translation_model
from babeltron.app.models.m2m100 import M2M100TranslationModel
from babeltron.app.models.m2m100 import ModelArchitecture as M2MModelArchitecture
from babeltron.app.models.m2m100 import get_translation_model as get_m2m_model
from babeltron.app.models.nllb import ModelArchitecture as NLLBModelArchitecture
from babeltron.app.models.nllb import NLLBTranslationModel
from babeltron.app.models.nllb import get_translation_model as get_nllb_model
from babeltron.app.models.translation.base import TranslationModelBase
from babeltron.app.models.translation.factory import get_translation_model
from babeltron.app.models.translation.m2m100 import M2M100TranslationModel
from babeltron.app.models.translation.m2m100 import (
ModelArchitecture as M2MModelArchitecture,
)
from babeltron.app.models.translation.m2m100 import (
get_translation_model as get_m2m_model,
)
from babeltron.app.models.translation.nllb import (
ModelArchitecture as NLLBModelArchitecture,
)
from babeltron.app.models.translation.nllb import NLLBTranslationModel
from babeltron.app.models.translation.nllb import (
get_translation_model as get_nllb_model,
)

__all__ = [
"TranslationModelBase",
Empty file.
36 changes: 36 additions & 0 deletions babeltron/app/models/detection/base.py
@@ -0,0 +1,36 @@
from abc import ABC, abstractmethod
from typing import Any, Tuple


class DetectionModelBase(ABC):
"""
Abstract base class for detection models.

Any concrete implementation must provide methods for:
- Loading the model
- Detecting language of text
"""

@abstractmethod
def load(self) -> Tuple[Any, Any, str]:
"""
Load the model and tokenizer, and determine the architecture.

Returns:
Tuple containing (model, tokenizer, architecture)
"""
pass

@abstractmethod
def detect(self, text: str, tracer=None) -> str:
"""
Detect language of text.

Args:
text: The text whose language to detect
tracer: Optional OpenTelemetry tracer for spans (can be None)

Returns:
The detected language
"""
pass
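A concrete subclass of this base class might look like the following sketch. The stopword heuristic here is purely illustrative (the real implementation is Lingua-backed), but it shows the `load`/`detect` contract in action:

```python
from abc import ABC, abstractmethod
from typing import Any, Tuple

class DetectionModelBase(ABC):
    @abstractmethod
    def load(self) -> Tuple[Any, Any, str]: ...

    @abstractmethod
    def detect(self, text: str, tracer=None) -> str: ...

class StopwordDetectionModel(DetectionModelBase):
    """Toy detector that scores languages by stopword hits."""

    STOPWORDS = {
        "en": {"the", "and", "how", "are", "you"},
        "fr": {"le", "et", "comment", "vous", "bonjour"},
    }

    def load(self) -> Tuple[Any, Any, str]:
        # Nothing to load for a heuristic; a real implementation
        # would load weights here and report its architecture.
        self.model, self.tokenizer, self.architecture = None, None, "heuristic"
        return self.model, self.tokenizer, self.architecture

    def detect(self, text: str, tracer=None) -> str:
        words = set(text.lower().replace(",", " ").replace("?", " ").split())
        scores = {lang: len(words & stops) for lang, stops in self.STOPWORDS.items()}
        # Fall back to English when no stopword matches at all
        return max(scores, key=scores.get) if any(scores.values()) else "en"

model = StopwordDetectionModel()
model.load()
print(model.detect("Bonjour, comment vous?"))  # fr
```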
65 changes: 65 additions & 0 deletions babeltron/app/models/detection/factory.py
@@ -0,0 +1,65 @@
import logging
import os
from typing import Callable, Dict

from babeltron.app.config import ModelType
from babeltron.app.models.detection.base import DetectionModelBase

# Default model type from environment variable, fallback to Lingua
DEFAULT_DETECTION_MODEL_TYPE = os.environ.get(
"DEFAULT_DETECTION_MODEL_TYPE", ModelType.LINGUA
)

# Registry of model types to their factory functions
MODEL_REGISTRY: Dict[str, Callable] = {}


def register_model(model_type: str):
"""
Decorator to register a model factory function in the MODEL_REGISTRY.

Args:
model_type: The type of model to register

Returns:
Decorator function that registers the model factory
"""

def decorator(factory_func: Callable):
MODEL_REGISTRY[model_type] = factory_func
return factory_func

return decorator


def get_detection_model(model_type: str = None) -> DetectionModelBase:
"""
Factory function to get the appropriate detection model based on the model type.

Args:
model_type: The type of model to use, defaults to DEFAULT_DETECTION_MODEL_TYPE

Returns:
An instance of the appropriate detection model

Raises:
ValueError: If the model type is not supported
"""
if model_type is None:
model_type = DEFAULT_DETECTION_MODEL_TYPE

if model_type not in MODEL_REGISTRY:
supported_models = ", ".join(MODEL_REGISTRY.keys())
raise ValueError(
f"Unsupported model type: {model_type}. Supported types: {supported_models}"
)

logging.info(f"Creating detection model of type: {model_type}")
return MODEL_REGISTRY[model_type]()


# Import models at the end to avoid circular imports
from babeltron.app.models.detection.lingua import get_lingua_model # noqa

# Register the Lingua model
register_model(ModelType.LINGUA)(get_lingua_model)
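The registry pattern in this factory can be exercised with a stdlib-only sketch. The `"fasttext"` backend below is hypothetical, added just to show that a new detector needs only one registration line; registering at import time is also why the real module imports `lingua` at the bottom, so the registration runs without a circular import at load time:

```python
import logging
from typing import Callable, Dict

MODEL_REGISTRY: Dict[str, Callable] = {}

def register_model(model_type: str):
    """Register a factory function under a model-type key."""
    def decorator(factory_func: Callable):
        MODEL_REGISTRY[model_type] = factory_func
        return factory_func
    return decorator

def get_detection_model(model_type: str = "lingua"):
    """Look up and invoke the factory for the requested model type."""
    if model_type not in MODEL_REGISTRY:
        supported = ", ".join(MODEL_REGISTRY)
        raise ValueError(
            f"Unsupported model type: {model_type}. Supported types: {supported}"
        )
    logging.info("Creating detection model of type: %s", model_type)
    return MODEL_REGISTRY[model_type]()

# A hypothetical extra backend registers itself with one decorator line:
@register_model("fasttext")
def get_fasttext_model():
    return "fasttext-model-instance"  # stand-in for a real model object

print(get_detection_model("fasttext"))  # fasttext-model-instance
```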