diff --git a/.gitignore b/.gitignore index 9a79cad0..7901ca20 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,7 @@ inference-api/__pycache__/ CLAUDE.md docs/RAG_PRODUCTIONIZATION_PLAN.md docs/DOCKER_CONTROL_SERVICE_PLAN.md +!app/backend/shared_config/models_from_inference_server.json + +request-venv/* +app/.env-old diff --git a/app/.env.default b/app/.env.default index 58a4a7a1..5f01b3f2 100644 --- a/app/.env.default +++ b/app/.env.default @@ -13,6 +13,9 @@ TT_INFERENCE_ARTIFACT_VERSION=v0.8.0 # Security Credentials (REQUIRED - keep secret in production!) JWT_SECRET=test-secret-456 DJANGO_SECRET_KEY=django-insecure-default + +# TTS Inference Server API Key (media inference engine) +TTS_API_KEY=your-tts-api-key HF_TOKEN=hf_*** # Docker Control Service (secure Docker operations API) diff --git a/app/backend/Dockerfile b/app/backend/Dockerfile index 2ea935eb..9ec42f65 100644 --- a/app/backend/Dockerfile +++ b/app/backend/Dockerfile @@ -34,7 +34,7 @@ RUN if [ "$VITE_ENABLE_DEPLOYED" != "true" ]; then \ . "$HOME/.cargo/env" && \ # Clone and install tt-smi mkdir -p /opt/tenstorrent-tools && \ - git clone https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \ + git clone --branch v4.0.0 --depth 1 https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \ cd /opt/tenstorrent-tools/tt-smi && \ pip3 install --upgrade pip && \ pip3 install . 
&& \ diff --git a/app/backend/api/settings.py b/app/backend/api/settings.py index cf7d799e..06671024 100644 --- a/app/backend/api/settings.py +++ b/app/backend/api/settings.py @@ -64,11 +64,6 @@ # Application definition INSTALLED_APPS = [ - "django.contrib.admin", - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", "django.contrib.staticfiles", "docker_control.apps.DockerControlConfig", "model_control", @@ -81,11 +76,8 @@ MIDDLEWARE = [ "corsheaders.middleware.CorsMiddleware", "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", ] @@ -100,25 +92,12 @@ "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", ], }, }, ] WSGI_APPLICATION = "api.wsgi.application" -SESSIONS_ENGINE = "django.contrib.sessions.backends.cache" -# Database -# https://docs.djangoproject.com/en/4.2/ref/settings/#databases - -# SQLite database for deployment history and other persistent data -DATABASES = { - "default": { - "ENGINE": "django.db.backends.sqlite3", - "NAME": backend_config.backend_cache_root / "db.sqlite3", - } -} # local memory thread-safe default # the LOCATION for locmem.LocMemCache cache backend is just a name for tracking @@ -135,24 +114,6 @@ }, } -# Password validation -# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", - }, - { - "NAME": 
"django.contrib.auth.password_validation.MinimumLengthValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - # Internationalization # https://docs.djangoproject.com/en/4.2/topics/i18n/ diff --git a/app/backend/api/urls.py b/app/backend/api/urls.py index 441f06b7..34717c7b 100644 --- a/app/backend/api/urls.py +++ b/app/backend/api/urls.py @@ -19,12 +19,11 @@ 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ -from django.contrib import admin from api.views import UpStatusView from django.urls import include, path +from model_control.views import OpenAIAudioSpeechView urlpatterns = [ - path("admin/", admin.site.urls), path("up/", UpStatusView.as_view()), path("docker/", include("docker_control.urls")), path("models/", include("model_control.urls")), @@ -32,4 +31,6 @@ path("collections/", include("vector_db_control.urls")), path("logs/", include("logs_control.urls")), path("board/", include("board_control.urls")), + # OpenAI-compatible audio endpoint + path("v1/audio/speech", OpenAIAudioSpeechView.as_view()), ] diff --git a/app/backend/board_control/services.py b/app/backend/board_control/services.py index 2c08a231..eaa39b31 100644 --- a/app/backend/board_control/services.py +++ b/app/backend/board_control/services.py @@ -16,15 +16,19 @@ class SystemResourceService: """Service for monitoring system resources and TT device telemetry""" - + # Cache keys and timeout TT_SMI_CACHE_KEY = "tt_smi_data" TT_SMI_CACHE_TIMEOUT = 3600 # Cache for 1 hour (since we'll refresh on events only) BOARD_TYPE_CACHE_KEY = "board_type_data" BOARD_TYPE_CACHE_TIMEOUT = 3600 # Cache board type for 1 hour (since it rarely changes) + + # Device state cache keys + DEVICE_STATE_CACHE_KEY = "device_state_v2" + DEVICE_RESETTING_KEY = "device_resetting" @staticmethod - def get_tt_smi_data(timeout=10): + def get_tt_smi_data(timeout=30): """Get 
raw tt-smi data with caching to reduce expensive calls""" # Check cache first cached_data = cache.get(SystemResourceService.TT_SMI_CACHE_KEY) @@ -412,9 +416,245 @@ def force_refresh_tt_smi_cache(): # Clear the existing cache cache.delete(SystemResourceService.TT_SMI_CACHE_KEY) cache.delete(SystemResourceService.BOARD_TYPE_CACHE_KEY) - + # Fetch fresh data SystemResourceService.get_tt_smi_data() SystemResourceService.get_board_type() - - logger.info("tt-smi cache refreshed successfully") \ No newline at end of file + + logger.info("tt-smi cache refreshed successfully") + + # ------------------------------------------------------------------------- + # Device State Machine — single source of truth + # ------------------------------------------------------------------------- + + @staticmethod + def _extract_board_type_from_data(data): + """Extract canonical board-type string from tt-smi JSON data.""" + if not data or "device_info" not in data or not data["device_info"]: + return "unknown" + + board_types = [] + for info in data["device_info"]: + board_info = info.get("board_info", {}) + board_types.append(board_info.get("board_type", "unknown")) + + if not board_types: + return "unknown" + + # Strip "local"/"remote" suffix if present + filtered = [bt.rsplit(" ", 1)[0] for bt in board_types] + unique = set(filtered) + + if len(unique) > 1: + logger.warning(f"Mixed board types detected: {unique}") + return "unknown" + + raw = unique.pop() + num_devices = len(data["device_info"]) + raw_lower = raw.lower() + + if "n150" in raw_lower: + return "N150X4" if num_devices >= 4 else "N150" + if "n300" in raw_lower: + return "T3K" if num_devices >= 4 else "N300" + if "p300" in raw_lower: + if num_devices >= 8: + return "P300Cx4" + if num_devices >= 4: + return "P300Cx2" + return "P300c" + if "p150" in raw_lower: + if num_devices >= 8: + return "P150X8" + if num_devices >= 4: + return "P150X4" + return "P150" + if "p100" in raw_lower: + return "P100" + if "e150" in raw_lower: + 
return "E150" + if "galaxy" in raw_lower: + return "GALAXY_T3K" if "t3k" in raw_lower else "GALAXY" + + logger.warning(f"Unknown board type string: {raw!r}") + return "unknown" + + @staticmethod + def _extract_devices_from_data(data): + """Extract device summary list from tt-smi JSON data.""" + devices = [] + if not data or "device_info" not in data: + return devices + + for idx, device in enumerate(data["device_info"]): + board_info = device.get("board_info", {}) + telemetry = device.get("telemetry", {}) + + def _f(v): + try: + return float(v) if v is not None else 0.0 + except (TypeError, ValueError): + return 0.0 + + devices.append({ + "index": idx, + "board_type": board_info.get("board_type", "Unknown"), + "bus_id": board_info.get("bus_id", "N/A"), + "temperature": _f(telemetry.get("asic_temperature")), + "power": _f(telemetry.get("power")), + "voltage": _f(telemetry.get("voltage")), + }) + return devices + + @staticmethod + def get_device_state(): + """ + Single authoritative device state resolver. 
+ + States: + HEALTHY — tt-smi -s succeeded, devices visible + BAD_STATE — /dev/tenstorrent present but tt-smi timed out / errored + RESETTING — tt-smi -r is actively running + NOT_PRESENT — /dev/tenstorrent path does not exist + UNKNOWN — can't determine (startup / tt-smi missing) + """ + # RESETTING takes priority — check before cache + if cache.get(SystemResourceService.DEVICE_RESETTING_KEY): + return { + "state": "RESETTING", + "board_type": "unknown", + "board_name": "Resetting…", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + } + + # Return cached result if still fresh + cached = cache.get(SystemResourceService.DEVICE_STATE_CACHE_KEY) + if cached is not None: + return cached + + # Check physical device presence + if not os.path.exists("/dev/tenstorrent"): + result = { + "state": "NOT_PRESENT", + "board_type": "unknown", + "board_name": "Not Present", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + } + cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=15) + return result + + # Try tt-smi -s with 30-second timeout (Docker cold-start can be slower than host) + try: + logger.info("Running tt-smi -s for device state check") + process = subprocess.Popen( + ["tt-smi", "-s"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + stdin=subprocess.DEVNULL, + text=True, + preexec_fn=os.setsid, + ) + + try: + stdout, stderr = process.communicate(timeout=30) + except subprocess.TimeoutExpired: + logger.error("tt-smi -s timed out after 30s — board in BAD_STATE") + try: + os.killpg(os.getpgid(process.pid), signal.SIGTERM) + process.wait(timeout=2) + except Exception: + try: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception: + pass + result = { + "state": "BAD_STATE", + "board_type": "unknown", + "board_name": "Bad State", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": True, + } + 
cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10) + return result + + if process.returncode != 0: + logger.error(f"tt-smi -s exit code {process.returncode}: {stderr.strip()!r}") + result = { + "state": "BAD_STATE", + "board_type": "unknown", + "board_name": "Bad State", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": True, + } + cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10) + return result + + try: + data = json.loads(stdout) + except json.JSONDecodeError as e: + logger.error(f"Failed to parse tt-smi JSON: {e}") + result = { + "state": "BAD_STATE", + "board_type": "unknown", + "board_name": "Bad State", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": True, + } + cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10) + return result + + board_type = SystemResourceService._extract_board_type_from_data(data) + devices = SystemResourceService._extract_devices_from_data(data) + result = { + "state": "HEALTHY", + "board_type": board_type, + "board_name": board_type, + "devices": devices, + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + } + cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=30) + return result + + except FileNotFoundError: + logger.error("tt-smi command not found") + # Don't cache UNKNOWN so each call re-checks (tt-smi may be installed later) + return { + "state": "UNKNOWN", + "board_type": "unknown", + "board_name": "Unknown", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + } + except Exception as e: + logger.error(f"Unexpected error in get_device_state: {e}") + return { + "state": "UNKNOWN", + "board_type": "unknown", + "board_name": "Unknown", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + } + + @staticmethod + def set_resetting_state(): + """Mark the device as actively 
resetting (clears state cache).""" + cache.set(SystemResourceService.DEVICE_RESETTING_KEY, True, timeout=120) + cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY) + logger.info("Device state set to RESETTING") + + @staticmethod + def clear_device_state_cache(): + """Clear device state cache and resetting flag after reset completes.""" + cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY) + cache.delete(SystemResourceService.DEVICE_RESETTING_KEY) + logger.info("Device state cache cleared") \ No newline at end of file diff --git a/app/backend/board_control/urls.py b/app/backend/board_control/urls.py index 42e59361..3e2b323d 100644 --- a/app/backend/board_control/urls.py +++ b/app/backend/board_control/urls.py @@ -19,4 +19,8 @@ # Cache management path("refresh-cache/", views.RefreshCacheView.as_view(), name="refresh-cache"), -] \ No newline at end of file + + # Unified device state & reset (new) + path("device-state/", views.DeviceStateView.as_view(), name="device-state"), + path("device-reset/", views.DeviceResetView.as_view(), name="device-reset"), +] \ No newline at end of file diff --git a/app/backend/board_control/views.py b/app/backend/board_control/views.py index f904557c..7dd83428 100644 --- a/app/backend/board_control/views.py +++ b/app/backend/board_control/views.py @@ -228,20 +228,78 @@ def patch(self, request, alert_id, *args, **kwargs): @method_decorator(csrf_exempt, name='dispatch') class RefreshCacheView(APIView): """Manual cache refresh endpoint for debugging and manual triggering""" - + def post(self, request, *args, **kwargs): try: logger.info("Manual cache refresh requested") SystemResourceService.force_refresh_tt_smi_cache() - + return Response({ "status": "success", "message": "tt-smi cache refreshed successfully" }, status=status.HTTP_200_OK) - + except Exception as e: logger.error(f"Error manually refreshing cache: {str(e)}") return Response( {"error": "Failed to refresh cache", "details": str(e)}, 
status=status.HTTP_500_INTERNAL_SERVER_ERROR - ) \ No newline at end of file + ) + + +@method_decorator(csrf_exempt, name='dispatch') +class DeviceStateView(APIView): + """ + GET /board-api/device-state/ + + Single source of truth for board state. Replaces the need to call + /board-api/status/, /board-api/footer-data/, and /docker-api/board-info/ + separately. All components should poll this endpoint. + """ + + def get(self, request, *args, **kwargs): + try: + state = SystemResourceService.get_device_state() + return Response(state, status=status.HTTP_200_OK) + except Exception as e: + logger.error(f"Error getting device state: {e}") + return Response({ + "state": "UNKNOWN", + "board_type": "unknown", + "board_name": "Unknown", + "devices": [], + "last_updated": timezone.now().isoformat(), + "reset_suggested": False, + }, status=status.HTTP_200_OK) + + +@method_decorator(csrf_exempt, name='dispatch') +class DeviceResetView(APIView): + """ + POST /board-api/device-reset/ + + Dedicated board reset endpoint. Separated from the Docker-coupled + /docker-api/reset_board/ for clarity; the old endpoint keeps working via + the same perform_reset() logic. 
+ """ + + def post(self, request, *args, **kwargs): + from docker_control.docker_utils import perform_reset + try: + logger.info("Device reset requested via /board-api/device-reset/") + result = perform_reset() + http_status_code = result.pop("http_status", 200) + + success = result.get("status") == "success" + return Response({ + "success": success, + "message": result.get("message", ""), + "attempts_used": result.get("attempts_used", 0), + }, status=http_status_code) + except Exception as e: + logger.error(f"Error in device reset: {e}") + return Response({ + "success": False, + "message": str(e), + "attempts_used": 0, + }, status=status.HTTP_500_INTERNAL_SERVER_ERROR) \ No newline at end of file diff --git a/app/backend/docker_control/admin.py b/app/backend/docker_control/admin.py index 2c79060a..917beb36 100644 --- a/app/backend/docker_control/admin.py +++ b/app/backend/docker_control/admin.py @@ -1,7 +1,3 @@ # SPDX-License-Identifier: Apache-2.0 # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -from django.contrib import admin - -# Register your models here. diff --git a/app/backend/docker_control/apps.py b/app/backend/docker_control/apps.py index 0a263c9c..68dfa377 100644 --- a/app/backend/docker_control/apps.py +++ b/app/backend/docker_control/apps.py @@ -14,32 +14,15 @@ class DockerControlConfig(AppConfig): def ready(self): """Initialize docker control services""" logger.info("Docker control app is ready") - - # Verify database migrations are applied + + # Log how many deployments are already tracked try: - from django.db import connection - - # Check if ModelDeployment table exists - with connection.cursor() as cursor: - cursor.execute(""" - SELECT name FROM sqlite_master - WHERE type='table' AND name='docker_control_modeldeployment' - """) - table_exists = cursor.fetchone() is not None - - if not table_exists: - logger.warning( - "ModelDeployment table not found. Database migrations may not be applied. 
" - "Run: python manage.py migrate docker_control" - ) - else: - # Count existing deployment records - from docker_control.models import ModelDeployment - count = ModelDeployment.objects.count() - logger.info(f"Deployment history table verified. Existing records: {count}") + from docker_control.models import ModelDeployment + count = ModelDeployment.objects.count() + logger.info(f"Deployment store loaded. Existing records: {count}") except Exception as e: - logger.warning(f"Could not verify deployment history table: {e}") - + logger.warning(f"Could not read deployment store: {e}") + # Start container health monitoring service try: from docker_control.health_monitor import start_health_monitoring diff --git a/app/backend/docker_control/chip_allocator.py b/app/backend/docker_control/chip_allocator.py new file mode 100644 index 00000000..ce3c3dfe --- /dev/null +++ b/app/backend/docker_control/chip_allocator.py @@ -0,0 +1,349 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +""" +Chip slot allocator for automatic device_id assignment. + +Manages automatic chip slot allocation based on: +- Current deployments (from deployment_store) +- Model chip requirements (single vs multi-chip) +- Board topology +""" + +import threading +from typing import Dict, List, Optional, Set + +from shared_config.logger_config import get_logger +from shared_config.model_config import get_model_chip_requirement +from docker_control.deployment_store import ModelDeployment + +logger = get_logger(__name__) + + +# --------------------------------------------------------------------------- +# Exception Classes +# --------------------------------------------------------------------------- + +class AllocationError(Exception): + """Base exception for chip slot allocation errors""" + pass + + +class MultiChipConflictError(AllocationError): + """ + Exception raised when multi-chip model deployment conflicts with existing deployments. 
+ + Attributes: + message: Error message + conflicts: List of conflicting deployment info dicts + """ + def __init__(self, message: str, conflicts: List[Dict] = None): + super().__init__(message) + self.conflicts = conflicts or [] + + +# --------------------------------------------------------------------------- +# Chip Slot Allocator +# --------------------------------------------------------------------------- + +# Board type to slot count mapping (matching frontend MULTI_CHIP_BOARD_SLOTS) +MULTI_CHIP_BOARD_SLOTS = { + "T3K": 4, + "T3000": 4, + "N150X4": 4, + "N300x4": 4, + "P150X4": 4, + "P150X8": 8, + "P300Cx2": 4, + "P300Cx4": 8, + "GALAXY": 32, + "GALAXY_T3K": 32, +} + + +class ChipSlotAllocator: + """ + Manages automatic chip slot allocation. + + Thread-safe allocator that determines the best chip slot for a model + based on current deployments and chip requirements. + """ + + def __init__(self): + """Initialize allocator with current board type and slot count""" + self._lock = threading.Lock() + self.board_type = self._detect_board_type() + self.total_slots = self._get_total_slots() + logger.info(f"ChipSlotAllocator initialized: board={self.board_type}, slots={self.total_slots}") + + def _detect_board_type(self) -> str: + """Detect current board type""" + from docker_control.docker_utils import detect_board_type + return detect_board_type() + + def _get_total_slots(self) -> int: + """Get total number of chip slots for current board""" + # Multi-chip boards have multiple slots + if self.board_type in MULTI_CHIP_BOARD_SLOTS: + return MULTI_CHIP_BOARD_SLOTS[self.board_type] + + # Single-chip boards (N150, N300, E150, P100, P150, P300c) have 1 slot + return 1 + + def get_chip_status(self) -> Dict: + """ + Returns current chip slot occupancy status. 
+ + Returns: + Dictionary with board_type, total_slots, and per-slot status: + { + "board_type": "T3K", + "total_slots": 4, + "slots": [ + {"slot_id": 0, "status": "occupied", "model_name": "...", "deployment_id": 123, "is_multi_chip": False}, + {"slot_id": 1, "status": "available"}, + ... + ] + } + """ + active_deployments = self._get_active_deployments() + slots_info = [] + occupied_map = {} + + # Build occupied slots map + for deployment in active_deployments: + model_chips = self._get_chips_required(deployment.model_name) + + if model_chips == 4: + # Multi-chip: mark ALL slots as occupied + for slot_id in range(min(4, self.total_slots)): # Multi-chip models use up to 4 slots + occupied_map[slot_id] = { + "model_name": deployment.model_name, + "deployment_id": deployment.id, + "is_multi_chip": True, + "port": deployment.port, + } + else: + # Single-chip: mark specific slot + if deployment.device_id < self.total_slots: + occupied_map[deployment.device_id] = { + "model_name": deployment.model_name, + "deployment_id": deployment.id, + "is_multi_chip": False, + "port": deployment.port, + } + + # Build slot status list + for slot_id in range(self.total_slots): + if slot_id in occupied_map: + slots_info.append({ + "slot_id": slot_id, + "status": "occupied", + **occupied_map[slot_id] + }) + else: + slots_info.append({ + "slot_id": slot_id, + "status": "available" + }) + + return { + "board_type": self.board_type, + "total_slots": self.total_slots, + "slots": slots_info + } + + def allocate_chip_slot(self, model_name: str, manual_override: Optional[int] = None) -> int: + """ + Auto-allocate chip slot or use manual override. 
+ + Args: + model_name: Name of the model being deployed + manual_override: Optional manual device_id for advanced mode + + Returns: + Allocated device_id (0-based slot number) + + Raises: + AllocationError: If allocation fails (all slots occupied) + MultiChipConflictError: If multi-chip model conflicts with existing deployments + """ + with self._lock: + chips_required = self._get_chips_required(model_name) + + # Advanced mode: manual override + if manual_override is not None: + validation = self._validate_manual_allocation(manual_override, chips_required, model_name) + if not validation["valid"]: + raise AllocationError(validation["message"]) + logger.info(f"Manual allocation: device_id={manual_override} for {model_name}") + return manual_override + + # Auto-allocation + if chips_required == 4: + device_id = self._allocate_multi_chip(model_name) + else: + device_id = self._allocate_single_chip(model_name) + + logger.info(f"Auto-allocated: device_id={device_id} for {model_name} ({chips_required} chips)") + return device_id + + def _allocate_single_chip(self, model_name: str) -> int: + """ + Find first available slot for single-chip model. + + Args: + model_name: Name of the model + + Returns: + Device ID of first available slot + + Raises: + AllocationError: If all slots are occupied + """ + occupied_slots = self._get_occupied_slots() + + for slot_id in range(self.total_slots): + if slot_id not in occupied_slots: + return slot_id + + raise AllocationError( + f"All {self.total_slots} chip slots are occupied. " + f"Stop at least one model to free up a slot." + ) + + def _allocate_multi_chip(self, model_name: str) -> int: + """ + Validate all slots are free for multi-chip model, return 0. 
+ + Args: + model_name: Name of the model + + Returns: + Device ID 0 (multi-chip models always use device_id=0) + + Raises: + MultiChipConflictError: If any slots are occupied + """ + occupied_slots = self._get_occupied_slots() + + if occupied_slots: + # Build detailed conflict information + active_deployments = self._get_active_deployments() + conflicts = [] + + for deployment in active_deployments: + model_chips = self._get_chips_required(deployment.model_name) + conflicts.append({ + "model": deployment.model_name, + "deployment_id": deployment.id, + "slot": deployment.device_id, + "chips": model_chips + }) + + raise MultiChipConflictError( + f"{model_name} requires all 4 chip slots. " + f"Currently occupied: {len(occupied_slots)} slot(s). " + f"Stop all running models first.", + conflicts=conflicts + ) + + return 0 # Multi-chip models always use device_id=0 + + def _validate_manual_allocation(self, device_id: int, chips_required: int, model_name: str) -> Dict: + """ + Validate manual chip slot selection in advanced mode. + + Args: + device_id: Manually selected device ID + chips_required: Number of chips required by model + model_name: Name of the model + + Returns: + Dictionary with "valid" boolean and optional "message" + """ + # Check bounds + if device_id < 0 or device_id >= self.total_slots: + return { + "valid": False, + "message": f"Invalid device_id {device_id}. Must be 0-{self.total_slots - 1}." + } + + occupied_slots = self._get_occupied_slots() + + if chips_required == 4: + # Multi-chip: ensure all slots are free + if occupied_slots: + return { + "valid": False, + "message": f"{model_name} requires all 4 chip slots. Currently occupied: {len(occupied_slots)} slot(s)." 
+ } + else: + # Single-chip: ensure selected slot is free + if device_id in occupied_slots: + # Find which model is using this slot + active_deployments = self._get_active_deployments() + occupying_model = None + for deployment in active_deployments: + if deployment.device_id == device_id: + occupying_model = deployment.model_name + break + # Check if a multi-chip model is occupying all slots + model_chips = self._get_chips_required(deployment.model_name) + if model_chips == 4: + occupying_model = f"{deployment.model_name} (multi-chip)" + break + + return { + "valid": False, + "message": f"Chip slot {device_id} is occupied by {occupying_model or 'another model'}." + } + + return {"valid": True} + + def _get_active_deployments(self) -> List[ModelDeployment]: + """ + Get list of active deployments (starting or running status). + + Returns: + List of ModelDeployment objects + """ + return list(ModelDeployment.objects.filter(status__in=['starting', 'running'])) + + def _get_occupied_slots(self) -> Set[int]: + """ + Returns set of occupied slot IDs. + + Multi-chip deployments occupy slots 0-3. + Single-chip deployments occupy their specific device_id slot. + + Returns: + Set of occupied slot IDs + """ + active = self._get_active_deployments() + occupied = set() + + for deployment in active: + chips = self._get_chips_required(deployment.model_name) + if chips == 4: + # Multi-chip: occupies all 4 slots + occupied.update(range(min(4, self.total_slots))) + else: + # Single-chip: occupies specific slot + if deployment.device_id < self.total_slots: + occupied.add(deployment.device_id) + + return occupied + + def _get_chips_required(self, model_name: str) -> int: + """ + Get number of chips required for a model. 
+ + Args: + model_name: Name of the model + + Returns: + Number of chips required (1 or 4) + """ + return get_model_chip_requirement(model_name) diff --git a/app/backend/docker_control/deployment_store.py b/app/backend/docker_control/deployment_store.py new file mode 100644 index 00000000..ba5421fd --- /dev/null +++ b/app/backend/docker_control/deployment_store.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +""" +Thread-safe JSON file store replacing Django ORM for ModelDeployment. + +Provides a drop-in ORM-like interface (objects.create, filter, all, get, save) +backed by a single JSON file in the persistent storage volume. +""" + +import json +import os +import threading +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, List, Optional + +from shared_config.logger_config import get_logger + +logger = get_logger(__name__) + +_STORE_PATH = ( + Path(os.getenv("INTERNAL_PERSISTENT_STORAGE_VOLUME", "/tt_studio_persistent_volume")) + / "backend_volume" + / "deployments.json" +) + +_lock = threading.Lock() + + +def _now() -> datetime: + return datetime.now(timezone.utc) + + +def _parse_dt(s: Optional[str]) -> Optional[datetime]: + if s is None: + return None + try: + return datetime.fromisoformat(s) + except Exception: + return None + + +def _sort_key(record: dict, field: str): + """Return a sortable key for a field, handling None and datetime strings.""" + val = record.get(field) + if val is None: + return "" + return val # ISO strings sort lexicographically = chronologically + + +def _load_raw() -> dict: + if not _STORE_PATH.exists(): + return {"next_id": 1, "records": []} + try: + with open(_STORE_PATH, "r") as f: + return json.load(f) + except Exception as e: + logger.warning(f"Could not read deployment store, starting fresh: {e}") + return {"next_id": 1, "records": []} + + +def _save_raw(data: dict) -> None: + _STORE_PATH.parent.mkdir(parents=True, 
exist_ok=True) + tmp = _STORE_PATH.with_suffix(".tmp") + try: + with open(tmp, "w") as f: + json.dump(data, f, indent=2, default=str) + os.replace(tmp, _STORE_PATH) + except Exception as e: + logger.error(f"Failed to save deployment store: {e}") + try: + tmp.unlink(missing_ok=True) + except Exception: + pass + + +def _match(record: dict, kwargs: dict) -> bool: + """Match a record against filter kwargs, supporting __in and __isnull suffixes.""" + for key, val in kwargs.items(): + if key.endswith("__in"): + field = key[: -len("__in")] + if record.get(field) not in val: + return False + elif key.endswith("__isnull"): + field = key[: -len("__isnull")] + is_null = record.get(field) is None + if is_null != val: + return False + else: + if record.get(key) != val: + return False + return True + + +class _QuerySet: + def __init__(self, records: List[dict]): + self._records = records + + def filter(self, **kwargs) -> "_QuerySet": + return _QuerySet([r for r in self._records if _match(r, kwargs)]) + + def order_by(self, *fields) -> "_QuerySet": + records = list(self._records) + for field in reversed(fields): + reverse = field.startswith("-") + fname = field.lstrip("-") + records.sort(key=lambda r: _sort_key(r, fname), reverse=reverse) + return _QuerySet(records) + + def first(self) -> Optional["ModelDeployment"]: + if not self._records: + return None + return ModelDeployment._from_dict(self._records[0]) + + def exists(self) -> bool: + return len(self._records) > 0 + + def count(self) -> int: + return len(self._records) + + def get(self, **kwargs) -> "ModelDeployment": + matches = [r for r in self._records if _match(r, kwargs)] + if not matches: + raise ModelDeployment.DoesNotExist(f"No record matching {kwargs}") + if len(matches) > 1: + raise Exception(f"Multiple records matching {kwargs}") + return ModelDeployment._from_dict(matches[0]) + + def __iter__(self): + return (ModelDeployment._from_dict(r) for r in self._records) + + def __getitem__(self, key): + if isinstance(key, 
slice): + return _QuerySet(self._records[key]) + return ModelDeployment._from_dict(self._records[key]) + + def __len__(self) -> int: + return len(self._records) + + +class _Manager: + def create(self, **kwargs) -> "ModelDeployment": + with _lock: + data = _load_raw() + record = { + "id": data["next_id"], + "container_id": kwargs.get("container_id", ""), + "container_name": kwargs.get("container_name", ""), + "model_name": kwargs.get("model_name", ""), + "device": kwargs.get("device", ""), + "deployed_at": _now().isoformat(), + "stopped_at": None, + "status": kwargs.get("status", "running"), + "stopped_by_user": kwargs.get("stopped_by_user", False), + "port": kwargs.get("port", None), + "device_id": kwargs.get("device_id", 0), + "workflow_log_path": kwargs.get("workflow_log_path", None), + } + data["next_id"] += 1 + data["records"].append(record) + _save_raw(data) + return ModelDeployment._from_dict(record) + + def all(self) -> _QuerySet: + with _lock: + data = _load_raw() + return _QuerySet(list(data["records"])) + + def filter(self, **kwargs) -> _QuerySet: + return self.all().filter(**kwargs) + + def get(self, **kwargs) -> "ModelDeployment": + return self.all().get(**kwargs) + + +class ModelDeployment: + class DoesNotExist(Exception): + pass + + objects: _Manager # set below + + def __init__(self): + self.id: Optional[int] = None + self.container_id: str = "" + self.container_name: str = "" + self.model_name: str = "" + self.device: str = "" + self.deployed_at: Optional[datetime] = None + self.stopped_at: Optional[datetime] = None + self.status: str = "running" + self.stopped_by_user: bool = False + self.port: Optional[int] = None + self.device_id: int = 0 + self.workflow_log_path: Optional[str] = None + + @classmethod + def _from_dict(cls, d: dict) -> "ModelDeployment": + obj = cls() + obj.id = d.get("id") + obj.container_id = d.get("container_id", "") + obj.container_name = d.get("container_name", "") + obj.model_name = d.get("model_name", "") + obj.device = 
d.get("device", "") + obj.deployed_at = _parse_dt(d.get("deployed_at")) + obj.stopped_at = _parse_dt(d.get("stopped_at")) + obj.status = d.get("status", "running") + obj.stopped_by_user = d.get("stopped_by_user", False) + obj.port = d.get("port") + obj.device_id = d.get("device_id", 0) + obj.workflow_log_path = d.get("workflow_log_path") + return obj + + def _to_dict(self) -> dict: + return { + "id": self.id, + "container_id": self.container_id, + "container_name": self.container_name, + "model_name": self.model_name, + "device": self.device, + "deployed_at": self.deployed_at.isoformat() if self.deployed_at else None, + "stopped_at": self.stopped_at.isoformat() if self.stopped_at else None, + "status": self.status, + "stopped_by_user": self.stopped_by_user, + "port": self.port, + "device_id": self.device_id, + "workflow_log_path": self.workflow_log_path, + } + + def save(self) -> None: + with _lock: + data = _load_raw() + for i, r in enumerate(data["records"]): + if r.get("id") == self.id: + data["records"][i] = self._to_dict() + _save_raw(data) + return + # Not found — append as new (shouldn't happen in normal flow) + logger.warning(f"save() called on deployment id={self.id} not found in store; appending") + data["records"].append(self._to_dict()) + _save_raw(data) + + def __str__(self) -> str: + return f"{self.model_name} on {self.device} - {self.status}" + + +ModelDeployment.objects = _Manager() diff --git a/app/backend/docker_control/docker_utils.py b/app/backend/docker_control/docker_utils.py index 0c4ab8f5..e98d775a 100644 --- a/app/backend/docker_control/docker_utils.py +++ b/app/backend/docker_control/docker_utils.py @@ -50,6 +50,33 @@ def _ensure_network(): # Initialize network on module load _ensure_network() +# When deploying a single-chip model on a multi-chip board, the inference +# server needs the constituent single-chip device name (e.g. "n300" for one +# chip of a T3K board), not the board-level name ("t3k"). 
+_BOARD_TO_SINGLE_CHIP_DEVICE = { + # Multi-chip Wormhole boards → constituent N300 chip + "T3K": "n300", + "T3000": "n300", + "N300x4": "n300", + "N150X4": "n150", + # Multi-chip Blackhole boards → constituent single-chip device + "P150X4": "p150", + "P150X8": "p150", + "P300Cx2": "p300c", + "P300Cx4": "p300c", + # Galaxy (N300-based) + "GALAXY": "n300", + "GALAXY_T3K": "n300", + # True single-chip boards are unchanged + "N150": "n150", + "N300": "n300", + "E150": "e150", + "P100": "p100", + "P150": "p150", + "P300c": "p300c", + "unknown": "cpu", +} + def map_board_type_to_device_name(board_type): """Map our internal board type names to TT Inference Server device names""" @@ -86,217 +113,114 @@ def map_board_type_to_device_name(board_type): logger.info(f"Mapped board type '{board_type}' to device name '{device_name}'") return device_name -def run_container(impl, weights_id): +def run_container(impl, weights_id, device_id=0): """Run a docker container via TT Inference Server API""" - if (impl.model_type == ModelTypes.CHAT): - # For chat models, we use the TT Inference Server API to run the container - try: - logger.info(f"Calling TT Inference Server API") - logger.info(f"run_container called for {impl.model_name}") - - board_type = detect_board_type() + try: + logger.info(f"Calling TT Inference Server API") + logger.info(f"run_container called for {impl.model_name}") + + # Determine the correct inference-server device name. + # A single-chip model on a multi-chip board (e.g. Llama-8B on T3K) + # must use the constituent chip device ("n300"), not the board device + # ("t3k"). We use chips_required + board_type to pick the right name. 
+ from shared_config.model_config import infer_chips_required + board_type = detect_board_type() + chips_required = infer_chips_required(impl.device_configurations) + if chips_required == 1: + device = _BOARD_TO_SINGLE_CHIP_DEVICE.get(board_type, "cpu") + else: device = map_board_type_to_device_name(board_type) - - # Create payload for the API call - payload = { - "model": impl.model_name, - "workflow": "server", # Default workflow for container runs - "device": device, # Use mapped device name - "docker_server": True, - "dev_mode": True - } - - logger.info(f"API payload: {payload}") - - # Make POST request to TT Inference Server API - api_url = "http://172.18.0.1:8001/run" + logger.info( + f"Device name '{device}' for {impl.model_name} " + f"(board={board_type}, chips_required={chips_required})" + ) + + BASE_SERVICE_PORT = 7000 + + # Create payload for the API call + payload = { + "model": impl.model_name, + "workflow": "server", # Default workflow for container runs + "device": device, # Use mapped device name + "docker_server": True, + "dev_mode": True, + } - response = requests.post( - api_url, - json=payload, - timeout=DEPLOYMENT_TIMEOUT_SECONDS # 5 hour timeout for container startup and weight downloads - ) + # Only pin to a specific chip slot for multi-chip boards + if chips_required > 1: + payload["device_id"] = str(device_id) + payload["service_port"] = str(BASE_SERVICE_PORT + device_id) + service_port = BASE_SERVICE_PORT + device_id + else: + service_port = BASE_SERVICE_PORT # single chip always uses base port - if response.status_code in [200, 202]: - api_result = response.json() - logger.info(f"API call successful (status {response.status_code}): {api_result}") - logger.info(f"api_result contains docker_log_file_path: {'docker_log_file_path' in api_result}") - if 'docker_log_file_path' in api_result: - logger.info(f"api_result['docker_log_file_path'] = {api_result.get('docker_log_file_path')}") - else: - logger.warning(f"docker_log_file_path NOT found 
in api_result. Available keys: {list(api_result.keys())}") + # media/forge models require skipping hw validation; vLLM models do not + if impl.model_type != ModelTypes.CHAT: + payload["skip_system_sw_validation"] = True - # Update deploy cache on success - update_deploy_cache() - - # Notify agent about new container deployment - notify_agent_of_new_container(api_result["container_name"]) - - # Save deployment record to database - container_id = None - container_name = "unknown" - try: - container_id = api_result.get("container_id") - container_name = api_result.get("container_name", "unknown") - - # If container_id is not in response, try to get it from Docker by name - if not container_id and container_name: - try: - docker_client = get_docker_client() - container_info = docker_client.get_container(container_name) - container_id = container_info.get("id") - logger.info(f"Retrieved container_id {container_id} from Docker for {container_name}") - except Exception as docker_error: - logger.warning(f"Could not get container_id from Docker: {docker_error}") - # Use container_name as fallback ID if we can't get the actual ID - container_id = container_name - - if container_id: - # Extract workflow log path from API response - workflow_log_path = api_result.get("docker_log_file_path") - logger.info(f"Extracted workflow_log_path from api_result: {workflow_log_path}") - logger.info(f"workflow_log_path type: {type(workflow_log_path)}, is None: {workflow_log_path is None}") - - ModelDeployment.objects.create( - container_id=container_id, - container_name=container_name, - model_name=impl.model_name, - device=device, - status="running", - stopped_by_user=False, - port=7000, # TT Inference Server default port - workflow_log_path=workflow_log_path - ) - logger.info(f"Saved deployment record for {container_name} (ID: {container_id})") - if workflow_log_path: - logger.info(f"Workflow log path saved: {workflow_log_path}") - else: - logger.warning(f"Workflow log path is None/empty 
for {container_name}") - else: - logger.warning(f"Could not save deployment record: no container_id or container_name") - except Exception as e: - import traceback - logger.error( - f"Failed to save deployment record for {container_name} (ID: {container_id}): {type(e).__name__}: {e}\n" - f"Traceback: {traceback.format_exc()}" - ) - # Don't fail the deployment if we can't save the record - - return { - "status": "success", - "container_name": api_result["container_name"], - "container_id": api_result.get("container_id"), # Pass through container_id - "job_id": api_result.get("job_id") or api_result.get("container_id"), # Use job_id or container_id as fallback - "api_response": api_result - } - else: - error_msg = f"API call failed with status {response.status_code}: {response.text}" - logger.error(error_msg) - - # Try to extract job_id and error details from response - job_id = None - error_detail = error_msg - try: - error_data = response.json() - if isinstance(error_data, dict): - # Extract job_id if present - job_id = error_data.get('job_id') - # Extract error message if present - error_detail = error_data.get('message', error_msg) - logger.info(f"Extracted job_id from error response: {job_id}") - except Exception as parse_error: - logger.warning(f"Could not parse error response: {parse_error}") - - return { - "status": "error", - "message": error_detail, - "job_id": job_id - } - - except requests.exceptions.RequestException as e: - error_msg = f"Network error calling TT Inference Server API: {str(e)}" - logger.error(error_msg) - return {"status": "error", "message": error_msg} - except Exception as e: - error_msg = f"Unexpected error in run_container: {str(e)}" - logger.error(error_msg) - return {"status": "error", "message": error_msg} - else: - # For non-chat models, we use the docker client to run the container - try: - logger.info(f"run_container called for {impl.model_name}") - - - run_kwargs = copy.deepcopy(impl.docker_config) - # handle runtime 
configuration changes to docker kwargs - device_mounts = get_devices_mounts(impl) - if device_mounts: - run_kwargs.update({"devices": device_mounts}) - run_kwargs.update({"ports": get_port_mounts(impl)}) - # add bridge inter-container network - run_kwargs.update({"network": backend_config.docker_bridge_network_name}) - # add unique container name suffixing with host port - host_port = list(run_kwargs["ports"].values())[0] - logger.info(f"!!!host_port:= {host_port}") - run_kwargs.update({"name": f"{impl.container_base_name}_p{host_port}"}) - run_kwargs.update({"hostname": f"{impl.container_base_name}_p{host_port}"}) - # add environment variables - run_kwargs["environment"]["MODEL_WEIGHTS_ID"] = weights_id - # container path, not backend path - run_kwargs["environment"]["MODEL_WEIGHTS_PATH"] = get_model_weights_path( - impl.model_container_weights_dir, weights_id - ) - logger.info(f"run_kwargs:= {run_kwargs}") - - # Convert run_kwargs to docker-control-service API format - docker_client = get_docker_client() - api_kwargs = { - "image": impl.image_version, - "name": run_kwargs.get("name"), - "command": run_kwargs.get("command"), - "environment": run_kwargs.get("environment", {}), - "ports": run_kwargs.get("ports", {}), - "volumes": run_kwargs.get("volumes"), - "network": run_kwargs.get("network"), - "detach": run_kwargs.get("detach", True), - } + logger.info(f"API payload: {payload}") - # Add devices if present - if "devices" in run_kwargs: - api_kwargs["devices"] = run_kwargs["devices"] + # Make POST request to TT Inference Server API + api_url = "http://172.18.0.1:8001/run" - # Add hostname if present - if "hostname" in run_kwargs: - api_kwargs["hostname"] = run_kwargs["hostname"] + response = requests.post( + api_url, + json=payload, + timeout=DEPLOYMENT_TIMEOUT_SECONDS # 5 hour timeout for container startup and weight downloads + ) - container_result = docker_client.run_container(**api_kwargs) - logger.info(f"Container started via docker-control-service: 
{container_result}") + if response.status_code in [200, 202]: + api_result = response.json() + logger.info(f"API call successful (status {response.status_code}): {api_result}") + logger.info(f"api_result contains docker_log_file_path: {'docker_log_file_path' in api_result}") + if 'docker_log_file_path' in api_result: + logger.info(f"api_result['docker_log_file_path'] = {api_result.get('docker_log_file_path')}") + else: + logger.warning(f"docker_log_file_path NOT found in api_result. Available keys: {list(api_result.keys())}") - # Extract container info from API response - container_id = container_result.get("id") - container_name = container_result.get("name") - # on changes to containers, update deploy cache + # Update deploy cache on success update_deploy_cache() # Notify agent about new container deployment - notify_agent_of_new_container(container_name) + notify_agent_of_new_container(api_result["container_name"]) - # Save deployment record to database + # Create the deployment record only after successful API response + # (never before — avoids stale "starting" records blocking slots on failure) + container_id = None + container_name = "unknown" try: - # Get device from impl configuration - device_config = impl.device_configurations[0] if impl.device_configurations else None - device_name = device_config.name if device_config else "unknown" - - ModelDeployment.objects.create( - container_id=container_id, - container_name=container_name, - model_name=impl.model_name, - device=device_name, - status="running", - stopped_by_user=False, - port=host_port - ) - logger.info(f"Saved deployment record for {container_name} (ID: {container_id})") + container_id = api_result.get("container_id") + container_name = api_result.get("container_name", "unknown") + + # If container_id is not in response, try to get it from Docker by name + if not container_id and container_name: + try: + docker_client = get_docker_client() + container_info = 
docker_client.get_container(container_name) + container_id = container_info.get("id") + logger.info(f"Retrieved container_id {container_id} from Docker for {container_name}") + except Exception as docker_error: + logger.warning(f"Could not get container_id from Docker: {docker_error}") + container_id = container_name + + if container_id: + workflow_log_path = api_result.get("docker_log_file_path") + logger.info(f"Extracted workflow_log_path from api_result: {workflow_log_path}") + + ModelDeployment.objects.create( + container_id=container_id, + container_name=container_name, + model_name=impl.model_name, + device=device, + device_id=device_id, + status="running", + stopped_by_user=False, + port=service_port, + workflow_log_path=workflow_log_path + ) + logger.info(f"Saved deployment record for {container_name} (ID: {container_id})") + else: + logger.warning(f"Could not save deployment record: no container_id or container_name") except Exception as e: import traceback logger.error( @@ -307,13 +231,43 @@ def run_container(impl, weights_id): return { "status": "success", - "container_id": container_id, - "container_name": container_name, - "service_route": impl.service_route, - "port_bindings": run_kwargs["ports"], + "container_name": api_result["container_name"], + "container_id": api_result.get("container_id"), # Pass through container_id + "job_id": api_result.get("job_id") or api_result.get("container_id"), # Use job_id or container_id as fallback + "api_response": api_result + } + else: + error_msg = f"API call failed with status {response.status_code}: {response.text}" + logger.error(error_msg) + + # Try to extract job_id and error details from response + job_id = None + error_detail = error_msg + try: + error_data = response.json() + if isinstance(error_data, dict): + # Extract job_id if present + job_id = error_data.get('job_id') + # Extract error message if present + error_detail = error_data.get('message', error_msg) + logger.info(f"Extracted job_id from 
error response: {job_id}") + except Exception as parse_error: + logger.warning(f"Could not parse error response: {parse_error}") + + return { + "status": "error", + "message": error_detail, + "job_id": job_id } - except Exception as e: - return {"status": "error", "message": str(e)} + + except requests.exceptions.RequestException as e: + error_msg = f"Network error calling TT Inference Server API: {str(e)}" + logger.error(error_msg) + return {"status": "error", "message": error_msg} + except Exception as e: + error_msg = f"Unexpected error in run_container: {str(e)}" + logger.error(error_msg) + return {"status": "error", "message": error_msg} def run_agent_container(container_name, port_bindings, impl): # runs agent container after associated llm container runs @@ -355,22 +309,47 @@ def get_runtime_device_configuration(device_configurations): return next(iter(device_configurations)) -def get_devices_mounts(impl): +def get_devices_mounts(impl, device_id=0): device_config = get_runtime_device_configuration(impl.device_configurations) assert isinstance(device_config, DeviceConfigurations) - # TODO: add logic to handle multiple devices and multiple containers - single_device_mounts = ["/dev/tenstorrent/0:/dev/tenstorrent/0"] + + # Single-chip device configurations: pin to the requested chip slot + single_chip_configs = { + DeviceConfigurations.E150, + DeviceConfigurations.N150, + DeviceConfigurations.N150_WH_ARCH_YAML, + DeviceConfigurations.N300, + DeviceConfigurations.N300_WH_ARCH_YAML, + DeviceConfigurations.P100, + DeviceConfigurations.P150, + DeviceConfigurations.P300c, + } + + # Multi-chip configurations manage their own chip allocation; expose full directory all_device_mounts = ["/dev/tenstorrent:/dev/tenstorrent"] - device_map = { - DeviceConfigurations.E150: single_device_mounts, - DeviceConfigurations.N150: single_device_mounts, - DeviceConfigurations.N150_WH_ARCH_YAML: single_device_mounts, - DeviceConfigurations.N300: single_device_mounts, - 
DeviceConfigurations.N300x4_WH_ARCH_YAML: all_device_mounts, - DeviceConfigurations.N300x4: all_device_mounts, + + if device_config in single_chip_configs: + return [f"/dev/tenstorrent/{device_id}:/dev/tenstorrent/{device_id}"] + + # Multi-chip (T3K, Galaxy, N300x4, P150X4, P150X8, etc.) + multi_chip_configs = { + DeviceConfigurations.N150X4, + DeviceConfigurations.N300x4, + DeviceConfigurations.N300x4_WH_ARCH_YAML, + DeviceConfigurations.T3K, + DeviceConfigurations.T3K_RING, + DeviceConfigurations.T3K_LINE, + DeviceConfigurations.P150X4, + DeviceConfigurations.P150X8, + DeviceConfigurations.P300Cx2, + DeviceConfigurations.P300Cx4, + DeviceConfigurations.GALAXY, + DeviceConfigurations.GALAXY_T3K, } - device_mounts = device_map.get(device_config) - return device_mounts + if device_config in multi_chip_configs: + return all_device_mounts + + return None def get_port_mounts(impl): @@ -493,6 +472,15 @@ def parse_env_var_str(env_var_list): def get_container_status(): containers = get_managed_containers() + + # Build container_id → device_id lookup from deployment database + device_id_lookup: dict = {} + try: + for dep in ModelDeployment.objects.filter(status__in=["starting", "running"]): + device_id_lookup[dep.container_id] = dep.device_id + except Exception as e: + logger.warning(f"Could not load device_id lookup: {e}") + data = {} for con in containers: data[con.id] = { @@ -508,6 +496,7 @@ def get_container_status(): for k, v in con.attrs.get("NetworkSettings").get("Networks").items() }, "env_vars": parse_env_var_str(con.attrs.get("Config").get("Env")), + "device_id": device_id_lookup.get(con.id), } return data @@ -550,12 +539,12 @@ def update_deploy_cache(): if is_tt_inference_container: logger.info(f"Detected TT Inference Server container: {con['name']} (ID: {con_id})") - # Try to find the model implementation from the database + # Try to find the model implementation from the deployment store deployment_found = False try: from docker_control.models import 
ModelDeployment deployment = ModelDeployment.objects.filter(container_id=con_id).first() - + if deployment: # Find the model implementation by model name model_impl = None @@ -565,11 +554,12 @@ def update_deploy_cache(): logger.info(f"Matched TT Inference Server container to model_impl: {model_impl.model_name}") deployment_found = True break - + if not model_impl: logger.warning(f"Could not find model_impl for {deployment.model_name} in container {con['name']}") else: - logger.warning(f"No deployment record found for TT Inference Server container {con_id}") + # No record by container_id — could be a pre-existing container or still starting up + logger.debug(f"No deployment record found for TT Inference Server container {con_id}") except Exception as e: # Check if this is a migration/database issue error_str = str(e).lower() @@ -582,13 +572,25 @@ def update_deploy_cache(): if not deployment_found: logger.info(f"Using fallback logic to match container {con['name']}") # Try to match by container name + # First try exact match model_impl = None for k, v in model_implmentations.items(): - if v.model_name in con["name"]: + if v.model_name == con["name"]: model_impl = v - logger.info(f"Matched container by name to model_impl: {model_impl.model_name}") + logger.info(f"Matched container by exact name to model_impl: {model_impl.model_name}") break - + + # Fall back to longest-substring match (prevents short names like "Llama-3.1-8B" + # from beating "Llama-3.1-8B-Instruct" on container name "Llama-3.1-8B-Instruct") + if not model_impl: + best_match_len = 0 + for k, v in model_implmentations.items(): + if v.model_name in con["name"] and len(v.model_name) > best_match_len: + model_impl = v + best_match_len = len(v.model_name) + if model_impl: + logger.info(f"Matched container by name substring to model_impl: {model_impl.model_name}") + if not model_impl: logger.warning(f"Could not match TT Inference Server container {con['name']} to any model_impl. 
Skipping.") continue @@ -625,11 +627,23 @@ def update_deploy_cache(): hostname = con["networks"][backend_config.docker_bridge_network_name][ "DNSNames" ][0] + # Use the actual container port from port bindings instead of the + # static model_impl.service_port (which is always 7000). Multi-slot + # deployments bind to 7000+device_id, so we must resolve the real port. + actual_port = model_impl.service_port # default fallback + port_bindings = con.get("port_bindings", {}) + if port_bindings: + container_port_key = next(iter(port_bindings.keys()), None) + if container_port_key: + try: + actual_port = int(container_port_key.split("/")[0]) + except (ValueError, IndexError): + pass con["internal_url"] = ( - f"{hostname}:{model_impl.service_port}{model_impl.service_route}" + f"{hostname}:{actual_port}{model_impl.service_route}" ) con["health_url"] = ( - f"{hostname}:{model_impl.service_port}{model_impl.health_route}" + f"{hostname}:{actual_port}{model_impl.health_route}" ) cache.set(con_id, con, timeout=None) logger.info(f"Added container {con['name']} (ID: {con_id[:12]}) to deploy cache") @@ -655,195 +669,92 @@ def remove_id_prefix(s): def perform_reset(): + """ + Reset the TT board using tt-smi -r (up to 2 attempts, 30-second timeout each). + + The tt-smi -s pre-check has been intentionally removed: when the board is in + a bad state tt-smi -s itself hangs, which makes recovery worse. We go + straight to tt-smi -r and let the result speak for itself. 
+ """ try: - logger.info("Running initial tt-smi -s command to check device detection.") - - # Initial check to see if Tenstorrent devices are detected - def check_device_detection(): - process = subprocess.Popen( - ["tt-smi", "-s"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - stdin=subprocess.DEVNULL, # Prevents interactive command-line interface - text=True, - ) - output = [] - detected_chips = 0 - warnings = [] - for line in iter(process.stdout.readline, ""): - logger.info(f"tt-smi output: {line.strip()}") - output.append(line) - lower_line = line.lower() - if "detected chips" in lower_line: - # Expect format like: "Detected Chips: 2" - try: - parts = line.strip().split(":") - if len(parts) == 2: - detected_chips = int(parts[1].strip().split()[0]) - except (ValueError, IndexError) as e: - warnings.append(f"Unable to parse detected chips from line: {line.strip()}") - logger.warning(f"Unable to parse detected chips from line '{line.strip()}': {e}") - if "response_q out of sync" in lower_line or "rd_ptr" in lower_line: - warnings.append(line.strip()) - if "No Tenstorrent devices detected" in line: - return { - "status": "error", - "message": "No Tenstorrent devices detected! 
Please check your hardware and try again.", - "output": "".join(output), - "http_status": 503, # Service Unavailable - } - process.stdout.close() - return_code = process.wait() - - # Parse JSON output if text parsing didn't find chips - if detected_chips == 0: - full_output = "".join(output) - try: - json_data = json.loads(full_output) - if "device_info" in json_data and isinstance(json_data["device_info"], list): - detected_chips = len(json_data["device_info"]) - logger.info(f"Detected {detected_chips} chips from JSON output") - except json.JSONDecodeError as e: - logger.warning(f"Could not parse tt-smi output as JSON: {e}") - - # If chips are detected, allow reset but surface warnings/return code - if detected_chips > 0: - if return_code != 0: - warnings.append(f"tt-smi -s exited with code {return_code}") - status_val = "success" if not warnings and return_code == 0 else "warning" - return { - "status": status_val, - "output": "".join(output), - "warnings": warnings, - "detected_chips": detected_chips, - "return_code": return_code, - } - if return_code != 0: - return { - "status": "error", - "message": f"tt-smi -s command failed with return code {return_code}. Please check if tt-smi is properly installed.", - "output": "".join(output), - "http_status": 500, # Internal Server Error - } - return { - "status": "success", - "message": "No Tenstorrent devices detected. 
tt-smi executed successfully.", - "output": "".join(output), - "detected_chips": 0, - "return_code": return_code, - } + logger.info("Starting board reset — running tt-smi -r directly (no pre-check)") - # Run the device detection check - detection_result = check_device_detection() - detection_warnings = detection_result.get("warnings", []) - detection_output = detection_result.get("output", "") - if detection_result.get("status") == "error": - return detection_result - if detection_output: - cumulative_output = [detection_output] - else: - cumulative_output = [] - if detection_warnings: - cumulative_output.append("Warnings during device detection:\n") - cumulative_output.extend([w + "\n" for w in detection_warnings]) - - logger.info("Running tt-smi reset command.") - - def stream_command_output(command): - logger.info(f"Executing command: {' '.join(command)}") - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - stdin=subprocess.DEVNULL, # Prevents interactive command-line interface - text=True, - ) - output = [] - for line in iter(process.stdout.readline, ""): - logger.info(f"Command output: {line.strip()}") - output.append(line) - process.stdout.close() - return_code = process.wait() - if return_code != 0: - logger.info(f"Command failed with return code {return_code}") - output.append(f"Command failed with return code {return_code}") - error_message = "tt-smi reset failed. Please check if:\n" - error_message += "1. The Tenstorrent device is properly connected\n" - error_message += "2. You have the correct permissions to access the device\n" - error_message += "3. The tt-smi utility is properly installed\n" - error_message += "4. 
The device firmware is up to date" - return { - "status": "error", - "message": error_message, - "output": "".join(output), - "http_status": 500, # Internal Server Error - } - else: - logger.info( - f"Command completed successfully with return code {return_code}" + # Signal that a reset is in progress so the device-state endpoint reports RESETTING + SystemResourceService.set_resetting_state() + + MAX_ATTEMPTS = 2 + last_output = "" + + for attempt in range(1, MAX_ATTEMPTS + 1): + logger.info(f"Reset attempt {attempt} of {MAX_ATTEMPTS}") + try: + process = subprocess.Popen( + ["tt-smi", "-r"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + text=True, + preexec_fn=os.setsid, ) - return {"status": "success", "output": "".join(output)} - - # Attempt software resets first (up to MAX_RESET_ATTEMPTS) - MAX_RESET_ATTEMPTS = 3 - reset_attempts = 0 - reset_success = False - - # Try tt-smi reset with retries (no reset config file; use default tt-smi behavior) - while reset_attempts < MAX_RESET_ATTEMPTS and not reset_success: - reset_attempts += 1 - logger.info(f"Reset attempt {reset_attempts} of {MAX_RESET_ATTEMPTS}") - cumulative_output.append(f"Attempting reset {reset_attempts} of {MAX_RESET_ATTEMPTS}...\n") - - # Perform reset using tt-smi default behavior (no reset_config.json) - cumulative_output.append("Executing tt-smi -r with default reset configuration.\n") - reset_result = stream_command_output(["tt-smi", "-r"]) - cumulative_output.append(reset_result.get('output', '') + "\n") - - if reset_result.get("status") == "success": - logger.info(f"Reset attempt {reset_attempts} succeeded") - reset_success = True - break - - logger.warning(f"Reset attempt {reset_attempts} failed") - # Small delay between attempts - time.sleep(2) - - # If all reset attempts failed - if not reset_success: - all_output = "".join(cumulative_output) - logger.error(f"All {MAX_RESET_ATTEMPTS} reset attempts failed") - return { - "status": "error", - "message": 
f"All {MAX_RESET_ATTEMPTS} reset attempts failed using tt-smi --reset command.", - "output": all_output, - "http_status": 500 - } - all_output = "".join(cumulative_output) - if reset_success: - return { - "status": "success", - "message": f"Reset successful after {reset_attempts} attempt(s)", - "output": all_output, - "warnings": detection_warnings, - "http_status": 200 - } - else: - return { - "status": "error", - "message": "All reset attempts failed with no specific error", - "output": all_output, - "warnings": detection_warnings, - "http_status": 500 - } + try: + stdout, _ = process.communicate(timeout=30) + last_output = stdout + logger.info(f"tt-smi -r attempt {attempt} output: {stdout.strip()!r:.200}") + + if process.returncode == 0: + logger.info(f"Reset succeeded on attempt {attempt}") + SystemResourceService.clear_device_state_cache() + return { + "status": "success", + "message": f"Board reset successfully after {attempt} attempt(s)", + "attempts_used": attempt, + "output": stdout, + "http_status": 200, + } + + logger.warning( + f"Reset attempt {attempt} failed: exit code {process.returncode}" + ) + + except subprocess.TimeoutExpired: + logger.warning(f"Reset attempt {attempt} timed out after 30s") + try: + os.killpg(os.getpgid(process.pid), signal.SIGTERM) + process.wait(timeout=2) + except Exception: + try: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception: + pass + last_output = "(timeout)" + + except Exception as exc: + logger.error(f"Reset attempt {attempt} raised exception: {exc}") + last_output = str(exc) + + # All attempts failed + logger.error(f"All {MAX_ATTEMPTS} reset attempts failed") + SystemResourceService.clear_device_state_cache() + return { + "status": "error", + "message": ( + f"Board did not recover after {MAX_ATTEMPTS} reset attempts. " + "Manual intervention may be required." 
+ ), + "attempts_used": MAX_ATTEMPTS, + "output": last_output, + "http_status": 500, + } except Exception as e: - logger.exception("Exception occurred during reset operation.") + logger.exception("Unexpected error during reset operation") + SystemResourceService.clear_device_state_cache() return { "status": "error", "message": str(e), - "output": "An exception occurred during the reset operation.", + "attempts_used": 0, + "output": "", "http_status": 500, } diff --git a/app/backend/docker_control/health_monitor.py b/app/backend/docker_control/health_monitor.py index 1e1a8e86..7e0faf0b 100644 --- a/app/backend/docker_control/health_monitor.py +++ b/app/backend/docker_control/health_monitor.py @@ -16,15 +16,45 @@ _stop_monitoring = False +def _cleanup_stale_starting_records(): + """Remove pending 'starting' records older than 10 minutes. + + These are left behind when a deployment API call fails after the + pending record was already created. They permanently block their + chip slot if not cleaned up. 
+ """ + try: + stale_cutoff = timezone.now() - timezone.timedelta(minutes=10) + starting_deployments = ModelDeployment.objects.filter(status="starting") + for dep in starting_deployments: + if ( + dep.container_id.startswith("pending_") + and dep.deployed_at is not None + and dep.deployed_at < stale_cutoff + ): + logger.info( + f"Cleaning up stale 'starting' record: {dep.model_name} " + f"(id={dep.id}, deployed_at={dep.deployed_at})" + ) + dep.status = "failed" + dep.stopped_at = timezone.now() + dep.save() + except Exception as e: + logger.error(f"Error cleaning up stale starting records: {e}") + + def check_container_health(): - """Check for containers that died unexpectedly""" + """Check for containers that died unexpectedly and clean up stale records""" try: + # Clean up stale pending records that block chip slots + _cleanup_stale_starting_records() + # Get all running deployments from database running_deployments = ModelDeployment.objects.filter(status="running") - + if not running_deployments.exists(): return - + logger.debug(f"Checking health of {running_deployments.count()} running deployments") # Check actual Docker container status via docker-control-service diff --git a/app/backend/docker_control/migrations/0001_initial.py b/app/backend/docker_control/migrations/0001_initial.py deleted file mode 100644 index 0b4c168d..00000000 --- a/app/backend/docker_control/migrations/0001_initial.py +++ /dev/null @@ -1,33 +0,0 @@ -# Generated by Django 5.0.4 on 2025-11-12 15:18 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='ModelDeployment', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('container_id', models.CharField(db_index=True, max_length=255, unique=True)), - ('container_name', models.CharField(db_index=True, max_length=255)), - ('model_name', 
models.CharField(db_index=True, max_length=255)), - ('device', models.CharField(max_length=50)), - ('deployed_at', models.DateTimeField(auto_now_add=True, db_index=True)), - ('stopped_at', models.DateTimeField(blank=True, null=True)), - ('status', models.CharField(db_index=True, default='running', max_length=50)), - ('stopped_by_user', models.BooleanField(default=False)), - ('port', models.IntegerField(blank=True, null=True)), - ], - options={ - 'ordering': ['-deployed_at'], - 'indexes': [models.Index(fields=['status', '-deployed_at'], name='docker_cont_status_a5afde_idx'), models.Index(fields=['model_name', '-deployed_at'], name='docker_cont_model_n_2ecff9_idx')], - }, - ), - ] diff --git a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py b/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py deleted file mode 100644 index 518dde93..00000000 --- a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 5.0.4 on 2025-11-12 21:35 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('docker_control', '0001_initial'), - ] - - operations = [ - migrations.AddField( - model_name='modeldeployment', - name='workflow_log_path', - field=models.CharField(blank=True, help_text='Path to workflow log file from tt-inference-server', max_length=512, null=True), - ), - ] diff --git a/app/backend/docker_control/models.py b/app/backend/docker_control/models.py index a94f60ff..7f6b1f02 100644 --- a/app/backend/docker_control/models.py +++ b/app/backend/docker_control/models.py @@ -2,39 +2,6 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -from django.db import models -from django.utils import timezone +from docker_control.deployment_store import ModelDeployment - -class ModelDeployment(models.Model): - """Track all model deployments with full history""" - # Deployment identification - 
container_id = models.CharField(max_length=255, unique=True, db_index=True) - container_name = models.CharField(max_length=255, db_index=True) - - # Model information - model_name = models.CharField(max_length=255, db_index=True) - device = models.CharField(max_length=50) # n150, n300, etc. - - # Deployment metadata - deployed_at = models.DateTimeField(auto_now_add=True, db_index=True) - stopped_at = models.DateTimeField(null=True, blank=True) - - # Status tracking - status = models.CharField(max_length=50, default="running", db_index=True) - # Choices: starting, running, stopped, exited, dead, error - stopped_by_user = models.BooleanField(default=False) # True if user clicked stop/delete - - # Container details - port = models.IntegerField(null=True, blank=True) - workflow_log_path = models.CharField(max_length=512, null=True, blank=True, help_text="Path to workflow log file from tt-inference-server") - - class Meta: - ordering = ['-deployed_at'] - indexes = [ - models.Index(fields=['status', '-deployed_at']), - models.Index(fields=['model_name', '-deployed_at']), - ] - - def __str__(self): - return f"{self.model_name} on {self.device} - {self.status}" +__all__ = ["ModelDeployment"] diff --git a/app/backend/docker_control/urls.py b/app/backend/docker_control/urls.py index ae37307f..b94fabd0 100644 --- a/app/backend/docker_control/urls.py +++ b/app/backend/docker_control/urls.py @@ -9,6 +9,7 @@ StopView, ContainersView, StatusView, + ChipStatusView, DeployView, DeploymentProgressView, DeploymentLogsView, @@ -32,6 +33,7 @@ path("deploy/progress/stream//", views.DeploymentProgressStreamView.as_view(), name="deployment-progress-stream"), path("stop/", views.StopView.as_view()), path("status/", views.StatusView.as_view()), + path("chip-status/", views.ChipStatusView.as_view(), name="chip-status"), path("redeploy/", views.RedeployView.as_view()), path("reset_board/", views.ResetBoardView.as_view()), path("docker/image_status//", views.ImageStatusView.as_view(), 
name="docker-image-status"), diff --git a/app/backend/docker_control/views.py b/app/backend/docker_control/views.py index 741a8bcf..bbbb9461 100644 --- a/app/backend/docker_control/views.py +++ b/app/backend/docker_control/views.py @@ -11,10 +11,11 @@ from rest_framework.renderers import JSONRenderer from django.views.decorators.csrf import csrf_exempt from django.utils.decorators import method_decorator -import json +import json import shutil import subprocess import os +from pathlib import Path import re import os @@ -43,6 +44,24 @@ logger = get_logger(__name__) logger.info(f"importing {__name__}") +# Build model_name → status lookup from catalog JSON +_CATALOG_PATH = Path(__file__).parent.parent / "shared_config/models_from_inference_server.json" +try: + _catalog = json.loads(_CATALOG_PATH.read_text()) + _status_lookup: dict[str, str | None] = {m["model_name"]: m.get("status") for m in _catalog["models"]} +except Exception: + logger.warning(f"Could not load model catalog from {_CATALOG_PATH}; status will be null for all models") + _status_lookup = {} + +# Manual compatibility overrides: model names that are always shown as compatible +# (e.g. when sync JSON device_configurations don't match detected board) +_OVERRIDE_PATH = Path(__file__).parent.parent / "shared_config/model_compatibility_overrides.json" +try: + _override_data = json.loads(_OVERRIDE_PATH.read_text()) + _compatibility_override_names: set[str] = set(_override_data.get("model_names", [])) +except Exception: + _compatibility_override_names = set() + # Track when deployment started deployment_start_times = {} # {job_id: timestamp} - Track when deployment started @@ -179,16 +198,30 @@ def get(self, request, *args, **kwargs): for board, devices in board_to_device_map.items(): if board != 'unknown' and bool(set(devices).intersection(impl.device_configurations)): compatible_boards.append(board) + + # Manual override: always show certain models as compatible (e.g. 
whisper when sync JSON is incomplete) + if impl.model_name in _compatibility_override_names: + is_compatible = True + if current_board != 'unknown' and current_board not in compatible_boards: + compatible_boards = list(compatible_boards) + [current_board] + logger.info(f"Model {impl.model_name}: compatibility overridden to True") logger.info(f"Model {impl.model_name}: compatible={is_compatible}, boards={compatible_boards}") - + + # Infer chip requirements for this model + from shared_config.model_config import infer_chips_required + chips_required = infer_chips_required(impl.device_configurations) + data.append({ "id": impl_id, "name": impl.model_name, "is_compatible": is_compatible, "compatible_boards": compatible_boards, "model_type": impl.model_type.value, - "current_board": current_board + "display_model_type": impl.display_model_type, + "current_board": current_board, + "status": _status_lookup.get(impl.model_name), + "chips_required": chips_required, }) return Response(data, status=status.HTTP_200_OK) @@ -200,6 +233,32 @@ def get(self, request, *args, **kwargs): return Response(data, status=status.HTTP_200_OK) +class ChipStatusView(APIView): + """API endpoint for chip slot occupancy status""" + + def get(self, request, *args, **kwargs): + """ + Get current chip slot status. + + Returns JSON with board type, total slots, and per-slot occupancy info. 
+ """ + try: + from docker_control.chip_allocator import ChipSlotAllocator + + allocator = ChipSlotAllocator() + status_info = allocator.get_chip_status() + + return Response(status_info, status=status.HTTP_200_OK) + + except Exception as e: + logger.error(f"Error getting chip status: {str(e)}") + return Response( + { + "error": "Failed to get chip status", + "message": str(e) + }, + status=status.HTTP_500_INTERNAL_SERVER_ERROR + ) @@ -207,16 +266,55 @@ class DeployView(APIView): def post(self, request, *args, **kwargs): serializer = DeploymentSerializer(data=request.data) if serializer.is_valid(): + from docker_control.chip_allocator import ChipSlotAllocator, AllocationError, MultiChipConflictError + impl_id = request.data.get("model_id") weights_id = request.data.get("weights_id") + + # Get manual override if in advanced mode (optional) + manual_device_id = request.data.get("device_id") + if manual_device_id is not None: + manual_device_id = int(manual_device_id) + impl = model_implmentations[impl_id] - response = run_container(impl, weights_id) - + + # Auto-allocate chip slot + try: + allocator = ChipSlotAllocator() + device_id = allocator.allocate_chip_slot( + impl.model_name, + manual_override=manual_device_id + ) + logger.info(f"Allocated device_id={device_id} for {impl.model_name}") + + except MultiChipConflictError as e: + logger.warning(f"Multi-chip conflict for {impl.model_name}: {str(e)}") + return Response({ + "status": "error", + "error_type": "multi_chip_conflict", + "message": str(e), + "conflicts": e.conflicts # List of conflicting deployments + }, status=status.HTTP_409_CONFLICT) + + except AllocationError as e: + logger.warning(f"Allocation failed for {impl.model_name}: {str(e)}") + return Response({ + "status": "error", + "error_type": "allocation_failed", + "message": str(e) + }, status=status.HTTP_409_CONFLICT) + + # Continue with deployment using allocated device_id + response = run_container(impl, weights_id, device_id=device_id) + + # Add 
allocated_device_id to response + response["allocated_device_id"] = device_id + # Ensure job_id is set for progress tracking # Use job_id from API response, or fallback to container_id or container_name if not response.get("job_id"): response["job_id"] = response.get("container_id") or response.get("container_name") - + # Check if deployment failed if response.get("status") == "error": logger.error(f"Deployment failed: {response.get('message', 'Unknown error')}") @@ -224,14 +322,14 @@ def post(self, request, *args, **kwargs): response, status=status.HTTP_500_INTERNAL_SERVER_ERROR ) - + # Refresh tt-smi cache after successful deployment if response.get("status") == "success": try: SystemResourceService.force_refresh_tt_smi_cache() except Exception as e: logger.warning(f"Failed to refresh tt-smi cache after deployment: {e}") - + return Response(response, status=status.HTTP_201_CREATED) else: return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) @@ -706,14 +804,7 @@ def get(self, request, model_id): logger.info(f"Checking status for image: {image_name}:{image_tag}") image_status = check_image_exists(image_name, image_tag) logger.info(f"Image status result: {image_status}") - - # Add pull progress if available - if model_id in pull_progress: - image_status['pull_in_progress'] = True - image_status['progress'] = pull_progress[model_id] - else: - image_status['pull_in_progress'] = False - + image_status['pull_in_progress'] = False return Response(image_status, status=status.HTTP_200_OK) except KeyError: logger.warning(f"Model {model_id} not found in model_implementations") @@ -1172,6 +1263,7 @@ def get(self, request): 'container_name': deployment.container_name, 'model_name': deployment.model_name, 'device': deployment.device, + 'device_id': deployment.device_id, 'deployed_at': deployment.deployed_at.isoformat() if deployment.deployed_at else None, 'stopped_at': deployment.stopped_at.isoformat() if deployment.stopped_at else None, 'status': 
deployment.status, diff --git a/app/backend/model_control/metrics_tracker.py b/app/backend/model_control/metrics_tracker.py index fcc948ee..091d5180 100644 --- a/app/backend/model_control/metrics_tracker.py +++ b/app/backend/model_control/metrics_tracker.py @@ -25,6 +25,20 @@ def __init__(self): self.prompt_tokens: int = 0 self.last_token_count: int = 0 + def record_content_token(self) -> None: + """Record arrival of a single content token (from delta chunks)""" + current_time = time.time() + if self.first_token_time is None: + self.first_token_time = current_time + self.token_times.append(current_time) + self.num_tokens += 1 + self.last_token_count = self.num_tokens + + def set_prompt_tokens(self, prompt_tokens: int) -> None: + """Set prompt token count from usage data""" + if self.prompt_tokens == 0: + self.prompt_tokens = prompt_tokens + def record_token(self, completion_tokens: int, prompt_tokens: int = 0) -> None: """ Record token arrival from usage data diff --git a/app/backend/model_control/model_utils.py b/app/backend/model_control/model_utils.py index 4e91e214..9b6454b5 100644 --- a/app/backend/model_control/model_utils.py +++ b/app/backend/model_control/model_utils.py @@ -26,6 +26,53 @@ encoded_jwt = jwt.encode(json_payload, backend_config.jwt_secret, algorithm="HS256") AUTH_TOKEN = os.getenv('CLOUD_CHAT_UI_AUTH_TOKEN', '') +def messages_to_prompt(messages: list) -> str: + """Convert chat messages list to a plain text prompt for base/completion models.""" + parts = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + if role == "system": + parts.append(content) + elif role == "user": + parts.append(f"User: {content}") + elif role == "assistant": + parts.append(f"Assistant: {content}") + parts.append("Assistant:") + return "\n\n".join(parts) + + +def get_model_name_from_container(internal_url: str, fallback: str) -> str: + """Query vLLM /v1/models to get the exact model name loaded in the container. 
+ + Args: + internal_url: Raw internal URL from deploy cache (e.g. "container:7000/v1/chat/completions") + fallback: Value to return if the query fails (typically hf_model_id) + + Returns: + The actual model name reported by vLLM, or fallback on any error. + """ + try: + # Strip the route path to get just host:port + # e.g. "container:7000/v1/chat/completions" -> "container:7000" + base = internal_url.split("/")[0] + models_url = f"http://{base}/v1/models" + headers = {"Authorization": f"Bearer {encoded_jwt}"} + response = requests.get(models_url, headers=headers, timeout=3) + if response.status_code == 200: + model_id = response.json()["data"][0]["id"] + logger.info(f"Resolved actual model name from /v1/models: {model_id}") + return model_id + else: + logger.warning( + f"GET {models_url} returned {response.status_code}, using fallback: {fallback}" + ) + return fallback + except Exception as e: + logger.warning(f"Failed to query /v1/models ({e}), using fallback: {fallback}") + return fallback + + def get_deploy_cache(): # the cache is initialized when by docker_control is imported def get_all_records(): @@ -45,13 +92,32 @@ def health_check(url, json_data, timeout=5): try: headers = {"Authorization": f"Bearer {encoded_jwt}"} response = requests.get(url, json=json_data, headers=headers, timeout=5) - response.raise_for_status() - logger.info(f"Health check passed: {response.status_code}") - return True, response.json() if response.content else {} + except requests.exceptions.ConnectionError as e: + # Port not yet listening — container is still starting up + logger.info(f"Health check: connection refused (starting): {e}") + return None, str(e) except requests.RequestException as e: - logger.error(f"Health check failed: {str(e)}") + logger.error(f"Health check failed (network error): {str(e)}") return False, str(e) + if response.status_code == 200: + logger.info(f"Health check passed: {response.status_code}") + return True, response.json() if response.content else {} + 
+ # 503 with "not ready" means model is still loading (media-server models) + if response.status_code == 503: + try: + body = response.json() + except Exception: + body = {} + detail = body.get("detail", "") + if "not ready" in detail.lower(): + logger.info(f"Health check: model not ready yet (starting): {detail}") + return None, detail + + logger.error(f"Health check failed: {response.status_code} {response.text[:200]}") + return False, response.text[:200] + def stream_response_from_agent_api(url, json_data): logger.info('[TRACE_FLOW_STEP_3_BACKEND_TO_AGENT] stream_response_from_agent_api called', extra={'url': url, 'json_data': json_data}) try: @@ -173,7 +239,7 @@ def stream_to_cloud_model(url, json_data): json_data["top_k"] = int(top_k) if top_k is not None else 20 json_data["top_p"] = float(top_p) if top_p is not None else 0.9 json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512 - json_data["stream_options"] = {"include_usage": True, "continuous_usage_stats": True} + json_data["stream_options"] = {"include_usage": True} # Log final parameters being used logger.info("=== Final Model Parameters ===") @@ -231,7 +297,7 @@ def stream_to_cloud_model(url, json_data): chunk_dict = json.loads(sub_chunk) logger.info(f"Successfully parsed JSON: {chunk_dict}") - usage = chunk_dict.get("usage", {}) + usage = chunk_dict.get("usage") or {} completion_tokens = usage.get("completion_tokens", 0) prompt_tokens = usage.get("prompt_tokens", 0) logger.info(f"Usage info: {usage}, completion tokens: {completion_tokens}") @@ -314,7 +380,7 @@ def stream_response_from_external_api(url, json_data): json_data["top_k"] = int(top_k) if top_k is not None else 20 json_data["top_p"] = float(top_p) if top_p is not None else 0.9 json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512 - json_data["stream_options"] = {"include_usage": True, "continuous_usage_stats": True} + json_data["stream_options"] = {"include_usage": True} # Log final parameters 
being used logger.info("=== Final Model Parameters ===") @@ -366,23 +432,30 @@ def stream_response_from_external_api(url, json_data): elif new_chunk != "": chunk_dict = json.loads(new_chunk) - usage = chunk_dict.get("usage", {}) - completion_tokens = usage.get("completion_tokens", 0) - prompt_tokens = usage.get("prompt_tokens", 0) - # Record token arrival using metrics tracker - if completion_tokens > 0: - tracker.record_token( - completion_tokens=completion_tokens, - prompt_tokens=prompt_tokens - ) - logger.info(f"Recorded token: completion={completion_tokens}, TTFT={tracker.get_ttft():.4f}s, TPOT={tracker.get_tpot():.4f}s") + # Track TTFT/TPOT from content delta chunks (accurate per-token timing) + choices = chunk_dict.get("choices") or [] + if choices: + delta_content = choices[0].get("delta", {}).get("content", "") + if delta_content: + tracker.record_content_token() + logger.info(f"Recorded token: count={tracker.num_tokens}, TTFT={tracker.get_ttft():.4f}s, TPOT={tracker.get_tpot():.4f}s") + + # Capture prompt_tokens from usage chunk at the end + usage = chunk_dict.get("usage") or {} + prompt_tokens = usage.get("prompt_tokens", 0) + if prompt_tokens > 0: + tracker.set_prompt_tokens(prompt_tokens) # Yield the current chunk yield chunk logger.info("stream_response_from_external done") + except requests.exceptions.HTTPError as e: + body = e.response.text if e.response is not None else "(no body)" + logger.error(f"HTTPError {e.response.status_code}: {body}") + yield f"error: {str(e)}" except requests.RequestException as e: logger.error(f"RequestException: {str(e)}") yield f"error: {str(e)}" diff --git a/app/backend/model_control/pipeline_views.py b/app/backend/model_control/pipeline_views.py new file mode 100644 index 00000000..d2265dad --- /dev/null +++ b/app/backend/model_control/pipeline_views.py @@ -0,0 +1,199 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +""" +Voice pipeline view: Whisper STT → LLM → TTS 
(optional). +Accepts multipart/form-data and streams SSE events to the client. +""" + +import base64 +import json +import time + +import requests +from django.http import StreamingHttpResponse +from rest_framework.views import APIView + +from model_control.model_utils import ( + encoded_jwt, + get_deploy_cache, + stream_response_from_external_api, +) +from shared_config.logger_config import get_logger + +logger = get_logger(__name__) + + +class VoicePipelineView(APIView): + """ + POST /models-api/pipeline/voice/ + + Multipart fields: + audio_file – audio blob + whisper_deploy_id – deploy_id of running Whisper + llm_deploy_id – deploy_id of running LLM + tts_deploy_id – (optional) deploy_id of running speecht5_tts + system_prompt – (optional) string + """ + + def post(self, request, *args, **kwargs): + audio_file = request.FILES.get("audio_file") + whisper_deploy_id = request.data.get("whisper_deploy_id") + llm_deploy_id = request.data.get("llm_deploy_id") + tts_deploy_id = request.data.get("tts_deploy_id") + system_prompt = request.data.get( + "system_prompt", + "You are a helpful assistant. 
Be concise.", + ) + + if not audio_file: + from rest_framework.response import Response + from rest_framework import status + return Response( + {"error": "audio_file is required"}, + status=status.HTTP_400_BAD_REQUEST, + ) + if not whisper_deploy_id or not llm_deploy_id: + from rest_framework.response import Response + from rest_framework import status + return Response( + {"error": "whisper_deploy_id and llm_deploy_id are required"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + def event_stream(): + headers = {"Authorization": f"Bearer {encoded_jwt}"} + deploy_cache = get_deploy_cache() + + # ------------------------------------------------------------------ + # Step 1: STT (Whisper) + # ------------------------------------------------------------------ + try: + whisper_deploy = deploy_cache[whisper_deploy_id] + whisper_url = "http://" + whisper_deploy["internal_url"] + file_payload = { + "file": (audio_file.name, audio_file, audio_file.content_type) + } + stt_resp = requests.post( + whisper_url, files=file_payload, headers=headers, timeout=60 + ) + stt_resp.raise_for_status() + transcript = stt_resp.json().get("text", "") + yield f"data: {json.dumps({'type': 'transcript', 'text': transcript})}\n\n" + except Exception as exc: + logger.error(f"STT step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'stt', 'message': str(exc)})}\n\n" + return + + if not transcript: + yield f"data: {json.dumps({'type': 'error', 'stage': 'stt', 'message': 'Empty transcript'})}\n\n" + return + + # ------------------------------------------------------------------ + # Step 2: LLM streaming + # ------------------------------------------------------------------ + llm_deploy = deploy_cache[llm_deploy_id] + llm_url = "http://" + llm_deploy["internal_url"] + hf_model_id = llm_deploy["model_impl"].hf_model_id + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": transcript}) 
+ + llm_payload = { + "model": hf_model_id, + "messages": messages, + "stream": True, + "max_tokens": 512, + } + + llm_full_text = "" + try: + for chunk in stream_response_from_external_api(llm_url, llm_payload): + if isinstance(chunk, bytes): + chunk = chunk.decode("utf-8") + llm_full_text += chunk + yield f"data: {json.dumps({'type': 'llm_chunk', 'text': chunk})}\n\n" + except Exception as exc: + logger.error(f"LLM step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'llm', 'message': str(exc)})}\n\n" + return + + # ------------------------------------------------------------------ + # Step 3: TTS (optional) + # ------------------------------------------------------------------ + if tts_deploy_id and llm_full_text.strip(): + try: + tts_deploy = deploy_cache[tts_deploy_id] + tts_url = "http://" + tts_deploy["internal_url"] + model_impl = tts_deploy.get("model_impl") + model_name = getattr(model_impl, "model_name", None) if model_impl else None + + # Determine if this is OpenAI-style or enqueue-style endpoint + is_openai_style = "/v1/audio/speech" in tts_url + + if is_openai_style: + # OpenAI-style: POST directly and get audio back + payload = {"model": model_name, "text": llm_full_text.strip(), "voice": "default"} + tts_resp = requests.post(tts_url, json=payload, headers=headers, timeout=120) + tts_resp.raise_for_status() + + audio_b64 = base64.b64encode(tts_resp.content).decode("utf-8") + content_type = tts_resp.headers.get("Content-Type", "audio/wav") + data_uri = f"data:{content_type};base64,{audio_b64}" + yield f"data: {json.dumps({'type': 'audio_url', 'url': data_uri})}\n\n" + else: + # Enqueue-style: POST → poll status → fetch audio + tts_resp = requests.post( + tts_url, + json={"text": llm_full_text.strip()}, + headers=headers, + timeout=30, + ) + + # If 404 on enqueue, try fallback to /v1/audio/speech + if tts_resp.status_code == 404 and "/enqueue" in tts_url: + logger.info(f"Pipeline TTS 404 on {tts_url}, trying /v1/audio/speech") + 
fallback_url = tts_url.replace("/enqueue", "/v1/audio/speech") + payload = {"model": model_name, "text": llm_full_text.strip(), "voice": "default"} + tts_resp = requests.post(fallback_url, json=payload, headers=headers, timeout=120) + tts_resp.raise_for_status() + + audio_b64 = base64.b64encode(tts_resp.content).decode("utf-8") + content_type = tts_resp.headers.get("Content-Type", "audio/wav") + data_uri = f"data:{content_type};base64,{audio_b64}" + yield f"data: {json.dumps({'type': 'audio_url', 'url': data_uri})}\n\n" + else: + tts_resp.raise_for_status() + + task_id = tts_resp.json().get("task_id") + status_url = tts_url.replace("/enqueue", f"/status/{task_id}") + + # Poll for completion + for _ in range(120): + st = requests.get(status_url, headers=headers, timeout=10) + if st.status_code != 404 and st.json().get("status") == "Completed": + break + time.sleep(1) + + audio_url = tts_url.replace("/enqueue", f"/fetch_audio/{task_id}") + audio_resp = requests.get(audio_url, headers=headers, timeout=30) + audio_resp.raise_for_status() + + audio_b64 = base64.b64encode(audio_resp.content).decode("utf-8") + content_type = audio_resp.headers.get("Content-Type", "audio/wav") + data_uri = f"data:{content_type};base64,{audio_b64}" + yield f"data: {json.dumps({'type': 'audio_url', 'url': data_uri})}\n\n" + + except Exception as exc: + logger.error(f"TTS step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'tts', 'message': str(exc)})}\n\n" + # Don't abort — transcript and LLM response were already sent + + yield f"data: {json.dumps({'type': 'done'})}\n\n" + + response = StreamingHttpResponse(event_stream(), content_type="text/event-stream") + response["Cache-Control"] = "no-cache" + response["X-Accel-Buffering"] = "no" + return response diff --git a/app/backend/model_control/test_tts_fallback.py b/app/backend/model_control/test_tts_fallback.py new file mode 100644 index 00000000..a46da61d --- /dev/null +++ 
b/app/backend/model_control/test_tts_fallback.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +""" +Tests for TTS inference view fallback behavior. +""" + +import pytest +from unittest.mock import Mock, patch, MagicMock +from rest_framework.test import APIRequestFactory +from rest_framework import status as http_status + +from model_control.views import TtsInferenceView, OpenAIAudioSpeechView + + +class TestTtsInferenceFallback: + """Test TTS inference view with fallback to /v1/audio/speech on 404.""" + + @patch('model_control.views.get_deploy_cache') + @patch('model_control.views.requests.post') + def test_tts_fallback_on_404_from_enqueue(self, mock_post, mock_cache): + """When /enqueue returns 404 for TTS media model, should retry with /v1/audio/speech.""" + # Setup mock deploy cache + mock_impl = Mock() + mock_impl.model_name = "speecht5_tts" + mock_impl.inference_engine = "media" + + mock_cache.return_value = { + "test_deploy_id": { + "internal_url": "speecht5_tts:7000/enqueue", + "model_impl": mock_impl + } + } + + # First call returns 404, second call succeeds + mock_resp_404 = Mock() + mock_resp_404.status_code = 404 + + mock_resp_success = Mock() + mock_resp_success.status_code = 200 + mock_resp_success.headers = {"Content-Type": "audio/wav"} + mock_resp_success.content = b"fake_audio_data" + + mock_post.side_effect = [mock_resp_404, mock_resp_success] + + # Create request + factory = APIRequestFactory() + request = factory.post('/models-api/tts/', { + 'deploy_id': 'test_deploy_id', + 'text': 'Hello world' + }, format='json') + + # Call view + view = TtsInferenceView.as_view() + response = view(request) + + # Verify fallback was attempted + assert mock_post.call_count == 2 + first_call_url = mock_post.call_args_list[0][0][0] + second_call_url = mock_post.call_args_list[1][0][0] + + assert "enqueue" in first_call_url + assert "/v1/audio/speech" in second_call_url + assert response.status_code 
class TestOpenAIAudioSpeechFallback:
    """Check that OpenAIAudioSpeechView retries on /v1/audio/speech after a 404."""

    @patch('model_control.views.get_deploy_cache')
    @patch('model_control.views.requests.post')
    def test_openai_audio_fallback_on_404(self, mock_post, mock_cache):
        """A 404 from the first upstream call must trigger exactly one retry."""
        # Deploy cache holding one media-engine TTS deployment on the /enqueue route.
        tts_impl = Mock(model_name="speecht5_tts", inference_engine="media")
        mock_cache.return_value = {
            "deploy_1": {
                "internal_url": "speecht5_tts:7000/enqueue",
                "model_impl": tts_impl,
            }
        }

        # Upstream answers 404 first, then returns audio on the retry.
        not_found = Mock(status_code=404)
        audio_ok = Mock(
            status_code=200,
            headers={"Content-Type": "audio/wav"},
            content=b"fake_audio_data",
        )
        mock_post.side_effect = [not_found, audio_ok]

        # Build an OpenAI-style request and run it through the view.
        payload = {'model': 'speecht5_tts', 'input': 'Hello world'}
        request = APIRequestFactory().post('/v1/audio/speech', payload, format='json')
        response = OpenAIAudioSpeechView.as_view()(request)

        # Two upstream calls, the second one against the fallback route.
        assert mock_post.call_count == 2
        retry_url = mock_post.call_args_list[1][0][0]
        assert "/v1/audio/speech" in retry_url
        assert response.status_code == 200
stream_response_from_external_api, stream_response_from_agent_api, health_check, @@ -57,6 +59,7 @@ def select_renderer(self, request, renderers, format_suffix): +TTS_API_KEY = os.environ.get("TTS_API_KEY", "") CLOUD_CHAT_UI_URL =os.environ.get("CLOUD_CHAT_UI_URL") CLOUD_YOLOV4_API_URL = os.environ.get("CLOUD_YOLOV4_API_URL") CLOUD_YOLOV4_API_AUTH_TOKEN = os.environ.get("CLOUD_YOLOV4_API_AUTH_TOKEN") @@ -85,8 +88,18 @@ def post(self, request, *args, **kwargs): internal_url = "http://" + deploy["internal_url"] logger.info(f"internal_url:= {internal_url}") logger.info(f"using vllm model:= {deploy["model_impl"].model_name}") - data["model"] = deploy["model_impl"].hf_model_id - + data["model"] = get_model_name_from_container( + deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id + ) + + # Route base/completion models to /v1/completions with a plain prompt + service_route = deploy["model_impl"].service_route + logger.info(f"service_route:= {service_route}") + if service_route == "/v1/completions": + messages = data.pop("messages", []) + data["prompt"] = messages_to_prompt(messages) + data.pop("stream_options", None) + # Create a generator that can be cancelled def generate_response(): try: @@ -116,7 +129,9 @@ def post(self, request, *agrs, **kwargs): if deploy_id and deploy_id in deploy_cache: deploy = deploy_cache[deploy_id] logger.info(f"using vllm model:= {deploy['model_impl'].model_name}") - data["model"] = deploy["model_impl"].hf_model_id + data["model"] = get_model_name_from_container( + deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id + ) else: logger.info("No valid deployment found, proceeding with agent-only mode (cloud LLM)") # Remove deploy_id from data since it's not needed for agent @@ -195,9 +210,12 @@ def get(self, request, *args, **kwargs): deploy = get_deploy_cache()[deploy_id] health_url = "http://" + deploy["health_url"] check_passed, health_content = health_check(health_url, json_data=None) - if check_passed: + if 
def _request_tts_audio(deploy, model_name, text, voice, log_label):
    """POST a speech-synthesis request to a deployed TTS container.

    Shared by TtsInferenceView and OpenAIAudioSpeechView so both use the same
    auth header, payload shape and 404 fallback.

    Args:
        deploy: deploy-cache entry; must contain "internal_url" and may
            contain "model_impl".
        model_name: value sent as the "model" field of the payload.
        text: text to synthesize.
        voice: value sent as the "voice" field of the payload.
        log_label: prefix used in the fallback log message.

    Returns:
        The upstream ``requests`` response (status already checked).

    Raises:
        requests.exceptions.RequestException: on connection failure, timeout,
            or a non-2xx upstream status.
    """
    internal_url = "http://" + deploy["internal_url"]
    inference_engine = getattr(deploy.get("model_impl"), "inference_engine", None)

    # Media-engine deployments authenticate with TTS_API_KEY and take the text
    # under "text"; every other engine gets the JWT and an "input" field.
    if inference_engine == "media":
        headers = {"Authorization": f"Bearer {TTS_API_KEY}"}
        payload = {"model": model_name, "text": text, "voice": voice}
    else:
        headers = {"Authorization": f"Bearer {encoded_jwt}"}
        payload = {"model": model_name, "input": text, "voice": voice}

    audio_resp = requests.post(internal_url, json=payload, headers=headers, timeout=120)

    # If 404 on /enqueue for TTS media model, retry with /v1/audio/speech
    if audio_resp.status_code == 404 and inference_engine == "media" and "/enqueue" in internal_url:
        logger.info(f"{log_label} 404 on {internal_url}, retrying with /v1/audio/speech")
        fallback_url = internal_url.replace("/enqueue", "/v1/audio/speech")
        audio_resp = requests.post(fallback_url, json=payload, headers=headers, timeout=120)

    audio_resp.raise_for_status()
    return audio_resp


class TtsInferenceView(APIView):
    """Text-to-speech inference: supports both OpenAI-style and enqueue-style endpoints."""

    def post(self, request, *args, **kwargs):
        """Synthesize speech for the deployment named by ``deploy_id``.

        Reads ``deploy_id`` and ``text`` (or ``prompt``) from the request body
        and returns the upstream audio as an attachment.

        Returns:
            400 if the serializer rejects the body or no text is given,
            404 if ``deploy_id`` is not in the deploy cache,
            500 if the upstream request fails,
            otherwise an HttpResponse carrying the audio bytes.
        """
        data = request.data
        logger.info(f"{self.__class__.__name__} data:={data}")
        serializer = InferenceSerializer(data=data)
        if not serializer.is_valid():
            return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        text = data.get("text") or data.get("prompt")
        if not text:
            return Response({"error": "text is required"}, status=status.HTTP_400_BAD_REQUEST)

        # An unknown deploy_id used to raise KeyError (unhandled 500); answer
        # with an explicit 404 like OpenAIAudioSpeechView does.
        deploy_id = data.get("deploy_id")
        deploy = get_deploy_cache().get(deploy_id)
        if deploy is None:
            return Response(
                {"error": f"No running deployment found for deploy_id '{deploy_id}'"},
                status=status.HTTP_404_NOT_FOUND,
            )

        model_name = getattr(deploy.get("model_impl"), "model_name", None)
        try:
            audio_resp = _request_tts_audio(deploy, model_name, text, "default", "TTS")
        except requests.exceptions.RequestException as http_err:
            # RequestException also covers timeouts and connection errors,
            # which previously escaped the narrower HTTPError handler.
            logger.error(f"TTS HTTP error: {http_err}")
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

        content_type = audio_resp.headers.get("Content-Type", "audio/wav")
        django_response = HttpResponse(audio_resp.content, content_type=content_type)
        django_response["Content-Disposition"] = "attachment; filename=tts_output.wav"
        return django_response


class OpenAIAudioSpeechView(APIView):
    """OpenAI-compatible POST /v1/audio/speech — looks up deployed TTS model by name."""

    def post(self, request, *args, **kwargs):
        """Synthesize speech with the first deployment whose model name matches.

        Reads ``model``, ``input`` (or ``text``) and optional ``voice`` from
        the request body.

        Returns:
            400 if ``model`` or the text is missing,
            404 if no cached deployment has that model name,
            500 if the upstream request fails,
            otherwise an HttpResponse carrying the audio bytes.
        """
        data = request.data
        model_name = data.get("model")
        text = data.get("input") or data.get("text")
        if not model_name:
            return Response({"error": "model is required"}, status=status.HTTP_400_BAD_REQUEST)
        if not text:
            return Response({"error": "input is required"}, status=status.HTTP_400_BAD_REQUEST)

        # Find a running TTS deployment matching the requested model name
        deploy = None
        for entry in get_deploy_cache().values():
            impl = entry.get("model_impl")
            if impl and getattr(impl, "model_name", None) == model_name:
                deploy = entry
                break
        if deploy is None:
            return Response(
                {"error": f"No running deployment found for model '{model_name}'"},
                status=status.HTTP_404_NOT_FOUND,
            )

        try:
            audio_resp = _request_tts_audio(
                deploy, model_name, text, data.get("voice", "default"), "OpenAI audio/speech"
            )
        except requests.exceptions.RequestException as http_err:
            # Covers HTTPError plus timeouts and connection failures.
            logger.error(f"OpenAI audio/speech HTTP error: {http_err}")
            return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR)

        content_type = audio_resp.headers.get("Content-Type", "audio/wav")
        return HttpResponse(audio_resp.content, content_type=content_type)
def _split_docker_image(docker_image: str) -> tuple:
    """Split an image reference into ``(name, tag)``; the tag defaults to "latest"."""
    image_name, sep, image_tag = docker_image.rpartition(":")
    # A "/" after the last ":" means that colon belongs to a registry
    # host:port prefix (e.g. "localhost:5000/img"), not to a tag.
    if not sep or "/" in image_tag:
        return docker_image, "latest"
    return image_name, image_tag


def load_model_implementations_from_json(json_path: Path) -> list:
    """Build ModelImpl objects from the model catalog JSON file.

    Args:
        json_path: path to a JSON document with a top-level "models" list.

    Returns:
        One ModelImpl per catalog entry, in catalog order.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if "models" is missing, or an entry lacks "model_name"
            or "service_route".
    """
    with open(json_path, encoding="utf-8") as f:
        catalog = json.load(f)

    impls = []
    for entry in catalog["models"]:
        image_name, image_tag = _split_docker_image(entry.get("docker_image") or "")

        # Catalog device types without a mapping are skipped on purpose.
        device_configs = set()
        for device in entry.get("device_configurations", []):
            enum_name = _CATALOG_DEVICE_MAP.get(device)
            if enum_name is None:
                continue
            try:
                device_configs.add(DeviceConfigurations[enum_name])
            except KeyError:
                # A stale map entry should not take the whole module down at import.
                logger.warning(
                    f"Device '{enum_name}' is not a DeviceConfigurations member, skipping"
                )

        # Unknown or missing type names fall back to defaults; log it so
        # catalog typos do not go unnoticed.
        try:
            model_type = ModelTypes[entry["model_type"]]
        except KeyError:
            logger.warning(
                f"Unknown model_type {entry.get('model_type')!r} for "
                f"{entry.get('model_name')!r}, defaulting to CHAT"
            )
            model_type = ModelTypes.CHAT

        try:
            setup_type = SetupTypes[entry["setup_type"]]
        except KeyError:
            logger.warning(
                f"Unknown setup_type {entry.get('setup_type')!r} for "
                f"{entry.get('model_name')!r}, defaulting to TT_INFERENCE_SERVER"
            )
            setup_type = SetupTypes.TT_INFERENCE_SERVER

        # Fresh docker config per model so env vars are not shared between entries.
        cfg = base_docker_config()
        cfg["environment"].update(entry.get("env_vars") or {})

        impls.append(
            ModelImpl(
                model_name=entry["model_name"],
                hf_model_id=entry.get("hf_model_id"),
                image_name=image_name,
                image_tag=image_tag,
                device_configurations=device_configs,
                docker_config=cfg,
                service_route=entry["service_route"],
                setup_type=setup_type,
                model_type=model_type,
                version=entry.get("version", "0.0.1"),
                shm_size=entry.get("shm_size", "32G"),
                display_model_type=entry.get("display_model_type", "LLM"),
                inference_engine=entry.get("inference_engine", "vllm"),
            )
        )
    return impls
docker_config=base_docker_config(), shm_size="32G", service_port=7000, - service_route="/inference", - health_route="/", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.SPEECH_RECOGNITION, + service_route="/objdetection_v2", + setup_type=SetupTypes.NO_SETUP, + model_type=ModelTypes.OBJECT_DETECTION, + display_model_type="CNN", ), - # TODO: add this model back in when its in tt-inference-server-main branch - # Image Generation - Can run on N150 and N300 - # ModelImpl( - # model_name="Stable-Diffusion-3.5-medium", - # model_id="id_stable_diffusion_3.5_mediumv0.1.0", - # image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-stable-diffusion-3.5-src-base", - # image_tag="v0.0.1-tt-metal-a0560feb3eed", - # device_configurations=ALL_BOARDS, # Can run on N150 and N300 - # docker_config=base_docker_config(), - # shm_size="32G", - # service_port=7000, - # service_route="/enqueue", - # health_route="/", - # setup_type=SetupTypes.TT_INFERENCE_SERVER, - # model_type=ModelTypes.IMAGE_GENERATION, - # ), - - # Image Generation - Can run on N150 and N300 + # Legacy Stable-Diffusion-1.4 (not in tt-inference-server catalog) ModelImpl( model_name="Stable-Diffusion-1.4", model_id="id_stable_diffusionv0.1.0", image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-stable-diffusion-1.4-src-base", image_tag="v0.0.1-tt-metal-cc8b4e1dac99", - device_configurations=ALL_BOARDS, # Can run on N150 and N300 + device_configurations=_ALL_WH_BOARDS, docker_config=base_docker_config(), shm_size="32G", service_port=7000, @@ -271,148 +332,96 @@ def base_docker_config(): health_route="/", setup_type=SetupTypes.TT_INFERENCE_SERVER, model_type=ModelTypes.IMAGE_GENERATION, + display_model_type="IMAGE", ), +] - # Object Detection - Can run on all boards - ModelImpl( - model_name="YOLOv4", - model_id="id_yolov4v0.0.1", - image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-yolov4-src-base", - image_tag="v0.0.1-tt-metal-65d246482b3f", - 
def validate_model_implemenation_config(impl):
    """Check that ``impl.model_id`` is safe to use in path names.

    Args:
        impl: object exposing a string ``model_id`` attribute.

    Raises:
        AssertionError: if ``model_id`` contains "/".
    """
    # no / in model_id strings, model_id will be used in path names.
    # Raised explicitly rather than via ``assert`` so the check still runs
    # under ``python -O``; the exception type is unchanged for callers.
    if "/" in impl.model_id:
        raise AssertionError(f"model_id must not contain '/': {impl.model_id!r}")
setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - - ), - ModelImpl( - hf_model_id="meta-llama/Llama-3.1-8B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=ALL_BOARDS | {DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), - # TODO: add this model back in when its in tt-inference-server-main branch - # ModelImpl( - # hf_model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", - # image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - # image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - # device_configurations=ALL_BOARDS, # Can run on all boards - # docker_config=base_docker_config(), - # service_route="/v1/chat/completions", - # setup_type=SetupTypes.TT_INFERENCE_SERVER, - # model_type=ModelTypes.CHAT - - # ), - - # 32B models - T3000 and P300Cx2 - ModelImpl( - hf_model_id="Qwen/Qwen3-32B", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations={DeviceConfigurations.N300x4, DeviceConfigurations.N300x4_WH_ARCH_YAML, DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), +# --------------------------------------------------------------------------- +# Build final model_implmentations dict +# --------------------------------------------------------------------------- - # 70B models - Only T3000 +_json_impls = load_model_implementations_from_json(CATALOG_JSON) - ModelImpl( - hf_model_id="meta-llama/Llama-3.1-70B-Instruct", - 
image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=T3000_ONLY, # Only T3000 - docker_config=base_docker_config(), - shm_size="32G", - service_port=7000, - service_route="/v1/chat/completions", - env_file=os.environ.get("VLLM_LLAMA31_ENV_FILE"), - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), - # ModelImpl( - # hf_model_id="meta-llama/Llama-3.1-70B-Instruct", - # image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - # image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - # device_configurations=T3000_ONLY, # Only T3000 - # docker_config=base_docker_config(), - # service_route="/v1/chat/completions", - # setup_type=SetupTypes.TT_INFERENCE_SERVER, - # model_type=ModelTypes.CHAT - # ), - ModelImpl( - hf_model_id="meta-llama/Llama-3.3-70B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=T3000_ONLY | {DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), - #! 
# Board type classifications for chip allocation
SINGLE_CHIP_BOARDS_STR = {"N150", "N300", "E150", "P100", "P150", "P300c"}
MULTI_CHIP_ONLY_BOARDS_STR = {
    "T3K", "GALAXY", "GALAXY_T3K", "P150X4", "P150X8",
    "N150X4", "N300x4", "P300Cx2", "P300Cx4"
}


def infer_chips_required(device_configurations: "Set[DeviceConfigurations]") -> int:
    """
    Infer chip requirements from device_configurations set.

    Logic:
    - If model supports ANY single-chip board → requires 1 chip
    - If model ONLY supports multi-chip boards → requires 4 chips
    - Default to 1 chip for unknown or empty configurations

    A trailing "_WH_ARCH_YAML" on a configuration name is ignored, so e.g.
    N300x4_WH_ARCH_YAML is classified like N300x4.

    Args:
        device_configurations: Iterable of objects with a ``name`` attribute
            (DeviceConfigurations enum values)

    Returns:
        Number of chips required (1 or 4)

    Examples:
        Single-chip model (supports N150, N300, etc.):
            infer_chips_required({DeviceConfigurations.N150, DeviceConfigurations.T3K}) → 1

        Multi-chip only model (only T3K, Galaxy, P150X4):
            infer_chips_required({DeviceConfigurations.T3K, DeviceConfigurations.GALAXY}) → 4
    """
    if not device_configurations:
        return 1  # Default to single chip

    # Compare by base board name: the *_WH_ARCH_YAML variants would otherwise
    # match neither board set and be treated as unknown.
    suffix = "_WH_ARCH_YAML"
    config_names = set()
    for cfg in device_configurations:
        name = cfg.name
        if name.endswith(suffix):
            name = name[: -len(suffix)]
        config_names.add(name)

    # If ANY single-chip board is supported → 1 chip
    if config_names & SINGLE_CHIP_BOARDS_STR:
        return 1

    # If ONLY multi-chip boards supported → 4 chips
    if config_names & MULTI_CHIP_ONLY_BOARDS_STR:
        return 4

    return 1  # Default to single chip for unknown boards


def get_model_chip_requirement(model_name: str) -> int:
    """
    Get chip requirement for a specific model by name.

    Searches through model_implmentations and infers chip requirement
    based on device_configurations of the first entry whose model_name matches.

    Args:
        model_name: Name of the model (e.g., "Llama-3.1-70B-Instruct")

    Returns:
        Number of chips required (1 or 4); 1 when the model is not found
    """
    for impl in model_implmentations.values():
        if impl.model_name == model_name:
            return infer_chips_required(impl.device_configurations)

    # Model not found, default to 1 chip
    logger.warning(f"Model {model_name} not found in model_implmentations, defaulting to 1 chip")
    return 1
"service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "distil-large-v3", + "model_type": "SPEECH_RECOGNITION", + "display_model_type": "AUDIO", + "device_configurations": [ + "GALAXY", + "N150", + "T3K" + ], + "hf_model_id": "distil-whisper/distil-large-v3", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "FLUX.1-dev", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "black-forest-labs/FLUX.1-dev", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "FLUX.1-schnell", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "black-forest-labs/FLUX.1-schnell", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Llama-3.1-70B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-70B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Llama-3.1-70B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-70B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Llama-3.1-8B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + 
"N300", + "P100", + "P150", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-8B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "Llama-3.1-8B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + "N300", + "P100", + "P150", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-8B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "Llama-3.3-70B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.3-70B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + 
"setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Mistral-7B-Instruct-v0.3", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.3", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "mochi-1-preview", + "model_type": "VIDEO", + "display_model_type": "VIDEO", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "genmo/mochi-1-preview", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Motif-Image-6B-Preview", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Motif-Technologies/Motif-Image-6B-Preview", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + 
"docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 6 + }, + { + "model_name": "Qwen3-32B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X8", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-32B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "speecht5_tts", + "model_type": "TTS", + "display_model_type": "TEXT_TO_SPEECH", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "microsoft/speecht5_tts", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-a9b09e0", + "service_route": "/v1/audio/speech", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-3.5-large", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": 
"stabilityai/stable-diffusion-3.5-large", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-xl-1.0-inpainting-0.1", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.5.0-fbbbd2da8cfab49ddf43d28dd9c0813a3c3ee2bd", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-xl-base-1.0", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + 
"model_name": "stable-diffusion-xl-base-1.0-img-2-img", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0-img-2-img", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Wan2.2-T2V-A14B-Diffusers", + "model_type": "VIDEO", + "display_model_type": "VIDEO", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 14 + }, + { + "model_name": "whisper-large-v3", + "model_type": "SPEECH_RECOGNITION", + "display_model_type": "AUDIO", + "device_configurations": [ + "GALAXY", + "N150", + "T3K" + ], + "hf_model_id": "openai/whisper-large-v3", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", 
+ "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Llama-3.2-11B-Vision", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-11B-Vision", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 11 + }, + { + "model_name": "Llama-3.2-11B-Vision-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-11B-Vision-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 11 + }, + { + "model_name": "Llama-3.2-1B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-1B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 1 + }, + { + "model_name": "Llama-3.2-1B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 1 + }, + { + "model_name": "Llama-3.2-3B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-3B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 3 + }, + { + "model_name": "Llama-3.2-3B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", 
+ "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-3B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 3 + }, + { + "model_name": "Llama-3.2-90B-Vision", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-90B-Vision", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "MAX_PREFILL_CHUNK_SIZE": 16 + }, + "param_count": 90 + }, + { + "model_name": "Llama-3.2-90B-Vision-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-90B-Vision-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + 
"VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "MAX_PREFILL_CHUNK_SIZE": 16 + }, + "param_count": 90 + }, + { + "model_name": "Qwen-Image", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Qwen/Qwen-Image", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE" + }, + "param_count": null + }, + { + "model_name": "Qwen-Image-2512", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Qwen/Qwen-Image-2512", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE" + }, + "param_count": null + }, + { + "model_name": "Qwen2.5-72B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-72B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "MAX_PREFILL_CHUNK_SIZE": "16" + }, + "param_count": 72 + }, + { + "model_name": "Qwen2.5-72B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-72B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "MAX_PREFILL_CHUNK_SIZE": "16" + }, + "param_count": 72 + }, + { + "model_name": "Qwen2.5-VL-72B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-72B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + 
"param_count": 72 + }, + { + "model_name": "Qwen3-8B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-8B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "QwQ-32B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/QwQ-32B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "AFM-4.5B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "arcee-ai/AFM-4.5B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-ae65ee5-35f023f", + "service_route": "/v1/completions", + "shm_size": "32G", + 
"setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 4 + }, + { + "model_name": "bge-large-en-v1.5", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "BAAI/bge-large-en-v1.5", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "3072", + "VLLM__MAX_MODEL_LENGTH": "384", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "8", + "MAX_BATCH_SIZE": "8", + "DEFAULT_THROTTLE_LEVEL": "0" + }, + "param_count": null + }, + { + "model_name": "DeepSeek-R1-0528", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY" + ], + "hf_model_id": "deepseek-ai/DeepSeek-R1-0528", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e3d97e5-a186bf4", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": null + }, + { + "model_name": "efficientnet", + "model_type": "CNN", + "display_model_type": "CNN", + 
"device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "efficientnet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "gemma-3-1b-it", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150" + ], + "hf_model_id": "google/gemma-3-1b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 1 + }, + { + "model_name": "gemma-3-27b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "google/gemma-3-27b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + 
"VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 27 + }, + { + "model_name": "gemma-3-4b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "google/gemma-3-4b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 4 + }, + { + "model_name": "gpt-oss-120b", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "openai/gpt-oss-120b", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 120 + }, + { + "model_name": "gpt-oss-20b", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "openai/gpt-oss-20b", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-60ffb199-3499ffa1", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 20 + }, + { + "model_name": "medgemma-27b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "google/medgemma-27b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 27 + }, + { + "model_name": "medgemma-4b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "google/medgemma-4b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 4 + }, + { + "model_name": "mobilenetv2", + 
"model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "mobilenetv2", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Qwen2.5-7B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150X4", + "N300" + ], + "hf_model_id": "Qwen/Qwen2.5-7B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen2.5-7B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150X4", + "N300" + ], + "hf_model_id": "Qwen/Qwen2.5-7B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + 
"VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen2.5-Coder-32B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-Coder-32B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-17a5973-aa4ae1e", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "Qwen2.5-VL-32B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-32B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "Qwen2.5-VL-3B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-3B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + 
"text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 3 + }, + { + "model_name": "Qwen2.5-VL-7B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-7B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen3-Embedding-4B", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-Embedding-4B", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", 
+ "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "1024", + "VLLM__MAX_MODEL_LENGTH": "1024", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "1", + "MAX_BATCH_SIZE": "1", + "DEFAULT_THROTTLE_LEVEL": "0" + }, + "param_count": 4 + }, + { + "model_name": "Qwen3-Embedding-8B", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-Embedding-8B", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "1024", + "VLLM__MAX_MODEL_LENGTH": "1024", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "1" + }, + "param_count": 8 + }, + { + "model_name": "resnet-50", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "resnet-50", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "segformer", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": 
"segformer", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "unet", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "unet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "vit", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "vit", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "vovnet", + 
"model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "vovnet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + } + ] +} diff --git a/app/backend/shared_config/sync_models_from_inference_server.py b/app/backend/shared_config/sync_models_from_inference_server.py new file mode 100644 index 00000000..7fce24c0 --- /dev/null +++ b/app/backend/shared_config/sync_models_from_inference_server.py @@ -0,0 +1,267 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC + +""" +Sync script: reads ../../tt-inference-server/model_specs_output.json and +normalizes it into models_from_inference_server.json (co-located with this script). + +Run from any directory: + python app/backend/shared_config/sync_models_from_inference_server.py +""" + +import json +import os +from datetime import datetime, timezone +from pathlib import Path + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +SCRIPT_DIR = Path(__file__).parent +OUTPUT_JSON = SCRIPT_DIR / "models_from_inference_server.json" + +# Source JSON resolution order: +# 1. Explicit --source CLI argument +# 2. TT_INFERENCE_ARTIFACT_PATH env var (set by run.py after artifact download) +# 3. .artifacts/tt-inference-server/ next to repo root (artifact default location) +# 4. 
tt-inference-server/ next to repo root (legacy submodule path) +_REPO_ROOT = SCRIPT_DIR / "../../.." +_CANDIDATE_SOURCES = [ + _REPO_ROOT / ".artifacts/tt-inference-server/model_specs_output.json", + _REPO_ROOT / "tt-inference-server/model_specs_output.json", +] + + +def resolve_source_json(override: str | None = None) -> Path: + """Return the path to model_specs_output.json, trying candidates in order.""" + if override: + p = Path(override) + if not p.exists(): + raise FileNotFoundError(f"--source path not found: {p}") + return p.resolve() + + # Check env var set by run.py + artifact_path = os.environ.get("TT_INFERENCE_ARTIFACT_PATH") + if artifact_path: + p = Path(artifact_path) / "model_specs_output.json" + if p.exists(): + return p.resolve() + + # Try static candidates + for candidate in _CANDIDATE_SOURCES: + if candidate.exists(): + return candidate.resolve() + + raise FileNotFoundError( + "Cannot find model_specs_output.json. Tried:\n" + + "\n".join(f" {c.resolve()}" for c in _CANDIDATE_SOURCES) + ) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +DEVICE_SPECIFIC_ENV_KEYS = {"WH_ARCH_YAML", "MESH_DEVICE", "ARCH_NAME"} + +STATUS_ORDER = {"COMPLETE": 3, "FUNCTIONAL": 2, "EXPERIMENTAL": 1} + +# device_type string (from tt-inference-server) → DeviceConfigurations member name +# Only include device_types that exist in DeviceConfigurations enum +DEVICE_TYPE_TO_CONFIG = { + "N150": "N150", + "N300": "N300", + "T3K": "T3K", + "N150X4": "N150X4", + "P100": "P100", + "P150": "P150", + "P150X4": "P150X4", + "P150X8": "P150X8", + "GALAXY": "GALAXY", + "GALAXY_T3K": "GALAXY_T3K", +} + + +def map_model_type(raw_model_type: str, inference_engine: str) -> str: + """Map tt-inference-server model_type + inference_engine to tt-studio ModelTypes.""" + if raw_model_type == "LLM" and inference_engine == "vLLM": + return "CHAT" + if raw_model_type == "VLM": + 
return "VLM" + if raw_model_type == "IMAGE": + return "IMAGE_GENERATION" + if raw_model_type == "AUDIO": + return "SPEECH_RECOGNITION" + if raw_model_type == "TEXT_TO_SPEECH" or raw_model_type == "TTS": + return "TTS" + if raw_model_type == "VIDEO": + return "VIDEO" + if raw_model_type == "EMBEDDING": + return "EMBEDDING" + # CNN + media engine = image generation (FLUX, Motif, etc.) + if raw_model_type == "CNN" and inference_engine == "media": + return "IMAGE_GENERATION" + # CNN + forge = computer vision / object detection (resnet, vit, etc.) + if raw_model_type == "CNN" and inference_engine == "forge": + return "CNN" + return "CHAT" + + +CHAT_CAPABLE_PATTERNS = [ + "instruct", "-chat", "chat-", "-it-", "-it", "assistant", + # Reasoning / thinking models that do have chat templates + "deepseek-r1", "qwq", "qwen3", "gpt-oss", +] + + +def is_chat_capable(hf_model_id: str) -> bool: + lower = hf_model_id.lower() + return any(p in lower for p in CHAT_CAPABLE_PATTERNS) + + +def map_service_route(inference_engine: str, hf_model_id: str = "", raw_model_type: str = "") -> str: + """Derive service_route from inference_engine, model type, and model id. + + Args: + inference_engine: Engine type (vLLM, media, forge) + hf_model_id: HuggingFace model ID (for vLLM chat detection) + raw_model_type: Raw model type from inference server (TEXT_TO_SPEECH, TTS, etc.) + """ + if inference_engine == "vLLM": + return "/v1/chat/completions" if is_chat_capable(hf_model_id) else "/v1/completions" + if inference_engine == "media": + # TTS models use OpenAI-compatible /v1/audio/speech endpoint + if raw_model_type in ("TEXT_TO_SPEECH", "TTS"): + return "/v1/audio/speech" + # Other media models (image gen, speech recognition, etc.) 
use enqueue + return "/enqueue" + if inference_engine == "forge": + return "/v1/chat/completions" + return "/v1/chat/completions" + + +def filter_env_vars(env_vars: dict) -> dict: + """Strip device-specific env vars that ModelImpl.__post_init__ handles.""" + return {k: v for k, v in env_vars.items() if k not in DEVICE_SPECIFIC_ENV_KEYS} + + +def pick_higher_status(current: str | None, candidate: str) -> str: + """Return whichever status is higher priority.""" + if current is None: + return candidate + return current if STATUS_ORDER.get(current, 0) >= STATUS_ORDER.get(candidate, 0) else candidate + + +def normalize(source_path: Path) -> list[dict]: + with open(source_path) as f: + raw = json.load(f) + + # group by model_name, skipping GPU entries + by_model: dict[str, list[dict]] = {} + for entry in raw.values(): + if entry.get("device_type") == "GPU": + continue + name = entry["model_name"] + by_model.setdefault(name, []).append(entry) + + models = [] + for model_name, entries in by_model.items(): + # Use first entry as the canonical source for model-level fields + first = entries[0] + + # Aggregate device_types + device_configurations = sorted( + { + DEVICE_TYPE_TO_CONFIG[e["device_type"]] + for e in entries + if e.get("device_type") in DEVICE_TYPE_TO_CONFIG + } + ) + + # Pick highest status + status = None + for e in entries: + status = pick_higher_status(status, e.get("status", "EXPERIMENTAL")) + + # Model-level env_vars (from first entry, strip device-specific keys) + env_vars = filter_env_vars(first.get("env_vars") or {}) + + inference_engine = first.get("inference_engine", "vLLM") + raw_model_type = first.get("model_type", "LLM") + + models.append({ + "model_name": model_name, + "model_type": map_model_type(raw_model_type, inference_engine), + "display_model_type": raw_model_type, + "device_configurations": device_configurations, + "hf_model_id": first.get("hf_model_repo"), + "inference_engine": inference_engine, + "supported_modalities": 
first.get("supported_modalities", ["text"]), + "status": status, + "version": first.get("version", "0.0.1"), + "docker_image": first.get("docker_image"), + "service_route": map_service_route(inference_engine, hf_model_id=first.get("hf_model_repo", ""), raw_model_type=raw_model_type), + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": env_vars, + "param_count": first.get("param_count"), + }) + + # Sort: by status (highest first), then alphabetically by model_name + models.sort(key=lambda m: (-STATUS_ORDER.get(m["status"], 0), m["model_name"].lower())) + return models + + +def main(): + import argparse + parser = argparse.ArgumentParser(description="Sync model catalog from tt-inference-server") + parser.add_argument("--source", default=None, help="Path to model_specs_output.json (overrides auto-detection)") + args = parser.parse_args() + + source_path = resolve_source_json(args.source) + print(f"Reading: {source_path}") + + if not source_path.exists(): + raise FileNotFoundError(f"Source not found: {source_path}") + + models = normalize(source_path) + + # Resolve artifact version from VERSION file or env vars (avoid leaking absolute paths) + artifact_version = None + version_file = source_path.parent / "VERSION" + if version_file.exists(): + artifact_version = version_file.read_text().strip() + if not artifact_version: + artifact_version = ( + os.environ.get("TT_INFERENCE_ARTIFACT_VERSION") + or os.environ.get("TT_INFERENCE_ARTIFACT_BRANCH") + or "unknown" + ) + + catalog = { + "source": { + "artifact_version": artifact_version, + "generated_at": datetime.now(timezone.utc).isoformat(), + }, + "total_models": len(models), + "models": models, + } + + out_path = OUTPUT_JSON.resolve() + with open(out_path, "w") as f: + json.dump(catalog, f, indent=2) + f.write("\n") + + print(f"Written {len(models)} models → {out_path}") + + # Print a summary + from collections import Counter + status_counts = Counter(m["status"] for m in models) + type_counts = 
Counter(m["model_type"] for m in models) + display_type_counts = Counter(m["display_model_type"] for m in models) + print(f" Status distribution: {dict(status_counts)}") + print(f" Type distribution: {dict(type_counts)}") + print(f" Display type distribution: {dict(display_type_counts)}") + + +if __name__ == "__main__": + main() diff --git a/app/backend/shared_config/test_sync_models.py b/app/backend/shared_config/test_sync_models.py new file mode 100644 index 00000000..e7a18802 --- /dev/null +++ b/app/backend/shared_config/test_sync_models.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +""" +Tests for sync_models_from_inference_server.py route derivation logic. +""" + +import pytest +from sync_models_from_inference_server import map_service_route + + +class TestServiceRouteMapping: + """Test that service routes are correctly derived for different model types.""" + + def test_vllm_chat_capable_models(self): + """vLLM chat-capable models should use /v1/chat/completions.""" + assert map_service_route("vLLM", "meta-llama/Llama-3.1-8B-Instruct", "") == "/v1/chat/completions" + assert map_service_route("vLLM", "mistralai/Mistral-7B-Instruct-v0.3", "") == "/v1/chat/completions" + assert map_service_route("vLLM", "Qwen/QwQ-32B", "") == "/v1/chat/completions" + + def test_vllm_base_models(self): + """vLLM base models should use /v1/completions.""" + assert map_service_route("vLLM", "meta-llama/Llama-3.1-70B", "") == "/v1/completions" + assert map_service_route("vLLM", "meta-llama/Llama-3.2-1B", "") == "/v1/completions" + + def test_tts_media_models_use_openai_endpoint(self): + """TTS media models should use /v1/audio/speech (OpenAI-compatible).""" + assert map_service_route("media", "", "TEXT_TO_SPEECH") == "/v1/audio/speech" + assert map_service_route("media", "", "TTS") == "/v1/audio/speech" + + def test_non_tts_media_models_use_enqueue(self): + """Non-TTS media models should use /enqueue.""" + assert 
map_service_route("media", "", "IMAGE") == "/enqueue" + assert map_service_route("media", "", "AUDIO") == "/enqueue" + assert map_service_route("media", "", "VIDEO") == "/enqueue" + assert map_service_route("media", "", "CNN") == "/enqueue" + assert map_service_route("media", "", "EMBEDDING") == "/enqueue" + + def test_forge_models_use_chat_completions(self): + """Forge models should use /v1/chat/completions.""" + assert map_service_route("forge", "", "") == "/v1/chat/completions" + assert map_service_route("forge", "", "CNN") == "/v1/chat/completions" + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/app/docker-compose.dev-mode.yml b/app/docker-compose.dev-mode.yml index 9cfccb6b..6fd24a51 100644 --- a/app/docker-compose.dev-mode.yml +++ b/app/docker-compose.dev-mode.yml @@ -10,7 +10,8 @@ services: volumes: # Mount the local api directory for live code changes - ./backend:/backend - command: python ./manage.py runserver 0.0.0.0:8000 + command: > + python manage.py runserver 0.0.0.0:8000 environment: - DEBUG=True # Allow container to access host services (docker-control-service) diff --git a/app/docker-compose.yml b/app/docker-compose.yml index 19f07ba6..74bbedae 100644 --- a/app/docker-compose.yml +++ b/app/docker-compose.yml @@ -20,7 +20,8 @@ services: - tt_studio_network ports: - "8000:8000" - command: gunicorn --workers 3 --bind 0.0.0.0:8000 --preload --timeout 1200 api.wsgi:application + command: > + gunicorn --workers 3 --bind 0.0.0.0:8000 --preload --timeout 1200 api.wsgi:application depends_on: tt_studio_chroma: condition: service_healthy @@ -35,6 +36,7 @@ services: - INTERNAL_PERSISTENT_STORAGE_VOLUME - BACKEND_API_HOSTNAME - JWT_SECRET + - TTS_API_KEY - TAVILY_API_KEY - CLOUD_CHAT_UI_URL - CLOUD_CHAT_UI_AUTH_TOKEN @@ -58,7 +60,7 @@ services: # Mount the local api directory for live code changes - ./backend:/backend # Mount tt-inference-server workflow logs for viewing deployment logs - - 
/**
 * Map a backend model_type string (from the catalog/API) to a frontend
 * ModelType constant.
 *
 * Matching ignores case and surrounding whitespace, so the lowercase form
 * ("image_generation") and the uppercase form stored in the synced model
 * catalog ("IMAGE_GENERATION") resolve to the same constant. "video" is
 * accepted as an alias of "video_generation" because the catalog stores
 * video models as "VIDEO".
 *
 * @param backendType - model_type value as received from the backend.
 * @returns The matching ModelType constant; ModelType.ChatModel for
 *   unknown, empty, or missing values.
 */
export const getModelTypeFromBackendType = (backendType: string): string => {
  // The value often comes from an optional API field, so tolerate a missing
  // value at runtime even though the parameter is typed as string.
  const normalized = (backendType ?? "").trim().toLowerCase();

  switch (normalized) {
    case "chat":
      return ModelType.ChatModel;
    case "vlm":
      return ModelType.VLM;
    case "image_generation":
      return ModelType.ImageGeneration;
    case "video":
    case "video_generation":
      return ModelType.VideoGeneration;
    case "object_detection":
      return ModelType.ObjectDetectionModel;
    case "speech_recognition":
      return ModelType.SpeechRecognitionModel;
    case "tts":
      return ModelType.TTS;
    case "embedding":
      return ModelType.Embedding;
    case "cnn":
      return ModelType.CNN;
    default:
      return ModelType.ChatModel;
  }
};
// ----- deployModel with device_id support -----
/**
 * Ask the backend to deploy a model onto a given device slot.
 *
 * Sends `{ model_id, weights_id, device_id }` as a JSON POST to
 * `/docker-api/deploy/` and resolves with the parsed JSON response body.
 * The HTTP status code is not inspected here; callers read the returned
 * `status` / `message` fields.
 *
 * @param modelId - Identifier of the model to deploy.
 * @param weightsId - Identifier of the weights to deploy with.
 * @param deviceId - Target device slot; defaults to 0.
 */
export const deployModel = async (
  modelId: string,
  weightsId: string,
  deviceId: number = 0,
): Promise<{ job_id?: string; status?: string; message?: string }> => {
  const requestBody = {
    model_id: modelId,
    weights_id: weightsId,
    device_id: deviceId,
  };

  const deployResponse = await fetch("/docker-api/deploy/", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(requestBody),
  });

  const parsed = await deployResponse.json();
  return parsed;
};
/**
 * Run the voice pipeline and dispatch its streamed events to callbacks.
 *
 * POSTs the audio file and deployment ids as multipart form data to
 * `/models-api/pipeline/voice/`, then reads the response body as a stream
 * of newline-delimited `data: <json>` lines. Each parsed event is routed by
 * its `type` field:
 *   - "transcript" -> onTranscript(evt.text)
 *   - "llm_chunk"  -> onLlmChunk(evt.text)
 *   - "audio_url"  -> onAudio(evt.url)
 *   - "error"      -> onError(evt.stage ?? "unknown", evt.message)
 *   - "done"       -> onDone()
 *
 * Lines that are not `data: ` lines, or whose payload is not a JSON object,
 * are skipped. Exceptions thrown by a callback are NOT swallowed; they
 * reject the returned promise.
 *
 * @returns A promise that resolves once the response stream has been fully
 *   consumed. If the request fails (non-OK status or no body), onError is
 *   called with stage "pipeline" and the promise resolves without reading.
 */
export const runVoicePipeline = async (
  req: VoicePipelineRequest,
  onTranscript: (text: string) => void,
  onLlmChunk: (text: string) => void,
  onAudio: (dataUrl: string) => void,
  onError: (stage: string, message: string) => void,
  onDone: () => void,
): Promise<void> => {
  const form = new FormData();
  form.append("audio_file", req.audioFile);
  form.append("whisper_deploy_id", req.whisperDeployId);
  form.append("llm_deploy_id", req.llmDeployId);
  if (req.ttsDeployId) form.append("tts_deploy_id", req.ttsDeployId);
  if (req.systemPrompt) form.append("system_prompt", req.systemPrompt);

  const response = await fetch("/models-api/pipeline/voice/", {
    method: "POST",
    body: form,
  });

  if (!response.ok || !response.body) {
    onError("pipeline", `HTTP ${response.status}`);
    return;
  }

  const DATA_PREFIX = "data: ";

  // Parse one line and dispatch it. Only the JSON parse is guarded, so a
  // failure inside a callback is not mistaken for a malformed line.
  const handleLine = (line: string): void => {
    if (!line.startsWith(DATA_PREFIX)) return;

    let evt: {
      type?: string;
      text?: string;
      url?: string;
      stage?: string;
      message?: string;
    } | null;
    try {
      evt = JSON.parse(line.slice(DATA_PREFIX.length));
    } catch {
      return; // skip malformed lines
    }
    // Valid JSON that is not an object (e.g. `null`, a number) carries no event.
    if (evt === null || typeof evt !== "object") return;

    if (evt.type === "transcript") onTranscript(evt.text as string);
    else if (evt.type === "llm_chunk") onLlmChunk(evt.text as string);
    else if (evt.type === "audio_url") onAudio(evt.url as string);
    else if (evt.type === "error")
      onError(evt.stage ?? "unknown", evt.message as string);
    else if (evt.type === "done") onDone();
  };

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });

    // Everything before the last newline is complete; keep the remainder
    // buffered until more data arrives.
    const lines = buffer.split("\n");
    buffer = lines.pop() ?? "";

    for (const line of lines) {
      handleLine(line);
    }
  }

  // Flush bytes held by the streaming decoder and process a final event
  // that arrived without a trailing newline.
  buffer += decoder.decode();
  if (buffer) handleLine(buffer);
};
const [chipStatus, setChipStatus] = useState(null); + + // Fetch chip status on mount and poll every 7 minutes + useEffect(() => { + const fetchChipStatus = async () => { + try { + const response = await axios.get("/docker-api/chip-status/"); + setChipStatus(response.data); + } catch (error) { + console.error("Error fetching chip status:", error); + } + }; + + fetchChipStatus(); + const interval = setInterval(fetchChipStatus, 7 * 60 * 1000); + return () => clearInterval(interval); + }, []); + + const handleModeSelect = (mode: "single" | "multi") => { + setSelectedMode(mode); + setSelectedSlot(null); // reset slot when mode changes + }; + + const needsSlotPicker = + selectedMode === "single" && + chipStatus !== null && + chipStatus.total_slots > 1; + + const isConfirmDisabled = + !selectedMode || (needsSlotPicker && selectedSlot === null); + + const handleConfirm = () => { + if (isConfirmDisabled || !selectedMode) return; + // Multi-chip always uses device_id 0; single uses the chosen slot + const slotId = + selectedMode === "multi" ? 0 : (selectedSlot ?? 0); + onConfirm(selectedMode, slotId); + nextStep(); + }; + + return ( +
+ {/* Header */} +
+

+ Choose Chip Configuration +

+

+ Select how many chips to use. This determines which models are + available in the next step. +

+
+ + {/* Mode selection cards */} +
+ {/* 1 Chip card */} + + + {/* All Chips / T3K card */} + +
+ + {/* Slot picker — only shown when "1 Chip" is selected on a multi-slot board */} + {needsSlotPicker && chipStatus && ( +
+

+ Select Chip Slot +

+
+ {chipStatus.slots.map((slot) => { + const isAvailable = slot.status === "available"; + const isSelected = selectedSlot === slot.slot_id; + return ( + + ); + })} +
+ {selectedSlot !== null && ( +

+ ✓ Slot {selectedSlot} selected — model will run on{" "} + + /dev/tenstorrent/{selectedSlot} + +

+ )} +
+ )} + + {/* Chip slot status */} +
+

+ Current Slot Status +

+ {chipStatus ? ( + + ) : ( +
+ Fetching hardware status... +
+ )} +
+ + {/* Confirm button */} +
+ +
+
+ ); +} diff --git a/app/frontend/src/components/ChipStatusDisplay.tsx b/app/frontend/src/components/ChipStatusDisplay.tsx new file mode 100644 index 00000000..581e5d0e --- /dev/null +++ b/app/frontend/src/components/ChipStatusDisplay.tsx @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +import React from "react"; +import { Cpu } from "lucide-react"; + +interface ChipSlot { + slot_id: number; + status: "available" | "occupied"; + model_name?: string; + deployment_id?: number; + is_multi_chip?: boolean; +} + +interface ChipStatusDisplayProps { + boardType: string; + totalSlots: number; + slots: ChipSlot[]; + onStopModel?: (deploymentId: number) => void; + className?: string; +} + +export function ChipStatusDisplay({ + boardType, + totalSlots, + slots, + onStopModel, + className = "", +}: ChipStatusDisplayProps) { + const availableCount = slots.filter((s) => s.status === "available").length; + + // Check if two adjacent slots are both occupied and multi-chip (connector line) + const hasConnector = (index: number): boolean => { + if (index >= slots.length - 1) return false; + const curr = slots[index]; + const next = slots[index + 1]; + return ( + curr.status === "occupied" && + next.status === "occupied" && + !!curr.is_multi_chip && + !!next.is_multi_chip + ); + }; + + return ( +
+ {/* Header row */} +
+
+ + {boardType} + + + {totalSlots} SLOTS + +
+ + {availableCount}/{totalSlots} IDLE + +
+ + {/* Slot cards row */} +
+ {slots.map((slot, index) => { + const isOccupied = slot.status === "occupied"; + const showConnector = hasConnector(index); + + return ( + + {/* Slot card */} +
+ {/* Slot header row */} +
+ + SLOT {String(slot.slot_id).padStart(2, "0")} + + + {isOccupied ? "IN USE" : "IDLE"} + +
+ + {/* Chip icon */} +
+ +
+ + {/* Model name (only if occupied) */} + {isOccupied && slot.model_name && ( +
+ + {slot.model_name} + +
+ )} + + {/* Stop button (optional) */} + {isOccupied && onStopModel && slot.deployment_id && ( + + )} +
+ + {/* Connector between adjacent multi-chip slots */} + {showConnector && ( +
+
+
+
+
+ )} + + ); + })} +
+
+ ); +} diff --git a/app/frontend/src/components/DeployModelStep.tsx b/app/frontend/src/components/DeployModelStep.tsx index 17e3e553..acbb53dd 100644 --- a/app/frontend/src/components/DeployModelStep.tsx +++ b/app/frontend/src/components/DeployModelStep.tsx @@ -8,8 +8,7 @@ import { useStepper } from "./ui/stepper"; import { StepperFormActions } from "./StepperFormActions"; import { useModels } from "../hooks/useModels"; import { useRefresh } from "../hooks/useRefresh"; -import { Cpu, AlertTriangle, ExternalLink } from "lucide-react"; -import { checkCurrentlyDeployedModels } from "../api/modelsDeployedApis"; +import { Cpu, AlertTriangle, ExternalLink, Info } from "lucide-react"; import { Button } from "./ui/button"; import { useNavigate } from "react-router-dom"; import axios from "axios"; @@ -17,23 +16,25 @@ import axios from "axios"; export function DeployModelStep({ handleDeploy, selectedModel, + selectedDeviceId, }: { selectedModel: string | null; handleDeploy: () => Promise<{ success: boolean; job_id?: string }>; + selectedDeviceId?: number; }) { const { nextStep, isLastStep } = useStepper(); const { refreshModels } = useModels(); const { triggerRefresh, triggerHardwareRefresh } = useRefresh(); const navigate = useNavigate(); const [modelName, setModelName] = useState(null); - const [deployedInfo, setDeployedInfo] = useState<{ - hasDeployedModels: boolean; - count: number; - modelNames: string[]; + const [slotInfo, setSlotInfo] = useState<{ + totalSlots: number; + availableSlots: number; + occupiedDetails: { slot_id: number; model_name: string; port?: number }[]; }>({ - hasDeployedModels: false, - count: 0, - modelNames: [], + totalSlots: 0, + availableSlots: 0, + occupiedDetails: [], }); // Track deployment error state that persists even after deployment stops @@ -157,38 +158,53 @@ export function DeployModelStep({ }, [selectedModel]); useEffect(() => { - // Don't check for deployed models while deployment is in progress + // Don't check slot status while 
deployment is in progress // This prevents the blocking UI from showing immediately after a successful deployment if (isDeploymentInProgress) { return; } - const checkDeployedModels = async () => { + const fetchSlotStatus = async () => { try { - const info = await checkCurrentlyDeployedModels(); - setDeployedInfo(info); + const response = await axios.get("/docker-api/chip-status/"); + const data = response.data as { + total_slots: number; + slots: { slot_id: number; status: string; model_name?: string; port?: number }[]; + }; + const occupied = data.slots.filter((s) => s.status === "occupied"); + setSlotInfo({ + totalSlots: data.total_slots, + availableSlots: data.total_slots - occupied.length, + occupiedDetails: occupied.map((s) => ({ + slot_id: s.slot_id, + model_name: s.model_name || "Unknown", + port: s.port, + })), + }); } catch (error) { - console.error("Error checking deployed models:", error); + console.error("Error fetching chip status:", error); } }; - checkDeployedModels(); + fetchSlotStatus(); }, [isDeploymentInProgress]); + const allSlotsOccupied = slotInfo.totalSlots > 0 && slotInfo.availableSlots === 0; + const deployButtonText = useMemo(() => { - if (deployedInfo.hasDeployedModels) { - return "Delete Existing Models First"; + if (allSlotsOccupied) { + return "All Slots Occupied"; } if (!selectedModel) return "Select a Model"; return "Deploy Model"; }, [ selectedModel, - deployedInfo.hasDeployedModels, + allSlotsOccupied, ]); const isDeployDisabled = !selectedModel || - deployedInfo.hasDeployedModels; + allSlotsOccupied; const onDeploy = useCallback(async () => { if (isDeployDisabled) return { success: false }; @@ -196,13 +212,11 @@ export function DeployModelStep({ // Mark deployment as in progress to prevent blocking UI setIsDeploymentInProgress(true); - // Clear deployed info to prevent blocking UI from showing during deployment - // This ensures users see the "working" state instead of the error message - setDeployedInfo({ - hasDeployedModels: 
false, - count: 0, - modelNames: [], - }); + // Optimistically mark a slot as taken to prevent blocking UI during deployment + setSlotInfo((prev) => ({ + ...prev, + availableSlots: Math.max(0, prev.availableSlots - 1), + })); // Reset error state and polling flag when starting a new deployment setDeploymentError({ @@ -263,9 +277,10 @@ export function DeployModelStep({ // Note: The AnimatedDeployButton will reset its state when onDeploy is called again }; - // Show a warning banner if models are deployed, but don't block the entire UI - // The deploy button will be disabled, providing a better UX than the blocking error - const showDeployedWarning = deployedInfo.hasDeployedModels && !isDeploymentInProgress; + // Show blocking warning only when ALL slots are occupied + const showSlotsFullWarning = allSlotsOccupied && !isDeploymentInProgress; + // Show informational status when some slots are in use but others are available + const showSlotInfo = !allSlotsOccupied && slotInfo.occupiedDetails.length > 0 && !isDeploymentInProgress; return ( <> @@ -273,21 +288,24 @@ export function DeployModelStep({ className="flex flex-col items-center justify-center p-6 overflow-hidden" style={{ minHeight: "200px" }} > - {/* Show warning banner when models are already deployed */} - {showDeployedWarning && ( + {/* Show blocking warning when ALL chip slots are occupied */} + {showSlotsFullWarning && (

- Model Already Deployed + All Chip Slots Occupied

- {deployedInfo.count} model{deployedInfo.count > 1 ? "s are" : " is"} currently deployed: {deployedInfo.modelNames.join(", ")} + All {slotInfo.totalSlots} slots are in use:{" "} + {slotInfo.occupiedDetails + .map((s) => `${s.model_name} (slot ${s.slot_id}${s.port ? ` :${s.port}` : ""})`) + .join(", ")}

- Delete existing model{deployedInfo.count > 1 ? "s" : ""} before deploying a new one. + Free up a slot before deploying a new model.

+ + {/* ── HEADER ── */} -
-
- - - Reset Card - -
- {boardInfo && boardInfo.type !== "unknown" && ( - - )} - {boardLoading && ( -
- - - Detecting... - +
+
+ {isLoading ? ( +
+ +
+ ) : isCompleted ? ( +
+ +
+ ) : isFailed ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+ + {isLoading + ? resetStep === "deleting" + ? "Removing deployed models…" + : "Resetting board…" + : isCompleted + ? "Reset complete" + : isFailed + ? "Reset failed" + : "Reset Card"} + + {isLoading && ( +

+ Step {resetStep === "deleting" ? "1" : "2"} of 2 — do not + close this window +

+ )}
+
+ {/* Board badge — only when idle */} + {!isLoading && !isCompleted && !isFailed && boardType !== "unknown" && ( + )}
- - Are you sure you want to reset the card? - - {boardInfo && boardInfo.type === "unknown" && ( -
- -
-
- No Tenstorrent device detected -
-
- Device /dev/tenstorrent not found. Please check - your hardware connection and ensure the device is properly - installed. + +
+ {/* ── IDLE: board status + step overview ── */} + {!isLoading && !isCompleted && !isFailed && ( + <> + + + {isResettingContext && ( +
+ + Board is already resetting… +
+ )} + + {/* Step overview */} + } + label={ + deployedCount > 0 + ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}` + : "Stop deployed models" + } + state="pending" + /> + } + label="Reset the board (tt-smi -r)" + state="pending" + /> + + {/* Warning */} +
+ + + Warning: This will + interrupt any ongoing processes on the card. + {resetHistory.length > 0 && ( + + Last reset:{" "} + {resetHistory[resetHistory.length - 1].toLocaleTimeString()} + + )} +
-
-
- )} -
-
-
- Warning! This action will stop all deployed models and might - interrupt ongoing processes. -
- {resetHistory.length > 0 && ( -
- Note: This card was reset in the last 5 minutes. Frequent resets - may cause issues. Please wait before resetting again. + + )} + + {/* ── LOADING: step progress ── */} + {isLoading && ( + <> + } + label={ + deployedCount > 0 + ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}` + : "Stop deployed models" + } + sublabel="Sending stop signal to all containers…" + state={step1State} + /> + } + label="Reset the board" + sublabel="Running tt-smi -r, this may take 10–30 seconds…" + state={step2State} + /> + + )} + + {/* ── COMPLETED ── */} + {isCompleted && ( + <> + } + label="Deployed models removed" + state={deployedCount === 0 ? "skipped" : "done"} + /> + } + label="Board reset" + state="done" + /> + {cmdOutput && ( + + )} + {showOutput && cmdOutput && ( + +
+                    {cmdOutput}
+                  
+
+ )} + + )} + + {/* ── FAILED ── */} + {isFailed && ( + <> +
+ +
+

+ {errorMessage} +

+ {cmdOutput && ( + + )} +
- )} -
+ {showOutput && cmdOutput && ( + +
+                    {cmdOutput}
+                  
+
+ )} + + )}
- {errorMessage && ( -
-
- -
-
Error:
-
-                  {errorMessage}
-                
-
-
-
- )} - - - - Reset History - - -
    - {resetHistory.length > 0 ? ( - resetHistory.map((resetTime, index) => ( -
  • {resetTime.toLocaleString()}
  • - )) + + {/* ── FOOTER ── */} + + {(isCompleted || isFailed) ? ( + + ) : ( + <> + +
-
-
- {fullOutput && ( - - - Command Output - - - -
- - - + + )} - - - - diff --git a/app/frontend/src/components/SelectionSteps.tsx b/app/frontend/src/components/SelectionSteps.tsx index 07f8211d..948b5a40 100644 --- a/app/frontend/src/components/SelectionSteps.tsx +++ b/app/frontend/src/components/SelectionSteps.tsx @@ -10,6 +10,7 @@ import { customToast } from "./CustomToaster"; import StepperFooter from "./StepperFooter"; import { DeployModelStep } from "./DeployModelStep"; import { FirstStepForm } from "./FirstStepForm"; +import { ChipConfigStep } from "./ChipConfigStep"; const dockerAPIURL = "/docker-api/"; const deployUrl = `${dockerAPIURL}deploy/`; @@ -22,6 +23,9 @@ export interface Model { compatible_boards: string[]; // List of boards this model can run on model_type: string; // Type of model (e.g., CHAT, IMAGE_GENERATION, etc.) current_board: string; // The detected board type + status?: "EXPERIMENTAL" | "FUNCTIONAL" | "COMPLETE" | null; + display_model_type?: string; + chips_required?: number; // Number of chips required (1 or 4) } export default function StepperDemo() { @@ -29,17 +33,52 @@ export default function StepperDemo() { const navigate = useNavigate(); const autoDeployModel = searchParams.get("auto-deploy"); - const steps = [ - { label: "Step 1", description: "Model Selection" }, - { label: "Final Step", description: "Deploy Model" }, - ]; + const [chipStatus, setChipStatus] = useState<{ + board_type: string; + total_slots: number; + slots: { slot_id: number; status: string; model_name?: string; deployment_id?: number; is_multi_chip?: boolean }[]; + } | null>(null); + const [totalSlots, setTotalSlots] = useState(null); + const isMultiChipBoard = totalSlots !== null && totalSlots > 1; + + // Fetch chip status on mount and poll every 7 minutes + useEffect(() => { + const fetchChipStatus = () => { + axios + .get("/docker-api/chip-status/") + .then((res) => { + setChipStatus(res.data); + setTotalSlots(res.data.total_slots ?? 
1); + }) + .catch(() => { + setChipStatus(null); + setTotalSlots(1); // safe fallback to single-chip + }); + }; + fetchChipStatus(); + const interval = setInterval(fetchChipStatus, 7 * 60 * 1000); + return () => clearInterval(interval); + }, []); + + const steps = isMultiChipBoard + ? [ + { label: "Step 1", description: "Hardware Configuration" }, + { label: "Step 2", description: "Model Selection" }, + { label: "Final Step", description: "Deploy Model" }, + ] + : [ + { label: "Step 1", description: "Model Selection" }, + { label: "Final Step", description: "Deploy Model" }, + ]; // No-op function for removing dynamic steps (no dynamic steps in this component) const removeDynamicSteps = () => { // This component uses static steps, so no action needed }; + const [chipMode, setChipMode] = useState<"single" | "multi" | null>(null); const [selectedModel, setSelectedModel] = useState(null); + const [selectedDeviceId, setSelectedDeviceId] = useState(0); const [loading, setLoading] = useState(false); const [formError, setFormError] = useState(false); const [isAutoDeploying, setIsAutoDeploying] = useState(false); @@ -72,9 +111,11 @@ export default function StepperDemo() { console.log("Found model for auto-deploy:", model); // Deploy with default weights + const deviceIdParam = parseInt(searchParams.get("device-id") ?? "0", 10); const deployPayload = { model_id: model.id, weights_id: "", // Empty string for default weights + device_id: isNaN(deviceIdParam) ? 
0 : deviceIdParam, }; console.log("Auto-deploy payload:", deployPayload); @@ -137,6 +178,7 @@ export default function StepperDemo() { const payload = JSON.stringify({ model_id, weights_id, + device_id: selectedDeviceId, }); console.log("📦 Deploying with default weights:", { model_id, weights_id }); @@ -171,6 +213,48 @@ export default function StepperDemo() { }; } catch (error) { console.error("Error during deployment:", error); + + // Check if this is a chip allocation conflict error + if (axios.isAxiosError(error) && error.response?.status === 409) { + const errorData = error.response.data; + const errorType = errorData?.error_type; + + if (errorType === 'multi_chip_conflict') { + // Multi-chip conflict with detailed information + const conflicts = errorData?.conflicts || []; + const message = errorData?.message || 'Multi-chip model requires all slots to be free'; + + customToast.error( +
+

Multi-chip Deployment Conflict

+

{message}

+ + {conflicts.length > 0 && ( +
+

Stop these models first:

+
    + {conflicts.map((c: any, i: number) => ( +
  • + • {c.model} (slot {c.slot}) +
  • + ))} +
+

Go to Models Deployed page to stop models.

+
+ )} +
, + { duration: 15000 } + ); + + return { success: false }; + } else if (errorType === 'allocation_failed') { + // General allocation failure (all slots occupied) + const message = errorData?.message || 'All chip slots are occupied'; + customToast.error(`Chip Allocation Failed: ${message}`, { duration: 10000 }); + return { success: false }; + } + } + // Extract error message and job_id from response if available const errorMessage = axios.isAxiosError(error) && error.response?.data?.message @@ -186,6 +270,17 @@ export default function StepperDemo() { } }; + // Wait until we know total_slots to avoid re-mounting Stepper mid-render + if (totalSlots === null) { + return ( +
+
+ Detecting hardware... +
+
+ ); + } + return (
- {step.label === "Step 1" && ( + {/* Multi-chip flow: Step 1 = Hardware Config */} + {isMultiChipBoard && step.label === "Step 1" && ( + { + setChipMode(mode); + setSelectedDeviceId(slotId); + }} + /> + )} + {/* Multi-chip flow: Step 2 = Model Selection (with chipMode filter) */} + {isMultiChipBoard && step.label === "Step 2" && ( + { + console.log("🔄 setSelectedModel called with:", modelId); + setSelectedModel(modelId); + }} + setSelectedDeviceId={setSelectedDeviceId} + setFormError={setFormError} + autoDeployModel={autoDeployModel} + isAutoDeploying={isAutoDeploying} + chipMode={chipMode ?? undefined} + /> + )} + {/* Single-chip flow: Step 1 = Model Selection (no chipMode filter) */} + {!isMultiChipBoard && step.label === "Step 1" && ( { console.log("🔄 setSelectedModel called with:", modelId); setSelectedModel(modelId); }} + setSelectedDeviceId={setSelectedDeviceId} setFormError={setFormError} autoDeployModel={autoDeployModel} isAutoDeploying={isAutoDeploying} /> )} + {/* Both flows: Final Step = Deploy */} {step.label === "Final Step" && ( )} diff --git a/app/frontend/src/components/chatui/runInference.ts b/app/frontend/src/components/chatui/runInference.ts index 251c7cd0..32a8abdc 100644 --- a/app/frontend/src/components/chatui/runInference.ts +++ b/app/frontend/src/components/chatui/runInference.ts @@ -309,7 +309,7 @@ export const runInference = async ( const jsonData = JSON.parse(trimmedLine.slice(5)); // Handle final statistics from backend (after [DONE]) - if (!isAgentSelected && jsonData.ttft && jsonData.tpot) { + if (!isAgentSelected && jsonData.tokens_decoded !== undefined) { const backendStats: InferenceStats = { user_ttft_s: jsonData.ttft, user_tpot: jsonData.tpot, @@ -331,8 +331,8 @@ export const runInference = async ( metricsTracker.recordUsage(usage); } - // Handle generated text content - const content = jsonData.choices[0]?.delta?.content || ""; + // Handle generated text content (chat completions use delta.content, text completions use text) + 
const content = jsonData.choices[0]?.delta?.content ?? jsonData.choices[0]?.text ?? ""; if (content) { // Record first token arrival metricsTracker.recordFirstToken(); diff --git a/app/frontend/src/components/models/DeleteModelDialog.tsx b/app/frontend/src/components/models/DeleteModelDialog.tsx index 887436f2..e8948286 100644 --- a/app/frontend/src/components/models/DeleteModelDialog.tsx +++ b/app/frontend/src/components/models/DeleteModelDialog.tsx @@ -1,7 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC -// React import not needed for modern JSX transform +import type { ReactNode } from "react"; +import { AlertTriangle, CheckCircle, Loader2, Trash2, RotateCcw } from "lucide-react"; import { Dialog, DialogContent, @@ -10,70 +11,176 @@ import { DialogTitle, } from "../ui/dialog"; import { Button } from "../ui/button"; -import { AlertTriangle } from "lucide-react"; + +export type DeleteStep = "deleting" | "resetting" | null; interface Props { open: boolean; modelId: string; isLoading: boolean; + deleteStep: DeleteStep; onConfirm: () => void; onCancel: () => void; } +function StepRow({ + number, + icon, + label, + sublabel, + state, +}: { + number: number; + icon: ReactNode; + label: string; + sublabel?: string; + state: "pending" | "active" | "done"; +}) { + return ( +
+
+ {state === "active" ? ( + + ) : state === "done" ? ( + + ) : ( +
+ {number} +
+ )} +
+
+
+ + {icon} + {label} + +
+ {sublabel && state === "active" && ( +
{sublabel}
+ )} + {state === "done" && ( +
Completed
+ )} +
+
+ ); +} + export default function DeleteModelDialog({ open, - modelId: _modelId, // Marked as intentionally unused for now + modelId: _modelId, isLoading, + deleteStep, onConfirm, onCancel, }: Props) { + const step1State = + deleteStep === "deleting" + ? "active" + : deleteStep === "resetting" + ? "done" + : "pending"; + + const step2State = + deleteStep === "resetting" ? "active" : "pending"; + return ( - !v && onCancel()}> - + !v && !isLoading && onCancel()}> + -
-
- - - Delete Model & Reset Card +
+ {isLoading ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+ + {isLoading + ? deleteStep === "deleting" + ? "Removing model…" + : "Resetting board…" + : "Delete Model & Reset Card"} + {isLoading && ( +

+ Step {deleteStep === "deleting" ? "1" : "2"} of 2 — do not close this window +

+ )}
-
- -
-
- Warning! This action will stop and remove the model, then reset - the card. -
-
- Deleting a model will attempt to stop and remove the model - container. -
- After deletion, the card will automatically be reset using{" "} - tt-smi reset. -
- - This may interrupt any ongoing processes on the card. - -
-
+ +
+ {/* Step 1 */} + } + label="Stop & remove model container" + sublabel="Sending stop signal to the container…" + state={step1State} + /> + + {/* Step 2 */} + } + label="Reset the board" + sublabel="Running tt-smi -r, this may take 10–30 seconds…" + state={step2State} + />
- + + {/* Warning — only shown when idle */} + {!isLoading && ( +
+ + + Warning: This will + interrupt any ongoing processes on the card and cannot be undone. + +
+ )} + + diff --git a/app/frontend/src/components/models/ModelsDeployedCard.tsx b/app/frontend/src/components/models/ModelsDeployedCard.tsx index 2f68ca20..e9e5e696 100644 --- a/app/frontend/src/components/models/ModelsDeployedCard.tsx +++ b/app/frontend/src/components/models/ModelsDeployedCard.tsx @@ -22,6 +22,8 @@ import { handleRedeploy, handleModelNavigationClick, fetchModels, + fetchDeployedModelsInfo, + getModelTypeFromBackendType, } from "../../api/modelsDeployedApis"; import type { ColumnVisibilityMap, @@ -30,10 +32,12 @@ import type { } from "../../types/models"; import ModelsToolbar from "./ModelsToolbar.tsx"; import ModelsTable from "./ModelsTable.tsx"; -import DeleteModelDialog from "./DeleteModelDialog.tsx"; +import DeleteModelDialog, { type DeleteStep } from "./DeleteModelDialog.tsx"; import LogStreamDialog from "./Logs/LogStreamDialog.tsx"; import { useNavigate } from "react-router-dom"; import { useTablePrefs } from "../../hooks/useTablePrefs"; +import axios from "axios"; +import { ChipStatusDisplay } from "../ChipStatusDisplay"; export default function ModelsDeployedCard(): JSX.Element { const { models, setModels, refreshModels } = useModels(); @@ -43,6 +47,27 @@ export default function ModelsDeployedCard(): JSX.Element { const [loading, setLoading] = useState(true); const [loadError, setLoadError] = useState(null); + // Chip slot status for multi-chip boards + const [chipStatus, setChipStatus] = useState<{ + board_type: string; + total_slots: number; + slots: { slot_id: number; status: string; model_name?: string; deployment_id?: number; is_multi_chip?: boolean }[]; + } | null>(null); + + useEffect(() => { + const fetchChipStatus = () => { + axios + .get("/docker-api/chip-status/") + .then((res) => setChipStatus(res.data)) + .catch(() => setChipStatus(null)); + }; + fetchChipStatus(); + const interval = setInterval(fetchChipStatus, 7 * 60 * 1000); + return () => clearInterval(interval); + }, [refreshTrigger]); + + const isMultiChipBoard = 
chipStatus !== null && chipStatus.total_slots > 1; + const { isRefreshing, refreshAllHealth, register } = useHealthRefresh(); const { value: columns, @@ -59,7 +84,10 @@ export default function ModelsDeployedCard(): JSX.Element { setLoadError(null); try { const fetched = await fetchModels(); - setModels(fetched); + const deployedInfo = await fetchDeployedModelsInfo(); + const typeById = Object.fromEntries(deployedInfo.map(d => [d.id, d.model_type])); + const enriched = fetched.map(m => ({ ...m, model_type: m.model_type ?? typeById[m.id] })); + setModels(enriched); if (fetched.length === 0) { triggerRefresh(); } @@ -131,6 +159,7 @@ export default function ModelsDeployedCard(): JSX.Element { const [showDeleteModal, setShowDeleteModal] = useState(false); const [deleteTargetId, setDeleteTargetId] = useState(null); const [isProcessingDelete, setIsProcessingDelete] = useState(false); + const [deleteStep, setDeleteStep] = useState(null); useEffect(() => { loadModels(); @@ -150,30 +179,28 @@ export default function ModelsDeployedCard(): JSX.Element { setIsProcessingDelete(true); const truncatedModelId = deleteTargetId.substring(0, 4); try { + // Step 1: stop & remove the model (backend also runs tt-smi -r internally) + setDeleteStep("deleting"); await customToast.promise(deleteModel(deleteTargetId), { - loading: `Attempting to delete Model ID: ${truncatedModelId}...`, - success: `Model ID: ${truncatedModelId} has been deleted.`, - error: `Failed to delete Model ID: ${truncatedModelId}.`, + loading: `Stopping model ${truncatedModelId}…`, + success: `Model ${truncatedModelId} stopped.`, + error: `Failed to stop model ${truncatedModelId}.`, }); - // Simulate resetCard same as original placeholder - await customToast.promise( - new Promise((resolve) => window.setTimeout(resolve, 2000)), - { - loading: "Resetting card (tt-smi reset)...", - success: "Card reset successfully!", - error: "Failed to reset card.", - } - ); + + // Step 2: board reset is handled by the stop API, show 
progress while cleanup settles + setDeleteStep("resetting"); + await new Promise((resolve) => window.setTimeout(resolve, 2000)); + await refreshModels(); triggerHardwareRefresh(); setShowDeleteModal(false); setDeleteTargetId(null); - // Slight delay then refresh health window.setTimeout(() => { refreshAllHealth(); }, 1000); } finally { setIsProcessingDelete(false); + setDeleteStep(null); } }, [deleteTargetId, refreshModels, triggerHardwareRefresh, refreshAllHealth]); @@ -301,6 +328,18 @@ export default function ModelsDeployedCard(): JSX.Element { />
+ + {/* Chip slot visualization for multi-chip boards */} + {isMultiChipBoard && chipStatus && ( +
+ +
+ )} +
@@ -317,9 +356,13 @@ export default function ModelsDeployedCard(): JSX.Element { setShowDeleteModal(true); }} onRedeploy={(image?: string) => image && handleRedeploy(image)} - onNavigateToModel={(id: string, name: string) => - handleModelNavigationClick(id, name, navigate) - } + onNavigateToModel={(id: string, name: string) => { + const row = rows.find((r) => r.id === id); + const frontendType = row?.model_type + ? getModelTypeFromBackendType(row.model_type) + : undefined; + handleModelNavigationClick(id, name, navigate, frontendType); + }} onOpenApi={(id: string) => { const encoded = encodeURIComponent(id); window.location.href = `/api-info/${encoded}`; @@ -353,8 +396,9 @@ export default function ModelsDeployedCard(): JSX.Element { open={showDeleteModal} modelId={deleteTargetId || ""} isLoading={isProcessingDelete} + deleteStep={deleteStep} onConfirm={handleConfirmDelete} - onCancel={() => setShowDeleteModal(false)} + onCancel={() => !isProcessingDelete && setShowDeleteModal(false)} /> ); diff --git a/app/frontend/src/components/models/ModelsTable.tsx b/app/frontend/src/components/models/ModelsTable.tsx index d02e3a70..aa7242eb 100644 --- a/app/frontend/src/components/models/ModelsTable.tsx +++ b/app/frontend/src/components/models/ModelsTable.tsx @@ -12,6 +12,7 @@ import { } from "../ui/table"; import { Activity, + Cpu, Heart, Network, // Settings, @@ -129,6 +130,13 @@ export default function ModelsTable({ /> Model Name + + + Chip + {image && (
@@ -178,6 +186,7 @@ export default function ModelsTable({ const isExpanded = !!expanded[row.id]; const colCount = 1 /* name */ + + 1 /* chip */ + 1 /* status */ + 1 /* health */ + 1 /* manage */ + @@ -206,6 +215,16 @@ export default function ModelsTable({ + + {row.device_id != null ? ( + + + Slot {String(row.device_id).padStart(2, "0")} + + ) : ( + + )} + {image ? ( @@ -231,6 +250,7 @@ export default function ModelsTable({ id={row.id} name={row.name} image={row.image} + model_type={row.model_type} health={healthMap[row.id]} onDelete={onDelete} onRedeploy={onRedeploy} @@ -259,6 +279,10 @@ export default function ModelsTable({
+
+
Chip Slot
+ +
Ports
diff --git a/app/frontend/src/components/models/row-cells/ManageCell.tsx b/app/frontend/src/components/models/row-cells/ManageCell.tsx index 5e863c6f..4804a75e 100644 --- a/app/frontend/src/components/models/row-cells/ManageCell.tsx +++ b/app/frontend/src/components/models/row-cells/ManageCell.tsx @@ -11,10 +11,12 @@ import { Image as ImageIcon, Crosshair, Mic, + Volume2, } from "lucide-react"; import type { HealthStatus } from "../../../types/models"; import { getModelTypeFromName, + getModelTypeFromBackendType, ModelType, } from "../../../api/modelsDeployedApis"; @@ -22,6 +24,7 @@ interface Props { id: string; name?: string; image?: string; + model_type?: string; health?: HealthStatus; onDelete: (id: string) => void; onRedeploy: (image?: string) => void; @@ -33,6 +36,7 @@ export default React.memo(function ManageCell({ id, name, image: _image, + model_type, health, onDelete, onRedeploy: _onRedeploy, @@ -48,7 +52,9 @@ export default React.memo(function ManageCell({ const dangerBtn = "!border-red-400/70 !text-red-300 !bg-red-600/20 hover:!bg-red-600/30 shadow-[0_8px_24px_rgba(255,0,0,0.15)]"; - const modelType = getModelTypeFromName(name ?? ""); + const modelType = model_type + ? getModelTypeFromBackendType(model_type) + : getModelTypeFromName(name ?? ""); const openLabel = modelType === ModelType.ImageGeneration ? "Image Gen" @@ -56,7 +62,9 @@ export default React.memo(function ManageCell({ ? "Object Detect" : modelType === ModelType.SpeechRecognitionModel ? "Speech" - : "Chat"; + : modelType === ModelType.TTS + ? "TTS" + : "Chat"; const OpenIcon = modelType === ModelType.ImageGeneration ? ImageIcon @@ -64,7 +72,9 @@ export default React.memo(function ManageCell({ ? Crosshair : modelType === ModelType.SpeechRecognitionModel ? Mic - : MessageSquareText; + : modelType === ModelType.TTS + ? Volume2 + : MessageSquareText; return (
diff --git a/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx new file mode 100644 index 00000000..fcdf11dc --- /dev/null +++ b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +import { useEffect, useRef, useState } from "react"; +import { Mic, Square, Volume2, CheckCircle, Loader2, Circle } from "lucide-react"; +import { Button } from "../ui/button"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "../ui/select"; +import { runVoicePipeline } from "../../api/modelsDeployedApis"; +import { customToast } from "../CustomToaster"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface DeployedModelInfo { + id: string; + modelName: string; + model_type?: string; +} + +type PipelineStage = "idle" | "recording" | "stt" | "llm" | "tts" | "done"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function fetchDeployedByType( + modelType: string +): Promise { + try { + const res = await fetch("/models-api/deployed/"); + if (!res.ok) return []; + const data = await res.json(); + return Object.entries(data) + .map(([id, info]: [string, any]) => ({ + id, + modelName: + info.model_impl?.model_name || + info.model_impl?.hf_model_id || + "Unknown", + model_type: info.model_impl?.model_type, + })) + .filter((m) => m.model_type === modelType); + } catch { + return []; + } +} + +// --------------------------------------------------------------------------- +// Stage indicator +// --------------------------------------------------------------------------- + +const STAGES: { key: 
PipelineStage; label: string }[] = [ + { key: "recording", label: "Mic" }, + { key: "stt", label: "Whisper" }, + { key: "llm", label: "LLM" }, + { key: "tts", label: "TTS" }, +]; + +const STAGE_ORDER: Record = { + idle: -1, + recording: 0, + stt: 1, + llm: 2, + tts: 3, + done: 4, +}; + +function StageIndicator({ current }: { current: PipelineStage }) { + return ( +
+ {STAGES.map((s, i) => { + const order = STAGE_ORDER[s.key]; + const currentOrder = STAGE_ORDER[current]; + const isDone = currentOrder > order; + const isActive = current === s.key; + + return ( +
+ {i > 0 && ( +
+ )} +
+ {isDone ? ( + + ) : isActive ? ( + + ) : ( + + )} + + {s.label} + +
+
+ ); + })} +
+ ); +} + +// --------------------------------------------------------------------------- +// Main component +// --------------------------------------------------------------------------- + +export default function VoicePipelineDemo() { + // Model dropdowns + const [sttModels, setSttModels] = useState([]); + const [llmModels, setLlmModels] = useState([]); + const [ttsModels, setTtsModels] = useState([]); + + const [whisperDeployId, setWhisperDeployId] = useState(""); + const [llmDeployId, setLlmDeployId] = useState(""); + const [ttsDeployId, setTtsDeployId] = useState(""); + + // Recording + const [isRecording, setIsRecording] = useState(false); + const mediaRecorderRef = useRef(null); + const chunksRef = useRef([]); + + // Pipeline state + const [stage, setStage] = useState("idle"); + const [transcript, setTranscript] = useState(""); + const [llmResponse, setLlmResponse] = useState(""); + const [audioUrl, setAudioUrl] = useState(null); + const audioRef = useRef(null); + + // Fetch deployed models on mount + useEffect(() => { + Promise.all([ + fetchDeployedByType("speech_recognition"), + fetchDeployedByType("chat"), + fetchDeployedByType("tts"), + ]).then(([stt, llm, tts]) => { + setSttModels(stt); + setLlmModels(llm); + setTtsModels(tts); + if (stt.length > 0) setWhisperDeployId(stt[0].id); + if (llm.length > 0) setLlmDeployId(llm[0].id); + if (tts.length > 0) setTtsDeployId(tts[0].id); + }); + }, []); + + const startRecording = async () => { + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const mr = new MediaRecorder(stream); + chunksRef.current = []; + mr.ondataavailable = (e) => chunksRef.current.push(e.data); + mr.start(); + mediaRecorderRef.current = mr; + setIsRecording(true); + setStage("recording"); + setTranscript(""); + setLlmResponse(""); + setAudioUrl(null); + } catch (err) { + customToast.error("Microphone access denied"); + } + }; + + const stopRecording = () => { + const mr = mediaRecorderRef.current; + if 
(!mr) return; + mr.onstop = async () => { + const blob = new Blob(chunksRef.current, { type: "audio/webm" }); + const file = new File([blob], "recording.webm", { type: "audio/webm" }); + await runPipeline(file); + }; + mr.stop(); + mr.stream.getTracks().forEach((t) => t.stop()); + setIsRecording(false); + }; + + const runPipeline = async (audioFile: File) => { + if (!whisperDeployId || !llmDeployId) { + customToast.error("Please select STT and LLM models"); + setStage("idle"); + return; + } + + setStage("stt"); + let llmText = ""; + + await runVoicePipeline( + { + audioFile, + whisperDeployId, + llmDeployId, + ttsDeployId: ttsDeployId || undefined, + }, + // onTranscript + (text) => { + setTranscript(text); + setStage("llm"); + }, + // onLlmChunk + (chunk) => { + llmText += chunk; + setLlmResponse((prev) => prev + chunk); + }, + // onAudio + (url) => { + setAudioUrl(url); + setStage("tts"); + // Auto-play + setTimeout(() => { + if (audioRef.current) { + audioRef.current.src = url; + audioRef.current.play().catch(() => {}); + } + }, 100); + }, + // onError + (stage, message) => { + customToast.error(`Pipeline error (${stage}): ${message}`); + setStage("idle"); + }, + // onDone + () => { + setStage("done"); + } + ); + }; + + return ( +
+

+ Voice Pipeline Demo +

+

+ Mic → Whisper STT → LLM → TTS → Speaker +

+ + {/* Model selectors */} +
+
+ + +
+ +
+ + +
+ +
+ + +
+
+ + {/* Stage indicator */} +
+ +
+ + {/* Record button */} +
+ {isRecording ? ( + + ) : ( + + )} +
+ + {/* Outputs */} + {transcript && ( +
+

+ Transcript +

+

+ {transcript} +

+
+ )} + + {llmResponse && ( +
+

+ LLM Response +

+

+ {llmResponse} +

+
+ )} + + {audioUrl && ( +
+ +
+ )} + + {/* Hidden audio element for autoplay */} + {!audioUrl &&
+ ); +} diff --git a/app/frontend/src/components/tts/TTSDemo.tsx b/app/frontend/src/components/tts/TTSDemo.tsx new file mode 100644 index 00000000..3d9c6cbc --- /dev/null +++ b/app/frontend/src/components/tts/TTSDemo.tsx @@ -0,0 +1,275 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +import { useEffect, useRef, useState } from "react"; +import { Volume2, Loader2, Download } from "lucide-react"; +import { motion } from "framer-motion"; +import { Button } from "../ui/button"; +import { Textarea } from "../ui/textarea"; +import { Card } from "../ui/card"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "../ui/select"; +import { runTTSInference } from "../../api/modelsDeployedApis"; +import { customToast } from "../CustomToaster"; + +interface DeployedModelInfo { + id: string; + modelName: string; + model_type?: string; +} + +async function fetchTTSModels(): Promise { + try { + const res = await fetch("/models-api/deployed/"); + if (!res.ok) return []; + const data = await res.json(); + return Object.entries(data) + .map(([id, info]: [string, any]) => ({ + id, + modelName: + info.model_impl?.model_name || + info.model_impl?.hf_model_id || + "Unknown", + model_type: info.model_impl?.model_type, + })) + .filter((m) => m.model_type === "tts"); + } catch { + return []; + } +} + +export default function TTSDemo() { + const [ttsModels, setTtsModels] = useState([]); + const [selectedDeployId, setSelectedDeployId] = useState(""); + const [text, setText] = useState(""); + const [audioUrl, setAudioUrl] = useState(null); + const [isLoading, setIsLoading] = useState(false); + const audioRef = useRef(null); + + useEffect(() => { + fetchTTSModels().then((models) => { + setTtsModels(models); + if (models.length > 0) setSelectedDeployId(models[0].id); + }); + }, []); + + // Revoke previous object URL to avoid memory leaks + useEffect(() => { + return () => { + if (audioUrl) 
URL.revokeObjectURL(audioUrl); + }; + }, [audioUrl]); + + const handleGenerate = async () => { + if (!selectedDeployId) { + customToast.error("Please select a TTS model"); + return; + } + if (!text.trim()) { + customToast.error("Please enter some text to synthesize"); + return; + } + + setIsLoading(true); + if (audioUrl) { + URL.revokeObjectURL(audioUrl); + setAudioUrl(null); + } + + try { + const blob = await runTTSInference(selectedDeployId, text.trim()); + const url = URL.createObjectURL(blob); + setAudioUrl(url); + setTimeout(() => { + if (audioRef.current) { + audioRef.current.src = url; + audioRef.current.play().catch(() => {}); + } + }, 100); + } catch (err) { + customToast.error( + `TTS generation failed: ${err instanceof Error ? err.message : "Unknown error"}` + ); + } finally { + setIsLoading(false); + } + }; + + const handleKeyDown = (e: React.KeyboardEvent) => { + if (e.key === "Enter" && !e.shiftKey) { + e.preventDefault(); + if (!isLoading && ttsModels.length > 0 && text.trim()) { + handleGenerate(); + } + } + }; + + const handleDownload = () => { + if (!audioUrl) return; + const a = document.createElement("a"); + a.href = audioUrl; + a.download = "tts-output.wav"; + a.click(); + }; + + return ( + +
+
+ {/* Header */} + +

+ Text to Speech Demo +

+

+ Type text below and generate audio using a deployed TTS model. +

+
+ + {/* Model selector */} + + + {ttsModels.length === 0 ? ( +
+ No TTS models are currently deployed. Deploy a TTS model to + get started. +
+ ) : ( + + )} +
+ + {/* Text input */} + + +