diff --git a/.gitignore b/.gitignore index 9a79cad0..52a9a37b 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,5 @@ inference-api/__pycache__/ CLAUDE.md docs/RAG_PRODUCTIONIZATION_PLAN.md docs/DOCKER_CONTROL_SERVICE_PLAN.md +!app/backend/shared_config/models_from_inference_server.json + diff --git a/app/backend/api/settings.py b/app/backend/api/settings.py index cf7d799e..06671024 100644 --- a/app/backend/api/settings.py +++ b/app/backend/api/settings.py @@ -64,11 +64,6 @@ # Application definition INSTALLED_APPS = [ - "django.contrib.admin", - "django.contrib.auth", - "django.contrib.contenttypes", - "django.contrib.sessions", - "django.contrib.messages", "django.contrib.staticfiles", "docker_control.apps.DockerControlConfig", "model_control", @@ -81,11 +76,8 @@ MIDDLEWARE = [ "corsheaders.middleware.CorsMiddleware", "django.middleware.security.SecurityMiddleware", - "django.contrib.sessions.middleware.SessionMiddleware", "django.middleware.common.CommonMiddleware", "django.middleware.csrf.CsrfViewMiddleware", - "django.contrib.auth.middleware.AuthenticationMiddleware", - "django.contrib.messages.middleware.MessageMiddleware", "django.middleware.clickjacking.XFrameOptionsMiddleware", ] @@ -100,25 +92,12 @@ "context_processors": [ "django.template.context_processors.debug", "django.template.context_processors.request", - "django.contrib.auth.context_processors.auth", - "django.contrib.messages.context_processors.messages", ], }, }, ] WSGI_APPLICATION = "api.wsgi.application" -SESSIONS_ENGINE = "django.contrib.sessions.backends.cache" -# Database -# https://docs.djangoproject.com/en/4.2/ref/settings/#databases - -# SQLite database for deployment history and other persistent data -DATABASES = { - "default": { - "ENGINE": "django.db.backends.sqlite3", - "NAME": backend_config.backend_cache_root / "db.sqlite3", - } -} # local memory thread-safe default # the LOCATION for locmem.LocMemCache cache backend is just a name for tracking @@ -135,24 +114,6 @@ }, } -# 
Password validation -# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators - -AUTH_PASSWORD_VALIDATORS = [ - { - "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", - }, - { - "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", - }, -] - # Internationalization # https://docs.djangoproject.com/en/4.2/topics/i18n/ diff --git a/app/backend/api/urls.py b/app/backend/api/urls.py index 441f06b7..f8aa4a19 100644 --- a/app/backend/api/urls.py +++ b/app/backend/api/urls.py @@ -19,12 +19,10 @@ 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) """ -from django.contrib import admin from api.views import UpStatusView from django.urls import include, path urlpatterns = [ - path("admin/", admin.site.urls), path("up/", UpStatusView.as_view()), path("docker/", include("docker_control.urls")), path("models/", include("model_control.urls")), diff --git a/app/backend/board_control/services.py b/app/backend/board_control/services.py index 2c08a231..48b8c30f 100644 --- a/app/backend/board_control/services.py +++ b/app/backend/board_control/services.py @@ -16,12 +16,16 @@ class SystemResourceService: """Service for monitoring system resources and TT device telemetry""" - + # Cache keys and timeout TT_SMI_CACHE_KEY = "tt_smi_data" TT_SMI_CACHE_TIMEOUT = 3600 # Cache for 1 hour (since we'll refresh on events only) BOARD_TYPE_CACHE_KEY = "board_type_data" BOARD_TYPE_CACHE_TIMEOUT = 3600 # Cache board type for 1 hour (since it rarely changes) + + # Device state cache keys + DEVICE_STATE_CACHE_KEY = "device_state_v2" + DEVICE_RESETTING_KEY = "device_resetting" @staticmethod def get_tt_smi_data(timeout=10): @@ -412,9 +416,245 @@ def force_refresh_tt_smi_cache(): # Clear the existing cache 
@staticmethod
def _extract_board_type_from_data(data):
    """
    Extract the canonical board-type string from parsed tt-smi JSON.

    Args:
        data: Parsed ``tt-smi -s`` output. Expected to be a dict with a
            ``device_info`` list whose entries carry
            ``board_info.board_type``.

    Returns:
        A canonical board name (for example ``"N150"``, ``"T3K"``,
        ``"P150X4"`` or ``"GALAXY"``), or ``"unknown"`` when the data is
        empty or malformed, reports more than one board family, or names a
        board this function does not recognise.
    """
    device_info = data.get("device_info") if isinstance(data, dict) else None
    if not isinstance(device_info, (list, tuple)) or not device_info:
        return "unknown"

    def _base_name(entry):
        # dict.get() defaults only apply to *missing* keys, so JSON nulls and
        # non-dict entries are screened explicitly instead of raising
        # AttributeError further down.
        board_info = entry.get("board_info") if isinstance(entry, dict) else None
        board_type = (
            board_info.get("board_type", "unknown")
            if isinstance(board_info, dict)
            else "unknown"
        )
        if not isinstance(board_type, str):
            return "unknown"
        # Drop the trailing space-separated token (the "local"/"remote"
        # style suffix) so all chips of one board family compare equal.
        return board_type.rsplit(" ", 1)[0]

    unique = {_base_name(entry) for entry in device_info}
    if len(unique) > 1:
        logger.warning(f"Mixed board types detected: {unique}")
        return "unknown"

    raw = unique.pop()
    raw_lower = raw.lower()
    num_devices = len(device_info)

    # Checked in order: (substring marker, ((minimum device count, name), ...)).
    # Every tier list ends with a 0 threshold, so a match always resolves.
    tiers = (
        ("n150", ((4, "N150X4"), (0, "N150"))),
        ("n300", ((4, "T3K"), (0, "N300"))),
        ("p300", ((8, "P300Cx4"), (4, "P300Cx2"), (0, "P300c"))),
        ("p150", ((8, "P150X8"), (4, "P150X4"), (0, "P150"))),
        ("p100", ((0, "P100"),)),
        ("e150", ((0, "E150"),)),
    )
    for marker, sizes in tiers:
        if marker in raw_lower:
            return next(name for minimum, name in sizes if num_devices >= minimum)

    if "galaxy" in raw_lower:
        return "GALAXY_T3K" if "t3k" in raw_lower else "GALAXY"

    logger.warning(f"Unknown board type string: {raw!r}")
    return "unknown"
@staticmethod
def _extract_devices_from_data(data):
    """
    Extract a per-device summary list from parsed tt-smi JSON.

    Args:
        data: Parsed ``tt-smi -s`` output (a dict with a ``device_info`` list).

    Returns:
        A list with one dict per device holding ``index``, ``board_type``,
        ``bus_id``, ``temperature``, ``power`` and ``voltage``. Missing or
        unparsable telemetry values become ``0.0``. Malformed input yields
        an empty list.
    """
    def _as_dict(value):
        # JSON nulls and unexpected types are treated as "no data".
        return value if isinstance(value, dict) else {}

    def _as_float(value):
        try:
            return float(value) if value is not None else 0.0
        except (TypeError, ValueError):
            return 0.0

    device_info = _as_dict(data).get("device_info")
    if not isinstance(device_info, (list, tuple)):
        return []

    devices = []
    for idx, device in enumerate(device_info):
        device = _as_dict(device)
        board_info = _as_dict(device.get("board_info"))
        telemetry = _as_dict(device.get("telemetry"))
        devices.append({
            "index": idx,
            "board_type": board_info.get("board_type", "Unknown"),
            "bus_id": board_info.get("bus_id", "N/A"),
            "temperature": _as_float(telemetry.get("asic_temperature")),
            "power": _as_float(telemetry.get("power")),
            "voltage": _as_float(telemetry.get("voltage")),
        })
    return devices

@staticmethod
def _build_device_state(state, board_name, board_type="unknown", devices=None,
                        reset_suggested=False, cache_timeout=None):
    """
    Build the device-state payload and optionally cache it.

    Args:
        state: State name (``HEALTHY``, ``BAD_STATE``, ...).
        board_name: Human-readable board label.
        board_type: Canonical board type; defaults to ``"unknown"``.
        devices: Device summary list; defaults to an empty list.
        reset_suggested: Whether the client should offer a board reset.
        cache_timeout: When not ``None``, the payload is stored under
            ``DEVICE_STATE_CACHE_KEY`` for this many seconds.

    Returns:
        The payload dict.
    """
    result = {
        "state": state,
        "board_type": board_type,
        "board_name": board_name,
        "devices": devices if devices is not None else [],
        "last_updated": timezone.now().isoformat(),
        "reset_suggested": reset_suggested,
    }
    if cache_timeout is not None:
        cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=cache_timeout)
    return result

@staticmethod
def _terminate_process_group(process):
    """
    Stop a timed-out child started in its own session and reap it.

    Sends SIGTERM to the child's process group, escalates to SIGKILL if it
    has not exited within 2 seconds, and calls ``communicate()`` so the
    pipes are drained and closed and the child does not linger as a zombie.
    """
    try:
        os.killpg(os.getpgid(process.pid), signal.SIGTERM)
        process.communicate(timeout=2)
        return
    except (OSError, subprocess.TimeoutExpired):
        # Either the group is already gone or SIGTERM was not enough.
        pass

    try:
        os.killpg(os.getpgid(process.pid), signal.SIGKILL)
    except OSError:
        pass

    try:
        process.communicate(timeout=2)
    except (OSError, ValueError, subprocess.TimeoutExpired):
        # Do not block the request thread any longer on a child that
        # still has not exited.
        logger.warning("tt-smi -s did not exit after SIGKILL")

@staticmethod
def get_device_state():
    """
    Single authoritative device state resolver.

    States:
      HEALTHY      — tt-smi -s succeeded and its JSON parsed (cached 30s)
      BAD_STATE    — /dev/tenstorrent present but tt-smi timed out, exited
                     non-zero or printed invalid JSON (cached 10s)
      RESETTING    — the resetting flag is set in the cache
      NOT_PRESENT  — /dev/tenstorrent path does not exist (cached 15s)
      UNKNOWN      — tt-smi missing or an unexpected error (never cached)

    Returns:
        dict with keys ``state``, ``board_type``, ``board_name``,
        ``devices``, ``last_updated`` and ``reset_suggested``.
    """
    build = SystemResourceService._build_device_state

    # RESETTING takes priority — check before the state cache.
    if cache.get(SystemResourceService.DEVICE_RESETTING_KEY):
        return build("RESETTING", "Resetting…")

    # Return cached result if still fresh.
    cached = cache.get(SystemResourceService.DEVICE_STATE_CACHE_KEY)
    if cached is not None:
        return cached

    # Check physical device presence.
    if not os.path.exists("/dev/tenstorrent"):
        return build("NOT_PRESENT", "Not Present", cache_timeout=15)

    def bad_state():
        return build("BAD_STATE", "Bad State", reset_suggested=True, cache_timeout=10)

    try:
        logger.info("Running tt-smi -s for device state check")
        process = subprocess.Popen(
            ["tt-smi", "-s"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            text=True,
            # Same effect as preexec_fn=os.setsid (own session / process
            # group), but safe to use when other threads are running.
            start_new_session=True,
        )

        try:
            stdout, stderr = process.communicate(timeout=10)
        except subprocess.TimeoutExpired:
            logger.error("tt-smi -s timed out after 10s — board in BAD_STATE")
            SystemResourceService._terminate_process_group(process)
            return bad_state()

        if process.returncode != 0:
            logger.error(f"tt-smi -s exit code {process.returncode}: {stderr.strip()!r}")
            return bad_state()

        try:
            data = json.loads(stdout)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse tt-smi JSON: {e}")
            return bad_state()

        board_type = SystemResourceService._extract_board_type_from_data(data)
        devices = SystemResourceService._extract_devices_from_data(data)
        return build(
            "HEALTHY",
            board_type,
            board_type=board_type,
            devices=devices,
            cache_timeout=30,
        )

    except FileNotFoundError:
        logger.error("tt-smi command not found")
        # Don't cache UNKNOWN so each call re-checks (tt-smi may be installed later)
        return build("UNKNOWN", "Unknown")
    except Exception as e:
        logger.error(f"Unexpected error in get_device_state: {e}")
        return build("UNKNOWN", "Unknown")

@staticmethod
def set_resetting_state():
    """Mark the device as actively resetting (clears state cache).

    The flag expires on its own after 120 seconds if it is never cleared.
    """
    cache.set(SystemResourceService.DEVICE_RESETTING_KEY, True, timeout=120)
    cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY)
    logger.info("Device state set to RESETTING")

@staticmethod
def clear_device_state_cache():
    """Clear device state cache and resetting flag after reset completes."""
    cache.delete(SystemResourceService.DEVICE_STATE_CACHE_KEY)
    cache.delete(SystemResourceService.DEVICE_RESETTING_KEY)
    logger.info("Device state cache cleared")
@method_decorator(csrf_exempt, name='dispatch')
class RefreshCacheView(APIView):
    """Debug endpoint that forces a refresh of the cached tt-smi data."""

    def post(self, request, *args, **kwargs):
        """Refresh the tt-smi cache; reply 200 on success, 500 on failure."""
        try:
            logger.info("Manual cache refresh requested")
            SystemResourceService.force_refresh_tt_smi_cache()
        except Exception as exc:
            details = str(exc)
            logger.error(f"Error manually refreshing cache: {details}")
            failure = {"error": "Failed to refresh cache", "details": details}
            return Response(failure, status=status.HTTP_500_INTERNAL_SERVER_ERROR)

        payload = {
            "status": "success",
            "message": "tt-smi cache refreshed successfully",
        }
        return Response(payload, status=status.HTTP_200_OK)


@method_decorator(csrf_exempt, name='dispatch')
class DeviceStateView(APIView):
    """
    GET /board-api/device-state/

    Unified board-state endpoint. It returns the payload produced by
    SystemResourceService.get_device_state(), so clients can poll this one
    URL instead of /board-api/status/, /board-api/footer-data/ and
    /docker-api/board-info/.
    """

    def get(self, request, *args, **kwargs):
        """Return the current device state; always answers with HTTP 200."""
        try:
            payload = SystemResourceService.get_device_state()
        except Exception as exc:
            logger.error(f"Error getting device state: {exc}")
            # A resolver failure is reported as an UNKNOWN state rather
            # than an HTTP error.
            payload = {
                "state": "UNKNOWN",
                "board_type": "unknown",
                "board_name": "Unknown",
                "devices": [],
                "last_updated": timezone.now().isoformat(),
                "reset_suggested": False,
            }
        return Response(payload, status=status.HTTP_200_OK)


@method_decorator(csrf_exempt, name='dispatch')
class DeviceResetView(APIView):
    """
    POST /board-api/device-reset/

    Dedicated board reset endpoint, kept apart from the Docker-coupled
    /docker-api/reset_board/. It delegates to
    docker_control.docker_utils.perform_reset().
    """

    def post(self, request, *args, **kwargs):
        """Run a board reset and report success, message and attempts used."""
        from docker_control.docker_utils import perform_reset

        try:
            logger.info("Device reset requested via /board-api/device-reset/")
            outcome = perform_reset()
            # perform_reset() supplies the HTTP status under "http_status";
            # default to 200 when it is absent.
            code = outcome.pop("http_status", 200)
            body = {
                "success": outcome.get("status") == "success",
                "message": outcome.get("message", ""),
                "attempts_used": outcome.get("attempts_used", 0),
            }
            return Response(body, status=code)
        except Exception as exc:
            logger.error(f"Error in device reset: {exc}")
            body = {
                "success": False,
                "message": str(exc),
                "attempts_used": 0,
            }
            return Response(body, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
" - "Run: python manage.py migrate docker_control" - ) - else: - # Count existing deployment records - from docker_control.models import ModelDeployment - count = ModelDeployment.objects.count() - logger.info(f"Deployment history table verified. Existing records: {count}") + from docker_control.models import ModelDeployment + count = ModelDeployment.objects.count() + logger.info(f"Deployment store loaded. Existing records: {count}") except Exception as e: - logger.warning(f"Could not verify deployment history table: {e}") - + logger.warning(f"Could not read deployment store: {e}") + # Start container health monitoring service try: from docker_control.health_monitor import start_health_monitoring diff --git a/app/backend/docker_control/deployment_store.py b/app/backend/docker_control/deployment_store.py new file mode 100644 index 00000000..ba5421fd --- /dev/null +++ b/app/backend/docker_control/deployment_store.py @@ -0,0 +1,248 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +""" +Thread-safe JSON file store replacing Django ORM for ModelDeployment. + +Provides a drop-in ORM-like interface (objects.create, filter, all, get, save) +backed by a single JSON file in the persistent storage volume. 
_STORE_PATH = (
    Path(os.getenv("INTERNAL_PERSISTENT_STORAGE_VOLUME", "/tt_studio_persistent_volume"))
    / "backend_volume"
    / "deployments.json"
)

# Guards every load/modify/save cycle on the store within this process.
_lock = threading.Lock()


def _now() -> datetime:
    """Return the current time as a timezone-aware UTC datetime."""
    return datetime.now(timezone.utc)


def _parse_dt(s: Optional[str]) -> Optional[datetime]:
    """Parse an ISO-8601 string; return None for None or unparsable input."""
    if s is None:
        return None
    try:
        return datetime.fromisoformat(s)
    except (TypeError, ValueError):
        return None


def _sort_key(record: dict, field: str):
    """
    Return a sortable key for ``record[field]``.

    The key is ``(is_present, value)`` so records whose field is missing or
    None sort before all others without being compared against real values.
    A plain ``""`` placeholder raises TypeError as soon as the field holds
    ints (``id``, ``port``, ``device_id``). Timestamps are stored as ISO
    strings, which sort lexicographically in chronological order.
    """
    val = record.get(field)
    return (val is not None, val)


def _empty_store() -> dict:
    """Return the structure of a brand-new, empty store."""
    return {"next_id": 1, "records": []}


def _load_raw() -> dict:
    """
    Read the store file and return ``{"next_id": int, "records": list}``.

    A missing, unreadable or structurally invalid file yields an empty
    store. ``next_id`` is raised when it is missing or not greater than the
    highest id already present, so newly created records never reuse an id.
    """
    if not _STORE_PATH.exists():
        return _empty_store()
    try:
        with open(_STORE_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        logger.warning(f"Could not read deployment store, starting fresh: {e}")
        return _empty_store()

    if not isinstance(data, dict) or not isinstance(data.get("records"), list):
        logger.warning("Deployment store has an unexpected structure, starting fresh")
        return _empty_store()

    highest_id = max(
        (
            r["id"]
            for r in data["records"]
            if isinstance(r, dict) and isinstance(r.get("id"), int)
        ),
        default=0,
    )
    next_id = data.get("next_id")
    if not isinstance(next_id, int) or next_id <= highest_id:
        data["next_id"] = highest_id + 1
    return data


def _save_raw(data: dict) -> None:
    """
    Write the store to disk via a temporary file and ``os.replace``.

    Best effort: a failure is logged and the temporary file removed, but no
    exception is raised to the caller.
    """
    _STORE_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp = _STORE_PATH.with_suffix(".tmp")
    try:
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, default=str)
        os.replace(tmp, _STORE_PATH)
    except Exception as e:
        logger.error(f"Failed to save deployment store: {e}")
        try:
            tmp.unlink(missing_ok=True)
        except Exception:
            pass


def _match(record: dict, kwargs: dict) -> bool:
    """Match a record against filter kwargs, supporting __in and __isnull suffixes."""
    for key, val in kwargs.items():
        if key.endswith("__in"):
            field = key[: -len("__in")]
            if record.get(field) not in val:
                return False
        elif key.endswith("__isnull"):
            field = key[: -len("__isnull")]
            is_null = record.get(field) is None
            if is_null != val:
                return False
        else:
            if record.get(key) != val:
                return False
    return True


class _QuerySet:
    """Immutable, in-memory view over raw record dicts with a small ORM-like API."""

    def __init__(self, records: List[dict]):
        self._records = records

    def filter(self, **kwargs) -> "_QuerySet":
        """Return a new query set holding only the records matching ``kwargs``."""
        return _QuerySet([r for r in self._records if _match(r, kwargs)])

    def order_by(self, *fields) -> "_QuerySet":
        """
        Return a new query set sorted by ``fields``.

        A leading ``-`` means descending. Records whose field is None come
        first in ascending order and last in descending order.
        """
        records = list(self._records)
        # Sort by the least significant field first; list.sort is stable
        # (also with reverse=True), so earlier fields end up taking priority.
        for field in reversed(fields):
            reverse = field.startswith("-")
            fname = field.lstrip("-")
            records.sort(key=lambda r, f=fname: _sort_key(r, f), reverse=reverse)
        return _QuerySet(records)

    def first(self) -> Optional["ModelDeployment"]:
        """Return the first record as a ModelDeployment, or None when empty."""
        if not self._records:
            return None
        return ModelDeployment._from_dict(self._records[0])

    def exists(self) -> bool:
        """Return True when the query set holds at least one record."""
        return len(self._records) > 0

    def count(self) -> int:
        """Return the number of records."""
        return len(self._records)

    def get(self, **kwargs) -> "ModelDeployment":
        """
        Return the single record matching ``kwargs``.

        Raises:
            ModelDeployment.DoesNotExist: no record matches.
            ModelDeployment.MultipleObjectsReturned: more than one matches.
        """
        matches = [r for r in self._records if _match(r, kwargs)]
        if not matches:
            raise ModelDeployment.DoesNotExist(f"No record matching {kwargs}")
        if len(matches) > 1:
            raise ModelDeployment.MultipleObjectsReturned(
                f"Multiple records matching {kwargs}"
            )
        return ModelDeployment._from_dict(matches[0])

    def __iter__(self):
        return (ModelDeployment._from_dict(r) for r in self._records)

    def __getitem__(self, key):
        if isinstance(key, slice):
            return _QuerySet(self._records[key])
        return ModelDeployment._from_dict(self._records[key])

    def __len__(self) -> int:
        return len(self._records)


class _Manager:
    """Entry point exposed as ``ModelDeployment.objects``."""

    def create(self, **kwargs) -> "ModelDeployment":
        """
        Append a new record to the store and return it.

        ``id`` is taken from the store's ``next_id`` counter, ``deployed_at``
        is set to the current UTC time and ``stopped_at`` starts as None.
        The remaining fields come from ``kwargs`` with the defaults shown
        below.
        """
        with _lock:
            data = _load_raw()
            record = {
                "id": data["next_id"],
                "container_id": kwargs.get("container_id", ""),
                "container_name": kwargs.get("container_name", ""),
                "model_name": kwargs.get("model_name", ""),
                "device": kwargs.get("device", ""),
                "deployed_at": _now().isoformat(),
                "stopped_at": None,
                "status": kwargs.get("status", "running"),
                "stopped_by_user": kwargs.get("stopped_by_user", False),
                "port": kwargs.get("port", None),
                "device_id": kwargs.get("device_id", 0),
                "workflow_log_path": kwargs.get("workflow_log_path", None),
            }
            data["next_id"] += 1
            data["records"].append(record)
            _save_raw(data)
        return ModelDeployment._from_dict(record)

    def all(self) -> _QuerySet:
        """Return a query set over a snapshot of every stored record."""
        with _lock:
            data = _load_raw()
        return _QuerySet(list(data["records"]))

    def filter(self, **kwargs) -> _QuerySet:
        """Shortcut for ``all().filter(**kwargs)``."""
        return self.all().filter(**kwargs)

    def get(self, **kwargs) -> "ModelDeployment":
        """Shortcut for ``all().get(**kwargs)``."""
        return self.all().get(**kwargs)


class ModelDeployment:
    """One deployment record, exposed as attributes."""

    class DoesNotExist(Exception):
        """Raised by ``get()`` when no record matches."""

    class MultipleObjectsReturned(Exception):
        """Raised by ``get()`` when more than one record matches."""

    objects: _Manager  # set below

    def __init__(self):
        self.id: Optional[int] = None
        self.container_id: str = ""
        self.container_name: str = ""
        self.model_name: str = ""
        self.device: str = ""
        self.deployed_at: Optional[datetime] = None
        self.stopped_at: Optional[datetime] = None
        self.status: str = "running"
        self.stopped_by_user: bool = False
        self.port: Optional[int] = None
        self.device_id: int = 0
        self.workflow_log_path: Optional[str] = None

    @classmethod
    def _from_dict(cls, d: dict) -> "ModelDeployment":
        """Build an instance from a raw record, parsing timestamps to datetimes."""
        obj = cls()
        obj.id = d.get("id")
        obj.container_id = d.get("container_id", "")
        obj.container_name = d.get("container_name", "")
        obj.model_name = d.get("model_name", "")
        obj.device = d.get("device", "")
        obj.deployed_at = _parse_dt(d.get("deployed_at"))
        obj.stopped_at = _parse_dt(d.get("stopped_at"))
        obj.status = d.get("status", "running")
        obj.stopped_by_user = d.get("stopped_by_user", False)
        obj.port = d.get("port")
        obj.device_id = d.get("device_id", 0)
        obj.workflow_log_path = d.get("workflow_log_path")
        return obj

    def _to_dict(self) -> dict:
        """Serialise the instance to a JSON-ready record (timestamps as ISO strings)."""
        return {
            "id": self.id,
            "container_id": self.container_id,
            "container_name": self.container_name,
            "model_name": self.model_name,
            "device": self.device,
            "deployed_at": self.deployed_at.isoformat() if self.deployed_at else None,
            "stopped_at": self.stopped_at.isoformat() if self.stopped_at else None,
            "status": self.status,
            "stopped_by_user": self.stopped_by_user,
            "port": self.port,
            "device_id": self.device_id,
            "workflow_log_path": self.workflow_log_path,
        }

    def save(self) -> None:
        """
        Persist this instance.

        The record with the same ``id`` is replaced. An instance without an
        id gets the store's next free id and is appended. An instance whose
        id is not in the store is appended with a warning.
        """
        with _lock:
            data = _load_raw()
            if self.id is None:
                # Never write a record with a null id: allocate one the same
                # way objects.create() does.
                self.id = data["next_id"]
                data["next_id"] += 1
                data["records"].append(self._to_dict())
                _save_raw(data)
                return
            for i, r in enumerate(data["records"]):
                if r.get("id") == self.id:
                    data["records"][i] = self._to_dict()
                    _save_raw(data)
                    return
            # Not found — append as new (shouldn't happen in normal flow)
            logger.warning(f"save() called on deployment id={self.id} not found in store; appending")
            data["records"].append(self._to_dict())
            _save_raw(data)

    def __str__(self) -> str:
        return f"{self.model_name} on {self.device} - {self.status}"


ModelDeployment.objects = _Manager()
model_name=impl.model_name, + device=device, + device_id=device_id, + status="starting", + stopped_by_user=False, + port=7000, + ) + logger.info(f"Created pending deployment record for {impl.model_name}") + except Exception as e: + logger.warning(f"Could not create pending deployment record: {e}") + # Make POST request to TT Inference Server API api_url = "http://172.18.0.1:8001/run" @@ -128,17 +146,17 @@ def run_container(impl, weights_id): # Update deploy cache on success update_deploy_cache() - + # Notify agent about new container deployment notify_agent_of_new_container(api_result["container_name"]) - - # Save deployment record to database + + # Update the pending record (or create one if pending write failed) container_id = None container_name = "unknown" try: container_id = api_result.get("container_id") container_name = api_result.get("container_name", "unknown") - + # If container_id is not in response, try to get it from Docker by name if not container_id and container_name: try: @@ -148,30 +166,33 @@ def run_container(impl, weights_id): logger.info(f"Retrieved container_id {container_id} from Docker for {container_name}") except Exception as docker_error: logger.warning(f"Could not get container_id from Docker: {docker_error}") - # Use container_name as fallback ID if we can't get the actual ID container_id = container_name - + if container_id: - # Extract workflow log path from API response workflow_log_path = api_result.get("docker_log_file_path") logger.info(f"Extracted workflow_log_path from api_result: {workflow_log_path}") - logger.info(f"workflow_log_path type: {type(workflow_log_path)}, is None: {workflow_log_path is None}") - - ModelDeployment.objects.create( - container_id=container_id, - container_name=container_name, - model_name=impl.model_name, - device=device, - status="running", - stopped_by_user=False, - port=7000, # TT Inference Server default port - workflow_log_path=workflow_log_path - ) - logger.info(f"Saved deployment record for 
def get_devices_mounts(impl, device_id=0):
    """
    Return the Docker device mounts for a model implementation.

    Args:
        impl: Model implementation whose ``device_configurations`` is
            resolved through ``get_runtime_device_configuration``.
        device_id: Chip slot to mount for single-chip configurations.

    Returns:
        A one-element list mounting ``/dev/tenstorrent/<device_id>`` for
        single-chip configurations, a one-element list mounting the whole
        ``/dev/tenstorrent`` directory for multi-chip configurations, or
        ``None`` when the configuration is in neither group.
    """
    device_config = get_runtime_device_configuration(impl.device_configurations)
    assert isinstance(device_config, DeviceConfigurations)

    # Configurations that are pinned to one requested chip slot.
    pinned_to_one_chip = frozenset((
        DeviceConfigurations.E150,
        DeviceConfigurations.N150,
        DeviceConfigurations.N150_WH_ARCH_YAML,
        DeviceConfigurations.N300,
        DeviceConfigurations.N300_WH_ARCH_YAML,
        DeviceConfigurations.P100,
        DeviceConfigurations.P150,
        DeviceConfigurations.P300c,
    ))
    if device_config in pinned_to_one_chip:
        return [f"/dev/tenstorrent/{device_id}:/dev/tenstorrent/{device_id}"]

    # Multi-chip configurations get the full device directory.
    needs_all_chips = frozenset((
        DeviceConfigurations.N150X4,
        DeviceConfigurations.N300x4,
        DeviceConfigurations.N300x4_WH_ARCH_YAML,
        DeviceConfigurations.T3K,
        DeviceConfigurations.T3K_RING,
        DeviceConfigurations.T3K_LINE,
        DeviceConfigurations.P150X4,
        DeviceConfigurations.P150X8,
        DeviceConfigurations.P300Cx2,
        DeviceConfigurations.P300Cx4,
        DeviceConfigurations.GALAXY,
        DeviceConfigurations.GALAXY_T3K,
    ))
    if device_config not in needs_all_chips:
        return None
    return ["/dev/tenstorrent:/dev/tenstorrent"]
model_impl = None for k, v in model_implmentations.items(): - if v.model_name in con["name"]: + if v.model_name == con["name"]: model_impl = v - logger.info(f"Matched container by name to model_impl: {model_impl.model_name}") + logger.info(f"Matched container by exact name to model_impl: {model_impl.model_name}") break - + + # Fall back to longest-substring match (prevents short names like "Llama-3.1-8B" + # from beating "Llama-3.1-8B-Instruct" on container name "Llama-3.1-8B-Instruct") + if not model_impl: + best_match_len = 0 + for k, v in model_implmentations.items(): + if v.model_name in con["name"] and len(v.model_name) > best_match_len: + model_impl = v + best_match_len = len(v.model_name) + if model_impl: + logger.info(f"Matched container by name substring to model_impl: {model_impl.model_name}") + if not model_impl: logger.warning(f"Could not match TT Inference Server container {con['name']} to any model_impl. Skipping.") continue @@ -655,195 +715,92 @@ def remove_id_prefix(s): def perform_reset(): + """ + Reset the TT board using tt-smi -r (up to 2 attempts, 30-second timeout each). + + The tt-smi -s pre-check has been intentionally removed: when the board is in + a bad state tt-smi -s itself hangs, which makes recovery worse. We go + straight to tt-smi -r and let the result speak for itself. 
+ """ try: - logger.info("Running initial tt-smi -s command to check device detection.") - - # Initial check to see if Tenstorrent devices are detected - def check_device_detection(): - process = subprocess.Popen( - ["tt-smi", "-s"], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - stdin=subprocess.DEVNULL, # Prevents interactive command-line interface - text=True, - ) - output = [] - detected_chips = 0 - warnings = [] - for line in iter(process.stdout.readline, ""): - logger.info(f"tt-smi output: {line.strip()}") - output.append(line) - lower_line = line.lower() - if "detected chips" in lower_line: - # Expect format like: "Detected Chips: 2" - try: - parts = line.strip().split(":") - if len(parts) == 2: - detected_chips = int(parts[1].strip().split()[0]) - except (ValueError, IndexError) as e: - warnings.append(f"Unable to parse detected chips from line: {line.strip()}") - logger.warning(f"Unable to parse detected chips from line '{line.strip()}': {e}") - if "response_q out of sync" in lower_line or "rd_ptr" in lower_line: - warnings.append(line.strip()) - if "No Tenstorrent devices detected" in line: - return { - "status": "error", - "message": "No Tenstorrent devices detected! 
Please check your hardware and try again.", - "output": "".join(output), - "http_status": 503, # Service Unavailable - } - process.stdout.close() - return_code = process.wait() - - # Parse JSON output if text parsing didn't find chips - if detected_chips == 0: - full_output = "".join(output) - try: - json_data = json.loads(full_output) - if "device_info" in json_data and isinstance(json_data["device_info"], list): - detected_chips = len(json_data["device_info"]) - logger.info(f"Detected {detected_chips} chips from JSON output") - except json.JSONDecodeError as e: - logger.warning(f"Could not parse tt-smi output as JSON: {e}") - - # If chips are detected, allow reset but surface warnings/return code - if detected_chips > 0: - if return_code != 0: - warnings.append(f"tt-smi -s exited with code {return_code}") - status_val = "success" if not warnings and return_code == 0 else "warning" - return { - "status": status_val, - "output": "".join(output), - "warnings": warnings, - "detected_chips": detected_chips, - "return_code": return_code, - } - if return_code != 0: - return { - "status": "error", - "message": f"tt-smi -s command failed with return code {return_code}. Please check if tt-smi is properly installed.", - "output": "".join(output), - "http_status": 500, # Internal Server Error - } - return { - "status": "success", - "message": "No Tenstorrent devices detected. 
tt-smi executed successfully.", - "output": "".join(output), - "detected_chips": 0, - "return_code": return_code, - } + logger.info("Starting board reset — running tt-smi -r directly (no pre-check)") - # Run the device detection check - detection_result = check_device_detection() - detection_warnings = detection_result.get("warnings", []) - detection_output = detection_result.get("output", "") - if detection_result.get("status") == "error": - return detection_result - if detection_output: - cumulative_output = [detection_output] - else: - cumulative_output = [] - if detection_warnings: - cumulative_output.append("Warnings during device detection:\n") - cumulative_output.extend([w + "\n" for w in detection_warnings]) - - logger.info("Running tt-smi reset command.") - - def stream_command_output(command): - logger.info(f"Executing command: {' '.join(command)}") - process = subprocess.Popen( - command, - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - stdin=subprocess.DEVNULL, # Prevents interactive command-line interface - text=True, - ) - output = [] - for line in iter(process.stdout.readline, ""): - logger.info(f"Command output: {line.strip()}") - output.append(line) - process.stdout.close() - return_code = process.wait() - if return_code != 0: - logger.info(f"Command failed with return code {return_code}") - output.append(f"Command failed with return code {return_code}") - error_message = "tt-smi reset failed. Please check if:\n" - error_message += "1. The Tenstorrent device is properly connected\n" - error_message += "2. You have the correct permissions to access the device\n" - error_message += "3. The tt-smi utility is properly installed\n" - error_message += "4. 
The device firmware is up to date" - return { - "status": "error", - "message": error_message, - "output": "".join(output), - "http_status": 500, # Internal Server Error - } - else: - logger.info( - f"Command completed successfully with return code {return_code}" + # Signal that a reset is in progress so the device-state endpoint reports RESETTING + SystemResourceService.set_resetting_state() + + MAX_ATTEMPTS = 2 + last_output = "" + + for attempt in range(1, MAX_ATTEMPTS + 1): + logger.info(f"Reset attempt {attempt} of {MAX_ATTEMPTS}") + try: + process = subprocess.Popen( + ["tt-smi", "-r"], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + text=True, + preexec_fn=os.setsid, ) - return {"status": "success", "output": "".join(output)} - - # Attempt software resets first (up to MAX_RESET_ATTEMPTS) - MAX_RESET_ATTEMPTS = 3 - reset_attempts = 0 - reset_success = False - - # Try tt-smi reset with retries (no reset config file; use default tt-smi behavior) - while reset_attempts < MAX_RESET_ATTEMPTS and not reset_success: - reset_attempts += 1 - logger.info(f"Reset attempt {reset_attempts} of {MAX_RESET_ATTEMPTS}") - cumulative_output.append(f"Attempting reset {reset_attempts} of {MAX_RESET_ATTEMPTS}...\n") - - # Perform reset using tt-smi default behavior (no reset_config.json) - cumulative_output.append("Executing tt-smi -r with default reset configuration.\n") - reset_result = stream_command_output(["tt-smi", "-r"]) - cumulative_output.append(reset_result.get('output', '') + "\n") - - if reset_result.get("status") == "success": - logger.info(f"Reset attempt {reset_attempts} succeeded") - reset_success = True - break - - logger.warning(f"Reset attempt {reset_attempts} failed") - # Small delay between attempts - time.sleep(2) - - # If all reset attempts failed - if not reset_success: - all_output = "".join(cumulative_output) - logger.error(f"All {MAX_RESET_ATTEMPTS} reset attempts failed") - return { - "status": "error", - "message": 
f"All {MAX_RESET_ATTEMPTS} reset attempts failed using tt-smi --reset command.", - "output": all_output, - "http_status": 500 - } - all_output = "".join(cumulative_output) - if reset_success: - return { - "status": "success", - "message": f"Reset successful after {reset_attempts} attempt(s)", - "output": all_output, - "warnings": detection_warnings, - "http_status": 200 - } - else: - return { - "status": "error", - "message": "All reset attempts failed with no specific error", - "output": all_output, - "warnings": detection_warnings, - "http_status": 500 - } + try: + stdout, _ = process.communicate(timeout=30) + last_output = stdout + logger.info(f"tt-smi -r attempt {attempt} output: {stdout.strip()!r:.200}") + + if process.returncode == 0: + logger.info(f"Reset succeeded on attempt {attempt}") + SystemResourceService.clear_device_state_cache() + return { + "status": "success", + "message": f"Board reset successfully after {attempt} attempt(s)", + "attempts_used": attempt, + "output": stdout, + "http_status": 200, + } + + logger.warning( + f"Reset attempt {attempt} failed: exit code {process.returncode}" + ) + + except subprocess.TimeoutExpired: + logger.warning(f"Reset attempt {attempt} timed out after 30s") + try: + os.killpg(os.getpgid(process.pid), signal.SIGTERM) + process.wait(timeout=2) + except Exception: + try: + os.killpg(os.getpgid(process.pid), signal.SIGKILL) + except Exception: + pass + last_output = "(timeout)" + + except Exception as exc: + logger.error(f"Reset attempt {attempt} raised exception: {exc}") + last_output = str(exc) + + # All attempts failed + logger.error(f"All {MAX_ATTEMPTS} reset attempts failed") + SystemResourceService.clear_device_state_cache() + return { + "status": "error", + "message": ( + f"Board did not recover after {MAX_ATTEMPTS} reset attempts. " + "Manual intervention may be required." 
+ ), + "attempts_used": MAX_ATTEMPTS, + "output": last_output, + "http_status": 500, + } except Exception as e: - logger.exception("Exception occurred during reset operation.") + logger.exception("Unexpected error during reset operation") + SystemResourceService.clear_device_state_cache() return { "status": "error", "message": str(e), - "output": "An exception occurred during the reset operation.", + "attempts_used": 0, + "output": "", "http_status": 500, } diff --git a/app/backend/docker_control/migrations/0001_initial.py b/app/backend/docker_control/migrations/0001_initial.py deleted file mode 100644 index 0b4c168d..00000000 --- a/app/backend/docker_control/migrations/0001_initial.py +++ /dev/null @@ -1,33 +0,0 @@ -# Generated by Django 5.0.4 on 2025-11-12 15:18 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ] - - operations = [ - migrations.CreateModel( - name='ModelDeployment', - fields=[ - ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), - ('container_id', models.CharField(db_index=True, max_length=255, unique=True)), - ('container_name', models.CharField(db_index=True, max_length=255)), - ('model_name', models.CharField(db_index=True, max_length=255)), - ('device', models.CharField(max_length=50)), - ('deployed_at', models.DateTimeField(auto_now_add=True, db_index=True)), - ('stopped_at', models.DateTimeField(blank=True, null=True)), - ('status', models.CharField(db_index=True, default='running', max_length=50)), - ('stopped_by_user', models.BooleanField(default=False)), - ('port', models.IntegerField(blank=True, null=True)), - ], - options={ - 'ordering': ['-deployed_at'], - 'indexes': [models.Index(fields=['status', '-deployed_at'], name='docker_cont_status_a5afde_idx'), models.Index(fields=['model_name', '-deployed_at'], name='docker_cont_model_n_2ecff9_idx')], - }, - ), - ] diff --git 
a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py b/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py deleted file mode 100644 index 518dde93..00000000 --- a/app/backend/docker_control/migrations/0002_modeldeployment_workflow_log_path.py +++ /dev/null @@ -1,18 +0,0 @@ -# Generated by Django 5.0.4 on 2025-11-12 21:35 - -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ('docker_control', '0001_initial'), - ] - - operations = [ - migrations.AddField( - model_name='modeldeployment', - name='workflow_log_path', - field=models.CharField(blank=True, help_text='Path to workflow log file from tt-inference-server', max_length=512, null=True), - ), - ] diff --git a/app/backend/docker_control/models.py b/app/backend/docker_control/models.py index a94f60ff..7f6b1f02 100644 --- a/app/backend/docker_control/models.py +++ b/app/backend/docker_control/models.py @@ -2,39 +2,6 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -from django.db import models -from django.utils import timezone +from docker_control.deployment_store import ModelDeployment - -class ModelDeployment(models.Model): - """Track all model deployments with full history""" - # Deployment identification - container_id = models.CharField(max_length=255, unique=True, db_index=True) - container_name = models.CharField(max_length=255, db_index=True) - - # Model information - model_name = models.CharField(max_length=255, db_index=True) - device = models.CharField(max_length=50) # n150, n300, etc. 
- - # Deployment metadata - deployed_at = models.DateTimeField(auto_now_add=True, db_index=True) - stopped_at = models.DateTimeField(null=True, blank=True) - - # Status tracking - status = models.CharField(max_length=50, default="running", db_index=True) - # Choices: starting, running, stopped, exited, dead, error - stopped_by_user = models.BooleanField(default=False) # True if user clicked stop/delete - - # Container details - port = models.IntegerField(null=True, blank=True) - workflow_log_path = models.CharField(max_length=512, null=True, blank=True, help_text="Path to workflow log file from tt-inference-server") - - class Meta: - ordering = ['-deployed_at'] - indexes = [ - models.Index(fields=['status', '-deployed_at']), - models.Index(fields=['model_name', '-deployed_at']), - ] - - def __str__(self): - return f"{self.model_name} on {self.device} - {self.status}" +__all__ = ["ModelDeployment"] diff --git a/app/backend/docker_control/views.py b/app/backend/docker_control/views.py index 741a8bcf..f9ce640d 100644 --- a/app/backend/docker_control/views.py +++ b/app/backend/docker_control/views.py @@ -11,10 +11,11 @@ from rest_framework.renderers import JSONRenderer from django.views.decorators.csrf import csrf_exempt from django.utils.decorators import method_decorator -import json +import json import shutil import subprocess import os +from pathlib import Path import re import os @@ -43,6 +44,15 @@ logger = get_logger(__name__) logger.info(f"importing {__name__}") +# Build model_name → status lookup from catalog JSON +_CATALOG_PATH = Path(__file__).parent.parent / "shared_config/models_from_inference_server.json" +try: + _catalog = json.loads(_CATALOG_PATH.read_text()) + _status_lookup: dict[str, str | None] = {m["model_name"]: m.get("status") for m in _catalog["models"]} +except Exception: + logger.warning(f"Could not load model catalog from {_CATALOG_PATH}; status will be null for all models") + _status_lookup = {} + # Track when deployment started 
deployment_start_times = {} # {job_id: timestamp} - Track when deployment started @@ -188,7 +198,9 @@ def get(self, request, *args, **kwargs): "is_compatible": is_compatible, "compatible_boards": compatible_boards, "model_type": impl.model_type.value, - "current_board": current_board + "display_model_type": impl.display_model_type, + "current_board": current_board, + "status": _status_lookup.get(impl.model_name), }) return Response(data, status=status.HTTP_200_OK) @@ -209,8 +221,9 @@ def post(self, request, *args, **kwargs): if serializer.is_valid(): impl_id = request.data.get("model_id") weights_id = request.data.get("weights_id") + device_id = int(request.data.get("device_id", 0)) impl = model_implmentations[impl_id] - response = run_container(impl, weights_id) + response = run_container(impl, weights_id, device_id=device_id) # Ensure job_id is set for progress tracking # Use job_id from API response, or fallback to container_id or container_name @@ -706,14 +719,7 @@ def get(self, request, model_id): logger.info(f"Checking status for image: {image_name}:{image_tag}") image_status = check_image_exists(image_name, image_tag) logger.info(f"Image status result: {image_status}") - - # Add pull progress if available - if model_id in pull_progress: - image_status['pull_in_progress'] = True - image_status['progress'] = pull_progress[model_id] - else: - image_status['pull_in_progress'] = False - + image_status['pull_in_progress'] = False return Response(image_status, status=status.HTTP_200_OK) except KeyError: logger.warning(f"Model {model_id} not found in model_implementations") @@ -1172,6 +1178,7 @@ def get(self, request): 'container_name': deployment.container_name, 'model_name': deployment.model_name, 'device': deployment.device, + 'device_id': deployment.device_id, 'deployed_at': deployment.deployed_at.isoformat() if deployment.deployed_at else None, 'stopped_at': deployment.stopped_at.isoformat() if deployment.stopped_at else None, 'status': deployment.status, diff 
--git a/app/backend/model_control/model_utils.py b/app/backend/model_control/model_utils.py index 4e91e214..ad44619e 100644 --- a/app/backend/model_control/model_utils.py +++ b/app/backend/model_control/model_utils.py @@ -26,6 +26,53 @@ encoded_jwt = jwt.encode(json_payload, backend_config.jwt_secret, algorithm="HS256") AUTH_TOKEN = os.getenv('CLOUD_CHAT_UI_AUTH_TOKEN', '') +def messages_to_prompt(messages: list) -> str: + """Convert chat messages list to a plain text prompt for base/completion models.""" + parts = [] + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + if role == "system": + parts.append(content) + elif role == "user": + parts.append(f"User: {content}") + elif role == "assistant": + parts.append(f"Assistant: {content}") + parts.append("Assistant:") + return "\n\n".join(parts) + + +def get_model_name_from_container(internal_url: str, fallback: str) -> str: + """Query vLLM /v1/models to get the exact model name loaded in the container. + + Args: + internal_url: Raw internal URL from deploy cache (e.g. "container:7000/v1/chat/completions") + fallback: Value to return if the query fails (typically hf_model_id) + + Returns: + The actual model name reported by vLLM, or fallback on any error. + """ + try: + # Strip the route path to get just host:port + # e.g. 
"container:7000/v1/chat/completions" -> "container:7000" + base = internal_url.split("/")[0] + models_url = f"http://{base}/v1/models" + headers = {"Authorization": f"Bearer {encoded_jwt}"} + response = requests.get(models_url, headers=headers, timeout=3) + if response.status_code == 200: + model_id = response.json()["data"][0]["id"] + logger.info(f"Resolved actual model name from /v1/models: {model_id}") + return model_id + else: + logger.warning( + f"GET {models_url} returned {response.status_code}, using fallback: {fallback}" + ) + return fallback + except Exception as e: + logger.warning(f"Failed to query /v1/models ({e}), using fallback: {fallback}") + return fallback + + def get_deploy_cache(): # the cache is initialized when by docker_control is imported def get_all_records(): @@ -173,7 +220,7 @@ def stream_to_cloud_model(url, json_data): json_data["top_k"] = int(top_k) if top_k is not None else 20 json_data["top_p"] = float(top_p) if top_p is not None else 0.9 json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512 - json_data["stream_options"] = {"include_usage": True, "continuous_usage_stats": True} + json_data["stream_options"] = {"include_usage": True} # Log final parameters being used logger.info("=== Final Model Parameters ===") @@ -231,7 +278,7 @@ def stream_to_cloud_model(url, json_data): chunk_dict = json.loads(sub_chunk) logger.info(f"Successfully parsed JSON: {chunk_dict}") - usage = chunk_dict.get("usage", {}) + usage = chunk_dict.get("usage") or {} completion_tokens = usage.get("completion_tokens", 0) prompt_tokens = usage.get("prompt_tokens", 0) logger.info(f"Usage info: {usage}, completion tokens: {completion_tokens}") @@ -314,7 +361,7 @@ def stream_response_from_external_api(url, json_data): json_data["top_k"] = int(top_k) if top_k is not None else 20 json_data["top_p"] = float(top_p) if top_p is not None else 0.9 json_data["max_tokens"] = int(max_tokens) if max_tokens is not None else 512 - json_data["stream_options"] = 
{"include_usage": True, "continuous_usage_stats": True} + json_data["stream_options"] = {"include_usage": True} # Log final parameters being used logger.info("=== Final Model Parameters ===") @@ -366,7 +413,7 @@ def stream_response_from_external_api(url, json_data): elif new_chunk != "": chunk_dict = json.loads(new_chunk) - usage = chunk_dict.get("usage", {}) + usage = chunk_dict.get("usage") or {} completion_tokens = usage.get("completion_tokens", 0) prompt_tokens = usage.get("prompt_tokens", 0) @@ -383,6 +430,10 @@ def stream_response_from_external_api(url, json_data): logger.info("stream_response_from_external done") + except requests.exceptions.HTTPError as e: + body = e.response.text if e.response is not None else "(no body)" + logger.error(f"HTTPError {e.response.status_code}: {body}") + yield f"error: {str(e)}" except requests.RequestException as e: logger.error(f"RequestException: {str(e)}") yield f"error: {str(e)}" diff --git a/app/backend/model_control/pipeline_views.py b/app/backend/model_control/pipeline_views.py new file mode 100644 index 00000000..dea491ef --- /dev/null +++ b/app/backend/model_control/pipeline_views.py @@ -0,0 +1,167 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +""" +Voice pipeline view: Whisper STT → LLM → TTS (optional). +Accepts multipart/form-data and streams SSE events to the client. 
+""" + +import base64 +import json +import time + +import requests +from django.http import StreamingHttpResponse +from rest_framework.views import APIView + +from model_control.model_utils import ( + encoded_jwt, + get_deploy_cache, + stream_response_from_external_api, +) +from shared_config.logger_config import get_logger + +logger = get_logger(__name__) + + +class VoicePipelineView(APIView): + """ + POST /models-api/pipeline/voice/ + + Multipart fields: + audio_file – audio blob + whisper_deploy_id – deploy_id of running Whisper + llm_deploy_id – deploy_id of running LLM + tts_deploy_id – (optional) deploy_id of running speecht5_tts + system_prompt – (optional) string + """ + + def post(self, request, *args, **kwargs): + audio_file = request.FILES.get("audio_file") + whisper_deploy_id = request.data.get("whisper_deploy_id") + llm_deploy_id = request.data.get("llm_deploy_id") + tts_deploy_id = request.data.get("tts_deploy_id") + system_prompt = request.data.get( + "system_prompt", + "You are a helpful assistant. 
Be concise.", + ) + + if not audio_file: + from rest_framework.response import Response + from rest_framework import status + return Response( + {"error": "audio_file is required"}, + status=status.HTTP_400_BAD_REQUEST, + ) + if not whisper_deploy_id or not llm_deploy_id: + from rest_framework.response import Response + from rest_framework import status + return Response( + {"error": "whisper_deploy_id and llm_deploy_id are required"}, + status=status.HTTP_400_BAD_REQUEST, + ) + + def event_stream(): + headers = {"Authorization": f"Bearer {encoded_jwt}"} + deploy_cache = get_deploy_cache() + + # ------------------------------------------------------------------ + # Step 1: STT (Whisper) + # ------------------------------------------------------------------ + try: + whisper_deploy = deploy_cache[whisper_deploy_id] + whisper_url = "http://" + whisper_deploy["internal_url"] + file_payload = { + "file": (audio_file.name, audio_file, audio_file.content_type) + } + stt_resp = requests.post( + whisper_url, files=file_payload, headers=headers, timeout=60 + ) + stt_resp.raise_for_status() + transcript = stt_resp.json().get("text", "") + yield f"data: {json.dumps({'type': 'transcript', 'text': transcript})}\n\n" + except Exception as exc: + logger.error(f"STT step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'stt', 'message': str(exc)})}\n\n" + return + + if not transcript: + yield f"data: {json.dumps({'type': 'error', 'stage': 'stt', 'message': 'Empty transcript'})}\n\n" + return + + # ------------------------------------------------------------------ + # Step 2: LLM streaming + # ------------------------------------------------------------------ + llm_deploy = deploy_cache[llm_deploy_id] + llm_url = "http://" + llm_deploy["internal_url"] + hf_model_id = llm_deploy["model_impl"].hf_model_id + + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": transcript}) 
+ + llm_payload = { + "model": hf_model_id, + "messages": messages, + "stream": True, + "max_tokens": 512, + } + + llm_full_text = "" + try: + for chunk in stream_response_from_external_api(llm_url, llm_payload): + if isinstance(chunk, bytes): + chunk = chunk.decode("utf-8") + llm_full_text += chunk + yield f"data: {json.dumps({'type': 'llm_chunk', 'text': chunk})}\n\n" + except Exception as exc: + logger.error(f"LLM step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'llm', 'message': str(exc)})}\n\n" + return + + # ------------------------------------------------------------------ + # Step 3: TTS (optional) + # ------------------------------------------------------------------ + if tts_deploy_id and llm_full_text.strip(): + try: + tts_deploy = deploy_cache[tts_deploy_id] + tts_url = "http://" + tts_deploy["internal_url"] + + tts_resp = requests.post( + tts_url, + json={"text": llm_full_text.strip()}, + headers=headers, + timeout=30, + ) + tts_resp.raise_for_status() + + task_id = tts_resp.json().get("task_id") + status_url = tts_url.replace("/enqueue", f"/status/{task_id}") + + # Poll for completion + for _ in range(120): + st = requests.get(status_url, headers=headers, timeout=10) + if st.status_code != 404 and st.json().get("status") == "Completed": + break + time.sleep(1) + + audio_url = tts_url.replace("/enqueue", f"/fetch_audio/{task_id}") + audio_resp = requests.get(audio_url, headers=headers, timeout=30) + audio_resp.raise_for_status() + + audio_b64 = base64.b64encode(audio_resp.content).decode("utf-8") + content_type = audio_resp.headers.get("Content-Type", "audio/wav") + data_uri = f"data:{content_type};base64,{audio_b64}" + yield f"data: {json.dumps({'type': 'audio_url', 'url': data_uri})}\n\n" + except Exception as exc: + logger.error(f"TTS step failed: {exc}") + yield f"data: {json.dumps({'type': 'error', 'stage': 'tts', 'message': str(exc)})}\n\n" + # Don't abort — transcript and LLM response were already sent + + yield 
f"data: {json.dumps({'type': 'done'})}\n\n" + + response = StreamingHttpResponse(event_stream(), content_type="text/event-stream") + response["Cache-Control"] = "no-cache" + response["X-Accel-Buffering"] = "no" + return response diff --git a/app/backend/model_control/urls.py b/app/backend/model_control/urls.py index 158dfde3..74590b91 100644 --- a/app/backend/model_control/urls.py +++ b/app/backend/model_control/urls.py @@ -5,6 +5,7 @@ # model_control/urls.py from django.urls import path from . import views +from .pipeline_views import VoicePipelineView urlpatterns = [ path("inference/", views.InferenceView.as_view()), @@ -18,6 +19,8 @@ path("object-detection-cloud/", views.ObjectDetectionInferenceCloudView.as_view()), path("speech-recognition/", views.SpeechRecognitionInferenceView.as_view()), path("speech-recognition-cloud/", views.SpeechRecognitionInferenceCloudView.as_view()), + path("tts/", views.TtsInferenceView.as_view()), + path("pipeline/voice/", VoicePipelineView.as_view()), path("health/", views.ModelHealthView.as_view()), path("inference_cloud/", views.InferenceCloudView.as_view()), path("logs//", views.ContainerLogsView.as_view(), name="container-logs"), diff --git a/app/backend/model_control/views.py b/app/backend/model_control/views.py index 64bdc46b..6c44d110 100644 --- a/app/backend/model_control/views.py +++ b/app/backend/model_control/views.py @@ -42,6 +42,8 @@ def select_renderer(self, request, renderers, format_suffix): from model_control.model_utils import ( encoded_jwt, get_deploy_cache, + get_model_name_from_container, + messages_to_prompt, stream_response_from_external_api, stream_response_from_agent_api, health_check, @@ -85,8 +87,18 @@ def post(self, request, *args, **kwargs): internal_url = "http://" + deploy["internal_url"] logger.info(f"internal_url:= {internal_url}") logger.info(f"using vllm model:= {deploy["model_impl"].model_name}") - data["model"] = deploy["model_impl"].hf_model_id - + data["model"] = get_model_name_from_container( 
+ deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id + ) + + # Route base/completion models to /v1/completions with a plain prompt + service_route = deploy["model_impl"].service_route + logger.info(f"service_route:= {service_route}") + if service_route == "/v1/completions": + messages = data.pop("messages", []) + data["prompt"] = messages_to_prompt(messages) + data.pop("stream_options", None) + # Create a generator that can be cancelled def generate_response(): try: @@ -116,7 +128,9 @@ def post(self, request, *agrs, **kwargs): if deploy_id and deploy_id in deploy_cache: deploy = deploy_cache[deploy_id] logger.info(f"using vllm model:= {deploy['model_impl'].model_name}") - data["model"] = deploy["model_impl"].hf_model_id + data["model"] = get_model_name_from_container( + deploy["internal_url"], fallback=deploy["model_impl"].hf_model_id + ) else: logger.info("No valid deployment found, proceeding with agent-only mode (cloud LLM)") # Remove deploy_id from data since it's not needed for agent @@ -615,6 +629,57 @@ def post(self, request, *args, **kwargs): return Response(inference_data.json(), status=status.HTTP_200_OK) +class TtsInferenceView(APIView): + """Text-to-speech inference: POST text → /enqueue → poll → return audio blob.""" + def post(self, request, *args, **kwargs): + data = request.data + logger.info(f"{self.__class__.__name__} data:={data}") + serializer = InferenceSerializer(data=data) + if serializer.is_valid(): + deploy_id = data.get("deploy_id") + text = data.get("text") or data.get("prompt") + if not text: + return Response({"error": "text is required"}, status=status.HTTP_400_BAD_REQUEST) + deploy = get_deploy_cache()[deploy_id] + internal_url = "http://" + deploy["internal_url"] + try: + headers = {"Authorization": f"Bearer {encoded_jwt}"} + inference_data = requests.post(internal_url, json={"text": text}, headers=headers, timeout=30) + inference_data.raise_for_status() + + # Poll status until completed + task_id = 
inference_data.json().get("task_id") + get_status_url = internal_url.replace("/enqueue", f"/status/{task_id}") + ready = False + for _ in range(120): # up to ~2 minutes + status_resp = requests.get(get_status_url, headers=headers, timeout=10) + if status_resp.status_code != status.HTTP_404_NOT_FOUND: + status_resp.raise_for_status() + if status_resp.json().get("status") == "Completed": + ready = True + break + time.sleep(1) + + if not ready: + return Response({"error": "TTS task timed out"}, status=status.HTTP_504_GATEWAY_TIMEOUT) + + # Fetch audio result + get_audio_url = internal_url.replace("/enqueue", f"/fetch_audio/{task_id}") + audio_resp = requests.get(get_audio_url, headers=headers, stream=True, timeout=30) + audio_resp.raise_for_status() + + content_type = audio_resp.headers.get("Content-Type", "audio/wav") + django_response = HttpResponse(audio_resp.content, content_type=content_type) + django_response["Content-Disposition"] = "attachment; filename=tts_output.wav" + return django_response + + except requests.exceptions.HTTPError as http_err: + logger.error(f"TTS HTTP error: {http_err}") + return Response(status=status.HTTP_500_INTERNAL_SERVER_ERROR) + else: + return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) + + class ContainerLogsView(View): # Define event detection configuration before the get method SIMPLE_EVENT_KEYWORDS = [ diff --git a/app/backend/shared_config/model_config.py b/app/backend/shared_config/model_config.py index 9d9a7e93..22033b21 100644 --- a/app/backend/shared_config/model_config.py +++ b/app/backend/shared_config/model_config.py @@ -2,6 +2,7 @@ # # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +import json import os from dataclasses import dataclass, asdict from typing import Set, Dict, Any, Union @@ -11,7 +12,6 @@ from shared_config.backend_config import backend_config from shared_config.setup_config import SetupTypes from shared_config.model_type_config import ModelTypes -from 
# ---------------------------------------------------------------------------
# JSON-based model loader
# ---------------------------------------------------------------------------

CATALOG_JSON = Path(__file__).parent / "models_from_inference_server.json"

# device_type strings in the catalog → DeviceConfigurations member names
# (only names that actually exist in the enum; others are skipped)
_CATALOG_DEVICE_MAP = {
    "N150": "N150",
    "N300": "N300",
    "T3K": "T3K",
    "N150X4": "N150X4",
    "P100": "P100",
    "P150": "P150",
    "P150X4": "P150X4",
    "P150X8": "P150X8",
    "GALAXY": "GALAXY",
    "GALAXY_T3K": "GALAXY_T3K",
}


def _split_docker_image(docker_image):
    """Split ``name:tag`` into ``(name, tag)``, defaulting the tag to "latest".

    A colon only counts as a tag separator when nothing after it looks like a
    path, so ``localhost:5000/image`` (registry port, no tag) keeps its full
    name instead of being cut at the port.
    """
    name, sep, tag = docker_image.rpartition(":")
    if sep and "/" not in tag:
        return name, tag
    return docker_image, "latest"


def _catalog_device_configs(entry):
    """Map an entry's catalog device names to ``DeviceConfigurations`` members."""
    # ``or []`` also covers an explicit null in the JSON.
    names = entry.get("device_configurations") or []
    skipped = [name for name in names if name not in _CATALOG_DEVICE_MAP]
    if skipped:
        # Unknown devices are skipped on purpose; log so the gap is visible.
        logger.warning(
            "model %s: skipping unmapped device configurations %s",
            entry.get("model_name"),
            skipped,
        )
    return {
        DeviceConfigurations[_CATALOG_DEVICE_MAP[name]]
        for name in names
        if name in _CATALOG_DEVICE_MAP
    }


def load_model_implementations_from_json(json_path: Path) -> list:
    """Build ``ModelImpl`` objects from the model catalog at *json_path*.

    The catalog is a JSON object with a ``"models"`` list. Each entry must
    provide ``model_name`` and ``service_route``; the remaining fields fall
    back to defaults (tag "latest", ``ModelTypes.CHAT``,
    ``SetupTypes.TT_INFERENCE_SERVER``, version "0.0.1", shm_size "32G",
    display_model_type "LLM"). An entry's ``env_vars`` are merged into the
    environment of a fresh ``base_docker_config()``.

    Returns:
        A list of ``ModelImpl``, in catalog order.

    Raises:
        OSError: the catalog file cannot be opened.
        json.JSONDecodeError: the file is not valid JSON.
        KeyError: the ``"models"`` key or a required entry field is missing.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(json_path, encoding="utf-8") as f:
        catalog = json.load(f)

    impls = []
    for entry in catalog["models"]:
        image_name, image_tag = _split_docker_image(entry.get("docker_image") or "")

        raw_model_type = entry.get("model_type")
        try:
            model_type = ModelTypes[raw_model_type]
        except KeyError:
            logger.warning(
                "model %s: unknown model_type %r, defaulting to CHAT",
                entry.get("model_name"),
                raw_model_type,
            )
            model_type = ModelTypes.CHAT

        raw_setup_type = entry.get("setup_type")
        try:
            setup_type = SetupTypes[raw_setup_type]
        except KeyError:
            logger.warning(
                "model %s: unknown setup_type %r, defaulting to TT_INFERENCE_SERVER",
                entry.get("model_name"),
                raw_setup_type,
            )
            setup_type = SetupTypes.TT_INFERENCE_SERVER

        cfg = base_docker_config()
        cfg["environment"].update(entry.get("env_vars") or {})

        impls.append(
            ModelImpl(
                model_name=entry["model_name"],
                hf_model_id=entry.get("hf_model_id"),
                image_name=image_name,
                image_tag=image_tag,
                device_configurations=_catalog_device_configs(entry),
                docker_config=cfg,
                service_route=entry["service_route"],
                setup_type=setup_type,
                model_type=model_type,
                version=entry.get("version", "0.0.1"),
                shm_size=entry.get("shm_size", "32G"),
                display_model_type=entry.get("display_model_type", "LLM"),
            )
        )
    return impls
working. - # remove this model for now until its in tt-inference-server-main branch - # TODO: add / make a new mock model - # ModelImpl( - # hf_model_id="meta-llama/Llama-3.1-70B-Instruct", - # model_name="Mock-Llama-3.1-70B-Instruct", - # model_id="id_mock_vllm_modelv0.0.1", - # image_name="ghcr.io/tenstorrent/tt-inference-server/mock.vllm.openai.api", - # image_tag="v0.0.1-tt-metal-385904186f81-384f1790c3be", - # device_configurations={DeviceConfigurations.CPU}, - # docker_config=base_docker_config(), - # shm_size="1G", - # service_port=7000, - # service_route="/v1/chat/completions", - # setup_type=SetupTypes.MAKE_VOLUMES, - # model_type=ModelTypes.MOCK - # ), - - # --- Chat Models --- - - # 1B, 3B, 8B, 11B models - Can run on all boards - ModelImpl( - hf_model_id="meta-llama/Llama-3.2-1B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=ALL_BOARDS, # Can run on all boards - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - - ), - ModelImpl( - hf_model_id="meta-llama/Llama-3.2-3B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=ALL_BOARDS, # Can run on all boards - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - - ), - ModelImpl( - hf_model_id="meta-llama/Llama-3.1-8B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=ALL_BOARDS | {DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - 
setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - + model_type=ModelTypes.OBJECT_DETECTION, + display_model_type="CNN", ), - # TODO: add this model back in when its in tt-inference-server-main branch - # ModelImpl( - # hf_model_id="meta-llama/Llama-3.2-11B-Vision-Instruct", - # image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - # image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - # device_configurations=ALL_BOARDS, # Can run on all boards - # docker_config=base_docker_config(), - # service_route="/v1/chat/completions", - # setup_type=SetupTypes.TT_INFERENCE_SERVER, - # model_type=ModelTypes.CHAT - - # ), - - # 32B models - T3000 and P300Cx2 + # Legacy Stable-Diffusion-1.4 (not in tt-inference-server catalog) ModelImpl( - hf_model_id="Qwen/Qwen3-32B", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations={DeviceConfigurations.N300x4, DeviceConfigurations.N300x4_WH_ARCH_YAML, DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", - setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), - - # 70B models - Only T3000 - - ModelImpl( - hf_model_id="meta-llama/Llama-3.1-70B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=T3000_ONLY, # Only T3000 + model_name="Stable-Diffusion-1.4", + model_id="id_stable_diffusionv0.1.0", + image_name="ghcr.io/tenstorrent/tt-inference-server/tt-metal-stable-diffusion-1.4-src-base", + image_tag="v0.0.1-tt-metal-cc8b4e1dac99", + device_configurations=_ALL_WH_BOARDS, docker_config=base_docker_config(), shm_size="32G", service_port=7000, - service_route="/v1/chat/completions", - env_file=os.environ.get("VLLM_LLAMA31_ENV_FILE"), - 
setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT - ), - # ModelImpl( - # hf_model_id="meta-llama/Llama-3.1-70B-Instruct", - # image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - # image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - # device_configurations=T3000_ONLY, # Only T3000 - # docker_config=base_docker_config(), - # service_route="/v1/chat/completions", - # setup_type=SetupTypes.TT_INFERENCE_SERVER, - # model_type=ModelTypes.CHAT - # ), - ModelImpl( - hf_model_id="meta-llama/Llama-3.3-70B-Instruct", - image_name="ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-20.04-amd64", - image_tag="0.0.4-v0.56.0-rc47-e2e0002ac7dc", - device_configurations=T3000_ONLY | {DeviceConfigurations.P300Cx2}, - docker_config=base_docker_config(), - service_route="/v1/chat/completions", + service_route="/enqueue", + health_route="/", setup_type=SetupTypes.TT_INFERENCE_SERVER, - model_type=ModelTypes.CHAT + model_type=ModelTypes.IMAGE_GENERATION, + display_model_type="IMAGE", ), - #! 
def validate_model_implemenation_config(impl):
    """Reject a model implementation whose ``model_id`` contains "/".

    Args:
        impl: object exposing a string ``model_id`` attribute.

    Raises:
        AssertionError: ``impl.model_id`` contains "/". Raised explicitly
            rather than through an ``assert`` statement so the check still
            runs when Python is started with ``-O``.
    """
    # no / in model_id strings, model_id will be used in path names
    if "/" in impl.model_id:
        raise AssertionError(f"model_id must not contain '/': {impl.model_id!r}")
"P150X8", + "T3K" + ], + "hf_model_id": "deepseek-ai/DeepSeek-R1-Distill-Llama-70B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "distil-large-v3", + "model_type": "SPEECH_RECOGNITION", + "display_model_type": "AUDIO", + "device_configurations": [ + "GALAXY", + "N150", + "T3K" + ], + "hf_model_id": "distil-whisper/distil-large-v3", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "FLUX.1-dev", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "black-forest-labs/FLUX.1-dev", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "FLUX.1-schnell", 
+ "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "black-forest-labs/FLUX.1-schnell", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Llama-3.1-70B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-70B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Llama-3.1-70B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-70B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + 
"VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Llama-3.1-8B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + "N300", + "P100", + "P150", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-8B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "Llama-3.1-8B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + "N300", + "P100", + "P150", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.1-8B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-25305db-6e67d2d", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "Llama-3.3-70B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X4", + "P150X8", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.3-70B-Instruct", + 
"inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 70 + }, + { + "model_name": "Mistral-7B-Instruct-v0.3", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.3", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "mochi-1-preview", + "model_type": "VIDEO", + "display_model_type": "VIDEO", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "genmo/mochi-1-preview", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Motif-Image-6B-Preview", + 
"model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Motif-Technologies/Motif-Image-6B-Preview", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 6 + }, + { + "model_name": "Qwen3-32B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "P150X8", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-32B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "speecht5_tts", + "model_type": "TTS", + "display_model_type": "TEXT_TO_SPEECH", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "microsoft/speecht5_tts", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-a9b09e0", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": 
"tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-3.5-large", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "stabilityai/stable-diffusion-3.5-large", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-c180ef7", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-xl-1.0-inpainting-0.1", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.5.0-fbbbd2da8cfab49ddf43d28dd9c0813a3c3ee2bd", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-xl-base-1.0", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + 
"service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "stable-diffusion-xl-base-1.0-img-2-img", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "stabilityai/stable-diffusion-xl-base-1.0-img-2-img", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Wan2.2-T2V-A14B-Diffusers", + "model_type": "VIDEO", + "display_model_type": "VIDEO", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Wan-AI/Wan2.2-T2V-A14B-Diffusers", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 14 + }, + { + "model_name": "whisper-large-v3", + "model_type": "SPEECH_RECOGNITION", + "display_model_type": "AUDIO", + "device_configurations": [ + "GALAXY", + "N150", + "T3K" + ], + "hf_model_id": "openai/whisper-large-v3", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "COMPLETE", + 
"version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-65718bb", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Llama-3.2-11B-Vision", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-11B-Vision", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 11 + }, + { + "model_name": "Llama-3.2-11B-Vision-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-11B-Vision-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 11 + }, + { + "model_name": "Llama-3.2-1B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + 
"T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-1B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 1 + }, + { + "model_name": "Llama-3.2-1B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-1B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-9b67e09-a91b644", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 1 + }, + { + "model_name": "Llama-3.2-3B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-3B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": 
"900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 3 + }, + { + "model_name": "Llama-3.2-3B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-3B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-20edc39-03cb300", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 3 + }, + { + "model_name": "Llama-3.2-90B-Vision", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-90B-Vision", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "MAX_PREFILL_CHUNK_SIZE": 16 + }, + "param_count": 90 + }, + { + "model_name": "Llama-3.2-90B-Vision-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "meta-llama/Llama-3.2-90B-Vision-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-v0.61.1-rc1-5cbc982", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "MAX_PREFILL_CHUNK_SIZE": 16 + }, + "param_count": 90 + }, + { + "model_name": "Qwen-Image", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Qwen/Qwen-Image", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE" + }, + "param_count": null + }, + { + "model_name": "Qwen-Image-2512", + "model_type": "IMAGE_GENERATION", + "display_model_type": "IMAGE", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "Qwen/Qwen-Image-2512", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.9.0-be88351", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "TT_DIT_CACHE_DIR": "/tmp/TT_DIT_CACHE" + }, + "param_count": null + }, + { + "model_name": "Qwen2.5-72B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + 
"hf_model_id": "Qwen/Qwen2.5-72B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "MAX_PREFILL_CHUNK_SIZE": "16" + }, + "param_count": 72 + }, + { + "model_name": "Qwen2.5-72B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-72B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-13f44c5-0edd242", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, + "MAX_PREFILL_CHUNK_SIZE": "16" + }, + "param_count": 72 + }, + { + "model_name": "Qwen2.5-VL-72B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-72B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": 
{ + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 72 + }, + { + "model_name": "Qwen3-8B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-8B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 8 + }, + { + "model_name": "QwQ-32B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/QwQ-32B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "FUNCTIONAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e95ffa5-48eba14", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "AFM-4.5B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N300", + "T3K" + ], + "hf_model_id": "arcee-ai/AFM-4.5B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-ae65ee5-35f023f", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": 4 + }, + { + "model_name": "bge-large-en-v1.5", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "BAAI/bge-large-en-v1.5", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "3072", + "VLLM__MAX_MODEL_LENGTH": "384", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "8", + "MAX_BATCH_SIZE": "8", + "DEFAULT_THROTTLE_LEVEL": "0" + }, + "param_count": null + }, + { + "model_name": "DeepSeek-R1-0528", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY" + ], + "hf_model_id": "deepseek-ai/DeepSeek-R1-0528", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-e3d97e5-a186bf4", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + 
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": null + }, + { + "model_name": "efficientnet", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "efficientnet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "gemma-3-1b-it", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150" + ], + "hf_model_id": "google/gemma-3-1b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 1 + }, + { + "model_name": "gemma-3-27b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "google/gemma-3-27b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa", + 
"service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 27 + }, + { + "model_name": "gemma-3-4b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "google/gemma-3-4b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 4 + }, + { + "model_name": "gpt-oss-120b", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "T3K" + ], + "hf_model_id": "openai/gpt-oss-120b", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-65718bb-409b1cd", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 120 + }, + { + "model_name": "gpt-oss-20b", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": 
"openai/gpt-oss-20b", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-60ffb199-3499ffa1", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 20 + }, + { + "model_name": "medgemma-27b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "GALAXY", + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "google/medgemma-27b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-0b10c51-3499ffa", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 27 + }, + { + "model_name": "medgemma-4b-it", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "google/medgemma-4b-it", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c254ee3-c4f2327", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": 
"900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_USE_V1": "1" + }, + "param_count": 4 + }, + { + "model_name": "mobilenetv2", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "mobilenetv2", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "Qwen2.5-7B", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150X4", + "N300" + ], + "hf_model_id": "Qwen/Qwen2.5-7B", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff", + "service_route": "/v1/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen2.5-7B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "N150X4", + "N300" + ], + "hf_model_id": "Qwen/Qwen2.5-7B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-5b5db8a-e771fff", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen2.5-Coder-32B-Instruct", + "model_type": "CHAT", + "display_model_type": "LLM", + "device_configurations": [ + "GALAXY_T3K", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-Coder-32B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-17a5973-aa4ae1e", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": "Qwen2.5-VL-32B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-32B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 32 + }, + { + "model_name": 
"Qwen2.5-VL-3B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-3B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 3 + }, + { + "model_name": "Qwen2.5-VL-7B-Instruct", + "model_type": "VLM", + "display_model_type": "VLM", + "device_configurations": [ + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen2.5-VL-7B-Instruct", + "inference_engine": "vLLM", + "supported_modalities": [ + "text", + "image" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-inference-server/vllm-tt-metal-src-release-ubuntu-22.04-amd64:0.9.0-c18569e-b2894d3", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1 + }, + "param_count": 7 + }, + { + "model_name": "Qwen3-Embedding-4B", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-Embedding-4B", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "1024", + "VLLM__MAX_MODEL_LENGTH": "1024", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "1", + "MAX_BATCH_SIZE": "1", + "DEFAULT_THROTTLE_LEVEL": "0" + }, + "param_count": 4 + }, + { + "model_name": "Qwen3-Embedding-8B", + "model_type": "EMBEDDING", + "display_model_type": "EMBEDDING", + "device_configurations": [ + "GALAXY", + "N150", + "N300", + "T3K" + ], + "hf_model_id": "Qwen/Qwen3-Embedding-8B", + "inference_engine": "media", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-media-inference-server:0.2.0-2496be4518bca0a7a5b497a4cda3cfe7e2f59756", + "service_route": "/enqueue", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1", + "VLLM__MAX_NUM_BATCHED_TOKENS": "1024", + "VLLM__MAX_MODEL_LENGTH": "1024", + "VLLM__MIN_CONTEXT_LENGTH": "32", + "VLLM__MAX_NUM_SEQS": "1" + }, + "param_count": 8 + }, + { + "model_name": "resnet-50", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "resnet-50", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": 
"TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "segformer", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "segformer", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "unet", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "unet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "vit", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "vit", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": 
"ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + }, + { + "model_name": "vovnet", + "model_type": "CNN", + "display_model_type": "CNN", + "device_configurations": [ + "N150", + "N300" + ], + "hf_model_id": "vovnet", + "inference_engine": "forge", + "supported_modalities": [ + "text" + ], + "status": "EXPERIMENTAL", + "version": "0.9.0", + "docker_image": "ghcr.io/tenstorrent/tt-shield/tt-media-inference-server-forge:a9b09e0b611da6deb4d8972e8296148fd864e5fd_98dcf62_60920940673", + "service_route": "/v1/chat/completions", + "shm_size": "32G", + "setup_type": "TT_INFERENCE_SERVER", + "env_vars": { + "VLLM_CONFIGURE_LOGGING": "1", + "VLLM_RPC_TIMEOUT": "900000", + "VLLM_TARGET_DEVICE": "tt", + "TORCHDYNAMO_DISABLE": "1" + }, + "param_count": null + } + ] +} diff --git a/app/backend/shared_config/sync_models_from_inference_server.py b/app/backend/shared_config/sync_models_from_inference_server.py new file mode 100644 index 00000000..e750208d --- /dev/null +++ b/app/backend/shared_config/sync_models_from_inference_server.py @@ -0,0 +1,257 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC + +""" +Sync script: reads ../../tt-inference-server/model_specs_output.json and +normalizes it into models_from_inference_server.json (co-located with this script). 
+ +Run from any directory: + python app/backend/shared_config/sync_models_from_inference_server.py +""" + +import json +import os +from datetime import datetime, timezone +from pathlib import Path + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- +SCRIPT_DIR = Path(__file__).parent +OUTPUT_JSON = SCRIPT_DIR / "models_from_inference_server.json" + +# Source JSON resolution order: +# 1. Explicit --source CLI argument +# 2. TT_INFERENCE_ARTIFACT_PATH env var (set by run.py after artifact download) +# 3. .artifacts/tt-inference-server/ next to repo root (artifact default location) +# 4. tt-inference-server/ next to repo root (legacy submodule path) +_REPO_ROOT = SCRIPT_DIR / "../../.." +_CANDIDATE_SOURCES = [ + _REPO_ROOT / ".artifacts/tt-inference-server/model_specs_output.json", + _REPO_ROOT / "tt-inference-server/model_specs_output.json", +] + + +def resolve_source_json(override: str | None = None) -> Path: + """Return the path to model_specs_output.json, trying candidates in order.""" + if override: + p = Path(override) + if not p.exists(): + raise FileNotFoundError(f"--source path not found: {p}") + return p.resolve() + + # Check env var set by run.py + artifact_path = os.environ.get("TT_INFERENCE_ARTIFACT_PATH") + if artifact_path: + p = Path(artifact_path) / "model_specs_output.json" + if p.exists(): + return p.resolve() + + # Try static candidates + for candidate in _CANDIDATE_SOURCES: + if candidate.exists(): + return candidate.resolve() + + raise FileNotFoundError( + "Cannot find model_specs_output.json. 
Tried:\n" + + "\n".join(f" {c.resolve()}" for c in _CANDIDATE_SOURCES) + ) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +DEVICE_SPECIFIC_ENV_KEYS = {"WH_ARCH_YAML", "MESH_DEVICE", "ARCH_NAME"} + +STATUS_ORDER = {"COMPLETE": 3, "FUNCTIONAL": 2, "EXPERIMENTAL": 1} + +# device_type string (from tt-inference-server) → DeviceConfigurations member name +# Only include device_types that exist in DeviceConfigurations enum +DEVICE_TYPE_TO_CONFIG = { + "N150": "N150", + "N300": "N300", + "T3K": "T3K", + "N150X4": "N150X4", + "P100": "P100", + "P150": "P150", + "P150X4": "P150X4", + "P150X8": "P150X8", + "GALAXY": "GALAXY", + "GALAXY_T3K": "GALAXY_T3K", +} + + +def map_model_type(raw_model_type: str, inference_engine: str) -> str: + """Map tt-inference-server model_type + inference_engine to tt-studio ModelTypes.""" + if raw_model_type == "LLM" and inference_engine == "vLLM": + return "CHAT" + if raw_model_type == "VLM": + return "VLM" + if raw_model_type == "IMAGE": + return "IMAGE_GENERATION" + if raw_model_type == "AUDIO": + return "SPEECH_RECOGNITION" + if raw_model_type == "TEXT_TO_SPEECH" or raw_model_type == "TTS": + return "TTS" + if raw_model_type == "VIDEO": + return "VIDEO" + if raw_model_type == "EMBEDDING": + return "EMBEDDING" + # CNN + media engine = image generation (FLUX, Motif, etc.) + if raw_model_type == "CNN" and inference_engine == "media": + return "IMAGE_GENERATION" + # CNN + forge = computer vision / object detection (resnet, vit, etc.) 
+ if raw_model_type == "CNN" and inference_engine == "forge": + return "CNN" + return "CHAT" + + +CHAT_CAPABLE_PATTERNS = [ + "instruct", "-chat", "chat-", "-it-", "-it", "assistant", + # Reasoning / thinking models that do have chat templates + "deepseek-r1", "qwq", "qwen3", "gpt-oss", +] + + +def is_chat_capable(hf_model_id: str) -> bool: + lower = hf_model_id.lower() + return any(p in lower for p in CHAT_CAPABLE_PATTERNS) + + +def map_service_route(inference_engine: str, hf_model_id: str = "") -> str: + """Derive service_route from inference_engine (and model id for vLLM).""" + if inference_engine == "vLLM": + return "/v1/chat/completions" if is_chat_capable(hf_model_id) else "/v1/completions" + if inference_engine == "media": + return "/enqueue" + if inference_engine == "forge": + return "/v1/chat/completions" + return "/v1/chat/completions" + + +def filter_env_vars(env_vars: dict) -> dict: + """Strip device-specific env vars that ModelImpl.__post_init__ handles.""" + return {k: v for k, v in env_vars.items() if k not in DEVICE_SPECIFIC_ENV_KEYS} + + +def pick_higher_status(current: str | None, candidate: str) -> str: + """Return whichever status is higher priority.""" + if current is None: + return candidate + return current if STATUS_ORDER.get(current, 0) >= STATUS_ORDER.get(candidate, 0) else candidate + + +def normalize(source_path: Path) -> list[dict]: + with open(source_path) as f: + raw = json.load(f) + + # group by model_name, skipping GPU entries + by_model: dict[str, list[dict]] = {} + for entry in raw.values(): + if entry.get("device_type") == "GPU": + continue + name = entry["model_name"] + by_model.setdefault(name, []).append(entry) + + models = [] + for model_name, entries in by_model.items(): + # Use first entry as the canonical source for model-level fields + first = entries[0] + + # Aggregate device_types + device_configurations = sorted( + { + DEVICE_TYPE_TO_CONFIG[e["device_type"]] + for e in entries + if e.get("device_type") in 
def main(argv: list[str] | None = None) -> None:
    """Regenerate ``models_from_inference_server.json`` from the spec file.

    Resolves the source spec, normalizes it into the catalog model list,
    stamps it with an artifact version and a UTC generation time, writes the
    result to ``OUTPUT_JSON`` and prints a short summary.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, so ``main()`` behaves as before.

    Raises:
        FileNotFoundError: Propagated from ``resolve_source_json`` when the
            spec file cannot be located.
    """
    import argparse
    from collections import Counter

    parser = argparse.ArgumentParser(description="Sync model catalog from tt-inference-server")
    parser.add_argument("--source", default=None, help="Path to model_specs_output.json (overrides auto-detection)")
    args = parser.parse_args(argv)

    # resolve_source_json() only returns paths it found on disk and raises
    # FileNotFoundError otherwise, so no second existence check is needed.
    source_path = resolve_source_json(args.source)
    print(f"Reading: {source_path}")

    models = normalize(source_path)

    # Resolve artifact version from VERSION file or env vars (avoid leaking absolute paths)
    artifact_version = None
    version_file = source_path.parent / "VERSION"
    if version_file.exists():
        artifact_version = version_file.read_text(encoding="utf-8").strip()
    if not artifact_version:
        artifact_version = (
            os.environ.get("TT_INFERENCE_ARTIFACT_VERSION")
            or os.environ.get("TT_INFERENCE_ARTIFACT_BRANCH")
            or "unknown"
        )

    catalog = {
        "source": {
            "artifact_version": artifact_version,
            "generated_at": datetime.now(timezone.utc).isoformat(),
        },
        "total_models": len(models),
        "models": models,
    }

    # Serialize fully *before* touching the output file: if a value turns out
    # not to be JSON-serializable, the existing catalog is left intact instead
    # of being truncated halfway through a json.dump().
    payload = json.dumps(catalog, indent=2) + "\n"
    out_path = OUTPUT_JSON.resolve()
    out_path.write_text(payload, encoding="utf-8")

    print(f"Written {len(models)} models → {out_path}")

    # Print a summary
    status_counts = Counter(m["status"] for m in models)
    type_counts = Counter(m["model_type"] for m in models)
    display_type_counts = Counter(m["display_model_type"] for m in models)
    print(f"  Status distribution: {dict(status_counts)}")
    print(f"  Type distribution: {dict(type_counts)}")
    print(f"  Display type distribution: {dict(display_type_counts)}")


if __name__ == "__main__":
    main()
/** Frontend model-type constants used to pick the UI page for a deployed model. */
export const ModelType = {
  ChatModel: "ChatModel",
  VLM: "VLM",
  ImageGeneration: "ImageGeneration",
  VideoGeneration: "VideoGeneration",
  ObjectDetectionModel: "ObjectDetectionModel",
  SpeechRecognitionModel: "SpeechRecognitionModel",
  TTS: "TTS",
  Embedding: "Embedding",
  CNN: "CNN",
};

// Lower-cased backend model_type -> frontend ModelType constant.
const BACKEND_TYPE_TO_MODEL_TYPE = new Map<string, string>([
  ["chat", ModelType.ChatModel],
  ["vlm", ModelType.VLM],
  ["image_generation", ModelType.ImageGeneration],
  ["video_generation", ModelType.VideoGeneration],
  ["object_detection", ModelType.ObjectDetectionModel],
  ["speech_recognition", ModelType.SpeechRecognitionModel],
  ["tts", ModelType.TTS],
  ["embedding", ModelType.Embedding],
  ["cnn", ModelType.CNN],
]);

/**
 * Map backend model_type strings (from catalog/API) to frontend ModelType constants.
 *
 * Matching ignores case and surrounding whitespace, so an upper-case value
 * such as "CNN" resolves the same way as "cnn".
 * Falls back to ChatModel for unknown, empty, or non-string input.
 */
export const getModelTypeFromBackendType = (backendType: string): string => {
  if (typeof backendType !== "string") return ModelType.ChatModel;
  const key = backendType.trim().toLowerCase();
  return BACKEND_TYPE_TO_MODEL_TYPE.get(key) ?? ModelType.ChatModel;
};
// ----- deployModel with device_id support -----
/**
 * POST a deploy request for the given model/weights to "/docker-api/deploy/".
 *
 * @param modelId   sent as `model_id`
 * @param weightsId sent as `weights_id`
 * @param deviceId  sent as `device_id` (defaults to 0)
 * @returns the parsed JSON body of the response, whatever the HTTP status
 * @throws Error when the response body is not valid JSON (the message
 *         includes the HTTP status instead of a bare parse error)
 */
export const deployModel = async (
  modelId: string,
  weightsId: string,
  deviceId: number = 0,
): Promise<{ job_id?: string; status?: string; message?: string }> => {
  const payload = JSON.stringify({
    model_id: modelId,
    weights_id: weightsId,
    device_id: deviceId,
  });
  const response = await fetch("/docker-api/deploy/", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: payload,
  });
  try {
    return await response.json();
  } catch {
    throw new Error(
      `Deploy request returned a non-JSON response (HTTP ${response.status})`,
    );
  }
};

// ----- Voice Pipeline -----
export interface VoicePipelineRequest {
  audioFile: File;
  whisperDeployId: string;
  llmDeployId: string;
  ttsDeployId?: string;
  systemPrompt?: string;
}

/**
 * Calls the voice pipeline endpoint and streams its `data:` events to the
 * supplied callbacks.
 *
 * The request is a multipart POST to "/models-api/pipeline/voice/". The
 * response body is read incrementally; every line starting with `data:` is
 * parsed as JSON and dispatched on its `type` field:
 * "transcript" -> onTranscript, "llm_chunk" -> onLlmChunk,
 * "audio_url" -> onAudio, "error" -> onError, "done" -> onDone.
 *
 * A non-OK response (or one without a body) is reported through
 * `onError("pipeline", "HTTP <status>")`. Lines that are not valid JSON are
 * skipped. The returned promise resolves once the stream has ended; nothing
 * is returned for the caller to close.
 */
export const runVoicePipeline = async (
  req: VoicePipelineRequest,
  onTranscript: (text: string) => void,
  onLlmChunk: (text: string) => void,
  onAudio: (dataUrl: string) => void,
  onError: (stage: string, message: string) => void,
  onDone: () => void,
): Promise<void> => {
  const form = new FormData();
  form.append("audio_file", req.audioFile);
  form.append("whisper_deploy_id", req.whisperDeployId);
  form.append("llm_deploy_id", req.llmDeployId);
  if (req.ttsDeployId) form.append("tts_deploy_id", req.ttsDeployId);
  if (req.systemPrompt) form.append("system_prompt", req.systemPrompt);

  const response = await fetch("/models-api/pipeline/voice/", {
    method: "POST",
    body: form,
  });

  if (!response.ok || !response.body) {
    onError("pipeline", `HTTP ${response.status}`);
    return;
  }

  // Parse one line of the stream and hand the event to the right callback.
  const dispatchLine = (rawLine: string): void => {
    // Tolerate CRLF line endings.
    const line = rawLine.endsWith("\r") ? rawLine.slice(0, -1) : rawLine;
    if (!line.startsWith("data:")) return;
    // The single space after "data:" is optional.
    const payload = line.startsWith("data: ") ? line.slice(6) : line.slice(5);

    let evt: any;
    try {
      evt = JSON.parse(payload);
    } catch {
      return; // skip malformed lines
    }
    if (evt === null || typeof evt !== "object") return;

    // Callbacks run outside the try above so that an exception thrown by a
    // caller's handler is not mistaken for a malformed line and swallowed.
    switch (evt.type) {
      case "transcript":
        onTranscript(evt.text);
        break;
      case "llm_chunk":
        onLlmChunk(evt.text);
        break;
      case "audio_url":
        onAudio(evt.url);
        break;
      case "error":
        onError(evt.stage ?? "unknown", evt.message);
        break;
      case "done":
        onDone();
        break;
      default:
        break;
    }
  };

  const reader = response.body.getReader();
  const decoder = new TextDecoder();
  let buffer = "";

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    buffer += decoder.decode(value, { stream: true });

    // Everything before the last "\n" is complete; keep the remainder.
    const lines = buffer.split("\n");
    buffer = lines.pop() ?? "";
    lines.forEach(dispatchLine);
  }

  // Flush bytes still held by the decoder and handle a final line that was
  // not terminated by a newline.
  buffer += decoder.decode();
  if (buffer) dispatchLine(buffer);
};
"text-emerald-500", - bgColor: "bg-emerald-50 dark:bg-emerald-900/20", - borderColor: "border-emerald-200 dark:border-emerald-800", - }, - speech_recognition: { - label: "Speech Recognition", - icon: Mic, - color: "text-orange-500", - bgColor: "bg-orange-50 dark:bg-orange-900/20", - borderColor: "border-orange-200 dark:border-orange-800", - }, - mock: { - label: "Test Models", - icon: Bot, - color: "text-gray-500", - bgColor: "bg-gray-50 dark:bg-gray-900/20", - borderColor: "border-gray-200 dark:border-gray-800", + EXPERIMENTAL: { + label: "Experimental", + icon: FlaskConical, + color: "text-amber-500", + bgColor: "bg-amber-50 dark:bg-amber-900/20", + borderColor: "border-amber-200 dark:border-amber-800", }, }; +// Model type configuration for grouping by inference server type +const TYPE_CONFIG: Record = { + LLM: { label: "LLM Models", order: 1 }, + VLM: { label: "VLM Models", order: 2 }, + VIDEO: { label: "Video Models", order: 3 }, + IMAGE: { label: "Image Models", order: 4 }, + AUDIO: { label: "Audio Models", order: 5 }, + TEXT_TO_SPEECH: { label: "TTS Models", order: 6 }, + EMBEDDING: { label: "Embedding Models", order: 7 }, + CNN: { label: "CNN Models", order: 8 }, +}; + const FirstFormSchema = z.object({ model: z.string().nonempty("Please select a model."), }); +// Multi-chip boards where the user needs to pick a chip slot. +// Single-chip boards (N150, N300 standalone, E150, P100, P150, P300c) always have +// only one chip so no picker is needed there. 
+const MULTI_CHIP_BOARD_SLOTS: Record = { + T3K: 4, // 4x N300 + T3000: 4, + N150X4: 4, + N300x4: 4, + P150X4: 4, + P150X8: 8, + P300Cx2: 4, // 2 cards × 2 chips + P300Cx4: 8, // 4 cards × 2 chips + GALAXY: 32, + GALAXY_T3K: 32, +}; + export function FirstStepForm({ setSelectedModel, setFormError, + setSelectedDeviceId, autoDeployModel, isAutoDeploying, }: { setSelectedModel: (model: string) => void; setFormError: (hasError: boolean) => void; + setSelectedDeviceId?: (deviceId: number) => void; autoDeployModel?: string | null; isAutoDeploying?: boolean; }) { @@ -112,6 +123,7 @@ export function FirstStepForm({ const [models, setModels] = useState([]); const [isLoading, setIsLoading] = useState(true); const [isWarningDismissed, setIsWarningDismissed] = useState(false); + const [deviceId, setDeviceId] = useState(0); // Refresh models context when component mounts useEffect(() => { @@ -185,9 +197,12 @@ export function FirstStepForm({ console.log( "📝 FirstStepForm: Setting selectedModel to:", - selectedModel.id + selectedModel.id, + "device_id:", + deviceId, ); setSelectedModel(selectedModel.id); + if (setSelectedDeviceId) setSelectedDeviceId(deviceId); console.log( "📝 FirstStepForm: selectedModel set, waiting for status check..." 
); @@ -235,33 +250,35 @@ export function FirstStepForm({ } }, [autoDeployModel, models, isAutoDeploying, form, onSubmit]); - // Get current board info and group models by type and compatibility + // Get current board info and group models by status and compatibility const currentBoard = models[0]?.current_board || "unknown"; - // Group models by type and compatibility + // Status priority order for sorting + const STATUS_ORDER: Record = { + COMPLETE: 3, + FUNCTIONAL: 2, + EXPERIMENTAL: 1, + }; + + // Group models by display type, then by status, then by hardware compatibility + type CompatibilityGroup = { compatible: Model[]; incompatible: Model[]; unknown: Model[] }; const groupModelsByType = () => { - const grouped: Record< - string, - { - compatible: Model[]; - incompatible: Model[]; - unknown: Model[]; - } - > = {}; + const grouped: Record> = {}; models.forEach((model) => { - const modelType = model.model_type || "unknown"; + const displayType = model.display_model_type || "LLM"; + const modelStatus = model.status || "EXPERIMENTAL"; - if (!grouped[modelType]) { - grouped[modelType] = { compatible: [], incompatible: [], unknown: [] }; - } + if (!grouped[displayType]) grouped[displayType] = {}; + if (!grouped[displayType][modelStatus]) + grouped[displayType][modelStatus] = { compatible: [], incompatible: [], unknown: [] }; if (model.is_compatible === true) { - grouped[modelType].compatible.push(model); + grouped[displayType][modelStatus].compatible.push(model); } else if (model.is_compatible === false) { - grouped[modelType].incompatible.push(model); + grouped[displayType][modelStatus].incompatible.push(model); } else { - grouped[modelType].unknown.push(model); + grouped[displayType][modelStatus].unknown.push(model); } }); @@ -340,99 +357,106 @@ export function FirstStepForm({ )} - {/* Render models grouped by type */} - {Object.entries(groupedModels).map( - ([modelType, modelsByCompatibility], typeIndex) => { - const typeConfig = - MODEL_TYPE_CONFIG[ - modelType 
as keyof typeof MODEL_TYPE_CONFIG - ]; - const hasModels = - modelsByCompatibility.compatible.length + - modelsByCompatibility.incompatible.length + - modelsByCompatibility.unknown.length > - 0; - - if (!hasModels) return null; - - const IconComponent = typeConfig?.icon || Bot; + {/* Render models grouped by type, then by status */} + {Object.entries(groupedModels) + .sort(([a], [b]) => { + const orderA = TYPE_CONFIG[a]?.order ?? 99; + const orderB = TYPE_CONFIG[b]?.order ?? 99; + return orderA - orderB; + }) + .map(([displayType, statusGroups], typeIndex) => { + const typeConfig = TYPE_CONFIG[displayType]; + const typeLabel = typeConfig?.label || `${displayType} Models`; return ( -
- {/* Model Type Header */} +
+ {/* Type Group Header */} {typeIndex > 0 && ( -
+
)} -
- - {typeConfig?.label || modelType} +
+ {typeLabel}
- {/* Compatible Models */} - {modelsByCompatibility.compatible.map((model) => ( - -
- - ● - - {model.name} - - Compatible - -
-
- ))} - - {/* Incompatible Models */} - {modelsByCompatibility.incompatible.map((model) => ( - -
- - ● - - - {model.name} - - - Incompatible - -
-
- ))} - - {/* Unknown Compatibility Models */} - {modelsByCompatibility.unknown.map((model) => ( - -
- - ● - - {model.name} - - Unknown - -
-
- ))} + {/* Status sub-groups within this type */} + {Object.entries(statusGroups) + .sort( + ([a], [b]) => + (STATUS_ORDER[b] ?? 0) - (STATUS_ORDER[a] ?? 0) + ) + .map(([modelStatus, modelsByCompatibility]) => { + const statusConfig = + STATUS_CONFIG[modelStatus as keyof typeof STATUS_CONFIG]; + const hasModels = + modelsByCompatibility.compatible.length + + modelsByCompatibility.incompatible.length + + modelsByCompatibility.unknown.length > 0; + + if (!hasModels) return null; + + const IconComponent = statusConfig?.icon || Bot; + + return ( +
+ {/* Status Sub-Header */} +
+ + {statusConfig?.label || modelStatus} +
+ + {/* Compatible Models */} + {modelsByCompatibility.compatible.map((model: Model) => ( + +
+ + {model.name} + Compatible +
+
+ ))} + + {/* Incompatible Models */} + {modelsByCompatibility.incompatible.map((model: Model) => ( + +
+ + {model.name} + Incompatible +
+
+ ))} + + {/* Unknown Compatibility Models */} + {modelsByCompatibility.unknown.map((model: Model) => ( + +
+ + {model.name} + Unknown +
+
+ ))} +
+ ); + })}
); - } - )} + })} {/* If no models loaded yet */} {models.length === 0 && !isLoading && ( @@ -443,6 +467,36 @@ export function FirstStepForm({ + {/* Device ID picker — only for multi-chip boards (T3K=4 slots, Galaxy=32, etc.) */} + {(() => { + const selected = models.find((m) => m.name === form.watch("model")); + const board = selected?.current_board ?? currentBoard; + const maxSlots = MULTI_CHIP_BOARD_SLOTS[board]; + if (!maxSlots) return null; // single-chip board — no choice needed + const maxId = maxSlots - 1; + return ( +
+ + { + const v = parseInt(e.target.value, 10); + setDeviceId(isNaN(v) ? 0 : Math.max(0, Math.min(maxId, v))); + }} + className="w-20 rounded-md border border-gray-300 dark:border-gray-600 bg-white dark:bg-gray-800 px-2 py-1 text-sm text-gray-900 dark:text-gray-100 focus:outline-none focus:ring-2 focus:ring-TT-purple-accent" + /> + + /dev/tenstorrent/{deviceId}  ·  {maxSlots} chips available + +
+ ); + })()} + {/* Summary info */} {models.length > 0 && !isLoading && (
diff --git a/app/frontend/src/components/Footer.tsx b/app/frontend/src/components/Footer.tsx index 513e9fe3..f60aa5eb 100644 --- a/app/frontend/src/components/Footer.tsx +++ b/app/frontend/src/components/Footer.tsx @@ -7,6 +7,7 @@ import { Badge } from "./ui/badge"; import { useTheme } from "../hooks/useTheme"; import { useNavigate, useLocation } from "react-router-dom"; import { useModels } from "../hooks/useModels"; +import { useDeviceState } from "../hooks/useDeviceState"; import { Tooltip, TooltipContent, @@ -31,34 +32,19 @@ interface FooterProps { className?: string; } -interface SystemStatus { +interface SystemResources { cpuUsage: number; memoryUsage: number; memoryTotal: string; - boardName: string; - temperature: number; - devices: Array<{ - index: number; - board_type: string; - temperature: number; - power: number; - voltage: number; - }>; - hardware_status?: "healthy" | "error" | "unknown"; - hardware_error?: string; - error?: string; } const REFRESH_COOLDOWN_MS = 2 * 60 * 1000; // 2 minutes cooldown between manual refreshes const Footer: React.FC = ({ className }) => { - const [systemStatus, setSystemStatus] = useState({ + const [systemResources, setSystemResources] = useState({ cpuUsage: 0, memoryUsage: 0, memoryTotal: "0 GB", - boardName: "Unknown", - temperature: 0, - devices: [], }); const [loading, setLoading] = useState(true); const [error, setError] = useState(null); @@ -67,6 +53,7 @@ const Footer: React.FC = ({ className }) => { const [showTTStudioModal, setShowTTStudioModal] = useState(false); const [bugReportLoading, setBugReportLoading] = useState(false); const { models } = useModels(); + const { deviceState, refresh: refreshDeviceState } = useDeviceState(); const navigate = useNavigate(); const location = useLocation(); const { theme } = useTheme(); @@ -82,8 +69,8 @@ const Footer: React.FC = ({ className }) => { // Check if we should hide the footer const shouldHideFooter = location.pathname === "/chat"; - // Fetch system status from API - 
const fetchSystemStatus = async () => { + // Fetch only CPU/memory resources (board info comes from DeviceStateContext) + const fetchSystemResources = async () => { try { const response = await fetch("/board-api/footer-data/"); if (!response.ok) { @@ -96,18 +83,15 @@ const Footer: React.FC = ({ className }) => { } const data = await response.json(); - setSystemStatus(data); + setSystemResources({ + cpuUsage: data.cpuUsage ?? 0, + memoryUsage: data.memoryUsage ?? 0, + memoryTotal: data.memoryTotal ?? "0 GB", + }); setError(null); } catch (err) { - console.error("Failed to fetch system status:", err); + console.error("Failed to fetch system resources:", err); setError(err instanceof Error ? err.message : "Unknown error"); - // Keep previous data or use fallback - setSystemStatus((prev) => ({ - ...prev, - boardName: prev.hardware_status === "error" ? prev.boardName : "Error", - hardware_status: prev.hardware_status === "error" ? "error" : "unknown", - error: err instanceof Error ? err.message : "Unknown error", - })); } finally { setLoading(false); } @@ -129,18 +113,8 @@ const Footer: React.FC = ({ className }) => { try { setRefreshing(true); - const response = await fetch("/board-api/refresh-cache/", { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - }); - - if (!response.ok) { - throw new Error(`HTTP error! status: ${response.status}`); - } - - await fetchSystemStatus(); + // Trigger an immediate re-poll of device state via context + refreshDeviceState(); } catch (err) { console.error("Failed to refresh board detection:", err); setError(err instanceof Error ? 
err.message : "Unknown error"); @@ -151,26 +125,57 @@ const Footer: React.FC = ({ className }) => { }; useEffect(() => { - // Initial fetch on mount only - fetchSystemStatus(); - - // No more timer-based polling - will refresh on model deployment events + // Fetch CPU/memory once on mount (board info is handled by DeviceStateContext) + fetchSystemResources(); + // eslint-disable-next-line react-hooks/exhaustive-deps }, []); const textColor = theme === "dark" ? "text-zinc-300" : "text-gray-700"; const borderColor = theme === "dark" ? "border-zinc-700" : "border-gray-200"; const bgColor = theme === "dark" ? "bg-zinc-900/95" : "bg-white/95"; const mutedTextColor = theme === "dark" ? "text-zinc-400" : "text-gray-500"; - const normalizedBoardName = systemStatus.boardName?.toLowerCase(); + + // Derive board info from DeviceStateContext + const boardName = deviceState?.board_name ?? "Unknown"; + const deviceStateName = deviceState?.state ?? "UNKNOWN"; + const devices = deviceState?.devices ?? []; + const avgTemperature = + devices.length > 0 + ? Math.round( + (devices.reduce((sum, d) => sum + (d.temperature ?? 
0), 0) / + devices.length) * + 10 + ) / 10 + : 0; + const isHardwareHealthy = deviceStateName === "HEALTHY"; + const isHardwareError = + deviceStateName === "BAD_STATE" || deviceStateName === "NOT_PRESENT"; + const normalizedBoardName = boardName.toLowerCase(); const isBoardDetectionIssue = - systemStatus.hardware_status === "error" || + isHardwareError || !!error || normalizedBoardName === "error" || - normalizedBoardName === "unknown"; + normalizedBoardName === "unknown" || + normalizedBoardName === "not present" || + normalizedBoardName === "bad state"; const remainingCooldownMs = getRemainingCooldownMs(); const isInCooldown = remainingCooldownMs > 0; const cooldownSeconds = Math.ceil(remainingCooldownMs / 1000); + // Legacy-compatible derived values used by bug-report and render + const hardwareStatus: "healthy" | "error" | "unknown" = + deviceStateName === "HEALTHY" + ? "healthy" + : deviceStateName === "BAD_STATE" || deviceStateName === "NOT_PRESENT" + ? "error" + : "unknown"; + const hardwareError = + deviceStateName === "BAD_STATE" + ? "Board is in a bad state (unresponsive). Reset recommended." + : deviceStateName === "NOT_PRESENT" + ? "No Tenstorrent device detected. Check hardware connection." 
+ : null; + // Handle click on deployed models section const handleDeployedModelsClick = () => { navigate("/models-deployed"); @@ -302,19 +307,19 @@ const Footer: React.FC = ({ className }) => { **Time:** ${new Date().toLocaleTimeString()} ### System Information -- **Board:** ${systemStatus.boardName} -- **Hardware Status:** ${systemStatus.hardware_status || "unknown"} -- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}% -- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal}) -- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C -- **Devices:** ${systemStatus.devices.length} device(s) +- **Board:** ${boardName} +- **Hardware Status:** ${hardwareStatus || "unknown"} +- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}% +- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal}) +- **Temperature:** ${avgTemperature.toFixed(1)}°C +- **Devices:** ${devices.length} device(s) - **Current URL:** ${currentUrl} - **User Agent:** ${userAgent} ### Hardware Details ${ - systemStatus.devices.length > 0 - ? systemStatus.devices + devices.length > 0 + ? devices .map( (device, index) => `**Device ${index + 1}:** @@ -332,7 +337,7 @@ ${models.length > 0 ? models.map((model) => `- ${model.name} (${model.status})`) ### Error Information ${error ? `**System Error:** ${error}` : "No system errors detected"} -${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"} +${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"} ### FastAPI Logs ${fastapiLogs} @@ -379,15 +384,15 @@ Add any other context about the problem here. 
: text; }; const limitDevicesList = (maxDevices: number) => { - if (systemStatus.devices.length <= maxDevices) return undefined; - const blocks = systemStatus.devices + if (devices.length <= maxDevices) return undefined; + const blocks = devices .map( (device, index) => `**Device ${index + 1}:**\n- Board Type: ${device.board_type}\n- Temperature: ${device.temperature.toFixed(1)}°C\n- Power: ${device.power.toFixed(2)}W\n- Voltage: ${device.voltage.toFixed(2)}V` ) .slice(0, maxDevices) .join("\n\n"); - return `${blocks}\n\n... (${systemStatus.devices.length - maxDevices} more device entries truncated)`; + return `${blocks}\n\n... (${devices.length - maxDevices} more device entries truncated)`; }; const MAX_URL_LENGTH = 7000; // conservative safety limit for GitHub new-issue URL @@ -406,15 +411,15 @@ Add any other context about the problem here. **Time:** ${new Date().toLocaleTimeString()} ### System Information -- **Board:** ${systemStatus.boardName} -- **Hardware Status:** ${systemStatus.hardware_status || "unknown"} -- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}% -- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal}) -- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C -- **Devices:** ${systemStatus.devices.length} device(s) +- **Board:** ${boardName} +- **Hardware Status:** ${hardwareStatus || "unknown"} +- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}% +- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal}) +- **Temperature:** ${avgTemperature.toFixed(1)}°C +- **Devices:** ${devices.length} device(s) ### Hardware Details (truncated) -${devicesLimited ?? (systemStatus.devices.length ? "(within limit)" : "No hardware devices detected")} +${devicesLimited ?? (devices.length ? "(within limit)" : "No hardware devices detected")} ### Deployed Models ${ @@ -428,7 +433,7 @@ ${ ### Error Information ${error ? 
`**System Error:** ${error}` : "No system errors detected"} -${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"} +${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"} ### FastAPI Logs (truncated) ${truncatedFastapi} @@ -530,16 +535,16 @@ Full logs have been copied to your clipboard and downloaded as a file. Please pa **Time:** ${new Date().toLocaleTimeString()} ### System Information -- **Board:** ${systemStatus.boardName} -- **Hardware Status:** ${systemStatus.hardware_status || "unknown"} -- **CPU Usage:** ${systemStatus.cpuUsage.toFixed(2)}% -- **Memory Usage:** ${systemStatus.memoryUsage.toFixed(1)}% (${systemStatus.memoryTotal}) -- **Temperature:** ${systemStatus.temperature.toFixed(1)}°C -- **Devices:** ${systemStatus.devices.length} device(s) +- **Board:** ${boardName} +- **Hardware Status:** ${hardwareStatus || "unknown"} +- **CPU Usage:** ${systemResources.cpuUsage.toFixed(2)}% +- **Memory Usage:** ${systemResources.memoryUsage.toFixed(1)}% (${systemResources.memoryTotal}) +- **Temperature:** ${avgTemperature.toFixed(1)}°C +- **Devices:** ${devices.length} device(s) ### Error Information ${error ? `**System Error:** ${error}` : "No system errors detected"} -${systemStatus.hardware_error ? `**Hardware Error:** ${systemStatus.hardware_error}` : "No hardware errors detected"} +${hardwareError ? `**Hardware Error:** ${hardwareError}` : "No hardware errors detected"} ### FastAPI Logs ${fallbackFastapiLogs} @@ -622,7 +627,7 @@ Add any other context about the problem here. TT Studio 2.0.1
- {systemStatus.boardName?.toLowerCase().includes("t3k") ? ( + {boardName?.toLowerCase().includes("t3k") ? (
- {systemStatus.boardName} + {boardName}
- ) : systemStatus.boardName?.toLowerCase().includes("n300") ? ( + ) : boardName?.toLowerCase().includes("n300") ? (
- {systemStatus.boardName} + {boardName}
) : (
- {systemStatus.boardName} - {systemStatus.hardware_status === "error" && " ⚠️"} + {boardName} + {hardwareStatus === "error" && " ⚠️"} {isBoardDetectionIssue && ( @@ -724,10 +729,10 @@ Add any other context about the problem here. )}
)} - {(error || systemStatus.hardware_error) && ( + {(error || hardwareError) && ( ⚠️ @@ -794,23 +799,23 @@ Add any other context about the problem here. SYSTEM RESOURCES USAGE: - RAM: {systemStatus.memoryUsage.toFixed(1)}% ( - {systemStatus.memoryTotal}) | CPU:{" "} - {systemStatus.cpuUsage.toFixed(2)}% - {systemStatus.hardware_status === "healthy" && ( - <> | TEMP: {systemStatus.temperature.toFixed(1)}°C + RAM: {systemResources.memoryUsage.toFixed(1)}% ( + {systemResources.memoryTotal}) | CPU:{" "} + {systemResources.cpuUsage.toFixed(2)}% + {hardwareStatus === "healthy" && ( + <> | TEMP: {avgTemperature.toFixed(1)}°C )} - {systemStatus.hardware_status === "error" && ( + {hardwareStatus === "error" && ( <> | TT HARDWARE: UNAVAILABLE )} - {systemStatus.hardware_status === "unknown" && ( + {hardwareStatus === "unknown" && ( <> | TT HARDWARE: CHECKING... )} - {systemStatus.devices.length > 1 && - systemStatus.hardware_status === "healthy" && ( + {devices.length > 1 && + hardwareStatus === "healthy" && ( - ({systemStatus.devices.length} devices) + ({devices.length} devices) )}
diff --git a/app/frontend/src/components/NavBar.tsx b/app/frontend/src/components/NavBar.tsx index 5c9dda0e..1b970b00 100644 --- a/app/frontend/src/components/NavBar.tsx +++ b/app/frontend/src/components/NavBar.tsx @@ -13,6 +13,7 @@ import { Image, Eye, AudioLines, + Mic, ChevronRight, ChevronLeft, type LucideIcon, @@ -46,6 +47,7 @@ import { getDestinationFromModelType, ModelType, getModelTypeFromName, + getModelTypeFromBackendType, } from "../api/modelsDeployedApis"; import { useHeroSection } from "../hooks/useHeroSection"; @@ -386,7 +388,12 @@ export default function NavBar() { if (models.length > 0) { const firstModel = models[0]; if (firstModel.id && firstModel.name) { - handleModelNavigationClick(firstModel.id, firstModel.name, navigate); + handleModelNavigationClick( + firstModel.id, + firstModel.name, + navigate, + firstModel.model_type + ); } else { console.error("Model ID or name is undefined"); } @@ -467,6 +474,13 @@ export default function NavBar() { label: "Logs", tooltip: "View system logs", }, + { + type: "link", + to: "/voice-pipeline", + icon: Mic, + label: "Voice Pipeline", + tooltip: "End-to-end voice demo (Whisper → LLM → TTS)", + }, ]; // Define model-based navigation items (shown only when isDeployedEnabled is true) @@ -484,7 +498,9 @@ export default function NavBar() { if (models.length > 0) { // Show navigation items for each deployed model return models.map((model) => { - const modelType = getModelTypeFromName(model.name); + const modelType = model.model_type + ? getModelTypeFromBackendType(model.model_type) + : getModelTypeFromName(model.name); console.log(`Model: ${model.name}, Type: ${modelType}`); return { type: "button", @@ -545,7 +561,9 @@ export default function NavBar() { // In TT-Studio mode, show only deployed models console.log("TT-Studio mode - creating navigation for deployed models"); return models.map((model) => { - const modelType = getModelTypeFromName(model.name); + const modelType = model.model_type + ? 
getModelTypeFromBackendType(model.model_type) + : getModelTypeFromName(model.name); console.log(`TT-Studio Model: ${model.name}, Type: ${modelType}`); return { type: "button", diff --git a/app/frontend/src/components/ResetIcon.tsx b/app/frontend/src/components/ResetIcon.tsx index 8497ea06..9d27e3e2 100644 --- a/app/frontend/src/components/ResetIcon.tsx +++ b/app/frontend/src/components/ResetIcon.tsx @@ -1,11 +1,19 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC -import React, { useState, useEffect } from "react"; +import React, { useState } from "react"; import axios from "axios"; -import { Cpu, CheckCircle, AlertTriangle } from "lucide-react"; +import { + Cpu, + CheckCircle, + XCircle, + AlertTriangle, + Loader2, + Trash2, + RotateCcw, + ChevronDown, +} from "lucide-react"; import { Spinner } from "./ui/spinner"; -import { customToast } from "./CustomToaster"; import { useTheme } from "../hooks/useTheme"; import { Button } from "./ui/button"; import { @@ -15,369 +23,542 @@ import { DialogHeader, DialogTitle, DialogTrigger, - DialogDescription, } from "./ui/dialog"; -import { - Accordion, - AccordionContent, - AccordionItem, - AccordionTrigger, -} from "./ui/accordion"; import { ScrollArea } from "./ui/scroll-area"; import { fetchModels, deleteModel } from "../api/modelsDeployedApis"; import { useModels } from "../hooks/useModels"; +import { useDeviceState } from "../hooks/useDeviceState"; import BoardBadge from "./BoardBadge"; +type ResetStep = "deleting" | "resetting" | "done" | "failed" | null; + interface ResetIconProps { onReset?: () => void; } -// Board info interface -interface BoardInfo { - type: string; - name: string; +// ── Shared step-row (mirrors DeleteModelDialog) ────────────────────────────── +function StepRow({ + number, + icon, + label, + sublabel, + state, +}: { + number: number; + icon: React.ReactNode; + label: string; + sublabel?: string; + state: "pending" | "active" | "done" | "skipped"; +}) { + 
return ( +
+
+ {state === "active" ? ( + + ) : state === "done" ? ( + + ) : state === "skipped" ? ( + + ) : ( +
+ {number} +
+ )} +
+
+
+ {icon} + {label} +
+ {sublabel && state === "active" && ( +
{sublabel}
+ )} + {state === "done" && ( +
Completed
+ )} + {state === "skipped" && ( +
+ No models deployed — skipped +
+ )} +
+
+ ); +} + +// ── Board status banner ─────────────────────────────────────────────────────── +function BoardStatusBanner({ + state, + boardType, +}: { + state: string; + boardType: string; +}) { + if (state === "BAD_STATE") { + return ( +
+ +
+ Board unresponsive +

+ The board is present but not responding. A reset is strongly + recommended. +

+
+
+ ); + } + if (state === "NOT_PRESENT") { + return ( +
+ +
+ No device detected +

+ /dev/tenstorrent{" "} + not found. Check your hardware connection. +

+
+
+ ); + } + if (state === "HEALTHY" && boardType !== "unknown") { + return ( +
+ + + Board is healthy — reset + is available if needed. + +
+ ); + } + return null; } +// ── Main component ──────────────────────────────────────────────────────────── const ResetIcon: React.FC = ({ onReset }) => { const { theme } = useTheme(); - const { refreshModels } = useModels(); - const [isLoading, setIsLoading] = useState(false); - const [isCompleted, setIsCompleted] = useState(false); + const { models, refreshModels } = useModels(); + const { deviceState, refresh: refreshDeviceState } = useDeviceState(); + const [isDialogOpen, setIsDialogOpen] = useState(false); + const [resetStep, setResetStep] = useState(null); const [errorMessage, setErrorMessage] = useState(null); + const [cmdOutput, setCmdOutput] = useState(null); + const [showOutput, setShowOutput] = useState(false); const [resetHistory, setResetHistory] = useState([]); - const [fullOutput, setFullOutput] = useState(null); - const [boardInfo, setBoardInfo] = useState(null); - const [boardLoading, setBoardLoading] = useState(false); - - // Fetch board information when dialog opens - useEffect(() => { - if (isDialogOpen && !boardInfo) { - fetchBoardInfo(); - } - }, [isDialogOpen]); - const fetchBoardInfo = async () => { - setBoardLoading(true); - try { - const response = await axios.get<{ type: string; name: string }>( - "/docker-api/board-info/" - ); - setBoardInfo(response.data); - } catch (error) { - console.error("Error fetching board info:", error); - // Set default values if detection fails - setBoardInfo({ type: "unknown", name: "Unknown Board" }); - } finally { - setBoardLoading(false); - } - }; + const isLoading = + resetStep === "deleting" || resetStep === "resetting"; + const isCompleted = resetStep === "done"; + const isFailed = resetStep === "failed"; - const iconColor = theme === "dark" ? "text-zinc-200" : "text-black"; - const hoverIconColor = - theme === "dark" ? "hover:text-zinc-300" : "hover:text-gray-700"; - const buttonBackgroundColor = theme === "dark" ? "bg-zinc-900" : "bg-white"; - const hoverButtonBackgroundColor = - theme === "dark" ? 
"hover:bg-zinc-700" : "hover:bg-gray-200"; + const boardType = deviceState?.board_type ?? "unknown"; + const deviceStateName = deviceState?.state ?? "UNKNOWN"; + const isBadState = deviceStateName === "BAD_STATE"; + const isNotPresent = deviceStateName === "NOT_PRESENT"; + const isResettingContext = deviceStateName === "RESETTING"; + const deployedCount = models.length; + + // Step states for the progress rows + const step1State: "pending" | "active" | "done" | "skipped" = + resetStep === "deleting" + ? "active" + : resetStep === "resetting" || resetStep === "done" || resetStep === "failed" + ? deployedCount === 0 + ? "skipped" + : "done" + : "pending"; + + const step2State: "pending" | "active" | "done" | "skipped" = + resetStep === "resetting" + ? "active" + : resetStep === "done" + ? "done" + : "pending"; + + // ── Reset execution ───────────────────────────────────────────────────────── + const executeReset = async () => { + setErrorMessage(null); + setCmdOutput(null); + setShowOutput(false); - // Function to delete all deployed models - const deleteAllModels = async (): Promise => { try { - const models = await fetchModels(); // Fetch all deployed models - console.log("Models to delete:", models); - for (const model of models) { - await customToast.promise(deleteModel(model.id), { - loading: `Deleting Model ID: ${model.id.substring(0, 4)}...`, - success: `Model ID: ${model.id.substring(0, 4)} deleted successfully.`, - error: `Failed to delete Model ID: ${model.id.substring(0, 4)}.`, - }); + // Step 1: delete deployed models + setResetStep("deleting"); + const currentModels = await fetchModels(); + for (const model of currentModels) { + await deleteModel(model.id); } - - // Refresh the ModelsContext to sync with backend await refreshModels(); - } catch (error) { - console.error("Error deleting models:", error); - throw new Error("Failed to delete all models."); - } - }; - const resetBoardAsync = async (): Promise => { - const response = await 
axios.post("/docker-api/reset_board/", null, { - responseType: "blob", - }); - - const reader = response.data.stream().getReader(); - const decoder = new TextDecoder(); - let output = ""; - let success = true; - const statusCode = response.status; - - while (true) { - const { done, value } = await reader.read(); - if (done) break; - - const chunk = decoder.decode(value, { stream: true }); - output += chunk; - - // Check for failure in each chunk - if ( - chunk.includes("Command failed") || - chunk.includes("No Tenstorrent devices detected") || - chunk.includes("Exiting") || - chunk.includes("Error") - ) { - success = false; - } - } + // Step 2: run board reset + setResetStep("resetting"); + const response = await axios.post("/docker-api/reset_board/", null, { + responseType: "blob", + }); - const finalChunk = decoder.decode(); - if (finalChunk) { - output += finalChunk; - if ( - finalChunk.includes("Command failed") || - finalChunk.includes("No Tenstorrent devices detected") || - finalChunk.includes("Exiting") || - finalChunk.includes("Error") - ) { - success = false; - } - } + const reader = response.data.stream().getReader(); + const decoder = new TextDecoder(); + let output = ""; + let success = true; - const styledOutput = success - ? ` - Board Reset Successfully - ----------------------- -
${output}
- ` - : ` - Board Reset Failed - ----------------------- -
${output}
- `; - - setFullOutput(styledOutput); - - if (!success) { - if (statusCode === 501) { - throw new Error( - "No Tenstorrent devices detected. Please check your hardware connection and try again." - ); - } else { - // Parse the error message from the output - const errorLines = output - .split("\n") - .filter( - (line) => - line.includes("tt-smi reset failed") || - line.includes("Please check if:") || - line.includes("1.") || - line.includes("2.") || - line.includes("3.") || - line.includes("4.") - ); - if (errorLines.length > 0) { - throw new Error(errorLines.join("\n")); - } else { - throw new Error( - "Board reset failed. Please check the command output for details." - ); + // eslint-disable-next-line no-constant-condition + while (true) { + const { done, value } = await reader.read(); + if (done) break; + const chunk = decoder.decode(value, { stream: true }); + output += chunk; + if ( + chunk.includes("Command failed") || + chunk.includes("No Tenstorrent devices detected") || + chunk.includes("Error") + ) { + success = false; + } + } + const tail = decoder.decode(); + if (tail) { + output += tail; + if ( + tail.includes("Command failed") || + tail.includes("No Tenstorrent devices detected") || + tail.includes("Error") + ) { + success = false; } } - } - - setIsCompleted(true); - setResetHistory((prevHistory) => [...prevHistory, new Date()]); - setTimeout(() => setIsCompleted(false), 5000); - }; - - const resetBoard = async (): Promise => { - setIsLoading(true); - setIsCompleted(false); - setErrorMessage(null); - setIsDialogOpen(false); - try { - await deleteAllModels(); + setCmdOutput(output); - await customToast.promise(resetBoardAsync(), { - loading: "Resetting board...", - success: "Board reset successfully!", - error: "Failed to reset board.", - }); - - if (onReset) { - console.log("Calling onReset prop function"); - onReset(); - } - } catch (error) { - console.error("Error resetting board:", error); - - if (error instanceof Error) { - const errorOutput = ` - 
Error Resetting Board - ----------------------- -
${error.message}
- `; - setFullOutput(errorOutput); - setErrorMessage(error.message); - } else { - setErrorMessage("An unknown error occurred"); + if (!success) { + throw new Error( + response.status === 501 + ? "No Tenstorrent devices detected. Check hardware connection." + : "Board reset failed. See command output for details." + ); } - setIsDialogOpen(true); - } finally { - setIsLoading(false); + setResetStep("done"); + setResetHistory((prev) => [...prev, new Date()]); + refreshDeviceState(); + if (onReset) onReset(); + } catch (err) { + setErrorMessage( + err instanceof Error ? err.message : "An unknown error occurred." + ); + setResetStep("failed"); } }; - const handleDialogOpenChange = (isOpen: boolean) => { - setIsDialogOpen(isOpen); - if (isOpen) { + const handleOpen = () => { + setIsDialogOpen(true); + // Only reset state when there's nothing in progress — otherwise re-show current progress + if (!isLoading) { + setResetStep(null); setErrorMessage(null); + setCmdOutput(null); + setShowOutput(false); } }; + const handleClose = () => { + setIsDialogOpen(false); + // Do NOT reset state — any in-progress reset continues in the background. + // State is only cleared on the next fresh open (see handleOpen above). + }; + + // ── Navbar trigger button ─────────────────────────────────────────────────── + const iconColor = theme === "dark" ? "text-zinc-200" : "text-black"; + const hoverIconColor = + theme === "dark" ? "hover:text-zinc-300" : "hover:text-gray-700"; + const btnBg = theme === "dark" ? "bg-zinc-900" : "bg-white"; + const btnHover = + theme === "dark" ? "hover:bg-zinc-700" : "hover:bg-gray-200"; + return ( - + (open ? handleOpen() : handleClose())} + > + + {/* ── HEADER ── */} -
-
- - - Reset Card - -
- {boardInfo && boardInfo.type !== "unknown" && ( - - )} - {boardLoading && ( -
- - - Detecting... - +
+
+ {isLoading ? ( +
+ +
+ ) : isCompleted ? ( +
+ +
+ ) : isFailed ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+ + {isLoading + ? resetStep === "deleting" + ? "Removing deployed models…" + : "Resetting board…" + : isCompleted + ? "Reset complete" + : isFailed + ? "Reset failed" + : "Reset Card"} + + {isLoading && ( +

+ Step {resetStep === "deleting" ? "1" : "2"} of 2 — do not + close this window +

+ )}
+
+ {/* Board badge — only when idle */} + {!isLoading && !isCompleted && !isFailed && boardType !== "unknown" && ( + )}
- - Are you sure you want to reset the card? - - {boardInfo && boardInfo.type === "unknown" && ( -
- -
-
- No Tenstorrent device detected -
-
- Device /dev/tenstorrent not found. Please check - your hardware connection and ensure the device is properly - installed. + +
+ {/* ── IDLE: board status + step overview ── */} + {!isLoading && !isCompleted && !isFailed && ( + <> + + + {isResettingContext && ( +
+ + Board is already resetting… +
+ )} + + {/* Step overview */} + } + label={ + deployedCount > 0 + ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}` + : "Stop deployed models" + } + state="pending" + /> + } + label="Reset the board (tt-smi -r)" + state="pending" + /> + + {/* Warning */} +
+ + + Warning: This will + interrupt any ongoing processes on the card. + {resetHistory.length > 0 && ( + + Last reset:{" "} + {resetHistory[resetHistory.length - 1].toLocaleTimeString()} + + )} +
-
-
- )} -
-
-
- Warning! This action will stop all deployed models and might - interrupt ongoing processes. -
- {resetHistory.length > 0 && ( -
- Note: This card was reset in the last 5 minutes. Frequent resets - may cause issues. Please wait before resetting again. + + )} + + {/* ── LOADING: step progress ── */} + {isLoading && ( + <> + } + label={ + deployedCount > 0 + ? `Stop ${deployedCount} deployed model${deployedCount > 1 ? "s" : ""}` + : "Stop deployed models" + } + sublabel="Sending stop signal to all containers…" + state={step1State} + /> + } + label="Reset the board" + sublabel="Running tt-smi -r, this may take 10–30 seconds…" + state={step2State} + /> + + )} + + {/* ── COMPLETED ── */} + {isCompleted && ( + <> + } + label="Deployed models removed" + state={deployedCount === 0 ? "skipped" : "done"} + /> + } + label="Board reset" + state="done" + /> + {cmdOutput && ( + + )} + {showOutput && cmdOutput && ( + +
+                    {cmdOutput}
+                  
+
+ )} + + )} + + {/* ── FAILED ── */} + {isFailed && ( + <> +
+ +
+

+ {errorMessage} +

+ {cmdOutput && ( + + )} +
- )} -
+ {showOutput && cmdOutput && ( + +
+                    {cmdOutput}
+                  
+
+ )} + + )}
- {errorMessage && ( -
-
- -
-
Error:
-
-                  {errorMessage}
-                
-
-
-
- )} - - - - Reset History - - -
    - {resetHistory.length > 0 ? ( - resetHistory.map((resetTime, index) => ( -
  • {resetTime.toLocaleString()}
  • - )) + + {/* ── FOOTER ── */} + + {(isCompleted || isFailed) ? ( + + ) : ( + <> + +
-
-
- {fullOutput && ( - - - Command Output - - - -
- - - + + )} - - - -
diff --git a/app/frontend/src/components/SelectionSteps.tsx b/app/frontend/src/components/SelectionSteps.tsx index 07f8211d..e230e2f2 100644 --- a/app/frontend/src/components/SelectionSteps.tsx +++ b/app/frontend/src/components/SelectionSteps.tsx @@ -22,6 +22,8 @@ export interface Model { compatible_boards: string[]; // List of boards this model can run on model_type: string; // Type of model (e.g., CHAT, IMAGE_GENERATION, etc.) current_board: string; // The detected board type + status?: "EXPERIMENTAL" | "FUNCTIONAL" | "COMPLETE" | null; + display_model_type?: string; } export default function StepperDemo() { @@ -40,6 +42,7 @@ export default function StepperDemo() { }; const [selectedModel, setSelectedModel] = useState(null); + const [selectedDeviceId, setSelectedDeviceId] = useState(0); const [loading, setLoading] = useState(false); const [formError, setFormError] = useState(false); const [isAutoDeploying, setIsAutoDeploying] = useState(false); @@ -72,9 +75,11 @@ export default function StepperDemo() { console.log("Found model for auto-deploy:", model); // Deploy with default weights + const deviceIdParam = parseInt(searchParams.get("device-id") ?? "0", 10); const deployPayload = { model_id: model.id, weights_id: "", // Empty string for default weights + device_id: isNaN(deviceIdParam) ? 
0 : deviceIdParam, }; console.log("Auto-deploy payload:", deployPayload); @@ -137,6 +142,7 @@ export default function StepperDemo() { const payload = JSON.stringify({ model_id, weights_id, + device_id: selectedDeviceId, }); console.log("📦 Deploying with default weights:", { model_id, weights_id }); @@ -213,6 +219,7 @@ export default function StepperDemo() { console.log("🔄 setSelectedModel called with:", modelId); setSelectedModel(modelId); }} + setSelectedDeviceId={setSelectedDeviceId} setFormError={setFormError} autoDeployModel={autoDeployModel} isAutoDeploying={isAutoDeploying} diff --git a/app/frontend/src/components/chatui/runInference.ts b/app/frontend/src/components/chatui/runInference.ts index 251c7cd0..09dffb2d 100644 --- a/app/frontend/src/components/chatui/runInference.ts +++ b/app/frontend/src/components/chatui/runInference.ts @@ -331,8 +331,8 @@ export const runInference = async ( metricsTracker.recordUsage(usage); } - // Handle generated text content - const content = jsonData.choices[0]?.delta?.content || ""; + // Handle generated text content (chat completions use delta.content, text completions use text) + const content = jsonData.choices[0]?.delta?.content ?? jsonData.choices[0]?.text ?? 
""; if (content) { // Record first token arrival metricsTracker.recordFirstToken(); diff --git a/app/frontend/src/components/models/DeleteModelDialog.tsx b/app/frontend/src/components/models/DeleteModelDialog.tsx index 887436f2..e8948286 100644 --- a/app/frontend/src/components/models/DeleteModelDialog.tsx +++ b/app/frontend/src/components/models/DeleteModelDialog.tsx @@ -1,7 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: © 2025 Tenstorrent AI ULC -// React import not needed for modern JSX transform +import type { ReactNode } from "react"; +import { AlertTriangle, CheckCircle, Loader2, Trash2, RotateCcw } from "lucide-react"; import { Dialog, DialogContent, @@ -10,70 +11,176 @@ import { DialogTitle, } from "../ui/dialog"; import { Button } from "../ui/button"; -import { AlertTriangle } from "lucide-react"; + +export type DeleteStep = "deleting" | "resetting" | null; interface Props { open: boolean; modelId: string; isLoading: boolean; + deleteStep: DeleteStep; onConfirm: () => void; onCancel: () => void; } +function StepRow({ + number, + icon, + label, + sublabel, + state, +}: { + number: number; + icon: ReactNode; + label: string; + sublabel?: string; + state: "pending" | "active" | "done"; +}) { + return ( +
+
+ {state === "active" ? ( + + ) : state === "done" ? ( + + ) : ( +
+ {number} +
+ )} +
+
+
+ + {icon} + {label} + +
+ {sublabel && state === "active" && ( +
{sublabel}
+ )} + {state === "done" && ( +
Completed
+ )} +
+
+ ); +} + export default function DeleteModelDialog({ open, - modelId: _modelId, // Marked as intentionally unused for now + modelId: _modelId, isLoading, + deleteStep, onConfirm, onCancel, }: Props) { + const step1State = + deleteStep === "deleting" + ? "active" + : deleteStep === "resetting" + ? "done" + : "pending"; + + const step2State = + deleteStep === "resetting" ? "active" : "pending"; + return ( - !v && onCancel()}> - + !v && !isLoading && onCancel()}> + -
-
- - - Delete Model & Reset Card +
+ {isLoading ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+ + {isLoading + ? deleteStep === "deleting" + ? "Removing model…" + : "Resetting board…" + : "Delete Model & Reset Card"} + {isLoading && ( +

+ Step {deleteStep === "deleting" ? "1" : "2"} of 2 — do not close this window +

+ )}
-
- -
-
- Warning! This action will stop and remove the model, then reset - the card. -
-
- Deleting a model will attempt to stop and remove the model - container. -
- After deletion, the card will automatically be reset using{" "} - tt-smi reset. -
- - This may interrupt any ongoing processes on the card. - -
-
+ +
+ {/* Step 1 */} + } + label="Stop & remove model container" + sublabel="Sending stop signal to the container…" + state={step1State} + /> + + {/* Step 2 */} + } + label="Reset the board" + sublabel="Running tt-smi -r, this may take 10–30 seconds…" + state={step2State} + />
- + + {/* Warning — only shown when idle */} + {!isLoading && ( +
+ + + Warning: This will + interrupt any ongoing processes on the card and cannot be undone. + +
+ )} + + diff --git a/app/frontend/src/components/models/ModelsDeployedCard.tsx b/app/frontend/src/components/models/ModelsDeployedCard.tsx index 2f68ca20..b9e437c7 100644 --- a/app/frontend/src/components/models/ModelsDeployedCard.tsx +++ b/app/frontend/src/components/models/ModelsDeployedCard.tsx @@ -30,7 +30,7 @@ import type { } from "../../types/models"; import ModelsToolbar from "./ModelsToolbar.tsx"; import ModelsTable from "./ModelsTable.tsx"; -import DeleteModelDialog from "./DeleteModelDialog.tsx"; +import DeleteModelDialog, { type DeleteStep } from "./DeleteModelDialog.tsx"; import LogStreamDialog from "./Logs/LogStreamDialog.tsx"; import { useNavigate } from "react-router-dom"; import { useTablePrefs } from "../../hooks/useTablePrefs"; @@ -131,6 +131,7 @@ export default function ModelsDeployedCard(): JSX.Element { const [showDeleteModal, setShowDeleteModal] = useState(false); const [deleteTargetId, setDeleteTargetId] = useState(null); const [isProcessingDelete, setIsProcessingDelete] = useState(false); + const [deleteStep, setDeleteStep] = useState(null); useEffect(() => { loadModels(); @@ -150,30 +151,28 @@ export default function ModelsDeployedCard(): JSX.Element { setIsProcessingDelete(true); const truncatedModelId = deleteTargetId.substring(0, 4); try { + // Step 1: stop & remove the model (backend also runs tt-smi -r internally) + setDeleteStep("deleting"); await customToast.promise(deleteModel(deleteTargetId), { - loading: `Attempting to delete Model ID: ${truncatedModelId}...`, - success: `Model ID: ${truncatedModelId} has been deleted.`, - error: `Failed to delete Model ID: ${truncatedModelId}.`, + loading: `Stopping model ${truncatedModelId}…`, + success: `Model ${truncatedModelId} stopped.`, + error: `Failed to stop model ${truncatedModelId}.`, }); - // Simulate resetCard same as original placeholder - await customToast.promise( - new Promise((resolve) => window.setTimeout(resolve, 2000)), - { - loading: "Resetting card (tt-smi reset)...", - 
success: "Card reset successfully!", - error: "Failed to reset card.", - } - ); + + // Step 2: board reset is handled by the stop API, show progress while cleanup settles + setDeleteStep("resetting"); + await new Promise((resolve) => window.setTimeout(resolve, 2000)); + await refreshModels(); triggerHardwareRefresh(); setShowDeleteModal(false); setDeleteTargetId(null); - // Slight delay then refresh health window.setTimeout(() => { refreshAllHealth(); }, 1000); } finally { setIsProcessingDelete(false); + setDeleteStep(null); } }, [deleteTargetId, refreshModels, triggerHardwareRefresh, refreshAllHealth]); @@ -353,8 +352,9 @@ export default function ModelsDeployedCard(): JSX.Element { open={showDeleteModal} modelId={deleteTargetId || ""} isLoading={isProcessingDelete} + deleteStep={deleteStep} onConfirm={handleConfirmDelete} - onCancel={() => setShowDeleteModal(false)} + onCancel={() => !isProcessingDelete && setShowDeleteModal(false)} /> ); diff --git a/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx new file mode 100644 index 00000000..fcdf11dc --- /dev/null +++ b/app/frontend/src/components/pipeline/VoicePipelineDemo.tsx @@ -0,0 +1,378 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC + +import { useEffect, useRef, useState } from "react"; +import { Mic, Square, Volume2, CheckCircle, Loader2, Circle } from "lucide-react"; +import { Button } from "../ui/button"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "../ui/select"; +import { runVoicePipeline } from "../../api/modelsDeployedApis"; +import { customToast } from "../CustomToaster"; + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +interface DeployedModelInfo { + id: string; + modelName: string; + model_type?: string; +} + +type 
PipelineStage = "idle" | "recording" | "stt" | "llm" | "tts" | "done"; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +async function fetchDeployedByType( + modelType: string +): Promise { + try { + const res = await fetch("/models-api/deployed/"); + if (!res.ok) return []; + const data = await res.json(); + return Object.entries(data) + .map(([id, info]: [string, any]) => ({ + id, + modelName: + info.model_impl?.model_name || + info.model_impl?.hf_model_id || + "Unknown", + model_type: info.model_impl?.model_type, + })) + .filter((m) => m.model_type === modelType); + } catch { + return []; + } +} + +// --------------------------------------------------------------------------- +// Stage indicator +// --------------------------------------------------------------------------- + +const STAGES: { key: PipelineStage; label: string }[] = [ + { key: "recording", label: "Mic" }, + { key: "stt", label: "Whisper" }, + { key: "llm", label: "LLM" }, + { key: "tts", label: "TTS" }, +]; + +const STAGE_ORDER: Record = { + idle: -1, + recording: 0, + stt: 1, + llm: 2, + tts: 3, + done: 4, +}; + +function StageIndicator({ current }: { current: PipelineStage }) { + return ( +
+ {STAGES.map((s, i) => { + const order = STAGE_ORDER[s.key]; + const currentOrder = STAGE_ORDER[current]; + const isDone = currentOrder > order; + const isActive = current === s.key; + + return ( +
+ {i > 0 && ( +
+ )} +
+ {isDone ? ( + + ) : isActive ? ( + + ) : ( + + )} + + {s.label} + +
+
+ ); + })} +
+ ); +} + +// --------------------------------------------------------------------------- +// Main component +// --------------------------------------------------------------------------- + +export default function VoicePipelineDemo() { + // Model dropdowns + const [sttModels, setSttModels] = useState([]); + const [llmModels, setLlmModels] = useState([]); + const [ttsModels, setTtsModels] = useState([]); + + const [whisperDeployId, setWhisperDeployId] = useState(""); + const [llmDeployId, setLlmDeployId] = useState(""); + const [ttsDeployId, setTtsDeployId] = useState(""); + + // Recording + const [isRecording, setIsRecording] = useState(false); + const mediaRecorderRef = useRef(null); + const chunksRef = useRef([]); + + // Pipeline state + const [stage, setStage] = useState("idle"); + const [transcript, setTranscript] = useState(""); + const [llmResponse, setLlmResponse] = useState(""); + const [audioUrl, setAudioUrl] = useState(null); + const audioRef = useRef(null); + + // Fetch deployed models on mount + useEffect(() => { + Promise.all([ + fetchDeployedByType("speech_recognition"), + fetchDeployedByType("chat"), + fetchDeployedByType("tts"), + ]).then(([stt, llm, tts]) => { + setSttModels(stt); + setLlmModels(llm); + setTtsModels(tts); + if (stt.length > 0) setWhisperDeployId(stt[0].id); + if (llm.length > 0) setLlmDeployId(llm[0].id); + if (tts.length > 0) setTtsDeployId(tts[0].id); + }); + }, []); + + const startRecording = async () => { + try { + const stream = await navigator.mediaDevices.getUserMedia({ audio: true }); + const mr = new MediaRecorder(stream); + chunksRef.current = []; + mr.ondataavailable = (e) => chunksRef.current.push(e.data); + mr.start(); + mediaRecorderRef.current = mr; + setIsRecording(true); + setStage("recording"); + setTranscript(""); + setLlmResponse(""); + setAudioUrl(null); + } catch (err) { + customToast.error("Microphone access denied"); + } + }; + + const stopRecording = () => { + const mr = mediaRecorderRef.current; + if 
(!mr) return; + mr.onstop = async () => { + const blob = new Blob(chunksRef.current, { type: "audio/webm" }); + const file = new File([blob], "recording.webm", { type: "audio/webm" }); + await runPipeline(file); + }; + mr.stop(); + mr.stream.getTracks().forEach((t) => t.stop()); + setIsRecording(false); + }; + + const runPipeline = async (audioFile: File) => { + if (!whisperDeployId || !llmDeployId) { + customToast.error("Please select STT and LLM models"); + setStage("idle"); + return; + } + + setStage("stt"); + let llmText = ""; + + await runVoicePipeline( + { + audioFile, + whisperDeployId, + llmDeployId, + ttsDeployId: ttsDeployId || undefined, + }, + // onTranscript + (text) => { + setTranscript(text); + setStage("llm"); + }, + // onLlmChunk + (chunk) => { + llmText += chunk; + setLlmResponse((prev) => prev + chunk); + }, + // onAudio + (url) => { + setAudioUrl(url); + setStage("tts"); + // Auto-play + setTimeout(() => { + if (audioRef.current) { + audioRef.current.src = url; + audioRef.current.play().catch(() => {}); + } + }, 100); + }, + // onError + (stage, message) => { + customToast.error(`Pipeline error (${stage}): ${message}`); + setStage("idle"); + }, + // onDone + () => { + setStage("done"); + } + ); + }; + + return ( +
+

+ Voice Pipeline Demo +

+

+ Mic → Whisper STT → LLM → TTS → Speaker +

+ + {/* Model selectors */} +
+
+ + +
+ +
+ + +
+ +
+ + +
+
+ + {/* Stage indicator */} +
+ +
+ + {/* Record button */} +
+ {isRecording ? ( + + ) : ( + + )} +
+ + {/* Outputs */} + {transcript && ( +
+

+ Transcript +

+

+ {transcript} +

+
+ )} + + {llmResponse && ( +
+

+ LLM Response +

+

+ {llmResponse} +

+
+ )} + + {audioUrl && ( +
+ +
+ )} + + {/* Hidden audio element for autoplay */} + {!audioUrl &&
+  );
+}
diff --git a/app/frontend/src/contexts/DeviceStateContext.ts b/app/frontend/src/contexts/DeviceStateContext.ts
new file mode 100644
index 00000000..a9768597
--- /dev/null
+++ b/app/frontend/src/contexts/DeviceStateContext.ts
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { createContext } from "react";
+
+export type DeviceState =
+  | "HEALTHY"
+  | "BAD_STATE"
+  | "RESETTING"
+  | "NOT_PRESENT"
+  | "UNKNOWN";
+
+export interface DeviceInfo {
+  index: number;
+  board_type: string;
+  bus_id: string;
+  temperature: number;
+  power: number;
+  voltage: number;
+}
+
+export interface DeviceStateData {
+  state: DeviceState;
+  board_type: string;
+  board_name: string;
+  devices: DeviceInfo[];
+  last_updated: string;
+  reset_suggested: boolean;
+}
+
+export interface DeviceStateContextType {
+  deviceState: DeviceStateData | null;
+  loading: boolean;
+  error: string | null;
+  /** Immediately re-fetch device state and reschedule polling. */
+  refresh: () => void;
+}
+
+export const DeviceStateContext = createContext<
+  DeviceStateContextType | undefined
+>(undefined);
diff --git a/app/frontend/src/contexts/ModelsContext.ts b/app/frontend/src/contexts/ModelsContext.ts
index 57dd2202..c7c64e62 100644
--- a/app/frontend/src/contexts/ModelsContext.ts
+++ b/app/frontend/src/contexts/ModelsContext.ts
@@ -11,6 +11,7 @@ export interface Model {
   status: string;
   health: string;
   ports: string;
+  model_type?: string;
 }
 
 export interface ModelsContextType {
diff --git a/app/frontend/src/hooks/useDeviceState.ts b/app/frontend/src/hooks/useDeviceState.ts
new file mode 100644
index 00000000..af0eee89
--- /dev/null
+++ b/app/frontend/src/hooks/useDeviceState.ts
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import { useContext } from "react";
+import { DeviceStateContext } from "../contexts/DeviceStateContext";
+
+export const useDeviceState = () => {
+  const context = useContext(DeviceStateContext);
+  if (context === undefined) {
+    throw new Error("useDeviceState must be used within a DeviceStateProvider");
+  }
+  return context;
+};
diff --git a/app/frontend/src/pages/VoicePipelinePage.tsx b/app/frontend/src/pages/VoicePipelinePage.tsx
new file mode 100644
index 00000000..de5c2b54
--- /dev/null
+++ b/app/frontend/src/pages/VoicePipelinePage.tsx
@@ -0,0 +1,8 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import VoicePipelineDemo from "../components/pipeline/VoicePipelineDemo";
+
+export default function VoicePipelinePage() {
+  return <VoicePipelineDemo />;
+}
diff --git a/app/frontend/src/providers/DeviceStateContext.tsx b/app/frontend/src/providers/DeviceStateContext.tsx
new file mode 100644
index 00000000..9da7d048
--- /dev/null
+++ b/app/frontend/src/providers/DeviceStateContext.tsx
@@ -0,0 +1,93 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: © 2026 Tenstorrent AI ULC
+
+import React, { useState, useCallback, useEffect, useRef } from "react";
+import {
+  DeviceStateContext,
+  type DeviceStateData,
+} from "../contexts/DeviceStateContext";
+
+/**
+ * Adaptive poll intervals by device state.
+ * Fast polling during recovery states so the UI updates promptly.
+ */
+const POLL_INTERVALS: Record<string, number> = {
+  HEALTHY: 30_000,
+  BAD_STATE: 5_000,
+  RESETTING: 2_000,
+  NOT_PRESENT: 30_000,
+  UNKNOWN: 10_000,
+};
+
+export const DeviceStateProvider: React.FC<{ children: React.ReactNode }> = ({
+  children,
+}) => {
+  const [deviceState, setDeviceState] = useState<DeviceStateData | null>(null);
+  const [loading, setLoading] = useState(true);
+  const [error, setError] = useState<string | null>(null);
+
+  // Store the current state in a ref so the scheduled callback always reads
+  // the latest value without creating stale closures.
+  const stateRef = useRef<string>("UNKNOWN");
+  const timerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
+  // pollRef lets us call poll() from the refresh callback without circular deps.
+  const pollRef = useRef<() => Promise<void>>(async () => {});
+
+  const scheduleNext = useCallback(() => {
+    if (timerRef.current) clearTimeout(timerRef.current);
+    const interval = POLL_INTERVALS[stateRef.current] ?? 10_000;
+    timerRef.current = setTimeout(() => pollRef.current(), interval);
+  }, []);
+
+  useEffect(() => {
+    // Flipped by the cleanup below. A request that is still in flight when the
+    // provider unmounts must neither touch state nor re-arm the timer that the
+    // cleanup has just cleared; otherwise polling would continue forever.
+    let cancelled = false;
+    const controller = new AbortController();
+
+    const poll = async () => {
+      try {
+        const response = await fetch("/board-api/device-state/", {
+          signal: controller.signal,
+        });
+        if (!response.ok)
+          throw new Error(`HTTP ${response.status}: ${response.statusText}`);
+        const data: DeviceStateData = await response.json();
+        if (cancelled) return;
+        stateRef.current = data.state;
+        setDeviceState(data);
+        setError(null);
+      } catch (err) {
+        if (cancelled) return;
+        setError(err instanceof Error ? err.message : "Unknown error");
+      } finally {
+        if (!cancelled) {
+          setLoading(false);
+          scheduleNext();
+        }
+      }
+    };
+
+    pollRef.current = poll;
+    poll();
+
+    return () => {
+      cancelled = true;
+      controller.abort();
+      if (timerRef.current) clearTimeout(timerRef.current);
+    };
+    // eslint-disable-next-line react-hooks/exhaustive-deps
+  }, []);
+
+  const refresh = useCallback(() => {
+    if (timerRef.current) clearTimeout(timerRef.current);
+    pollRef.current();
+  }, []);
+
+  return (
+    <DeviceStateContext.Provider value={{ deviceState, loading, error, refresh }}>
+      {children}
+    </DeviceStateContext.Provider>
+  );
+};
diff --git a/app/frontend/src/providers/ModelsContext.tsx b/app/frontend/src/providers/ModelsContext.tsx
index 0c02fef9..ce2ce1bf 100644
--- a/app/frontend/src/providers/ModelsContext.tsx
+++ b/app/frontend/src/providers/ModelsContext.tsx
@@ -39,6 +39,7 @@ export const ModelsProvider: React.FC<{ children: React.ReactNode }> = ({
             status: dockerModel?.status || "deployed",
             health: dockerModel?.health || "unknown",
             ports: dockerModel?.ports || "No ports",
+            model_type: deployedModel.model_type,
           };
         });
 
diff --git a/app/frontend/src/routes/index.tsx b/app/frontend/src/routes/index.tsx
index 7627de6f..0bd5b45e 100644
--- a/app/frontend/src/routes/index.tsx
+++ b/app/frontend/src/routes/index.tsx
@@ -4,6 +4,7 @@
 import { BrowserRouter as Router, Routes, Route } from "react-router-dom";
 import { RefreshProvider } from "../providers/RefreshContext";
 import { ModelsProvider } from "../providers/ModelsContext";
+import { DeviceStateProvider } from "../providers/DeviceStateContext";
 import { getRoutes } from "./route-config";
 import { MainLayout } from "../layouts/MainLayout";
 
@@ -18,23 +19,25 @@ const AppRouter = () => {
   );
 
   return (
-    <RefreshProvider>
-      <ModelsProvider>
-        <Router>
-          <Routes>
-            {routes
-              .filter((route) => route.condition !== false)
-              .map((route) => (
-                <Route
-                  key={route.path}
-                  path={route.path}
-                  element={<MainLayout>{route.element}</MainLayout>}
-                />
-              ))}
-          </Routes>
-        </Router>
-      </ModelsProvider>
-    </RefreshProvider>
+    <DeviceStateProvider>
+      <RefreshProvider>
+        <ModelsProvider>
+          <Router>
+            <Routes>
+              {routes
+                .filter((route) => route.condition !== false)
+                .map((route) => (
+                  <Route
+                    key={route.path}
+                    path={route.path}
+                    element={<MainLayout>{route.element}</MainLayout>}
+                  />
+                ))}
+            </Routes>
+          </Router>
+        </ModelsProvider>
+      </RefreshProvider>
+    </DeviceStateProvider>
   );
 };
 
diff --git a/app/frontend/src/routes/route-config.tsx b/app/frontend/src/routes/route-config.tsx
index 7225f053..7f88ea73 100644
--- 
a/app/frontend/src/routes/route-config.tsx
+++ b/app/frontend/src/routes/route-config.tsx
@@ -52,6 +52,7 @@ import ImageGenPage from "../pages/ImageGenPage";
 import AudioDetectionPage from "../pages/AudioDetectionPage";
 import ApiInfoPage from "../pages/ApiInfoPage";
 import DeploymentHistoryPage from "../pages/DeploymentHistoryPage";
+import VoicePipelinePage from "../pages/VoicePipelinePage";
 
 // Define route configuration type
 export interface RouteConfig {
@@ -123,6 +124,11 @@ export const getRoutes = (): RouteConfig[] => {
       element: <DeploymentHistoryPage />,
       condition: true,
     },
+    {
+      path: "/voice-pipeline",
+      element: <VoicePipelinePage />,
+      condition: true,
+    },
     {
       // catch all for all other routes
       path: "*",
diff --git a/run.py b/run.py
index 4b22f218..7a1f1777 100644
--- a/run.py
+++ b/run.py
@@ -1248,27 +1248,28 @@ def wait_for_all_services(skip_fastapi=False, is_deployed_mode=False):
         print("\n⚠️ Some services may not be fully ready, but main app may still be accessible.")
     return all_healthy
 
-def wait_for_frontend_and_open_browser(host="localhost", port=3000, timeout=60, auto_deploy_model=None):
+def wait_for_frontend_and_open_browser(host="localhost", port=3000, timeout=60, auto_deploy_model=None, device_id=0):
     """
     Wait for frontend service to be healthy before opening browser.
-    
+
     Args:
         host: Frontend host
         port: Frontend port
         timeout: Timeout in seconds
         auto_deploy_model: Model name to auto-deploy (optional)
-    
+        device_id: Chip slot index for auto-deploy (default 0)
+
     Returns:
         bool: True if browser opened successfully, False otherwise
     """
     base_url = f"http://{host}:{port}/"
-    
+
     # Add auto-deploy parameter if specified
     if auto_deploy_model:
         from urllib.parse import urlencode
-        params = urlencode({"auto-deploy": auto_deploy_model})
+        params = urlencode({"auto-deploy": auto_deploy_model, "device-id": device_id})
         frontend_url = f"{base_url}?{params}"
-        print(f"\n🤖 Auto-deploying model: {auto_deploy_model}")
+        print(f"\n🤖 Auto-deploying model: {auto_deploy_model} on chip {device_id}")
     else:
         frontend_url = base_url
 
@@ -2220,6 +2221,41 @@ def handle_remove_readonly(func, path, exc):
         print(f" See: https://github.com/tenstorrent/tt-inference-server/releases")
         return False
 
+def _sync_model_catalog():
+    """Regenerate models_from_inference_server.json from the downloaded artifact.
+
+    Best-effort: the sync script is run with the current interpreter and a
+    30 s timeout. A missing script, a non-zero exit or any exception is
+    reported as a warning instead of being raised.
+    """
+    sync_script = os.path.join(TT_STUDIO_ROOT, "app", "backend", "shared_config", "sync_models_from_inference_server.py")
+    if not os.path.exists(sync_script):
+        print(f"{C_YELLOW}⚠️ Model catalog sync script not found at {sync_script}, skipping.{C_RESET}")
+        return
+    print(f"\n{C_CYAN}🔄 Syncing model catalog from artifact...{C_RESET}")
+    try:
+        result = subprocess.run(
+            [sys.executable, sync_script],
+            capture_output=True, text=True, timeout=30,
+        )
+        if result.returncode == 0:
+            print(f"{C_GREEN}✅ Model catalog synced.{C_RESET}")
+            if result.stdout.strip():
+                for line in result.stdout.strip().splitlines():
+                    print(f" {line}")
+            print(f"{C_YELLOW}💡 Reminder: commit app/backend/shared_config/models_from_inference_server.json")
+            print(f" so CI/CD Docker image builds use the updated catalog.{C_RESET}")
+        else:
+            print(f"{C_YELLOW}⚠️ Model catalog sync exited with code {result.returncode}:{C_RESET}")
+            # Some scripts report errors on stdout; fall back to it so the
+            # failure is never silent.
+            details = result.stderr.strip() or result.stdout.strip()
+            if details:
+                print(details[:500])
+    except Exception as e:
+        print(f"{C_YELLOW}⚠️ Model catalog sync failed: {e}{C_RESET}")
+
+
 def setup_fastapi_environment():
     """Set up the inference-api FastAPI environment."""
     print(f"🔧 Setting up inference-api environment...")
@@ -3485,6 +3521,8 @@ def main():
                         help="🔍 Check for missing SPDX license headers without adding them")
     parser.add_argument("--auto-deploy", type=str, metavar="MODEL_NAME",
                         help="🤖 Automatically deploy the specified model after startup (e.g., 'Llama-3.2-1B-Instruct')")
+    parser.add_argument("--device-id", type=int, default=0, choices=range(8), metavar="CHIP_ID",
+                        help="🔌 Chip slot index (0-7) to use when auto-deploying a model (default: 0)")
     parser.add_argument("--fix-docker", action="store_true",
                         help="🔧 Automatically fix Docker service and permission issues")
     parser.add_argument("--easy", action="store_true",
@@ -3796,6 +3834,8 @@ def main():
                 if not setup_tt_inference_server():
                     print(f"{C_RED}⛔ Failed to setup TT Inference Server. Continuing without FastAPI server.{C_RESET}")
                 else:
+                    # Sync model catalog from the newly downloaded artifact
+                    _sync_model_catalog()
                     # Setup FastAPI environment
                     if not setup_fastapi_environment():
                         print(f"{C_RED}⛔ Failed to setup FastAPI environment. Continuing without FastAPI server.{C_RESET}")
@@ -3899,12 +3939,14 @@ def main():
             host, port, timeout = get_frontend_config()
 
             # Use the new function that reuses existing infrastructure
-            if not wait_for_frontend_and_open_browser(host, port, timeout, args.auto_deploy):
-                auto_deploy_param = f"?auto-deploy={args.auto_deploy}" if args.auto_deploy else ""
+            device_id_val = getattr(args, "device_id", 0)
+            if not wait_for_frontend_and_open_browser(host, port, timeout, args.auto_deploy, device_id=device_id_val):
+                auto_deploy_param = f"?auto-deploy={args.auto_deploy}&device-id={device_id_val}" if args.auto_deploy else ""
                 print(f"{C_YELLOW}⚠️ Browser opening failed. Please manually navigate to http://{host}:{port}{auto_deploy_param}{C_RESET}")
         else:
             host, port, _ = get_frontend_config()
-            auto_deploy_param = f"?auto-deploy={args.auto_deploy}" if args.auto_deploy else ""
+            device_id_val = getattr(args, "device_id", 0)
+            auto_deploy_param = f"?auto-deploy={args.auto_deploy}&device-id={device_id_val}" if args.auto_deploy else ""
             print(f"{C_BLUE}🌐 Automatic browser opening disabled. Access TT-Studio at: {C_CYAN}http://{host}:{port}{auto_deploy_param}{C_RESET}")
 
         # If in dev mode, show logs similar to startup.sh
diff --git a/tt-inference-server b/tt-inference-server
new file mode 160000
index 00000000..ac1892b7
--- /dev/null
+++ b/tt-inference-server
@@ -0,0 +1 @@
+Subproject commit ac1892b7e69f08e7020031dab3f9a30a0dcbe269