Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,7 @@ inference-api/__pycache__/
CLAUDE.md
docs/RAG_PRODUCTIONIZATION_PLAN.md
docs/DOCKER_CONTROL_SERVICE_PLAN.md
!app/backend/shared_config/models_from_inference_server.json

request-venv/*
app/.env-old
3 changes: 3 additions & 0 deletions app/.env.default
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ TT_INFERENCE_ARTIFACT_VERSION=v0.8.0
# Security Credentials (REQUIRED - keep secret in production!)
JWT_SECRET=test-secret-456
DJANGO_SECRET_KEY=django-insecure-default

# TTS Inference Server API Key (media inference engine)
TTS_API_KEY=your-tts-api-key
HF_TOKEN=hf_***

# Docker Control Service (secure Docker operations API)
Expand Down
2 changes: 1 addition & 1 deletion app/backend/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ RUN if [ "$VITE_ENABLE_DEPLOYED" != "true" ]; then \
. "$HOME/.cargo/env" && \
# Clone and install tt-smi
mkdir -p /opt/tenstorrent-tools && \
git clone https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \
git clone --branch v4.0.0 --depth 1 https://github.com/tenstorrent/tt-smi.git /opt/tenstorrent-tools/tt-smi && \
cd /opt/tenstorrent-tools/tt-smi && \
pip3 install --upgrade pip && \
pip3 install . && \
Expand Down
39 changes: 0 additions & 39 deletions app/backend/api/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,6 @@
# Application definition

INSTALLED_APPS = [
"django.contrib.admin",
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"docker_control.apps.DockerControlConfig",
"model_control",
Expand All @@ -81,11 +76,8 @@
MIDDLEWARE = [
"corsheaders.middleware.CorsMiddleware",
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"django.middleware.clickjacking.XFrameOptionsMiddleware",
]

Expand All @@ -100,25 +92,12 @@
"context_processors": [
"django.template.context_processors.debug",
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
],
},
},
]

WSGI_APPLICATION = "api.wsgi.application"
SESSIONS_ENGINE = "django.contrib.sessions.backends.cache"
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases

# SQLite database for deployment history and other persistent data
DATABASES = {
"default": {
"ENGINE": "django.db.backends.sqlite3",
"NAME": backend_config.backend_cache_root / "db.sqlite3",
}
}

# local memory thread-safe default
# the LOCATION for locmem.LocMemCache cache backend is just a name for tracking
Expand All @@ -135,24 +114,6 @@
},
}

# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
{
"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator",
},
{
"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator",
},
{
"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator",
},
{
"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator",
},
]

# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/

Expand Down
5 changes: 3 additions & 2 deletions app/backend/api/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,18 @@
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""

from django.contrib import admin
from api.views import UpStatusView
from django.urls import include, path
from model_control.views import OpenAIAudioSpeechView

urlpatterns = [
path("admin/", admin.site.urls),
path("up/", UpStatusView.as_view()),
path("docker/", include("docker_control.urls")),
path("models/", include("model_control.urls")),
path("reset_board/", include("docker_control.urls")),
path("collections/", include("vector_db_control.urls")),
path("logs/", include("logs_control.urls")),
path("board/", include("board_control.urls")),
# OpenAI-compatible audio endpoint
path("v1/audio/speech", OpenAIAudioSpeechView.as_view()),
]
250 changes: 245 additions & 5 deletions app/backend/board_control/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,19 @@

class SystemResourceService:
"""Service for monitoring system resources and TT device telemetry"""

# Cache keys and timeout
TT_SMI_CACHE_KEY = "tt_smi_data"
TT_SMI_CACHE_TIMEOUT = 3600 # Cache for 1 hour (since we'll refresh on events only)
BOARD_TYPE_CACHE_KEY = "board_type_data"
BOARD_TYPE_CACHE_TIMEOUT = 3600 # Cache board type for 1 hour (since it rarely changes)

# Device state cache keys
DEVICE_STATE_CACHE_KEY = "device_state_v2"
DEVICE_RESETTING_KEY = "device_resetting"

@staticmethod
def get_tt_smi_data(timeout=10):
def get_tt_smi_data(timeout=30):
"""Get raw tt-smi data with caching to reduce expensive calls"""
# Check cache first
cached_data = cache.get(SystemResourceService.TT_SMI_CACHE_KEY)
Expand Down Expand Up @@ -412,9 +416,245 @@ def force_refresh_tt_smi_cache():
# Clear the existing cache
cache.delete(SystemResourceService.TT_SMI_CACHE_KEY)
cache.delete(SystemResourceService.BOARD_TYPE_CACHE_KEY)

# Fetch fresh data
SystemResourceService.get_tt_smi_data()
SystemResourceService.get_board_type()

logger.info("tt-smi cache refreshed successfully")

logger.info("tt-smi cache refreshed successfully")

# -------------------------------------------------------------------------
# Device State Machine — single source of truth
# -------------------------------------------------------------------------

@staticmethod
def _extract_board_type_from_data(data):
    """Extract canonical board-type string from tt-smi JSON data.

    Args:
        data: Parsed tt-smi snapshot. Only ``data["device_info"]`` (a list of
            per-device dicts) and each entry's ``board_info["board_type"]``
            are read.

    Returns:
        A canonical board name chosen from the family marker found in the
        reported type and the number of ``device_info`` entries (for example
        "N150", "N150X4", "N300", "T3K", "P150X4", "GALAXY"), or "unknown"
        when the data is empty, the devices report different types, or the
        type string matches no known family.
    """
    if not data or "device_info" not in data or not data["device_info"]:
        return "unknown"

    device_info = data["device_info"]

    families = set()
    for info in device_info:
        # Null/malformed entries are treated like a missing board type
        # instead of raising AttributeError.
        board_info = (info.get("board_info") if isinstance(info, dict) else None) or {}
        board_type = board_info.get("board_type", "unknown")
        if not isinstance(board_type, str):
            board_type = "unknown"
        # Drop the trailing space-separated word (the local/remote suffix);
        # strings without a space are left unchanged.
        # NOTE(review): the last word is dropped unconditionally, so a
        # multi-word type without a suffix also loses its last word —
        # confirm against real tt-smi output.
        families.add(board_type.rsplit(" ", 1)[0])

    if len(families) > 1:
        logger.warning(f"Mixed board types detected: {families}")
        return "unknown"

    raw = families.pop()
    raw_lower = raw.lower()
    num_devices = len(device_info)

    # (family marker, ((minimum device count, canonical name), ...)).
    # Markers are checked in this order and tiers from largest to smallest;
    # the final tier of every family always matches.
    family_tiers = (
        ("n150", ((4, "N150X4"), (0, "N150"))),
        ("n300", ((4, "T3K"), (0, "N300"))),
        ("p300", ((8, "P300Cx4"), (4, "P300Cx2"), (0, "P300c"))),
        ("p150", ((8, "P150X8"), (4, "P150X4"), (0, "P150"))),
        ("p100", ((0, "P100"),)),
        ("e150", ((0, "E150"),)),
    )
    for marker, tiers in family_tiers:
        if marker in raw_lower:
            for minimum, name in tiers:
                if num_devices >= minimum:
                    return name

    if "galaxy" in raw_lower:
        return "GALAXY_T3K" if "t3k" in raw_lower else "GALAXY"

    logger.warning(f"Unknown board type string: {raw!r}")
    return "unknown"

@staticmethod
def _extract_devices_from_data(data):
    """Extract device summary list from tt-smi JSON data.

    Args:
        data: Parsed tt-smi snapshot. Only ``data["device_info"]`` and each
            entry's ``board_info`` / ``telemetry`` dicts are read.

    Returns:
        A list with one dict per ``device_info`` entry, holding its position
        (``index``), ``board_type``, ``bus_id`` and the ``temperature``,
        ``power`` and ``voltage`` readings as floats. Returns an empty list
        when the data is empty or ``device_info`` is missing, null or empty.
    """

    def _as_float(value):
        """Coerce a telemetry reading to float; None or unparsable gives 0.0."""
        try:
            return float(value) if value is not None else 0.0
        except (TypeError, ValueError):
            return 0.0

    # Same guard as _extract_board_type_from_data: a null or empty
    # device_info means there is nothing to summarise.
    if not data or "device_info" not in data or not data["device_info"]:
        return []

    devices = []
    for idx, device in enumerate(data["device_info"]):
        # "or {}" also covers keys that are present but null.
        board_info = device.get("board_info") or {}
        telemetry = device.get("telemetry") or {}
        devices.append({
            "index": idx,
            "board_type": board_info.get("board_type", "Unknown"),
            "bus_id": board_info.get("bus_id", "N/A"),
            "temperature": _as_float(telemetry.get("asic_temperature")),
            "power": _as_float(telemetry.get("power")),
            "voltage": _as_float(telemetry.get("voltage")),
        })
    return devices

@staticmethod
def _build_device_state(state, board_name, board_type="unknown", devices=None, reset_suggested=False):
    """Build the payload dict returned by get_device_state(), stamped with the current time."""
    return {
        "state": state,
        "board_type": board_type,
        "board_name": board_name,
        "devices": [] if devices is None else devices,
        "last_updated": timezone.now().isoformat(),
        "reset_suggested": reset_suggested,
    }

@staticmethod
def _cached_bad_state():
    """Build a BAD_STATE payload, cache it for 10 seconds and return it."""
    result = SystemResourceService._build_device_state(
        "BAD_STATE", "Bad State", reset_suggested=True
    )
    cache.set(SystemResourceService.DEVICE_STATE_CACHE_KEY, result, timeout=10)
    return result

@staticmethod
def _stop_process_group(process, grace=2):
    """Terminate a timed-out child's process group and reap the child.

    Sends SIGTERM, then SIGKILL, waiting up to ``grace`` seconds after each
    signal. The child is collected with communicate() so it is not left as a
    zombie and its pipes are closed.
    """
    for sig in (signal.SIGTERM, signal.SIGKILL):
        try:
            os.killpg(os.getpgid(process.pid), sig)
        except ProcessLookupError:
            # The group has already exited; it still has to be reaped below.
            pass
        except OSError as exc:
            logger.warning(f"Failed to send {sig.name} to tt-smi process group: {exc}")
        try:
            process.communicate(timeout=grace)
            return
        except subprocess.TimeoutExpired:
            continue
    logger.error("tt-smi did not exit after SIGKILL; giving up on reaping it")

@staticmethod
def get_device_state():
    """
    Single authoritative device state resolver.

    States:
    HEALTHY — tt-smi -s succeeded and its JSON output was parsed
    BAD_STATE — /dev/tenstorrent present but tt-smi timed out, exited
                non-zero or printed invalid JSON
    RESETTING — the resetting flag is set (see set_resetting_state)
    NOT_PRESENT — /dev/tenstorrent path does not exist
    UNKNOWN — can't determine (tt-smi missing or unexpected error)

    Returns:
        dict with keys ``state``, ``board_type``, ``board_name``,
        ``devices``, ``last_updated`` (ISO timestamp) and
        ``reset_suggested`` (True only for BAD_STATE).

    Caching: HEALTHY results are cached for 30 seconds, NOT_PRESENT for 15
    and BAD_STATE for 10. RESETTING and UNKNOWN are never cached.
    """
    svc = SystemResourceService

    # RESETTING takes priority — check before cache
    if cache.get(svc.DEVICE_RESETTING_KEY):
        return svc._build_device_state("RESETTING", "Resetting…")

    # Return cached result if still fresh
    cached = cache.get(svc.DEVICE_STATE_CACHE_KEY)
    if cached is not None:
        return cached

    # Check physical device presence
    if not os.path.exists("/dev/tenstorrent"):
        result = svc._build_device_state("NOT_PRESENT", "Not Present")
        cache.set(svc.DEVICE_STATE_CACHE_KEY, result, timeout=15)
        return result

    # Try tt-smi -s with 30-second timeout (Docker cold-start can be slower than host)
    try:
        logger.info("Running tt-smi -s for device state check")
        process = subprocess.Popen(
            ["tt-smi", "-s"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            stdin=subprocess.DEVNULL,
            text=True,
            # Runs setsid() in the child like the former preexec_fn=os.setsid,
            # which is documented as unsafe when the parent has threads.
            start_new_session=True,
        )

        try:
            stdout, stderr = process.communicate(timeout=30)
        except subprocess.TimeoutExpired:
            logger.error("tt-smi -s timed out after 30s — board in BAD_STATE")
            svc._stop_process_group(process)
            return svc._cached_bad_state()

        if process.returncode != 0:
            logger.error(f"tt-smi -s exit code {process.returncode}: {stderr.strip()!r}")
            return svc._cached_bad_state()

        try:
            data = json.loads(stdout)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse tt-smi JSON: {e}")
            return svc._cached_bad_state()

        board_type = svc._extract_board_type_from_data(data)
        result = svc._build_device_state(
            "HEALTHY",
            board_type,
            board_type=board_type,
            devices=svc._extract_devices_from_data(data),
        )
        cache.set(svc.DEVICE_STATE_CACHE_KEY, result, timeout=30)
        return result

    except FileNotFoundError:
        logger.error("tt-smi command not found")
        # Don't cache UNKNOWN so each call re-checks (tt-smi may be installed later)
        return svc._build_device_state("UNKNOWN", "Unknown")
    except Exception as e:
        # Boundary handler: callers always receive a state dict, never an
        # exception. logger.exception keeps the traceback for diagnosis.
        logger.exception(f"Unexpected error in get_device_state: {e}")
        return svc._build_device_state("UNKNOWN", "Unknown")

@staticmethod
def set_resetting_state():
    """Flag the device as mid-reset and drop any cached device state.

    The flag is stored with a 120-second timeout, so it expires on its own
    if it is never cleared explicitly.
    """
    svc = SystemResourceService
    # Set the flag before invalidating the cached state.
    cache.set(svc.DEVICE_RESETTING_KEY, True, timeout=120)
    cache.delete(svc.DEVICE_STATE_CACHE_KEY)
    logger.info("Device state set to RESETTING")

@staticmethod
def clear_device_state_cache():
    """Remove both the cached device state and the resetting flag.

    After this call the next get_device_state() lookup finds neither entry
    in the cache.
    """
    stale_keys = (
        SystemResourceService.DEVICE_STATE_CACHE_KEY,
        SystemResourceService.DEVICE_RESETTING_KEY,
    )
    for key in stale_keys:
        cache.delete(key)
    logger.info("Device state cache cleared")
Loading
Loading