Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions dream-server/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,26 @@ TOGETHER_API_KEY=
# LLM Settings (llama-server)
# ═══════════════════════════════════════════════════════════════════

# Model GGUF filename (must exist in data/models/)
GGUF_FILE=Qwen3-8B-Q4_K_M.gguf
# Optional model family profile for installer-driven tier selection.
# qwen = keep the current stable DreamServer defaults
# gemma4 = always use Gemma 4 tier mappings when possible
# auto = prefer Gemma 4 on capable hardware, keep Qwen fallback for minimum/cloud paths
MODEL_PROFILE=qwen

# Model GGUF filename override; the file must exist in data/models/ (installer normally rewrites this from tier + MODEL_PROFILE)
GGUF_FILE=Qwen3.5-9B-Q4_K_M.gguf

# Context window size (tokens)
CTX_SIZE=16384

# GPU backend: nvidia or amd
GPU_BACKEND=nvidia

# Model name (used by OpenClaw and dashboard)
LLM_MODEL=qwen3-8b
# Model name override, used by OpenClaw and dashboard (installer normally rewrites this from tier + MODEL_PROFILE)
LLM_MODEL=qwen3.5-9b

# Optional llama.cpp image override (installer sets this automatically for Gemma 4 profiles)
# LLAMA_SERVER_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda-b8648

# llama-server inference tuning (advanced)
# LLAMA_BATCH_SIZE=2048 # Batch size for prompt processing (higher = faster prefill)
Expand Down
14 changes: 14 additions & 0 deletions dream-server/.env.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@
"description": "Inference backend: llama-server or lemonade",
"default": "llama-server"
},
"MODEL_PROFILE": {
"type": "string",
"description": "Tier-aware local model family selection: qwen, gemma4, or auto",
"enum": [
"qwen",
"gemma4",
"auto"
],
"default": "qwen"
},
"LLM_API_BASE_PATH": {
"type": "string",
"description": "Base API path for the inference backend",
Expand Down Expand Up @@ -109,6 +119,10 @@
"type": "string",
"description": "Model name used by OpenClaw and dashboard"
},
"LLAMA_SERVER_IMAGE": {
"type": "string",
"description": "Optional llama.cpp container image override for model families that require newer runtime support"
},
"TIER": {
"type": "string",
"description": "Hardware tier (1, 2, 3, 4, CLOUD, SH_COMPACT, SH_LARGE, NV_ULTRA)"
Expand Down
67 changes: 54 additions & 13 deletions dream-server/extensions/services/dashboard-api/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -325,21 +325,62 @@ def get_model_info() -> Optional[ModelInfo]:
env_path = Path(INSTALL_DIR) / ".env"
if env_path.exists():
try:
env_values = {}
with open(env_path) as f:
for line in f:
if line.startswith("LLM_MODEL="):
model_name = line.split("=", 1)[1].strip().strip('"\'')
size_gb, context, quant = 15.0, 32768, None
import re as _re
name_lower = model_name.lower()
if _re.search(r'\b7b\b', name_lower): size_gb = 4.0
elif _re.search(r'\b14b\b', name_lower): size_gb = 8.0
elif _re.search(r'\b32b\b', name_lower): size_gb = 16.0
elif _re.search(r'\b70b\b', name_lower): size_gb = 35.0
if "awq" in name_lower: quant = "AWQ"
elif "gptq" in name_lower: quant = "GPTQ"
elif "gguf" in name_lower: quant = "GGUF"
return ModelInfo(name=model_name, size_gb=size_gb, context_length=context, quantization=quant)
if "=" not in line or line.lstrip().startswith("#"):
continue
key, value = line.split("=", 1)
env_values[key.strip()] = value.strip().strip('"\'')

model_name = env_values.get("LLM_MODEL")
if model_name:
size_gb, quant = 15.0, None
context = int(env_values.get("MAX_CONTEXT") or env_values.get("CTX_SIZE") or 32768)

import re as _re

name_lower = model_name.lower()
if "gemma-4-e2b" in name_lower:
size_gb = 2.8
elif "gemma-4-e4b" in name_lower:
size_gb = 5.3
elif "gemma-4-26b" in name_lower:
size_gb = 18.0
elif "gemma-4-31b" in name_lower:
size_gb = 19.8
elif _re.search(r'\b2b\b', name_lower):
size_gb = 1.5
elif _re.search(r'\b4b\b', name_lower):
size_gb = 2.8
elif _re.search(r'\b7b\b', name_lower):
size_gb = 4.0
elif _re.search(r'\b8b\b', name_lower):
size_gb = 4.5
elif _re.search(r'\b9b\b', name_lower):
size_gb = 5.8
elif _re.search(r'\b14b\b', name_lower):
size_gb = 8.0
elif _re.search(r'\b26b\b', name_lower):
size_gb = 18.0
elif _re.search(r'\b30b\b', name_lower):
size_gb = 18.6
elif _re.search(r'\b31b\b', name_lower):
size_gb = 19.8
elif _re.search(r'\b32b\b', name_lower):
size_gb = 16.0
elif _re.search(r'\b70b\b', name_lower):
size_gb = 35.0

gguf_file = env_values.get("GGUF_FILE", "").lower()
if "awq" in name_lower:
quant = "AWQ"
elif "gptq" in name_lower:
quant = "GPTQ"
elif "gguf" in name_lower or gguf_file.endswith(".gguf"):
quant = "GGUF"

return ModelInfo(name=model_name, size_gb=size_gb, context_length=context, quantization=quant)
except OSError as e:
logger.warning("Failed to read .env for model info: %s", e)
return None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -152,17 +152,17 @@ async def api_features(api_key: str = Depends(verify_api_key)):
tier_recommendations = []
if memory_type == "unified" and gpu_info and gpu_info.gpu_backend == "amd":
if gpu_vram_gb >= 90:
tier_recommendations = ["Strix Halo 90+ — running qwen3-coder-next (80B MoE, 3B active)", "Plenty of headroom for the flagship model + bootstrap simultaneously", "Voice and Documents work alongside the LLM"]
tier_recommendations = ["Strix Halo 90+ — flagship local profile supported", "Plenty of headroom for large local models plus bootstrap simultaneously", "Voice and Documents work alongside the LLM"]
else:
tier_recommendations = ["Strix Halo Compact — running qwen3:30b-a3b (30B MoE, 3B active)", "Fast MoE inference with low memory footprint", "Voice and Documents work alongside the LLM"]
tier_recommendations = ["Strix Halo Compact — balanced local profile supported", "Fast inference with good room for voice, documents, and agents", "Voice and Documents work alongside the LLM"]
elif gpu_vram_gb >= 80:
tier_recommendations = ["Your GPU can run all features simultaneously", "Consider enabling Voice + Documents for the full experience", "Image generation is supported at full quality"]
elif gpu_vram_gb >= 24:
tier_recommendations = ["Great GPU for local AI — most features will run well", "Voice and Documents work together", "Image generation may require model unloading"]
elif gpu_vram_gb >= 16:
tier_recommendations = ["Solid GPU for core features", "Voice works well with the default model", "For images, use a smaller chat model"]
elif gpu_vram_gb >= 8:
tier_recommendations = ["Entry-level GPU — focus on chat first", "Voice is possible with a smaller model", "Consider using the 7B model for better speed"]
tier_recommendations = ["Entry-level GPU — focus on chat first", "Voice is possible with a compact local profile", "Use the smaller local model profile for better speed"]
else:
tier_recommendations = ["Limited GPU memory — chat will work with small models", "Consider cloud hybrid mode for better quality"]

Expand Down Expand Up @@ -201,7 +201,7 @@ def _svc_port(service_id: str) -> int:
"documents": {"steps": ["Ensure Qdrant vector database is running", "Enable the 'Document Q&A' workflow", "Upload documents via the workflow endpoint"], "links": [{"label": "Workflows", "url": f"{dashboard_url}/workflows"}]},
"workflows": {"steps": [f"Ensure n8n is running on port {_svc_port('n8n')}", "Open the Workflows page to see available automations", "Click 'Enable' on any workflow to import it"], "links": [{"label": "n8n Dashboard", "url": n8n_url}, {"label": "Workflows", "url": f"{dashboard_url}/workflows"}]},
"images": {"steps": ["Image generation requires additional setup", "Coming soon in a future update"], "links": []},
"coding": {"steps": ["Switch to the Qwen2.5-Coder model for best results", "Use the model manager to download and load it", "Chat will now be optimized for code"], "links": [{"label": "Model Manager", "url": f"{dashboard_url}/models"}]},
"coding": {"steps": ["Switch to a coding-oriented local model profile for best results", "Use the model manager to download and load it", "Chat will now be optimized for code"], "links": [{"label": "Model Manager", "url": f"{dashboard_url}/models"}]},
"observability": {"steps": [f"Langfuse is running on port {_svc_port('langfuse')}", "Open Langfuse to view LLM traces and evaluations", "LiteLLM automatically sends traces — no additional configuration needed"], "links": [{"label": "Open Langfuse", "url": _svc_url("langfuse")}]},
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,10 @@ class OpenAICompatibleProvider(LLMProvider):
"qwen3-14b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"qwen3-30b-a3b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"qwen3.5:27b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"gemma-4-e2b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"gemma-4-e4b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"gemma-4-26b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"gemma-4-31b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"qwen": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"llama": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
"mistral": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
Expand Down
Loading
Loading