Light-Heart-Labs · Lightheartdevs · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
diff --git a/dream-server/.env.example b/dream-server/.env.example
@@ -57,17 +57,26 @@ TOGETHER_API_KEY=
 # LLM Settings (llama-server)
 # ═══════════════════════════════════════════════════════════════════
 
-# Model GGUF filename (must exist in data/models/)
-GGUF_FILE=Qwen3-8B-Q4_K_M.gguf
+# Optional model family profile for installer-driven tier selection.
+# qwen   = keep the current stable DreamServer defaults
+# gemma4 = always use Gemma 4 tier mappings when possible
+# auto   = prefer Gemma 4 on capable hardware; fall back to Qwen for minimum and cloud paths
+MODEL_PROFILE=qwen
+
+# Model GGUF filename override (installer normally rewrites this from tier + MODEL_PROFILE)
+GGUF_FILE=Qwen3.5-9B-Q4_K_M.gguf
 
 # Context window size (tokens)
 CTX_SIZE=16384
 
 # GPU backend: nvidia or amd
 GPU_BACKEND=nvidia
 
-# Model name (used by OpenClaw and dashboard)
-LLM_MODEL=qwen3-8b
+# Model name override (installer normally rewrites this from tier + MODEL_PROFILE)
+LLM_MODEL=qwen3.5-9b
+
+# Optional llama.cpp image override (installer sets this automatically for Gemma 4 profiles)
+# LLAMA_SERVER_IMAGE=ghcr.io/ggml-org/llama.cpp:server-cuda-b8648
 
 # llama-server inference tuning (advanced)
 # LLAMA_BATCH_SIZE=2048      # Batch size for prompt processing (higher = faster prefill)

diff --git a/dream-server/.env.schema.json b/dream-server/.env.schema.json
@@ -37,6 +37,16 @@
       "description": "Inference backend: llama-server or lemonade",
       "default": "llama-server"
     },
+    "MODEL_PROFILE": {
+      "type": "string",
+      "description": "Tier-aware local model family selection: qwen, gemma4, or auto",
+      "enum": [
+        "qwen",
+        "gemma4",
+        "auto"
+      ],
+      "default": "qwen"
+    },
     "LLM_API_BASE_PATH": {
       "type": "string",
       "description": "Base API path for the inference backend",
@@ -109,6 +119,10 @@
       "type": "string",
       "description": "Model name used by OpenClaw and dashboard"
     },
+    "LLAMA_SERVER_IMAGE": {
+      "type": "string",
+      "description": "Optional llama.cpp container image override for model families that require newer runtime support"
+    },
     "TIER": {
       "type": "string",
       "description": "Hardware tier (1, 2, 3, 4, CLOUD, SH_COMPACT, SH_LARGE, NV_ULTRA)"

diff --git a/dream-server/extensions/services/dashboard-api/helpers.py b/dream-server/extensions/services/dashboard-api/helpers.py
@@ -325,21 +325,62 @@ def get_model_info() -> Optional[ModelInfo]:
     env_path = Path(INSTALL_DIR) / ".env"
     if env_path.exists():
         try:
+            env_values = {}
             with open(env_path) as f:
                 for line in f:
-                    if line.startswith("LLM_MODEL="):
-                        model_name = line.split("=", 1)[1].strip().strip('"\'')
-                        size_gb, context, quant = 15.0, 32768, None
-                        import re as _re
-                        name_lower = model_name.lower()
-                        if _re.search(r'\b7b\b', name_lower): size_gb = 4.0
-                        elif _re.search(r'\b14b\b', name_lower): size_gb = 8.0
-                        elif _re.search(r'\b32b\b', name_lower): size_gb = 16.0
-                        elif _re.search(r'\b70b\b', name_lower): size_gb = 35.0
-                        if "awq" in name_lower: quant = "AWQ"
-                        elif "gptq" in name_lower: quant = "GPTQ"
-                        elif "gguf" in name_lower: quant = "GGUF"
-                        return ModelInfo(name=model_name, size_gb=size_gb, context_length=context, quantization=quant)
+                    if "=" not in line or line.lstrip().startswith("#"):
+                        continue
+                    key, value = line.split("=", 1)
+                    env_values[key.strip()] = value.strip().strip('"\'')
+
+            model_name = env_values.get("LLM_MODEL")
+            if model_name:
+                size_gb, quant = 15.0, None
+                context = int(env_values.get("MAX_CONTEXT") or env_values.get("CTX_SIZE") or 32768)
+
+                import re as _re
+
+                name_lower = model_name.lower()
+                if "gemma-4-e2b" in name_lower:
+                    size_gb = 2.8
+                elif "gemma-4-e4b" in name_lower:
+                    size_gb = 5.3
+                elif "gemma-4-26b" in name_lower:
+                    size_gb = 18.0
+                elif "gemma-4-31b" in name_lower:
+                    size_gb = 19.8
+                elif _re.search(r'\b2b\b', name_lower):
+                    size_gb = 1.5
+                elif _re.search(r'\b4b\b', name_lower):
+                    size_gb = 2.8
+                elif _re.search(r'\b7b\b', name_lower):
+                    size_gb = 4.0
+                elif _re.search(r'\b8b\b', name_lower):
+                    size_gb = 4.5
+                elif _re.search(r'\b9b\b', name_lower):
+                    size_gb = 5.8
+                elif _re.search(r'\b14b\b', name_lower):
+                    size_gb = 8.0
+                elif _re.search(r'\b26b\b', name_lower):
+                    size_gb = 18.0
+                elif _re.search(r'\b30b\b', name_lower):
+                    size_gb = 18.6
+                elif _re.search(r'\b31b\b', name_lower):
+                    size_gb = 19.8
+                elif _re.search(r'\b32b\b', name_lower):
+                    size_gb = 16.0
+                elif _re.search(r'\b70b\b', name_lower):
+                    size_gb = 35.0
+
+                gguf_file = env_values.get("GGUF_FILE", "").lower()
+                if "awq" in name_lower:
+                    quant = "AWQ"
+                elif "gptq" in name_lower:
+                    quant = "GPTQ"
+                elif "gguf" in name_lower or gguf_file.endswith(".gguf"):
+                    quant = "GGUF"
+
+                return ModelInfo(name=model_name, size_gb=size_gb, context_length=context, quantization=quant)
         except OSError as e:
             logger.warning("Failed to read .env for model info: %s", e)
     return None

diff --git a/dream-server/extensions/services/dashboard-api/routers/features.py b/dream-server/extensions/services/dashboard-api/routers/features.py
@@ -152,17 +152,17 @@ async def api_features(api_key: str = Depends(verify_api_key)):
     tier_recommendations = []
     if memory_type == "unified" and gpu_info and gpu_info.gpu_backend == "amd":
         if gpu_vram_gb >= 90:
-            tier_recommendations = ["Strix Halo 90+ — running qwen3-coder-next (80B MoE, 3B active)", "Plenty of headroom for the flagship model + bootstrap simultaneously", "Voice and Documents work alongside the LLM"]
+            tier_recommendations = ["Strix Halo 90+ — flagship local profile supported", "Plenty of headroom for large local models plus bootstrap simultaneously", "Voice and Documents work alongside the LLM"]
         else:
-            tier_recommendations = ["Strix Halo Compact — running qwen3:30b-a3b (30B MoE, 3B active)", "Fast MoE inference with low memory footprint", "Voice and Documents work alongside the LLM"]
+            tier_recommendations = ["Strix Halo Compact — balanced local profile supported", "Fast inference with good room for voice, documents, and agents", "Voice and Documents work alongside the LLM"]
     elif gpu_vram_gb >= 80:
         tier_recommendations = ["Your GPU can run all features simultaneously", "Consider enabling Voice + Documents for the full experience", "Image generation is supported at full quality"]
     elif gpu_vram_gb >= 24:
         tier_recommendations = ["Great GPU for local AI — most features will run well", "Voice and Documents work together", "Image generation may require model unloading"]
     elif gpu_vram_gb >= 16:
         tier_recommendations = ["Solid GPU for core features", "Voice works well with the default model", "For images, use a smaller chat model"]
     elif gpu_vram_gb >= 8:
-        tier_recommendations = ["Entry-level GPU — focus on chat first", "Voice is possible with a smaller model", "Consider using the 7B model for better speed"]
+        tier_recommendations = ["Entry-level GPU — focus on chat first", "Voice is possible with a compact local profile", "Use the smaller local model profile for better speed"]
     else:
         tier_recommendations = ["Limited GPU memory — chat will work with small models", "Consider cloud hybrid mode for better quality"]
 
@@ -201,7 +201,7 @@ def _svc_port(service_id: str) -> int:
         "documents": {"steps": ["Ensure Qdrant vector database is running", "Enable the 'Document Q&A' workflow", "Upload documents via the workflow endpoint"], "links": [{"label": "Workflows", "url": f"{dashboard_url}/workflows"}]},
         "workflows": {"steps": [f"Ensure n8n is running on port {_svc_port('n8n')}", "Open the Workflows page to see available automations", "Click 'Enable' on any workflow to import it"], "links": [{"label": "n8n Dashboard", "url": n8n_url}, {"label": "Workflows", "url": f"{dashboard_url}/workflows"}]},
         "images": {"steps": ["Image generation requires additional setup", "Coming soon in a future update"], "links": []},
-        "coding": {"steps": ["Switch to the Qwen2.5-Coder model for best results", "Use the model manager to download and load it", "Chat will now be optimized for code"], "links": [{"label": "Model Manager", "url": f"{dashboard_url}/models"}]},
+        "coding": {"steps": ["Switch to a coding-oriented local model profile for best results", "Use the model manager to download and load it", "Chat will now be optimized for code"], "links": [{"label": "Model Manager", "url": f"{dashboard_url}/models"}]},
         "observability": {"steps": [f"Langfuse is running on port {_svc_port('langfuse')}", "Open Langfuse to view LLM traces and evaluations", "LiteLLM automatically sends traces — no additional configuration needed"], "links": [{"label": "Open Langfuse", "url": _svc_url("langfuse")}]},
     }
 

diff --git a/dream-server/extensions/services/token-spy/providers/openai.py b/dream-server/extensions/services/token-spy/providers/openai.py
@@ -52,6 +52,10 @@ class OpenAICompatibleProvider(LLMProvider):
         "qwen3-14b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
         "qwen3-30b-a3b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
         "qwen3.5:27b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
+        "gemma-4-e2b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
+        "gemma-4-e4b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
+        "gemma-4-26b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
+        "gemma-4-31b": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
         "qwen": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
         "llama": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},
         "mistral": {"input": 0.0, "output": 0.0, "cache_read": 0.0, "cache_write": 0.0},