diff --git a/Readme.md b/Readme.md
index 26ecf21..43c853f 100644
--- a/Readme.md
+++ b/Readme.md
@@ -92,16 +92,15 @@ This python script provides a comprehensive graphical interface for `llama.cpp a
 * **llama.cpp** built with server support (`llama-server and ik_llama` executable)
 * **requests** - Required for version checking and updates
   * Install with: `pip install requests`
+* **gguf** - Required for GGUF model analysis
+  * Install with: `pip install git+https://github.com/excosy/ik_gguf.git`
+  * `pip install gguf` also works if the new quant types from ik_llama are not used
 
 ### Optional (Recommended)
 * **PyTorch** (`torch`) - **Required if you want automatic GPU detection and selection**
   * Install in your virtual environment: `pip install torch`
   * Without PyTorch, you can still manually configure GPU settings
   * Enables automatic CUDA device detection and system resource information
-* **llama-cpp-python** - **Optional fallback for GGUF model analysis**
-  * Install in your virtual environment: `pip install llama-cpp-python`
-  * Provides enhanced model analysis when llama.cpp tools are unavailable
-  * The launcher works without it using built-in GGUF parsing and llama.cpp tools
 * **psutil** - **Optional for enhanced system information**
   * Provides detailed CPU and RAM information across platforms
   * Install with: `pip install psutil`
@@ -138,7 +137,7 @@ Or follow the [Dependencies](#-dependencies) section above to install dependenci
 
 You'll need to build `llama.cpp or ik_llama` separately and point the launcher to the build directory. Here's an example build configuration:
 
-> **⚠️ Example Environment Disclaimer:** 
+> **⚠️ Example Environment Disclaimer:**
 > The following build example was tested on **Ubuntu 24.04** with **CUDA 12.9** and **GCC 13**. Your build flags may need adjustment based on your system configuration, CUDA version, GCC version, and GPU architecture.
 
 ```bash
@@ -161,7 +160,7 @@ CC=/usr/bin/gcc-13 CXX=/usr/bin/g++-13 cmake .. \
 make -j$(nproc)
 ```
 
-> **📚 Need More Build Help?** 
+> **📚 Need More Build Help?**
 > For additional building guidance, platform-specific instructions, and troubleshooting, refer to the official [llama.cpp documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).
 
 **Key Build Flags Explained:**
@@ -179,4 +178,4 @@ make -j$(nproc)
 
 ## 🚀 Core Components
 
-This launcher aims to streamline your `llama.cpp` server workflow when working with and testing multiple models while making it more accessible and efficient for both new and experienced users.
\ No newline at end of file
+This launcher aims to streamline your `llama.cpp` server workflow when working with and testing multiple models while making it more accessible and efficient for both new and experienced users.
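The new `gguf` dependency replaces `llama-cpp-python` as the metadata reader. A minimal sanity check of the installed package, assuming the patched `ik_gguf` build exposes the standard `gguf-py` reader API that `system.py` uses below (the model path is a placeholder):

```python
# Sketch: verify the gguf package can read GGUF metadata directly.
# "/path/to/model.gguf" is a placeholder; the key names follow the GGUF
# convention ("general.architecture", "<arch>.block_count") read in system.py.
from gguf import GGUFReader

reader = GGUFReader("/path/to/model.gguf", "r")
arch = n_layers = None
for field in reader.fields.values():
    if field.name == "general.architecture":
        arch = field.contents()
    elif field.name.endswith(".block_count"):
        n_layers = field.contents()
print(f"architecture={arch}, layers={n_layers}")
```
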
diff --git a/llamacpp-server-launcher.py b/llamacpp-server-launcher.py index ea962d3..ee9bfbe 100755 --- a/llamacpp-server-launcher.py +++ b/llamacpp-server-launcher.py @@ -39,10 +39,12 @@ def debug_print(message, force=False): # Import system helper functions from system import ( - get_gpu_info_static, get_ram_info_static, get_cpu_info_static, - analyze_gguf_with_llamacpp_tools, calculate_total_gguf_size, - parse_gguf_header_simple, analyze_gguf_model_static, SystemInfoManager, - LLAMA_CPP_PYTHON_AVAILABLE + get_gpu_info_static, + get_ram_info_static, + get_cpu_info_static, + calculate_total_gguf_size, + analyze_gguf_model_static, + SystemInfoManager, ) @@ -313,6 +315,9 @@ def __init__(self, root: tk.Tk): self.model_architecture_var = tk.StringVar(value="N/A") self.model_filesize_var = tk.StringVar(value="N/A") self.model_total_layers_var = tk.StringVar(value="N/A") # Total layers from GGUF analysis + self.model_context_length_var = tk.StringVar(value="N/A") # Max Context Length + self.model_expert_count_var = tk.StringVar(value="N/A") # Total Expert Count of MoE + self.model_expert_used_count_var = tk.StringVar(value="N/A") # Expert Used Count of MoE self.model_kv_cache_type_var = tk.StringVar(value="N/A") # Current selected KV cache type # --- Recommendation Variables --- @@ -668,7 +673,7 @@ def _setup_main_tab(self, parent): r += 1 # --- Multi-modal Projection (mmproj) --- - self.mmproj_enabled_check = ttk.Checkbutton(inner, variable=self.mmproj_enabled, + self.mmproj_enabled_check = ttk.Checkbutton(inner, variable=self.mmproj_enabled, text="Enable automatic mmproj file detection", state=tk.NORMAL) self.mmproj_enabled_check.grid(column=0, row=r, columnspan=4, sticky="w", padx=10, pady=(3,10)); r += 1 @@ -697,9 +702,9 @@ def _setup_main_tab(self, parent): self.ctx_entry.bind("", self._override_ctx_size) self.ctx_entry.bind("", self._override_ctx_size) - ctx_slider = ttk.Scale(ctx_f, from_=1024, to=131072, orient="horizontal", + self.ctx_slider = ttk.Scale(ctx_f, from_=1024, to=131072, orient="horizontal", variable=self.ctx_size, command=self._update_ctx_label_from_slider) - ctx_slider.grid(column=0, row=0, sticky="ew", padx=(0, 5)) + self.ctx_slider.grid(column=0, row=0, sticky="ew", padx=(0, 5)) self.ctx_label.grid(column=1, row=0, padx=5) self.ctx_entry.grid(column=2, row=0, padx=5) ttk.Button(ctx_f, text="Set", command=self._override_ctx_size, width=4).grid(column=3, row=0, padx=(0, 5)) @@ -909,6 +914,21 @@ def _setup_advanced_tab(self, parent): ttk.Label(inner, textvariable=self.model_total_layers_var)\ .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=3); r += 1 + ttk.Label(inner, text="Context Length:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_context_length_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + + ttk.Label(inner, text="Total Expert Count:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_expert_count_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + + ttk.Label(inner, text="Expert Used Count:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_expert_used_count_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + ttk.Label(inner, text="Current KV Cache Type:")\ .grid(column=0, row=r, sticky="w", padx=10, pady=3) ttk.Label(inner, textvariable=self.model_kv_cache_type_var)\ @@ -1877,50 +1897,23 @@ def 
_on_model_selected(self, event=None): # Update current KV cache type display immediately # This is called by _update_recommendations, which is triggered below - if LLAMA_CPP_PYTHON_AVAILABLE: - self.gpu_layers_status_var.set("Analyzing model...") - self.gpu_layers_slider.config(state=tk.DISABLED) - # Entry remains enabled, validated state will apply - self._reset_model_info_display() # Reset info fields before analysis starts - self.current_model_analysis = {} # Clear old analysis - self._update_recommendations() # Update recommendations display based on no analysis yet - - # Start analysis thread - if self.analysis_thread and self.analysis_thread.is_alive(): - print("DEBUG: Previous analysis thread is still running, cancelling old analysis.", file=sys.stderr) - # Ideally, you'd have a way to signal the thread to stop. - # For simplicity here, we just let the old thread finish and ignore its result - # if a new analysis starts, by checking self.model_path in _update_ui_after_analysis. - pass # No explicit cancel mechanism here - - self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) - self.analysis_thread.start() - else: - # Analysis not available - check what options we have - backend = self.backend_selection.get() - if backend == "ik_llama": - backend_dir = self.ik_llama_dir.get().strip() - backend_name = "ik_llama" - else: - backend_dir = self.llama_cpp_dir.get().strip() - backend_name = "llama.cpp" - - if backend_dir: - self.gpu_layers_status_var.set(f"Analyzing model using {backend_name} tools...") - # Try the analysis even without llama-cpp-python - self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) - self.analysis_thread.start() - else: - self.gpu_layers_status_var.set(f"Analysis available: Set {backend_name} directory or install llama-cpp-python") - self._reset_gpu_layer_controls(keep_entry_enabled=True) # Keep entry enabled if lib missing - self._reset_model_info_display() - self.model_architecture_var.set("Analysis Unavailable") - self.model_filesize_var.set("Analysis Unavailable") - self.model_total_layers_var.set("Analysis Unavailable") - self.current_model_analysis = {} - self._update_recommendations() # Update recommendations based on no analysis - self._generate_default_config_name() # Generate default name even without analysis - self._update_manual_model_visibility() # Update manual model section visibility + self.gpu_layers_status_var.set("Analyzing model...") + self.gpu_layers_slider.config(state=tk.DISABLED) + # Entry remains enabled, validated state will apply + self._reset_model_info_display() # Reset info fields before analysis starts + self.current_model_analysis = {} # Clear old analysis + self._update_recommendations() # Update recommendations display based on no analysis yet + + # Start analysis thread + if self.analysis_thread and self.analysis_thread.is_alive(): + print("DEBUG: Previous analysis thread is still running, cancelling old analysis.", file=sys.stderr) + # Ideally, you'd have a way to signal the thread to stop. + # For simplicity here, we just let the old thread finish and ignore its result + # if a new analysis starts, by checking self.model_path in _update_ui_after_analysis. 
+ pass # No explicit cancel mechanism here + + self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) + self.analysis_thread.start() else: @@ -1942,38 +1935,8 @@ def _run_gguf_analysis(self, model_path_str): # Check if the currently selected model in the GUI still matches the one being analyzed # This prevents updating the UI with stale results if the user quickly selects another model if self.model_path.get() == model_path_str: - # Try backend-specific tools first, fall back to llama-cpp-python if available - backend = self.backend_selection.get() - if backend == "ik_llama": - backend_dir = self.ik_llama_dir.get().strip() - backend_name = "ik_llama" - else: - backend_dir = self.llama_cpp_dir.get().strip() - backend_name = "llama.cpp" - - if backend_dir: - print(f"DEBUG: Trying {backend_name} tools from: {backend_dir}", file=sys.stderr) - analysis_result = analyze_gguf_with_llamacpp_tools(model_path_str, backend_dir) - - # If backend tools failed and we have llama-cpp-python available, try that as fallback - if analysis_result.get("error") and LLAMA_CPP_PYTHON_AVAILABLE: - print(f"DEBUG: {backend_name} tools failed, falling back to llama-cpp-python", file=sys.stderr) - analysis_result = analyze_gguf_model_static(model_path_str) - else: - # No backend directory set, try simple GGUF parser first - print(f"DEBUG: No {backend_name} directory set, trying simple GGUF parser", file=sys.stderr) - analysis_result = parse_gguf_header_simple(model_path_str) - - # If simple parser failed and we have llama-cpp-python available, try that as fallback - if analysis_result.get("error") and LLAMA_CPP_PYTHON_AVAILABLE: - print("DEBUG: Simple GGUF parser failed, falling back to llama-cpp-python", file=sys.stderr) - analysis_result = analyze_gguf_model_static(model_path_str) - - # Only update UI if the model path hasn't changed while analyzing - if self.model_path.get() == model_path_str: - self.root.after(0, self._update_ui_after_analysis, analysis_result) - else: - print(f"DEBUG: Analysis for {model_path_str} finished, but model selection changed. Discarding result.", file=sys.stderr) + analysis_result = analyze_gguf_model_static(model_path_str) + self.root.after(0, self._update_ui_after_analysis, analysis_result) else: print(f"DEBUG: Analysis started for {model_path_str}, but model selection changed before analysis began. 
Skipping.", file=sys.stderr) @@ -2099,6 +2062,20 @@ def _update_ui_after_analysis(self, analysis_result): # This will set the slider and potentially update the entry format (-1 vs number) self._sync_gpu_layers_from_entry() + if int(context_length := analysis_result.get("context_length")) > 0: + self.model_context_length_var.set(context_length) + context_length = int(context_length) + self.ctx_slider.config(to=context_length) + if self.ctx_size.get() > context_length: + self.ctx_size.set(context_length) + self._sync_ctx_display(context_length) + + if int(expert_count := analysis_result.get("expert_count")) > 0: + self.model_expert_count_var.set(expert_count) + + if int(expert_used_count := analysis_result.get("expert_used_count")) > 0: + self.model_expert_used_count_var.set(expert_used_count) + # --- Update Recommendations based on new analysis --- self._update_recommendations() @@ -2142,6 +2119,9 @@ def _reset_model_info_display(self): self.model_architecture_var.set("N/A") self.model_filesize_var.set("N/A") self.model_total_layers_var.set("N/A") + self.model_context_length_var.set("N/A") + self.model_expert_count_var.set("N/A") + self.model_expert_used_count_var.set("N/A") # KV Cache Type display is linked to the variable, not reset here diff --git a/requirements.txt b/requirements.txt index c3d9349..5e852ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Required dependencies requests>=2.31.0 # Used in about_tab.py for version checking +git+https://github.com/excosy/ik_gguf.git # patched gguf-py, for GGUF model analysis # Optional but recommended dependencies torch>=2.0.0 # Required for GPU detection and selection -llama-cpp-python # Optional fallback for GGUF model analysis -psutil>=5.9.0 # Optional for enhanced system information \ No newline at end of file +psutil>=5.9.0 # Optional for enhanced system information diff --git a/system.py b/system.py index c76a39c..c724f3c 100644 --- a/system.py +++ b/system.py @@ -8,6 +8,7 @@ import ctypes import struct from pathlib import Path +from gguf import GGUFReader, GGUFValueType # Add debug prints for Python environment print("\n=== Python Environment Debug Info ===", file=sys.stderr) @@ -29,20 +30,6 @@ torch = None print(f"Warning: PyTorch import failed: {e}", file=sys.stderr) - -try: - # Import Llama only if llama_cpp_python is importable - from llama_cpp import Llama - LLAMA_CPP_PYTHON_AVAILABLE = True -except ImportError: - LLAMA_CPP_PYTHON_AVAILABLE = False - Llama = None # Define Llama as None if import fails -except Exception as e: - # Catch other potential issues during llama_cpp import (e.g., libllama.so not found) - LLAMA_CPP_PYTHON_AVAILABLE = False - Llama = None - print(f"Warning: llama-cpp-python import failed: {e}", file=sys.stderr) - # Check for requests module (required for version checking) try: import requests @@ -74,10 +61,6 @@ if not TORCH_AVAILABLE: MISSING_DEPS.append("PyTorch (required for GPU detection and CUDA features)") -# Optional dependencies -if not LLAMA_CPP_PYTHON_AVAILABLE: - MISSING_DEPS.append("llama-cpp-python (optional - provides fallback GGUF analysis if llama.cpp tools unavailable)") - if not PSUTIL_AVAILABLE: MISSING_DEPS.append("psutil (optional - provides enhanced system information)") @@ -95,6 +78,13 @@ # ═════════════════════════════════════════════════════════════════════ # Helper Functions (These remain outside the class as they don't need 'self') # ═════════════════════════════════════════════════════════════════════ +def get_file_host_endian(reader: GGUFReader) -> 
tuple[str, str]: + file_endian = reader.endianess.name + if reader.byte_order == 'S': + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' + else: + host_endian = file_endian + return (host_endian, file_endian) def get_gpu_info_with_venv(venv_path=None): """Get GPU information using PyTorch, optionally from a virtual environment.""" @@ -110,9 +100,9 @@ def get_gpu_info_from_venv(venv_path): import subprocess import json from pathlib import Path - + venv_path = Path(venv_path) - + # Determine the Python executable in the venv if sys.platform == "win32": python_exe = venv_path / "Scripts" / "python.exe" @@ -122,12 +112,12 @@ def get_gpu_info_from_venv(venv_path): python_exe = venv_path / "bin" / "python" if not python_exe.exists(): python_exe = venv_path / "python" # Some venv structures - + if not python_exe.exists(): print(f"DEBUG: Python executable not found in venv: {venv_path}", file=sys.stderr) # Fall back to current process detection return get_gpu_info_static() - + # Create a small Python script to check for PyTorch/CUDA in the venv detection_script = ''' import sys @@ -136,7 +126,7 @@ def get_gpu_info_from_venv(venv_path): try: import torch torch_available = torch.cuda.is_available() - + if torch_available: device_count = torch.cuda.device_count() gpu_info = { @@ -144,7 +134,7 @@ def get_gpu_info_from_venv(venv_path): "device_count": device_count, "devices": [] } - + for i in range(device_count): props = torch.cuda.get_device_properties(i) gpu_info["devices"].append({ @@ -155,7 +145,7 @@ def get_gpu_info_from_venv(venv_path): "compute_capability": f"{props.major}.{props.minor}", "multi_processor_count": props.multi_processor_count }) - + print(json.dumps(gpu_info)) else: print(json.dumps({"available": False, "message": "CUDA not available via PyTorch in venv", "device_count": 0, "devices": []})) @@ -165,7 +155,7 @@ def get_gpu_info_from_venv(venv_path): except Exception as e: print(json.dumps({"available": False, "message": f"Error in venv GPU detection: {e}", "device_count": 0, "devices": []})) ''' - + try: print(f"DEBUG: Running GPU detection in venv: {venv_path}", file=sys.stderr) # Run the detection script in the virtual environment @@ -175,14 +165,14 @@ def get_gpu_info_from_venv(venv_path): text=True, timeout=30 ) - + if result.returncode == 0: try: output = result.stdout.strip() if not output: print("DEBUG: Venv GPU detection returned empty output", file=sys.stderr) return _create_fallback_gpu_info("Empty output from venv detection") - + gpu_info = json.loads(output) print(f"DEBUG: Venv GPU detection successful: {gpu_info.get('device_count', 0)} devices", file=sys.stderr) return gpu_info @@ -194,7 +184,7 @@ def get_gpu_info_from_venv(venv_path): error_msg = result.stderr.strip() if result.stderr else "Unknown error" print(f"DEBUG: Venv GPU detection failed with return code {result.returncode}", file=sys.stderr) print(f"DEBUG: Error output: {error_msg}", file=sys.stderr) - + # Check for specific error types if "ModuleNotFoundError" in error_msg or "ImportError" in error_msg: return _create_fallback_gpu_info("Required modules not found in venv") @@ -202,7 +192,7 @@ def get_gpu_info_from_venv(venv_path): return _create_fallback_gpu_info("CUDA error in venv") else: return _create_fallback_gpu_info(f"Venv detection failed: {error_msg}") - + except subprocess.TimeoutExpired: print("DEBUG: Venv GPU detection timed out after 30 seconds", file=sys.stderr) return _create_fallback_gpu_info("Detection timeout") @@ -220,16 +210,16 @@ def _create_fallback_gpu_info(reason): 
"""Create fallback GPU info with specific reason, then try current process detection.""" print(f"DEBUG: Creating fallback GPU info due to: {reason}", file=sys.stderr) print("DEBUG: Attempting current process GPU detection as fallback", file=sys.stderr) - + # Try current process detection as fallback fallback_info = get_gpu_info_static() - + # If current process detection also fails, return a clear error message if not fallback_info.get('available', False): fallback_info['message'] = f"Venv detection failed ({reason}), current process also failed" else: fallback_info['message'] = f"Using current process (venv failed: {reason})" - + return fallback_info def get_gpu_info_static(): @@ -372,169 +362,36 @@ def get_cpu_info_static(): print(f"Failed to get CPU info: {str(e)}", file=sys.stderr) return {"error": f"Failed to get CPU info: {str(e)}", "logical_cores": 4, "physical_cores": 2} - -def analyze_gguf_with_llamacpp_tools(model_path_str, llama_cpp_dir=None): - """Analyze GGUF model using llama.cpp/ik_llama tools instead of llama-cpp-python.""" - import subprocess - import json - from pathlib import Path - - model_path = Path(model_path_str) - if not model_path.is_file(): - return {"error": f"Model file not found: {model_path}", "n_layers": None, "architecture": "N/A", "file_size_bytes": 0} - - # Calculate total size across all shards if this is a multi-part file - total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - - analysis_result = { - "path": str(model_path), - "file_size_bytes": total_size_bytes, - "file_size_gb": round(total_size_bytes / (1024**3), 2), - "architecture": "unknown", - "n_layers": None, - "metadata": {}, - "error": None, - "message": None, - "shard_count": shard_count, - "all_shards": [str(p) for p in all_shards] - } - - # Try to find analysis tools from both llama.cpp and ik_llama - tools_to_try = [] - - if llama_cpp_dir: - backend_base_dir = Path(llama_cpp_dir) - # Common locations for llama.cpp/ik_llama tools - search_paths = [ - backend_base_dir, - backend_base_dir / "build" / "bin", - backend_base_dir / "build", - backend_base_dir / "bin", - ] - - # Tool names for both llama.cpp and ik_llama backends - tool_names = [ - # Standard llama.cpp tools - "llama-inspect", "gguf-dump", "llama-server", - # Possible ik_llama specific tools - "ik-llama-inspect", "ik_llama_inspect", "ik-llama-server", "ik_llama_server" - ] - if sys.platform == "win32": - tool_names = [name + ".exe" for name in tool_names] - - for search_path in search_paths: - for tool_name in tool_names: - tool_path = search_path / tool_name - if tool_path.is_file(): - tools_to_try.append((str(tool_path), tool_name.replace(".exe", ""))) - - # Try each tool - for tool_path, tool_name in tools_to_try: - try: - if tool_name == "llama-inspect": - # llama-inspect typically outputs JSON or structured data - result = subprocess.run([tool_path, str(model_path)], - capture_output=True, text=True, timeout=30) - if result.returncode == 0: - # Parse the output (this would need to be adapted based on actual llama-inspect output format) - output = result.stdout.strip() - # Try to extract key information from the output - if "layers" in output.lower() or "block_count" in output.lower(): - # Parse layer count from output - for line in output.split('\n'): - if 'block_count' in line.lower() or 'n_layers' in line.lower(): - try: - # Extract number from line (this is a simple approach, may need refinement) - import re - numbers = re.findall(r'\d+', line) - if numbers: - analysis_result["n_layers"] 
= int(numbers[-1]) - break - except: - pass - - # Extract architecture if possible - for line in output.split('\n'): - if 'architecture' in line.lower(): - # Simple extraction - would need refinement based on actual output format - parts = line.split() - if len(parts) > 1: - analysis_result["architecture"] = parts[-1] - break - - analysis_result["message"] = f"Analyzed using {tool_name}" - return analysis_result - - elif tool_name == "llama-server": - # Try to get model info from llama-server without starting it - # Some versions support --model-info or similar flags - for flag in ["--model-info", "--print-model-info", "--help"]: - try: - result = subprocess.run([tool_path, flag, "-m", str(model_path)], - capture_output=True, text=True, timeout=10) - if result.returncode == 0 and ("layer" in result.stdout.lower() or "block" in result.stdout.lower()): - # Similar parsing as above - output = result.stdout.strip() - for line in output.split('\n'): - if 'block' in line.lower() or 'layer' in line.lower(): - try: - import re - numbers = re.findall(r'\d+', line) - if numbers: - analysis_result["n_layers"] = int(numbers[-1]) - analysis_result["message"] = f"Analyzed using {tool_name}" - return analysis_result - except: - pass - break - except: - continue - - except subprocess.TimeoutExpired: - continue - except Exception as e: - print(f"DEBUG: Tool {tool_name} failed: {e}", file=sys.stderr) - continue - - # If no tools worked, try simple GGUF header parsing - try: - return parse_gguf_header_simple(model_path_str) - except Exception as e: - analysis_result["error"] = f"All analysis methods failed. Last error: {e}" - analysis_result["message"] = "Could not analyze model with available tools" - return analysis_result - - def calculate_total_gguf_size(model_path_str): """Calculate total size across all GGUF shards if this is a multi-part file.""" import re from pathlib import Path - + model_path = Path(model_path_str) - + # Check if this looks like a multi-part GGUF file (e.g., "00001-of-00003.gguf") shard_pattern = re.search(r'-(\d+)-of-(\d+)\.gguf$', model_path.name, re.IGNORECASE) if not shard_pattern: # Not a multi-part file, return single file size return model_path.stat().st_size, 1, [model_path] - + current_shard = int(shard_pattern.group(1)) total_shards = int(shard_pattern.group(2)) - + print(f"DEBUG: Detected multi-part GGUF: shard {current_shard} of {total_shards}", file=sys.stderr) - + # Find all related shard files base_name = model_path.name[:shard_pattern.start()] # Everything before "-00001-of-00003.gguf" parent_dir = model_path.parent - + total_size = 0 found_shards = [] missing_shards = [] - + for shard_num in range(1, total_shards + 1): shard_name = f"{base_name}-{shard_num:05d}-of-{total_shards:05d}.gguf" shard_path = parent_dir / shard_name - + if shard_path.exists(): shard_size = shard_path.stat().st_size total_size += shard_size @@ -543,252 +400,74 @@ def calculate_total_gguf_size(model_path_str): else: missing_shards.append(shard_name) print(f"DEBUG: Missing shard {shard_num}: {shard_name}", file=sys.stderr) - + if missing_shards: print(f"WARNING: Missing {len(missing_shards)} shards: {missing_shards}", file=sys.stderr) # Return what we found, but note it's incomplete return total_size, len(found_shards), found_shards - + print(f"DEBUG: Total size across {total_shards} shards: {total_size / (1024**3):.2f} GB", file=sys.stderr) return total_size, total_shards, found_shards - -def parse_gguf_header_simple(model_path_str): - """Simple GGUF header parser to extract basic metadata 
without dependencies.""" - import struct - from pathlib import Path - - model_path = Path(model_path_str) - - # Calculate total size across all shards if this is a multi-part file - total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - - analysis_result = { - "path": str(model_path), - "file_size_bytes": total_size_bytes, - "file_size_gb": round(total_size_bytes / (1024**3), 2), - "architecture": "unknown", - "n_layers": None, - "metadata": {}, - "error": None, - "message": f"Analyzed using simple GGUF parser ({shard_count} shard{'s' if shard_count != 1 else ''})", - "shard_count": shard_count, - "all_shards": [str(p) for p in all_shards] +def dump_gguf_metadata(model_path_str, json_array = False, no_tensors = False): + """ + refer https://github.com/ggml-org/llama.cpp/blob/main/gguf-py/gguf/scripts/gguf_dump.py + json_array: Include full array values in JSON output (long) + no-tensors: Don't dump tensor metadata + """ + + reader = GGUFReader(model_path_str, 'r') + host_endian, file_endian = get_file_host_endian(reader) + metadata: dict[str, Any] = {} + tensors: dict[str, Any] = {} + result = { + "filename": Path(model_path_str).stem, + "endian": file_endian, + "metadata": metadata, + "tensors": tensors, } - - try: - with open(model_path, 'rb') as f: - # Read GGUF magic number - magic = f.read(4) - if magic != b'GGUF': - analysis_result["error"] = "Not a valid GGUF file" - return analysis_result - - # Read version - version = struct.unpack(' 100000: # Sanity check for string length - print(f"DEBUG: Skipping large string for key '{key}': {value_len} bytes", file=sys.stderr) - f.seek(f.tell() + value_len) # Skip the string - continue - value_bytes = f.read(value_len) - if len(value_bytes) < value_len: - break # End of file - value = value_bytes.decode('utf-8', errors='replace') - except Exception as e: - print(f"DEBUG: Failed to read string for key '{key}': {e}", file=sys.stderr) - continue - elif value_type == 4: # Uint32 - try: - value_bytes = f.read(4) - if len(value_bytes) < 4: - break - value = struct.unpack(' 100000: # Sanity check - break - f.seek(f.tell() + str_len) - except: - break - elif array_type == 4: # Uint32 array - f.seek(f.tell() + array_len * 4) - elif array_type == 6: # Uint64 array - f.seek(f.tell() + array_len * 8) - elif array_type == 7: # Float32 array - f.seek(f.tell() + array_len * 4) - else: - # Unknown array type, try to skip conservatively - print(f"DEBUG: Unknown array type {array_type}, attempting to skip", file=sys.stderr) - break - except Exception as e: - print(f"DEBUG: Failed to skip array for key '{key}': {e}", file=sys.stderr) - break - continue - else: - print(f"DEBUG: Unknown value type {value_type} for key '{key}', skipping", file=sys.stderr) - continue - - if value is not None: - analysis_result["metadata"][key] = value - - # Extract key information - if key == "general.architecture": - analysis_result["architecture"] = str(value) - elif any(pattern in key.lower() for pattern in ['.block_count', '.n_layers', '.layer_count', '.num_layer']): - if isinstance(value, (int, float)) and value > 0: - analysis_result["n_layers"] = int(value) - print(f"DEBUG: Found layer count in key '{key}': {analysis_result['n_layers']}", file=sys.stderr) - - # Also check for model name patterns that might indicate layer count - if 'name' in key.lower() and isinstance(value, str): - # Try to extract parameter count from model name (e.g., "235B" -> estimate layers) - import re - param_match = re.search(r'(\d+)B', value.upper()) - if 
param_match and analysis_result["n_layers"] is None: - param_count = int(param_match.group(1)) - # Rough estimate: large models typically have ~80-120 layers per billion parameters - # For very large models like 235B, use a more conservative estimate - if param_count >= 100: - estimated_layers = max(80, min(200, param_count // 2)) # Very conservative for huge models - else: - estimated_layers = max(32, min(120, param_count * 2)) # More typical estimate - print(f"DEBUG: Estimated {estimated_layers} layers from model name '{value}' ({param_count}B parameters)", file=sys.stderr) - analysis_result["n_layers"] = estimated_layers - - except Exception as parse_error: - # Skip this metadata entry if parsing fails - print(f"DEBUG: Failed to parse metadata entry: {parse_error}", file=sys.stderr) - continue - - # If we still don't have layer count, make a reasonable guess based on file size - if analysis_result["n_layers"] is None: - file_size_gb = analysis_result["file_size_gb"] - if file_size_gb > 100: # Very large model (100+ GB) - estimated_layers = 120 # Conservative estimate for huge models - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on very large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 50: # Large model (50-100 GB) - estimated_layers = 80 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 20: # Medium-large model (20-50 GB) - estimated_layers = 60 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on medium-large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 10: # Medium model (10-20 GB) - estimated_layers = 40 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on medium file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 3: # Small-medium model (3-10 GB) - estimated_layers = 32 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on small-medium file size ({file_size_gb:.1f} GB)", file=sys.stderr) - else: # Small model (< 3 GB) - estimated_layers = 24 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on small file size ({file_size_gb:.1f} GB)", file=sys.stderr) - - analysis_result["n_layers"] = estimated_layers - analysis_result["message"] += f" (estimated {estimated_layers} layers from file size)" - - return analysis_result - - except Exception as e: - analysis_result["error"] = f"Failed to parse GGUF header: {e}" - return analysis_result - + for idx, field in enumerate(reader.fields.values()): + curr: dict[str, Any] = { + "index": idx, + "type": field.types[0].name if field.types else 'UNKNOWN', + "offset": field.offset, + } + metadata[field.name] = curr + if field.types[:1] == [GGUFValueType.ARRAY]: + curr["array_types"] = [t.name for t in field.types][1:] + if not json_array: + continue + curr["value"] = field.contents() + else: + curr["value"] = field.contents() + if not no_tensors: + for idx, tensor in enumerate(reader.tensors): + tensors[tensor.name] = { + "index": idx, + "shape": tensor.shape.tolist(), + "type": tensor.tensor_type.name, + "offset": tensor.field.offset, + } + return result def analyze_gguf_model_static(model_path_str): """Analyze a GGUF model file and extract metadata (static method).""" - if not LLAMA_CPP_PYTHON_AVAILABLE or not Llama: - return {"error": "llama-cpp-python library not found.", "n_layers": None, 
"architecture": "N/A", "file_size_bytes": 0} - model_path = Path(model_path_str) if not model_path.is_file(): return {"error": f"Model file not found: {model_path}", "n_layers": None, "architecture": "N/A", "file_size_bytes": 0} # Calculate total size across all shards if this is a multi-part file total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - + analysis_result = { "path": str(model_path), "file_size_bytes": total_size_bytes, "file_size_gb": round(total_size_bytes / (1024**3), 2), "architecture": "unknown", "n_layers": None, + "expert_count": 0, + "expert_used_count": 0, + "context_length": 0, "metadata": {}, "error": None, "message": None, @@ -796,102 +475,38 @@ def analyze_gguf_model_static(model_path_str): "all_shards": [str(p) for p in all_shards] } - llm_meta = None # Initialize llm_meta outside the try block + llm_meta = {} # Initialize llm_meta outside the try block try: - # File size already calculated from shards above, no need to recalculate - - try: - # Use minimal parameters for quick metadata load - # Setting n_gpu_layers=0 is important to avoid trying to load layers onto potentially unavailable GPUs - # Using minimal n_ctx, n_batch, n_threads for minimal resource usage - llm_meta = Llama(model_path=str(model_path), n_ctx=32, n_threads=1, n_batch=32, - verbose=False, n_gpu_layers=0, logits_all=False) # logits_all=False to save memory/time - except Exception as load_exc: - analysis_result["error"] = f"Failed to load model for metadata analysis: {load_exc}" - print(f"ERROR: Failed to load model '{model_path.name}' for analysis: {load_exc}", file=sys.stderr) - # No traceback here, the caller should handle it or the log message is enough - return analysis_result # Exit early if basic load fails - - - # --- Extract Metadata --- - # Attempt 1: Check common metadata keys and attributes - if hasattr(llm_meta, 'metadata') and isinstance(llm_meta.metadata, dict) and llm_meta.metadata: - analysis_result["metadata"] = llm_meta.metadata - # Check various common keys for layer count - analysis_result["n_layers"] = llm_meta.metadata.get('llama.block_count') - if analysis_result["n_layers"] is None: - analysis_result["n_layers"] = llm_meta.metadata.get('general.architecture.block_count') - # Add more architecture-specific keys as needed - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('qwen2.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('gemma.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('bert.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('model.block_count') # Fallback - - - # Check various common keys for architecture - analysis_result["architecture"] = llm_meta.metadata.get('general.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('qwen2.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('gemma.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('bert.architecture', 'unknown') - - - # Attempt 2: Check direct attributes if metadata didn't yield layers - if analysis_result["n_layers"] is None: - if hasattr(llm_meta, 'n_layer'): # Common recent name - analysis_result["n_layers"] = 
getattr(llm_meta, 'n_layer', None) - if analysis_result["n_layers"] is None and hasattr(llm_meta, 'n_layers'): # Older name - analysis_result["n_layers"] = getattr(llm_meta, 'n_layers', None) - - # Fallback architecture if still unknown (less reliable) - if analysis_result["architecture"] == 'unknown' and hasattr(llm_meta, 'model_type'): - analysis_result["architecture"] = getattr(llm_meta, 'model_type', 'unknown') - - - # Validate n_layers - if analysis_result["n_layers"] is not None: - try: - analysis_result["n_layers"] = int(analysis_result["n_layers"]) - if analysis_result["n_layers"] <= 0: - analysis_result["n_layers"] = None # Treat non-positive as unknown - analysis_result["message"] = "Layer count found was not positive." - except (ValueError, TypeError): - analysis_result["n_layers"] = None # Treat non-integer as unknown - analysis_result["message"] = "Layer count metadata found was not an integer." - - if analysis_result["n_layers"] is None: - if not analysis_result["message"]: # If no specific message set above - analysis_result["message"] = "Layer count metadata not found or recognized." - - - return analysis_result - - except Exception as e: - # Catch any other unexpected errors during metadata processing or attribute access - print(f"ERROR: Failed during GGUF metadata processing for '{model_path.name}': {e}", file=sys.stderr) - traceback.print_exc(file=sys.stderr) - analysis_result["error"] = f"Unexpected error during analysis: {e}" - analysis_result["n_layers"] = None # Ensure layers is None on error - return analysis_result - finally: - # Ensure the temporary Llama object is deleted if it was created - if llm_meta: - try: - del llm_meta - except Exception as clean_exc: - print(f"Warning: Failed to delete llama_cpp.Llama instance in analyze_gguf_model_static finally block: {clean_exc}", file=sys.stderr) + llm_meta = dump_gguf_metadata(model_path_str, no_tensors = True)["metadata"] + except Exception as load_exc: + analysis_result["error"] = f"Failed to load model for metadata analysis: {load_exc}" + print(f"ERROR: Failed to load model '{model_path.name}' for analysis: {load_exc}", file=sys.stderr) + # No traceback here, the caller should handle it or the log message is enough + return analysis_result # Exit early if basic load fails + + # --- Extract Metadata --- + # Attempt 1: Check common metadata keys and attributes + for k,v in llm_meta.items(): + if ".block_count" in k: analysis_result["n_layers"] = llm_meta.get(k)["value"] + if ".architecture" in k: analysis_result["architecture"] = llm_meta.get(k)["value"] + if ".context_length" in k: analysis_result["context_length"] = llm_meta.get(k)["value"] + if ".expert_count" in k: analysis_result["expert_count"] = llm_meta.get(k)["value"] + if ".expert_used_count" in k: analysis_result["expert_used_count"] = llm_meta.get(k)["value"] + + return analysis_result class SystemInfoManager: """Manages system information fetching and processing for the launcher.""" - + def __init__(self, launcher_instance): """Initialize with reference to the main launcher instance.""" self.launcher = launcher_instance - + def fetch_system_info(self): """Fetches GPU, RAM, and CPU info and populates class attributes.""" print("Fetching system info...", file=sys.stderr) - + # Get the configured virtual environment path from the launcher venv_path = None if hasattr(self.launcher, 'venv_dir'): @@ -901,7 +516,7 @@ def fetch_system_info(self): print(f"DEBUG: Using configured venv for GPU detection: {venv_path}", file=sys.stderr) else: print("DEBUG: No venv 
configured, using current process for GPU detection", file=sys.stderr) - + self.launcher.gpu_info = get_gpu_info_with_venv(venv_path) self.launcher.ram_info = get_ram_info_static() self.launcher.cpu_info = get_cpu_info_static() # Fetch CPU info here @@ -929,4 +544,4 @@ def fetch_system_info(self): self.launcher.recommended_threads_batch_var.set(f"Recommended: {self.launcher.logical_cores} (Your CPU logical cores)") # Display initial GPU detection status message - self.launcher.gpu_detected_status_var.set(self.launcher.gpu_info['message'] if not self.launcher.gpu_info['available'] and self.launcher.gpu_info.get('message') else "") \ No newline at end of file + self.launcher.gpu_detected_status_var.set(self.launcher.gpu_info['message'] if not self.launcher.gpu_info['available'] and self.launcher.gpu_info.get('message') else "")
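For reference, a rough sketch of how the slimmed-down analysis entry point can be exercised outside the GUI (the field names mirror the dict built in `analyze_gguf_model_static` above; the model path is a placeholder):

```python
# Sketch only: call the new gguf-based analysis path directly.
# Keys match the result dict constructed in analyze_gguf_model_static();
# "/path/to/model.gguf" is a placeholder.
from system import analyze_gguf_model_static

info = analyze_gguf_model_static("/path/to/model.gguf")
if info["error"]:
    print(f"Analysis failed: {info['error']}")
else:
    print(f"arch={info['architecture']}  layers={info['n_layers']}  "
          f"size={info['file_size_gb']} GB")
    print(f"max context={info['context_length']}  "
          f"experts={info['expert_count']} (used: {info['expert_used_count']})")
```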