diff --git a/Readme.md b/Readme.md
index 26ecf21..43c853f 100644
--- a/Readme.md
+++ b/Readme.md
@@ -92,16 +92,15 @@ This python script provides a comprehensive graphical interface for `llama.cpp a
 * **llama.cpp** built with server support (`llama-server and ik_llama` executable)
 * **requests** - Required for version checking and updates
   * Install with: `pip install requests`
+* **gguf** - Required for GGUF model analysis
+  * Install with: `pip install git+https://github.com/excosy/ik_gguf.git`
+  * `pip install gguf` also works if the new quant types from ik_llama are not used
 
 ### Optional (Recommended)
 * **PyTorch** (`torch`) - **Required if you want automatic GPU detection and selection**
   * Install in your virtual environment: `pip install torch`
   * Without PyTorch, you can still manually configure GPU settings
   * Enables automatic CUDA device detection and system resource information
-* **llama-cpp-python** - **Optional fallback for GGUF model analysis**
-  * Install in your virtual environment: `pip install llama-cpp-python`
-  * Provides enhanced model analysis when llama.cpp tools are unavailable
-  * The launcher works without it using built-in GGUF parsing and llama.cpp tools
 * **psutil** - **Optional for enhanced system information**
   * Provides detailed CPU and RAM information across platforms
   * Install with: `pip install psutil`
@@ -138,7 +137,7 @@ Or follow the [Dependencies](#-dependencies) section above to install dependenci
 
 You'll need to build `llama.cpp or ik_llama` separately and point the launcher to the build directory. Here's an example build configuration:
 
-> **⚠️ Example Environment Disclaimer:** 
+> **⚠️ Example Environment Disclaimer:**
 > The following build example was tested on **Ubuntu 24.04** with **CUDA 12.9** and **GCC 13**. Your build flags may need adjustment based on your system configuration, CUDA version, GCC version, and GPU architecture.
 
 ```bash
@@ -161,7 +160,7 @@ CC=/usr/bin/gcc-13 CXX=/usr/bin/g++-13 cmake .. \
 make -j$(nproc)
 ```
 
-> **📚 Need More Build Help?** 
+> **📚 Need More Build Help?**
 > For additional building guidance, platform-specific instructions, and troubleshooting, refer to the official [llama.cpp documentation](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md).
 
 **Key Build Flags Explained:**
@@ -179,4 +178,4 @@ make -j$(nproc)
 
 ## 🚀 Core Components
 
-This launcher aims to streamline your `llama.cpp` server workflow when working with and testing multiple models while making it more accessible and efficient for both new and experienced users.
\ No newline at end of file
+This launcher aims to streamline your `llama.cpp` server workflow when working with and testing multiple models while making it more accessible and efficient for both new and experienced users.
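The new `gguf` dependency replaces `llama-cpp-python` as the metadata reader. A minimal sanity check of the installed package, assuming the patched `ik_gguf` build exposes the standard `gguf-py` reader API that `system.py` uses below (the model path is a placeholder):

```python
# Sketch: verify the gguf package can read GGUF metadata directly.
# "/path/to/model.gguf" is a placeholder; the key names follow the GGUF
# convention ("general.architecture", "<arch>.block_count") read in system.py.
from gguf import GGUFReader

reader = GGUFReader("/path/to/model.gguf", "r")
arch = n_layers = None
for field in reader.fields.values():
    if field.name == "general.architecture":
        arch = field.contents()
    elif field.name.endswith(".block_count"):
        n_layers = field.contents()
print(f"architecture={arch}, layers={n_layers}")
```
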
diff --git a/llamacpp-server-launcher.py b/llamacpp-server-launcher.py index ea962d3..ee9bfbe 100755 --- a/llamacpp-server-launcher.py +++ b/llamacpp-server-launcher.py @@ -39,10 +39,12 @@ def debug_print(message, force=False): # Import system helper functions from system import ( - get_gpu_info_static, get_ram_info_static, get_cpu_info_static, - analyze_gguf_with_llamacpp_tools, calculate_total_gguf_size, - parse_gguf_header_simple, analyze_gguf_model_static, SystemInfoManager, - LLAMA_CPP_PYTHON_AVAILABLE + get_gpu_info_static, + get_ram_info_static, + get_cpu_info_static, + calculate_total_gguf_size, + analyze_gguf_model_static, + SystemInfoManager, ) @@ -313,6 +315,9 @@ def __init__(self, root: tk.Tk): self.model_architecture_var = tk.StringVar(value="N/A") self.model_filesize_var = tk.StringVar(value="N/A") self.model_total_layers_var = tk.StringVar(value="N/A") # Total layers from GGUF analysis + self.model_context_length_var = tk.StringVar(value="N/A") # Max Context Length + self.model_expert_count_var = tk.StringVar(value="N/A") # Total Expert Count of MoE + self.model_expert_used_count_var = tk.StringVar(value="N/A") # Expert Used Count of MoE self.model_kv_cache_type_var = tk.StringVar(value="N/A") # Current selected KV cache type # --- Recommendation Variables --- @@ -668,7 +673,7 @@ def _setup_main_tab(self, parent): r += 1 # --- Multi-modal Projection (mmproj) --- - self.mmproj_enabled_check = ttk.Checkbutton(inner, variable=self.mmproj_enabled, + self.mmproj_enabled_check = ttk.Checkbutton(inner, variable=self.mmproj_enabled, text="Enable automatic mmproj file detection", state=tk.NORMAL) self.mmproj_enabled_check.grid(column=0, row=r, columnspan=4, sticky="w", padx=10, pady=(3,10)); r += 1 @@ -697,9 +702,9 @@ def _setup_main_tab(self, parent): self.ctx_entry.bind("", self._override_ctx_size) self.ctx_entry.bind("", self._override_ctx_size) - ctx_slider = ttk.Scale(ctx_f, from_=1024, to=131072, orient="horizontal", + self.ctx_slider = ttk.Scale(ctx_f, from_=1024, to=131072, orient="horizontal", variable=self.ctx_size, command=self._update_ctx_label_from_slider) - ctx_slider.grid(column=0, row=0, sticky="ew", padx=(0, 5)) + self.ctx_slider.grid(column=0, row=0, sticky="ew", padx=(0, 5)) self.ctx_label.grid(column=1, row=0, padx=5) self.ctx_entry.grid(column=2, row=0, padx=5) ttk.Button(ctx_f, text="Set", command=self._override_ctx_size, width=4).grid(column=3, row=0, padx=(0, 5)) @@ -909,6 +914,21 @@ def _setup_advanced_tab(self, parent): ttk.Label(inner, textvariable=self.model_total_layers_var)\ .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=3); r += 1 + ttk.Label(inner, text="Context Length:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_context_length_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + + ttk.Label(inner, text="Total Expert Count:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_expert_count_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + + ttk.Label(inner, text="Expert Used Count:")\ + .grid(column=0, row=r, sticky="w", padx=10, pady=3) + ttk.Label(inner, textvariable=self.model_expert_used_count_var)\ + .grid(column=1, row=r, sticky="w", padx=5, pady=3, columnspan=2); r += 1 + ttk.Label(inner, text="Current KV Cache Type:")\ .grid(column=0, row=r, sticky="w", padx=10, pady=3) ttk.Label(inner, textvariable=self.model_kv_cache_type_var)\ @@ -1877,50 +1897,23 @@ def 
_on_model_selected(self, event=None): # Update current KV cache type display immediately # This is called by _update_recommendations, which is triggered below - if LLAMA_CPP_PYTHON_AVAILABLE: - self.gpu_layers_status_var.set("Analyzing model...") - self.gpu_layers_slider.config(state=tk.DISABLED) - # Entry remains enabled, validated state will apply - self._reset_model_info_display() # Reset info fields before analysis starts - self.current_model_analysis = {} # Clear old analysis - self._update_recommendations() # Update recommendations display based on no analysis yet - - # Start analysis thread - if self.analysis_thread and self.analysis_thread.is_alive(): - print("DEBUG: Previous analysis thread is still running, cancelling old analysis.", file=sys.stderr) - # Ideally, you'd have a way to signal the thread to stop. - # For simplicity here, we just let the old thread finish and ignore its result - # if a new analysis starts, by checking self.model_path in _update_ui_after_analysis. - pass # No explicit cancel mechanism here - - self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) - self.analysis_thread.start() - else: - # Analysis not available - check what options we have - backend = self.backend_selection.get() - if backend == "ik_llama": - backend_dir = self.ik_llama_dir.get().strip() - backend_name = "ik_llama" - else: - backend_dir = self.llama_cpp_dir.get().strip() - backend_name = "llama.cpp" - - if backend_dir: - self.gpu_layers_status_var.set(f"Analyzing model using {backend_name} tools...") - # Try the analysis even without llama-cpp-python - self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) - self.analysis_thread.start() - else: - self.gpu_layers_status_var.set(f"Analysis available: Set {backend_name} directory or install llama-cpp-python") - self._reset_gpu_layer_controls(keep_entry_enabled=True) # Keep entry enabled if lib missing - self._reset_model_info_display() - self.model_architecture_var.set("Analysis Unavailable") - self.model_filesize_var.set("Analysis Unavailable") - self.model_total_layers_var.set("Analysis Unavailable") - self.current_model_analysis = {} - self._update_recommendations() # Update recommendations based on no analysis - self._generate_default_config_name() # Generate default name even without analysis - self._update_manual_model_visibility() # Update manual model section visibility + self.gpu_layers_status_var.set("Analyzing model...") + self.gpu_layers_slider.config(state=tk.DISABLED) + # Entry remains enabled, validated state will apply + self._reset_model_info_display() # Reset info fields before analysis starts + self.current_model_analysis = {} # Clear old analysis + self._update_recommendations() # Update recommendations display based on no analysis yet + + # Start analysis thread + if self.analysis_thread and self.analysis_thread.is_alive(): + print("DEBUG: Previous analysis thread is still running, cancelling old analysis.", file=sys.stderr) + # Ideally, you'd have a way to signal the thread to stop. + # For simplicity here, we just let the old thread finish and ignore its result + # if a new analysis starts, by checking self.model_path in _update_ui_after_analysis. 
+ pass # No explicit cancel mechanism here + + self.analysis_thread = Thread(target=self._run_gguf_analysis, args=(full_path_str,), daemon=True) + self.analysis_thread.start() else: @@ -1942,38 +1935,8 @@ def _run_gguf_analysis(self, model_path_str): # Check if the currently selected model in the GUI still matches the one being analyzed # This prevents updating the UI with stale results if the user quickly selects another model if self.model_path.get() == model_path_str: - # Try backend-specific tools first, fall back to llama-cpp-python if available - backend = self.backend_selection.get() - if backend == "ik_llama": - backend_dir = self.ik_llama_dir.get().strip() - backend_name = "ik_llama" - else: - backend_dir = self.llama_cpp_dir.get().strip() - backend_name = "llama.cpp" - - if backend_dir: - print(f"DEBUG: Trying {backend_name} tools from: {backend_dir}", file=sys.stderr) - analysis_result = analyze_gguf_with_llamacpp_tools(model_path_str, backend_dir) - - # If backend tools failed and we have llama-cpp-python available, try that as fallback - if analysis_result.get("error") and LLAMA_CPP_PYTHON_AVAILABLE: - print(f"DEBUG: {backend_name} tools failed, falling back to llama-cpp-python", file=sys.stderr) - analysis_result = analyze_gguf_model_static(model_path_str) - else: - # No backend directory set, try simple GGUF parser first - print(f"DEBUG: No {backend_name} directory set, trying simple GGUF parser", file=sys.stderr) - analysis_result = parse_gguf_header_simple(model_path_str) - - # If simple parser failed and we have llama-cpp-python available, try that as fallback - if analysis_result.get("error") and LLAMA_CPP_PYTHON_AVAILABLE: - print("DEBUG: Simple GGUF parser failed, falling back to llama-cpp-python", file=sys.stderr) - analysis_result = analyze_gguf_model_static(model_path_str) - - # Only update UI if the model path hasn't changed while analyzing - if self.model_path.get() == model_path_str: - self.root.after(0, self._update_ui_after_analysis, analysis_result) - else: - print(f"DEBUG: Analysis for {model_path_str} finished, but model selection changed. Discarding result.", file=sys.stderr) + analysis_result = analyze_gguf_model_static(model_path_str) + self.root.after(0, self._update_ui_after_analysis, analysis_result) else: print(f"DEBUG: Analysis started for {model_path_str}, but model selection changed before analysis began. 
Skipping.", file=sys.stderr) @@ -2099,6 +2062,20 @@ def _update_ui_after_analysis(self, analysis_result): # This will set the slider and potentially update the entry format (-1 vs number) self._sync_gpu_layers_from_entry() + if int(context_length := analysis_result.get("context_length")) > 0: + self.model_context_length_var.set(context_length) + context_length = int(context_length) + self.ctx_slider.config(to=context_length) + if self.ctx_size.get() > context_length: + self.ctx_size.set(context_length) + self._sync_ctx_display(context_length) + + if int(expert_count := analysis_result.get("expert_count")) > 0: + self.model_expert_count_var.set(expert_count) + + if int(expert_used_count := analysis_result.get("expert_used_count")) > 0: + self.model_expert_used_count_var.set(expert_used_count) + # --- Update Recommendations based on new analysis --- self._update_recommendations() @@ -2142,6 +2119,9 @@ def _reset_model_info_display(self): self.model_architecture_var.set("N/A") self.model_filesize_var.set("N/A") self.model_total_layers_var.set("N/A") + self.model_context_length_var.set("N/A") + self.model_expert_count_var.set("N/A") + self.model_expert_used_count_var.set("N/A") # KV Cache Type display is linked to the variable, not reset here diff --git a/requirements.txt b/requirements.txt index c3d9349..5e852ab 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Required dependencies requests>=2.31.0 # Used in about_tab.py for version checking +git+https://github.com/excosy/ik_gguf.git # patched gguf-py, for GGUF model analysis # Optional but recommended dependencies torch>=2.0.0 # Required for GPU detection and selection -llama-cpp-python # Optional fallback for GGUF model analysis -psutil>=5.9.0 # Optional for enhanced system information \ No newline at end of file +psutil>=5.9.0 # Optional for enhanced system information diff --git a/system.py b/system.py index c76a39c..c724f3c 100644 --- a/system.py +++ b/system.py @@ -8,6 +8,7 @@ import ctypes import struct from pathlib import Path +from gguf import GGUFReader, GGUFValueType # Add debug prints for Python environment print("\n=== Python Environment Debug Info ===", file=sys.stderr) @@ -29,20 +30,6 @@ torch = None print(f"Warning: PyTorch import failed: {e}", file=sys.stderr) - -try: - # Import Llama only if llama_cpp_python is importable - from llama_cpp import Llama - LLAMA_CPP_PYTHON_AVAILABLE = True -except ImportError: - LLAMA_CPP_PYTHON_AVAILABLE = False - Llama = None # Define Llama as None if import fails -except Exception as e: - # Catch other potential issues during llama_cpp import (e.g., libllama.so not found) - LLAMA_CPP_PYTHON_AVAILABLE = False - Llama = None - print(f"Warning: llama-cpp-python import failed: {e}", file=sys.stderr) - # Check for requests module (required for version checking) try: import requests @@ -74,10 +61,6 @@ if not TORCH_AVAILABLE: MISSING_DEPS.append("PyTorch (required for GPU detection and CUDA features)") -# Optional dependencies -if not LLAMA_CPP_PYTHON_AVAILABLE: - MISSING_DEPS.append("llama-cpp-python (optional - provides fallback GGUF analysis if llama.cpp tools unavailable)") - if not PSUTIL_AVAILABLE: MISSING_DEPS.append("psutil (optional - provides enhanced system information)") @@ -95,6 +78,13 @@ # ═════════════════════════════════════════════════════════════════════ # Helper Functions (These remain outside the class as they don't need 'self') # ═════════════════════════════════════════════════════════════════════ +def get_file_host_endian(reader: GGUFReader) -> 
tuple[str, str]: + file_endian = reader.endianess.name + if reader.byte_order == 'S': + host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE' + else: + host_endian = file_endian + return (host_endian, file_endian) def get_gpu_info_with_venv(venv_path=None): """Get GPU information using PyTorch, optionally from a virtual environment.""" @@ -110,9 +100,9 @@ def get_gpu_info_from_venv(venv_path): import subprocess import json from pathlib import Path - + venv_path = Path(venv_path) - + # Determine the Python executable in the venv if sys.platform == "win32": python_exe = venv_path / "Scripts" / "python.exe" @@ -122,12 +112,12 @@ def get_gpu_info_from_venv(venv_path): python_exe = venv_path / "bin" / "python" if not python_exe.exists(): python_exe = venv_path / "python" # Some venv structures - + if not python_exe.exists(): print(f"DEBUG: Python executable not found in venv: {venv_path}", file=sys.stderr) # Fall back to current process detection return get_gpu_info_static() - + # Create a small Python script to check for PyTorch/CUDA in the venv detection_script = ''' import sys @@ -136,7 +126,7 @@ def get_gpu_info_from_venv(venv_path): try: import torch torch_available = torch.cuda.is_available() - + if torch_available: device_count = torch.cuda.device_count() gpu_info = { @@ -144,7 +134,7 @@ def get_gpu_info_from_venv(venv_path): "device_count": device_count, "devices": [] } - + for i in range(device_count): props = torch.cuda.get_device_properties(i) gpu_info["devices"].append({ @@ -155,7 +145,7 @@ def get_gpu_info_from_venv(venv_path): "compute_capability": f"{props.major}.{props.minor}", "multi_processor_count": props.multi_processor_count }) - + print(json.dumps(gpu_info)) else: print(json.dumps({"available": False, "message": "CUDA not available via PyTorch in venv", "device_count": 0, "devices": []})) @@ -165,7 +155,7 @@ def get_gpu_info_from_venv(venv_path): except Exception as e: print(json.dumps({"available": False, "message": f"Error in venv GPU detection: {e}", "device_count": 0, "devices": []})) ''' - + try: print(f"DEBUG: Running GPU detection in venv: {venv_path}", file=sys.stderr) # Run the detection script in the virtual environment @@ -175,14 +165,14 @@ def get_gpu_info_from_venv(venv_path): text=True, timeout=30 ) - + if result.returncode == 0: try: output = result.stdout.strip() if not output: print("DEBUG: Venv GPU detection returned empty output", file=sys.stderr) return _create_fallback_gpu_info("Empty output from venv detection") - + gpu_info = json.loads(output) print(f"DEBUG: Venv GPU detection successful: {gpu_info.get('device_count', 0)} devices", file=sys.stderr) return gpu_info @@ -194,7 +184,7 @@ def get_gpu_info_from_venv(venv_path): error_msg = result.stderr.strip() if result.stderr else "Unknown error" print(f"DEBUG: Venv GPU detection failed with return code {result.returncode}", file=sys.stderr) print(f"DEBUG: Error output: {error_msg}", file=sys.stderr) - + # Check for specific error types if "ModuleNotFoundError" in error_msg or "ImportError" in error_msg: return _create_fallback_gpu_info("Required modules not found in venv") @@ -202,7 +192,7 @@ def get_gpu_info_from_venv(venv_path): return _create_fallback_gpu_info("CUDA error in venv") else: return _create_fallback_gpu_info(f"Venv detection failed: {error_msg}") - + except subprocess.TimeoutExpired: print("DEBUG: Venv GPU detection timed out after 30 seconds", file=sys.stderr) return _create_fallback_gpu_info("Detection timeout") @@ -220,16 +210,16 @@ def _create_fallback_gpu_info(reason): 
"""Create fallback GPU info with specific reason, then try current process detection.""" print(f"DEBUG: Creating fallback GPU info due to: {reason}", file=sys.stderr) print("DEBUG: Attempting current process GPU detection as fallback", file=sys.stderr) - + # Try current process detection as fallback fallback_info = get_gpu_info_static() - + # If current process detection also fails, return a clear error message if not fallback_info.get('available', False): fallback_info['message'] = f"Venv detection failed ({reason}), current process also failed" else: fallback_info['message'] = f"Using current process (venv failed: {reason})" - + return fallback_info def get_gpu_info_static(): @@ -372,169 +362,36 @@ def get_cpu_info_static(): print(f"Failed to get CPU info: {str(e)}", file=sys.stderr) return {"error": f"Failed to get CPU info: {str(e)}", "logical_cores": 4, "physical_cores": 2} - -def analyze_gguf_with_llamacpp_tools(model_path_str, llama_cpp_dir=None): - """Analyze GGUF model using llama.cpp/ik_llama tools instead of llama-cpp-python.""" - import subprocess - import json - from pathlib import Path - - model_path = Path(model_path_str) - if not model_path.is_file(): - return {"error": f"Model file not found: {model_path}", "n_layers": None, "architecture": "N/A", "file_size_bytes": 0} - - # Calculate total size across all shards if this is a multi-part file - total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - - analysis_result = { - "path": str(model_path), - "file_size_bytes": total_size_bytes, - "file_size_gb": round(total_size_bytes / (1024**3), 2), - "architecture": "unknown", - "n_layers": None, - "metadata": {}, - "error": None, - "message": None, - "shard_count": shard_count, - "all_shards": [str(p) for p in all_shards] - } - - # Try to find analysis tools from both llama.cpp and ik_llama - tools_to_try = [] - - if llama_cpp_dir: - backend_base_dir = Path(llama_cpp_dir) - # Common locations for llama.cpp/ik_llama tools - search_paths = [ - backend_base_dir, - backend_base_dir / "build" / "bin", - backend_base_dir / "build", - backend_base_dir / "bin", - ] - - # Tool names for both llama.cpp and ik_llama backends - tool_names = [ - # Standard llama.cpp tools - "llama-inspect", "gguf-dump", "llama-server", - # Possible ik_llama specific tools - "ik-llama-inspect", "ik_llama_inspect", "ik-llama-server", "ik_llama_server" - ] - if sys.platform == "win32": - tool_names = [name + ".exe" for name in tool_names] - - for search_path in search_paths: - for tool_name in tool_names: - tool_path = search_path / tool_name - if tool_path.is_file(): - tools_to_try.append((str(tool_path), tool_name.replace(".exe", ""))) - - # Try each tool - for tool_path, tool_name in tools_to_try: - try: - if tool_name == "llama-inspect": - # llama-inspect typically outputs JSON or structured data - result = subprocess.run([tool_path, str(model_path)], - capture_output=True, text=True, timeout=30) - if result.returncode == 0: - # Parse the output (this would need to be adapted based on actual llama-inspect output format) - output = result.stdout.strip() - # Try to extract key information from the output - if "layers" in output.lower() or "block_count" in output.lower(): - # Parse layer count from output - for line in output.split('\n'): - if 'block_count' in line.lower() or 'n_layers' in line.lower(): - try: - # Extract number from line (this is a simple approach, may need refinement) - import re - numbers = re.findall(r'\d+', line) - if numbers: - analysis_result["n_layers"] 
= int(numbers[-1]) - break - except: - pass - - # Extract architecture if possible - for line in output.split('\n'): - if 'architecture' in line.lower(): - # Simple extraction - would need refinement based on actual output format - parts = line.split() - if len(parts) > 1: - analysis_result["architecture"] = parts[-1] - break - - analysis_result["message"] = f"Analyzed using {tool_name}" - return analysis_result - - elif tool_name == "llama-server": - # Try to get model info from llama-server without starting it - # Some versions support --model-info or similar flags - for flag in ["--model-info", "--print-model-info", "--help"]: - try: - result = subprocess.run([tool_path, flag, "-m", str(model_path)], - capture_output=True, text=True, timeout=10) - if result.returncode == 0 and ("layer" in result.stdout.lower() or "block" in result.stdout.lower()): - # Similar parsing as above - output = result.stdout.strip() - for line in output.split('\n'): - if 'block' in line.lower() or 'layer' in line.lower(): - try: - import re - numbers = re.findall(r'\d+', line) - if numbers: - analysis_result["n_layers"] = int(numbers[-1]) - analysis_result["message"] = f"Analyzed using {tool_name}" - return analysis_result - except: - pass - break - except: - continue - - except subprocess.TimeoutExpired: - continue - except Exception as e: - print(f"DEBUG: Tool {tool_name} failed: {e}", file=sys.stderr) - continue - - # If no tools worked, try simple GGUF header parsing - try: - return parse_gguf_header_simple(model_path_str) - except Exception as e: - analysis_result["error"] = f"All analysis methods failed. Last error: {e}" - analysis_result["message"] = "Could not analyze model with available tools" - return analysis_result - - def calculate_total_gguf_size(model_path_str): """Calculate total size across all GGUF shards if this is a multi-part file.""" import re from pathlib import Path - + model_path = Path(model_path_str) - + # Check if this looks like a multi-part GGUF file (e.g., "00001-of-00003.gguf") shard_pattern = re.search(r'-(\d+)-of-(\d+)\.gguf$', model_path.name, re.IGNORECASE) if not shard_pattern: # Not a multi-part file, return single file size return model_path.stat().st_size, 1, [model_path] - + current_shard = int(shard_pattern.group(1)) total_shards = int(shard_pattern.group(2)) - + print(f"DEBUG: Detected multi-part GGUF: shard {current_shard} of {total_shards}", file=sys.stderr) - + # Find all related shard files base_name = model_path.name[:shard_pattern.start()] # Everything before "-00001-of-00003.gguf" parent_dir = model_path.parent - + total_size = 0 found_shards = [] missing_shards = [] - + for shard_num in range(1, total_shards + 1): shard_name = f"{base_name}-{shard_num:05d}-of-{total_shards:05d}.gguf" shard_path = parent_dir / shard_name - + if shard_path.exists(): shard_size = shard_path.stat().st_size total_size += shard_size @@ -543,252 +400,74 @@ def calculate_total_gguf_size(model_path_str): else: missing_shards.append(shard_name) print(f"DEBUG: Missing shard {shard_num}: {shard_name}", file=sys.stderr) - + if missing_shards: print(f"WARNING: Missing {len(missing_shards)} shards: {missing_shards}", file=sys.stderr) # Return what we found, but note it's incomplete return total_size, len(found_shards), found_shards - + print(f"DEBUG: Total size across {total_shards} shards: {total_size / (1024**3):.2f} GB", file=sys.stderr) return total_size, total_shards, found_shards - -def parse_gguf_header_simple(model_path_str): - """Simple GGUF header parser to extract basic metadata 
without dependencies.""" - import struct - from pathlib import Path - - model_path = Path(model_path_str) - - # Calculate total size across all shards if this is a multi-part file - total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - - analysis_result = { - "path": str(model_path), - "file_size_bytes": total_size_bytes, - "file_size_gb": round(total_size_bytes / (1024**3), 2), - "architecture": "unknown", - "n_layers": None, - "metadata": {}, - "error": None, - "message": f"Analyzed using simple GGUF parser ({shard_count} shard{'s' if shard_count != 1 else ''})", - "shard_count": shard_count, - "all_shards": [str(p) for p in all_shards] +def dump_gguf_metadata(model_path_str, json_array = False, no_tensors = False): + """ + refer https://github.com/ggml-org/llama.cpp/blob/main/gguf-py/gguf/scripts/gguf_dump.py + json_array: Include full array values in JSON output (long) + no-tensors: Don't dump tensor metadata + """ + + reader = GGUFReader(model_path_str, 'r') + host_endian, file_endian = get_file_host_endian(reader) + metadata: dict[str, Any] = {} + tensors: dict[str, Any] = {} + result = { + "filename": Path(model_path_str).stem, + "endian": file_endian, + "metadata": metadata, + "tensors": tensors, } - - try: - with open(model_path, 'rb') as f: - # Read GGUF magic number - magic = f.read(4) - if magic != b'GGUF': - analysis_result["error"] = "Not a valid GGUF file" - return analysis_result - - # Read version - version = struct.unpack(' 100000: # Sanity check for string length - print(f"DEBUG: Skipping large string for key '{key}': {value_len} bytes", file=sys.stderr) - f.seek(f.tell() + value_len) # Skip the string - continue - value_bytes = f.read(value_len) - if len(value_bytes) < value_len: - break # End of file - value = value_bytes.decode('utf-8', errors='replace') - except Exception as e: - print(f"DEBUG: Failed to read string for key '{key}': {e}", file=sys.stderr) - continue - elif value_type == 4: # Uint32 - try: - value_bytes = f.read(4) - if len(value_bytes) < 4: - break - value = struct.unpack(' 100000: # Sanity check - break - f.seek(f.tell() + str_len) - except: - break - elif array_type == 4: # Uint32 array - f.seek(f.tell() + array_len * 4) - elif array_type == 6: # Uint64 array - f.seek(f.tell() + array_len * 8) - elif array_type == 7: # Float32 array - f.seek(f.tell() + array_len * 4) - else: - # Unknown array type, try to skip conservatively - print(f"DEBUG: Unknown array type {array_type}, attempting to skip", file=sys.stderr) - break - except Exception as e: - print(f"DEBUG: Failed to skip array for key '{key}': {e}", file=sys.stderr) - break - continue - else: - print(f"DEBUG: Unknown value type {value_type} for key '{key}', skipping", file=sys.stderr) - continue - - if value is not None: - analysis_result["metadata"][key] = value - - # Extract key information - if key == "general.architecture": - analysis_result["architecture"] = str(value) - elif any(pattern in key.lower() for pattern in ['.block_count', '.n_layers', '.layer_count', '.num_layer']): - if isinstance(value, (int, float)) and value > 0: - analysis_result["n_layers"] = int(value) - print(f"DEBUG: Found layer count in key '{key}': {analysis_result['n_layers']}", file=sys.stderr) - - # Also check for model name patterns that might indicate layer count - if 'name' in key.lower() and isinstance(value, str): - # Try to extract parameter count from model name (e.g., "235B" -> estimate layers) - import re - param_match = re.search(r'(\d+)B', value.upper()) - if 
param_match and analysis_result["n_layers"] is None: - param_count = int(param_match.group(1)) - # Rough estimate: large models typically have ~80-120 layers per billion parameters - # For very large models like 235B, use a more conservative estimate - if param_count >= 100: - estimated_layers = max(80, min(200, param_count // 2)) # Very conservative for huge models - else: - estimated_layers = max(32, min(120, param_count * 2)) # More typical estimate - print(f"DEBUG: Estimated {estimated_layers} layers from model name '{value}' ({param_count}B parameters)", file=sys.stderr) - analysis_result["n_layers"] = estimated_layers - - except Exception as parse_error: - # Skip this metadata entry if parsing fails - print(f"DEBUG: Failed to parse metadata entry: {parse_error}", file=sys.stderr) - continue - - # If we still don't have layer count, make a reasonable guess based on file size - if analysis_result["n_layers"] is None: - file_size_gb = analysis_result["file_size_gb"] - if file_size_gb > 100: # Very large model (100+ GB) - estimated_layers = 120 # Conservative estimate for huge models - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on very large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 50: # Large model (50-100 GB) - estimated_layers = 80 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 20: # Medium-large model (20-50 GB) - estimated_layers = 60 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on medium-large file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 10: # Medium model (10-20 GB) - estimated_layers = 40 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on medium file size ({file_size_gb:.1f} GB)", file=sys.stderr) - elif file_size_gb > 3: # Small-medium model (3-10 GB) - estimated_layers = 32 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on small-medium file size ({file_size_gb:.1f} GB)", file=sys.stderr) - else: # Small model (< 3 GB) - estimated_layers = 24 - print(f"DEBUG: No layer count found, estimating {estimated_layers} layers based on small file size ({file_size_gb:.1f} GB)", file=sys.stderr) - - analysis_result["n_layers"] = estimated_layers - analysis_result["message"] += f" (estimated {estimated_layers} layers from file size)" - - return analysis_result - - except Exception as e: - analysis_result["error"] = f"Failed to parse GGUF header: {e}" - return analysis_result - + for idx, field in enumerate(reader.fields.values()): + curr: dict[str, Any] = { + "index": idx, + "type": field.types[0].name if field.types else 'UNKNOWN', + "offset": field.offset, + } + metadata[field.name] = curr + if field.types[:1] == [GGUFValueType.ARRAY]: + curr["array_types"] = [t.name for t in field.types][1:] + if not json_array: + continue + curr["value"] = field.contents() + else: + curr["value"] = field.contents() + if not no_tensors: + for idx, tensor in enumerate(reader.tensors): + tensors[tensor.name] = { + "index": idx, + "shape": tensor.shape.tolist(), + "type": tensor.tensor_type.name, + "offset": tensor.field.offset, + } + return result def analyze_gguf_model_static(model_path_str): """Analyze a GGUF model file and extract metadata (static method).""" - if not LLAMA_CPP_PYTHON_AVAILABLE or not Llama: - return {"error": "llama-cpp-python library not found.", "n_layers": None, 
"architecture": "N/A", "file_size_bytes": 0} - model_path = Path(model_path_str) if not model_path.is_file(): return {"error": f"Model file not found: {model_path}", "n_layers": None, "architecture": "N/A", "file_size_bytes": 0} # Calculate total size across all shards if this is a multi-part file total_size_bytes, shard_count, all_shards = calculate_total_gguf_size(model_path_str) - + analysis_result = { "path": str(model_path), "file_size_bytes": total_size_bytes, "file_size_gb": round(total_size_bytes / (1024**3), 2), "architecture": "unknown", "n_layers": None, + "expert_count": 0, + "expert_used_count": 0, + "context_length": 0, "metadata": {}, "error": None, "message": None, @@ -796,102 +475,38 @@ def analyze_gguf_model_static(model_path_str): "all_shards": [str(p) for p in all_shards] } - llm_meta = None # Initialize llm_meta outside the try block + llm_meta = {} # Initialize llm_meta outside the try block try: - # File size already calculated from shards above, no need to recalculate - - try: - # Use minimal parameters for quick metadata load - # Setting n_gpu_layers=0 is important to avoid trying to load layers onto potentially unavailable GPUs - # Using minimal n_ctx, n_batch, n_threads for minimal resource usage - llm_meta = Llama(model_path=str(model_path), n_ctx=32, n_threads=1, n_batch=32, - verbose=False, n_gpu_layers=0, logits_all=False) # logits_all=False to save memory/time - except Exception as load_exc: - analysis_result["error"] = f"Failed to load model for metadata analysis: {load_exc}" - print(f"ERROR: Failed to load model '{model_path.name}' for analysis: {load_exc}", file=sys.stderr) - # No traceback here, the caller should handle it or the log message is enough - return analysis_result # Exit early if basic load fails - - - # --- Extract Metadata --- - # Attempt 1: Check common metadata keys and attributes - if hasattr(llm_meta, 'metadata') and isinstance(llm_meta.metadata, dict) and llm_meta.metadata: - analysis_result["metadata"] = llm_meta.metadata - # Check various common keys for layer count - analysis_result["n_layers"] = llm_meta.metadata.get('llama.block_count') - if analysis_result["n_layers"] is None: - analysis_result["n_layers"] = llm_meta.metadata.get('general.architecture.block_count') - # Add more architecture-specific keys as needed - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('qwen2.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('gemma.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('bert.block_count') - if analysis_result["n_layers"] is None: analysis_result["n_layers"] = llm_meta.metadata.get('model.block_count') # Fallback - - - # Check various common keys for architecture - analysis_result["architecture"] = llm_meta.metadata.get('general.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('qwen2.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('gemma.architecture', 'unknown') - if analysis_result["architecture"] == 'unknown': analysis_result["architecture"] = llm_meta.metadata.get('bert.architecture', 'unknown') - - - # Attempt 2: Check direct attributes if metadata didn't yield layers - if analysis_result["n_layers"] is None: - if hasattr(llm_meta, 'n_layer'): # Common recent name - analysis_result["n_layers"] = 
getattr(llm_meta, 'n_layer', None) - if analysis_result["n_layers"] is None and hasattr(llm_meta, 'n_layers'): # Older name - analysis_result["n_layers"] = getattr(llm_meta, 'n_layers', None) - - # Fallback architecture if still unknown (less reliable) - if analysis_result["architecture"] == 'unknown' and hasattr(llm_meta, 'model_type'): - analysis_result["architecture"] = getattr(llm_meta, 'model_type', 'unknown') - - - # Validate n_layers - if analysis_result["n_layers"] is not None: - try: - analysis_result["n_layers"] = int(analysis_result["n_layers"]) - if analysis_result["n_layers"] <= 0: - analysis_result["n_layers"] = None # Treat non-positive as unknown - analysis_result["message"] = "Layer count found was not positive." - except (ValueError, TypeError): - analysis_result["n_layers"] = None # Treat non-integer as unknown - analysis_result["message"] = "Layer count metadata found was not an integer." - - if analysis_result["n_layers"] is None: - if not analysis_result["message"]: # If no specific message set above - analysis_result["message"] = "Layer count metadata not found or recognized." - - - return analysis_result - - except Exception as e: - # Catch any other unexpected errors during metadata processing or attribute access - print(f"ERROR: Failed during GGUF metadata processing for '{model_path.name}': {e}", file=sys.stderr) - traceback.print_exc(file=sys.stderr) - analysis_result["error"] = f"Unexpected error during analysis: {e}" - analysis_result["n_layers"] = None # Ensure layers is None on error - return analysis_result - finally: - # Ensure the temporary Llama object is deleted if it was created - if llm_meta: - try: - del llm_meta - except Exception as clean_exc: - print(f"Warning: Failed to delete llama_cpp.Llama instance in analyze_gguf_model_static finally block: {clean_exc}", file=sys.stderr) + llm_meta = dump_gguf_metadata(model_path_str, no_tensors = True)["metadata"] + except Exception as load_exc: + analysis_result["error"] = f"Failed to load model for metadata analysis: {load_exc}" + print(f"ERROR: Failed to load model '{model_path.name}' for analysis: {load_exc}", file=sys.stderr) + # No traceback here, the caller should handle it or the log message is enough + return analysis_result # Exit early if basic load fails + + # --- Extract Metadata --- + # Attempt 1: Check common metadata keys and attributes + for k,v in llm_meta.items(): + if ".block_count" in k: analysis_result["n_layers"] = llm_meta.get(k)["value"] + if ".architecture" in k: analysis_result["architecture"] = llm_meta.get(k)["value"] + if ".context_length" in k: analysis_result["context_length"] = llm_meta.get(k)["value"] + if ".expert_count" in k: analysis_result["expert_count"] = llm_meta.get(k)["value"] + if ".expert_used_count" in k: analysis_result["expert_used_count"] = llm_meta.get(k)["value"] + + return analysis_result class SystemInfoManager: """Manages system information fetching and processing for the launcher.""" - + def __init__(self, launcher_instance): """Initialize with reference to the main launcher instance.""" self.launcher = launcher_instance - + def fetch_system_info(self): """Fetches GPU, RAM, and CPU info and populates class attributes.""" print("Fetching system info...", file=sys.stderr) - + # Get the configured virtual environment path from the launcher venv_path = None if hasattr(self.launcher, 'venv_dir'): @@ -901,7 +516,7 @@ def fetch_system_info(self): print(f"DEBUG: Using configured venv for GPU detection: {venv_path}", file=sys.stderr) else: print("DEBUG: No venv 
configured, using current process for GPU detection", file=sys.stderr) - + self.launcher.gpu_info = get_gpu_info_with_venv(venv_path) self.launcher.ram_info = get_ram_info_static() self.launcher.cpu_info = get_cpu_info_static() # Fetch CPU info here @@ -929,4 +544,4 @@ def fetch_system_info(self): self.launcher.recommended_threads_batch_var.set(f"Recommended: {self.launcher.logical_cores} (Your CPU logical cores)") # Display initial GPU detection status message - self.launcher.gpu_detected_status_var.set(self.launcher.gpu_info['message'] if not self.launcher.gpu_info['available'] and self.launcher.gpu_info.get('message') else "") \ No newline at end of file + self.launcher.gpu_detected_status_var.set(self.launcher.gpu_info['message'] if not self.launcher.gpu_info['available'] and self.launcher.gpu_info.get('message') else "")
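For reference, a rough sketch of how the slimmed-down analysis entry point can be exercised outside the GUI (the field names mirror the dict built in `analyze_gguf_model_static` above; the model path is a placeholder):

```python
# Sketch only: call the new gguf-based analysis path directly.
# Keys match the result dict constructed in analyze_gguf_model_static();
# "/path/to/model.gguf" is a placeholder.
from system import analyze_gguf_model_static

info = analyze_gguf_model_static("/path/to/model.gguf")
if info["error"]:
    print(f"Analysis failed: {info['error']}")
else:
    print(f"arch={info['architecture']}  layers={info['n_layers']}  "
          f"size={info['file_size_gb']} GB")
    print(f"max context={info['context_length']}  "
          f"experts={info['expert_count']} (used: {info['expert_used_count']})")
```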