diff --git a/.flake8 b/.flake8
index 669d231f1f6..16de5486797 100644
--- a/.flake8
+++ b/.flake8
@@ -15,4 +15,5 @@ exclude =
     build,
     # This contains builds that we don't want to check
     dist  # This is generated with `python build .` for package releases
+    scripts/tune
 # max-complexity = 10
diff --git a/pyrightconfig.json b/pyrightconfig.json
index 5320fe5864a..d7b32ae2a93 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -19,4 +19,7 @@
       "pythonVersion": "3.10",
     },
   ],
+  "exclude": [
+    "scripts/tune"
+  ]
 }
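
The tune.py script introduced below is the shared driver: it maps every tunable llama.cpp flag to a scikit-optimize Categorical dimension and lets gp_minimize propose flag combinations, re-running a llama.cpp binary for each one. A minimal sketch of that pattern, with a toy objective standing in for the binary launch (the dimensions and values here are illustrative only, not taken from the diff):

    # Toy version of the search loop that tune.py builds on: categorical flag
    # values go in, a scalar metric to minimize comes out. The objective below
    # is a stand-in for "run llama-cli and measure eval time".
    from skopt import gp_minimize
    from skopt.space import Categorical

    dimensions = [
        Categorical(["-t 4", "-t 8", "-t 16"], name="--threads"),
        Categorical(["-c 512", "-c 2048", "-c 4096"], name="--ctx-size"),
    ]

    def toy_objective(x):
        # x holds one sampled value per dimension, e.g. ["-t 8", "-c 2048"]
        return abs(len(" ".join(x)) - 12) * 0.1

    result = gp_minimize(toy_objective, dimensions, n_calls=15,
                         random_state=42, initial_point_generator="lhs")
    print(result.x, result.fun)
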
diff --git a/scripts/tune/tune.py b/scripts/tune/tune.py
new file mode 100644
index 00000000000..eff17d30636
--- /dev/null
+++ b/scripts/tune/tune.py
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
+Optimize runtime parameters for a llama.cpp binary using eval time measurements.
+Usage: python tune_tps.py --model /path/to/model.gguf --ngl <max-gpu-layers>
+"""
+import os
+import time
+import argparse
+from functools import partial
+
+import numpy as np
+# pip install scikit-optimize
+from skopt import gp_minimize, expected_minimum
+from skopt.plots import plot_objective, plot_convergence
+from skopt.space import Categorical
+import matplotlib.pyplot as plt
+import json
+
+BAD_CONFIGURATIONS = []
+
+# Progress tracking global variables
+progress_start_time = None
+progress_current_call = 0
+progress_total_calls = 0
+progress_best_score = float('inf')
+
+def display_progress():
+    """Display current optimization progress with time estimates."""
+    global progress_start_time, progress_current_call, progress_total_calls, progress_best_score
+
+    if progress_start_time is None:
+        return
+
+    elapsed_time = time.time() - progress_start_time
+    if progress_current_call > 0:
+        avg_time_per_call = elapsed_time / progress_current_call
+        remaining_calls = progress_total_calls - progress_current_call
+        estimated_remaining_time = avg_time_per_call * remaining_calls
+
+        progress_percent = (progress_current_call / progress_total_calls) * 100
+
+        print(f"\n{'='*60}")
+        print("OPTIMIZATION PROGRESS")
+        print(f"{'='*60}")
+        print(f"Iteration: {progress_current_call}/{progress_total_calls} ({progress_percent:.1f}%)")
+        print(f"Elapsed time: {elapsed_time:.1f}s")
+        print(f"Est. remaining time: {estimated_remaining_time:.1f}s")
+        print(f"Best metric so far: {progress_best_score:.4f}")
+        print(f"{'='*60}\n")
+
+def run_iterations(get_opts_fn, run_binary_fn, run_options, model_path, binary_path="./build/bin/llama-cli", iterations=1):
+    """Run the llama.cpp binary with the specified options and return the mean metric."""
+    try:
+        run_options_str = get_opts_fn(run_options, model_path, binary_path)
+        print(run_options_str)
+
+        results = []
+
+        # Run the test (can increase iterations for more stable results)
+        for _ in range(iterations):
+            results.append(run_binary_fn(run_options_str))
+
+        # Return the measured metric as the objective (we want to minimize this)
+        return np.mean(results)
+
+    except Exception as e:
+        BAD_CONFIGURATIONS.append(run_options)
+        print("ERROR:", e, run_options)
+        print("BAD_CONFIGURATIONS:", BAD_CONFIGURATIONS)
+        return 1000  # High penalty for failed runs
+
+
+def optimize_runtime_with_progress(x, get_opts_fn, run_binary_fn, run_options_list, model_path, llama_simple_path):
+    """Objective function for optimization with progress tracking."""
+    global progress_current_call, progress_best_score
+
+    progress_current_call += 1
+
+    run_options = {
+        run_options_list[i][0]: run_options_list[i][1][run_options_list[i][1].index(x[i])]
+        for i in range(len(run_options_list))
+    }
+
+    result = run_iterations(get_opts_fn, run_binary_fn, run_options, model_path, llama_simple_path)
+
+    # Update best score
+    if result < progress_best_score:
+        progress_best_score = result
+
+    # Display progress every call
+    display_progress()
+
+    return result
+
+
+def load_cache(cache_filename):
+    """Load cached optimization results."""
+    try:
+        with open(cache_filename, "r") as cache_file:
+            cache_data = json.load(cache_file)
+            return cache_data["x0"], cache_data["y0"]
+    except (OSError, json.JSONDecodeError, KeyError):
+        pass
+    return None, None
+
+
+def save_cache(cache_filename, x0, y0):
+    """Save optimization results to cache."""
+    # Convert numpy int64 objects to Python int objects
+    x0 = [[int(item) if isinstance(item, np.int64) else item for item in sublist] for sublist in x0]
+    y0 = [int(item) if isinstance(item, np.int64) else item for item in y0]
+
+    cache_data = {"x0": x0, "y0": y0}
+    with open(cache_filename, "w") as cache_file:
+        json.dump(cache_data, cache_file)
+
+
+def plot_iterations(result):
+    """Plot optimization iterations."""
+    search_space = result.space
+    x_iters = result.x_iters
+    func_vals = result.func_vals
+    search_space_names = [dim.name for dim in search_space]
+    opts = search_space_names + ["objective_r"]
+
+    num_params = len(opts) + 1
+    fig, axs = plt.subplots(num_params, figsize=(8, num_params * 8), sharex=True)
+    iterations = list(range(1, len(x_iters) + 1))
+
+    for i, param in enumerate(opts):
+        if param == "objective_r":
+            param_values = func_vals
+        else:
+            param_index = search_space_names.index(param)
+            param_values = [x[param_index] for x in x_iters]
+
+        axs[i].scatter(iterations, param_values)
+        axs[i].set_xlabel("Iteration")
+        axs[i].set_ylabel(param)
+
+    plot_convergence(result, true_minimum=0, ax=axs[-1])
+    return axs
+
+def parse_args(default_bin):
+    parser = argparse.ArgumentParser(description='Optimize llama.cpp runtime parameters')
+    parser.add_argument('--model', '-m', required=True, help='Path to the GGUF model file')
+    parser.add_argument('--ngl', type=int, required=True, help='Max number of GPU layers')
+    parser.add_argument('--llama-binary', default=default_bin,
+                        help=f'Path to the llama.cpp binary (default: {default_bin})')
+    parser.add_argument('--n-calls', type=int, default=50,
+                        help='Number of optimization calls (default: 50)')
+    parser.add_argument('--cache', default='cache_simple.json',
+                        help='Cache file name (default: cache_simple.json)')
+    parser.add_argument('--single-execution', type=str,
+                        help='Run single execution with specified options (format: "--param1=value1 --param2=value2")')
+
+    args = parser.parse_args()
+    return args
+
+def main(args, get_opts_fn, run_binary_fn, run_options_list):
+
+    # Check if the llama.cpp binary exists
+    if not os.path.exists(args.llama_binary):
+        print(f"Error: llama.cpp binary not found at {args.llama_binary}")
+        print("Please build llama.cpp first or specify the correct path with --llama-binary")
+        return
+
+    # Check if model exists
+    if not os.path.exists(args.model):
+        print(f"Error: Model file not found at {args.model}")
+        return
+
+    # Handle single execution mode
+    if args.single_execution:
+        try:
+            print("Single execution")
+            run_options = args.single_execution
+            run_iterations(get_opts_fn, run_binary_fn, run_options, args.model, args.llama_binary)
+            return
+        except ValueError as e:
+            print(f"Error parsing single execution options: {e}")
+            return
+
+    # Initialize progress tracking
+    global progress_start_time, progress_total_calls
+    progress_start_time = time.time()
+    progress_total_calls = args.n_calls
+
+    # Create optimization dimensions
+    dimensions = [Categorical(opt[1]) for opt in run_options_list]
+    for i, opt in enumerate(run_options_list):
+        dimensions[i].name = opt[0]
+
+    # Load cache
+    x0, y0 = load_cache(args.cache)
+
+    # Create objective function
+    objective_function = partial(optimize_runtime_with_progress,
+                                 get_opts_fn=get_opts_fn,
+                                 run_binary_fn=run_binary_fn,
+                                 run_options_list=run_options_list,
+                                 model_path=args.model,
+                                 llama_simple_path=args.llama_binary)
+
+    print(f"Starting optimization with {args.n_calls} calls and {args.ngl} GPU layers...")
+    print(f"Using model: {args.model}")
+    print(f"Cache file: {args.cache}")
+
+    # Run optimization
+    result = gp_minimize(objective_function, dimensions,
+                         n_calls=args.n_calls,
+                         n_initial_points=min(10, args.n_calls),
+                         random_state=42,
+                         x0=x0, y0=y0,
+                         initial_point_generator="lhs")
+
+    # Save results
+    save_cache(args.cache, result.x_iters, result.func_vals)
+
+    # Print results
+    print(f"\nBest options found: {result.x}")
+    print(f"Minimum objective value: {result.fun:.4f}")
+
+    # Convert result.x back to human-readable format - FIX: Find index of value in options list
+    best_options = {}
+    for i, (name, options) in enumerate(run_options_list):
+        # Find the value in result.x[i] and locate its index in the options list
+        value = result.x[i]
+        if value in options:
+            best_options[name] = value
+        else:
+            # Fallback: use the first option if value not found
+            print(f"Warning: Value '{value}' not found in options for {name}, using first option")
+            best_options[name] = options[0]
+
+    print("\nBest configuration:")
+    for name, value in best_options.items():
+        print(f"  {name}: {value}")
+
+    min_x, _ = expected_minimum(result)
+    print(f"Expected minimum: {min_x}")
+
+    if BAD_CONFIGURATIONS:
+        print(f"\nBAD_CONFIGURATIONS: {len(BAD_CONFIGURATIONS)}")
+
+    # Plot results
+    try:
+        plot_iterations(result)
+        plot_objective(result)
+        # Might need PyQt6
+        plt.show()
+    except Exception as e:
+        print(f"Plotting failed: {e}")
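
To trace how one gp_minimize sample flows through the helpers above: optimize_runtime_with_progress turns the sampled list x back into a {flag-name: chosen-value} dict, and the caller-supplied get_opts_fn (the run_str functions in the scripts below) joins that dict into the command line that gets executed. A small worked example with placeholder paths and values:

    # Worked example of the x -> run_options -> command-string mapping
    # (binary and model paths are placeholders, not taken from the diff).
    run_options_list = [
        ("--threads", ["-t 4", "-t 8", "-t 16"]),
        ("--ctx-size", ["-c 512", "-c 2048"]),
    ]
    x = ["-t 8", "-c 2048"]  # one sample proposed by gp_minimize

    run_options = {
        name: choices[choices.index(x[i])]
        for i, (name, choices) in enumerate(run_options_list)
    }
    # -> {"--threads": "-t 8", "--ctx-size": "-c 2048"}

    command = f"./build/bin/llama-cli -m model.gguf {' '.join(run_options.values())}"
    print(command)  # ./build/bin/llama-cli -m model.gguf -t 8 -c 2048
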
diff --git a/scripts/tune/tune_quality.py b/scripts/tune/tune_quality.py
new file mode 100644
index 00000000000..ffae2551558
--- /dev/null
+++ b/scripts/tune/tune_quality.py
@@ -0,0 +1,330 @@
+#!/usr/bin/env python3
+"""
+BERTScore-based translation quality optimization for llama.cpp models.
+Uses BERTScore to evaluate translation quality instead of HellaSwag accuracy.
+"""
+import subprocess
+import sys
+import os
+import re
+import json
+import hashlib
+import numpy as np
+from typing import Dict, List, Tuple, Any, Optional
+from collections import Counter
+
+# Import bert_score for translation quality evaluation
+import bert_score
+
+# Import language_tool_python for grammar checking
+import language_tool_python
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, script_dir)
+from tune import parse_args, main
+
+# Configuration
+BERTSCORE_MODEL = 'microsoft/deberta-v3-base'
+
+# Translation benchmarks for quality evaluation
+# Tiny subset of https://openslr.org/100
+TRANSLATION_BENCHMARKS = [
+    {
+        "prompt": "Translate the following English text to French:\n\nEnglish: As you can see, it does not look like a slam lesson, it is a language lesson, a language which allows to give orders to machines and computers the language of the 21st century: the computer code.\nFrench:",
+        "ground_truth": "Comme vous pouvez le constater, il ne s'agit pas d'un cours de slam, il s'agit d'un cours de langue, une langue qui permet de donner des ordres à des machines et à des ordinateurs, la langue du 21e siècle : le code informatique.",
+        "tool": "fr-FR"
+    },
+    {
+        "prompt": "Translate the following English text to Spanish:\n\nEnglish: Some years ago, when I was diving in the Lombok Strait, in Indonesia, 98 feet below the water, with that feeling of weightlessness, surrounded by a great biodiversity of reefs, corals, sea turtles, ocean sunfishes and fishes of all colors, I had an intense feeling of connection with nature.\nSpanish:",
+        "ground_truth": "Hace unos años, cuando me encontraba buceando en el estrecho de Lombok, en Indonesia, a 30 metros debajo del agua, con esa sensación de ingravidez, rodeado de una gran biodiversidad, de arrecifes, de corales, de tortugas, de peces mola mola y de peces de todos los colores, tuve una intensa sensación de estar conectado con la naturaleza.",
+        "tool": "es-ES"
+    },
+    {
+        "prompt": "Translate the following English text to Portuguese:\n\nEnglish: Have you ever stopped to think about clothes for disabled people?\nPortuguese:",
+        "ground_truth": "Vocês já pararam pra pensar como é o vestuário das pessoas com deficiência?",
+        "tool": "pt-PT"
+    }
+]
+
+def get_metrics(metrics_filepath: str, ground_truth: str, prediction: str, tool: str) -> Dict[str, Any]:
+    """
+    Calculate BERTScore and other quality metrics for translation evaluation.
+    Caches results to avoid recomputation.
+    """
+    print(f"Calculating metrics: {metrics_filepath}")
+
+    metrics = {
+        'bertscore_model': None,
+        'bertscore_P': None,
+        'bertscore_R': None,
+        'bertscore_F1': None,
+        'grammar_errors': None,
+        'repetition_score': None,
+        'objective_r': None
+    }
+
+    # Load cached scores
+    try:
+        with open(metrics_filepath, 'r', encoding='utf-8') as f:
+            metrics.update(json.load(f))
+    except FileNotFoundError:
+        pass
+
+    # Calculate BERTScore if not cached or model changed
+    if (not metrics["bertscore_P"] or not metrics["bertscore_R"] or
+            not metrics["bertscore_F1"] or metrics["bertscore_model"] != BERTSCORE_MODEL):
+        try:
+            metrics["bertscore_model"] = BERTSCORE_MODEL
+            score = bert_score.score([prediction], [ground_truth], model_type=BERTSCORE_MODEL)
+            metrics["bertscore_P"], metrics["bertscore_R"], metrics["bertscore_F1"] = (
+                score[0].item(), score[1].item(), score[2].item()
+            )
+        except Exception as e:
+            print(f"Warning: BERTScore calculation failed: {e}")
+            metrics["bertscore_P"] = metrics["bertscore_R"] = metrics["bertscore_F1"] = 0.0
+
+    # Calculate grammar errors if not cached
+    if metrics["grammar_errors"] is None:
+        metrics["grammar_errors"] = 0.0
+
+        language_tool = language_tool_python.LanguageTool(tool)
+        try:
+            matches = language_tool.check(prediction)
+            metrics["grammar_errors"] = len(matches) / max(len(prediction.split()), 1)
+        except Exception as e:
+            print(f"Warning: Grammar checking failed: {e}")
+            metrics["grammar_errors"] = 0.0
+
+    # Calculate repetition score if not cached
+    if metrics["repetition_score"] is None:
+        try:
+            words = prediction.split()
+            if len(words) > 0:
+                word_counts = Counter(words)
+                repeated_words = sum(count - 1 for count in word_counts.values() if count > 1)
+                metrics["repetition_score"] = repeated_words / len(words)
+            else:
+                metrics["repetition_score"] = 0.0
+        except Exception as e:
+            print(f"Warning: Repetition calculation failed: {e}")
+            metrics["repetition_score"] = 0.0
+
+    # Calculate objective score (we want to minimize this)
+    # Higher BERTScore Recall = better translation quality = lower objective value
+    # Add penalties for grammar errors and repetitions
+    if metrics["bertscore_R"] is not None:
+        grammar_penalty = metrics["grammar_errors"] * 0.1  # Small penalty for grammar errors
+        repetition_penalty = metrics["repetition_score"] * 0.05  # Small penalty for repetitions
+        metrics["objective_r"] = -(metrics["bertscore_R"] - grammar_penalty - repetition_penalty)
+    else:
+        metrics["objective_r"] = 1.0  # Bad score if BERTScore failed
+
+    # Save metrics to cache
+    try:
+        with open(metrics_filepath, 'w', encoding='utf-8') as f:
+            json.dump(metrics, f, indent=2, ensure_ascii=False)
+    except Exception as e:
+        print(f"Warning: Failed to save metrics: {e}")
+
+    return metrics
+
+def run_binary(run_options_str):
+    """Run the binary and evaluate translation quality using BERTScore."""
+    try:
+        # Parse the command to extract parameters
+        parts = run_options_str.split()
+        model_path = None
+        binary_path = None
+
+        # Find model path and binary path
+        for i, part in enumerate(parts):
+            if part == "-m" and i + 1 < len(parts):
+                model_path = parts[i + 1]
+            elif part.endswith("llama-cli") or part.endswith("main"):
+                binary_path = part
+
+        if not model_path or not binary_path:
+            print("Error: Could not parse model path or binary path from command")
+            return 100.0
+
+        # Create output directory for this run
+        run_hash = hashlib.md5(run_options_str.encode()).hexdigest()[:8]
+        output_dir = f"translation_eval_{run_hash}"
+        os.makedirs(output_dir, exist_ok=True)
+        all_scores = []
+
+        # Run translation benchmarks
+        for i, benchmark in enumerate(TRANSLATION_BENCHMARKS):
+            print(f"Running benchmark {i+1}/{len(TRANSLATION_BENCHMARKS)}")
+
+            # Build command for this benchmark - use the base command and add benchmark-specific params
+            benchmark_cmd = run_options_str.split()
+
+            # Add benchmark-specific parameters
+            benchmark_cmd.extend(["--prompt", benchmark["prompt"]])
+
+            # Run the command
+            try:
+                process = subprocess.run(benchmark_cmd,
+                                         stdout=subprocess.PIPE,
+                                         stderr=subprocess.PIPE,
+                                         timeout=120,  # 2 minute timeout per benchmark
+                                         check=False)
+
+                if process.returncode != 0:
+                    print(f"Warning: Benchmark {i+1} failed with return code {process.returncode}")
+                    print(f"STDERR: {process.stderr.decode()}")
+                    all_scores.append(1.0)  # Bad score for failed runs
+                    continue
+
+                # Extract prediction from output
+                output = process.stdout.decode()
+                prediction = output.strip()
+
+                # Remove the prompt from prediction if it's included
+                if benchmark["prompt"] in prediction:
+                    prediction = prediction.split(benchmark["prompt"])[-1].strip()
+
+                # Calculate metrics
+                metrics_filepath = os.path.join(output_dir, f"benchmark_{i}_metrics.json")
+                metrics = get_metrics(metrics_filepath,
+                                      benchmark["ground_truth"], prediction, benchmark["tool"])
+
+                objective_score = metrics.get("objective_r", 1.0)
+                all_scores.append(objective_score)
+
+                print(f"Benchmark {i+1} - BERTScore R: {metrics.get('bertscore_R', 0):.4f}, "
+                      f"Objective: {objective_score:.4f}")
+
+            except subprocess.TimeoutExpired:
+                print(f"Warning: Benchmark {i+1} timed out")
+                all_scores.append(1.0)  # Bad score for timeouts
+            except Exception as e:
+                print(f"Error running benchmark {i+1}: {e}")
+                all_scores.append(1.0)  # Bad score for errors
+
+        # Calculate average score across all benchmarks
+        if all_scores:
+            avg_score = np.mean(all_scores)
+            print(f"Average translation quality objective score: {avg_score:.4f}")
+            return avg_score
+        else:
+            print("Warning: No successful benchmarks")
+            return 100.0  # Bad score if no benchmarks succeeded
+
+    except Exception as e:
+        print(f"Error in run_binary: {e}")
+        return 100.0  # Bad score for any other errors
+
+if __name__ == "__main__":
+    args = parse_args(default_bin='./build/bin/llama-cli')
+
+    # Define quality-focused sampling parameters for optimization
+    run_options_list = [
+        # Core Sampling Parameters (Most Critical for Quality)
+
+        # 1. Temperature - Controls randomness vs determinism
+        ("--temp", [
+            "--temp 0.1",   # Very focused, deterministic
+            "--temp 0.3",   # Focused, good for factual tasks
+            "--temp 0.5",   # Moderate creativity
+            "--temp 0.7",   # Balanced (recommended default)
+            "--temp 0.8",   # Good balance
+            "--temp 0.9",   # More creative
+            "--temp 1.0",   # Creative but coherent
+            "--temp 1.2"    # More creative, potentially less coherent
+        ]),
+
+        # 2. Top-p (Nucleus Sampling) - Controls diversity while maintaining quality
+        ("--top-p", [
+            "--top-p 0.5",   # Very focused
+            "--top-p 0.7",   # Focused, higher quality
+            "--top-p 0.8",   # Good balance
+            "--top-p 0.85",  # Balanced
+            "--top-p 0.9",   # Good balance (recommended)
+            "--top-p 0.95",  # Standard default
+            "--top-p 0.98",  # More diverse
+            "--top-p 1.0"    # No nucleus filtering
+        ]),
+
+        # 3. Top-k - Limits token selection to most probable candidates
+        ("--top-k", [
+            "--top-k 10",    # Very focused
+            "--top-k 20",    # More focused, higher quality
+            "--top-k 30",    # Balanced
+            "--top-k 40",    # Good balance (default)
+            "--top-k 50",    # Balanced, more diverse
+            "--top-k 60",    # More diverse
+            "--top-k 80",    # Very diverse
+            "--top-k 100"    # Most diverse
+        ]),
+
+        # 4. Min-p - Filters out low-probability tokens
+        ("--min-p", [
+            "--min-p 0.01",  # Very permissive
+            "--min-p 0.02",  # Permissive
+            "--min-p 0.05",  # Good default
+            "--min-p 0.08",  # More restrictive
+            "--min-p 0.1",   # Restrictive, higher quality
+            "--min-p 0.15",  # Very restrictive
+            "--min-p 0.2"    # Extremely restrictive
+        ]),
+
+        # Repetition Control (Critical for Coherence)
+
+        # 5. Repeat Penalty - Prevents repetitive text
+        ("--repeat-penalty", [
+            "--repeat-penalty 1.0",   # Disabled
+            "--repeat-penalty 1.02",  # Very light penalty
+            "--repeat-penalty 1.05",  # Light penalty (recommended)
+            "--repeat-penalty 1.1",   # Moderate penalty (recommended)
+            "--repeat-penalty 1.15",  # Moderate-strong penalty
+            "--repeat-penalty 1.2",   # Strong penalty
+            "--repeat-penalty 1.25",  # Very strong penalty
+            "--repeat-penalty 1.3"    # Extreme penalty
+        ]),
+
+        # 6. Repeat Last N - How far back to look for repetitions
+        ("--repeat-last-n", [
+            "--repeat-last-n 16",   # Short context
+            "--repeat-last-n 32",   # Short-medium context
+            "--repeat-last-n 64",   # Balanced default
+            "--repeat-last-n 96",   # Medium-large context
+            "--repeat-last-n 128",  # Large context
+            "--repeat-last-n 192",  # Very large context
+            "--repeat-last-n 256"   # Maximum context
+        ]),
+
+        # Advanced Quality Parameters
+
+        # 7. Typical-p - Promotes contextually coherent tokens
+        ("--typical", [
+            "--typical 1.0",   # Disabled
+            "--typical 0.95",  # Light filtering
+            "--typical 0.9",   # Recommended for quality
+            "--typical 0.85",  # Moderate filtering
+            "--typical 0.8",   # Strong filtering
+            "--typical 0.75",  # Very strong filtering
+            "--typical 0.7"    # Extreme filtering
+        ]),
+
+        # 8. Mirostat - Adaptive sampling for consistent quality
+        ("--mirostat", [
+            "--mirostat 0",  # Disabled (default)
+            "--mirostat 1",  # Mirostat v1
+            "--mirostat 2"   # Mirostat v2 (often better quality)
+        ]),
+
+        # Keep seed constant for reproducible results
+        ("--seed", ["-s 42"]),
+    ]
+
+    def run_str(run_options, model_path, binary_path):
+        """Build command string for llama-cli with translation evaluation."""
+        if isinstance(run_options, dict):
+            run_options = " ".join(run_options.values())
+        # Use the main binary for translation evaluation
+        return f"{binary_path} -m {model_path} --threads 8 -ngl {args.ngl} {run_options}"
+
+    main(args, run_str, run_binary, run_options_list)
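
To make the quality objective concrete: get_metrics above returns the negated BERTScore recall with small penalties for grammar errors and repeated words, so gp_minimize can minimize it like any other metric. A quick numeric illustration (the metric values below are invented for the example):

    # Numeric illustration of objective_r as computed in get_metrics()
    bertscore_R = 0.88       # BERTScore recall against the reference translation
    grammar_errors = 0.04    # LanguageTool matches per word of the prediction
    repetition_score = 0.10  # fraction of repeated words in the prediction

    objective_r = -(bertscore_R - 0.1 * grammar_errors - 0.05 * repetition_score)
    print(objective_r)  # -0.871: lower (more negative) means a better translation
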
diff --git a/scripts/tune/tune_requirements.txt b/scripts/tune/tune_requirements.txt
new file mode 100644
index 00000000000..50cb56bbe89
--- /dev/null
+++ b/scripts/tune/tune_requirements.txt
@@ -0,0 +1,3 @@
+language_tool_python
+bert_score
+scikit-optimize
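
tune_tps.py below scores each configuration by pulling the prompt-eval and eval timings out of the text llama.cpp prints to stderr. A sketch of that parsing step on a hand-written log snippet (the format is approximated from typical llama.cpp output and can differ between versions):

    # Timing-parse sketch for tune_tps.py; the stderr text is hand-written to
    # approximate typical llama.cpp output and may not match every version.
    import re

    stderr_text = (
        "llama_perf_context_print: prompt eval time =     123.40 ms /     7 tokens\n"
        "llama_perf_context_print:        eval time =      98.76 ms /     1 runs\n"
    )

    prompt_eval = re.search(r"prompt eval time\s*=\s*([\d.]+)\s*ms", stderr_text)
    # The lookbehind keeps this pattern from matching inside the "prompt eval time" line.
    eval_only = re.search(r"(?<!prompt )eval time\s*=\s*([\d.]+)\s*ms", stderr_text)

    print(float(prompt_eval.group(1)) / 1000, float(eval_only.group(1)) / 1000)  # seconds
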
diff --git a/scripts/tune/tune_tps.py b/scripts/tune/tune_tps.py
new file mode 100644
index 00000000000..858471348ad
--- /dev/null
+++ b/scripts/tune/tune_tps.py
@@ -0,0 +1,80 @@
+import subprocess
+import sys
+import os
+import re
+
+script_dir = os.path.dirname(os.path.abspath(__file__))
+sys.path.insert(0, script_dir)
+from tune import parse_args, main
+
+def run_str(run_options, model_path, binary_path):
+    run_opts = " ".join(run_options.values())
+    return f"{binary_path} -m {model_path} -p 'Hello, how are you?' -n 1 {run_opts}"
+def run_binary(run_options_str):
+    process = subprocess.run(run_options_str,
+                             stdout=subprocess.PIPE,
+                             stderr=subprocess.PIPE,
+                             shell=True,
+                             check=False,  # non-zero exit is handled below so stderr can be reported
+                             )
+    if process.returncode != 0:
+        raise Exception(f"Error running: '{run_options_str}':\n{process.stderr.decode()}")
+
+    # Parse timing information from stderr
+    stderr_text = process.stderr.decode()
+
+    # Regex patterns for the llama.cpp timing output
+    prompt_eval_time_pattern = r"prompt eval time\s*=\s*([\d.]+)\s*ms"
+    eval_time_pattern = r"(?<!prompt )eval time\s*=\s*([\d.]+)\s*ms"  # lookbehind: skip the "prompt eval time" line
+
+    prompt_match = re.search(prompt_eval_time_pattern, stderr_text)
+    eval_match = re.search(eval_time_pattern, stderr_text)
+
+    if prompt_match and eval_match:
+        prompt_eval_time = float(prompt_match.group(1)) / 1000  # Convert to seconds
+        eval_time = float(eval_match.group(1)) / 1000  # Convert to seconds
+    else:
+        # Fallback: look for any timing patterns
+        print("Warning: Could not parse timing info, using fallback")
+        print("STDERR:", stderr_text)
+        return 1000  # High penalty for failed parsing
+
+    print("prompt eval time:", prompt_eval_time)
+    print("eval time:", eval_time)
+
+    return eval_time
+
+if __name__ == "__main__":
+    args = parse_args(default_bin='./build/bin/llama-cli')
+    # Define runtime options to optimize - Core Performance Parameters
+    run_options_list = [
+        # 1. Batch Processing Parameters (most critical for throughput)
+        ("--batch-size", ["--batch-size 32", "--batch-size 64", "--batch-size 128", "--batch-size 256", "--batch-size 512", "--batch-size 1024", "--batch-size 2048"]),
+        ("--ubatch-size", ["--ubatch-size 32", "--ubatch-size 64", "--ubatch-size 128", "--ubatch-size 256", "--ubatch-size 512"]),
+
+        # 2. Context and Memory Parameters
+        ("--ctx-size", ["-c 512", "-c 1024", "-c 2048", "-c 4096", "-c 8192"]),
+        ("--defrag-thold", ["--defrag-thold -1", "--defrag-thold 0.1", "--defrag-thold 0.2", "--defrag-thold 0.5"]),
+
+        # 3. GPU Offloading Parameters (critical for GPU performance)
+        # Set range to a value that makes sense for your model (includes offloading all --ngl layers)
+        ("--n-gpu-layers", [f"--n-gpu-layers {i}" for i in range(args.ngl + 1)]),
+
+        # 4. CPU Optimization Parameters
+        ("--threads", ["-t 4", "-t 8", "-t 12", "-t 16"]),
+        # ("--prio", ["--prio 0", "--prio 1", "--prio 2"]),
+
+        # 5. Memory and Caching Parameters
+        # ("--use-mmap", ["", "--no-mmap"]),
+        ("--use-mlock", ["--mlock", ""]),
+        ("--kv-unified", ["--kv-unified", ""]),
+
+        # 6. Advanced Performance Features
+        ("--flash-attn", ["--flash-attn", ""]),
+        # ("--no-kv-offload", ["--no-kv-offload", ""]),  # Empty string means don't use the flag
+
+        # Keep seed constant for reproducible results
+        ("--seed", ["-s 42"]),
+    ]
+    main(args, run_str, run_binary, run_options_list)
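
Because parse_args and main only need a command-string builder, a run function that returns a scalar metric, and a list of (flag-name, candidate-values) pairs, additional tuners can be built on the same scaffolding. A minimal, hypothetical variant (the flag set below is only an example, not part of this change) that reuses the helpers from tune_tps.py:

    # Hypothetical extra tuner reusing the scaffolding above; the search space
    # here is only an example of how to plug in a different set of flags.
    import os
    import sys

    script_dir = os.path.dirname(os.path.abspath(__file__))
    sys.path.insert(0, script_dir)
    from tune import parse_args, main
    from tune_tps import run_str, run_binary  # reuse command builder and timing parser

    if __name__ == "__main__":
        args = parse_args(default_bin='./build/bin/llama-cli')
        run_options_list = [
            ("--threads", ["-t 4", "-t 8", "-t 16"]),
            ("--batch-size", ["--batch-size 256", "--batch-size 512", "--batch-size 1024"]),
            ("--seed", ["-s 42"]),
        ]
        main(args, run_str, run_binary, run_options_list)
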