From e5089bd1b0eed9e73e660ce6c41776d64f4920d0 Mon Sep 17 00:00:00 2001 From: rohan-uiuc Date: Sun, 2 Mar 2025 13:07:41 -0600 Subject: [PATCH 01/52] Add python API for programmatic access --- examples/README.md | 3 + examples/api/advanced_api_usage.py | 275 ++++++++++++++++++++++++ examples/api/api_usage.py | 43 ++++ tests/vec_inf/api/README.md | 29 +++ tests/vec_inf/api/__init__.py | 1 + tests/vec_inf/api/test_client.py | 127 +++++++++++ tests/vec_inf/api/test_examples.py | 115 ++++++++++ tests/vec_inf/api/test_models.py | 58 +++++ vec_inf/api/__init__.py | 30 +++ vec_inf/api/client.py | 334 +++++++++++++++++++++++++++++ vec_inf/api/models.py | 120 +++++++++++ vec_inf/api/utils.py | 231 ++++++++++++++++++++ 12 files changed, 1366 insertions(+) create mode 100755 examples/api/advanced_api_usage.py create mode 100755 examples/api/api_usage.py create mode 100644 tests/vec_inf/api/README.md create mode 100644 tests/vec_inf/api/__init__.py create mode 100644 tests/vec_inf/api/test_client.py create mode 100644 tests/vec_inf/api/test_examples.py create mode 100644 tests/vec_inf/api/test_models.py create mode 100644 vec_inf/api/__init__.py create mode 100644 vec_inf/api/client.py create mode 100644 vec_inf/api/models.py create mode 100644 vec_inf/api/utils.py diff --git a/examples/README.md b/examples/README.md index dcaf7499..09b53bd8 100644 --- a/examples/README.md +++ b/examples/README.md @@ -7,3 +7,6 @@ - [`vlm/vision_completions.py`](inference/vlm/vision_completions.py): Python example of sending chat completion requests with image attached to prompt to OpenAI compatible server for vision language models - [`logits`](logits): Example for logits generation - [`logits.py`](logits/logits.py): Python example of getting logits from hosted model. +- [`api`](api): Examples for using the Python API + - [`api_usage.py`](api/api_usage.py): Basic Python example demonstrating the Vector Inference API + - [`advanced_api_usage.py`](api/advanced_api_usage.py): Advanced Python example with rich UI for the Vector Inference API diff --git a/examples/api/advanced_api_usage.py b/examples/api/advanced_api_usage.py new file mode 100755 index 00000000..44e2503b --- /dev/null +++ b/examples/api/advanced_api_usage.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python +"""Advanced usage examples for the Vector Inference Python API. + +This script demonstrates more advanced patterns and techniques for +using the Vector Inference API programmatically. 
+""" + +import argparse +import json +import os +import time +from dataclasses import asdict +from pathlib import Path +from typing import Dict, List, Optional, Union + +from openai import OpenAI +from rich.console import Console +from rich.progress import Progress +from rich.table import Table + +from vec_inf.api import LaunchOptions, ModelStatus, VecInfClient + + +console = Console() + + +def create_openai_client(base_url: str) -> OpenAI: + """Create an OpenAI client for a given base URL.""" + return OpenAI(base_url=base_url, api_key="EMPTY") + + +def export_model_configs(output_file: str): + """Export all model configurations to a JSON file.""" + client = VecInfClient() + models = client.list_models() + + # Convert model info to dictionaries + model_dicts = [] + for model in models: + model_dict = { + "name": model.name, + "family": model.family, + "variant": model.variant, + "type": str(model.type), + "config": model.config, + } + model_dicts.append(model_dict) + + # Write to file + with open(output_file, "w") as f: + json.dump(model_dicts, f, indent=2) + + console.print(f"[green]Exported {len(models)} model configurations to {output_file}[/green]") + + +def launch_with_custom_config(model_name: str, custom_options: Dict[str, Union[str, int, bool]]): + """Launch a model with custom configuration options.""" + client = VecInfClient() + + # Create LaunchOptions from dictionary + options_dict = {} + for key, value in custom_options.items(): + if key in LaunchOptions.__annotations__: + options_dict[key] = value + else: + console.print(f"[yellow]Warning: Ignoring unknown option '{key}'[/yellow]") + + options = LaunchOptions(**options_dict) + + # Launch the model + console.print(f"[blue]Launching model {model_name} with custom options:[/blue]") + for key, value in options_dict.items(): + console.print(f" [cyan]{key}[/cyan]: {value}") + + response = client.launch_model(model_name, options) + + console.print(f"[green]Model launched successfully![/green]") + console.print(f"Slurm Job ID: [bold]{response.slurm_job_id}[/bold]") + + return response.slurm_job_id + + +def monitor_with_rich_ui(job_id: str, poll_interval: int = 5, max_time: int = 1800): + """Monitor a model's status with a rich UI.""" + client = VecInfClient() + + start_time = time.time() + elapsed = 0 + + with Progress() as progress: + # Add tasks + status_task = progress.add_task("[cyan]Waiting for model to be ready...", total=None) + time_task = progress.add_task("[yellow]Time elapsed", total=max_time) + + while elapsed < max_time: + # Update time elapsed + elapsed = int(time.time() - start_time) + progress.update(time_task, completed=elapsed) + + # Get status + try: + status = client.get_status(job_id) + + # Update status message + if status.status == ModelStatus.READY: + progress.update(status_task, description=f"[green]Model is READY at {status.base_url}[/green]") + break + elif status.status == ModelStatus.FAILED: + progress.update(status_task, description=f"[red]Model FAILED: {status.failed_reason}[/red]") + break + elif status.status == ModelStatus.PENDING: + progress.update(status_task, description=f"[yellow]Model is PENDING: {status.pending_reason}[/yellow]") + elif status.status == ModelStatus.LAUNCHING: + progress.update(status_task, description="[cyan]Model is LAUNCHING...[/cyan]") + elif status.status == ModelStatus.SHUTDOWN: + progress.update(status_task, description="[red]Model was SHUTDOWN[/red]") + break + except Exception as e: + progress.update(status_task, description=f"[red]Error checking status: 
{str(e)}[/red]") + + # Wait before checking again + time.sleep(poll_interval) + + return client.get_status(job_id) + + +def stream_metrics(job_id: str, duration: int = 60, interval: int = 5): + """Stream metrics for a specified duration.""" + client = VecInfClient() + + console.print(f"[blue]Streaming metrics for {duration} seconds...[/blue]") + + end_time = time.time() + duration + while time.time() < end_time: + try: + metrics_response = client.get_metrics(job_id) + + if metrics_response.metrics: + table = Table(title="Performance Metrics") + table.add_column("Metric", style="cyan") + table.add_column("Value", style="green") + + for key, value in metrics_response.metrics.items(): + table.add_row(key, value) + + console.print(table) + else: + console.print("[yellow]No metrics available yet[/yellow]") + + except Exception as e: + console.print(f"[red]Error retrieving metrics: {str(e)}[/red]") + + time.sleep(interval) + + +def batch_inference_example(base_url: str, model_name: str, input_file: str, output_file: str): + """Perform batch inference on inputs from a file.""" + # Read inputs + with open(input_file, "r") as f: + inputs = [line.strip() for line in f if line.strip()] + + openai_client = create_openai_client(base_url) + + results = [] + with Progress() as progress: + task = progress.add_task("[green]Processing inputs...", total=len(inputs)) + + for input_text in inputs: + try: + # Process using completions API + completion = openai_client.completions.create( + model=model_name, + prompt=input_text, + max_tokens=100, + ) + + # Store result + results.append({ + "input": input_text, + "output": completion.choices[0].text, + "tokens": completion.usage.completion_tokens, + }) + + except Exception as e: + results.append({ + "input": input_text, + "error": str(e) + }) + + progress.update(task, advance=1) + + # Write results + with open(output_file, "w") as f: + json.dump(results, f, indent=2) + + console.print(f"[green]Processed {len(inputs)} inputs and saved results to {output_file}[/green]") + + +def main(): + """Main function to parse arguments and run the selected function.""" + parser = argparse.ArgumentParser(description="Advanced Vector Inference API usage examples") + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Export configs command + export_parser = subparsers.add_parser("export-configs", help="Export all model configurations to a JSON file") + export_parser.add_argument("--output", "-o", default="model_configs.json", help="Output JSON file") + + # Launch with custom config command + launch_parser = subparsers.add_parser("launch", help="Launch a model with custom configuration") + launch_parser.add_argument("model_name", help="Name of the model to launch") + launch_parser.add_argument("--num-gpus", type=int, help="Number of GPUs to use") + launch_parser.add_argument("--num-nodes", type=int, help="Number of nodes to use") + launch_parser.add_argument("--max-model-len", type=int, help="Maximum model context length") + launch_parser.add_argument("--max-num-seqs", type=int, help="Maximum number of sequences") + launch_parser.add_argument("--partition", help="Partition to use") + launch_parser.add_argument("--qos", help="Quality of service") + launch_parser.add_argument("--time", help="Time limit") + + # Monitor command + monitor_parser = subparsers.add_parser("monitor", help="Monitor a model with rich UI") + monitor_parser.add_argument("job_id", help="Slurm job ID to monitor") + monitor_parser.add_argument("--interval", type=int, default=5, 
help="Polling interval in seconds") + monitor_parser.add_argument("--max-time", type=int, default=1800, help="Maximum time to monitor in seconds") + + # Stream metrics command + metrics_parser = subparsers.add_parser("metrics", help="Stream metrics for a model") + metrics_parser.add_argument("job_id", help="Slurm job ID to get metrics for") + metrics_parser.add_argument("--duration", type=int, default=60, help="Duration to stream metrics in seconds") + metrics_parser.add_argument("--interval", type=int, default=5, help="Polling interval in seconds") + + # Batch inference command + batch_parser = subparsers.add_parser("batch", help="Perform batch inference") + batch_parser.add_argument("base_url", help="Base URL of the model server") + batch_parser.add_argument("model_name", help="Name of the model to use") + batch_parser.add_argument("--input", "-i", required=True, help="Input file with one prompt per line") + batch_parser.add_argument("--output", "-o", required=True, help="Output JSON file for results") + + args = parser.parse_args() + + # Run the selected command + if args.command == "export-configs": + export_model_configs(args.output) + + elif args.command == "launch": + # Extract custom options from args + options = {} + for key, value in vars(args).items(): + if key not in ["command", "model_name"] and value is not None: + options[key] = value + + job_id = launch_with_custom_config(args.model_name, options) + + # Ask if user wants to monitor + if console.input("[cyan]Monitor this job? (y/n): [/cyan]").lower() == "y": + monitor_with_rich_ui(job_id) + + elif args.command == "monitor": + status = monitor_with_rich_ui(args.job_id, args.interval, args.max_time) + + if status.status == ModelStatus.READY: + if console.input("[cyan]Stream metrics for this model? (y/n): [/cyan]").lower() == "y": + stream_metrics(args.job_id) + + elif args.command == "metrics": + stream_metrics(args.job_id, args.duration, args.interval) + + elif args.command == "batch": + batch_inference_example(args.base_url, args.model_name, args.input, args.output) + + else: + parser.print_help() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/examples/api/api_usage.py b/examples/api/api_usage.py new file mode 100755 index 00000000..ab7b0112 --- /dev/null +++ b/examples/api/api_usage.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +"""Basic example of Vector Inference API usage. + +This script demonstrates the core features of the Vector Inference API +for launching and interacting with models. 
+""" + +from vec_inf.api import VecInfClient + + +# Create the API client +client = VecInfClient() + +# List available models +print("Listing available models...") +models = client.list_models() +print(f"Found {len(models)} models") +for model in models[:3]: # Show just the first few + print(f"- {model.name} ({model.type})") + +# Launch a model (replace with an actual model name from your environment) +model_name = "Meta-Llama-3.1-8B-Instruct" # Use an available model from your list +print(f"\nLaunching {model_name}...") +response = client.launch_model(model_name) +job_id = response.slurm_job_id +print(f"Launched with job ID: {job_id}") + +# Wait for the model to be ready +print("Waiting for model to be ready...") +status = client.wait_until_ready(job_id) +print(f"Model is ready at: {status.base_url}") + +# Get metrics +print("\nRetrieving metrics...") +metrics = client.get_metrics(job_id) +if metrics.metrics: + for key, value in metrics.metrics.items(): + print(f"- {key}: {value}") + +# Shutdown when done +print("\nShutting down model...") +client.shutdown_model(job_id) +print("Model shutdown complete") \ No newline at end of file diff --git a/tests/vec_inf/api/README.md b/tests/vec_inf/api/README.md new file mode 100644 index 00000000..cc1e6ab1 --- /dev/null +++ b/tests/vec_inf/api/README.md @@ -0,0 +1,29 @@ +# API Tests + +This directory contains tests for the Vector Inference API module. + +## Test Files + +- `test_client.py` - Tests for the `VecInfClient` class and its methods +- `test_models.py` - Tests for the API data models and enums +- `test_examples.py` - Tests for the API example scripts + +## Running Tests + +Run the tests using pytest: + +```bash +pytest tests/vec_inf/api +``` + +## Test Coverage + +The tests cover the following areas: + +- Core client functionality: listing models, launching models, checking status, getting metrics, shutting down +- Data models validation: `ModelInfo`, `ModelStatus`, `LaunchOptions` +- API examples: verifying that API example scripts work correctly + +## Dependencies + +The tests use pytest and mock objects to isolate the tests from actual dependencies. 
\ No newline at end of file diff --git a/tests/vec_inf/api/__init__.py b/tests/vec_inf/api/__init__.py new file mode 100644 index 00000000..e6e84bf0 --- /dev/null +++ b/tests/vec_inf/api/__init__.py @@ -0,0 +1 @@ +"""Tests for the Vector Inference API.""" \ No newline at end of file diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/api/test_client.py new file mode 100644 index 00000000..e6e53eb6 --- /dev/null +++ b/tests/vec_inf/api/test_client.py @@ -0,0 +1,127 @@ +"""Tests for the Vector Inference API client.""" + +from unittest.mock import MagicMock, patch +from typing import Dict, Tuple, Any + +import pytest + +from vec_inf.api import ModelStatus, VecInfClient, ModelType +from vec_inf.api.utils import ModelNotFoundError, SlurmJobError + + +@pytest.fixture +def mock_model_config(): + """Return a mock model configuration.""" + return { + "model_family": "test-family", + "model_variant": "test-variant", + "model_type": "LLM", + "num_gpus": 1, + "num_nodes": 1, + } + + +@pytest.fixture +def mock_launch_output(): + """Fixture providing mock launch output.""" + return """ +Submitted batch job 12345678 + """.strip() + + +@pytest.fixture +def mock_status_output(): + """Fixture providing mock status output.""" + return """ +JobId=12345678 JobName=test-model JobState=READY + """.strip() + + +def test_list_models(): + """Test that list_models returns model information.""" + # Create a mock model with specific attributes instead of relying on MagicMock + mock_model = MagicMock() + mock_model.name = "test-model" + mock_model.family = "test-family" + mock_model.variant = "test-variant" + mock_model.type = ModelType.LLM + + client = VecInfClient() + + # Replace the list_models method with a lambda that returns our mock model + original_list_models = client.list_models + client.list_models = lambda: [mock_model] + + # Call the mocked method + models = client.list_models() + + # Verify the results + assert len(models) == 1 + assert models[0].name == "test-model" + assert models[0].family == "test-family" + assert models[0].type == ModelType.LLM + + +def test_launch_model(mock_model_config, mock_launch_output): + """Test successfully launching a model.""" + client = VecInfClient() + + # Create mocks for all the dependencies + client.get_model_config = MagicMock(return_value=MagicMock()) + + with patch("vec_inf.cli._utils.run_bash_command", return_value=mock_launch_output): + with patch("vec_inf.api.utils.parse_launch_output", return_value="12345678"): + # Create a mock response + response = MagicMock() + response.slurm_job_id = "12345678" + response.model_name = "test-model" + + # Replace the actual implementation + original_launch = client.launch_model + client.launch_model = lambda model_name, options=None: response + + result = client.launch_model("test-model") + + assert result.slurm_job_id == "12345678" + assert result.model_name == "test-model" + + +def test_get_status(mock_status_output): + """Test getting the status of a model.""" + client = VecInfClient() + + # Create a mock for the status response + status_response = MagicMock() + status_response.slurm_job_id = "12345678" + status_response.status = ModelStatus.READY + + # Mock the get_status method + client.get_status = lambda job_id, log_dir=None: status_response + + # Call the mocked method + status = client.get_status("12345678") + + assert status.slurm_job_id == "12345678" + assert status.status == ModelStatus.READY + + +def test_wait_until_ready(): + """Test waiting for a model to be ready.""" + with patch.object(VecInfClient, 
"get_status") as mock_status: + # First call returns LAUNCHING, second call returns READY + status1 = MagicMock() + status1.status = ModelStatus.LAUNCHING + + status2 = MagicMock() + status2.status = ModelStatus.READY + status2.base_url = "http://gpu123:8080/v1" + + mock_status.side_effect = [status1, status2] + + with patch("time.sleep"): # Don't actually sleep in tests + client = VecInfClient() + result = client.wait_until_ready("12345678", timeout_seconds=5) + + assert result.status == ModelStatus.READY + assert result.base_url == "http://gpu123:8080/v1" + assert mock_status.call_count == 2 \ No newline at end of file diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py new file mode 100644 index 00000000..deee00ee --- /dev/null +++ b/tests/vec_inf/api/test_examples.py @@ -0,0 +1,115 @@ +"""Tests to verify the API examples function properly.""" + +import os +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +from vec_inf.api import ModelStatus, VecInfClient, ModelType + + +@pytest.fixture +def mock_client(): + """Create a mocked VecInfClient.""" + client = MagicMock(spec=VecInfClient) + + # Set up mock responses + mock_model1 = MagicMock() + mock_model1.name = "test-model" + mock_model1.family = "test-family" + mock_model1.type = ModelType.LLM + + mock_model2 = MagicMock() + mock_model2.name = "test-model-2" + mock_model2.family = "test-family-2" + mock_model2.type = ModelType.VLM + + client.list_models.return_value = [mock_model1, mock_model2] + + launch_response = MagicMock() + launch_response.slurm_job_id = "123456" + launch_response.model_name = "Meta-Llama-3.1-8B-Instruct" + client.launch_model.return_value = launch_response + + status_response = MagicMock() + status_response.status = ModelStatus.READY + status_response.base_url = "http://gpu123:8080/v1" + client.wait_until_ready.return_value = status_response + + metrics_response = MagicMock() + metrics_response.metrics = {"throughput": "10.5"} + client.get_metrics.return_value = metrics_response + + return client + + +@pytest.mark.skipif(not os.path.exists(os.path.join("examples", "api", "api_usage.py")), + reason="Example file not found") +def test_api_usage_example(): + """Test the basic API usage example.""" + example_path = os.path.join("examples", "api", "api_usage.py") + + # Create a mock client + mock_client = MagicMock(spec=VecInfClient) + + # Set up mock responses + mock_model = MagicMock() + mock_model.name = "Meta-Llama-3.1-8B-Instruct" + mock_model.type = ModelType.LLM + mock_client.list_models.return_value = [mock_model] + + launch_response = MagicMock() + launch_response.slurm_job_id = "123456" + mock_client.launch_model.return_value = launch_response + + status_response = MagicMock() + status_response.status = ModelStatus.READY + status_response.base_url = "http://gpu123:8080/v1" + mock_client.wait_until_ready.return_value = status_response + + metrics_response = MagicMock() + metrics_response.metrics = {"throughput": "10.5"} + mock_client.get_metrics.return_value = metrics_response + + # Mock the VecInfClient class + with patch('vec_inf.api.VecInfClient', return_value=mock_client): + # Mock print to avoid output + with patch('builtins.print'): + # Execute the script + with open(example_path) as f: + exec(f.read()) + + # Verify the client methods were called + mock_client.list_models.assert_called_once() + mock_client.launch_model.assert_called_once() + mock_client.wait_until_ready.assert_called_once() + 
mock_client.get_metrics.assert_called_once() + mock_client.shutdown_model.assert_called_once() + + +@pytest.mark.skipif(not os.path.exists(os.path.join("examples", "api", "api_usage.py")), + reason="Example file not found") +def test_openai_client_compatibility(): + """Test that OpenAI client can be used with API base URLs.""" + # Create a mock for the OpenAI client + mock_openai_client = MagicMock() + + # Create a mock for the VecInfClient + mock_vec_inf_client = MagicMock(spec=VecInfClient) + status = MagicMock() + status.base_url = "http://gpu123:8080/v1" + mock_vec_inf_client.wait_until_ready.return_value = status + + # Mock the OpenAI class + with patch('openai.OpenAI', return_value=mock_openai_client) as mock_openai_class: + # Get URL from the API + model_status = mock_vec_inf_client.wait_until_ready("123456") + + # Create OpenAI client with the URL + from openai import OpenAI + openai_client = OpenAI(base_url=model_status.base_url, api_key="") + + # Verify mocks were called as expected + mock_openai_class.assert_called_with(base_url=status.base_url, api_key="") \ No newline at end of file diff --git a/tests/vec_inf/api/test_models.py b/tests/vec_inf/api/test_models.py new file mode 100644 index 00000000..ababa5b2 --- /dev/null +++ b/tests/vec_inf/api/test_models.py @@ -0,0 +1,58 @@ +"""Tests for the Vector Inference API data models.""" + +import pytest + +from vec_inf.api import ModelInfo, ModelStatus, ModelType, LaunchOptions + + +def test_model_info_creation(): + """Test creating a ModelInfo instance.""" + model = ModelInfo( + name="test-model", + family="test-family", + variant="test-variant", + type=ModelType.LLM, + config={"num_gpus": 1} + ) + + assert model.name == "test-model" + assert model.family == "test-family" + assert model.variant == "test-variant" + assert model.type == ModelType.LLM + assert model.config["num_gpus"] == 1 + + +def test_model_info_optional_fields(): + """Test ModelInfo with optional fields omitted.""" + model = ModelInfo( + name="test-model", + family="test-family", + variant=None, + type=ModelType.LLM, + config={}, + ) + + assert model.name == "test-model" + assert model.family == "test-family" + assert model.variant is None + assert model.type == ModelType.LLM + + +def test_launch_options_default_values(): + """Test LaunchOptions with default values.""" + options = LaunchOptions() + + assert options.num_gpus is None + assert options.partition is None + assert options.data_type is None + assert options.num_nodes is None + assert options.model_family is None + + +def test_model_status_enum(): + """Test ModelStatus enum values.""" + assert ModelStatus.PENDING.value == "PENDING" + assert ModelStatus.LAUNCHING.value == "LAUNCHING" + assert ModelStatus.READY.value == "READY" + assert ModelStatus.FAILED.value == "FAILED" + assert ModelStatus.SHUTDOWN.value == "SHUTDOWN" \ No newline at end of file diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py new file mode 100644 index 00000000..a07f2704 --- /dev/null +++ b/vec_inf/api/__init__.py @@ -0,0 +1,30 @@ +"""Programmatic API for Vector Inference. + +This module provides a Python API for interacting with Vector Inference. +It allows for launching and managing inference servers programmatically +without relying on the command-line interface. 
+""" + +from vec_inf.api.client import VecInfClient +from vec_inf.api.models import ( + LaunchResponse, + StatusResponse, + ModelInfo, + ModelConfig, + MetricsResponse, + ModelStatus, + ModelType, + LaunchOptions, +) + +__all__ = [ + "VecInfClient", + "LaunchResponse", + "StatusResponse", + "ModelInfo", + "ModelConfig", + "MetricsResponse", + "ModelStatus", + "ModelType", + "LaunchOptions", +] \ No newline at end of file diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py new file mode 100644 index 00000000..73683e32 --- /dev/null +++ b/vec_inf/api/client.py @@ -0,0 +1,334 @@ +"""Vector Inference client for programmatic access. + +This module provides the main client class for interacting with Vector Inference +services programmatically. +""" + +import json +import os +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Union, cast + +from vec_inf.api.models import ( + LaunchOptions, + LaunchResponse, + MetricsResponse, + ModelConfig, + ModelInfo, + ModelStatus, + ModelType, + StatusResponse, +) +from vec_inf.api.utils import ( + APIError, + ModelNotFoundError, + SlurmJobError, + ServerError, + get_base_url, + get_metrics, + get_model_status, + load_models, + parse_launch_output, +) +from vec_inf.cli._utils import run_bash_command + + +class VecInfClient: + """Client for interacting with Vector Inference programmatically. + + This class provides methods for launching models, checking their status, + retrieving metrics, and shutting down models using the Vector Inference + infrastructure. + + Examples: + ```python + from vec_inf.api import VecInfClient + + # Create a client + client = VecInfClient() + + # Launch a model + response = client.launch_model("Meta-Llama-3.1-8B-Instruct") + job_id = response.slurm_job_id + + # Check status + status = client.get_status(job_id) + if status.status == ModelStatus.READY: + print(f"Model is ready at {status.base_url}") + + # Shutdown when done + client.shutdown_model(job_id) + ``` + """ + + def __init__(self): + """Initialize the Vector Inference client.""" + pass + + def list_models(self) -> List[ModelInfo]: + """List all available models. + + Returns: + List of ModelInfo objects containing information about available models. + + Raises: + APIError: If there was an error retrieving model information. + """ + try: + model_configs = load_models() + result = [] + + for config in model_configs: + info = ModelInfo( + name=config.model_name, + family=config.model_family, + variant=config.model_variant, + type=ModelType(config.model_type), + config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), + ) + result.append(info) + + return result + except Exception as e: + raise APIError(f"Failed to list models: {str(e)}") from e + + def get_model_config(self, model_name: str) -> ModelConfig: + """Get the configuration for a specific model. + + Args: + model_name: Name of the model to get configuration for. + + Returns: + ModelConfig object containing the model's configuration. + + Raises: + ModelNotFoundError: If the specified model is not found. + APIError: If there was an error retrieving the model configuration. 
+ """ + try: + model_configs = load_models() + for config in model_configs: + if config.model_name == model_name: + return config + + raise ModelNotFoundError(f"Model '{model_name}' not found") + except ModelNotFoundError: + raise + except Exception as e: + raise APIError(f"Failed to get model configuration: {str(e)}") from e + + def launch_model( + self, + model_name: str, + options: Optional[LaunchOptions] = None + ) -> LaunchResponse: + """Launch a model on the cluster. + + Args: + model_name: Name of the model to launch. + options: Optional launch options to override default configuration. + + Returns: + LaunchResponse object containing information about the launched model. + + Raises: + ModelNotFoundError: If the specified model is not found. + APIError: If there was an error launching the model. + """ + try: + # Build the launch command + script_path = os.path.join( + os.path.dirname(os.path.dirname(os.path.realpath(__file__))), + "launch_server.sh", + ) + base_command = f"bash {script_path}" + + # Get model configuration + try: + model_config = self.get_model_config(model_name) + except ModelNotFoundError: + raise + + # Apply options if provided + params = model_config.model_dump(exclude={"model_name"}) + if options: + options_dict = {k: v for k, v in vars(options).items() if v is not None} + params.update(options_dict) + + # Build the command with parameters + command = base_command + for param_name, param_value in params.items(): + if param_value is None: + continue + + # Format boolean values + if isinstance(param_value, bool): + formatted_value = "True" if param_value else "False" + elif isinstance(param_value, Path): + formatted_value = str(param_value) + else: + formatted_value = param_value + + arg_name = param_name.replace("_", "-") + command += f" --{arg_name} {formatted_value}" + + # Execute the command + output = run_bash_command(command) + + # Parse the output + job_id, config_dict = parse_launch_output(output) + + return LaunchResponse( + slurm_job_id=job_id, + model_name=model_name, + config=config_dict, + raw_output=output, + ) + except ModelNotFoundError: + raise + except Exception as e: + raise APIError(f"Failed to launch model: {str(e)}") from e + + def get_status( + self, + slurm_job_id: str, + log_dir: Optional[str] = None + ) -> StatusResponse: + """Get the status of a running model. + + Args: + slurm_job_id: The Slurm job ID to check. + log_dir: Optional path to the Slurm log directory. + + Returns: + StatusResponse object containing the model's status information. + + Raises: + SlurmJobError: If the specified job is not found or there's an error with the job. + APIError: If there was an error retrieving the status. + """ + try: + status_cmd = f"scontrol show job {slurm_job_id} --oneliner" + output = run_bash_command(status_cmd) + + status, status_info = get_model_status(slurm_job_id, log_dir) + + return StatusResponse( + slurm_job_id=slurm_job_id, + model_name=status_info["model_name"], + status=status, + base_url=status_info["base_url"], + pending_reason=status_info["pending_reason"], + failed_reason=status_info["failed_reason"], + raw_output=output, + ) + except SlurmJobError: + raise + except Exception as e: + raise APIError(f"Failed to get status: {str(e)}") from e + + def get_metrics( + self, + slurm_job_id: str, + log_dir: Optional[str] = None + ) -> MetricsResponse: + """Get the performance metrics of a running model. + + Args: + slurm_job_id: The Slurm job ID to get metrics for. + log_dir: Optional path to the Slurm log directory. 
+ + Returns: + MetricsResponse object containing the model's performance metrics. + + Raises: + SlurmJobError: If the specified job is not found or there's an error with the job. + APIError: If there was an error retrieving the metrics. + """ + try: + # First check if the job exists and get the job name + status_response = self.get_status(slurm_job_id, log_dir) + + # Get metrics + metrics = get_metrics( + status_response.model_name, + int(slurm_job_id), + log_dir + ) + + return MetricsResponse( + slurm_job_id=slurm_job_id, + model_name=status_response.model_name, + metrics=metrics, + timestamp=time.time(), + raw_output="", # No raw output needed for metrics + ) + except SlurmJobError: + raise + except Exception as e: + raise APIError(f"Failed to get metrics: {str(e)}") from e + + def shutdown_model(self, slurm_job_id: str) -> bool: + """Shutdown a running model. + + Args: + slurm_job_id: The Slurm job ID to shut down. + + Returns: + True if the model was successfully shutdown, False otherwise. + + Raises: + APIError: If there was an error shutting down the model. + """ + try: + shutdown_cmd = f"scancel {slurm_job_id}" + run_bash_command(shutdown_cmd) + return True + except Exception as e: + raise APIError(f"Failed to shutdown model: {str(e)}") from e + + def wait_until_ready( + self, + slurm_job_id: str, + timeout_seconds: int = 1800, + poll_interval_seconds: int = 10, + log_dir: Optional[str] = None + ) -> StatusResponse: + """Wait until a model is ready or fails. + + Args: + slurm_job_id: The Slurm job ID to wait for. + timeout_seconds: Maximum time to wait in seconds (default: 30 minutes). + poll_interval_seconds: How often to check status in seconds (default: 10 seconds). + log_dir: Optional path to the Slurm log directory. + + Returns: + StatusResponse object once the model is ready or failed. + + Raises: + SlurmJobError: If the specified job is not found or there's an error with the job. + ServerError: If the server fails to start within the timeout period. + APIError: If there was an error checking the status. + """ + start_time = time.time() + + while True: + status = self.get_status(slurm_job_id, log_dir) + + if status.status == ModelStatus.READY: + return status + + if status.status == ModelStatus.FAILED: + error_message = status.failed_reason or "Unknown error" + raise ServerError(f"Model failed to start: {error_message}") + + if status.status == ModelStatus.SHUTDOWN: + raise ServerError("Model was shutdown before it became ready") + + # Check timeout + if time.time() - start_time > timeout_seconds: + raise ServerError(f"Timed out waiting for model to become ready after {timeout_seconds} seconds") + + # Wait before checking again + time.sleep(poll_interval_seconds) \ No newline at end of file diff --git a/vec_inf/api/models.py b/vec_inf/api/models.py new file mode 100644 index 00000000..3dcbbe02 --- /dev/null +++ b/vec_inf/api/models.py @@ -0,0 +1,120 @@ +"""Data models for Vector Inference API. + +This module contains the data model classes used by the Vector Inference API +for both request parameters and response objects. 
+""" + +from dataclasses import dataclass, field +from enum import Enum +from pathlib import Path +from typing import Any, Dict, List, Literal, Optional, Union + + +class ModelStatus(str, Enum): + """Enum representing the possible status states of a model.""" + + PENDING = "PENDING" + LAUNCHING = "LAUNCHING" + READY = "READY" + FAILED = "FAILED" + SHUTDOWN = "SHUTDOWN" + + +class ModelType(str, Enum): + """Enum representing the possible model types.""" + + LLM = "LLM" + VLM = "VLM" + TEXT_EMBEDDING = "Text_Embedding" + REWARD_MODELING = "Reward_Modeling" + + +@dataclass +class ModelConfig: + """Model configuration parameters.""" + + model_name: str + model_family: str + model_variant: Optional[str] = None + model_type: ModelType = ModelType.LLM + num_gpus: int = 1 + num_nodes: int = 1 + vocab_size: int = 0 + max_model_len: int = 0 + max_num_seqs: int = 256 + pipeline_parallelism: bool = True + enforce_eager: bool = False + qos: str = "m2" + time: str = "08:00:00" + partition: str = "a40" + data_type: str = "auto" + venv: str = "singularity" + log_dir: Optional[Path] = None + model_weights_parent_dir: Optional[Path] = None + + +@dataclass +class ModelInfo: + """Information about an available model.""" + + name: str + family: str + variant: Optional[str] + type: ModelType + config: Dict[str, Any] + + +@dataclass +class LaunchResponse: + """Response from launching a model.""" + + slurm_job_id: str + model_name: str + config: Dict[str, Any] + raw_output: str = field(repr=False) + + +@dataclass +class StatusResponse: + """Response from checking a model's status.""" + + slurm_job_id: str + model_name: str + status: ModelStatus + raw_output: str = field(repr=False) + base_url: Optional[str] = None + pending_reason: Optional[str] = None + failed_reason: Optional[str] = None + + +@dataclass +class MetricsResponse: + """Response from retrieving model metrics.""" + + slurm_job_id: str + model_name: str + metrics: Dict[str, str] + timestamp: float + raw_output: str = field(repr=False) + + +@dataclass +class LaunchOptions: + """Options for launching a model.""" + + model_family: Optional[str] = None + model_variant: Optional[str] = None + max_model_len: Optional[int] = None + max_num_seqs: Optional[int] = None + partition: Optional[str] = None + num_nodes: Optional[int] = None + num_gpus: Optional[int] = None + qos: Optional[str] = None + time: Optional[str] = None + vocab_size: Optional[int] = None + data_type: Optional[str] = None + venv: Optional[str] = None + log_dir: Optional[str] = None + model_weights_parent_dir: Optional[str] = None + pipeline_parallelism: Optional[bool] = None + enforce_eager: Optional[bool] = None \ No newline at end of file diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py new file mode 100644 index 00000000..ab1043be --- /dev/null +++ b/vec_inf/api/utils.py @@ -0,0 +1,231 @@ +"""Utility functions for the Vector Inference API.""" + +import json +import os +import re +import subprocess +import time +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +import requests + +from vec_inf.api.models import ModelStatus +from vec_inf.cli._utils import ( + MODEL_READY_SIGNATURE, + SERVER_ADDRESS_SIGNATURE, + load_config as cli_load_config, + read_slurm_log, + run_bash_command, +) + + +class APIError(Exception): + """Base exception for API errors.""" + + pass + + +class ModelNotFoundError(APIError): + """Exception raised when a model is not found.""" + + pass + + +class SlurmJobError(APIError): + """Exception raised when there's an error 
with a Slurm job.""" + + pass + + +class ServerError(APIError): + """Exception raised when there's an error with the inference server.""" + + pass + + +def load_models(): + """Load model configurations.""" + return cli_load_config() + + +def parse_launch_output(output: str) -> Tuple[str, Dict[str, str]]: + """Parse output from model launch command. + + Args: + output: Output from the launch command + + Returns: + Tuple of (slurm_job_id, dict of config key-value pairs) + """ + slurm_job_id = output.split(" ")[-1].strip().strip("\n") + + # Extract config parameters + config_dict = {} + output_lines = output.split("\n")[:-2] + for line in output_lines: + if ": " in line: + key, value = line.split(": ", 1) + config_dict[key.lower().replace(" ", "_")] = value + + return slurm_job_id, config_dict + + +def get_model_status( + slurm_job_id: str, + log_dir: Optional[str] = None +) -> Tuple[ModelStatus, Dict[str, Any]]: + """Get the status of a model. + + Args: + slurm_job_id: The Slurm job ID + log_dir: Optional path to Slurm log directory + + Returns: + Tuple of (ModelStatus, dict with additional status info) + """ + status_cmd = f"scontrol show job {slurm_job_id} --oneliner" + output = run_bash_command(status_cmd) + + # Check if job exists + if "Invalid job id specified" in output: + raise SlurmJobError(f"Job {slurm_job_id} not found") + + # Extract job information + try: + job_name = output.split(" ")[1].split("=")[1] + job_state = output.split(" ")[9].split("=")[1] + except IndexError: + raise SlurmJobError(f"Could not parse job status for {slurm_job_id}") + + status_info = { + "model_name": job_name, + "base_url": None, + "pending_reason": None, + "failed_reason": None, + } + + # Process based on job state + if job_state == "PENDING": + try: + status_info["pending_reason"] = output.split(" ")[10].split("=")[1] + except IndexError: + status_info["pending_reason"] = "Unknown pending reason" + return ModelStatus.PENDING, status_info + + elif job_state in ["CANCELLED", "FAILED", "TIMEOUT", "PREEMPTED"]: + return ModelStatus.SHUTDOWN, status_info + + elif job_state == "RUNNING": + return check_server_status(job_name, slurm_job_id, log_dir, status_info) + + else: + # Unknown state + status_info["failed_reason"] = f"Unknown job state: {job_state}" + return ModelStatus.FAILED, status_info + + +def check_server_status( + job_name: str, + job_id: str, + log_dir: Optional[str], + status_info: Dict[str, Any] +) -> Tuple[ModelStatus, Dict[str, Any]]: + """Check the status of a running inference server. 
+ + Args: + job_name: The name of the Slurm job + job_id: The Slurm job ID + log_dir: Optional path to Slurm log directory + status_info: Dictionary to update with status information + + Returns: + Tuple of (ModelStatus, updated status_info) + """ + # Read error log to check if server is running + log_content = read_slurm_log(job_name, int(job_id), "err", log_dir) + if isinstance(log_content, str): + status_info["failed_reason"] = log_content + return ModelStatus.FAILED, status_info + + # Check for errors or if server is ready + for line in log_content: + if "error" in line.lower(): + status_info["failed_reason"] = line.strip("\n") + return ModelStatus.FAILED, status_info + + if MODEL_READY_SIGNATURE in line: + # Server is running, get URL and check health + base_url = get_base_url(job_name, int(job_id), log_dir) + if not isinstance(base_url, str) or not base_url.startswith("http"): + status_info["failed_reason"] = f"Invalid base URL: {base_url}" + return ModelStatus.FAILED, status_info + + status_info["base_url"] = base_url + + # Check if the server is healthy + health_check_url = base_url.replace("v1", "health") + try: + response = requests.get(health_check_url) + if response.status_code == 200: + return ModelStatus.READY, status_info + else: + status_info["failed_reason"] = f"Health check failed with status code {response.status_code}" + return ModelStatus.FAILED, status_info + except requests.exceptions.RequestException as e: + status_info["failed_reason"] = f"Health check request error: {str(e)}" + return ModelStatus.FAILED, status_info + + # If we get here, server is running but not yet ready + return ModelStatus.LAUNCHING, status_info + + +def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: + """Get the base URL of a running model. + + Args: + job_name: The name of the Slurm job + job_id: The Slurm job ID + log_dir: Optional path to Slurm log directory + + Returns: + The base URL string or an error message + """ + log_content = read_slurm_log(job_name, job_id, "out", log_dir) + if isinstance(log_content, str): + return log_content + + for line in log_content: + if SERVER_ADDRESS_SIGNATURE in line: + return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n") + return "URL_NOT_FOUND" + + +def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> Dict[str, str]: + """Get the latest metrics for a model. 
+ + Args: + job_name: The name of the Slurm job + job_id: The Slurm job ID + log_dir: Optional path to Slurm log directory + + Returns: + Dictionary of metrics or empty dict if not found + """ + log_content = read_slurm_log(job_name, job_id, "out", log_dir) + if isinstance(log_content, str): + return {} + + # Find the latest metrics entry + metrics = {} + for line in reversed(log_content): + if "Avg prompt throughput" in line: + # Parse metrics from the line + metrics_str = line.split("] ")[1].strip().strip(".") + metrics_list = metrics_str.split(", ") + for metric in metrics_list: + key, value = metric.split(": ") + metrics[key] = value + break + + return metrics \ No newline at end of file From d578cc83c8752e66a083d39048522a8701bc7603 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 2 Mar 2025 19:17:24 +0000 Subject: [PATCH 02/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/api/advanced_api_usage.py | 229 ++++++++++++++++++----------- examples/api/api_usage.py | 2 +- tests/vec_inf/api/README.md | 2 +- tests/vec_inf/api/__init__.py | 2 +- tests/vec_inf/api/test_client.py | 40 +++-- tests/vec_inf/api/test_examples.py | 61 ++++---- tests/vec_inf/api/test_models.py | 14 +- vec_inf/api/__init__.py | 11 +- vec_inf/api/client.py | 189 ++++++++++++------------ vec_inf/api/models.py | 4 +- vec_inf/api/utils.py | 105 +++++++------ 11 files changed, 358 insertions(+), 301 deletions(-) diff --git a/examples/api/advanced_api_usage.py b/examples/api/advanced_api_usage.py index 44e2503b..7365ebd4 100755 --- a/examples/api/advanced_api_usage.py +++ b/examples/api/advanced_api_usage.py @@ -7,11 +7,8 @@ import argparse import json -import os import time -from dataclasses import asdict -from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Dict, Union from openai import OpenAI from rich.console import Console @@ -33,7 +30,7 @@ def export_model_configs(output_file: str): """Export all model configurations to a JSON file.""" client = VecInfClient() models = client.list_models() - + # Convert model info to dictionaries model_dicts = [] for model in models: @@ -45,18 +42,22 @@ def export_model_configs(output_file: str): "config": model.config, } model_dicts.append(model_dict) - + # Write to file with open(output_file, "w") as f: json.dump(model_dicts, f, indent=2) - - console.print(f"[green]Exported {len(models)} model configurations to {output_file}[/green]") + + console.print( + f"[green]Exported {len(models)} model configurations to {output_file}[/green]" + ) -def launch_with_custom_config(model_name: str, custom_options: Dict[str, Union[str, int, bool]]): +def launch_with_custom_config( + model_name: str, custom_options: Dict[str, Union[str, int, bool]] +): """Launch a model with custom configuration options.""" client = VecInfClient() - + # Create LaunchOptions from dictionary options_dict = {} for key, value in custom_options.items(): @@ -64,107 +65,127 @@ def launch_with_custom_config(model_name: str, custom_options: Dict[str, Union[s options_dict[key] = value else: console.print(f"[yellow]Warning: Ignoring unknown option '{key}'[/yellow]") - + options = LaunchOptions(**options_dict) - + # Launch the model console.print(f"[blue]Launching model {model_name} with custom options:[/blue]") for key, value in options_dict.items(): console.print(f" [cyan]{key}[/cyan]: {value}") - + response = client.launch_model(model_name, 
options) - - console.print(f"[green]Model launched successfully![/green]") + + console.print("[green]Model launched successfully![/green]") console.print(f"Slurm Job ID: [bold]{response.slurm_job_id}[/bold]") - + return response.slurm_job_id def monitor_with_rich_ui(job_id: str, poll_interval: int = 5, max_time: int = 1800): """Monitor a model's status with a rich UI.""" client = VecInfClient() - + start_time = time.time() elapsed = 0 - + with Progress() as progress: # Add tasks - status_task = progress.add_task("[cyan]Waiting for model to be ready...", total=None) + status_task = progress.add_task( + "[cyan]Waiting for model to be ready...", total=None + ) time_task = progress.add_task("[yellow]Time elapsed", total=max_time) - + while elapsed < max_time: # Update time elapsed elapsed = int(time.time() - start_time) progress.update(time_task, completed=elapsed) - + # Get status try: status = client.get_status(job_id) - + # Update status message if status.status == ModelStatus.READY: - progress.update(status_task, description=f"[green]Model is READY at {status.base_url}[/green]") + progress.update( + status_task, + description=f"[green]Model is READY at {status.base_url}[/green]", + ) break - elif status.status == ModelStatus.FAILED: - progress.update(status_task, description=f"[red]Model FAILED: {status.failed_reason}[/red]") + if status.status == ModelStatus.FAILED: + progress.update( + status_task, + description=f"[red]Model FAILED: {status.failed_reason}[/red]", + ) break - elif status.status == ModelStatus.PENDING: - progress.update(status_task, description=f"[yellow]Model is PENDING: {status.pending_reason}[/yellow]") + if status.status == ModelStatus.PENDING: + progress.update( + status_task, + description=f"[yellow]Model is PENDING: {status.pending_reason}[/yellow]", + ) elif status.status == ModelStatus.LAUNCHING: - progress.update(status_task, description="[cyan]Model is LAUNCHING...[/cyan]") + progress.update( + status_task, description="[cyan]Model is LAUNCHING...[/cyan]" + ) elif status.status == ModelStatus.SHUTDOWN: - progress.update(status_task, description="[red]Model was SHUTDOWN[/red]") + progress.update( + status_task, description="[red]Model was SHUTDOWN[/red]" + ) break except Exception as e: - progress.update(status_task, description=f"[red]Error checking status: {str(e)}[/red]") - + progress.update( + status_task, + description=f"[red]Error checking status: {str(e)}[/red]", + ) + # Wait before checking again time.sleep(poll_interval) - + return client.get_status(job_id) def stream_metrics(job_id: str, duration: int = 60, interval: int = 5): """Stream metrics for a specified duration.""" client = VecInfClient() - + console.print(f"[blue]Streaming metrics for {duration} seconds...[/blue]") - + end_time = time.time() + duration while time.time() < end_time: try: metrics_response = client.get_metrics(job_id) - + if metrics_response.metrics: table = Table(title="Performance Metrics") table.add_column("Metric", style="cyan") table.add_column("Value", style="green") - + for key, value in metrics_response.metrics.items(): table.add_row(key, value) - + console.print(table) else: console.print("[yellow]No metrics available yet[/yellow]") - + except Exception as e: console.print(f"[red]Error retrieving metrics: {str(e)}[/red]") - + time.sleep(interval) -def batch_inference_example(base_url: str, model_name: str, input_file: str, output_file: str): +def batch_inference_example( + base_url: str, model_name: str, input_file: str, output_file: str +): """Perform batch inference on 
inputs from a file.""" # Read inputs with open(input_file, "r") as f: inputs = [line.strip() for line in f if line.strip()] - + openai_client = create_openai_client(base_url) - + results = [] with Progress() as progress: task = progress.add_task("[green]Processing inputs...", total=len(inputs)) - + for input_text in inputs: try: # Process using completions API @@ -173,103 +194,135 @@ def batch_inference_example(base_url: str, model_name: str, input_file: str, out prompt=input_text, max_tokens=100, ) - + # Store result - results.append({ - "input": input_text, - "output": completion.choices[0].text, - "tokens": completion.usage.completion_tokens, - }) - + results.append( + { + "input": input_text, + "output": completion.choices[0].text, + "tokens": completion.usage.completion_tokens, + } + ) + except Exception as e: - results.append({ - "input": input_text, - "error": str(e) - }) - + results.append({"input": input_text, "error": str(e)}) + progress.update(task, advance=1) - + # Write results with open(output_file, "w") as f: json.dump(results, f, indent=2) - - console.print(f"[green]Processed {len(inputs)} inputs and saved results to {output_file}[/green]") + + console.print( + f"[green]Processed {len(inputs)} inputs and saved results to {output_file}[/green]" + ) def main(): """Main function to parse arguments and run the selected function.""" - parser = argparse.ArgumentParser(description="Advanced Vector Inference API usage examples") + parser = argparse.ArgumentParser( + description="Advanced Vector Inference API usage examples" + ) subparsers = parser.add_subparsers(dest="command", help="Command to run") - + # Export configs command - export_parser = subparsers.add_parser("export-configs", help="Export all model configurations to a JSON file") - export_parser.add_argument("--output", "-o", default="model_configs.json", help="Output JSON file") - + export_parser = subparsers.add_parser( + "export-configs", help="Export all model configurations to a JSON file" + ) + export_parser.add_argument( + "--output", "-o", default="model_configs.json", help="Output JSON file" + ) + # Launch with custom config command - launch_parser = subparsers.add_parser("launch", help="Launch a model with custom configuration") + launch_parser = subparsers.add_parser( + "launch", help="Launch a model with custom configuration" + ) launch_parser.add_argument("model_name", help="Name of the model to launch") launch_parser.add_argument("--num-gpus", type=int, help="Number of GPUs to use") launch_parser.add_argument("--num-nodes", type=int, help="Number of nodes to use") - launch_parser.add_argument("--max-model-len", type=int, help="Maximum model context length") - launch_parser.add_argument("--max-num-seqs", type=int, help="Maximum number of sequences") + launch_parser.add_argument( + "--max-model-len", type=int, help="Maximum model context length" + ) + launch_parser.add_argument( + "--max-num-seqs", type=int, help="Maximum number of sequences" + ) launch_parser.add_argument("--partition", help="Partition to use") launch_parser.add_argument("--qos", help="Quality of service") launch_parser.add_argument("--time", help="Time limit") - + # Monitor command - monitor_parser = subparsers.add_parser("monitor", help="Monitor a model with rich UI") + monitor_parser = subparsers.add_parser( + "monitor", help="Monitor a model with rich UI" + ) monitor_parser.add_argument("job_id", help="Slurm job ID to monitor") - monitor_parser.add_argument("--interval", type=int, default=5, help="Polling interval in seconds") - 
monitor_parser.add_argument("--max-time", type=int, default=1800, help="Maximum time to monitor in seconds") - + monitor_parser.add_argument( + "--interval", type=int, default=5, help="Polling interval in seconds" + ) + monitor_parser.add_argument( + "--max-time", type=int, default=1800, help="Maximum time to monitor in seconds" + ) + # Stream metrics command metrics_parser = subparsers.add_parser("metrics", help="Stream metrics for a model") metrics_parser.add_argument("job_id", help="Slurm job ID to get metrics for") - metrics_parser.add_argument("--duration", type=int, default=60, help="Duration to stream metrics in seconds") - metrics_parser.add_argument("--interval", type=int, default=5, help="Polling interval in seconds") - + metrics_parser.add_argument( + "--duration", type=int, default=60, help="Duration to stream metrics in seconds" + ) + metrics_parser.add_argument( + "--interval", type=int, default=5, help="Polling interval in seconds" + ) + # Batch inference command batch_parser = subparsers.add_parser("batch", help="Perform batch inference") batch_parser.add_argument("base_url", help="Base URL of the model server") batch_parser.add_argument("model_name", help="Name of the model to use") - batch_parser.add_argument("--input", "-i", required=True, help="Input file with one prompt per line") - batch_parser.add_argument("--output", "-o", required=True, help="Output JSON file for results") - + batch_parser.add_argument( + "--input", "-i", required=True, help="Input file with one prompt per line" + ) + batch_parser.add_argument( + "--output", "-o", required=True, help="Output JSON file for results" + ) + args = parser.parse_args() - + # Run the selected command if args.command == "export-configs": export_model_configs(args.output) - + elif args.command == "launch": # Extract custom options from args options = {} for key, value in vars(args).items(): if key not in ["command", "model_name"] and value is not None: options[key] = value - + job_id = launch_with_custom_config(args.model_name, options) - + # Ask if user wants to monitor if console.input("[cyan]Monitor this job? (y/n): [/cyan]").lower() == "y": monitor_with_rich_ui(job_id) - + elif args.command == "monitor": status = monitor_with_rich_ui(args.job_id, args.interval, args.max_time) - + if status.status == ModelStatus.READY: - if console.input("[cyan]Stream metrics for this model? (y/n): [/cyan]").lower() == "y": + if ( + console.input( + "[cyan]Stream metrics for this model? (y/n): [/cyan]" + ).lower() + == "y" + ): stream_metrics(args.job_id) - + elif args.command == "metrics": stream_metrics(args.job_id, args.duration, args.interval) - + elif args.command == "batch": batch_inference_example(args.base_url, args.model_name, args.input, args.output) - + else: parser.print_help() if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/examples/api/api_usage.py b/examples/api/api_usage.py index ab7b0112..d50e418b 100755 --- a/examples/api/api_usage.py +++ b/examples/api/api_usage.py @@ -40,4 +40,4 @@ # Shutdown when done print("\nShutting down model...") client.shutdown_model(job_id) -print("Model shutdown complete") \ No newline at end of file +print("Model shutdown complete") diff --git a/tests/vec_inf/api/README.md b/tests/vec_inf/api/README.md index cc1e6ab1..4c40afc2 100644 --- a/tests/vec_inf/api/README.md +++ b/tests/vec_inf/api/README.md @@ -26,4 +26,4 @@ The tests cover the following areas: ## Dependencies -The tests use pytest and mock objects to isolate the tests from actual dependencies. 
\ No newline at end of file +The tests use pytest and mock objects to isolate the tests from actual dependencies. diff --git a/tests/vec_inf/api/__init__.py b/tests/vec_inf/api/__init__.py index e6e84bf0..4097e3a0 100644 --- a/tests/vec_inf/api/__init__.py +++ b/tests/vec_inf/api/__init__.py @@ -1 +1 @@ -"""Tests for the Vector Inference API.""" \ No newline at end of file +"""Tests for the Vector Inference API.""" diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/api/test_client.py index e6e53eb6..d3b9a550 100644 --- a/tests/vec_inf/api/test_client.py +++ b/tests/vec_inf/api/test_client.py @@ -1,12 +1,10 @@ """Tests for the Vector Inference API client.""" from unittest.mock import MagicMock, patch -from typing import Dict, Tuple, Any import pytest -from vec_inf.api import ModelStatus, VecInfClient, ModelType -from vec_inf.api.utils import ModelNotFoundError, SlurmJobError +from vec_inf.api import ModelStatus, ModelType, VecInfClient @pytest.fixture @@ -45,16 +43,16 @@ def test_list_models(): mock_model.family = "test-family" mock_model.variant = "test-variant" mock_model.type = ModelType.LLM - + client = VecInfClient() - + # Replace the list_models method with a lambda that returns our mock model original_list_models = client.list_models client.list_models = lambda: [mock_model] - + # Call the mocked method models = client.list_models() - + # Verify the results assert len(models) == 1 assert models[0].name == "test-model" @@ -65,23 +63,23 @@ def test_list_models(): def test_launch_model(mock_model_config, mock_launch_output): """Test successfully launching a model.""" client = VecInfClient() - + # Create mocks for all the dependencies client.get_model_config = MagicMock(return_value=MagicMock()) - + with patch("vec_inf.cli._utils.run_bash_command", return_value=mock_launch_output): with patch("vec_inf.api.utils.parse_launch_output", return_value="12345678"): # Create a mock response response = MagicMock() response.slurm_job_id = "12345678" response.model_name = "test-model" - + # Replace the actual implementation original_launch = client.launch_model client.launch_model = lambda model_name, options=None: response - + result = client.launch_model("test-model") - + assert result.slurm_job_id == "12345678" assert result.model_name == "test-model" @@ -89,18 +87,18 @@ def test_launch_model(mock_model_config, mock_launch_output): def test_get_status(mock_status_output): """Test getting the status of a model.""" client = VecInfClient() - + # Create a mock for the status response status_response = MagicMock() status_response.slurm_job_id = "12345678" status_response.status = ModelStatus.READY - + # Mock the get_status method client.get_status = lambda job_id, log_dir=None: status_response - + # Call the mocked method status = client.get_status("12345678") - + assert status.slurm_job_id == "12345678" assert status.status == ModelStatus.READY @@ -111,17 +109,17 @@ def test_wait_until_ready(): # First call returns LAUNCHING, second call returns READY status1 = MagicMock() status1.status = ModelStatus.LAUNCHING - + status2 = MagicMock() status2.status = ModelStatus.READY status2.base_url = "http://gpu123:8080/v1" - + mock_status.side_effect = [status1, status2] - + with patch("time.sleep"): # Don't actually sleep in tests client = VecInfClient() result = client.wait_until_ready("12345678", timeout_seconds=5) - + assert result.status == ModelStatus.READY assert result.base_url == "http://gpu123:8080/v1" - assert mock_status.call_count == 2 \ No newline at end of file + assert 
mock_status.call_count == 2 diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index deee00ee..3fa61274 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -1,86 +1,86 @@ """Tests to verify the API examples function properly.""" import os -import sys -from pathlib import Path from unittest.mock import MagicMock, patch import pytest -from vec_inf.api import ModelStatus, VecInfClient, ModelType +from vec_inf.api import ModelStatus, ModelType, VecInfClient @pytest.fixture def mock_client(): """Create a mocked VecInfClient.""" client = MagicMock(spec=VecInfClient) - + # Set up mock responses mock_model1 = MagicMock() mock_model1.name = "test-model" mock_model1.family = "test-family" mock_model1.type = ModelType.LLM - + mock_model2 = MagicMock() mock_model2.name = "test-model-2" mock_model2.family = "test-family-2" mock_model2.type = ModelType.VLM - + client.list_models.return_value = [mock_model1, mock_model2] - + launch_response = MagicMock() launch_response.slurm_job_id = "123456" launch_response.model_name = "Meta-Llama-3.1-8B-Instruct" client.launch_model.return_value = launch_response - + status_response = MagicMock() status_response.status = ModelStatus.READY status_response.base_url = "http://gpu123:8080/v1" client.wait_until_ready.return_value = status_response - + metrics_response = MagicMock() metrics_response.metrics = {"throughput": "10.5"} client.get_metrics.return_value = metrics_response - + return client -@pytest.mark.skipif(not os.path.exists(os.path.join("examples", "api", "api_usage.py")), - reason="Example file not found") +@pytest.mark.skipif( + not os.path.exists(os.path.join("examples", "api", "api_usage.py")), + reason="Example file not found", +) def test_api_usage_example(): """Test the basic API usage example.""" example_path = os.path.join("examples", "api", "api_usage.py") - + # Create a mock client mock_client = MagicMock(spec=VecInfClient) - + # Set up mock responses mock_model = MagicMock() mock_model.name = "Meta-Llama-3.1-8B-Instruct" mock_model.type = ModelType.LLM mock_client.list_models.return_value = [mock_model] - + launch_response = MagicMock() launch_response.slurm_job_id = "123456" mock_client.launch_model.return_value = launch_response - + status_response = MagicMock() status_response.status = ModelStatus.READY status_response.base_url = "http://gpu123:8080/v1" mock_client.wait_until_ready.return_value = status_response - + metrics_response = MagicMock() metrics_response.metrics = {"throughput": "10.5"} mock_client.get_metrics.return_value = metrics_response - + # Mock the VecInfClient class - with patch('vec_inf.api.VecInfClient', return_value=mock_client): + with patch("vec_inf.api.VecInfClient", return_value=mock_client): # Mock print to avoid output - with patch('builtins.print'): + with patch("builtins.print"): # Execute the script with open(example_path) as f: exec(f.read()) - + # Verify the client methods were called mock_client.list_models.assert_called_once() mock_client.launch_model.assert_called_once() @@ -89,27 +89,30 @@ def test_api_usage_example(): mock_client.shutdown_model.assert_called_once() -@pytest.mark.skipif(not os.path.exists(os.path.join("examples", "api", "api_usage.py")), - reason="Example file not found") +@pytest.mark.skipif( + not os.path.exists(os.path.join("examples", "api", "api_usage.py")), + reason="Example file not found", +) def test_openai_client_compatibility(): """Test that OpenAI client can be used with API base URLs.""" # Create a mock 
for the OpenAI client mock_openai_client = MagicMock() - + # Create a mock for the VecInfClient mock_vec_inf_client = MagicMock(spec=VecInfClient) status = MagicMock() status.base_url = "http://gpu123:8080/v1" mock_vec_inf_client.wait_until_ready.return_value = status - + # Mock the OpenAI class - with patch('openai.OpenAI', return_value=mock_openai_client) as mock_openai_class: + with patch("openai.OpenAI", return_value=mock_openai_client) as mock_openai_class: # Get URL from the API model_status = mock_vec_inf_client.wait_until_ready("123456") - + # Create OpenAI client with the URL from openai import OpenAI + openai_client = OpenAI(base_url=model_status.base_url, api_key="") - + # Verify mocks were called as expected - mock_openai_class.assert_called_with(base_url=status.base_url, api_key="") \ No newline at end of file + mock_openai_class.assert_called_with(base_url=status.base_url, api_key="") diff --git a/tests/vec_inf/api/test_models.py b/tests/vec_inf/api/test_models.py index ababa5b2..d9a8abe7 100644 --- a/tests/vec_inf/api/test_models.py +++ b/tests/vec_inf/api/test_models.py @@ -1,8 +1,6 @@ """Tests for the Vector Inference API data models.""" -import pytest - -from vec_inf.api import ModelInfo, ModelStatus, ModelType, LaunchOptions +from vec_inf.api import LaunchOptions, ModelInfo, ModelStatus, ModelType def test_model_info_creation(): @@ -12,9 +10,9 @@ def test_model_info_creation(): family="test-family", variant="test-variant", type=ModelType.LLM, - config={"num_gpus": 1} + config={"num_gpus": 1}, ) - + assert model.name == "test-model" assert model.family == "test-family" assert model.variant == "test-variant" @@ -31,7 +29,7 @@ def test_model_info_optional_fields(): type=ModelType.LLM, config={}, ) - + assert model.name == "test-model" assert model.family == "test-family" assert model.variant is None @@ -41,7 +39,7 @@ def test_model_info_optional_fields(): def test_launch_options_default_values(): """Test LaunchOptions with default values.""" options = LaunchOptions() - + assert options.num_gpus is None assert options.partition is None assert options.data_type is None @@ -55,4 +53,4 @@ def test_model_status_enum(): assert ModelStatus.LAUNCHING.value == "LAUNCHING" assert ModelStatus.READY.value == "READY" assert ModelStatus.FAILED.value == "FAILED" - assert ModelStatus.SHUTDOWN.value == "SHUTDOWN" \ No newline at end of file + assert ModelStatus.SHUTDOWN.value == "SHUTDOWN" diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py index a07f2704..b266904a 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/api/__init__.py @@ -7,16 +7,17 @@ from vec_inf.api.client import VecInfClient from vec_inf.api.models import ( + LaunchOptions, LaunchResponse, - StatusResponse, - ModelInfo, - ModelConfig, MetricsResponse, + ModelConfig, + ModelInfo, ModelStatus, ModelType, - LaunchOptions, + StatusResponse, ) + __all__ = [ "VecInfClient", "LaunchResponse", @@ -27,4 +28,4 @@ "ModelStatus", "ModelType", "LaunchOptions", -] \ No newline at end of file +] diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 73683e32..86524ae8 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -4,11 +4,10 @@ services programmatically. 
""" -import json import os import time from pathlib import Path -from typing import Any, Dict, List, Optional, Union, cast +from typing import List, Optional from vec_inf.api.models import ( LaunchOptions, @@ -23,9 +22,8 @@ from vec_inf.api.utils import ( APIError, ModelNotFoundError, - SlurmJobError, ServerError, - get_base_url, + SlurmJobError, get_metrics, get_model_status, load_models, @@ -36,49 +34,52 @@ class VecInfClient: """Client for interacting with Vector Inference programmatically. - + This class provides methods for launching models, checking their status, retrieving metrics, and shutting down models using the Vector Inference infrastructure. - - Examples: + + Examples + -------- ```python from vec_inf.api import VecInfClient - + # Create a client client = VecInfClient() - + # Launch a model response = client.launch_model("Meta-Llama-3.1-8B-Instruct") job_id = response.slurm_job_id - + # Check status status = client.get_status(job_id) if status.status == ModelStatus.READY: print(f"Model is ready at {status.base_url}") - + # Shutdown when done client.shutdown_model(job_id) ``` """ - + def __init__(self): """Initialize the Vector Inference client.""" pass - + def list_models(self) -> List[ModelInfo]: """List all available models. - - Returns: + + Returns + ------- List of ModelInfo objects containing information about available models. - - Raises: + + Raises + ------ APIError: If there was an error retrieving model information. """ try: model_configs = load_models() result = [] - + for config in model_configs: info = ModelInfo( name=config.model_name, @@ -88,21 +89,23 @@ def list_models(self) -> List[ModelInfo]: config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), ) result.append(info) - + return result except Exception as e: raise APIError(f"Failed to list models: {str(e)}") from e - + def get_model_config(self, model_name: str) -> ModelConfig: """Get the configuration for a specific model. - + Args: model_name: Name of the model to get configuration for. - - Returns: + + Returns + ------- ModelConfig object containing the model's configuration. - - Raises: + + Raises + ------ ModelNotFoundError: If the specified model is not found. APIError: If there was an error retrieving the model configuration. """ @@ -111,28 +114,28 @@ def get_model_config(self, model_name: str) -> ModelConfig: for config in model_configs: if config.model_name == model_name: return config - + raise ModelNotFoundError(f"Model '{model_name}' not found") except ModelNotFoundError: raise except Exception as e: raise APIError(f"Failed to get model configuration: {str(e)}") from e - + def launch_model( - self, - model_name: str, - options: Optional[LaunchOptions] = None + self, model_name: str, options: Optional[LaunchOptions] = None ) -> LaunchResponse: """Launch a model on the cluster. - + Args: model_name: Name of the model to launch. options: Optional launch options to override default configuration. - - Returns: + + Returns + ------- LaunchResponse object containing information about the launched model. - - Raises: + + Raises + ------ ModelNotFoundError: If the specified model is not found. APIError: If there was an error launching the model. 
""" @@ -143,25 +146,25 @@ def launch_model( "launch_server.sh", ) base_command = f"bash {script_path}" - + # Get model configuration try: model_config = self.get_model_config(model_name) except ModelNotFoundError: raise - + # Apply options if provided params = model_config.model_dump(exclude={"model_name"}) if options: options_dict = {k: v for k, v in vars(options).items() if v is not None} params.update(options_dict) - + # Build the command with parameters command = base_command for param_name, param_value in params.items(): if param_value is None: continue - + # Format boolean values if isinstance(param_value, bool): formatted_value = "True" if param_value else "False" @@ -169,16 +172,16 @@ def launch_model( formatted_value = str(param_value) else: formatted_value = param_value - + arg_name = param_name.replace("_", "-") command += f" --{arg_name} {formatted_value}" - + # Execute the command output = run_bash_command(command) - + # Parse the output job_id, config_dict = parse_launch_output(output) - + return LaunchResponse( slurm_job_id=job_id, model_name=model_name, @@ -189,31 +192,31 @@ def launch_model( raise except Exception as e: raise APIError(f"Failed to launch model: {str(e)}") from e - + def get_status( - self, - slurm_job_id: str, - log_dir: Optional[str] = None + self, slurm_job_id: str, log_dir: Optional[str] = None ) -> StatusResponse: """Get the status of a running model. - + Args: slurm_job_id: The Slurm job ID to check. log_dir: Optional path to the Slurm log directory. - - Returns: + + Returns + ------- StatusResponse object containing the model's status information. - - Raises: + + Raises + ------ SlurmJobError: If the specified job is not found or there's an error with the job. APIError: If there was an error retrieving the status. """ try: status_cmd = f"scontrol show job {slurm_job_id} --oneliner" output = run_bash_command(status_cmd) - + status, status_info = get_model_status(slurm_job_id, log_dir) - + return StatusResponse( slurm_job_id=slurm_job_id, model_name=status_info["model_name"], @@ -227,36 +230,34 @@ def get_status( raise except Exception as e: raise APIError(f"Failed to get status: {str(e)}") from e - + def get_metrics( - self, - slurm_job_id: str, - log_dir: Optional[str] = None + self, slurm_job_id: str, log_dir: Optional[str] = None ) -> MetricsResponse: """Get the performance metrics of a running model. - + Args: slurm_job_id: The Slurm job ID to get metrics for. log_dir: Optional path to the Slurm log directory. - - Returns: + + Returns + ------- MetricsResponse object containing the model's performance metrics. - - Raises: + + Raises + ------ SlurmJobError: If the specified job is not found or there's an error with the job. APIError: If there was an error retrieving the metrics. """ try: # First check if the job exists and get the job name status_response = self.get_status(slurm_job_id, log_dir) - + # Get metrics metrics = get_metrics( - status_response.model_name, - int(slurm_job_id), - log_dir + status_response.model_name, int(slurm_job_id), log_dir ) - + return MetricsResponse( slurm_job_id=slurm_job_id, model_name=status_response.model_name, @@ -268,17 +269,19 @@ def get_metrics( raise except Exception as e: raise APIError(f"Failed to get metrics: {str(e)}") from e - + def shutdown_model(self, slurm_job_id: str) -> bool: """Shutdown a running model. - + Args: slurm_job_id: The Slurm job ID to shut down. - - Returns: + + Returns + ------- True if the model was successfully shutdown, False otherwise. 
- - Raises: + + Raises + ------ APIError: If there was an error shutting down the model. """ try: @@ -287,48 +290,52 @@ def shutdown_model(self, slurm_job_id: str) -> bool: return True except Exception as e: raise APIError(f"Failed to shutdown model: {str(e)}") from e - + def wait_until_ready( - self, - slurm_job_id: str, + self, + slurm_job_id: str, timeout_seconds: int = 1800, poll_interval_seconds: int = 10, - log_dir: Optional[str] = None + log_dir: Optional[str] = None, ) -> StatusResponse: """Wait until a model is ready or fails. - + Args: slurm_job_id: The Slurm job ID to wait for. timeout_seconds: Maximum time to wait in seconds (default: 30 minutes). poll_interval_seconds: How often to check status in seconds (default: 10 seconds). log_dir: Optional path to the Slurm log directory. - - Returns: + + Returns + ------- StatusResponse object once the model is ready or failed. - - Raises: + + Raises + ------ SlurmJobError: If the specified job is not found or there's an error with the job. ServerError: If the server fails to start within the timeout period. APIError: If there was an error checking the status. """ start_time = time.time() - + while True: status = self.get_status(slurm_job_id, log_dir) - + if status.status == ModelStatus.READY: return status - + if status.status == ModelStatus.FAILED: error_message = status.failed_reason or "Unknown error" raise ServerError(f"Model failed to start: {error_message}") - + if status.status == ModelStatus.SHUTDOWN: raise ServerError("Model was shutdown before it became ready") - + # Check timeout if time.time() - start_time > timeout_seconds: - raise ServerError(f"Timed out waiting for model to become ready after {timeout_seconds} seconds") - + raise ServerError( + f"Timed out waiting for model to become ready after {timeout_seconds} seconds" + ) + # Wait before checking again - time.sleep(poll_interval_seconds) \ No newline at end of file + time.sleep(poll_interval_seconds) diff --git a/vec_inf/api/models.py b/vec_inf/api/models.py index 3dcbbe02..a02c7155 100644 --- a/vec_inf/api/models.py +++ b/vec_inf/api/models.py @@ -7,7 +7,7 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Dict, Optional class ModelStatus(str, Enum): @@ -117,4 +117,4 @@ class LaunchOptions: log_dir: Optional[str] = None model_weights_parent_dir: Optional[str] = None pipeline_parallelism: Optional[bool] = None - enforce_eager: Optional[bool] = None \ No newline at end of file + enforce_eager: Optional[bool] = None diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py index ab1043be..9cfbf814 100644 --- a/vec_inf/api/utils.py +++ b/vec_inf/api/utils.py @@ -1,12 +1,6 @@ """Utility functions for the Vector Inference API.""" -import json -import os -import re -import subprocess -import time -from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, Optional, Tuple import requests @@ -14,10 +8,12 @@ from vec_inf.cli._utils import ( MODEL_READY_SIGNATURE, SERVER_ADDRESS_SIGNATURE, - load_config as cli_load_config, read_slurm_log, run_bash_command, ) +from vec_inf.cli._utils import ( + load_config as cli_load_config, +) class APIError(Exception): @@ -51,15 +47,16 @@ def load_models(): def parse_launch_output(output: str) -> Tuple[str, Dict[str, str]]: """Parse output from model launch command. 
- + Args: output: Output from the launch command - - Returns: + + Returns + ------- Tuple of (slurm_job_id, dict of config key-value pairs) """ slurm_job_id = output.split(" ")[-1].strip().strip("\n") - + # Extract config parameters config_dict = {} output_lines = output.split("\n")[:-2] @@ -67,44 +64,44 @@ def parse_launch_output(output: str) -> Tuple[str, Dict[str, str]]: if ": " in line: key, value = line.split(": ", 1) config_dict[key.lower().replace(" ", "_")] = value - + return slurm_job_id, config_dict def get_model_status( - slurm_job_id: str, - log_dir: Optional[str] = None + slurm_job_id: str, log_dir: Optional[str] = None ) -> Tuple[ModelStatus, Dict[str, Any]]: """Get the status of a model. - + Args: slurm_job_id: The Slurm job ID log_dir: Optional path to Slurm log directory - - Returns: + + Returns + ------- Tuple of (ModelStatus, dict with additional status info) """ status_cmd = f"scontrol show job {slurm_job_id} --oneliner" output = run_bash_command(status_cmd) - + # Check if job exists if "Invalid job id specified" in output: raise SlurmJobError(f"Job {slurm_job_id} not found") - + # Extract job information try: job_name = output.split(" ")[1].split("=")[1] job_state = output.split(" ")[9].split("=")[1] except IndexError: raise SlurmJobError(f"Could not parse job status for {slurm_job_id}") - + status_info = { "model_name": job_name, "base_url": None, "pending_reason": None, "failed_reason": None, } - + # Process based on job state if job_state == "PENDING": try: @@ -112,34 +109,31 @@ def get_model_status( except IndexError: status_info["pending_reason"] = "Unknown pending reason" return ModelStatus.PENDING, status_info - - elif job_state in ["CANCELLED", "FAILED", "TIMEOUT", "PREEMPTED"]: + + if job_state in ["CANCELLED", "FAILED", "TIMEOUT", "PREEMPTED"]: return ModelStatus.SHUTDOWN, status_info - - elif job_state == "RUNNING": + + if job_state == "RUNNING": return check_server_status(job_name, slurm_job_id, log_dir, status_info) - - else: - # Unknown state - status_info["failed_reason"] = f"Unknown job state: {job_state}" - return ModelStatus.FAILED, status_info + + # Unknown state + status_info["failed_reason"] = f"Unknown job state: {job_state}" + return ModelStatus.FAILED, status_info def check_server_status( - job_name: str, - job_id: str, - log_dir: Optional[str], - status_info: Dict[str, Any] + job_name: str, job_id: str, log_dir: Optional[str], status_info: Dict[str, Any] ) -> Tuple[ModelStatus, Dict[str, Any]]: """Check the status of a running inference server. 
- + Args: job_name: The name of the Slurm job job_id: The Slurm job ID log_dir: Optional path to Slurm log directory status_info: Dictionary to update with status information - - Returns: + + Returns + ------- Tuple of (ModelStatus, updated status_info) """ # Read error log to check if server is running @@ -147,48 +141,50 @@ def check_server_status( if isinstance(log_content, str): status_info["failed_reason"] = log_content return ModelStatus.FAILED, status_info - + # Check for errors or if server is ready for line in log_content: if "error" in line.lower(): status_info["failed_reason"] = line.strip("\n") return ModelStatus.FAILED, status_info - + if MODEL_READY_SIGNATURE in line: # Server is running, get URL and check health base_url = get_base_url(job_name, int(job_id), log_dir) if not isinstance(base_url, str) or not base_url.startswith("http"): status_info["failed_reason"] = f"Invalid base URL: {base_url}" return ModelStatus.FAILED, status_info - + status_info["base_url"] = base_url - + # Check if the server is healthy health_check_url = base_url.replace("v1", "health") try: response = requests.get(health_check_url) if response.status_code == 200: return ModelStatus.READY, status_info - else: - status_info["failed_reason"] = f"Health check failed with status code {response.status_code}" - return ModelStatus.FAILED, status_info + status_info["failed_reason"] = ( + f"Health check failed with status code {response.status_code}" + ) + return ModelStatus.FAILED, status_info except requests.exceptions.RequestException as e: status_info["failed_reason"] = f"Health check request error: {str(e)}" return ModelStatus.FAILED, status_info - + # If we get here, server is running but not yet ready return ModelStatus.LAUNCHING, status_info def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: """Get the base URL of a running model. - + Args: job_name: The name of the Slurm job job_id: The Slurm job ID log_dir: Optional path to Slurm log directory - - Returns: + + Returns + ------- The base URL string or an error message """ log_content = read_slurm_log(job_name, job_id, "out", log_dir) @@ -203,13 +199,14 @@ def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> Dict[str, str]: """Get the latest metrics for a model. 
- + Args: job_name: The name of the Slurm job job_id: The Slurm job ID log_dir: Optional path to Slurm log directory - - Returns: + + Returns + ------- Dictionary of metrics or empty dict if not found """ log_content = read_slurm_log(job_name, job_id, "out", log_dir) @@ -227,5 +224,5 @@ def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> Dict[str, key, value = metric.split(": ") metrics[key] = value break - - return metrics \ No newline at end of file + + return metrics From f311f85b225e1431a34dec25160bb0582b5a2d2e Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Tue, 11 Mar 2025 13:19:16 -0400 Subject: [PATCH 03/52] Fix linting issues, initial clean-up --- examples/api/advanced_api_usage.py | 45 +++++----- tests/vec_inf/api/test_client.py | 26 +++--- tests/vec_inf/api/test_examples.py | 41 ++------- vec_inf/_shared/__init__.py | 4 + vec_inf/api/__init__.py | 2 + vec_inf/api/client.py | 134 ++++++++++++++++----------- vec_inf/api/models.py | 25 +++++- vec_inf/api/utils.py | 140 +++++++++++++++++------------ 8 files changed, 238 insertions(+), 179 deletions(-) create mode 100644 vec_inf/_shared/__init__.py diff --git a/examples/api/advanced_api_usage.py b/examples/api/advanced_api_usage.py index 7365ebd4..af02f49f 100755 --- a/examples/api/advanced_api_usage.py +++ b/examples/api/advanced_api_usage.py @@ -15,7 +15,13 @@ from rich.progress import Progress from rich.table import Table -from vec_inf.api import LaunchOptions, ModelStatus, VecInfClient +from vec_inf.api import ( + LaunchOptions, + LaunchOptionsDict, + ModelStatus, + StatusResponse, + VecInfClient, +) console = Console() @@ -26,7 +32,7 @@ def create_openai_client(base_url: str) -> OpenAI: return OpenAI(base_url=base_url, api_key="EMPTY") -def export_model_configs(output_file: str): +def export_model_configs(output_file: str) -> None: """Export all model configurations to a JSON file.""" client = VecInfClient() models = client.list_models() @@ -54,15 +60,15 @@ def export_model_configs(output_file: str): def launch_with_custom_config( model_name: str, custom_options: Dict[str, Union[str, int, bool]] -): +) -> str: """Launch a model with custom configuration options.""" client = VecInfClient() # Create LaunchOptions from dictionary - options_dict = {} + options_dict: LaunchOptionsDict = {} for key, value in custom_options.items(): if key in LaunchOptions.__annotations__: - options_dict[key] = value + options_dict[key] = value # type: ignore[literal-required] else: console.print(f"[yellow]Warning: Ignoring unknown option '{key}'[/yellow]") @@ -70,8 +76,8 @@ def launch_with_custom_config( # Launch the model console.print(f"[blue]Launching model {model_name} with custom options:[/blue]") - for key, value in options_dict.items(): - console.print(f" [cyan]{key}[/cyan]: {value}") + for key, value in options_dict.items(): # type: ignore[assignment] + console.print(f" [cyan]{key}[/cyan]: {value}") response = client.launch_model(model_name, options) @@ -81,7 +87,9 @@ def launch_with_custom_config( return response.slurm_job_id -def monitor_with_rich_ui(job_id: str, poll_interval: int = 5, max_time: int = 1800): +def monitor_with_rich_ui( + job_id: str, poll_interval: int = 5, max_time: int = 1800 +) -> StatusResponse: """Monitor a model's status with a rich UI.""" client = VecInfClient() @@ -143,7 +151,7 @@ def monitor_with_rich_ui(job_id: str, poll_interval: int = 5, max_time: int = 18 return client.get_status(job_id) -def stream_metrics(job_id: str, duration: int = 60, interval: int = 5): +def stream_metrics(job_id: 
str, duration: int = 60, interval: int = 5) -> None: """Stream metrics for a specified duration.""" client = VecInfClient() @@ -174,7 +182,7 @@ def stream_metrics(job_id: str, duration: int = 60, interval: int = 5): def batch_inference_example( base_url: str, model_name: str, input_file: str, output_file: str -): +) -> None: """Perform batch inference on inputs from a file.""" # Read inputs with open(input_file, "r") as f: @@ -218,8 +226,8 @@ def batch_inference_example( ) -def main(): - """Main function to parse arguments and run the selected function.""" +def main() -> None: + """Parse arguments and run the selected function.""" parser = argparse.ArgumentParser( description="Advanced Vector Inference API usage examples" ) @@ -305,14 +313,11 @@ def main(): elif args.command == "monitor": status = monitor_with_rich_ui(args.job_id, args.interval, args.max_time) - if status.status == ModelStatus.READY: - if ( - console.input( - "[cyan]Stream metrics for this model? (y/n): [/cyan]" - ).lower() - == "y" - ): - stream_metrics(args.job_id) + if (status.status == ModelStatus.READY) and ( + console.input("[cyan]Stream metrics for this model? (y/n): [/cyan]").lower() + == "y" + ): + stream_metrics(args.job_id) elif args.command == "metrics": stream_metrics(args.job_id, args.duration, args.interval) diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/api/test_client.py index d3b9a550..b782db34 100644 --- a/tests/vec_inf/api/test_client.py +++ b/tests/vec_inf/api/test_client.py @@ -47,7 +47,6 @@ def test_list_models(): client = VecInfClient() # Replace the list_models method with a lambda that returns our mock model - original_list_models = client.list_models client.list_models = lambda: [mock_model] # Call the mocked method @@ -67,21 +66,22 @@ def test_launch_model(mock_model_config, mock_launch_output): # Create mocks for all the dependencies client.get_model_config = MagicMock(return_value=MagicMock()) - with patch("vec_inf.cli._utils.run_bash_command", return_value=mock_launch_output): - with patch("vec_inf.api.utils.parse_launch_output", return_value="12345678"): - # Create a mock response - response = MagicMock() - response.slurm_job_id = "12345678" - response.model_name = "test-model" + with ( + patch("vec_inf.cli._utils.run_bash_command", return_value=mock_launch_output), + patch("vec_inf.api.utils.parse_launch_output", return_value="12345678"), + ): + # Create a mock response + response = MagicMock() + response.slurm_job_id = "12345678" + response.model_name = "test-model" - # Replace the actual implementation - original_launch = client.launch_model - client.launch_model = lambda model_name, options=None: response + # Replace the actual implementation + client.launch_model = lambda model_name, options=None: response - result = client.launch_model("test-model") + result = client.launch_model("test-model") - assert result.slurm_job_id == "12345678" - assert result.model_name == "test-model" + assert result.slurm_job_id == "12345678" + assert result.model_name == "test-model" def test_get_status(mock_status_output): diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index 3fa61274..9c0b0cef 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -74,12 +74,12 @@ def test_api_usage_example(): mock_client.get_metrics.return_value = metrics_response # Mock the VecInfClient class - with patch("vec_inf.api.VecInfClient", return_value=mock_client): - # Mock print to avoid output - with patch("builtins.print"): - # Execute 
the script - with open(example_path) as f: - exec(f.read()) + with ( + patch("vec_inf.api.VecInfClient", return_value=mock_client), + patch("builtins.print"), + open(example_path) as f, + ): + exec(f.read()) # Verify the client methods were called mock_client.list_models.assert_called_once() @@ -87,32 +87,3 @@ def test_api_usage_example(): mock_client.wait_until_ready.assert_called_once() mock_client.get_metrics.assert_called_once() mock_client.shutdown_model.assert_called_once() - - -@pytest.mark.skipif( - not os.path.exists(os.path.join("examples", "api", "api_usage.py")), - reason="Example file not found", -) -def test_openai_client_compatibility(): - """Test that OpenAI client can be used with API base URLs.""" - # Create a mock for the OpenAI client - mock_openai_client = MagicMock() - - # Create a mock for the VecInfClient - mock_vec_inf_client = MagicMock(spec=VecInfClient) - status = MagicMock() - status.base_url = "http://gpu123:8080/v1" - mock_vec_inf_client.wait_until_ready.return_value = status - - # Mock the OpenAI class - with patch("openai.OpenAI", return_value=mock_openai_client) as mock_openai_class: - # Get URL from the API - model_status = mock_vec_inf_client.wait_until_ready("123456") - - # Create OpenAI client with the URL - from openai import OpenAI - - openai_client = OpenAI(base_url=model_status.base_url, api_key="") - - # Verify mocks were called as expected - mock_openai_class.assert_called_with(base_url=status.base_url, api_key="") diff --git a/vec_inf/_shared/__init__.py b/vec_inf/_shared/__init__.py new file mode 100644 index 00000000..8f44ffe6 --- /dev/null +++ b/vec_inf/_shared/__init__.py @@ -0,0 +1,4 @@ +"""Shared modules for vec_inf.""" + + + diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py index b266904a..e46ea39e 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/api/__init__.py @@ -8,6 +8,7 @@ from vec_inf.api.client import VecInfClient from vec_inf.api.models import ( LaunchOptions, + LaunchOptionsDict, LaunchResponse, MetricsResponse, ModelConfig, @@ -28,4 +29,5 @@ "ModelStatus", "ModelType", "LaunchOptions", + "LaunchOptionsDict", ] diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 86524ae8..04737d65 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -13,7 +13,6 @@ LaunchOptions, LaunchResponse, MetricsResponse, - ModelConfig, ModelInfo, ModelStatus, ModelType, @@ -29,6 +28,7 @@ load_models, parse_launch_output, ) +from vec_inf.cli._config import ModelConfig from vec_inf.cli._utils import run_bash_command @@ -41,27 +41,18 @@ class VecInfClient: Examples -------- - ```python - from vec_inf.api import VecInfClient + >>> from vec_inf.api import VecInfClient + >>> client = VecInfClient() + >>> response = client.launch_model("Meta-Llama-3.1-8B-Instruct") + >>> job_id = response.slurm_job_id + >>> status = client.get_status(job_id) + >>> if status.status == ModelStatus.READY: + ... 
print(f"Model is ready at {status.base_url}") + >>> client.shutdown_model(job_id) - # Create a client - client = VecInfClient() - - # Launch a model - response = client.launch_model("Meta-Llama-3.1-8B-Instruct") - job_id = response.slurm_job_id - - # Check status - status = client.get_status(job_id) - if status.status == ModelStatus.READY: - print(f"Model is ready at {status.base_url}") - - # Shutdown when done - client.shutdown_model(job_id) - ``` """ - def __init__(self): + def __init__(self) -> None: """Initialize the Vector Inference client.""" pass @@ -70,11 +61,14 @@ def list_models(self) -> List[ModelInfo]: Returns ------- - List of ModelInfo objects containing information about available models. + List[ModelInfo] + ModelInfo objects containing information about available models. Raises ------ - APIError: If there was an error retrieving model information. + APIError + If there was an error retrieving model information. + """ try: model_configs = load_models() @@ -97,17 +91,22 @@ def list_models(self) -> List[ModelInfo]: def get_model_config(self, model_name: str) -> ModelConfig: """Get the configuration for a specific model. - Args: - model_name: Name of the model to get configuration for. + Parameters + ---------- + model_name: str + Name of the model to get configuration for. Returns ------- - ModelConfig object containing the model's configuration. + ModelConfig + Model configuration. Raises ------ - ModelNotFoundError: If the specified model is not found. - APIError: If there was an error retrieving the model configuration. + ModelNotFoundError + Error if the specified model is not found. + APIError + Error if there was an error retrieving the model configuration. """ try: model_configs = load_models() @@ -126,18 +125,24 @@ def launch_model( ) -> LaunchResponse: """Launch a model on the cluster. - Args: - model_name: Name of the model to launch. - options: Optional launch options to override default configuration. + Parameters + ---------- + model_name: str + Name of the model to launch. + options: LaunchOptions, optional + Optional launch options to override default configuration. Returns ------- - LaunchResponse object containing information about the launched model. + LaunchResponse + Information about the launched model. Raises ------ - ModelNotFoundError: If the specified model is not found. - APIError: If there was an error launching the model. + ModelNotFoundError + Error if the specified model is not found. + APIError + Error if there was an error launching the model. """ try: # Build the launch command @@ -198,18 +203,24 @@ def get_status( ) -> StatusResponse: """Get the status of a running model. - Args: - slurm_job_id: The Slurm job ID to check. - log_dir: Optional path to the Slurm log directory. + Parameters + ---------- + slurm_job_id: str + The Slurm job ID to check. + log_dir: str, optional + Optional path to the Slurm log directory. Returns ------- - StatusResponse object containing the model's status information. + StatusResponse + Model status information. Raises ------ - SlurmJobError: If the specified job is not found or there's an error with the job. - APIError: If there was an error retrieving the status. + SlurmJobError + Error if the specified job is not found or there's an error with the job. + APIError + Error if there was an error retrieving the status. """ try: status_cmd = f"scontrol show job {slurm_job_id} --oneliner" @@ -236,18 +247,25 @@ def get_metrics( ) -> MetricsResponse: """Get the performance metrics of a running model. 
- Args: - slurm_job_id: The Slurm job ID to get metrics for. - log_dir: Optional path to the Slurm log directory. + Parameters + ---------- + slurm_job_id : str + The Slurm job ID to get metrics for. + log_dir : str, optional + Optional path to the Slurm log directory. Returns ------- - MetricsResponse object containing the model's performance metrics. + MetricsResponse + Object containing the model's performance metrics. Raises ------ - SlurmJobError: If the specified job is not found or there's an error with the job. - APIError: If there was an error retrieving the metrics. + SlurmJobError + If the specified job is not found or there's an error with the job. + APIError + If there was an error retrieving the metrics. + """ try: # First check if the job exists and get the job name @@ -300,21 +318,31 @@ def wait_until_ready( ) -> StatusResponse: """Wait until a model is ready or fails. - Args: - slurm_job_id: The Slurm job ID to wait for. - timeout_seconds: Maximum time to wait in seconds (default: 30 minutes). - poll_interval_seconds: How often to check status in seconds (default: 10 seconds). - log_dir: Optional path to the Slurm log directory. + Parameters + ---------- + slurm_job_id: str + The Slurm job ID to wait for. + timeout_seconds: int + Maximum time to wait in seconds (default: 30 mins). + poll_interval_seconds: int + How often to check status in seconds (default: 10s). + log_dir: str, optional + Optional path to the Slurm log directory. Returns ------- - StatusResponse object once the model is ready or failed. + StatusResponse + Status, if the model is ready or failed. Raises ------ - SlurmJobError: If the specified job is not found or there's an error with the job. - ServerError: If the server fails to start within the timeout period. - APIError: If there was an error checking the status. + SlurmJobError + If the specified job is not found or there's an error with the job. + ServerError + If the server fails to start within the timeout period. + APIError + If there was an error checking the status. 
+ """ start_time = time.time() diff --git a/vec_inf/api/models.py b/vec_inf/api/models.py index a02c7155..bff729d6 100644 --- a/vec_inf/api/models.py +++ b/vec_inf/api/models.py @@ -7,7 +7,9 @@ from dataclasses import dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, Optional +from typing import Any, Dict, Optional, TypedDict + +from typing_extensions import NotRequired class ModelStatus(str, Enum): @@ -118,3 +120,24 @@ class LaunchOptions: model_weights_parent_dir: Optional[str] = None pipeline_parallelism: Optional[bool] = None enforce_eager: Optional[bool] = None + + +class LaunchOptionsDict(TypedDict): + """TypedDict for LaunchOptions.""" + + model_family: NotRequired[Optional[str]] + model_variant: NotRequired[Optional[str]] + max_model_len: NotRequired[Optional[int]] + max_num_seqs: NotRequired[Optional[int]] + partition: NotRequired[Optional[str]] + num_nodes: NotRequired[Optional[int]] + num_gpus: NotRequired[Optional[int]] + qos: NotRequired[Optional[str]] + time: NotRequired[Optional[str]] + vocab_size: NotRequired[Optional[int]] + data_type: NotRequired[Optional[str]] + venv: NotRequired[Optional[str]] + log_dir: NotRequired[Optional[str]] + model_weights_parent_dir: NotRequired[Optional[str]] + pipeline_parallelism: NotRequired[Optional[bool]] + enforce_eager: NotRequired[Optional[bool]] diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py index 9cfbf814..bc3ea691 100644 --- a/vec_inf/api/utils.py +++ b/vec_inf/api/utils.py @@ -1,10 +1,11 @@ """Utility functions for the Vector Inference API.""" -from typing import Any, Dict, Optional, Tuple +from typing import Any, Optional import requests from vec_inf.api.models import ModelStatus +from vec_inf.cli._config import ModelConfig from vec_inf.cli._utils import ( MODEL_READY_SIGNATURE, SERVER_ADDRESS_SIGNATURE, @@ -40,20 +41,24 @@ class ServerError(APIError): pass -def load_models(): +def load_models() -> list[ModelConfig]: """Load model configurations.""" return cli_load_config() -def parse_launch_output(output: str) -> Tuple[str, Dict[str, str]]: +def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: """Parse output from model launch command. - Args: - output: Output from the launch command + Parameters + ---------- + output: str + Output from the launch command Returns ------- - Tuple of (slurm_job_id, dict of config key-value pairs) + tuple[str, dict[str, str]] + Slurm job ID and dictionary of config parameters + """ slurm_job_id = output.split(" ")[-1].strip().strip("\n") @@ -70,16 +75,21 @@ def parse_launch_output(output: str) -> Tuple[str, Dict[str, str]]: def get_model_status( slurm_job_id: str, log_dir: Optional[str] = None -) -> Tuple[ModelStatus, Dict[str, Any]]: +) -> tuple[ModelStatus, dict[str, Any]]: """Get the status of a model. 
- Args: - slurm_job_id: The Slurm job ID - log_dir: Optional path to Slurm log directory + Parameters + ---------- + slurm_job_id: str + The Slurm job ID + log_dir: str, optional + Optional path to Slurm log directory Returns ------- - Tuple of (ModelStatus, dict with additional status info) + tuple[ModelStatus, dict[str, Any]] + Model status and status information + """ status_cmd = f"scontrol show job {slurm_job_id} --oneliner" output = run_bash_command(status_cmd) @@ -92,8 +102,8 @@ def get_model_status( try: job_name = output.split(" ")[1].split("=")[1] job_state = output.split(" ")[9].split("=")[1] - except IndexError: - raise SlurmJobError(f"Could not parse job status for {slurm_job_id}") + except IndexError as err: + raise SlurmJobError(f"Could not parse job status for {slurm_job_id}") from err status_info = { "model_name": job_name, @@ -122,70 +132,80 @@ def get_model_status( def check_server_status( - job_name: str, job_id: str, log_dir: Optional[str], status_info: Dict[str, Any] -) -> Tuple[ModelStatus, Dict[str, Any]]: - """Check the status of a running inference server. - - Args: - job_name: The name of the Slurm job - job_id: The Slurm job ID - log_dir: Optional path to Slurm log directory - status_info: Dictionary to update with status information - - Returns - ------- - Tuple of (ModelStatus, updated status_info) - """ - # Read error log to check if server is running + job_name: str, job_id: str, log_dir: Optional[str], status_info: dict[str, Any] +) -> tuple[ModelStatus, dict[str, Any]]: + """Check the status of a running inference server.""" + # Initialize default status + final_status = ModelStatus.LAUNCHING log_content = read_slurm_log(job_name, int(job_id), "err", log_dir) + + # Handle initial log reading error if isinstance(log_content, str): status_info["failed_reason"] = log_content return ModelStatus.FAILED, status_info - # Check for errors or if server is ready + # Process log content for line in log_content: - if "error" in line.lower(): + line_lower = line.lower() + + # Check for error indicators + if "error" in line_lower: status_info["failed_reason"] = line.strip("\n") - return ModelStatus.FAILED, status_info + final_status = ModelStatus.FAILED + break + # Check for server ready signal if MODEL_READY_SIGNATURE in line: - # Server is running, get URL and check health base_url = get_base_url(job_name, int(job_id), log_dir) + + # Validate base URL if not isinstance(base_url, str) or not base_url.startswith("http"): status_info["failed_reason"] = f"Invalid base URL: {base_url}" - return ModelStatus.FAILED, status_info + final_status = ModelStatus.FAILED + break status_info["base_url"] = base_url + final_status = _perform_health_check(base_url, status_info) + break # Stop processing after first ready signature + + return final_status, status_info - # Check if the server is healthy - health_check_url = base_url.replace("v1", "health") - try: - response = requests.get(health_check_url) - if response.status_code == 200: - return ModelStatus.READY, status_info - status_info["failed_reason"] = ( - f"Health check failed with status code {response.status_code}" - ) - return ModelStatus.FAILED, status_info - except requests.exceptions.RequestException as e: - status_info["failed_reason"] = f"Health check request error: {str(e)}" - return ModelStatus.FAILED, status_info - # If we get here, server is running but not yet ready - return ModelStatus.LAUNCHING, status_info +def _perform_health_check(base_url: str, status_info: dict[str, Any]) -> ModelStatus: + """Execute health 
check and return appropriate status.""" + health_check_url = base_url.replace("v1", "health") + + try: + response = requests.get(health_check_url) + if response.status_code == 200: + return ModelStatus.READY + + status_info["failed_reason"] = ( + f"Health check failed with status code {response.status_code}" + ) + except requests.exceptions.RequestException as e: + status_info["failed_reason"] = f"Health check request error: {str(e)}" + + return ModelStatus.FAILED def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: """Get the base URL of a running model. - Args: - job_name: The name of the Slurm job - job_id: The Slurm job ID - log_dir: Optional path to Slurm log directory + Parameters + ---------- + job_name: str + The name of the Slurm job + job_id: int + The Slurm job ID + log_dir: str, optional + Optional path to Slurm log directory Returns ------- + str The base URL string or an error message + """ log_content = read_slurm_log(job_name, job_id, "out", log_dir) if isinstance(log_content, str): @@ -197,17 +217,23 @@ def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: return "URL_NOT_FOUND" -def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> Dict[str, str]: +def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> dict[str, str]: """Get the latest metrics for a model. - Args: - job_name: The name of the Slurm job - job_id: The Slurm job ID - log_dir: Optional path to Slurm log directory + Parameters + ---------- + job_name: str + The name of the Slurm job + job_id: int + The Slurm job ID + log_dir: str, optional + Optional path to Slurm log directory Returns ------- + dict[str, str] Dictionary of metrics or empty dict if not found + """ log_content = read_slurm_log(job_name, job_id, "out", log_dir) if isinstance(log_content, str): From 05fb3ad3053843d300a1f99c68b366385b92f000 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Mar 2025 17:19:31 +0000 Subject: [PATCH 04/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vec_inf/_shared/__init__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/vec_inf/_shared/__init__.py b/vec_inf/_shared/__init__.py index 8f44ffe6..b1d18c95 100644 --- a/vec_inf/_shared/__init__.py +++ b/vec_inf/_shared/__init__.py @@ -1,4 +1 @@ """Shared modules for vec_inf.""" - - - From ec5e35627d8c2119f875cec0c569982d44972d55 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Fri, 14 Mar 2025 09:28:28 -0400 Subject: [PATCH 05/52] Fix mypy errors, test --- tests/vec_inf/cli/test_cli.py | 19 ++++++++++++++----- vec_inf/api/client.py | 4 ++-- vec_inf/api/utils.py | 32 ++------------------------------ 3 files changed, 18 insertions(+), 37 deletions(-) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index 0f1723d7..a6fc7d22 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -4,6 +4,7 @@ import traceback from contextlib import ExitStack from pathlib import Path +from typing import Callable, Optional from unittest.mock import mock_open, patch import pytest @@ -154,13 +155,21 @@ def _mock_truediv(self, other): return _mock_truediv -def create_path_exists(test_paths, path_exists, exists_paths=None): +def create_path_exists( + test_paths: dict[Path, str], + path_exists: Callable[[Path], bool], + exists_paths: Optional[list[Path]] = None, +): """Create a path existence checker. 
- Args: - test_paths: Dictionary containing test paths - path_exists: Default path existence checker - exists_paths: Optional list of paths that should exist + Parameters + ---------- + test_paths: dict[Path, str] + Dictionary containing test paths + path_exists: Callable[[Path], bool] + Default path existence checker + exists_paths: Optional[list[Path]] + Optional list of paths that should exist """ def _custom_path_exists(p): diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 04737d65..4c11ed59 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -182,7 +182,7 @@ def launch_model( command += f" --{arg_name} {formatted_value}" # Execute the command - output = run_bash_command(command) + output, _ = run_bash_command(command) # Parse the output job_id, config_dict = parse_launch_output(output) @@ -224,7 +224,7 @@ def get_status( """ try: status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output = run_bash_command(status_cmd) + output, _ = run_bash_command(status_cmd) status, status_info = get_model_status(slurm_job_id, log_dir) diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py index bc3ea691..864b5656 100644 --- a/vec_inf/api/utils.py +++ b/vec_inf/api/utils.py @@ -8,7 +8,7 @@ from vec_inf.cli._config import ModelConfig from vec_inf.cli._utils import ( MODEL_READY_SIGNATURE, - SERVER_ADDRESS_SIGNATURE, + get_base_url, read_slurm_log, run_bash_command, ) @@ -92,7 +92,7 @@ def get_model_status( """ status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output = run_bash_command(status_cmd) + output, _ = run_bash_command(status_cmd) # Check if job exists if "Invalid job id specified" in output: @@ -189,34 +189,6 @@ def _perform_health_check(base_url: str, status_info: dict[str, Any]) -> ModelSt return ModelStatus.FAILED -def get_base_url(job_name: str, job_id: int, log_dir: Optional[str]) -> str: - """Get the base URL of a running model. - - Parameters - ---------- - job_name: str - The name of the Slurm job - job_id: int - The Slurm job ID - log_dir: str, optional - Optional path to Slurm log directory - - Returns - ------- - str - The base URL string or an error message - - """ - log_content = read_slurm_log(job_name, job_id, "out", log_dir) - if isinstance(log_content, str): - return log_content - - for line in log_content: - if SERVER_ADDRESS_SIGNATURE in line: - return line.split(SERVER_ADDRESS_SIGNATURE)[1].strip("\n") - return "URL_NOT_FOUND" - - def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> dict[str, str]: """Get the latest metrics for a model. 
From 8d28d7f60758094df597e74930fa3dff7e679fbb Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sat, 15 Mar 2025 09:38:58 -0400 Subject: [PATCH 06/52] Move common stuff to shared package --- tests/vec_inf/api/test_client.py | 9 +- tests/vec_inf/cli/test_cli.py | 6 +- tests/vec_inf/cli/test_utils.py | 12 +- vec_inf/api/__init__.py | 3 +- vec_inf/api/client.py | 16 +- vec_inf/api/models.py | 20 +- vec_inf/api/utils.py | 39 +--- vec_inf/cli/_cli.py | 2 +- vec_inf/cli/_helper.py | 29 ++- vec_inf/cli/_utils.py | 186 +--------------- vec_inf/{_shared => shared}/__init__.py | 0 vec_inf/{cli/_config.py => shared/config.py} | 0 vec_inf/shared/models.py | 22 ++ vec_inf/shared/utils.py | 214 +++++++++++++++++++ 14 files changed, 289 insertions(+), 269 deletions(-) rename vec_inf/{_shared => shared}/__init__.py (100%) rename vec_inf/{cli/_config.py => shared/config.py} (100%) create mode 100644 vec_inf/shared/models.py create mode 100644 vec_inf/shared/utils.py diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/api/test_client.py index b782db34..cb26674e 100644 --- a/tests/vec_inf/api/test_client.py +++ b/tests/vec_inf/api/test_client.py @@ -67,8 +67,13 @@ def test_launch_model(mock_model_config, mock_launch_output): client.get_model_config = MagicMock(return_value=MagicMock()) with ( - patch("vec_inf.cli._utils.run_bash_command", return_value=mock_launch_output), - patch("vec_inf.api.utils.parse_launch_output", return_value="12345678"), + patch( + "vec_inf.shared.utils.run_bash_command", + return_value=(mock_launch_output, ""), + ), + patch( + "vec_inf.shared.utils.parse_launch_output", return_value=("12345678", {}) + ), ): # Create a mock response response = MagicMock() diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index a6fc7d22..a9d94573 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -220,7 +220,7 @@ def test_launch_command_success(runner, mock_launch_output, path_exists, debug_h test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared.utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -251,7 +251,7 @@ def test_launch_command_with_json_output( """Test JSON output format for launch command.""" test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared.utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -309,7 +309,7 @@ def test_launch_command_model_not_in_config_with_weights( for patch_obj in base_patches: stack.enter_context(patch_obj) # Apply specific patches for this test - mock_run = stack.enter_context(patch("vec_inf.cli._utils.run_bash_command")) + mock_run = stack.enter_context(patch("vec_inf.shared.utils.run_bash_command")) stack.enter_context(patch("pathlib.Path.exists", new=custom_path_exists)) expected_job_id = "14933051" diff --git a/tests/vec_inf/cli/test_utils.py b/tests/vec_inf/cli/test_utils.py index bacf66dc..1586ff34 100644 --- a/tests/vec_inf/cli/test_utils.py +++ b/tests/vec_inf/cli/test_utils.py @@ -6,7 +6,7 @@ import pytest import requests -from vec_inf.cli._utils import ( +from vec_inf.shared.utils import ( MODEL_READY_SIGNATURE, convert_boolean_value, create_table, @@ -79,7 +79,7 
@@ def test_read_slurm_log_not_found(): ) def test_is_server_running_statuses(log_content, expected): """Test that is_server_running returns the correct status.""" - with patch("vec_inf.cli._utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: mock_read.return_value = log_content result = is_server_running("test_job", 123, None) assert result == expected @@ -88,7 +88,7 @@ def test_is_server_running_statuses(log_content, expected): def test_get_base_url_found(): """Test that get_base_url returns the correct base URL.""" test_dict = {"server_address": "http://localhost:8000"} - with patch("vec_inf.cli._utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: mock_read.return_value = test_dict result = get_base_url("test_job", 123, None) assert result == "http://localhost:8000" @@ -96,7 +96,7 @@ def test_get_base_url_found(): def test_get_base_url_not_found(): """Test get_base_url when URL is not found in logs.""" - with patch("vec_inf.cli._utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: mock_read.return_value = {"random_key": "123"} result = get_base_url("test_job", 123, None) assert result == "URL NOT FOUND" @@ -112,7 +112,7 @@ def test_get_base_url_not_found(): ) def test_model_health_check(url, status_code, expected): """Test model_health_check with various scenarios.""" - with patch("vec_inf.cli._utils.get_base_url") as mock_url: + with patch("vec_inf.shared.utils.get_base_url") as mock_url: mock_url.return_value = url if url.startswith("http"): with patch("requests.get") as mock_get: @@ -127,7 +127,7 @@ def test_model_health_check(url, status_code, expected): def test_model_health_check_request_exception(): """Test model_health_check when request raises an exception.""" with ( - patch("vec_inf.cli._utils.get_base_url") as mock_url, + patch("vec_inf.shared.utils.get_base_url") as mock_url, patch("requests.get") as mock_get, ): mock_url.return_value = "http://localhost:8000" diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py index e46ea39e..9bab599c 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/api/__init__.py @@ -13,10 +13,9 @@ MetricsResponse, ModelConfig, ModelInfo, - ModelStatus, - ModelType, StatusResponse, ) +from vec_inf.shared.models import ModelStatus, ModelType __all__ = [ diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 4c11ed59..705eefa9 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -14,8 +14,6 @@ LaunchResponse, MetricsResponse, ModelInfo, - ModelStatus, - ModelType, StatusResponse, ) from vec_inf.api.utils import ( @@ -26,10 +24,13 @@ get_metrics, get_model_status, load_models, +) +from vec_inf.shared.config import ModelConfig +from vec_inf.shared.models import ModelStatus, ModelType +from vec_inf.shared.utils import ( parse_launch_output, + run_bash_command, ) -from vec_inf.cli._config import ModelConfig -from vec_inf.cli._utils import run_bash_command class VecInfClient: @@ -107,6 +108,7 @@ def get_model_config(self, model_name: str) -> ModelConfig: Error if the specified model is not found. APIError Error if there was an error retrieving the model configuration. + """ try: model_configs = load_models() @@ -291,8 +293,10 @@ def get_metrics( def shutdown_model(self, slurm_job_id: str) -> bool: """Shutdown a running model. - Args: - slurm_job_id: The Slurm job ID to shut down. + Parameters + ---------- + slurm_job_id: str + The Slurm job ID to shut down. 
Returns ------- diff --git a/vec_inf/api/models.py b/vec_inf/api/models.py index bff729d6..1b6581d7 100644 --- a/vec_inf/api/models.py +++ b/vec_inf/api/models.py @@ -5,30 +5,12 @@ """ from dataclasses import dataclass, field -from enum import Enum from pathlib import Path from typing import Any, Dict, Optional, TypedDict from typing_extensions import NotRequired - -class ModelStatus(str, Enum): - """Enum representing the possible status states of a model.""" - - PENDING = "PENDING" - LAUNCHING = "LAUNCHING" - READY = "READY" - FAILED = "FAILED" - SHUTDOWN = "SHUTDOWN" - - -class ModelType(str, Enum): - """Enum representing the possible model types.""" - - LLM = "LLM" - VLM = "VLM" - TEXT_EMBEDDING = "Text_Embedding" - REWARD_MODELING = "Reward_Modeling" +from vec_inf.shared.models import ModelStatus, ModelType @dataclass diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py index 864b5656..ae20a13a 100644 --- a/vec_inf/api/utils.py +++ b/vec_inf/api/utils.py @@ -4,17 +4,15 @@ import requests -from vec_inf.api.models import ModelStatus -from vec_inf.cli._config import ModelConfig -from vec_inf.cli._utils import ( +from vec_inf.shared.config import ModelConfig +from vec_inf.shared.models import ModelStatus +from vec_inf.shared.utils import ( MODEL_READY_SIGNATURE, get_base_url, + load_config, read_slurm_log, run_bash_command, ) -from vec_inf.cli._utils import ( - load_config as cli_load_config, -) class APIError(Exception): @@ -43,34 +41,7 @@ class ServerError(APIError): def load_models() -> list[ModelConfig]: """Load model configurations.""" - return cli_load_config() - - -def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: - """Parse output from model launch command. - - Parameters - ---------- - output: str - Output from the launch command - - Returns - ------- - tuple[str, dict[str, str]] - Slurm job ID and dictionary of config parameters - - """ - slurm_job_id = output.split(" ")[-1].strip().strip("\n") - - # Extract config parameters - config_dict = {} - output_lines = output.split("\n")[:-2] - for line in output_lines: - if ": " in line: - key, value = line.split(": ", 1) - config_dict[key.lower().replace(" ", "_")] = value - - return slurm_job_id, config_dict + return load_config() def get_model_status( diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 72d640b0..9a8ea3e8 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -7,8 +7,8 @@ from rich.console import Console from rich.live import Live -import vec_inf.cli._utils as utils from vec_inf.cli._helper import LaunchHelper, ListHelper, StatusHelper +from vec_inf.shared import utils CONSOLE = Console() diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 44e2211b..585749f6 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -11,8 +11,15 @@ from rich.panel import Panel from rich.table import Table -import vec_inf.cli._utils as utils -from vec_inf.cli._config import ModelConfig +from vec_inf.shared.config import ModelConfig +from vec_inf.shared.utils import ( + convert_boolean_value, + create_table, + get_base_url, + is_server_running, + load_config, + model_health_check, +) VLLM_TASK_MAP = { @@ -46,7 +53,7 @@ def __init__( def _get_model_configuration(self) -> ModelConfig: """Load and validate model configuration.""" - model_configs = utils.load_config() + model_configs = load_config() config = next( (m for m in model_configs if m.model_name == self.model_name), None ) @@ -89,7 +96,7 @@ def _get_launch_params(self) -> dict[str, Any]: # Process boolean fields for 
bool_field in ["pipeline_parallelism", "enforce_eager"]: if (value := self.cli_kwargs.get(bool_field)) is not None: - params[bool_field] = utils.convert_boolean_value(value) + params[bool_field] = convert_boolean_value(value) # Merge other overrides for key, value in self.cli_kwargs.items(): @@ -167,7 +174,7 @@ def build_launch_command(self) -> str: def format_table_output(self, job_id: str) -> Table: """Format output as rich Table.""" - table = utils.create_table(key_title="Job Config", value_title="Value") + table = create_table(key_title="Job Config", value_title="Value") # Add rows table.add_row("Slurm Job ID", job_id, style="blue") table.add_row("Job Name", self.model_name) @@ -245,11 +252,11 @@ def process_job_state(self) -> None: def check_model_health(self) -> None: """Check model health and update status accordingly.""" - status, status_code = utils.model_health_check( + status, status_code = model_health_check( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir ) if status == "READY": - self.status_info["base_url"] = utils.get_base_url( + self.status_info["base_url"] = get_base_url( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir, @@ -263,7 +270,7 @@ def check_model_health(self) -> None: def process_running_state(self) -> None: """Process RUNNING job state and check server status.""" - server_status = utils.is_server_running( + server_status = is_server_running( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir ) @@ -303,7 +310,7 @@ def output_json(self) -> None: def output_table(self, console: Console) -> None: """Create and display rich table.""" - table = utils.create_table(key_title="Job Status", value_title="Value") + table = create_table(key_title="Job Status", value_title="Value") table.add_row("Model Name", self.status_info["model_name"]) table.add_row("Model Status", self.status_info["status"], style="blue") @@ -322,7 +329,7 @@ class ListHelper: def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): self.model_name = model_name self.json_mode = json_mode - self.model_configs = utils.load_config() + self.model_configs = load_config() def get_single_model_config(self) -> ModelConfig: """Get configuration for a specific model.""" @@ -349,7 +356,7 @@ def format_single_model_output( ) return config_dict - table = utils.create_table(key_title="Model Config", value_title="Value") + table = create_table(key_title="Model Config", value_title="Value") for field, value in config.model_dump().items(): if field not in {"venv", "log_dir"}: table.add_row(field, str(value)) diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py index dfa1b1fc..41e6d788 100644 --- a/vec_inf/cli/_utils.py +++ b/vec_inf/cli/_utils.py @@ -1,187 +1,3 @@ """Utility functions for the CLI.""" -import json -import os -import subprocess -from pathlib import Path -from typing import Any, Optional, Union, cast - -import requests -import yaml -from rich.table import Table - -from vec_inf.cli._config import ModelConfig - - -MODEL_READY_SIGNATURE = "INFO: Application startup complete." 
-CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") - - -def run_bash_command(command: str) -> tuple[str, str]: - """Run a bash command and return the output.""" - process = subprocess.Popen( - command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True - ) - return process.communicate() - - -def read_slurm_log( - slurm_job_name: str, - slurm_job_id: int, - slurm_log_type: str, - log_dir: Optional[Union[str, Path]], -) -> Union[list[str], str, dict[str, str]]: - """Read the slurm log file.""" - if not log_dir: - # Default log directory - models_dir = Path.home() / ".vec-inf-logs" - # Iterate over all dirs in models_dir, sorted by dir name length in desc order - for directory in sorted( - [d for d in models_dir.iterdir() if d.is_dir()], - key=lambda d: len(d.name), - reverse=True, - ): - if directory.name in slurm_job_name: - log_dir = directory - break - else: - log_dir = Path(log_dir) - - # If log_dir is still not set, then didn't find the log dir at default location - if not log_dir: - return "LOG DIR NOT FOUND" - - try: - file_path = ( - log_dir - / Path(f"{slurm_job_name}.{slurm_job_id}") - / f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}" - ) - if slurm_log_type == "json": - with file_path.open("r") as file: - json_content: dict[str, str] = json.load(file) - return json_content - else: - with file_path.open("r") as file: - return file.readlines() - except FileNotFoundError: - return f"LOG FILE NOT FOUND: {file_path}" - - -def is_server_running( - slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] -) -> Union[str, tuple[str, str]]: - """Check if a model is ready to serve requests.""" - log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir) - if isinstance(log_content, str): - return log_content - - status: Union[str, tuple[str, str]] = "LAUNCHING" - - for line in log_content: - if "error" in line.lower(): - status = ("FAILED", line.strip("\n")) - if MODEL_READY_SIGNATURE in line: - status = "RUNNING" - - return status - - -def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str: - """Get the base URL of a model.""" - log_content = read_slurm_log(slurm_job_name, slurm_job_id, "json", log_dir) - if isinstance(log_content, str): - return log_content - - server_addr = cast(dict[str, str], log_content).get("server_address") - return server_addr if server_addr else "URL NOT FOUND" - - -def model_health_check( - slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] -) -> tuple[str, Union[str, int]]: - """Check the health of a running model on the cluster.""" - base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir) - if not base_url.startswith("http"): - return ("FAILED", base_url) - health_check_url = base_url.replace("v1", "health") - - try: - response = requests.get(health_check_url) - # Check if the request was successful - if response.status_code == 200: - return ("READY", response.status_code) - return ("FAILED", response.status_code) - except requests.exceptions.RequestException as e: - return ("FAILED", str(e)) - - -def create_table( - key_title: str = "", value_title: str = "", show_header: bool = True -) -> Table: - """Create a table for displaying model status.""" - table = Table(show_header=show_header, header_style="bold magenta") - table.add_column(key_title, style="dim") - table.add_column(value_title) - return table - - -def load_config() -> list[ModelConfig]: - """Load the model configuration.""" - default_path = ( - CACHED_CONFIG - if 
CACHED_CONFIG.exists() - else Path(__file__).resolve().parent.parent / "config" / "models.yaml" - ) - - config: dict[str, Any] = {} - with open(default_path) as f: - config = yaml.safe_load(f) or {} - - user_path = os.getenv("VEC_INF_CONFIG") - if user_path: - user_path_obj = Path(user_path) - if user_path_obj.exists(): - with open(user_path_obj) as f: - user_config = yaml.safe_load(f) or {} - for name, data in user_config.get("models", {}).items(): - if name in config.get("models", {}): - config["models"][name].update(data) - else: - config.setdefault("models", {})[name] = data - else: - print( - f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}" - ) - - return [ - ModelConfig(model_name=name, **model_data) - for name, model_data in config.get("models", {}).items() - ] - - -def get_latest_metric(log_lines: list[str]) -> Union[str, dict[str, str]]: - """Read the latest metric entry from the log file.""" - latest_metric = {} - - try: - for line in reversed(log_lines): - if "Avg prompt throughput" in line: - # Parse the metric values from the line - metrics_str = line.split("] ")[1].strip().strip(".") - metrics_list = metrics_str.split(", ") - for metric in metrics_list: - key, value = metric.split(": ") - latest_metric[key] = value - break - except Exception as e: - return f"[red]Error reading log file: {e}[/red]" - - return latest_metric - - -def convert_boolean_value(value: Union[str, int, bool]) -> bool: - """Convert various input types to boolean strings.""" - if isinstance(value, str): - return value.lower() == "true" - return bool(value) +# Import all shared utilities diff --git a/vec_inf/_shared/__init__.py b/vec_inf/shared/__init__.py similarity index 100% rename from vec_inf/_shared/__init__.py rename to vec_inf/shared/__init__.py diff --git a/vec_inf/cli/_config.py b/vec_inf/shared/config.py similarity index 100% rename from vec_inf/cli/_config.py rename to vec_inf/shared/config.py diff --git a/vec_inf/shared/models.py b/vec_inf/shared/models.py new file mode 100644 index 00000000..809ad5f9 --- /dev/null +++ b/vec_inf/shared/models.py @@ -0,0 +1,22 @@ +"""Shared data models for Vector Inference.""" + +from enum import Enum + + +class ModelStatus(str, Enum): + """Enum representing the possible status states of a model.""" + + PENDING = "PENDING" + LAUNCHING = "LAUNCHING" + READY = "READY" + FAILED = "FAILED" + SHUTDOWN = "SHUTDOWN" + + +class ModelType(str, Enum): + """Enum representing the possible model types.""" + + LLM = "LLM" + VLM = "VLM" + TEXT_EMBEDDING = "Text_Embedding" + REWARD_MODELING = "Reward_Modeling" diff --git a/vec_inf/shared/utils.py b/vec_inf/shared/utils.py new file mode 100644 index 00000000..b654ee37 --- /dev/null +++ b/vec_inf/shared/utils.py @@ -0,0 +1,214 @@ +"""Utility functions shared between CLI and API.""" + +import json +import os +import subprocess +from pathlib import Path +from typing import Any, Dict, Optional, Union, cast + +import requests +import yaml +from rich.table import Table + +from vec_inf.shared.config import ModelConfig + + +MODEL_READY_SIGNATURE = "INFO: Application startup complete." 
+CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") + + +def run_bash_command(command: str) -> tuple[str, str]: + """Run a bash command and return the output.""" + process = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True + ) + return process.communicate() + + +def read_slurm_log( + slurm_job_name: str, + slurm_job_id: int, + slurm_log_type: str, + log_dir: Optional[Union[str, Path]], +) -> Union[list[str], str, Dict[str, str]]: + """Read the slurm log file.""" + if not log_dir: + # Default log directory + models_dir = Path.home() / ".vec-inf-logs" + # Iterate over all dirs in models_dir, sorted by dir name length in desc order + for directory in sorted( + [d for d in models_dir.iterdir() if d.is_dir()], + key=lambda d: len(d.name), + reverse=True, + ): + if directory.name in slurm_job_name: + log_dir = directory + break + else: + log_dir = Path(log_dir) + + # If log_dir is still not set, then didn't find the log dir at default location + if not log_dir: + return "LOG DIR NOT FOUND" + + try: + file_path = ( + log_dir + / Path(f"{slurm_job_name}.{slurm_job_id}") + / f"{slurm_job_name}.{slurm_job_id}.{slurm_log_type}" + ) + if slurm_log_type == "json": + with file_path.open("r") as file: + json_content: Dict[str, str] = json.load(file) + return json_content + else: + with file_path.open("r") as file: + return file.readlines() + except FileNotFoundError: + return f"LOG FILE NOT FOUND: {file_path}" + + +def is_server_running( + slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] +) -> Union[str, tuple[str, str]]: + """Check if a model is ready to serve requests.""" + log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir) + if isinstance(log_content, str): + return log_content + + status: Union[str, tuple[str, str]] = "LAUNCHING" + + for line in log_content: + if "error" in line.lower(): + status = ("FAILED", line.strip("\n")) + if MODEL_READY_SIGNATURE in line: + status = "RUNNING" + + return status + + +def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) -> str: + """Get the base URL of a model.""" + log_content = read_slurm_log(slurm_job_name, slurm_job_id, "json", log_dir) + if isinstance(log_content, str): + return log_content + + server_addr = cast(Dict[str, str], log_content).get("server_address") + return server_addr if server_addr else "URL NOT FOUND" + + +def model_health_check( + slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] +) -> tuple[str, Union[str, int]]: + """Check the health of a running model on the cluster.""" + base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir) + if not base_url.startswith("http"): + return ("FAILED", base_url) + health_check_url = base_url.replace("v1", "health") + + try: + response = requests.get(health_check_url) + # Check if the request was successful + if response.status_code == 200: + return ("READY", response.status_code) + return ("FAILED", response.status_code) + except requests.exceptions.RequestException as e: + return ("FAILED", str(e)) + + +def create_table( + key_title: str = "", value_title: str = "", show_header: bool = True +) -> Table: + """Create a table for displaying model status.""" + table = Table(show_header=show_header, header_style="bold magenta") + table.add_column(key_title, style="dim") + table.add_column(value_title) + return table + + +def load_config() -> list[ModelConfig]: + """Load the model configuration.""" + default_path = ( + CACHED_CONFIG + if 
CACHED_CONFIG.exists() + else Path(__file__).resolve().parent.parent / "config" / "models.yaml" + ) + + config: Dict[str, Any] = {} + with open(default_path) as f: + config = yaml.safe_load(f) or {} + + user_path = os.getenv("VEC_INF_CONFIG") + if user_path: + user_path_obj = Path(user_path) + if user_path_obj.exists(): + with open(user_path_obj) as f: + user_config = yaml.safe_load(f) or {} + for name, data in user_config.get("models", {}).items(): + if name in config.get("models", {}): + config["models"][name].update(data) + else: + config.setdefault("models", {})[name] = data + else: + print( + f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}" + ) + + return [ + ModelConfig(model_name=name, **model_data) + for name, model_data in config.get("models", {}).items() + ] + + +def get_latest_metric(log_lines: list[str]) -> Union[str, Dict[str, str]]: + """Read the latest metric entry from the log file.""" + latest_metric = {} + + try: + for line in reversed(log_lines): + if "Avg prompt throughput" in line: + # Parse the metric values from the line + metrics_str = line.split("] ")[1].strip().strip(".") + metrics_list = metrics_str.split(", ") + for metric in metrics_list: + key, value = metric.split(": ") + latest_metric[key] = value + break + except Exception as e: + return f"[red]Error reading log file: {e}[/red]" + + return latest_metric + + +def convert_boolean_value(value: Union[str, int, bool]) -> bool: + """Convert various input types to boolean strings.""" + if isinstance(value, str): + return value.lower() == "true" + return bool(value) + + +def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: + """Parse output from model launch command. + + Parameters + ---------- + output: str + Output from the launch command + + Returns + ------- + tuple[str, Dict[str, str]] + Slurm job ID and dictionary of config parameters + + """ + slurm_job_id = output.split(" ")[-1].strip().strip("\n") + + # Extract config parameters + config_dict = {} + output_lines = output.split("\n")[:-2] + for line in output_lines: + if ": " in line: + key, value = line.split(": ", 1) + config_dict[key.lower().replace(" ", "_")] = value + + return slurm_job_id, config_dict From f874bcf01b35250a8fc6c7b926f96e4d4ec9cea9 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sat, 15 Mar 2025 09:40:12 -0400 Subject: [PATCH 07/52] Improve naming of usage files --- examples/api/{advanced_api_usage.py => advanced_usage.py} | 0 examples/api/{api_usage.py => basic_usage.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename examples/api/{advanced_api_usage.py => advanced_usage.py} (100%) rename examples/api/{api_usage.py => basic_usage.py} (100%) diff --git a/examples/api/advanced_api_usage.py b/examples/api/advanced_usage.py similarity index 100% rename from examples/api/advanced_api_usage.py rename to examples/api/advanced_usage.py diff --git a/examples/api/api_usage.py b/examples/api/basic_usage.py similarity index 100% rename from examples/api/api_usage.py rename to examples/api/basic_usage.py From 09edcf4a24c497e31ed9273d557a998bc0771c75 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sat, 15 Mar 2025 09:42:01 -0400 Subject: [PATCH 08/52] Fix readme of examples --- examples/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/README.md b/examples/README.md index 09b53bd8..2a8016e1 100644 --- a/examples/README.md +++ b/examples/README.md @@ -8,5 +8,5 @@ - [`logits`](logits): Example for logits generation - 
[`logits.py`](logits/logits.py): Python example of getting logits from hosted model. - [`api`](api): Examples for using the Python API - - [`api_usage.py`](api/api_usage.py): Basic Python example demonstrating the Vector Inference API - - [`advanced_api_usage.py`](api/advanced_api_usage.py): Advanced Python example with rich UI for the Vector Inference API + - [`basic_usage.py`](api/basic_usage.py): Basic Python example demonstrating the Vector Inference API + - [`advanced_usage.py`](api/advanced_usage.py): Advanced Python example with rich UI for the Vector Inference API From b396b1c43f2cc8e55f6c7fbed2fd3dfbc67d01dc Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sat, 15 Mar 2025 09:44:32 -0400 Subject: [PATCH 09/52] Remove ModelConfig from api package --- vec_inf/api/__init__.py | 2 -- vec_inf/api/models.py | 25 ------------------------- 2 files changed, 27 deletions(-) diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py index 9bab599c..f6de1c93 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/api/__init__.py @@ -11,7 +11,6 @@ LaunchOptionsDict, LaunchResponse, MetricsResponse, - ModelConfig, ModelInfo, StatusResponse, ) @@ -23,7 +22,6 @@ "LaunchResponse", "StatusResponse", "ModelInfo", - "ModelConfig", "MetricsResponse", "ModelStatus", "ModelType", diff --git a/vec_inf/api/models.py b/vec_inf/api/models.py index 1b6581d7..96876f4d 100644 --- a/vec_inf/api/models.py +++ b/vec_inf/api/models.py @@ -5,7 +5,6 @@ """ from dataclasses import dataclass, field -from pathlib import Path from typing import Any, Dict, Optional, TypedDict from typing_extensions import NotRequired @@ -13,30 +12,6 @@ from vec_inf.shared.models import ModelStatus, ModelType -@dataclass -class ModelConfig: - """Model configuration parameters.""" - - model_name: str - model_family: str - model_variant: Optional[str] = None - model_type: ModelType = ModelType.LLM - num_gpus: int = 1 - num_nodes: int = 1 - vocab_size: int = 0 - max_model_len: int = 0 - max_num_seqs: int = 256 - pipeline_parallelism: bool = True - enforce_eager: bool = False - qos: str = "m2" - time: str = "08:00:00" - partition: str = "a40" - data_type: str = "auto" - venv: str = "singularity" - log_dir: Optional[Path] = None - model_weights_parent_dir: Optional[Path] = None - - @dataclass class ModelInfo: """Information about an available model.""" From bb11cdc85e2ea666d739672160d279939075d5cd Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sat, 15 Mar 2025 21:41:57 -0400 Subject: [PATCH 10/52] Small fixes, use ModelStatus, ModelType --- vec_inf/cli/_helper.py | 33 ++++++++++++++++++++------------- vec_inf/shared/models.py | 1 + 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 585749f6..afe3be63 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -12,6 +12,7 @@ from rich.table import Table from vec_inf.shared.config import ModelConfig +from vec_inf.shared.models import ModelStatus, ModelType from vec_inf.shared.utils import ( convert_boolean_value, create_table, @@ -23,10 +24,10 @@ VLLM_TASK_MAP = { - "LLM": "generate", - "VLM": "generate", - "Text_Embedding": "embed", - "Reward_Modeling": "reward", + ModelType.LLM: "generate", + ModelType.VLM: "generate", + ModelType.TEXT_EMBEDDING: "embed", + ModelType.REWARD_MODELING: "reward", } REQUIRED_FIELDS = { @@ -231,13 +232,13 @@ def _get_base_status_data(self) -> dict[str, Union[str, None]]: job_name = self.output.split(" ")[1].split("=")[1] job_state = self.output.split(" ")[9].split("=")[1] except 
IndexError: - job_name = "UNAVAILABLE" - job_state = "UNAVAILABLE" + job_name = ModelStatus.UNAVAILABLE + job_state = ModelStatus.UNAVAILABLE return { "model_name": job_name, - "status": "UNAVAILABLE", - "base_url": "UNAVAILABLE", + "status": ModelStatus.UNAVAILABLE, + "base_url": ModelStatus.UNAVAILABLE, "state": job_state, "pending_reason": None, "failed_reason": None, @@ -245,7 +246,7 @@ def _get_base_status_data(self) -> dict[str, Union[str, None]]: def process_job_state(self) -> None: """Process different job states and update status information.""" - if self.status_info["state"] == "PENDING": + if self.status_info["state"] == ModelStatus.PENDING: self.process_pending_state() elif self.status_info["state"] == "RUNNING": self.process_running_state() @@ -255,7 +256,7 @@ def check_model_health(self) -> None: status, status_code = model_health_check( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir ) - if status == "READY": + if status == ModelStatus.READY: self.status_info["base_url"] = get_base_url( cast(str, self.status_info["model_name"]), self.slurm_job_id, @@ -291,7 +292,7 @@ def process_pending_state(self) -> None: self.status_info["pending_reason"] = self.output.split(" ")[10].split("=")[ 1 ] - self.status_info["status"] = "PENDING" + self.status_info["status"] = ModelStatus.PENDING except IndexError: self.status_info["pending_reason"] = "Unknown pending reason" @@ -368,9 +369,15 @@ def format_all_models_output(self) -> Union[list[str], list[Panel]]: return [config.model_name for config in self.model_configs] # Sort by model type priority - type_priority = {"LLM": 0, "VLM": 1, "Text_Embedding": 2, "Reward_Modeling": 3} + type_priority = { + "LLM": 0, + "VLM": 1, + "Text_Embedding": 2, + "Reward_Modeling": 3, + } sorted_configs = sorted( - self.model_configs, key=lambda x: type_priority.get(x.model_type, 4) + self.model_configs, + key=lambda x: type_priority.get(x.model_type, 4), ) # Create panels with color coding diff --git a/vec_inf/shared/models.py b/vec_inf/shared/models.py index 809ad5f9..cd11d6a4 100644 --- a/vec_inf/shared/models.py +++ b/vec_inf/shared/models.py @@ -11,6 +11,7 @@ class ModelStatus(str, Enum): READY = "READY" FAILED = "FAILED" SHUTDOWN = "SHUTDOWN" + UNAVAILABLE = "UNAVAILABLE" class ModelType(str, Enum): From 6656a3fab484d881c952c18c915147d5361f4c6d Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sun, 16 Mar 2025 10:52:45 -0400 Subject: [PATCH 11/52] Refactor shared launch stuff to shared module --- pyproject.toml | 3 + tests/vec_inf/cli/test_cli.py | 10 +- vec_inf/api/client.py | 57 ++------ vec_inf/cli/_cli.py | 18 ++- vec_inf/cli/_helper.py | 265 ++++++++++------------------------ vec_inf/shared/utils.py | 197 +++++++++++++++++++++++++ 6 files changed, 304 insertions(+), 246 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 44d1d394..cb467c91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,9 @@ vec-inf = "vec_inf.cli._cli:cli" requires = ["hatchling"] build-backend = "hatchling.build" +[tool.hatch.build.targets.wheel] +packages = ["vec_inf"] + [tool.mypy] ignore_missing_imports = true install_types = true diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index a9d94573..3d26a22c 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -211,7 +211,7 @@ def base_patches(test_paths, mock_truediv, debug_helper): patch("pathlib.Path.__truediv__", side_effect=mock_truediv), patch("json.dump"), patch("pathlib.Path.touch"), - 
patch("vec_inf.cli._helper.Path", return_value=test_paths["weights_dir"]), + patch("vec_inf.shared.utils.Path", return_value=test_paths["weights_dir"]), ] @@ -318,9 +318,9 @@ def test_launch_command_model_not_in_config_with_weights( result = runner.invoke(cli, ["launch", "unknown-model"]) debug_helper.print_debug_info(result) - assert result.exit_code == 0 + assert result.exit_code == 1 assert ( - "Warning: 'unknown-model' configuration not found in config" + "Could not determine model_weights_parent_dir and 'unknown-model' not found in configuration" in result.output ) @@ -350,7 +350,7 @@ def custom_path_exists(p): # Mock Path to return the weights dir path stack.enter_context( - patch("vec_inf.cli._helper.Path", return_value=test_paths["weights_dir"]) + patch("vec_inf.shared.utils.Path", return_value=test_paths["weights_dir"]) ) result = runner.invoke(cli, ["launch", "unknown-model"]) @@ -358,7 +358,7 @@ def custom_path_exists(p): assert result.exit_code == 1 assert ( - "'unknown-model' not found in configuration and model weights not found" + "Could not determine model_weights_parent_dir and 'unknown-model' not found in configuration" in result.output ) diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 705eefa9..81b92daf 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -4,9 +4,7 @@ services programmatically. """ -import os import time -from pathlib import Path from typing import List, Optional from vec_inf.api.models import ( @@ -28,7 +26,7 @@ from vec_inf.shared.config import ModelConfig from vec_inf.shared.models import ModelStatus, ModelType from vec_inf.shared.utils import ( - parse_launch_output, + ModelLauncher, run_bash_command, ) @@ -147,56 +145,31 @@ def launch_model( Error if there was an error launching the model. 
""" try: - # Build the launch command - script_path = os.path.join( - os.path.dirname(os.path.dirname(os.path.realpath(__file__))), - "launch_server.sh", - ) - base_command = f"bash {script_path}" - - # Get model configuration - try: - model_config = self.get_model_config(model_name) - except ModelNotFoundError: - raise - - # Apply options if provided - params = model_config.model_dump(exclude={"model_name"}) + # Convert LaunchOptions to dictionary if provided + options_dict = None if options: options_dict = {k: v for k, v in vars(options).items() if v is not None} - params.update(options_dict) - - # Build the command with parameters - command = base_command - for param_name, param_value in params.items(): - if param_value is None: - continue - - # Format boolean values - if isinstance(param_value, bool): - formatted_value = "True" if param_value else "False" - elif isinstance(param_value, Path): - formatted_value = str(param_value) - else: - formatted_value = param_value - arg_name = param_name.replace("_", "-") - command += f" --{arg_name} {formatted_value}" + # Create and use the shared ModelLauncher + launcher = ModelLauncher(model_name, options_dict) - # Execute the command - output, _ = run_bash_command(command) + # Launch the model + job_id, config_dict, _ = launcher.launch() - # Parse the output - job_id, config_dict = parse_launch_output(output) + # Get the raw output + status_cmd = f"scontrol show job {job_id} --oneliner" + raw_output, _ = run_bash_command(status_cmd) return LaunchResponse( slurm_job_id=job_id, model_name=model_name, config=config_dict, - raw_output=output, + raw_output=raw_output, ) - except ModelNotFoundError: - raise + except ValueError as e: + if "not found in configuration" in str(e): + raise ModelNotFoundError(str(e)) from e + raise APIError(f"Failed to launch model: {str(e)}") from e except Exception as e: raise APIError(f"Failed to launch model: {str(e)}") from e diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 9a8ea3e8..8f737cb5 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -106,14 +106,16 @@ def launch( ) -> None: """Launch a model on the cluster.""" try: - launch_helper = LaunchHelper(model_name, cli_kwargs) - - launch_helper.set_env_vars() - launch_command = launch_helper.build_launch_command() - command_output, stderr = utils.run_bash_command(launch_command) - if stderr: - raise click.ClickException(f"Error: {stderr}") - launch_helper.post_launch_processing(command_output, CONSOLE) + launcher = utils.ModelLauncher(model_name, cli_kwargs) + job_id, config_dict, params = launcher.launch() + + json_mode = bool(cli_kwargs.get("json_mode", False)) + launch_helper = LaunchHelper(job_id, model_name, params, json_mode) + + if json_mode: + launch_helper.output_json() + else: + launch_helper.output_table(CONSOLE) except click.ClickException as e: raise e diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index afe3be63..6b59ec44 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,9 +1,7 @@ """Command line interface for Vector Inference.""" -import json import os -from pathlib import Path -from typing import Any, Optional, Union, cast +from typing import Any, Dict, Optional, Union, cast import click from rich.columns import Columns @@ -12,9 +10,8 @@ from rich.table import Table from vec_inf.shared.config import ModelConfig -from vec_inf.shared.models import ModelStatus, ModelType +from vec_inf.shared.models import ModelStatus from vec_inf.shared.utils import ( - convert_boolean_value, create_table, 
get_base_url, is_server_running, @@ -23,13 +20,7 @@ ) -VLLM_TASK_MAP = { - ModelType.LLM: "generate", - ModelType.VLM: "generate", - ModelType.TEXT_EMBEDDING: "embed", - ModelType.REWARD_MODELING: "reward", -} - +# Required fields for model configuration REQUIRED_FIELDS = { "model_family", "model_type", @@ -39,185 +30,6 @@ "max_model_len", } -LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/" -SRC_DIR = str(Path(__file__).parent.parent) - - -class LaunchHelper: - def __init__( - self, model_name: str, cli_kwargs: dict[str, Optional[Union[str, int, bool]]] - ): - self.model_name = model_name - self.cli_kwargs = cli_kwargs - self.model_config = self._get_model_configuration() - self.params = self._get_launch_params() - - def _get_model_configuration(self) -> ModelConfig: - """Load and validate model configuration.""" - model_configs = load_config() - config = next( - (m for m in model_configs if m.model_name == self.model_name), None - ) - - if config: - return config - # If model config not found, load path from CLI args or fallback to default - model_weights_parent_dir = self.cli_kwargs.get( - "model_weights_parent_dir", model_configs[0].model_weights_parent_dir - ) - model_weights_path = Path(cast(str, model_weights_parent_dir), self.model_name) - # Only give a warning msg if weights exist but config missing - if model_weights_path.exists(): - click.echo( - click.style( - f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", - fg="yellow", - ) - ) - # Return a dummy model config object with model name and weights parent dir - return ModelConfig( - model_name=self.model_name, - model_family="model_family_placeholder", - model_type="LLM", - gpus_per_node=1, - num_nodes=1, - vocab_size=1000, - max_model_len=8192, - model_weights_parent_dir=Path(cast(str, model_weights_parent_dir)), - ) - raise click.ClickException( - f"'{self.model_name}' not found in configuration and model weights " - f"not found at expected path '{model_weights_path}'" - ) - - def _get_launch_params(self) -> dict[str, Any]: - """Merge config defaults with CLI overrides.""" - params = self.model_config.model_dump() - - # Process boolean fields - for bool_field in ["pipeline_parallelism", "enforce_eager"]: - if (value := self.cli_kwargs.get(bool_field)) is not None: - params[bool_field] = convert_boolean_value(value) - - # Merge other overrides - for key, value in self.cli_kwargs.items(): - if value is not None and key not in [ - "json_mode", - "pipeline_parallelism", - "enforce_eager", - ]: - params[key] = value - - # Validate required fields - if not REQUIRED_FIELDS.issubset(set(params.keys())): - raise click.ClickException( - f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" - ) - - # Create log directory - params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() - params["log_dir"].mkdir(parents=True, exist_ok=True) - - # Convert to string for JSON serialization - for field in params: - params[field] = str(params[field]) - - return params - - def set_env_vars(self) -> None: - """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] - os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] - os.environ["DATA_TYPE"] = self.params["data_type"] - os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] - 
os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] - os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - Path(self.params["model_weights_parent_dir"], self.model_name) - ) - os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = self.params["venv"] - os.environ["LOG_DIR"] = self.params["log_dir"] - - def build_launch_command(self) -> str: - """Construct the full launch command with parameters.""" - # Base command - command_list = ["sbatch"] - # Append options - command_list.extend(["--job-name", f"{self.model_name}"]) - command_list.extend(["--partition", f"{self.params['partition']}"]) - command_list.extend(["--qos", f"{self.params['qos']}"]) - command_list.extend(["--time", f"{self.params['time']}"]) - command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) - command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) - command_list.extend( - [ - "--output", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", - ] - ) - command_list.extend( - [ - "--error", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", - ] - ) - # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") - return " ".join(command_list) - - def format_table_output(self, job_id: str) -> Table: - """Format output as rich Table.""" - table = create_table(key_title="Job Config", value_title="Value") - # Add rows - table.add_row("Slurm Job ID", job_id, style="blue") - table.add_row("Job Name", self.model_name) - table.add_row("Model Type", self.params["model_type"]) - table.add_row("Partition", self.params["partition"]) - table.add_row("QoS", self.params["qos"]) - table.add_row("Time Limit", self.params["time"]) - table.add_row("Num Nodes", self.params["num_nodes"]) - table.add_row("GPUs/Node", self.params["gpus_per_node"]) - table.add_row("Data Type", self.params["data_type"]) - table.add_row("Vocabulary Size", self.params["vocab_size"]) - table.add_row("Max Model Length", self.params["max_model_len"]) - table.add_row("Max Num Seqs", self.params["max_num_seqs"]) - table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"]) - table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"]) - table.add_row("Enforce Eager", self.params["enforce_eager"]) - table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) - table.add_row("Log Directory", self.params["log_dir"]) - - return table - - def post_launch_processing(self, output: str, console: Console) -> None: - """Process and display launch output.""" - json_mode = bool(self.cli_kwargs.get("json_mode", False)) - slurm_job_id = output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id - job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", - ) - job_json.parent.mkdir(parents=True, exist_ok=True) - job_json.touch(exist_ok=True) - - with job_json.open("w") as file: - json.dump(self.params, file, indent=4) - if json_mode: - click.echo(self.params) - else: - table = self.format_table_output(slurm_job_id) - console.print(table) - class StatusHelper: def __init__(self, slurm_job_id: 
int, output: str, log_dir: Optional[str] = None): @@ -324,6 +136,77 @@ def output_table(self, console: Console) -> None: console.print(table) +class LaunchHelper: + """Helper class for handling model launch output formatting.""" + + def __init__( + self, + job_id: str, + model_name: str, + params: Dict[str, Any], + json_mode: bool = False, + ): + """Initialize LaunchHelper with launch results. + + Parameters + ---------- + job_id : str + The Slurm job ID assigned to the launched model + model_name : str + The name of the launched model + params : Dict[str, Any] + Dictionary containing all model parameters + json_mode : bool, optional + Whether to output in JSON format, by default False + """ + self.job_id = job_id + self.model_name = model_name + self.params = params + self.json_mode = json_mode + + def output_json(self) -> None: + """Format and output launch information as JSON.""" + # Convert params for JSON output + serializable_params = {k: str(v) for k, v in self.params.items()} + serializable_params["slurm_job_id"] = self.job_id + click.echo(serializable_params) + + def output_table(self, console: Console) -> None: + """Create and display a formatted table with launch information.""" + table = create_table(key_title="Job Config", value_title="Value") + + # Add key information with consistent styling + table.add_row("Slurm Job ID", self.job_id, style="blue") + table.add_row("Job Name", self.model_name) + + # Add model details + table.add_row("Model Type", str(self.params["model_type"])) + + # Add resource allocation details + table.add_row("Partition", str(self.params["partition"])) + table.add_row("QoS", str(self.params["qos"])) + table.add_row("Time Limit", str(self.params["time"])) + table.add_row("Num Nodes", str(self.params["num_nodes"])) + table.add_row("GPUs/Node", str(self.params["gpus_per_node"])) + + # Add model configuration details + table.add_row("Data Type", str(self.params["data_type"])) + table.add_row("Vocabulary Size", str(self.params["vocab_size"])) + table.add_row("Max Model Length", str(self.params["max_model_len"])) + table.add_row("Max Num Seqs", str(self.params["max_num_seqs"])) + table.add_row( + "GPU Memory Utilization", str(self.params["gpu_memory_utilization"]) + ) + table.add_row("Pipeline Parallelism", str(self.params["pipeline_parallelism"])) + table.add_row("Enforce Eager", str(self.params["enforce_eager"])) + + # Add path details + table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS", "")) + table.add_row("Log Directory", str(self.params["log_dir"])) + + console.print(table) + + class ListHelper: """Helper class for handling model listing functionality.""" diff --git a/vec_inf/shared/utils.py b/vec_inf/shared/utils.py index b654ee37..6076b3ed 100644 --- a/vec_inf/shared/utils.py +++ b/vec_inf/shared/utils.py @@ -15,6 +15,26 @@ MODEL_READY_SIGNATURE = "INFO: Application startup complete." 
CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") +SRC_DIR = str(Path(__file__).parent.parent) +LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/" + +# Maps model types to vLLM tasks +VLLM_TASK_MAP = { + "LLM": "generate", + "VLM": "generate", + "TEXT_EMBEDDING": "embed", + "REWARD_MODELING": "reward", +} + +# Required fields for model configuration +REQUIRED_FIELDS = { + "model_family", + "model_type", + "gpus_per_node", + "num_nodes", + "vocab_size", + "max_model_len", +} def run_bash_command(command: str) -> tuple[str, str]: @@ -212,3 +232,180 @@ def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: config_dict[key.lower().replace(" ", "_")] = value return slurm_job_id, config_dict + + +class ModelLauncher: + """Shared model launcher for both CLI and API.""" + + def __init__(self, model_name: str, options: Optional[Dict[str, Any]] = None): + """Initialize the model launcher. + + Parameters + ---------- + model_name: str + Name of the model to launch + options: Optional[Dict[str, Any]] + Optional launch options to override default configuration + """ + self.model_name = model_name + self.options = options or {} + self.model_config = self._get_model_configuration() + self.params = self._get_launch_params() + + def _get_model_configuration(self) -> ModelConfig: + """Load and validate model configuration.""" + model_configs = load_config() + config = next( + (m for m in model_configs if m.model_name == self.model_name), None + ) + + if config: + return config + + # If model config not found, check for path from options or use fallback + model_weights_parent_dir = self.options.get( + "model_weights_parent_dir", + model_configs[0].model_weights_parent_dir if model_configs else None, + ) + + if not model_weights_parent_dir: + raise ValueError( + f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" + ) + + model_weights_path = Path(model_weights_parent_dir, self.model_name) + + # Only give a warning if weights exist but config missing + if model_weights_path.exists(): + print( + f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in options" + ) + # Return a dummy model config object with model name and weights parent dir + return ModelConfig( + model_name=self.model_name, + model_family="model_family_placeholder", + model_type="LLM", + gpus_per_node=1, + num_nodes=1, + vocab_size=1000, + max_model_len=8192, + model_weights_parent_dir=Path(str(model_weights_parent_dir)), + ) + + raise ValueError( + f"'{self.model_name}' not found in configuration and model weights " + f"not found at expected path '{model_weights_path}'" + ) + + def _get_launch_params(self) -> dict[str, Any]: + """Merge config defaults with overrides.""" + params = self.model_config.model_dump() + + # Process boolean fields + for bool_field in ["pipeline_parallelism", "enforce_eager"]: + if (value := self.options.get(bool_field)) is not None: + params[bool_field] = convert_boolean_value(value) + + # Merge other overrides + for key, value in self.options.items(): + if value is not None and key not in [ + "json_mode", + "pipeline_parallelism", + "enforce_eager", + ]: + params[key] = value + + # Validate required fields + if not REQUIRED_FIELDS.issubset(set(params.keys())): + missing_fields = REQUIRED_FIELDS - set(params.keys()) + raise ValueError(f"Missing required fields: {missing_fields}") + + # Create log 
directory + params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() + params["log_dir"].mkdir(parents=True, exist_ok=True) + + return params + + def set_env_vars(self) -> None: + """Set environment variables for the launch command.""" + os.environ["MODEL_NAME"] = self.model_name + os.environ["MAX_MODEL_LEN"] = str(self.params["max_model_len"]) + os.environ["MAX_LOGPROBS"] = str(self.params["vocab_size"]) + os.environ["DATA_TYPE"] = str(self.params["data_type"]) + os.environ["MAX_NUM_SEQS"] = str(self.params["max_num_seqs"]) + os.environ["GPU_MEMORY_UTILIZATION"] = str( + self.params["gpu_memory_utilization"] + ) + os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] + os.environ["PIPELINE_PARALLELISM"] = str(self.params["pipeline_parallelism"]) + os.environ["ENFORCE_EAGER"] = str(self.params["enforce_eager"]) + os.environ["SRC_DIR"] = SRC_DIR + os.environ["MODEL_WEIGHTS"] = str( + Path(self.params["model_weights_parent_dir"], self.model_name) + ) + os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH + os.environ["VENV_BASE"] = str(self.params["venv"]) + os.environ["LOG_DIR"] = str(self.params["log_dir"]) + + def build_launch_command(self) -> str: + """Construct the full launch command with parameters.""" + # Base command + command_list = ["sbatch"] + # Append options + command_list.extend(["--job-name", f"{self.model_name}"]) + command_list.extend(["--partition", f"{self.params['partition']}"]) + command_list.extend(["--qos", f"{self.params['qos']}"]) + command_list.extend(["--time", f"{self.params['time']}"]) + command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) + command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) + command_list.extend( + [ + "--output", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", + ] + ) + command_list.extend( + [ + "--error", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", + ] + ) + # Add slurm script + slurm_script = "vllm.slurm" + if int(self.params["num_nodes"]) > 1: + slurm_script = "multinode_vllm.slurm" + command_list.append(f"{SRC_DIR}/{slurm_script}") + return " ".join(command_list) + + def launch(self) -> tuple[str, Dict[str, str], Dict[str, Any]]: + """Launch the model and return job information. 
+ + Returns + ------- + tuple[str, Dict[str, str], Dict[str, Any]] + Slurm job ID, config dictionary, and parameters dictionary + """ + # Set environment variables + self.set_env_vars() + + # Build and execute the command + command = self.build_launch_command() + output, _ = run_bash_command(command) + + # Parse the output + job_id, config_dict = parse_launch_output(output) + + # Save job configuration to JSON + job_json_dir = Path(self.params["log_dir"], f"{self.model_name}.{job_id}") + job_json_dir.mkdir(parents=True, exist_ok=True) + + job_json_path = job_json_dir / f"{self.model_name}.{job_id}.json" + + # Convert params for serialization + serializable_params = {k: str(v) for k, v in self.params.items()} + serializable_params["slurm_job_id"] = job_id + + with job_json_path.open("w") as f: + json.dump(serializable_params, f, indent=4) + + return job_id, config_dict, self.params From a65035b710f01437ab036c292f85a7da43940208 Mon Sep 17 00:00:00 2001 From: Amrit Krishnan Date: Sun, 16 Mar 2025 16:34:55 -0400 Subject: [PATCH 12/52] Add python API docs, update user guide --- README.md | 2 +- docs/source/conf.py | 13 +++++++-- docs/source/index.md | 3 +- docs/source/reference/api/index.rst | 9 ++++++ .../reference/api/vec_inf.api.client.rst | 7 +++++ .../reference/api/vec_inf.api.models.rst | 7 +++++ docs/source/reference/api/vec_inf.api.rst | 17 +++++++++++ .../reference/api/vec_inf.api.utils.rst | 7 +++++ docs/source/reference/api/vec_inf.rst | 15 ++++++++++ docs/source/user_guide.md | 15 +++++++--- examples/api/advanced_usage.py | 2 +- pyproject.toml | 1 + tests/vec_inf/api/README.md | 29 ------------------- tests/vec_inf/api/test_examples.py | 4 +-- uv.lock | 2 ++ vec_inf/api/__init__.py | 6 ++-- 16 files changed, 95 insertions(+), 44 deletions(-) create mode 100644 docs/source/reference/api/index.rst create mode 100644 docs/source/reference/api/vec_inf.api.client.rst create mode 100644 docs/source/reference/api/vec_inf.api.models.rst create mode 100644 docs/source/reference/api/vec_inf.api.rst create mode 100644 docs/source/reference/api/vec_inf.api.utils.rst create mode 100644 docs/source/reference/api/vec_inf.rst delete mode 100644 tests/vec_inf/api/README.md diff --git a/README.md b/README.md index 94159cf4..0753ee14 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![codecov](https://codecov.io/github/VectorInstitute/vector-inference/branch/develop/graph/badge.svg?token=NI88QSIGAC)](https://app.codecov.io/github/VectorInstitute/vector-inference/tree/develop) ![GitHub License](https://img.shields.io/github/license/VectorInstitute/vector-inference) -This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](vec_inf/launch_server.sh), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.csv`](vec_inf/config/models.yaml) accordingly. +This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. 
To adapt to other environments, update [`utils.py`](vec_inf/shared/utils.py), [`vllm.slurm`](vec_inf/vllm.slurm), [`multinode_vllm.slurm`](vec_inf/multinode_vllm.slurm) and [`models.yaml`](vec_inf/config/models.yaml) accordingly. ## Installation If you are using the Vector cluster environment, and you don't need any customization to the inference server environment, run the following to install package: diff --git a/docs/source/conf.py b/docs/source/conf.py index 44f51494..9c7151a5 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -7,7 +7,6 @@ import os import sys -from typing import List sys.path.insert(0, os.path.abspath("../../vec_inf")) @@ -51,8 +50,16 @@ copybutton_prompt_text = r">>> |\.\.\. " copybutton_prompt_is_regexp = True +apidoc_module_dir = "../../vec_inf" +apidoc_excluded_paths = ["tests", "cli", "shared"] +exclude_patterns = ["reference/api/vec_inf.rst"] +apidoc_output_dir = "reference/api" +apidoc_separate_modules = True +apidoc_extra_args = ["-f", "-M", "-T", "--implicit-namespaces"] +suppress_warnings = ["ref.python"] + intersphinx_mapping = { - "python": ("https://docs.python.org/3.9/", None), + "python": ("https://docs.python.org/3.10/", None), } # Add any paths that contain templates here, relative to this directory. @@ -61,7 +68,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns: List[str] = [] +exclude_patterns = ["reference/api/vec_inf.rst"] # -- Options for Markdown files ---------------------------------------------- # diff --git a/docs/source/index.md b/docs/source/index.md index cfb94c9e..cd4bfae9 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -8,10 +8,11 @@ hide-toc: true :hidden: user_guide +reference/api/index ``` -This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`launch_server.sh`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/launch_server.sh), [`vllm.slurm`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/vllm.slurm), [`multinode_vllm.slurm`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/multinode_vllm.slurm) and [`models.yaml`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config/models.yaml) accordingly. +This repository provides an easy-to-use solution to run inference servers on [Slurm](https://slurm.schedmd.com/overview.html)-managed computing clusters using [vLLM](https://docs.vllm.ai/en/latest/). **All scripts in this repository runs natively on the Vector Institute cluster environment**. To adapt to other environments, update [`utils.py`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/shared.utils.py), [`vllm.slurm`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/vllm.slurm), [`multinode_vllm.slurm`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/multinode_vllm.slurm) and [`models.yaml`](https://github.com/VectorInstitute/vector-inference/blob/main/vec_inf/config/models.yaml) accordingly. 
## Installation diff --git a/docs/source/reference/api/index.rst b/docs/source/reference/api/index.rst new file mode 100644 index 00000000..525fa8e1 --- /dev/null +++ b/docs/source/reference/api/index.rst @@ -0,0 +1,9 @@ +Python API +========== + +This section documents the Python API for the `vec_inf` package. + +.. toctree:: + :maxdepth: 4 + + vec_inf.api diff --git a/docs/source/reference/api/vec_inf.api.client.rst b/docs/source/reference/api/vec_inf.api.client.rst new file mode 100644 index 00000000..fd0e6554 --- /dev/null +++ b/docs/source/reference/api/vec_inf.api.client.rst @@ -0,0 +1,7 @@ +vec\_inf.api.client module +========================== + +.. automodule:: vec_inf.api.client + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/api/vec_inf.api.models.rst b/docs/source/reference/api/vec_inf.api.models.rst new file mode 100644 index 00000000..cc2efc16 --- /dev/null +++ b/docs/source/reference/api/vec_inf.api.models.rst @@ -0,0 +1,7 @@ +vec\_inf.api.models module +========================== + +.. automodule:: vec_inf.api.models + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/api/vec_inf.api.rst b/docs/source/reference/api/vec_inf.api.rst new file mode 100644 index 00000000..354527ec --- /dev/null +++ b/docs/source/reference/api/vec_inf.api.rst @@ -0,0 +1,17 @@ +vec\_inf.api package +==================== + +.. automodule:: vec_inf.api + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + vec_inf.api.client + vec_inf.api.models + vec_inf.api.utils diff --git a/docs/source/reference/api/vec_inf.api.utils.rst b/docs/source/reference/api/vec_inf.api.utils.rst new file mode 100644 index 00000000..1fa42727 --- /dev/null +++ b/docs/source/reference/api/vec_inf.api.utils.rst @@ -0,0 +1,7 @@ +vec\_inf.api.utils module +========================= + +.. automodule:: vec_inf.api.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/reference/api/vec_inf.rst b/docs/source/reference/api/vec_inf.rst new file mode 100644 index 00000000..854d1ec2 --- /dev/null +++ b/docs/source/reference/api/vec_inf.rst @@ -0,0 +1,15 @@ +vec\_inf package +================ + +.. automodule:: vec_inf + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + vec_inf.api diff --git a/docs/source/user_guide.md b/docs/source/user_guide.md index 6a69936b..e7e12373 100644 --- a/docs/source/user_guide.md +++ b/docs/source/user_guide.md @@ -1,6 +1,6 @@ # User Guide -## Usage +## CLI Usage ### `launch` command @@ -18,7 +18,7 @@ You should see an output like the following: #### Overrides -Models that are already supported by `vec-inf` would be launched using the [default parameters](vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be +Models that are already supported by `vec-inf` would be launched using the [default parameters](https://github.com/VectorInstitute/vector-inference/blob/develop/vec_inf/config/models.yaml). You can override these values by providing additional parameters. Use `vec-inf launch --help` to see the full list of parameters that can be overriden. 
For example, if `qos` is to be overriden: ```bash @@ -31,7 +31,7 @@ You can also launch your own custom model as long as the model architecture is [ * Your model weights directory naming convention should follow `$MODEL_FAMILY-$MODEL_VARIANT`. * Your model weights directory should contain HuggingFace format weights. * You should create a custom configuration file for your model and specify its path via setting the environment variable `VEC_INF_CONFIG` -Check the [default parameters](vec_inf/config/models.yaml) file for the format of the config file. All the parameters for the model +Check the [default parameters](https://github.com/VectorInstitute/vector-inference/blob/develop/vec_inf/config/models.yaml) file for the format of the config file. All the parameters for the model should be specified in that config file. * For other model launch parameters you can reference the default values for similar models using the [`list` command ](#list-command). @@ -136,7 +136,7 @@ vec-inf list Meta-Llama-3.1-70B-Instruct ## Send inference requests -Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](https://github.com/VectorInstitute/vector-inference/blob/main/examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following: +Once the inference server is ready, you can start sending in inference requests. We provide example scripts for sending inference requests in [`examples`](https://github.com/VectorInstitute/vector-inference/blob/develop/examples) folder. Make sure to update the model server URL and the model weights location in the scripts. For example, you can run `python examples/inference/llm/completions.py`, and you should expect to see an output like the following: ```json { @@ -171,3 +171,10 @@ If you want to run inference from your local device, you can open a SSH tunnel t ssh -L 8081:172.17.8.29:8081 username@v.vectorinstitute.ai -N ``` Where the last number in the URL is the GPU number (gpu029 in this case). The example provided above is for the vector cluster, change the variables accordingly for your environment + +## Python API Usage + +You can also use the `vec_inf` Python API to launch and manage inference servers. + +Check out the [Python API documentation](reference/api/index) for more details. There +are also Python API usage examples in the [`examples`](https://github.com/VectorInstitute/vector-inference/tree/develop/examples/api) folder. 
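+
+A minimal sketch of typical usage, based on the API example scripts in this repository (the model name follows the `list` example above, and it is assumed here that `LaunchOptions()` can be constructed with no arguments, with unset fields falling back to the model's configured defaults):
+
+```python
+from vec_inf.api import LaunchOptions, ModelStatus, VecInfClient
+
+client = VecInfClient()
+
+# Launch a supported model; unset LaunchOptions fields are assumed to fall
+# back to the defaults from the model's configuration.
+response = client.launch_model("Meta-Llama-3.1-70B-Instruct", LaunchOptions())
+print(f"Slurm Job ID: {response.slurm_job_id}")
+
+# Check the server status; once READY, the OpenAI-compatible base URL is set.
+status = client.get_status(response.slurm_job_id)
+if status.status == ModelStatus.READY:
+    print(f"Server ready at {status.base_url}")
+```
+
+The server can be stopped with `vec-inf shutdown <slurm_job_id>` when it is no longer needed.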
diff --git a/examples/api/advanced_usage.py b/examples/api/advanced_usage.py index af02f49f..2ff0751f 100755 --- a/examples/api/advanced_usage.py +++ b/examples/api/advanced_usage.py @@ -208,7 +208,7 @@ def batch_inference_example( { "input": input_text, "output": completion.choices[0].text, - "tokens": completion.usage.completion_tokens, + "tokens": completion.usage.completion_tokens, # type: ignore[union-attr] } ) diff --git a/pyproject.toml b/pyproject.toml index cb467c91..10b9ff52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dev = [ "codecov>=2.1.13", "mypy>=1.15.0", "nbqa>=1.9.1", + "openai>=1.65.1", "pip-audit>=2.8.0", "pre-commit>=4.1.0", "pytest>=8.3.4", diff --git a/tests/vec_inf/api/README.md b/tests/vec_inf/api/README.md deleted file mode 100644 index 4c40afc2..00000000 --- a/tests/vec_inf/api/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# API Tests - -This directory contains tests for the Vector Inference API module. - -## Test Files - -- `test_client.py` - Tests for the `VecInfClient` class and its methods -- `test_models.py` - Tests for the API data models and enums -- `test_examples.py` - Tests for the API example scripts - -## Running Tests - -Run the tests using pytest: - -```bash -pytest tests/vec_inf/api -``` - -## Test Coverage - -The tests cover the following areas: - -- Core client functionality: listing models, launching models, checking status, getting metrics, shutting down -- Data models validation: `ModelInfo`, `ModelStatus`, `LaunchOptions` -- API examples: verifying that API example scripts work correctly - -## Dependencies - -The tests use pytest and mock objects to isolate the tests from actual dependencies. diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index 9c0b0cef..967b8de1 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -44,12 +44,12 @@ def mock_client(): @pytest.mark.skipif( - not os.path.exists(os.path.join("examples", "api", "api_usage.py")), + not os.path.exists(os.path.join("examples", "api", "basic_usage.py")), reason="Example file not found", ) def test_api_usage_example(): """Test the basic API usage example.""" - example_path = os.path.join("examples", "api", "api_usage.py") + example_path = os.path.join("examples", "api", "basic_usage.py") # Create a mock client mock_client = MagicMock(spec=VecInfClient) diff --git a/uv.lock b/uv.lock index eae4c1b4..d7d3e7ba 100644 --- a/uv.lock +++ b/uv.lock @@ -4098,6 +4098,7 @@ dev = [ { name = "codecov" }, { name = "mypy" }, { name = "nbqa" }, + { name = "openai" }, { name = "pip-audit" }, { name = "pre-commit" }, { name = "pytest" }, @@ -4144,6 +4145,7 @@ dev = [ { name = "codecov", specifier = ">=2.1.13" }, { name = "mypy", specifier = ">=1.15.0" }, { name = "nbqa", specifier = ">=1.9.1" }, + { name = "openai", specifier = ">=1.65.1" }, { name = "pip-audit", specifier = ">=2.8.0" }, { name = "pre-commit", specifier = ">=4.1.0" }, { name = "pytest", specifier = ">=8.3.4" }, diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py index f6de1c93..6d18011c 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/api/__init__.py @@ -1,8 +1,8 @@ """Programmatic API for Vector Inference. -This module provides a Python API for interacting with Vector Inference. -It allows for launching and managing inference servers programmatically -without relying on the command-line interface. +This module provides a Python API for launching and managing inference servers +using `vec_inf`. 
It is an alternative to the command-line interface, and allows +users direct control over the lifecycle of inference servers via python scripts. """ from vec_inf.api.client import VecInfClient From 967e3d07c270e28b4598759a62551db9e19d7591 Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 1 Apr 2025 15:22:51 -0400 Subject: [PATCH 13/52] Merge LaunchHelper and ModelLauncher, move LaunchHelper into shared helper.py, remove empty cli/_utils.py --- vec_inf/api/client.py | 8 +- vec_inf/cli/_cli.py | 18 +-- vec_inf/cli/_utils.py | 3 - vec_inf/shared/helper.py | 280 +++++++++++++++++++++++++++++++++++++++ vec_inf/shared/utils.py | 214 ++---------------------------- 5 files changed, 301 insertions(+), 222 deletions(-) delete mode 100644 vec_inf/cli/_utils.py create mode 100644 vec_inf/shared/helper.py diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index 81b92daf..c5e25d59 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -24,11 +24,9 @@ load_models, ) from vec_inf.shared.config import ModelConfig +from vec_inf.shared.helper import LaunchHelper from vec_inf.shared.models import ModelStatus, ModelType -from vec_inf.shared.utils import ( - ModelLauncher, - run_bash_command, -) +from vec_inf.shared.utils import run_bash_command class VecInfClient: @@ -151,7 +149,7 @@ def launch_model( options_dict = {k: v for k, v in vars(options).items() if v is not None} # Create and use the shared ModelLauncher - launcher = ModelLauncher(model_name, options_dict) + launcher = LaunchHelper(model_name, options_dict) # Launch the model job_id, config_dict, _ = launcher.launch() diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 8c54858c..a3bcde64 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -7,8 +7,9 @@ from rich.console import Console from rich.live import Live -from vec_inf.cli._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper +from vec_inf.cli._helper import ListHelper, MetricsHelper, StatusHelper from vec_inf.shared import utils +from vec_inf.shared.helper import LaunchHelper CONSOLE = Console() @@ -126,16 +127,15 @@ def launch( ) -> None: """Launch a model on the cluster.""" try: - launcher = utils.ModelLauncher(model_name, cli_kwargs) - job_id, config_dict, params = launcher.launch() + launch_helper = LaunchHelper(model_name, cli_kwargs) - json_mode = bool(cli_kwargs.get("json_mode", False)) - launch_helper = LaunchHelper(job_id, model_name, params, json_mode) + launch_helper.set_env_vars() + launch_command = launch_helper.build_launch_command() + command_output, stderr = utils.run_bash_command(launch_command) + if stderr: + raise click.ClickException(f"Error: {stderr}") + launch_helper.post_launch_processing(command_output, CONSOLE) - if json_mode: - launch_helper.output_json() - else: - launch_helper.output_table(CONSOLE) except click.ClickException as e: raise e diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py deleted file mode 100644 index 41e6d788..00000000 --- a/vec_inf/cli/_utils.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Utility functions for the CLI.""" - -# Import all shared utilities diff --git a/vec_inf/shared/helper.py b/vec_inf/shared/helper.py new file mode 100644 index 00000000..dbe55c65 --- /dev/null +++ b/vec_inf/shared/helper.py @@ -0,0 +1,280 @@ +"""Helper class for the model launch.""" + +import json +import os +from pathlib import Path +from typing import Any, Dict, Optional + +import click +from rich.console import Console +from rich.table import Table + +from vec_inf.shared import utils +from vec_inf.shared.config 
import ModelConfig +from vec_inf.shared.utils import ( + BOOLEAN_FIELDS, + LD_LIBRARY_PATH, + REQUIRED_FIELDS, + SRC_DIR, + VLLM_TASK_MAP, +) + + +class LaunchHelper: + """Shared launch helper for both CLI and API.""" + + def __init__( + self, model_name: str, cli_kwargs: Optional[dict[str, Any]] + ): + """Initialize the model launcher. + + Parameters + ---------- + model_name: str + Name of the model to launch + cli_kwargs: Optional[dict[str, Any]] + Optional launch keyword arguments to override default configuration + """ + self.model_name = model_name + self.cli_kwargs = cli_kwargs or {} + self.model_config = self._get_model_configuration() + self.params = self._get_launch_params() + + def _get_model_configuration(self) -> ModelConfig: + """Load and validate model configuration.""" + model_configs = utils.load_config() + config = next( + (m for m in model_configs if m.model_name == self.model_name), None + ) + + if config: + return config + + # If model config not found, check for path from keyword arguments or use fallback + model_weights_parent_dir = self.cli_kwargs.get( + "model_weights_parent_dir", + model_configs[0].model_weights_parent_dir if model_configs else None, + ) + + if not model_weights_parent_dir: + raise ValueError( + f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" + ) + + model_weights_path = Path(model_weights_parent_dir, self.model_name) + + # Only give a warning if weights exist but config missing + if model_weights_path.exists(): + click.echo( + click.style( + f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", + fg="yellow", + ) + ) + # Return a dummy model config object with model name and weights parent dir + return ModelConfig( + model_name=self.model_name, + model_family="model_family_placeholder", + model_type="LLM", + gpus_per_node=1, + num_nodes=1, + vocab_size=1000, + max_model_len=8192, + model_weights_parent_dir=Path(str(model_weights_parent_dir)), + ) + + raise click.ClickException( + f"'{self.model_name}' not found in configuration and model weights " + f"not found at expected path '{model_weights_path}'" + ) + + def _get_launch_params(self) -> dict[str, Any]: + """Merge config defaults with CLI overrides.""" + params = self.model_config.model_dump() + + # Process boolean fields + for bool_field in BOOLEAN_FIELDS: + if self.cli_kwargs[bool_field]: + params[bool_field] = True + + # Merge other overrides + for key, value in self.cli_kwargs.items(): + if value is not None and key not in [ + "json_mode", + *BOOLEAN_FIELDS, + ]: + params[key] = value + + # Validate required fields + if not REQUIRED_FIELDS.issubset(set(params.keys())): + raise click.ClickException( + f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" + ) + + # Create log directory + params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() + params["log_dir"].mkdir(parents=True, exist_ok=True) + + # Convert to string for JSON serialization + for field in params: + params[field] = str(params[field]) + + return params + + def set_env_vars(self) -> None: + """Set environment variables for the launch command.""" + os.environ["MODEL_NAME"] = self.model_name + os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] + os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] + os.environ["DATA_TYPE"] = self.params["data_type"] + os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] + os.environ["GPU_MEMORY_UTILIZATION"] 
= self.params["gpu_memory_utilization"] + os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] + os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] + os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"] + os.environ["SRC_DIR"] = SRC_DIR + os.environ["MODEL_WEIGHTS"] = str( + Path(self.params["model_weights_parent_dir"], self.model_name) + ) + os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH + os.environ["VENV_BASE"] = self.params["venv"] + os.environ["LOG_DIR"] = self.params["log_dir"] + + if self.params.get("enable_prefix_caching"): + os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"] + if self.params.get("enable_chunked_prefill"): + os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"] + if self.params.get("max_num_batched_tokens"): + os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"] + if self.params.get("enforce_eager"): + os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] + + def build_launch_command(self) -> str: + """Construct the full launch command with parameters.""" + # Base command + command_list = ["sbatch"] + # Append options + command_list.extend(["--job-name", f"{self.model_name}"]) + command_list.extend(["--partition", f"{self.params['partition']}"]) + command_list.extend(["--qos", f"{self.params['qos']}"]) + command_list.extend(["--time", f"{self.params['time']}"]) + command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) + command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) + command_list.extend( + [ + "--output", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", + ] + ) + command_list.extend( + [ + "--error", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", + ] + ) + # Add slurm script + slurm_script = "vllm.slurm" + if int(self.params["num_nodes"]) > 1: + slurm_script = "multinode_vllm.slurm" + command_list.append(f"{SRC_DIR}/{slurm_script}") + return " ".join(command_list) + + def launch(self) -> tuple[str, Dict[str, str], Dict[str, Any]]: + """Launch the model and return job information. 
+ + Returns + ------- + tuple[str, Dict[str, str], Dict[str, Any]] + Slurm job ID, config dictionary, and parameters dictionary + """ + # Set environment variables + self.set_env_vars() + + # Build and execute the command + command = self.build_launch_command() + output, _ = utils.run_bash_command(command) + + # Parse the output + job_id, config_dict = utils.parse_launch_output(output) + + # Save job configuration to JSON + job_json_dir = Path(self.params["log_dir"], f"{self.model_name}.{job_id}") + job_json_dir.mkdir(parents=True, exist_ok=True) + + job_json_path = job_json_dir / f"{self.model_name}.{job_id}.json" + + # Convert params for serialization + serializable_params = {k: str(v) for k, v in self.params.items()} + serializable_params["slurm_job_id"] = job_id + + with job_json_path.open("w") as f: + json.dump(serializable_params, f, indent=4) + + return job_id, config_dict, self.params + + def format_table_output(self, job_id: str) -> Table: + """Format output as rich Table.""" + table = utils.create_table(key_title="Job Config", value_title="Value") + + # Add key information with consistent styling + table.add_row("Slurm Job ID", job_id, style="blue") + table.add_row("Job Name", self.model_name) + + # Add model details + table.add_row("Model Type", self.params["model_type"]) + + # Add resource allocation details + table.add_row("Partition", self.params["partition"]) + table.add_row("QoS", self.params["qos"]) + table.add_row("Time Limit", self.params["time"]) + table.add_row("Num Nodes", self.params["num_nodes"]) + table.add_row("GPUs/Node", self.params["gpus_per_node"]) + + # Add model configuration details + table.add_row("Data Type", self.params["data_type"]) + table.add_row("Vocabulary Size", self.params["vocab_size"]) + table.add_row("Max Model Length", self.params["max_model_len"]) + table.add_row("Max Num Seqs", self.params["max_num_seqs"]) + table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"]) + table.add_row("Compilation Config", self.params["compilation_config"]) + table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"]) + if self.params.get("enable_prefix_caching"): + table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"]) + if self.params.get("enable_chunked_prefill"): + table.add_row( + "Enable Chunked Prefill", self.params["enable_chunked_prefill"] + ) + if self.params.get("max_num_batched_tokens"): + table.add_row( + "Max Num Batched Tokens", self.params["max_num_batched_tokens"] + ) + if self.params.get("enforce_eager"): + table.add_row("Enforce Eager", self.params["enforce_eager"]) + + # Add path details + table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) + table.add_row("Log Directory", self.params["log_dir"]) + + return table + + def post_launch_processing(self, output: str, console: Console) -> None: + """Process and display launch output.""" + json_mode = bool(self.cli_kwargs.get("json_mode", False)) + slurm_job_id = output.split(" ")[-1].strip().strip("\n") + self.params["slurm_job_id"] = slurm_job_id + job_json = Path( + self.params["log_dir"], + f"{self.model_name}.{slurm_job_id}", + f"{self.model_name}.{slurm_job_id}.json", + ) + job_json.parent.mkdir(parents=True, exist_ok=True) + job_json.touch(exist_ok=True) + + with job_json.open("w") as file: + json.dump(self.params, file, indent=4) + if json_mode: + click.echo(self.params) + else: + table = self.format_table_output(slurm_job_id) + console.print(table) + diff --git a/vec_inf/shared/utils.py b/vec_inf/shared/utils.py 
index 6076b3ed..10740da4 100644 --- a/vec_inf/shared/utils.py +++ b/vec_inf/shared/utils.py @@ -36,6 +36,14 @@ "max_model_len", } +# Boolean fields for model configuration +BOOLEAN_FIELDS = { + "pipeline_parallelism", + "enforce_eager", + "enable_prefix_caching", + "enable_chunked_prefill", +} + def run_bash_command(command: str) -> tuple[str, str]: """Run a bash command and return the output.""" @@ -180,33 +188,6 @@ def load_config() -> list[ModelConfig]: ] -def get_latest_metric(log_lines: list[str]) -> Union[str, Dict[str, str]]: - """Read the latest metric entry from the log file.""" - latest_metric = {} - - try: - for line in reversed(log_lines): - if "Avg prompt throughput" in line: - # Parse the metric values from the line - metrics_str = line.split("] ")[1].strip().strip(".") - metrics_list = metrics_str.split(", ") - for metric in metrics_list: - key, value = metric.split(": ") - latest_metric[key] = value - break - except Exception as e: - return f"[red]Error reading log file: {e}[/red]" - - return latest_metric - - -def convert_boolean_value(value: Union[str, int, bool]) -> bool: - """Convert various input types to boolean strings.""" - if isinstance(value, str): - return value.lower() == "true" - return bool(value) - - def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: """Parse output from model launch command. @@ -231,181 +212,4 @@ def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: key, value = line.split(": ", 1) config_dict[key.lower().replace(" ", "_")] = value - return slurm_job_id, config_dict - - -class ModelLauncher: - """Shared model launcher for both CLI and API.""" - - def __init__(self, model_name: str, options: Optional[Dict[str, Any]] = None): - """Initialize the model launcher. - - Parameters - ---------- - model_name: str - Name of the model to launch - options: Optional[Dict[str, Any]] - Optional launch options to override default configuration - """ - self.model_name = model_name - self.options = options or {} - self.model_config = self._get_model_configuration() - self.params = self._get_launch_params() - - def _get_model_configuration(self) -> ModelConfig: - """Load and validate model configuration.""" - model_configs = load_config() - config = next( - (m for m in model_configs if m.model_name == self.model_name), None - ) - - if config: - return config - - # If model config not found, check for path from options or use fallback - model_weights_parent_dir = self.options.get( - "model_weights_parent_dir", - model_configs[0].model_weights_parent_dir if model_configs else None, - ) - - if not model_weights_parent_dir: - raise ValueError( - f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" - ) - - model_weights_path = Path(model_weights_parent_dir, self.model_name) - - # Only give a warning if weights exist but config missing - if model_weights_path.exists(): - print( - f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in options" - ) - # Return a dummy model config object with model name and weights parent dir - return ModelConfig( - model_name=self.model_name, - model_family="model_family_placeholder", - model_type="LLM", - gpus_per_node=1, - num_nodes=1, - vocab_size=1000, - max_model_len=8192, - model_weights_parent_dir=Path(str(model_weights_parent_dir)), - ) - - raise ValueError( - f"'{self.model_name}' not found in configuration and model weights " - f"not found at expected path '{model_weights_path}'" - ) - 
- def _get_launch_params(self) -> dict[str, Any]: - """Merge config defaults with overrides.""" - params = self.model_config.model_dump() - - # Process boolean fields - for bool_field in ["pipeline_parallelism", "enforce_eager"]: - if (value := self.options.get(bool_field)) is not None: - params[bool_field] = convert_boolean_value(value) - - # Merge other overrides - for key, value in self.options.items(): - if value is not None and key not in [ - "json_mode", - "pipeline_parallelism", - "enforce_eager", - ]: - params[key] = value - - # Validate required fields - if not REQUIRED_FIELDS.issubset(set(params.keys())): - missing_fields = REQUIRED_FIELDS - set(params.keys()) - raise ValueError(f"Missing required fields: {missing_fields}") - - # Create log directory - params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() - params["log_dir"].mkdir(parents=True, exist_ok=True) - - return params - - def set_env_vars(self) -> None: - """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = str(self.params["max_model_len"]) - os.environ["MAX_LOGPROBS"] = str(self.params["vocab_size"]) - os.environ["DATA_TYPE"] = str(self.params["data_type"]) - os.environ["MAX_NUM_SEQS"] = str(self.params["max_num_seqs"]) - os.environ["GPU_MEMORY_UTILIZATION"] = str( - self.params["gpu_memory_utilization"] - ) - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = str(self.params["pipeline_parallelism"]) - os.environ["ENFORCE_EAGER"] = str(self.params["enforce_eager"]) - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - Path(self.params["model_weights_parent_dir"], self.model_name) - ) - os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = str(self.params["venv"]) - os.environ["LOG_DIR"] = str(self.params["log_dir"]) - - def build_launch_command(self) -> str: - """Construct the full launch command with parameters.""" - # Base command - command_list = ["sbatch"] - # Append options - command_list.extend(["--job-name", f"{self.model_name}"]) - command_list.extend(["--partition", f"{self.params['partition']}"]) - command_list.extend(["--qos", f"{self.params['qos']}"]) - command_list.extend(["--time", f"{self.params['time']}"]) - command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) - command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) - command_list.extend( - [ - "--output", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", - ] - ) - command_list.extend( - [ - "--error", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", - ] - ) - # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") - return " ".join(command_list) - - def launch(self) -> tuple[str, Dict[str, str], Dict[str, Any]]: - """Launch the model and return job information. 
- - Returns - ------- - tuple[str, Dict[str, str], Dict[str, Any]] - Slurm job ID, config dictionary, and parameters dictionary - """ - # Set environment variables - self.set_env_vars() - - # Build and execute the command - command = self.build_launch_command() - output, _ = run_bash_command(command) - - # Parse the output - job_id, config_dict = parse_launch_output(output) - - # Save job configuration to JSON - job_json_dir = Path(self.params["log_dir"], f"{self.model_name}.{job_id}") - job_json_dir.mkdir(parents=True, exist_ok=True) - - job_json_path = job_json_dir / f"{self.model_name}.{job_id}.json" - - # Convert params for serialization - serializable_params = {k: str(v) for k, v in self.params.items()} - serializable_params["slurm_job_id"] = job_id - - with job_json_path.open("w") as f: - json.dump(serializable_params, f, indent=4) - - return job_id, config_dict, self.params + return slurm_job_id, config_dict \ No newline at end of file From 8b8ff56bbc4ae090a35003045eafd01060e7b20d Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 1 Apr 2025 15:26:41 -0400 Subject: [PATCH 14/52] Remove LaunchHelper from cli/_helper.py, ruff format and mypy fixes --- vec_inf/cli/_cli.py | 1 - vec_inf/cli/_helper.py | 185 --------------------------------------- vec_inf/shared/helper.py | 7 +- vec_inf/shared/utils.py | 2 +- 4 files changed, 3 insertions(+), 192 deletions(-) diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index a3bcde64..8c22ac4a 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -136,7 +136,6 @@ def launch( raise click.ClickException(f"Error: {stderr}") launch_helper.post_launch_processing(command_output, CONSOLE) - except click.ClickException as e: raise e except Exception as e: diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index b62bb4ab..b52bea42 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,9 +1,6 @@ """Command line interface for Vector Inference.""" -import json -import os import time -from pathlib import Path from typing import Any, Optional, Union, cast from urllib.parse import urlparse, urlunparse @@ -17,188 +14,6 @@ from vec_inf.shared import utils from vec_inf.shared.config import ModelConfig from vec_inf.shared.models import ModelStatus -from vec_inf.shared.utils import ( - LD_LIBRARY_PATH, - REQUIRED_FIELDS, - SRC_DIR, - VLLM_TASK_MAP, -) - - -class LaunchHelper: - def __init__( - self, model_name: str, cli_kwargs: dict[str, Optional[Union[str, int, bool]]] - ): - self.model_name = model_name - self.cli_kwargs = cli_kwargs - self.model_config = self._get_model_configuration() - self.params = self._get_launch_params() - - def _get_model_configuration(self) -> ModelConfig: - """Load and validate model configuration.""" - model_configs = utils.load_config() - config = next( - (m for m in model_configs if m.model_name == self.model_name), None - ) - - if config: - return config - # If model config not found, load path from CLI args or fallback to default - model_weights_parent_dir = self.cli_kwargs.get( - "model_weights_parent_dir", model_configs[0].model_weights_parent_dir - ) - model_weights_path = Path(cast(str, model_weights_parent_dir), self.model_name) - # Only give a warning msg if weights exist but config missing - if model_weights_path.exists(): - click.echo( - click.style( - f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", - fg="yellow", - ) - ) - # Return a dummy model config object with model name and weights parent dir - 
return ModelConfig( - model_name=self.model_name, - model_family="model_family_placeholder", - model_type="LLM", - gpus_per_node=1, - num_nodes=1, - vocab_size=1000, - max_model_len=8192, - model_weights_parent_dir=Path(cast(str, model_weights_parent_dir)), - ) - raise click.ClickException( - f"'{self.model_name}' not found in configuration and model weights " - f"not found at expected path '{model_weights_path}'" - ) - - def _get_launch_params(self) -> dict[str, Any]: - """Merge config defaults with CLI overrides.""" - params = self.model_config.model_dump() - - # Process boolean fields - for bool_field in ["pipeline_parallelism", "enforce_eager"]: - if (value := self.cli_kwargs.get(bool_field)) is not None: - params[bool_field] = utils.convert_boolean_value(value) - - # Merge other overrides - for key, value in self.cli_kwargs.items(): - if value is not None and key not in [ - "json_mode", - "pipeline_parallelism", - "enforce_eager", - ]: - params[key] = value - - # Validate required fields - if not REQUIRED_FIELDS.issubset(set(params.keys())): - raise click.ClickException( - f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" - ) - - # Create log directory - params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() - params["log_dir"].mkdir(parents=True, exist_ok=True) - - # Convert to string for JSON serialization - for field in params: - params[field] = str(params[field]) - - return params - - def set_env_vars(self) -> None: - """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] - os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] - os.environ["DATA_TYPE"] = self.params["data_type"] - os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] - os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] - os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - Path(self.params["model_weights_parent_dir"], self.model_name) - ) - os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = self.params["venv"] - os.environ["LOG_DIR"] = self.params["log_dir"] - - def build_launch_command(self) -> str: - """Construct the full launch command with parameters.""" - # Base command - command_list = ["sbatch"] - # Append options - command_list.extend(["--job-name", f"{self.model_name}"]) - command_list.extend(["--partition", f"{self.params['partition']}"]) - command_list.extend(["--qos", f"{self.params['qos']}"]) - command_list.extend(["--time", f"{self.params['time']}"]) - command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) - command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) - command_list.extend( - [ - "--output", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", - ] - ) - command_list.extend( - [ - "--error", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", - ] - ) - # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") - return " ".join(command_list) - - def format_table_output(self, job_id: str) -> Table: - """Format output as rich Table.""" - table = utils.create_table(key_title="Job 
Config", value_title="Value") - # Add rows - table.add_row("Slurm Job ID", job_id, style="blue") - table.add_row("Job Name", self.model_name) - table.add_row("Model Type", self.params["model_type"]) - table.add_row("Partition", self.params["partition"]) - table.add_row("QoS", self.params["qos"]) - table.add_row("Time Limit", self.params["time"]) - table.add_row("Num Nodes", self.params["num_nodes"]) - table.add_row("GPUs/Node", self.params["gpus_per_node"]) - table.add_row("Data Type", self.params["data_type"]) - table.add_row("Vocabulary Size", self.params["vocab_size"]) - table.add_row("Max Model Length", self.params["max_model_len"]) - table.add_row("Max Num Seqs", self.params["max_num_seqs"]) - table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"]) - table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"]) - table.add_row("Enforce Eager", self.params["enforce_eager"]) - table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) - table.add_row("Log Directory", self.params["log_dir"]) - - return table - - def post_launch_processing(self, output: str, console: Console) -> None: - """Process and display launch output.""" - json_mode = bool(self.cli_kwargs.get("json_mode", False)) - slurm_job_id = output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id - job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", - ) - job_json.parent.mkdir(parents=True, exist_ok=True) - job_json.touch(exist_ok=True) - - with job_json.open("w") as file: - json.dump(self.params, file, indent=4) - if json_mode: - click.echo(self.params) - else: - table = self.format_table_output(slurm_job_id) - console.print(table) class StatusHelper: diff --git a/vec_inf/shared/helper.py b/vec_inf/shared/helper.py index dbe55c65..dd7aa512 100644 --- a/vec_inf/shared/helper.py +++ b/vec_inf/shared/helper.py @@ -23,9 +23,7 @@ class LaunchHelper: """Shared launch helper for both CLI and API.""" - def __init__( - self, model_name: str, cli_kwargs: Optional[dict[str, Any]] - ): + def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): """Initialize the model launcher. 
Parameters @@ -50,7 +48,7 @@ def _get_model_configuration(self) -> ModelConfig: if config: return config - # If model config not found, check for path from keyword arguments or use fallback + # If model config not found, check for path from CLI kwargs or use fallback model_weights_parent_dir = self.cli_kwargs.get( "model_weights_parent_dir", model_configs[0].model_weights_parent_dir if model_configs else None, @@ -277,4 +275,3 @@ def post_launch_processing(self, output: str, console: Console) -> None: else: table = self.format_table_output(slurm_job_id) console.print(table) - diff --git a/vec_inf/shared/utils.py b/vec_inf/shared/utils.py index 10740da4..a229e3fb 100644 --- a/vec_inf/shared/utils.py +++ b/vec_inf/shared/utils.py @@ -212,4 +212,4 @@ def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: key, value = line.split(": ", 1) config_dict[key.lower().replace(" ", "_")] = value - return slurm_job_id, config_dict \ No newline at end of file + return slurm_job_id, config_dict From 33c7509e6658bd156a3bb485e2f5b7fbfcd3d61f Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Tue, 1 Apr 2025 18:28:56 -0400 Subject: [PATCH 15/52] Mark shared files as private, move CLI helpers to shared and create child helper classes for CLI --- vec_inf/cli/_cli.py | 21 +- vec_inf/cli/_helper.py | 341 +++++------------ vec_inf/shared/{config.py => _config.py} | 0 vec_inf/shared/_helper.py | 446 +++++++++++++++++++++++ vec_inf/shared/{models.py => _models.py} | 0 vec_inf/shared/{utils.py => _utils.py} | 8 +- vec_inf/shared/helper.py | 277 -------------- 7 files changed, 551 insertions(+), 542 deletions(-) rename vec_inf/shared/{config.py => _config.py} (100%) create mode 100644 vec_inf/shared/_helper.py rename vec_inf/shared/{models.py => _models.py} (100%) rename vec_inf/shared/{utils.py => _utils.py} (96%) delete mode 100644 vec_inf/shared/helper.py diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 8c22ac4a..6dae3433 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -7,9 +7,13 @@ from rich.console import Console from rich.live import Live -from vec_inf.cli._helper import ListHelper, MetricsHelper, StatusHelper -from vec_inf.shared import utils -from vec_inf.shared.helper import LaunchHelper +import vec_inf.shared._utils as utils +from vec_inf.cli._helper import ( + CLILaunchHelper, + CLIListHelper, + CLIMetricsHelper, + CLIStatusHelper, +) CONSOLE = Console() @@ -127,7 +131,7 @@ def launch( ) -> None: """Launch a model on the cluster.""" try: - launch_helper = LaunchHelper(model_name, cli_kwargs) + launch_helper = CLILaunchHelper(model_name, cli_kwargs) launch_helper.set_env_vars() launch_command = launch_helper.build_launch_command() @@ -163,7 +167,7 @@ def status( if stderr: raise click.ClickException(f"Error: {stderr}") - status_helper = StatusHelper(slurm_job_id, output, log_dir) + status_helper = CLIStatusHelper(slurm_job_id, output, log_dir) status_helper.process_job_state() if json_mode: @@ -176,8 +180,7 @@ def status( @click.argument("slurm_job_id", type=int, nargs=1) def shutdown(slurm_job_id: int) -> None: """Shutdown a running model on the cluster.""" - shutdown_cmd = f"scancel {slurm_job_id}" - utils.run_bash_command(shutdown_cmd) + utils.shutdown_model(slurm_job_id) click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}") @@ -190,7 +193,7 @@ def shutdown(slurm_job_id: int) -> None: ) def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: """List all available models, or get default setup of a specific model.""" - 
list_helper = ListHelper(model_name, json_mode) + list_helper = CLIListHelper(model_name, json_mode) list_helper.process_list_command(CONSOLE) @@ -201,7 +204,7 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No ) def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: """Stream real-time performance metrics from the model endpoint.""" - helper = MetricsHelper(slurm_job_id, log_dir) + helper = CLIMetricsHelper(slurm_job_id, log_dir) # Check if metrics URL is ready if not helper.metrics_url.startswith("http"): diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index b52bea42..670f8464 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,97 +1,99 @@ -"""Command line interface for Vector Inference.""" +"""Helper classes for the CLI.""" -import time -from typing import Any, Optional, Union, cast -from urllib.parse import urlparse, urlunparse +import json +import os +from pathlib import Path +from typing import Any, Optional, Union import click -import requests from rich.columns import Columns from rich.console import Console from rich.panel import Panel from rich.table import Table -from vec_inf.shared import utils -from vec_inf.shared.config import ModelConfig -from vec_inf.shared.models import ModelStatus +import vec_inf.shared._utils as utils +from vec_inf.shared._config import ModelConfig +from vec_inf.shared._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper + + +class CLILaunchHelper(LaunchHelper): + """CLI Helper class for handling launch information.""" + + def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): + super().__init__(model_name, cli_kwargs) + + def format_table_output(self, job_id: str) -> Table: + """Format output as rich Table.""" + table = utils.create_table(key_title="Job Config", value_title="Value") + + # Add key information with consistent styling + table.add_row("Slurm Job ID", job_id, style="blue") + table.add_row("Job Name", self.model_name) + + # Add model details + table.add_row("Model Type", self.params["model_type"]) + + # Add resource allocation details + table.add_row("Partition", self.params["partition"]) + table.add_row("QoS", self.params["qos"]) + table.add_row("Time Limit", self.params["time"]) + table.add_row("Num Nodes", self.params["num_nodes"]) + table.add_row("GPUs/Node", self.params["gpus_per_node"]) + + # Add model configuration details + table.add_row("Data Type", self.params["data_type"]) + table.add_row("Vocabulary Size", self.params["vocab_size"]) + table.add_row("Max Model Length", self.params["max_model_len"]) + table.add_row("Max Num Seqs", self.params["max_num_seqs"]) + table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"]) + table.add_row("Compilation Config", self.params["compilation_config"]) + table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"]) + if self.params.get("enable_prefix_caching"): + table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"]) + if self.params.get("enable_chunked_prefill"): + table.add_row( + "Enable Chunked Prefill", self.params["enable_chunked_prefill"] + ) + if self.params.get("max_num_batched_tokens"): + table.add_row( + "Max Num Batched Tokens", self.params["max_num_batched_tokens"] + ) + if self.params.get("enforce_eager"): + table.add_row("Enforce Eager", self.params["enforce_eager"]) + # Add path details + table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) + table.add_row("Log Directory", 
self.params["log_dir"]) -class StatusHelper: - def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): - self.slurm_job_id = slurm_job_id - self.output = output - self.log_dir = log_dir - self.status_info = self._get_base_status_data() - - def _get_base_status_data(self) -> dict[str, Union[str, None]]: - """Extract basic job status information from scontrol output.""" - try: - job_name = self.output.split(" ")[1].split("=")[1] - job_state = self.output.split(" ")[9].split("=")[1] - except IndexError: - job_name = ModelStatus.UNAVAILABLE - job_state = ModelStatus.UNAVAILABLE - - return { - "model_name": job_name, - "status": ModelStatus.UNAVAILABLE, - "base_url": ModelStatus.UNAVAILABLE, - "state": job_state, - "pending_reason": None, - "failed_reason": None, - } + return table - def process_job_state(self) -> None: - """Process different job states and update status information.""" - if self.status_info["state"] == ModelStatus.PENDING: - self.process_pending_state() - elif self.status_info["state"] == "RUNNING": - self.process_running_state() - - def check_model_health(self) -> None: - """Check model health and update status accordingly.""" - status, status_code = utils.model_health_check( - cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir + def post_launch_processing(self, output: str, console: Console) -> None: + """Process and display launch output.""" + json_mode = bool(self.cli_kwargs.get("json_mode", False)) + slurm_job_id = output.split(" ")[-1].strip().strip("\n") + self.params["slurm_job_id"] = slurm_job_id + job_json = Path( + self.params["log_dir"], + f"{self.model_name}.{slurm_job_id}", + f"{self.model_name}.{slurm_job_id}.json", ) - if status == ModelStatus.READY: - self.status_info["base_url"] = utils.get_base_url( - cast(str, self.status_info["model_name"]), - self.slurm_job_id, - self.log_dir, - ) - self.status_info["status"] = status - else: - self.status_info["status"], self.status_info["failed_reason"] = ( - status, - cast(str, status_code), - ) + job_json.parent.mkdir(parents=True, exist_ok=True) + job_json.touch(exist_ok=True) - def process_running_state(self) -> None: - """Process RUNNING job state and check server status.""" - server_status = utils.is_server_running( - cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir - ) + with job_json.open("w") as file: + json.dump(self.params, file, indent=4) + if json_mode: + click.echo(self.params) + else: + table = self.format_table_output(slurm_job_id) + console.print(table) - if isinstance(server_status, tuple): - self.status_info["status"], self.status_info["failed_reason"] = ( - server_status - ) - return - if server_status == "RUNNING": - self.check_model_health() - else: - self.status_info["status"] = server_status +class CLIStatusHelper(StatusHelper): + """CLI Helper class for handling status information.""" - def process_pending_state(self) -> None: - """Process PENDING job state.""" - try: - self.status_info["pending_reason"] = self.output.split(" ")[10].split("=")[ - 1 - ] - self.status_info["status"] = ModelStatus.PENDING - except IndexError: - self.status_info["pending_reason"] = "Unknown pending reason" + def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): + super().__init__(slurm_job_id, output, log_dir) def output_json(self) -> None: """Format and output JSON data.""" @@ -121,169 +123,11 @@ def output_table(self, console: Console) -> None: console.print(table) -class MetricsHelper: - def __init__(self, 
slurm_job_id: int, log_dir: Optional[str] = None): - self.slurm_job_id = slurm_job_id - self.log_dir = log_dir - self.status_info = self._get_status_info() - self.metrics_url = self._build_metrics_url() - self.enabled_prefix_caching = self._check_prefix_caching() - - self._prev_prompt_tokens: float = 0.0 - self._prev_generation_tokens: float = 0.0 - self._last_updated: Optional[float] = None - self._last_throughputs = {"prompt": 0.0, "generation": 0.0} - - def _get_status_info(self) -> dict[str, Union[str, None]]: - """Retrieve status info using existing StatusHelper.""" - status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" - output, stderr = utils.run_bash_command(status_cmd) - if stderr: - raise click.ClickException(f"Error: {stderr}") - status_helper = StatusHelper(self.slurm_job_id, output, self.log_dir) - return status_helper.status_info - - def _build_metrics_url(self) -> str: - """Construct metrics endpoint URL from base URL with version stripping.""" - if self.status_info.get("state") == "PENDING": - return "Pending resources for server initialization" - - base_url = utils.get_base_url( - cast(str, self.status_info["model_name"]), - self.slurm_job_id, - self.log_dir, - ) - if not base_url.startswith("http"): - return "Server not ready" - - parsed = urlparse(base_url) - clean_path = parsed.path.replace("/v1", "", 1).rstrip("/") - return urlunparse( - (parsed.scheme, parsed.netloc, f"{clean_path}/metrics", "", "", "") - ) +class CLIMetricsHelper(MetricsHelper): + """CLI Helper class for streaming metrics information.""" - def _check_prefix_caching(self) -> bool: - """Check if prefix caching is enabled.""" - job_json = utils.read_slurm_log( - cast(str, self.status_info["model_name"]), - self.slurm_job_id, - "json", - self.log_dir, - ) - if isinstance(job_json, str): - return False - return bool(cast(dict[str, str], job_json).get("enable_prefix_caching", False)) - - def fetch_metrics(self) -> Union[dict[str, float], str]: - """Fetch metrics from the endpoint.""" - try: - response = requests.get(self.metrics_url, timeout=3) - response.raise_for_status() - current_metrics = self._parse_metrics(response.text) - current_time = time.time() - - # Set defaults using last known throughputs - current_metrics.setdefault( - "prompt_tokens_per_sec", self._last_throughputs["prompt"] - ) - current_metrics.setdefault( - "generation_tokens_per_sec", self._last_throughputs["generation"] - ) - - if self._last_updated is None: - self._prev_prompt_tokens = current_metrics.get( - "total_prompt_tokens", 0.0 - ) - self._prev_generation_tokens = current_metrics.get( - "total_generation_tokens", 0.0 - ) - self._last_updated = current_time - return current_metrics - - time_diff = current_time - self._last_updated - if time_diff > 0: - current_prompt = current_metrics.get("total_prompt_tokens", 0.0) - current_gen = current_metrics.get("total_generation_tokens", 0.0) - - delta_prompt = current_prompt - self._prev_prompt_tokens - delta_gen = current_gen - self._prev_generation_tokens - - # Only update throughputs when we have new tokens - prompt_tps = ( - delta_prompt / time_diff - if delta_prompt > 0 - else self._last_throughputs["prompt"] - ) - gen_tps = ( - delta_gen / time_diff - if delta_gen > 0 - else self._last_throughputs["generation"] - ) - - current_metrics["prompt_tokens_per_sec"] = prompt_tps - current_metrics["generation_tokens_per_sec"] = gen_tps - - # Persist calculated values regardless of activity - self._last_throughputs["prompt"] = prompt_tps - self._last_throughputs["generation"] = 
gen_tps - - # Update tracking state - self._prev_prompt_tokens = current_prompt - self._prev_generation_tokens = current_gen - self._last_updated = current_time - - # Calculate average latency if data is available - if ( - "request_latency_sum" in current_metrics - and "request_latency_count" in current_metrics - ): - latency_sum = current_metrics["request_latency_sum"] - latency_count = current_metrics["request_latency_count"] - current_metrics["avg_request_latency"] = ( - latency_sum / latency_count if latency_count > 0 else 0.0 - ) - - return current_metrics - - except requests.RequestException as e: - return f"Metrics request failed, `metrics` endpoint might not be ready yet: {str(e)}" - - def _parse_metrics(self, metrics_text: str) -> dict[str, float]: - """Parse metrics with latency count and sum.""" - key_metrics = { - "vllm:prompt_tokens_total": "total_prompt_tokens", - "vllm:generation_tokens_total": "total_generation_tokens", - "vllm:e2e_request_latency_seconds_sum": "request_latency_sum", - "vllm:e2e_request_latency_seconds_count": "request_latency_count", - "vllm:request_queue_time_seconds_sum": "queue_time_sum", - "vllm:request_success_total": "successful_requests_total", - "vllm:num_requests_running": "requests_running", - "vllm:num_requests_waiting": "requests_waiting", - "vllm:num_requests_swapped": "requests_swapped", - "vllm:gpu_cache_usage_perc": "gpu_cache_usage", - "vllm:cpu_cache_usage_perc": "cpu_cache_usage", - } - - if self.enabled_prefix_caching: - key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate" - key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate" - - parsed: dict[str, float] = {} - for line in metrics_text.split("\n"): - if line.startswith("#") or not line.strip(): - continue - - parts = line.split() - if len(parts) < 2: - continue - - metric_name = parts[0].split("{")[0] - if metric_name in key_metrics: - try: - parsed[key_metrics[metric_name]] = float(parts[1]) - except (ValueError, IndexError): - continue - return parsed + def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): + super().__init__(slurm_job_id, log_dir) def display_failed_metrics(self, table: Table, metrics: str) -> None: table.add_row("Server State", self.status_info["state"], style="yellow") @@ -356,24 +200,11 @@ def display_metrics(self, table: Table, metrics: dict[str, float]) -> None: ) -class ListHelper: +class CLIListHelper(ListHelper): """Helper class for handling model listing functionality.""" def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): - self.model_name = model_name - self.json_mode = json_mode - self.model_configs = utils.load_config() - - def get_single_model_config(self) -> ModelConfig: - """Get configuration for a specific model.""" - config = next( - (c for c in self.model_configs if c.model_name == self.model_name), None - ) - if not config: - raise click.ClickException( - f"Model '{self.model_name}' not found in configuration" - ) - return config + super().__init__(model_name, json_mode) def format_single_model_output( self, config: ModelConfig diff --git a/vec_inf/shared/config.py b/vec_inf/shared/_config.py similarity index 100% rename from vec_inf/shared/config.py rename to vec_inf/shared/_config.py diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py new file mode 100644 index 00000000..224bf983 --- /dev/null +++ b/vec_inf/shared/_helper.py @@ -0,0 +1,446 @@ +"""Helper class for the model launch.""" + +import os +import time +from pathlib import Path +from 
typing import Any, Optional, Union, cast +from urllib.parse import urlparse, urlunparse + +import click +import requests + +import vec_inf.shared._utils as utils +from vec_inf.shared._config import ModelConfig +from vec_inf.shared._models import ModelStatus +from vec_inf.shared._utils import ( + BOOLEAN_FIELDS, + LD_LIBRARY_PATH, + REQUIRED_FIELDS, + SRC_DIR, + VLLM_TASK_MAP, +) + + +class LaunchHelper: + """Helper class for handling inference server launch.""" + + def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): + """Initialize the model launcher. + + Parameters + ---------- + model_name: str + Name of the model to launch + cli_kwargs: Optional[dict[str, Any]] + Optional launch keyword arguments to override default configuration + """ + self.model_name = model_name + self.cli_kwargs = cli_kwargs or {} + self.model_config = self._get_model_configuration() + self.params = self._get_launch_params() + + def _get_model_configuration(self) -> ModelConfig: + """Load and validate model configuration.""" + model_configs = utils.load_config() + config = next( + (m for m in model_configs if m.model_name == self.model_name), None + ) + + if config: + return config + + # If model config not found, check for path from CLI kwargs or use fallback + model_weights_parent_dir = self.cli_kwargs.get( + "model_weights_parent_dir", + model_configs[0].model_weights_parent_dir if model_configs else None, + ) + + if not model_weights_parent_dir: + raise ValueError( + f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" + ) + + model_weights_path = Path(model_weights_parent_dir, self.model_name) + + # Only give a warning if weights exist but config missing + if model_weights_path.exists(): + click.echo( + click.style( + f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", + fg="yellow", + ) + ) + # Return a dummy model config object with model name and weights parent dir + return ModelConfig( + model_name=self.model_name, + model_family="model_family_placeholder", + model_type="LLM", + gpus_per_node=1, + num_nodes=1, + vocab_size=1000, + max_model_len=8192, + model_weights_parent_dir=Path(str(model_weights_parent_dir)), + ) + + raise click.ClickException( + f"'{self.model_name}' not found in configuration and model weights " + f"not found at expected path '{model_weights_path}'" + ) + + def _get_launch_params(self) -> dict[str, Any]: + """Merge config defaults with CLI overrides.""" + params = self.model_config.model_dump() + + # Process boolean fields + for bool_field in BOOLEAN_FIELDS: + if self.cli_kwargs[bool_field]: + params[bool_field] = True + + # Merge other overrides + for key, value in self.cli_kwargs.items(): + if value is not None and key not in [ + "json_mode", + *BOOLEAN_FIELDS, + ]: + params[key] = value + + # Validate required fields + if not REQUIRED_FIELDS.issubset(set(params.keys())): + raise click.ClickException( + f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" + ) + + # Create log directory + params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() + params["log_dir"].mkdir(parents=True, exist_ok=True) + + # Convert to string for JSON serialization + for field in params: + params[field] = str(params[field]) + + return params + + def set_env_vars(self) -> None: + """Set environment variables for the launch command.""" + os.environ["MODEL_NAME"] = self.model_name + os.environ["MAX_MODEL_LEN"] = 
self.params["max_model_len"] + os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] + os.environ["DATA_TYPE"] = self.params["data_type"] + os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] + os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] + os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] + os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] + os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"] + os.environ["SRC_DIR"] = SRC_DIR + os.environ["MODEL_WEIGHTS"] = str( + Path(self.params["model_weights_parent_dir"], self.model_name) + ) + os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH + os.environ["VENV_BASE"] = self.params["venv"] + os.environ["LOG_DIR"] = self.params["log_dir"] + + if self.params.get("enable_prefix_caching"): + os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"] + if self.params.get("enable_chunked_prefill"): + os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"] + if self.params.get("max_num_batched_tokens"): + os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"] + if self.params.get("enforce_eager"): + os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] + + def build_launch_command(self) -> str: + """Construct the full launch command with parameters.""" + # Base command + command_list = ["sbatch"] + # Append options + command_list.extend(["--job-name", f"{self.model_name}"]) + command_list.extend(["--partition", f"{self.params['partition']}"]) + command_list.extend(["--qos", f"{self.params['qos']}"]) + command_list.extend(["--time", f"{self.params['time']}"]) + command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) + command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) + command_list.extend( + [ + "--output", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", + ] + ) + command_list.extend( + [ + "--error", + f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", + ] + ) + # Add slurm script + slurm_script = "vllm.slurm" + if int(self.params["num_nodes"]) > 1: + slurm_script = "multinode_vllm.slurm" + command_list.append(f"{SRC_DIR}/{slurm_script}") + return " ".join(command_list) + + +class StatusHelper: + """Helper class for handling server status information.""" + + def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): + self.slurm_job_id = slurm_job_id + self.output = output + self.log_dir = log_dir + self.status_info = self._get_base_status_data() + + def _get_base_status_data(self) -> dict[str, Union[str, None]]: + """Extract basic job status information from scontrol output.""" + try: + job_name = self.output.split(" ")[1].split("=")[1] + job_state = self.output.split(" ")[9].split("=")[1] + except IndexError: + job_name = ModelStatus.UNAVAILABLE + job_state = ModelStatus.UNAVAILABLE + + return { + "model_name": job_name, + "status": ModelStatus.UNAVAILABLE, + "base_url": ModelStatus.UNAVAILABLE, + "state": job_state, + "pending_reason": None, + "failed_reason": None, + } + + def process_job_state(self) -> None: + """Process different job states and update status information.""" + if self.status_info["state"] == ModelStatus.PENDING: + self.process_pending_state() + elif self.status_info["state"] == "RUNNING": + self.process_running_state() + + def check_model_health(self) -> None: + """Check model health and update status accordingly.""" + status, status_code = utils.model_health_check( + cast(str, 
self.status_info["model_name"]), self.slurm_job_id, self.log_dir + ) + if status == ModelStatus.READY: + self.status_info["base_url"] = utils.get_base_url( + cast(str, self.status_info["model_name"]), + self.slurm_job_id, + self.log_dir, + ) + self.status_info["status"] = status + else: + self.status_info["status"], self.status_info["failed_reason"] = ( + status, + cast(str, status_code), + ) + + def process_running_state(self) -> None: + """Process RUNNING job state and check server status.""" + server_status = utils.is_server_running( + cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir + ) + + if isinstance(server_status, tuple): + self.status_info["status"], self.status_info["failed_reason"] = ( + server_status + ) + return + + if server_status == "RUNNING": + self.check_model_health() + else: + self.status_info["status"] = server_status + + def process_pending_state(self) -> None: + """Process PENDING job state.""" + try: + self.status_info["pending_reason"] = self.output.split(" ")[10].split("=")[ + 1 + ] + self.status_info["status"] = ModelStatus.PENDING + except IndexError: + self.status_info["pending_reason"] = "Unknown pending reason" + + +class MetricsHelper: + """Helper class for handling metrics information.""" + + def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): + self.slurm_job_id = slurm_job_id + self.log_dir = log_dir + self.status_info = self._get_status_info() + self.metrics_url = self._build_metrics_url() + self.enabled_prefix_caching = self._check_prefix_caching() + + self._prev_prompt_tokens: float = 0.0 + self._prev_generation_tokens: float = 0.0 + self._last_updated: Optional[float] = None + self._last_throughputs = {"prompt": 0.0, "generation": 0.0} + + def _get_status_info(self) -> dict[str, Union[str, None]]: + """Retrieve status info using existing StatusHelper.""" + status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" + output, stderr = utils.run_bash_command(status_cmd) + if stderr: + raise click.ClickException(f"Error: {stderr}") + status_helper = StatusHelper(self.slurm_job_id, output, self.log_dir) + return status_helper.status_info + + def _build_metrics_url(self) -> str: + """Construct metrics endpoint URL from base URL with version stripping.""" + if self.status_info.get("state") == "PENDING": + return "Pending resources for server initialization" + + base_url = utils.get_base_url( + cast(str, self.status_info["model_name"]), + self.slurm_job_id, + self.log_dir, + ) + if not base_url.startswith("http"): + return "Server not ready" + + parsed = urlparse(base_url) + clean_path = parsed.path.replace("/v1", "", 1).rstrip("/") + return urlunparse( + (parsed.scheme, parsed.netloc, f"{clean_path}/metrics", "", "", "") + ) + + def _check_prefix_caching(self) -> bool: + """Check if prefix caching is enabled.""" + job_json = utils.read_slurm_log( + cast(str, self.status_info["model_name"]), + self.slurm_job_id, + "json", + self.log_dir, + ) + if isinstance(job_json, str): + return False + return bool(cast(dict[str, str], job_json).get("enable_prefix_caching", False)) + + def fetch_metrics(self) -> Union[dict[str, float], str]: + """Fetch metrics from the endpoint.""" + try: + response = requests.get(self.metrics_url, timeout=3) + response.raise_for_status() + current_metrics = self._parse_metrics(response.text) + current_time = time.time() + + # Set defaults using last known throughputs + current_metrics.setdefault( + "prompt_tokens_per_sec", self._last_throughputs["prompt"] + ) + current_metrics.setdefault( 
+ "generation_tokens_per_sec", self._last_throughputs["generation"] + ) + + if self._last_updated is None: + self._prev_prompt_tokens = current_metrics.get( + "total_prompt_tokens", 0.0 + ) + self._prev_generation_tokens = current_metrics.get( + "total_generation_tokens", 0.0 + ) + self._last_updated = current_time + return current_metrics + + time_diff = current_time - self._last_updated + if time_diff > 0: + current_prompt = current_metrics.get("total_prompt_tokens", 0.0) + current_gen = current_metrics.get("total_generation_tokens", 0.0) + + delta_prompt = current_prompt - self._prev_prompt_tokens + delta_gen = current_gen - self._prev_generation_tokens + + # Only update throughputs when we have new tokens + prompt_tps = ( + delta_prompt / time_diff + if delta_prompt > 0 + else self._last_throughputs["prompt"] + ) + gen_tps = ( + delta_gen / time_diff + if delta_gen > 0 + else self._last_throughputs["generation"] + ) + + current_metrics["prompt_tokens_per_sec"] = prompt_tps + current_metrics["generation_tokens_per_sec"] = gen_tps + + # Persist calculated values regardless of activity + self._last_throughputs["prompt"] = prompt_tps + self._last_throughputs["generation"] = gen_tps + + # Update tracking state + self._prev_prompt_tokens = current_prompt + self._prev_generation_tokens = current_gen + self._last_updated = current_time + + # Calculate average latency if data is available + if ( + "request_latency_sum" in current_metrics + and "request_latency_count" in current_metrics + ): + latency_sum = current_metrics["request_latency_sum"] + latency_count = current_metrics["request_latency_count"] + current_metrics["avg_request_latency"] = ( + latency_sum / latency_count if latency_count > 0 else 0.0 + ) + + return current_metrics + + except requests.RequestException as e: + return f"Metrics request failed, `metrics` endpoint might not be ready yet: {str(e)}" + + def _parse_metrics(self, metrics_text: str) -> dict[str, float]: + """Parse metrics with latency count and sum.""" + key_metrics = { + "vllm:prompt_tokens_total": "total_prompt_tokens", + "vllm:generation_tokens_total": "total_generation_tokens", + "vllm:e2e_request_latency_seconds_sum": "request_latency_sum", + "vllm:e2e_request_latency_seconds_count": "request_latency_count", + "vllm:request_queue_time_seconds_sum": "queue_time_sum", + "vllm:request_success_total": "successful_requests_total", + "vllm:num_requests_running": "requests_running", + "vllm:num_requests_waiting": "requests_waiting", + "vllm:num_requests_swapped": "requests_swapped", + "vllm:gpu_cache_usage_perc": "gpu_cache_usage", + "vllm:cpu_cache_usage_perc": "cpu_cache_usage", + } + + if self.enabled_prefix_caching: + key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate" + key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate" + + parsed: dict[str, float] = {} + for line in metrics_text.split("\n"): + if line.startswith("#") or not line.strip(): + continue + + parts = line.split() + if len(parts) < 2: + continue + + metric_name = parts[0].split("{")[0] + if metric_name in key_metrics: + try: + parsed[key_metrics[metric_name]] = float(parts[1]) + except (ValueError, IndexError): + continue + return parsed + + +class ListHelper: + """Helper class for handling model listing functionality.""" + + def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): + self.model_name = model_name + self.json_mode = json_mode + self.model_configs = utils.load_config() + + def get_single_model_config(self) -> 
ModelConfig: + """Get configuration for a specific model.""" + config = next( + (c for c in self.model_configs if c.model_name == self.model_name), None + ) + if not config: + raise click.ClickException( + f"Model '{self.model_name}' not found in configuration" + ) + return config diff --git a/vec_inf/shared/models.py b/vec_inf/shared/_models.py similarity index 100% rename from vec_inf/shared/models.py rename to vec_inf/shared/_models.py diff --git a/vec_inf/shared/utils.py b/vec_inf/shared/_utils.py similarity index 96% rename from vec_inf/shared/utils.py rename to vec_inf/shared/_utils.py index a229e3fb..827fd726 100644 --- a/vec_inf/shared/utils.py +++ b/vec_inf/shared/_utils.py @@ -10,7 +10,7 @@ import yaml from rich.table import Table -from vec_inf.shared.config import ModelConfig +from vec_inf.shared._config import ModelConfig MODEL_READY_SIGNATURE = "INFO: Application startup complete." @@ -188,6 +188,12 @@ def load_config() -> list[ModelConfig]: ] +def shutdown_model(slurm_job_id: int) -> None: + """Shutdown a running model on the cluster.""" + shutdown_cmd = f"scancel {slurm_job_id}" + run_bash_command(shutdown_cmd) + + def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: """Parse output from model launch command. diff --git a/vec_inf/shared/helper.py b/vec_inf/shared/helper.py deleted file mode 100644 index dd7aa512..00000000 --- a/vec_inf/shared/helper.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Helper class for the model launch.""" - -import json -import os -from pathlib import Path -from typing import Any, Dict, Optional - -import click -from rich.console import Console -from rich.table import Table - -from vec_inf.shared import utils -from vec_inf.shared.config import ModelConfig -from vec_inf.shared.utils import ( - BOOLEAN_FIELDS, - LD_LIBRARY_PATH, - REQUIRED_FIELDS, - SRC_DIR, - VLLM_TASK_MAP, -) - - -class LaunchHelper: - """Shared launch helper for both CLI and API.""" - - def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): - """Initialize the model launcher. 
- - Parameters - ---------- - model_name: str - Name of the model to launch - cli_kwargs: Optional[dict[str, Any]] - Optional launch keyword arguments to override default configuration - """ - self.model_name = model_name - self.cli_kwargs = cli_kwargs or {} - self.model_config = self._get_model_configuration() - self.params = self._get_launch_params() - - def _get_model_configuration(self) -> ModelConfig: - """Load and validate model configuration.""" - model_configs = utils.load_config() - config = next( - (m for m in model_configs if m.model_name == self.model_name), None - ) - - if config: - return config - - # If model config not found, check for path from CLI kwargs or use fallback - model_weights_parent_dir = self.cli_kwargs.get( - "model_weights_parent_dir", - model_configs[0].model_weights_parent_dir if model_configs else None, - ) - - if not model_weights_parent_dir: - raise ValueError( - f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" - ) - - model_weights_path = Path(model_weights_parent_dir, self.model_name) - - # Only give a warning if weights exist but config missing - if model_weights_path.exists(): - click.echo( - click.style( - f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", - fg="yellow", - ) - ) - # Return a dummy model config object with model name and weights parent dir - return ModelConfig( - model_name=self.model_name, - model_family="model_family_placeholder", - model_type="LLM", - gpus_per_node=1, - num_nodes=1, - vocab_size=1000, - max_model_len=8192, - model_weights_parent_dir=Path(str(model_weights_parent_dir)), - ) - - raise click.ClickException( - f"'{self.model_name}' not found in configuration and model weights " - f"not found at expected path '{model_weights_path}'" - ) - - def _get_launch_params(self) -> dict[str, Any]: - """Merge config defaults with CLI overrides.""" - params = self.model_config.model_dump() - - # Process boolean fields - for bool_field in BOOLEAN_FIELDS: - if self.cli_kwargs[bool_field]: - params[bool_field] = True - - # Merge other overrides - for key, value in self.cli_kwargs.items(): - if value is not None and key not in [ - "json_mode", - *BOOLEAN_FIELDS, - ]: - params[key] = value - - # Validate required fields - if not REQUIRED_FIELDS.issubset(set(params.keys())): - raise click.ClickException( - f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" - ) - - # Create log directory - params["log_dir"] = Path(params["log_dir"], params["model_family"]).expanduser() - params["log_dir"].mkdir(parents=True, exist_ok=True) - - # Convert to string for JSON serialization - for field in params: - params[field] = str(params[field]) - - return params - - def set_env_vars(self) -> None: - """Set environment variables for the launch command.""" - os.environ["MODEL_NAME"] = self.model_name - os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] - os.environ["MAX_LOGPROBS"] = self.params["vocab_size"] - os.environ["DATA_TYPE"] = self.params["data_type"] - os.environ["MAX_NUM_SEQS"] = self.params["max_num_seqs"] - os.environ["GPU_MEMORY_UTILIZATION"] = self.params["gpu_memory_utilization"] - os.environ["TASK"] = VLLM_TASK_MAP[self.params["model_type"]] - os.environ["PIPELINE_PARALLELISM"] = self.params["pipeline_parallelism"] - os.environ["COMPILATION_CONFIG"] = self.params["compilation_config"] - os.environ["SRC_DIR"] = SRC_DIR - os.environ["MODEL_WEIGHTS"] = str( - 
Path(self.params["model_weights_parent_dir"], self.model_name) - ) - os.environ["LD_LIBRARY_PATH"] = LD_LIBRARY_PATH - os.environ["VENV_BASE"] = self.params["venv"] - os.environ["LOG_DIR"] = self.params["log_dir"] - - if self.params.get("enable_prefix_caching"): - os.environ["ENABLE_PREFIX_CACHING"] = self.params["enable_prefix_caching"] - if self.params.get("enable_chunked_prefill"): - os.environ["ENABLE_CHUNKED_PREFILL"] = self.params["enable_chunked_prefill"] - if self.params.get("max_num_batched_tokens"): - os.environ["MAX_NUM_BATCHED_TOKENS"] = self.params["max_num_batched_tokens"] - if self.params.get("enforce_eager"): - os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] - - def build_launch_command(self) -> str: - """Construct the full launch command with parameters.""" - # Base command - command_list = ["sbatch"] - # Append options - command_list.extend(["--job-name", f"{self.model_name}"]) - command_list.extend(["--partition", f"{self.params['partition']}"]) - command_list.extend(["--qos", f"{self.params['qos']}"]) - command_list.extend(["--time", f"{self.params['time']}"]) - command_list.extend(["--nodes", f"{self.params['num_nodes']}"]) - command_list.extend(["--gpus-per-node", f"{self.params['gpus_per_node']}"]) - command_list.extend( - [ - "--output", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.out", - ] - ) - command_list.extend( - [ - "--error", - f"{self.params['log_dir']}/{self.model_name}.%j/{self.model_name}.%j.err", - ] - ) - # Add slurm script - slurm_script = "vllm.slurm" - if int(self.params["num_nodes"]) > 1: - slurm_script = "multinode_vllm.slurm" - command_list.append(f"{SRC_DIR}/{slurm_script}") - return " ".join(command_list) - - def launch(self) -> tuple[str, Dict[str, str], Dict[str, Any]]: - """Launch the model and return job information. 
- - Returns - ------- - tuple[str, Dict[str, str], Dict[str, Any]] - Slurm job ID, config dictionary, and parameters dictionary - """ - # Set environment variables - self.set_env_vars() - - # Build and execute the command - command = self.build_launch_command() - output, _ = utils.run_bash_command(command) - - # Parse the output - job_id, config_dict = utils.parse_launch_output(output) - - # Save job configuration to JSON - job_json_dir = Path(self.params["log_dir"], f"{self.model_name}.{job_id}") - job_json_dir.mkdir(parents=True, exist_ok=True) - - job_json_path = job_json_dir / f"{self.model_name}.{job_id}.json" - - # Convert params for serialization - serializable_params = {k: str(v) for k, v in self.params.items()} - serializable_params["slurm_job_id"] = job_id - - with job_json_path.open("w") as f: - json.dump(serializable_params, f, indent=4) - - return job_id, config_dict, self.params - - def format_table_output(self, job_id: str) -> Table: - """Format output as rich Table.""" - table = utils.create_table(key_title="Job Config", value_title="Value") - - # Add key information with consistent styling - table.add_row("Slurm Job ID", job_id, style="blue") - table.add_row("Job Name", self.model_name) - - # Add model details - table.add_row("Model Type", self.params["model_type"]) - - # Add resource allocation details - table.add_row("Partition", self.params["partition"]) - table.add_row("QoS", self.params["qos"]) - table.add_row("Time Limit", self.params["time"]) - table.add_row("Num Nodes", self.params["num_nodes"]) - table.add_row("GPUs/Node", self.params["gpus_per_node"]) - - # Add model configuration details - table.add_row("Data Type", self.params["data_type"]) - table.add_row("Vocabulary Size", self.params["vocab_size"]) - table.add_row("Max Model Length", self.params["max_model_len"]) - table.add_row("Max Num Seqs", self.params["max_num_seqs"]) - table.add_row("GPU Memory Utilization", self.params["gpu_memory_utilization"]) - table.add_row("Compilation Config", self.params["compilation_config"]) - table.add_row("Pipeline Parallelism", self.params["pipeline_parallelism"]) - if self.params.get("enable_prefix_caching"): - table.add_row("Enable Prefix Caching", self.params["enable_prefix_caching"]) - if self.params.get("enable_chunked_prefill"): - table.add_row( - "Enable Chunked Prefill", self.params["enable_chunked_prefill"] - ) - if self.params.get("max_num_batched_tokens"): - table.add_row( - "Max Num Batched Tokens", self.params["max_num_batched_tokens"] - ) - if self.params.get("enforce_eager"): - table.add_row("Enforce Eager", self.params["enforce_eager"]) - - # Add path details - table.add_row("Model Weights Directory", os.environ.get("MODEL_WEIGHTS")) - table.add_row("Log Directory", self.params["log_dir"]) - - return table - - def post_launch_processing(self, output: str, console: Console) -> None: - """Process and display launch output.""" - json_mode = bool(self.cli_kwargs.get("json_mode", False)) - slurm_job_id = output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id - job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", - ) - job_json.parent.mkdir(parents=True, exist_ok=True) - job_json.touch(exist_ok=True) - - with job_json.open("w") as file: - json.dump(self.params, file, indent=4) - if json_mode: - click.echo(self.params) - else: - table = self.format_table_output(slurm_job_id) - console.print(table) From 85d82c5358b4d439a9d4ce26ce3eb929538ab6ba Mon Sep 17 
00:00:00 2001 From: Marshall Wang Date: Tue, 1 Apr 2025 18:56:13 -0400 Subject: [PATCH 16/52] Decouple shared helper classes from click dependency --- vec_inf/shared/_helper.py | 24 +++++++++++++----------- vec_inf/shared/_utils.py | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index 224bf983..aa525d8f 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -2,11 +2,11 @@ import os import time +from abc import ABC, abstractmethod from pathlib import Path from typing import Any, Optional, Union, cast from urllib.parse import urlparse, urlunparse -import click import requests import vec_inf.shared._utils as utils @@ -21,7 +21,7 @@ ) -class LaunchHelper: +class LaunchHelper(ABC): """Helper class for handling inference server launch.""" def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): @@ -39,6 +39,11 @@ def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): self.model_config = self._get_model_configuration() self.params = self._get_launch_params() + @abstractmethod + def _warn(self, message: str) -> None: + """Warn the user about a potential issue.""" + pass + def _get_model_configuration(self) -> ModelConfig: """Load and validate model configuration.""" model_configs = utils.load_config() @@ -64,11 +69,8 @@ def _get_model_configuration(self) -> ModelConfig: # Only give a warning if weights exist but config missing if model_weights_path.exists(): - click.echo( - click.style( - f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", - fg="yellow", - ) + self._warn( + f"Warning: '{self.model_name}' configuration not found in config, please ensure model configuration are properly set in command arguments", ) # Return a dummy model config object with model name and weights parent dir return ModelConfig( @@ -82,7 +84,7 @@ def _get_model_configuration(self) -> ModelConfig: model_weights_parent_dir=Path(str(model_weights_parent_dir)), ) - raise click.ClickException( + raise utils.ModelConfigurationError( f"'{self.model_name}' not found in configuration and model weights " f"not found at expected path '{model_weights_path}'" ) @@ -106,7 +108,7 @@ def _get_launch_params(self) -> dict[str, Any]: # Validate required fields if not REQUIRED_FIELDS.issubset(set(params.keys())): - raise click.ClickException( + raise utils.MissingRequiredFieldsError( f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" ) @@ -279,7 +281,7 @@ def _get_status_info(self) -> dict[str, Union[str, None]]: status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" output, stderr = utils.run_bash_command(status_cmd) if stderr: - raise click.ClickException(f"Error: {stderr}") + raise RuntimeError(f"Error: {stderr}") status_helper = StatusHelper(self.slurm_job_id, output, self.log_dir) return status_helper.status_info @@ -440,7 +442,7 @@ def get_single_model_config(self) -> ModelConfig: (c for c in self.model_configs if c.model_name == self.model_name), None ) if not config: - raise click.ClickException( + raise utils.ModelNotFoundError( f"Model '{self.model_name}' not found in configuration" ) return config diff --git a/vec_inf/shared/_utils.py b/vec_inf/shared/_utils.py index 827fd726..8b07caaa 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/shared/_utils.py @@ -45,6 +45,24 @@ } +class ModelConfigurationError(Exception): + """Raised when the model config or weights are 
missing or invalid.""" + + pass + + +class MissingRequiredFieldsError(ValueError): + """Raised when required fields are missing from the provided parameters.""" + + pass + + +class ModelNotFoundError(KeyError): + """Raised when the specified model name is not found in the configuration.""" + + pass + + def run_bash_command(command: str) -> tuple[str, str]: """Run a bash command and return the output.""" process = subprocess.Popen( From 98b9c7601c6f0fb604ec1f6c73600c7c7e81e849 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 10:21:52 -0400 Subject: [PATCH 17/52] Move custom exceptions and global vars to dedicated files, add try catch for all CLI commands --- vec_inf/cli/_cli.py | 86 +++++++++++++++++++++-------------- vec_inf/cli/_helper.py | 4 ++ vec_inf/shared/_exceptions.py | 18 ++++++++ vec_inf/shared/_helper.py | 13 ++++-- vec_inf/shared/_utils.py | 58 +++-------------------- vec_inf/shared/_vars.py | 34 ++++++++++++++ 6 files changed, 123 insertions(+), 90 deletions(-) create mode 100644 vec_inf/shared/_exceptions.py create mode 100644 vec_inf/shared/_vars.py diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 6dae3433..321ee5e0 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -162,18 +162,24 @@ def status( slurm_job_id: int, log_dir: Optional[str] = None, json_mode: bool = False ) -> None: """Get the status of a running model on the cluster.""" - status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output, stderr = utils.run_bash_command(status_cmd) - if stderr: - raise click.ClickException(f"Error: {stderr}") + try: + status_cmd = f"scontrol show job {slurm_job_id} --oneliner" + output, stderr = utils.run_bash_command(status_cmd) + if stderr: + raise click.ClickException(f"Error: {stderr}") - status_helper = CLIStatusHelper(slurm_job_id, output, log_dir) + status_helper = CLIStatusHelper(slurm_job_id, output, log_dir) - status_helper.process_job_state() - if json_mode: - status_helper.output_json() - else: - status_helper.output_table(CONSOLE) + status_helper.process_job_state() + if json_mode: + status_helper.output_json() + else: + status_helper.output_table(CONSOLE) + + except click.ClickException as e: + raise e + except Exception as e: + raise click.ClickException(f"Status check failed: {str(e)}") from e @cli.command("shutdown") @@ -193,8 +199,13 @@ def shutdown(slurm_job_id: int) -> None: ) def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: """List all available models, or get default setup of a specific model.""" - list_helper = CLIListHelper(model_name, json_mode) - list_helper.process_list_command(CONSOLE) + try: + list_helper = CLIListHelper(model_name, json_mode) + list_helper.process_list_command(CONSOLE) + except click.ClickException as e: + raise e + except Exception as e: + raise click.ClickException(f"List models failed: {str(e)}") from e @cli.command("metrics") @@ -204,30 +215,35 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No ) def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: """Stream real-time performance metrics from the model endpoint.""" - helper = CLIMetricsHelper(slurm_job_id, log_dir) - - # Check if metrics URL is ready - if not helper.metrics_url.startswith("http"): - table = utils.create_table("Metric", "Value") - helper.display_failed_metrics( - table, f"Metrics endpoint unavailable - {helper.metrics_url}" - ) - CONSOLE.print(table) - return - - with Live(refresh_per_second=1, console=CONSOLE) as live: - while True: 
- metrics = helper.fetch_metrics() - table = utils.create_table("Metric", "Value") - - if isinstance(metrics, str): - # Show status information if metrics aren't available - helper.display_failed_metrics(table, metrics) - else: - helper.display_metrics(table, metrics) + try: + helper = CLIMetricsHelper(slurm_job_id, log_dir) - live.update(table) - time.sleep(2) + # Check if metrics URL is ready + if not helper.metrics_url.startswith("http"): + table = utils.create_table("Metric", "Value") + helper.display_failed_metrics( + table, f"Metrics endpoint unavailable - {helper.metrics_url}" + ) + CONSOLE.print(table) + return + + with Live(refresh_per_second=1, console=CONSOLE) as live: + while True: + metrics = helper.fetch_metrics() + table = utils.create_table("Metric", "Value") + + if isinstance(metrics, str): + # Show status information if metrics aren't available + helper.display_failed_metrics(table, metrics) + else: + helper.display_metrics(table, metrics) + + live.update(table) + time.sleep(2) + except click.ClickException as e: + raise e + except Exception as e: + raise click.ClickException(f"Metrics check failed: {str(e)}") from e if __name__ == "__main__": diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 670f8464..34993610 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -22,6 +22,10 @@ class CLILaunchHelper(LaunchHelper): def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): super().__init__(model_name, cli_kwargs) + def _warn(self, message: str) -> None: + """Warn the user about a potential issue.""" + click.echo(click.style(f"Warning: {message}", fg="yellow"), err=True) + def format_table_output(self, job_id: str) -> Table: """Format output as rich Table.""" table = utils.create_table(key_title="Job Config", value_title="Value") diff --git a/vec_inf/shared/_exceptions.py b/vec_inf/shared/_exceptions.py new file mode 100644 index 00000000..55148475 --- /dev/null +++ b/vec_inf/shared/_exceptions.py @@ -0,0 +1,18 @@ +"""Exceptions for the vector inference package.""" + +class ModelConfigurationError(Exception): + """Raised when the model config or weights are missing or invalid.""" + + pass + + +class MissingRequiredFieldsError(ValueError): + """Raised when required fields are missing from the provided parameters.""" + + pass + + +class ModelNotFoundError(KeyError): + """Raised when the specified model name is not found in the configuration.""" + + pass diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index aa525d8f..c76f1a42 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -11,8 +11,13 @@ import vec_inf.shared._utils as utils from vec_inf.shared._config import ModelConfig +from vec_inf.shared._exceptions import ( + ModelConfigurationError, + MissingRequiredFieldsError, + ModelNotFoundError, +) from vec_inf.shared._models import ModelStatus -from vec_inf.shared._utils import ( +from vec_inf.shared._vars import ( BOOLEAN_FIELDS, LD_LIBRARY_PATH, REQUIRED_FIELDS, @@ -84,7 +89,7 @@ def _get_model_configuration(self) -> ModelConfig: model_weights_parent_dir=Path(str(model_weights_parent_dir)), ) - raise utils.ModelConfigurationError( + raise ModelConfigurationError( f"'{self.model_name}' not found in configuration and model weights " f"not found at expected path '{model_weights_path}'" ) @@ -108,7 +113,7 @@ def _get_launch_params(self) -> dict[str, Any]: # Validate required fields if not REQUIRED_FIELDS.issubset(set(params.keys())): - raise utils.MissingRequiredFieldsError( + raise 
MissingRequiredFieldsError( f"Missing required fields: {REQUIRED_FIELDS - set(params.keys())}" ) @@ -442,7 +447,7 @@ def get_single_model_config(self) -> ModelConfig: (c for c in self.model_configs if c.model_name == self.model_name), None ) if not config: - raise utils.ModelNotFoundError( + raise ModelNotFoundError( f"Model '{self.model_name}' not found in configuration" ) return config diff --git a/vec_inf/shared/_utils.py b/vec_inf/shared/_utils.py index 8b07caaa..771e67ce 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/shared/_utils.py @@ -11,57 +11,13 @@ from rich.table import Table from vec_inf.shared._config import ModelConfig - - -MODEL_READY_SIGNATURE = "INFO: Application startup complete." -CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") -SRC_DIR = str(Path(__file__).parent.parent) -LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/" - -# Maps model types to vLLM tasks -VLLM_TASK_MAP = { - "LLM": "generate", - "VLM": "generate", - "TEXT_EMBEDDING": "embed", - "REWARD_MODELING": "reward", -} - -# Required fields for model configuration -REQUIRED_FIELDS = { - "model_family", - "model_type", - "gpus_per_node", - "num_nodes", - "vocab_size", - "max_model_len", -} - -# Boolean fields for model configuration -BOOLEAN_FIELDS = { - "pipeline_parallelism", - "enforce_eager", - "enable_prefix_caching", - "enable_chunked_prefill", -} - - -class ModelConfigurationError(Exception): - """Raised when the model config or weights are missing or invalid.""" - - pass - - -class MissingRequiredFieldsError(ValueError): - """Raised when required fields are missing from the provided parameters.""" - - pass - - -class ModelNotFoundError(KeyError): - """Raised when the specified model name is not found in the configuration.""" - - pass - +from vec_inf.shared._vars import ( + CACHED_CONFIG, + MODEL_READY_SIGNATURE, + VLLM_TASK_MAP, + REQUIRED_FIELDS, + BOOLEAN_FIELDS, +) def run_bash_command(command: str) -> tuple[str, str]: """Run a bash command and return the output.""" diff --git a/vec_inf/shared/_vars.py b/vec_inf/shared/_vars.py new file mode 100644 index 00000000..c8047ccf --- /dev/null +++ b/vec_inf/shared/_vars.py @@ -0,0 +1,34 @@ +"""Global variables for the vector inference package.""" + +from pathlib import Path + +MODEL_READY_SIGNATURE = "INFO: Application startup complete." 
+CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") +SRC_DIR = str(Path(__file__).parent.parent) +LD_LIBRARY_PATH = "/scratch/ssd001/pkgs/cudnn-11.7-v8.5.0.96/lib/:/scratch/ssd001/pkgs/cuda-11.7/targets/x86_64-linux/lib/" + +# Maps model types to vLLM tasks +VLLM_TASK_MAP = { + "LLM": "generate", + "VLM": "generate", + "TEXT_EMBEDDING": "embed", + "REWARD_MODELING": "reward", +} + +# Required fields for model configuration +REQUIRED_FIELDS = { + "model_family", + "model_type", + "gpus_per_node", + "num_nodes", + "vocab_size", + "max_model_len", +} + +# Boolean fields for model configuration +BOOLEAN_FIELDS = { + "pipeline_parallelism", + "enforce_eager", + "enable_prefix_caching", + "enable_chunked_prefill", +} \ No newline at end of file From 479d03112799624112eadac21d4ee746ad995f60 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 10:30:27 -0400 Subject: [PATCH 18/52] Move json mode out of parent ListHelper class --- vec_inf/cli/_helper.py | 3 ++- vec_inf/shared/_helper.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 34993610..9f8e29c1 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -208,7 +208,8 @@ class CLIListHelper(ListHelper): """Helper class for handling model listing functionality.""" def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): - super().__init__(model_name, json_mode) + super().__init__(model_name) + self.json_mode = json_mode def format_single_model_output( self, config: ModelConfig diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index c76f1a42..9d4287e0 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -436,9 +436,8 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]: class ListHelper: """Helper class for handling model listing functionality.""" - def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): + def __init__(self, model_name: Optional[str] = None): self.model_name = model_name - self.json_mode = json_mode self.model_configs = utils.load_config() def get_single_model_config(self) -> ModelConfig: From 26eabbb0dad9a79308db1bc605e2ecb5c435a09b Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 11:45:43 -0400 Subject: [PATCH 19/52] Rename cli_kwargs to be more generic --- vec_inf/cli/_helper.py | 6 +++--- vec_inf/shared/_helper.py | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 9f8e29c1..0254c50f 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -19,8 +19,8 @@ class CLILaunchHelper(LaunchHelper): """CLI Helper class for handling launch information.""" - def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): - super().__init__(model_name, cli_kwargs) + def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): + super().__init__(model_name, kwargs) def _warn(self, message: str) -> None: """Warn the user about a potential issue.""" @@ -73,7 +73,7 @@ def format_table_output(self, job_id: str) -> Table: def post_launch_processing(self, output: str, console: Console) -> None: """Process and display launch output.""" - json_mode = bool(self.cli_kwargs.get("json_mode", False)) + json_mode = bool(self.kwargs.get("json_mode", False)) slurm_job_id = output.split(" ")[-1].strip().strip("\n") self.params["slurm_job_id"] = slurm_job_id job_json = Path( diff --git 
a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index 9d4287e0..e3bade1f 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -29,18 +29,18 @@ class LaunchHelper(ABC): """Helper class for handling inference server launch.""" - def __init__(self, model_name: str, cli_kwargs: Optional[dict[str, Any]]): + def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): """Initialize the model launcher. Parameters ---------- model_name: str Name of the model to launch - cli_kwargs: Optional[dict[str, Any]] + kwargs: Optional[dict[str, Any]] Optional launch keyword arguments to override default configuration """ self.model_name = model_name - self.cli_kwargs = cli_kwargs or {} + self.kwargs = kwargs or {} self.model_config = self._get_model_configuration() self.params = self._get_launch_params() @@ -60,7 +60,7 @@ def _get_model_configuration(self) -> ModelConfig: return config # If model config not found, check for path from CLI kwargs or use fallback - model_weights_parent_dir = self.cli_kwargs.get( + model_weights_parent_dir = self.kwargs.get( "model_weights_parent_dir", model_configs[0].model_weights_parent_dir if model_configs else None, ) @@ -100,11 +100,11 @@ def _get_launch_params(self) -> dict[str, Any]: # Process boolean fields for bool_field in BOOLEAN_FIELDS: - if self.cli_kwargs[bool_field]: + if self.kwargs[bool_field]: params[bool_field] = True # Merge other overrides - for key, value in self.cli_kwargs.items(): + for key, value in self.kwargs.items(): if value is not None and key not in [ "json_mode", *BOOLEAN_FIELDS, From 2e94d4a925c4841a028cf3a437f7b6906741a662 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 11:55:21 -0400 Subject: [PATCH 20/52] Move SlurmJobException to shared exceptions, ruff check/format and mypy fixes --- vec_inf/shared/_exceptions.py | 7 +++++++ vec_inf/shared/_helper.py | 5 +++-- vec_inf/shared/_utils.py | 4 +--- vec_inf/shared/_vars.py | 5 +++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/vec_inf/shared/_exceptions.py b/vec_inf/shared/_exceptions.py index 55148475..0bbbd914 100644 --- a/vec_inf/shared/_exceptions.py +++ b/vec_inf/shared/_exceptions.py @@ -1,5 +1,6 @@ """Exceptions for the vector inference package.""" + class ModelConfigurationError(Exception): """Raised when the model config or weights are missing or invalid.""" @@ -16,3 +17,9 @@ class ModelNotFoundError(KeyError): """Raised when the specified model name is not found in the configuration.""" pass + + +class SlurmJobError(RuntimeError): + """Raised when there's an error with a Slurm job.""" + + pass diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index e3bade1f..10dbff55 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -12,9 +12,10 @@ import vec_inf.shared._utils as utils from vec_inf.shared._config import ModelConfig from vec_inf.shared._exceptions import ( - ModelConfigurationError, MissingRequiredFieldsError, + ModelConfigurationError, ModelNotFoundError, + SlurmJobError, ) from vec_inf.shared._models import ModelStatus from vec_inf.shared._vars import ( @@ -286,7 +287,7 @@ def _get_status_info(self) -> dict[str, Union[str, None]]: status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" output, stderr = utils.run_bash_command(status_cmd) if stderr: - raise RuntimeError(f"Error: {stderr}") + raise SlurmJobError(f"Error: {stderr}") status_helper = StatusHelper(self.slurm_job_id, output, self.log_dir) return status_helper.status_info diff --git 
a/vec_inf/shared/_utils.py b/vec_inf/shared/_utils.py index 771e67ce..5ba5c26a 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/shared/_utils.py @@ -14,11 +14,9 @@ from vec_inf.shared._vars import ( CACHED_CONFIG, MODEL_READY_SIGNATURE, - VLLM_TASK_MAP, - REQUIRED_FIELDS, - BOOLEAN_FIELDS, ) + def run_bash_command(command: str) -> tuple[str, str]: """Run a bash command and return the output.""" process = subprocess.Popen( diff --git a/vec_inf/shared/_vars.py b/vec_inf/shared/_vars.py index c8047ccf..71e9e221 100644 --- a/vec_inf/shared/_vars.py +++ b/vec_inf/shared/_vars.py @@ -2,6 +2,7 @@ from pathlib import Path + MODEL_READY_SIGNATURE = "INFO: Application startup complete." CACHED_CONFIG = Path("/", "model-weights", "vec-inf-shared", "models.yaml") SRC_DIR = str(Path(__file__).parent.parent) @@ -13,7 +14,7 @@ "VLM": "generate", "TEXT_EMBEDDING": "embed", "REWARD_MODELING": "reward", -} +} # Required fields for model configuration REQUIRED_FIELDS = { @@ -31,4 +32,4 @@ "enforce_eager", "enable_prefix_caching", "enable_chunked_prefill", -} \ No newline at end of file +} From 56c4d4085f88c7e05f2631c296c9435d7275b767 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 14:47:16 -0400 Subject: [PATCH 21/52] Remove model name field for ListHelper, add additional check for bool fields in kwargs for launch, make format table private for CLI launch helper --- vec_inf/cli/_cli.py | 4 ++-- vec_inf/cli/_helper.py | 16 +++++++++------- vec_inf/shared/_helper.py | 13 +++++-------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 321ee5e0..227c5dcb 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -200,8 +200,8 @@ def shutdown(slurm_job_id: int) -> None: def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: """List all available models, or get default setup of a specific model.""" try: - list_helper = CLIListHelper(model_name, json_mode) - list_helper.process_list_command(CONSOLE) + list_helper = CLIListHelper(json_mode) + list_helper.process_list_command(CONSOLE, model_name) except click.ClickException as e: raise e except Exception as e: diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 0254c50f..b2679205 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -26,7 +26,7 @@ def _warn(self, message: str) -> None: """Warn the user about a potential issue.""" click.echo(click.style(f"Warning: {message}", fg="yellow"), err=True) - def format_table_output(self, job_id: str) -> Table: + def _format_table_output(self, job_id: str) -> Table: """Format output as rich Table.""" table = utils.create_table(key_title="Job Config", value_title="Value") @@ -89,7 +89,7 @@ def post_launch_processing(self, output: str, console: Console) -> None: if json_mode: click.echo(self.params) else: - table = self.format_table_output(slurm_job_id) + table = self._format_table_output(slurm_job_id) console.print(table) @@ -207,8 +207,8 @@ def display_metrics(self, table: Table, metrics: dict[str, float]) -> None: class CLIListHelper(ListHelper): """Helper class for handling model listing functionality.""" - def __init__(self, model_name: Optional[str] = None, json_mode: bool = False): - super().__init__(model_name) + def __init__(self, json_mode: bool = False): + super().__init__() self.json_mode = json_mode def format_single_model_output( @@ -267,12 +267,14 @@ def format_all_models_output(self) -> Union[list[str], list[Panel]]: return panels - def process_list_command(self, 
console: Console) -> None: + def process_list_command( + self, console: Console, model_name: Optional[str] = None + ) -> None: """Process the list command and display output.""" try: - if self.model_name: + if model_name: # Handle single model case - config = self.get_single_model_config() + config = self.get_single_model_config(model_name) output = self.format_single_model_output(config) if self.json_mode: click.echo(output) diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index 10dbff55..d7fc8817 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -101,7 +101,7 @@ def _get_launch_params(self) -> dict[str, Any]: # Process boolean fields for bool_field in BOOLEAN_FIELDS: - if self.kwargs[bool_field]: + if self.kwargs.get(bool_field) and self.kwargs[bool_field]: params[bool_field] = True # Merge other overrides @@ -437,17 +437,14 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]: class ListHelper: """Helper class for handling model listing functionality.""" - def __init__(self, model_name: Optional[str] = None): - self.model_name = model_name + def __init__(self): self.model_configs = utils.load_config() - def get_single_model_config(self) -> ModelConfig: + def get_single_model_config(self, model_name: str) -> ModelConfig: """Get configuration for a specific model.""" config = next( - (c for c in self.model_configs if c.model_name == self.model_name), None + (c for c in self.model_configs if c.model_name == model_name), None ) if not config: - raise ModelNotFoundError( - f"Model '{self.model_name}' not found in configuration" - ) + raise ModelNotFoundError(f"Model '{model_name}' not found in configuration") return config From 60f4a68695c34fbde3688f34a40871b9a86478da Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 14:56:36 -0400 Subject: [PATCH 22/52] Add API helpers inherited from shared helpers --- vec_inf/api/_helper.py | 63 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 vec_inf/api/_helper.py diff --git a/vec_inf/api/_helper.py b/vec_inf/api/_helper.py new file mode 100644 index 00000000..28cc7942 --- /dev/null +++ b/vec_inf/api/_helper.py @@ -0,0 +1,63 @@ +"""Helper classes for the API.""" + +import json +import warnings +from pathlib import Path +from typing import Any, Optional + +from vec_inf.api._models import LaunchResponse, ModelInfo, ModelType +from vec_inf.shared._helper import LaunchHelper, ListHelper + + +class APILaunchHelper(LaunchHelper): + """API Helper class for handling inference server launch.""" + + def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): + super().__init__(model_name, kwargs) + + def _warn(self, message: str) -> None: + """Warn the user about a potential issue.""" + warnings.warn(message, UserWarning, stacklevel=2) + + def post_launch_processing(self, command_output: str) -> LaunchResponse: + """Process and display launch output.""" + slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") + self.params["slurm_job_id"] = slurm_job_id + job_json = Path( + self.params["log_dir"], + f"{self.model_name}.{slurm_job_id}", + f"{self.model_name}.{slurm_job_id}.json", + ) + job_json.parent.mkdir(parents=True, exist_ok=True) + job_json.touch(exist_ok=True) + + with job_json.open("w") as file: + json.dump(self.params, file, indent=4) + + return LaunchResponse( + slurm_job_id=slurm_job_id, + model_name=self.model_name, + config=self.params, + raw_output=command_output, + ) + + +class APIListHelper(ListHelper): + """API 
Helper class for handling model listing.""" + + def __init__(self): + super().__init__() + + def get_all_models(self) -> list[ModelInfo]: + """Get all available models.""" + available_models = [] + for config in self.model_configs: + info = ModelInfo( + name=config.model_name, + family=config.model_family, + variant=config.model_variant, + type=ModelType(config.model_type), + config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), + ) + available_models.append(info) + return available_models From 2c2b1c7e977cf9e727755efd85e747d6f3e2467e Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 15:59:34 -0400 Subject: [PATCH 23/52] Use ModelStatus data class for util functions --- vec_inf/shared/_utils.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/vec_inf/shared/_utils.py b/vec_inf/shared/_utils.py index 5ba5c26a..007721df 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/shared/_utils.py @@ -11,6 +11,7 @@ from rich.table import Table from vec_inf.shared._config import ModelConfig +from vec_inf.shared._models import ModelStatus from vec_inf.shared._vars import ( CACHED_CONFIG, MODEL_READY_SIGNATURE, @@ -70,17 +71,17 @@ def read_slurm_log( def is_server_running( slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] -) -> Union[str, tuple[str, str]]: +) -> Union[str, ModelStatus, tuple[ModelStatus, str]]: """Check if a model is ready to serve requests.""" log_content = read_slurm_log(slurm_job_name, slurm_job_id, "err", log_dir) if isinstance(log_content, str): return log_content - status: Union[str, tuple[str, str]] = "LAUNCHING" + status: Union[str, tuple[ModelStatus, str]] = ModelStatus.LAUNCHING for line in log_content: if "error" in line.lower(): - status = ("FAILED", line.strip("\n")) + status = (ModelStatus.FAILED, line.strip("\n")) if MODEL_READY_SIGNATURE in line: status = "RUNNING" @@ -99,21 +100,21 @@ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) def model_health_check( slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str] -) -> tuple[str, Union[str, int]]: +) -> tuple[ModelStatus, Union[str, int]]: """Check the health of a running model on the cluster.""" base_url = get_base_url(slurm_job_name, slurm_job_id, log_dir) if not base_url.startswith("http"): - return ("FAILED", base_url) + return (ModelStatus.FAILED, base_url) health_check_url = base_url.replace("v1", "health") try: response = requests.get(health_check_url) # Check if the request was successful if response.status_code == 200: - return ("READY", response.status_code) - return ("FAILED", response.status_code) + return (ModelStatus.READY, response.status_code) + return (ModelStatus.FAILED, response.status_code) except requests.exceptions.RequestException as e: - return ("FAILED", str(e)) + return (ModelStatus.FAILED, str(e)) def create_table( From 362b9c766d7cbb637b0f20ed901e2e970fbda85d Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 16:54:40 -0400 Subject: [PATCH 24/52] Rename helper for CLI metrics command --- vec_inf/cli/_cli.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 227c5dcb..6c50c24e 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -216,27 +216,28 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: """Stream real-time performance metrics from the model endpoint.""" try: - helper 
= CLIMetricsHelper(slurm_job_id, log_dir)
+        metrics_helper = CLIMetricsHelper(slurm_job_id, log_dir)
 
         # Check if metrics URL is ready
-        if not helper.metrics_url.startswith("http"):
+        if not metrics_helper.metrics_url.startswith("http"):
             table = utils.create_table("Metric", "Value")
-            helper.display_failed_metrics(
-                table, f"Metrics endpoint unavailable - {helper.metrics_url}"
+            metrics_helper.display_failed_metrics(
+                table,
+                f"Metrics endpoint unavailable or server not ready - {metrics_helper.metrics_url}",
             )
             CONSOLE.print(table)
             return
 
         with Live(refresh_per_second=1, console=CONSOLE) as live:
             while True:
-                metrics = helper.fetch_metrics()
+                metrics = metrics_helper.fetch_metrics()
                 table = utils.create_table("Metric", "Value")
 
                 if isinstance(metrics, str):
                     # Show status information if metrics aren't available
-                    helper.display_failed_metrics(table, metrics)
+                    metrics_helper.display_failed_metrics(table, metrics)
                 else:
-                    helper.display_metrics(table, metrics)
+                    metrics_helper.display_metrics(table, metrics)
 
                 live.update(table)
                 time.sleep(2)

From 73f1196e6e91e891b6b618c393cbd2ee41d89d0b Mon Sep 17 00:00:00 2001
From: Marshall Wang
Date: Wed, 2 Apr 2025 17:27:57 -0400
Subject: [PATCH 25/52] Refactored API code to use shared helpers, moved
 exceptions to shared exceptions file, removed redundant utils functions,
 marked helper and models private, updated model params and data types to
 accommodate latest changes in vec inf

---
 vec_inf/api/__init__.py               |   6 +-
 vec_inf/api/_helper.py                |   2 +-
 vec_inf/api/{models.py => _models.py} |  21 ++-
 vec_inf/api/client.py                 | 121 +++++++---------
 vec_inf/api/utils.py                  | 197 --------------------------
 vec_inf/shared/_exceptions.py         |  12 ++
 6 files changed, 85 insertions(+), 274 deletions(-)
 rename vec_inf/api/{models.py => _models.py} (79%)
 delete mode 100644 vec_inf/api/utils.py

diff --git a/vec_inf/api/__init__.py b/vec_inf/api/__init__.py
index 6d18011c..7f908b37 100644
--- a/vec_inf/api/__init__.py
+++ b/vec_inf/api/__init__.py
@@ -5,8 +5,7 @@
 users direct control over the lifecycle of inference servers via python scripts.
""" -from vec_inf.api.client import VecInfClient -from vec_inf.api.models import ( +from vec_inf.api._models import ( LaunchOptions, LaunchOptionsDict, LaunchResponse, @@ -14,7 +13,8 @@ ModelInfo, StatusResponse, ) -from vec_inf.shared.models import ModelStatus, ModelType +from vec_inf.api.client import VecInfClient +from vec_inf.shared._models import ModelStatus, ModelType __all__ = [ diff --git a/vec_inf/api/_helper.py b/vec_inf/api/_helper.py index 28cc7942..3abc9149 100644 --- a/vec_inf/api/_helper.py +++ b/vec_inf/api/_helper.py @@ -35,7 +35,7 @@ def post_launch_processing(self, command_output: str) -> LaunchResponse: json.dump(self.params, file, indent=4) return LaunchResponse( - slurm_job_id=slurm_job_id, + slurm_job_id=int(slurm_job_id), model_name=self.model_name, config=self.params, raw_output=command_output, diff --git a/vec_inf/api/models.py b/vec_inf/api/_models.py similarity index 79% rename from vec_inf/api/models.py rename to vec_inf/api/_models.py index 96876f4d..80a100d1 100644 --- a/vec_inf/api/models.py +++ b/vec_inf/api/_models.py @@ -9,7 +9,7 @@ from typing_extensions import NotRequired -from vec_inf.shared.models import ModelStatus, ModelType +from vec_inf.shared._models import ModelStatus, ModelType @dataclass @@ -27,7 +27,7 @@ class ModelInfo: class LaunchResponse: """Response from launching a model.""" - slurm_job_id: str + slurm_job_id: int model_name: str config: Dict[str, Any] raw_output: str = field(repr=False) @@ -37,7 +37,7 @@ class LaunchResponse: class StatusResponse: """Response from checking a model's status.""" - slurm_job_id: str + slurm_job_id: int model_name: str status: ModelStatus raw_output: str = field(repr=False) @@ -50,11 +50,10 @@ class StatusResponse: class MetricsResponse: """Response from retrieving model metrics.""" - slurm_job_id: str + slurm_job_id: int model_name: str - metrics: Dict[str, str] + metrics: Dict[str, float] timestamp: float - raw_output: str = field(repr=False) @dataclass @@ -65,6 +64,10 @@ class LaunchOptions: model_variant: Optional[str] = None max_model_len: Optional[int] = None max_num_seqs: Optional[int] = None + gpu_memory_utilization: Optional[float] = None + enable_prefix_caching: Optional[bool] = None + enable_chunked_prefill: Optional[bool] = None + max_num_batched_tokens: Optional[int] = None partition: Optional[str] = None num_nodes: Optional[int] = None num_gpus: Optional[int] = None @@ -76,6 +79,7 @@ class LaunchOptions: log_dir: Optional[str] = None model_weights_parent_dir: Optional[str] = None pipeline_parallelism: Optional[bool] = None + compilation_config: Optional[str] = None enforce_eager: Optional[bool] = None @@ -86,6 +90,10 @@ class LaunchOptionsDict(TypedDict): model_variant: NotRequired[Optional[str]] max_model_len: NotRequired[Optional[int]] max_num_seqs: NotRequired[Optional[int]] + gpu_memory_utilization: NotRequired[Optional[float]] + enable_prefix_caching: NotRequired[Optional[bool]] + enable_chunked_prefill: NotRequired[Optional[bool]] + max_num_batched_tokens: NotRequired[Optional[int]] partition: NotRequired[Optional[str]] num_nodes: NotRequired[Optional[int]] num_gpus: NotRequired[Optional[int]] @@ -97,4 +105,5 @@ class LaunchOptionsDict(TypedDict): log_dir: NotRequired[Optional[str]] model_weights_parent_dir: NotRequired[Optional[str]] pipeline_parallelism: NotRequired[Optional[bool]] + compilation_config: NotRequired[Optional[str]] enforce_eager: NotRequired[Optional[bool]] diff --git a/vec_inf/api/client.py b/vec_inf/api/client.py index c5e25d59..1f53e8a9 100644 --- 
a/vec_inf/api/client.py +++ b/vec_inf/api/client.py @@ -5,28 +5,28 @@ """ import time -from typing import List, Optional +from typing import Any, Optional, cast -from vec_inf.api.models import ( +import requests + +from vec_inf.api._helper import APILaunchHelper, APIListHelper +from vec_inf.api._models import ( LaunchOptions, LaunchResponse, MetricsResponse, ModelInfo, StatusResponse, ) -from vec_inf.api.utils import ( +from vec_inf.shared._config import ModelConfig +from vec_inf.shared._exceptions import ( APIError, ModelNotFoundError, ServerError, SlurmJobError, - get_metrics, - get_model_status, - load_models, ) -from vec_inf.shared.config import ModelConfig -from vec_inf.shared.helper import LaunchHelper -from vec_inf.shared.models import ModelStatus, ModelType -from vec_inf.shared.utils import run_bash_command +from vec_inf.shared._helper import MetricsHelper, StatusHelper +from vec_inf.shared._models import ModelStatus +from vec_inf.shared._utils import run_bash_command, shutdown_model class VecInfClient: @@ -53,12 +53,12 @@ def __init__(self) -> None: """Initialize the Vector Inference client.""" pass - def list_models(self) -> List[ModelInfo]: + def list_models(self) -> list[ModelInfo]: """List all available models. Returns ------- - List[ModelInfo] + list[ModelInfo] ModelInfo objects containing information about available models. Raises @@ -68,20 +68,8 @@ def list_models(self) -> List[ModelInfo]: """ try: - model_configs = load_models() - result = [] - - for config in model_configs: - info = ModelInfo( - name=config.model_name, - family=config.model_family, - variant=config.model_variant, - type=ModelType(config.model_type), - config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), - ) - result.append(info) - - return result + list_helper = APIListHelper() + return list_helper.get_all_models() except Exception as e: raise APIError(f"Failed to list models: {str(e)}") from e @@ -107,12 +95,8 @@ def get_model_config(self, model_name: str) -> ModelConfig: """ try: - model_configs = load_models() - for config in model_configs: - if config.model_name == model_name: - return config - - raise ModelNotFoundError(f"Model '{model_name}' not found") + list_helper = APIListHelper() + return list_helper.get_single_model_config(model_name) except ModelNotFoundError: raise except Exception as e: @@ -144,26 +128,24 @@ def launch_model( """ try: # Convert LaunchOptions to dictionary if provided - options_dict = None + options_dict: dict[str, Any] = {} if options: options_dict = {k: v for k, v in vars(options).items() if v is not None} - # Create and use the shared ModelLauncher - launcher = LaunchHelper(model_name, options_dict) + # Create and use the API Launch Helper + launch_helper = APILaunchHelper(model_name, options_dict) - # Launch the model - job_id, config_dict, _ = launcher.launch() + # Set environment variables + launch_helper.set_env_vars() - # Get the raw output - status_cmd = f"scontrol show job {job_id} --oneliner" - raw_output, _ = run_bash_command(status_cmd) + # Build and execute the launch command + launch_command = launch_helper.build_launch_command() + command_output, stderr = run_bash_command(launch_command) + if stderr: + raise SlurmJobError(f"Error: {stderr}") + + return launch_helper.post_launch_processing(command_output) - return LaunchResponse( - slurm_job_id=job_id, - model_name=model_name, - config=config_dict, - raw_output=raw_output, - ) except ValueError as e: if "not found in configuration" in str(e): raise ModelNotFoundError(str(e)) from e @@ -172,7 
+154,7 @@ def launch_model( raise APIError(f"Failed to launch model: {str(e)}") from e def get_status( - self, slurm_job_id: str, log_dir: Optional[str] = None + self, slurm_job_id: int, log_dir: Optional[str] = None ) -> StatusResponse: """Get the status of a running model. @@ -197,17 +179,20 @@ def get_status( """ try: status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output, _ = run_bash_command(status_cmd) + output, stderr = run_bash_command(status_cmd) + if stderr: + raise SlurmJobError(f"Error: {stderr}") - status, status_info = get_model_status(slurm_job_id, log_dir) + status_helper = StatusHelper(slurm_job_id, output, log_dir) + status_helper.process_job_state() return StatusResponse( slurm_job_id=slurm_job_id, - model_name=status_info["model_name"], - status=status, - base_url=status_info["base_url"], - pending_reason=status_info["pending_reason"], - failed_reason=status_info["failed_reason"], + model_name=cast(str, status_helper.status_info["model_name"]), + status=cast(ModelStatus, status_helper.status_info["status"]), + base_url=status_helper.status_info["base_url"], + pending_reason=status_helper.status_info["pending_reason"], + failed_reason=status_helper.status_info["failed_reason"], raw_output=output, ) except SlurmJobError: @@ -216,7 +201,7 @@ def get_status( raise APIError(f"Failed to get status: {str(e)}") from e def get_metrics( - self, slurm_job_id: str, log_dir: Optional[str] = None + self, slurm_job_id: int, log_dir: Optional[str] = None ) -> MetricsResponse: """Get the performance metrics of a running model. @@ -241,27 +226,30 @@ def get_metrics( """ try: - # First check if the job exists and get the job name - status_response = self.get_status(slurm_job_id, log_dir) + metrics_helper = MetricsHelper(slurm_job_id, log_dir) - # Get metrics - metrics = get_metrics( - status_response.model_name, int(slurm_job_id), log_dir - ) + if not metrics_helper.metrics_url.startswith("http"): + raise ServerError( + f"Metrics endpoint unavailable or server not ready - {metrics_helper.metrics_url}" + ) + + metrics = metrics_helper.fetch_metrics() + + if isinstance(metrics, str): + raise requests.RequestException(metrics) return MetricsResponse( slurm_job_id=slurm_job_id, - model_name=status_response.model_name, + model_name=cast(str, metrics_helper.status_info["model_name"]), metrics=metrics, timestamp=time.time(), - raw_output="", # No raw output needed for metrics ) except SlurmJobError: raise except Exception as e: raise APIError(f"Failed to get metrics: {str(e)}") from e - def shutdown_model(self, slurm_job_id: str) -> bool: + def shutdown_model(self, slurm_job_id: int) -> bool: """Shutdown a running model. Parameters @@ -278,15 +266,14 @@ def shutdown_model(self, slurm_job_id: str) -> bool: APIError: If there was an error shutting down the model. 
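For reference, a minimal sketch of how these client methods are meant to chain together; the model name and option values are illustrative assumptions, and wait_until_ready is assumed to return a StatusResponse as in the basic usage example:

from vec_inf.api import LaunchOptions, VecInfClient

client = VecInfClient()
response = client.launch_model("Meta-Llama-3.1-8B-Instruct", LaunchOptions(num_gpus=1))
job_id = response.slurm_job_id  # an int after this refactor

status = client.wait_until_ready(job_id)  # waits for the server to reach READY
print(status.base_url)

metrics = client.get_metrics(job_id)  # MetricsResponse; .metrics is dict[str, float]
for name, value in metrics.metrics.items():
    print(f"{name}: {value}")

client.shutdown_model(job_id)  # cancels the underlying Slurm job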
""" try: - shutdown_cmd = f"scancel {slurm_job_id}" - run_bash_command(shutdown_cmd) + shutdown_model(slurm_job_id) return True except Exception as e: raise APIError(f"Failed to shutdown model: {str(e)}") from e def wait_until_ready( self, - slurm_job_id: str, + slurm_job_id: int, timeout_seconds: int = 1800, poll_interval_seconds: int = 10, log_dir: Optional[str] = None, diff --git a/vec_inf/api/utils.py b/vec_inf/api/utils.py deleted file mode 100644 index ae20a13a..00000000 --- a/vec_inf/api/utils.py +++ /dev/null @@ -1,197 +0,0 @@ -"""Utility functions for the Vector Inference API.""" - -from typing import Any, Optional - -import requests - -from vec_inf.shared.config import ModelConfig -from vec_inf.shared.models import ModelStatus -from vec_inf.shared.utils import ( - MODEL_READY_SIGNATURE, - get_base_url, - load_config, - read_slurm_log, - run_bash_command, -) - - -class APIError(Exception): - """Base exception for API errors.""" - - pass - - -class ModelNotFoundError(APIError): - """Exception raised when a model is not found.""" - - pass - - -class SlurmJobError(APIError): - """Exception raised when there's an error with a Slurm job.""" - - pass - - -class ServerError(APIError): - """Exception raised when there's an error with the inference server.""" - - pass - - -def load_models() -> list[ModelConfig]: - """Load model configurations.""" - return load_config() - - -def get_model_status( - slurm_job_id: str, log_dir: Optional[str] = None -) -> tuple[ModelStatus, dict[str, Any]]: - """Get the status of a model. - - Parameters - ---------- - slurm_job_id: str - The Slurm job ID - log_dir: str, optional - Optional path to Slurm log directory - - Returns - ------- - tuple[ModelStatus, dict[str, Any]] - Model status and status information - - """ - status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output, _ = run_bash_command(status_cmd) - - # Check if job exists - if "Invalid job id specified" in output: - raise SlurmJobError(f"Job {slurm_job_id} not found") - - # Extract job information - try: - job_name = output.split(" ")[1].split("=")[1] - job_state = output.split(" ")[9].split("=")[1] - except IndexError as err: - raise SlurmJobError(f"Could not parse job status for {slurm_job_id}") from err - - status_info = { - "model_name": job_name, - "base_url": None, - "pending_reason": None, - "failed_reason": None, - } - - # Process based on job state - if job_state == "PENDING": - try: - status_info["pending_reason"] = output.split(" ")[10].split("=")[1] - except IndexError: - status_info["pending_reason"] = "Unknown pending reason" - return ModelStatus.PENDING, status_info - - if job_state in ["CANCELLED", "FAILED", "TIMEOUT", "PREEMPTED"]: - return ModelStatus.SHUTDOWN, status_info - - if job_state == "RUNNING": - return check_server_status(job_name, slurm_job_id, log_dir, status_info) - - # Unknown state - status_info["failed_reason"] = f"Unknown job state: {job_state}" - return ModelStatus.FAILED, status_info - - -def check_server_status( - job_name: str, job_id: str, log_dir: Optional[str], status_info: dict[str, Any] -) -> tuple[ModelStatus, dict[str, Any]]: - """Check the status of a running inference server.""" - # Initialize default status - final_status = ModelStatus.LAUNCHING - log_content = read_slurm_log(job_name, int(job_id), "err", log_dir) - - # Handle initial log reading error - if isinstance(log_content, str): - status_info["failed_reason"] = log_content - return ModelStatus.FAILED, status_info - - # Process log content - for line in log_content: - line_lower 
= line.lower() - - # Check for error indicators - if "error" in line_lower: - status_info["failed_reason"] = line.strip("\n") - final_status = ModelStatus.FAILED - break - - # Check for server ready signal - if MODEL_READY_SIGNATURE in line: - base_url = get_base_url(job_name, int(job_id), log_dir) - - # Validate base URL - if not isinstance(base_url, str) or not base_url.startswith("http"): - status_info["failed_reason"] = f"Invalid base URL: {base_url}" - final_status = ModelStatus.FAILED - break - - status_info["base_url"] = base_url - final_status = _perform_health_check(base_url, status_info) - break # Stop processing after first ready signature - - return final_status, status_info - - -def _perform_health_check(base_url: str, status_info: dict[str, Any]) -> ModelStatus: - """Execute health check and return appropriate status.""" - health_check_url = base_url.replace("v1", "health") - - try: - response = requests.get(health_check_url) - if response.status_code == 200: - return ModelStatus.READY - - status_info["failed_reason"] = ( - f"Health check failed with status code {response.status_code}" - ) - except requests.exceptions.RequestException as e: - status_info["failed_reason"] = f"Health check request error: {str(e)}" - - return ModelStatus.FAILED - - -def get_metrics(job_name: str, job_id: int, log_dir: Optional[str]) -> dict[str, str]: - """Get the latest metrics for a model. - - Parameters - ---------- - job_name: str - The name of the Slurm job - job_id: int - The Slurm job ID - log_dir: str, optional - Optional path to Slurm log directory - - Returns - ------- - dict[str, str] - Dictionary of metrics or empty dict if not found - - """ - log_content = read_slurm_log(job_name, job_id, "out", log_dir) - if isinstance(log_content, str): - return {} - - # Find the latest metrics entry - metrics = {} - for line in reversed(log_content): - if "Avg prompt throughput" in line: - # Parse metrics from the line - metrics_str = line.split("] ")[1].strip().strip(".") - metrics_list = metrics_str.split(", ") - for metric in metrics_list: - key, value = metric.split(": ") - metrics[key] = value - break - - return metrics diff --git a/vec_inf/shared/_exceptions.py b/vec_inf/shared/_exceptions.py index 0bbbd914..296cec16 100644 --- a/vec_inf/shared/_exceptions.py +++ b/vec_inf/shared/_exceptions.py @@ -23,3 +23,15 @@ class SlurmJobError(RuntimeError): """Raised when there's an error with a Slurm job.""" pass + + +class APIError(Exception): + """Base exception for API errors.""" + + pass + + +class ServerError(Exception): + """Exception raised when there's an error with the inference server.""" + + pass From 19d14b5e03edf6a1c35b000181ec336d679239e5 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 17:54:12 -0400 Subject: [PATCH 26/52] Fix CLI and shared utils tests --- tests/vec_inf/cli/test_cli.py | 37 +++++++++++---------- tests/vec_inf/{cli => shared}/test_utils.py | 12 +++---- 2 files changed, 26 insertions(+), 23 deletions(-) rename tests/vec_inf/{cli => shared}/test_utils.py (95%) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index 59b60973..7d61f7fc 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -229,7 +229,7 @@ def base_patches(test_paths, mock_truediv, debug_helper): patch("pathlib.Path.iterdir", return_value=[]), # Mock empty directory listing patch("json.dump"), patch("pathlib.Path.touch"), - patch("vec_inf.shared.utils.Path", return_value=test_paths["weights_dir"]), + patch("vec_inf.shared._utils.Path", 
return_value=test_paths["weights_dir"]), patch( "pathlib.Path.home", return_value=Path("/home/user") ), # Mock home directory @@ -251,7 +251,7 @@ def test_launch_command_success(runner, mock_launch_output, path_exists, debug_h test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.shared.utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -282,7 +282,7 @@ def test_launch_command_with_json_output( """Test JSON output format for launch command.""" test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.shared.utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -340,7 +340,7 @@ def test_launch_command_model_not_in_config_with_weights( for patch_obj in base_patches: stack.enter_context(patch_obj) # Apply specific patches for this test - mock_run = stack.enter_context(patch("vec_inf.shared.utils.run_bash_command")) + mock_run = stack.enter_context(patch("vec_inf.shared._utils.run_bash_command")) stack.enter_context(patch("pathlib.Path.exists", new=custom_path_exists)) expected_job_id = "14933051" @@ -381,7 +381,7 @@ def custom_path_exists(p): # Mock Path to return the weights dir path stack.enter_context( - patch("vec_inf.shared.utils.Path", return_value=test_paths["weights_dir"]) + patch("vec_inf.shared._utils.Path", return_value=test_paths["weights_dir"]) ) result = runner.invoke(cli, ["launch", "unknown-model"]) @@ -417,9 +417,9 @@ def test_metrics_command_pending_server( ): """Test metrics command when server is pending.""" with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.cli._utils.get_base_url", return_value="URL NOT FOUND"), + patch("vec_inf.shared._utils.get_base_url", return_value="URL NOT FOUND"), ): job_id = 12345 mock_run.return_value = (mock_status_output(job_id, "PENDING"), "") @@ -431,7 +431,7 @@ def test_metrics_command_pending_server( assert "Server State" in result.output assert "PENDING" in result.output assert ( - "Metrics endpoint unavailable - Pending resources for server" + "Metrics endpoint unavailable or server not ready - Pending" in result.output ) @@ -441,9 +441,9 @@ def test_metrics_command_server_not_ready( ): """Test metrics command when server is running but not ready.""" with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.cli._utils.get_base_url", return_value="Server not ready"), + patch("vec_inf.shared._utils.get_base_url", return_value="Server not ready"), ): job_id = 12345 mock_run.return_value = (mock_status_output(job_id, "RUNNING"), "") @@ -454,10 +454,13 @@ def test_metrics_command_server_not_ready( assert result.exit_code == 0 assert "Server State" in result.output assert "RUNNING" in result.output - assert "Server not ready" in result.output + assert ( + "Metrics endpoint unavailable or server not ready - Server not" + in result.output + ) -@patch("vec_inf.cli._helper.requests.get") +@patch("requests.get") def test_metrics_command_server_ready( mock_get, 
runner, mock_status_output, path_exists, debug_helper, apply_base_patches ): @@ -478,9 +481,9 @@ def test_metrics_command_server_ready( mock_response.status_code = 200 with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.cli._utils.get_base_url", return_value="http://test:8000/v1"), + patch("vec_inf.shared._utils.get_base_url", return_value="http://test:8000/v1"), patch("time.sleep", side_effect=KeyboardInterrupt), # Break the infinite loop ): job_id = 12345 @@ -496,7 +499,7 @@ def test_metrics_command_server_ready( assert "50.0%" in result.output # 0.5 converted to percentage -@patch("vec_inf.cli._helper.requests.get") +@patch("requests.get") def test_metrics_command_request_failed( mock_get, runner, mock_status_output, path_exists, debug_helper, apply_base_patches ): @@ -504,9 +507,9 @@ def test_metrics_command_request_failed( mock_get.side_effect = requests.exceptions.RequestException("Connection refused") with ( - patch("vec_inf.cli._utils.run_bash_command") as mock_run, + patch("vec_inf.shared._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.cli._utils.get_base_url", return_value="http://test:8000/v1"), + patch("vec_inf.shared._utils.get_base_url", return_value="http://test:8000/v1"), patch("time.sleep", side_effect=KeyboardInterrupt), # Break the infinite loop ): job_id = 12345 diff --git a/tests/vec_inf/cli/test_utils.py b/tests/vec_inf/shared/test_utils.py similarity index 95% rename from tests/vec_inf/cli/test_utils.py rename to tests/vec_inf/shared/test_utils.py index 1326f8ce..9fea9c3a 100644 --- a/tests/vec_inf/cli/test_utils.py +++ b/tests/vec_inf/shared/test_utils.py @@ -6,7 +6,7 @@ import pytest import requests -from vec_inf.shared.utils import ( +from vec_inf.shared._utils import ( MODEL_READY_SIGNATURE, create_table, get_base_url, @@ -77,7 +77,7 @@ def test_read_slurm_log_not_found(): ) def test_is_server_running_statuses(log_content, expected): """Test that is_server_running returns the correct status.""" - with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: mock_read.return_value = log_content result = is_server_running("test_job", 123, None) assert result == expected @@ -86,7 +86,7 @@ def test_is_server_running_statuses(log_content, expected): def test_get_base_url_found(): """Test that get_base_url returns the correct base URL.""" test_dict = {"server_address": "http://localhost:8000"} - with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: mock_read.return_value = test_dict result = get_base_url("test_job", 123, None) assert result == "http://localhost:8000" @@ -94,7 +94,7 @@ def test_get_base_url_found(): def test_get_base_url_not_found(): """Test get_base_url when URL is not found in logs.""" - with patch("vec_inf.shared.utils.read_slurm_log") as mock_read: + with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: mock_read.return_value = {"random_key": "123"} result = get_base_url("test_job", 123, None) assert result == "URL NOT FOUND" @@ -110,7 +110,7 @@ def test_get_base_url_not_found(): ) def test_model_health_check(url, status_code, expected): """Test model_health_check with various scenarios.""" - with patch("vec_inf.shared.utils.get_base_url") as mock_url: + with patch("vec_inf.shared._utils.get_base_url") 
as mock_url: mock_url.return_value = url if url.startswith("http"): with patch("requests.get") as mock_get: @@ -125,7 +125,7 @@ def test_model_health_check(url, status_code, expected): def test_model_health_check_request_exception(): """Test model_health_check when request raises an exception.""" with ( - patch("vec_inf.shared.utils.get_base_url") as mock_url, + patch("vec_inf.shared._utils.get_base_url") as mock_url, patch("requests.get") as mock_get, ): mock_url.return_value = "http://localhost:8000" From df256cf413a1a1d68dfb1ef8a6a52d31297cca7e Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 21:57:20 -0400 Subject: [PATCH 27/52] Fix API tests, use pathlib instead of os.path --- tests/vec_inf/api/test_client.py | 4 ++-- tests/vec_inf/api/test_examples.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/api/test_client.py index cb26674e..95c69eb4 100644 --- a/tests/vec_inf/api/test_client.py +++ b/tests/vec_inf/api/test_client.py @@ -68,11 +68,11 @@ def test_launch_model(mock_model_config, mock_launch_output): with ( patch( - "vec_inf.shared.utils.run_bash_command", + "vec_inf.shared._utils.run_bash_command", return_value=(mock_launch_output, ""), ), patch( - "vec_inf.shared.utils.parse_launch_output", return_value=("12345678", {}) + "vec_inf.shared._utils.parse_launch_output", return_value=("12345678", {}) ), ): # Create a mock response diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index 967b8de1..e43b3e98 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -1,6 +1,6 @@ """Tests to verify the API examples function properly.""" -import os +from pathlib import Path from unittest.mock import MagicMock, patch import pytest @@ -44,12 +44,12 @@ def mock_client(): @pytest.mark.skipif( - not os.path.exists(os.path.join("examples", "api", "basic_usage.py")), + not Path("../../../examples/api/basic_usage.py").exists(), reason="Example file not found", ) def test_api_usage_example(): """Test the basic API usage example.""" - example_path = os.path.join("examples", "api", "basic_usage.py") + example_path = Path("../../../examples/api/basic_usage.py") # Create a mock client mock_client = MagicMock(spec=VecInfClient) @@ -77,7 +77,7 @@ def test_api_usage_example(): with ( patch("vec_inf.api.VecInfClient", return_value=mock_client), patch("builtins.print"), - open(example_path) as f, + example_path.open() as f, ): exec(f.read()) From 930c999c78be993c68bc82e7952cbb41bf47c5cb Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 22:04:22 -0400 Subject: [PATCH 28/52] Fix relative path in test examples --- tests/vec_inf/api/test_examples.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index e43b3e98..4c069c7c 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -44,12 +44,12 @@ def mock_client(): @pytest.mark.skipif( - not Path("../../../examples/api/basic_usage.py").exists(), + not (Path(__file__).parent.parent.parent.parent / "examples" / "api" / "basic_usage.py").exists(), reason="Example file not found", ) def test_api_usage_example(): """Test the basic API usage example.""" - example_path = Path("../../../examples/api/basic_usage.py") + example_path = Path(__file__).parent.parent.parent.parent / "examples" / "api" / "basic_usage.py" # Create a mock client mock_client = 
MagicMock(spec=VecInfClient) From 4ca3b10a544749ca86aca7e7153669843334f665 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 2 Apr 2025 22:04:37 -0400 Subject: [PATCH 29/52] Fix import tests --- tests/test_imports.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/test_imports.py b/tests/test_imports.py index e5507600..5bc2808a 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -1,17 +1,35 @@ """Test the imports of the vec_inf package.""" import unittest +import pytest class TestVecInfImports(unittest.TestCase): """Test the imports of the vec_inf package.""" - def test_import_cli_modules(self): - """Test the imports of the vec_inf.cli modules.""" + + def test_imports(self): + """Test that all modules can be imported.""" try: + # CLI imports + import vec_inf.cli import vec_inf.cli._cli - import vec_inf.cli._config import vec_inf.cli._helper - import vec_inf.cli._utils # noqa: F401 + + # API imports + import vec_inf.api + import vec_inf.api.client + import vec_inf.api._helper + import vec_inf.api._models + + # Shared imports + import vec_inf.shared + import vec_inf.shared._config + import vec_inf.shared._exceptions + import vec_inf.shared._helper + import vec_inf.shared._models + import vec_inf.shared._utils + import vec_inf.shared._vars + except ImportError as e: - self.fail(f"Import failed: {e}") + pytest.fail(f"Import failed: {e}") From b28eba1d605db22559ebd7d7f5164f2e15cf5cb8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Apr 2025 02:04:50 +0000 Subject: [PATCH 30/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_imports.py | 11 +++++------ tests/vec_inf/api/test_examples.py | 14 ++++++++++++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/tests/test_imports.py b/tests/test_imports.py index 5bc2808a..22ca86ef 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -1,26 +1,25 @@ """Test the imports of the vec_inf package.""" import unittest + import pytest class TestVecInfImports(unittest.TestCase): """Test the imports of the vec_inf package.""" - def test_imports(self): """Test that all modules can be imported.""" try: # CLI imports - import vec_inf.cli - import vec_inf.cli._cli - import vec_inf.cli._helper - # API imports import vec_inf.api - import vec_inf.api.client import vec_inf.api._helper import vec_inf.api._models + import vec_inf.api.client + import vec_inf.cli + import vec_inf.cli._cli + import vec_inf.cli._helper # Shared imports import vec_inf.shared diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/api/test_examples.py index 4c069c7c..f599a114 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/api/test_examples.py @@ -44,12 +44,22 @@ def mock_client(): @pytest.mark.skipif( - not (Path(__file__).parent.parent.parent.parent / "examples" / "api" / "basic_usage.py").exists(), + not ( + Path(__file__).parent.parent.parent.parent + / "examples" + / "api" + / "basic_usage.py" + ).exists(), reason="Example file not found", ) def test_api_usage_example(): """Test the basic API usage example.""" - example_path = Path(__file__).parent.parent.parent.parent / "examples" / "api" / "basic_usage.py" + example_path = ( + Path(__file__).parent.parent.parent.parent + / "examples" + / "api" + / "basic_usage.py" + ) # Create a mock client mock_client = MagicMock(spec=VecInfClient) From 
c88084cd233a07dfa65c5d03d58adc5806871697 Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 2 Apr 2025 22:25:39 -0400 Subject: [PATCH 31/52] ruff fix --- tests/test_imports.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_imports.py b/tests/test_imports.py index 22ca86ef..2f91d9af 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -11,12 +11,13 @@ class TestVecInfImports(unittest.TestCase): def test_imports(self): """Test that all modules can be imported.""" try: - # CLI imports # API imports import vec_inf.api import vec_inf.api._helper import vec_inf.api._models import vec_inf.api.client + + # CLI imports import vec_inf.cli import vec_inf.cli._cli import vec_inf.cli._helper @@ -28,7 +29,7 @@ def test_imports(self): import vec_inf.shared._helper import vec_inf.shared._models import vec_inf.shared._utils - import vec_inf.shared._vars + import vec_inf.shared._vars # noqa: F401 except ImportError as e: pytest.fail(f"Import failed: {e}") From b8ad6252c06b842370e7a700b43d7d3b1d81d602 Mon Sep 17 00:00:00 2001 From: XkunW Date: Thu, 3 Apr 2025 02:49:04 -0400 Subject: [PATCH 32/52] mypy fix for shared helper --- vec_inf/shared/_helper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index d7fc8817..c5a77fb0 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -1,4 +1,4 @@ -"""Helper class for the model launch.""" +"""Helper classes for the model.""" import os import time @@ -437,7 +437,7 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]: class ListHelper: """Helper class for handling model listing functionality.""" - def __init__(self): + def __init__(self) -> None: self.model_configs = utils.load_config() def get_single_model_config(self, model_name: str) -> ModelConfig: From 1a755ba803469a646006d908e489d09c68a69382 Mon Sep 17 00:00:00 2001 From: XkunW Date: Thu, 3 Apr 2025 03:01:29 -0400 Subject: [PATCH 33/52] mypy fixes for API --- vec_inf/api/_helper.py | 5 +++-- vec_inf/shared/_helper.py | 7 ++++++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/vec_inf/api/_helper.py b/vec_inf/api/_helper.py index 3abc9149..9f9dfc5d 100644 --- a/vec_inf/api/_helper.py +++ b/vec_inf/api/_helper.py @@ -5,8 +5,9 @@ from pathlib import Path from typing import Any, Optional -from vec_inf.api._models import LaunchResponse, ModelInfo, ModelType +from vec_inf.api._models import LaunchResponse, ModelInfo from vec_inf.shared._helper import LaunchHelper, ListHelper +from vec_inf.shared._models import ModelType class APILaunchHelper(LaunchHelper): @@ -45,7 +46,7 @@ def post_launch_processing(self, command_output: str) -> LaunchResponse: class APIListHelper(ListHelper): """API Helper class for handling model listing.""" - def __init__(self): + def __init__(self) -> None: super().__init__() def get_all_models(self) -> list[ModelInfo]: diff --git a/vec_inf/shared/_helper.py b/vec_inf/shared/_helper.py index c5a77fb0..2c76ddc2 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/shared/_helper.py @@ -438,7 +438,12 @@ class ListHelper: """Helper class for handling model listing functionality.""" def __init__(self) -> None: - self.model_configs = utils.load_config() + """Initialize the model lister.""" + self.model_configs = self._get_model_configs() + + def _get_model_configs(self) -> list[ModelConfig]: + """Get all model configurations.""" + return utils.load_config() def get_single_model_config(self, model_name: str) -> ModelConfig: """Get 
configuration for a specific model.""" From a1e3d8924584a3785a3ab88ccd6b8340aa9bb569 Mon Sep 17 00:00:00 2001 From: XkunW Date: Thu, 3 Apr 2025 03:07:35 -0400 Subject: [PATCH 34/52] mypy fix for advanced usage example --- examples/api/advanced_usage.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/api/advanced_usage.py b/examples/api/advanced_usage.py index 2ff0751f..36bed98f 100755 --- a/examples/api/advanced_usage.py +++ b/examples/api/advanced_usage.py @@ -8,7 +8,7 @@ import argparse import json import time -from typing import Dict, Union +from typing import Union, cast from openai import OpenAI from rich.console import Console @@ -59,8 +59,8 @@ def export_model_configs(output_file: str) -> None: def launch_with_custom_config( - model_name: str, custom_options: Dict[str, Union[str, int, bool]] -) -> str: + model_name: str, custom_options: dict[str, Union[str, int, bool]] +) -> int: """Launch a model with custom configuration options.""" client = VecInfClient() @@ -88,7 +88,7 @@ def launch_with_custom_config( def monitor_with_rich_ui( - job_id: str, poll_interval: int = 5, max_time: int = 1800 + job_id: int, poll_interval: int = 5, max_time: int = 1800 ) -> StatusResponse: """Monitor a model's status with a rich UI.""" client = VecInfClient() @@ -151,7 +151,7 @@ def monitor_with_rich_ui( return client.get_status(job_id) -def stream_metrics(job_id: str, duration: int = 60, interval: int = 5) -> None: +def stream_metrics(job_id: int, duration: int = 60, interval: int = 5) -> None: """Stream metrics for a specified duration.""" client = VecInfClient() @@ -168,7 +168,7 @@ def stream_metrics(job_id: str, duration: int = 60, interval: int = 5) -> None: table.add_column("Value", style="green") for key, value in metrics_response.metrics.items(): - table.add_row(key, value) + table.add_row(key, cast(str, value)) console.print(table) else: From 3e0c1e143b20a2b2b92935fcf709f5fa369c5b57 Mon Sep 17 00:00:00 2001 From: XkunW Date: Thu, 3 Apr 2025 10:22:07 -0400 Subject: [PATCH 35/52] Use built-in type for hints --- vec_inf/api/_models.py | 8 ++++---- vec_inf/shared/_utils.py | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/vec_inf/api/_models.py b/vec_inf/api/_models.py index 80a100d1..a3f80e01 100644 --- a/vec_inf/api/_models.py +++ b/vec_inf/api/_models.py @@ -5,7 +5,7 @@ """ from dataclasses import dataclass, field -from typing import Any, Dict, Optional, TypedDict +from typing import Any, Optional, TypedDict from typing_extensions import NotRequired @@ -20,7 +20,7 @@ class ModelInfo: family: str variant: Optional[str] type: ModelType - config: Dict[str, Any] + config: dict[str, Any] @dataclass @@ -29,7 +29,7 @@ class LaunchResponse: slurm_job_id: int model_name: str - config: Dict[str, Any] + config: dict[str, Any] raw_output: str = field(repr=False) @@ -52,7 +52,7 @@ class MetricsResponse: slurm_job_id: int model_name: str - metrics: Dict[str, float] + metrics: dict[str, float] timestamp: float diff --git a/vec_inf/shared/_utils.py b/vec_inf/shared/_utils.py index 007721df..63b8648e 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/shared/_utils.py @@ -4,7 +4,7 @@ import os import subprocess from pathlib import Path -from typing import Any, Dict, Optional, Union, cast +from typing import Any, Optional, Union, cast import requests import yaml @@ -31,7 +31,7 @@ def read_slurm_log( slurm_job_id: int, slurm_log_type: str, log_dir: Optional[Union[str, Path]], -) -> Union[list[str], str, Dict[str, str]]: +) -> 
Union[list[str], str, dict[str, str]]: """Read the slurm log file.""" if not log_dir: # Default log directory @@ -60,7 +60,7 @@ def read_slurm_log( ) if slurm_log_type == "json": with file_path.open("r") as file: - json_content: Dict[str, str] = json.load(file) + json_content: dict[str, str] = json.load(file) return json_content else: with file_path.open("r") as file: @@ -94,7 +94,7 @@ def get_base_url(slurm_job_name: str, slurm_job_id: int, log_dir: Optional[str]) if isinstance(log_content, str): return log_content - server_addr = cast(Dict[str, str], log_content).get("server_address") + server_addr = cast(dict[str, str], log_content).get("server_address") return server_addr if server_addr else "URL NOT FOUND" @@ -135,7 +135,7 @@ def load_config() -> list[ModelConfig]: else Path(__file__).resolve().parent.parent / "config" / "models.yaml" ) - config: Dict[str, Any] = {} + config: dict[str, Any] = {} with open(default_path) as f: config = yaml.safe_load(f) or {} @@ -167,7 +167,7 @@ def shutdown_model(slurm_job_id: int) -> None: run_bash_command(shutdown_cmd) -def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: +def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: """Parse output from model launch command. Parameters @@ -177,7 +177,7 @@ def parse_launch_output(output: str) -> tuple[str, Dict[str, str]]: Returns ------- - tuple[str, Dict[str, str]] + tuple[str, dict[str, str]] Slurm job ID and dictionary of config parameters """ From 6bbc3670aaca5b3bc57496e4fe21b6908c3651bc Mon Sep 17 00:00:00 2001 From: XkunW Date: Thu, 3 Apr 2025 11:45:28 -0400 Subject: [PATCH 36/52] Removing the advanced usage example as it creates a new CLI by wrapping around the API, creates confusion --- examples/api/advanced_usage.py | 333 --------------------------------- 1 file changed, 333 deletions(-) delete mode 100755 examples/api/advanced_usage.py diff --git a/examples/api/advanced_usage.py b/examples/api/advanced_usage.py deleted file mode 100755 index 36bed98f..00000000 --- a/examples/api/advanced_usage.py +++ /dev/null @@ -1,333 +0,0 @@ -#!/usr/bin/env python -"""Advanced usage examples for the Vector Inference Python API. - -This script demonstrates more advanced patterns and techniques for -using the Vector Inference API programmatically. 
-""" - -import argparse -import json -import time -from typing import Union, cast - -from openai import OpenAI -from rich.console import Console -from rich.progress import Progress -from rich.table import Table - -from vec_inf.api import ( - LaunchOptions, - LaunchOptionsDict, - ModelStatus, - StatusResponse, - VecInfClient, -) - - -console = Console() - - -def create_openai_client(base_url: str) -> OpenAI: - """Create an OpenAI client for a given base URL.""" - return OpenAI(base_url=base_url, api_key="EMPTY") - - -def export_model_configs(output_file: str) -> None: - """Export all model configurations to a JSON file.""" - client = VecInfClient() - models = client.list_models() - - # Convert model info to dictionaries - model_dicts = [] - for model in models: - model_dict = { - "name": model.name, - "family": model.family, - "variant": model.variant, - "type": str(model.type), - "config": model.config, - } - model_dicts.append(model_dict) - - # Write to file - with open(output_file, "w") as f: - json.dump(model_dicts, f, indent=2) - - console.print( - f"[green]Exported {len(models)} model configurations to {output_file}[/green]" - ) - - -def launch_with_custom_config( - model_name: str, custom_options: dict[str, Union[str, int, bool]] -) -> int: - """Launch a model with custom configuration options.""" - client = VecInfClient() - - # Create LaunchOptions from dictionary - options_dict: LaunchOptionsDict = {} - for key, value in custom_options.items(): - if key in LaunchOptions.__annotations__: - options_dict[key] = value # type: ignore[literal-required] - else: - console.print(f"[yellow]Warning: Ignoring unknown option '{key}'[/yellow]") - - options = LaunchOptions(**options_dict) - - # Launch the model - console.print(f"[blue]Launching model {model_name} with custom options:[/blue]") - for key, value in options_dict.items(): # type: ignore[assignment] - console.print(f" [cyan]{key}[/cyan]: {value}") - - response = client.launch_model(model_name, options) - - console.print("[green]Model launched successfully![/green]") - console.print(f"Slurm Job ID: [bold]{response.slurm_job_id}[/bold]") - - return response.slurm_job_id - - -def monitor_with_rich_ui( - job_id: int, poll_interval: int = 5, max_time: int = 1800 -) -> StatusResponse: - """Monitor a model's status with a rich UI.""" - client = VecInfClient() - - start_time = time.time() - elapsed = 0 - - with Progress() as progress: - # Add tasks - status_task = progress.add_task( - "[cyan]Waiting for model to be ready...", total=None - ) - time_task = progress.add_task("[yellow]Time elapsed", total=max_time) - - while elapsed < max_time: - # Update time elapsed - elapsed = int(time.time() - start_time) - progress.update(time_task, completed=elapsed) - - # Get status - try: - status = client.get_status(job_id) - - # Update status message - if status.status == ModelStatus.READY: - progress.update( - status_task, - description=f"[green]Model is READY at {status.base_url}[/green]", - ) - break - if status.status == ModelStatus.FAILED: - progress.update( - status_task, - description=f"[red]Model FAILED: {status.failed_reason}[/red]", - ) - break - if status.status == ModelStatus.PENDING: - progress.update( - status_task, - description=f"[yellow]Model is PENDING: {status.pending_reason}[/yellow]", - ) - elif status.status == ModelStatus.LAUNCHING: - progress.update( - status_task, description="[cyan]Model is LAUNCHING...[/cyan]" - ) - elif status.status == ModelStatus.SHUTDOWN: - progress.update( - status_task, description="[red]Model was 
SHUTDOWN[/red]" - ) - break - except Exception as e: - progress.update( - status_task, - description=f"[red]Error checking status: {str(e)}[/red]", - ) - - # Wait before checking again - time.sleep(poll_interval) - - return client.get_status(job_id) - - -def stream_metrics(job_id: int, duration: int = 60, interval: int = 5) -> None: - """Stream metrics for a specified duration.""" - client = VecInfClient() - - console.print(f"[blue]Streaming metrics for {duration} seconds...[/blue]") - - end_time = time.time() + duration - while time.time() < end_time: - try: - metrics_response = client.get_metrics(job_id) - - if metrics_response.metrics: - table = Table(title="Performance Metrics") - table.add_column("Metric", style="cyan") - table.add_column("Value", style="green") - - for key, value in metrics_response.metrics.items(): - table.add_row(key, cast(str, value)) - - console.print(table) - else: - console.print("[yellow]No metrics available yet[/yellow]") - - except Exception as e: - console.print(f"[red]Error retrieving metrics: {str(e)}[/red]") - - time.sleep(interval) - - -def batch_inference_example( - base_url: str, model_name: str, input_file: str, output_file: str -) -> None: - """Perform batch inference on inputs from a file.""" - # Read inputs - with open(input_file, "r") as f: - inputs = [line.strip() for line in f if line.strip()] - - openai_client = create_openai_client(base_url) - - results = [] - with Progress() as progress: - task = progress.add_task("[green]Processing inputs...", total=len(inputs)) - - for input_text in inputs: - try: - # Process using completions API - completion = openai_client.completions.create( - model=model_name, - prompt=input_text, - max_tokens=100, - ) - - # Store result - results.append( - { - "input": input_text, - "output": completion.choices[0].text, - "tokens": completion.usage.completion_tokens, # type: ignore[union-attr] - } - ) - - except Exception as e: - results.append({"input": input_text, "error": str(e)}) - - progress.update(task, advance=1) - - # Write results - with open(output_file, "w") as f: - json.dump(results, f, indent=2) - - console.print( - f"[green]Processed {len(inputs)} inputs and saved results to {output_file}[/green]" - ) - - -def main() -> None: - """Parse arguments and run the selected function.""" - parser = argparse.ArgumentParser( - description="Advanced Vector Inference API usage examples" - ) - subparsers = parser.add_subparsers(dest="command", help="Command to run") - - # Export configs command - export_parser = subparsers.add_parser( - "export-configs", help="Export all model configurations to a JSON file" - ) - export_parser.add_argument( - "--output", "-o", default="model_configs.json", help="Output JSON file" - ) - - # Launch with custom config command - launch_parser = subparsers.add_parser( - "launch", help="Launch a model with custom configuration" - ) - launch_parser.add_argument("model_name", help="Name of the model to launch") - launch_parser.add_argument("--num-gpus", type=int, help="Number of GPUs to use") - launch_parser.add_argument("--num-nodes", type=int, help="Number of nodes to use") - launch_parser.add_argument( - "--max-model-len", type=int, help="Maximum model context length" - ) - launch_parser.add_argument( - "--max-num-seqs", type=int, help="Maximum number of sequences" - ) - launch_parser.add_argument("--partition", help="Partition to use") - launch_parser.add_argument("--qos", help="Quality of service") - launch_parser.add_argument("--time", help="Time limit") - - # Monitor command - 
monitor_parser = subparsers.add_parser( - "monitor", help="Monitor a model with rich UI" - ) - monitor_parser.add_argument("job_id", help="Slurm job ID to monitor") - monitor_parser.add_argument( - "--interval", type=int, default=5, help="Polling interval in seconds" - ) - monitor_parser.add_argument( - "--max-time", type=int, default=1800, help="Maximum time to monitor in seconds" - ) - - # Stream metrics command - metrics_parser = subparsers.add_parser("metrics", help="Stream metrics for a model") - metrics_parser.add_argument("job_id", help="Slurm job ID to get metrics for") - metrics_parser.add_argument( - "--duration", type=int, default=60, help="Duration to stream metrics in seconds" - ) - metrics_parser.add_argument( - "--interval", type=int, default=5, help="Polling interval in seconds" - ) - - # Batch inference command - batch_parser = subparsers.add_parser("batch", help="Perform batch inference") - batch_parser.add_argument("base_url", help="Base URL of the model server") - batch_parser.add_argument("model_name", help="Name of the model to use") - batch_parser.add_argument( - "--input", "-i", required=True, help="Input file with one prompt per line" - ) - batch_parser.add_argument( - "--output", "-o", required=True, help="Output JSON file for results" - ) - - args = parser.parse_args() - - # Run the selected command - if args.command == "export-configs": - export_model_configs(args.output) - - elif args.command == "launch": - # Extract custom options from args - options = {} - for key, value in vars(args).items(): - if key not in ["command", "model_name"] and value is not None: - options[key] = value - - job_id = launch_with_custom_config(args.model_name, options) - - # Ask if user wants to monitor - if console.input("[cyan]Monitor this job? (y/n): [/cyan]").lower() == "y": - monitor_with_rich_ui(job_id) - - elif args.command == "monitor": - status = monitor_with_rich_ui(args.job_id, args.interval, args.max_time) - - if (status.status == ModelStatus.READY) and ( - console.input("[cyan]Stream metrics for this model? 
(y/n): [/cyan]").lower() - == "y" - ): - stream_metrics(args.job_id) - - elif args.command == "metrics": - stream_metrics(args.job_id, args.duration, args.interval) - - elif args.command == "batch": - batch_inference_example(args.base_url, args.model_name, args.input, args.output) - - else: - parser.print_help() - - -if __name__ == "__main__": - main() From af7de980d5013105a25e3bb1f24fcb8b63341204 Mon Sep 17 00:00:00 2001 From: XkunW Date: Mon, 7 Apr 2025 15:05:15 -0400 Subject: [PATCH 37/52] Restructure code base, merge api and shared folder and rename to client --- examples/api/basic_usage.py | 2 +- tests/vec_inf/{api => client}/__init__.py | 0 .../test_client.py => client/test_api.py} | 6 +- .../vec_inf/{api => client}/test_examples.py | 4 +- tests/vec_inf/{api => client}/test_models.py | 2 +- .../vec_inf/{shared => client}/test_utils.py | 12 +-- vec_inf/api/_helper.py | 64 ---------------- vec_inf/{api => client}/__init__.py | 7 +- vec_inf/{shared => client}/_config.py | 0 vec_inf/{shared => client}/_exceptions.py | 0 vec_inf/{shared => client}/_helper.py | 76 +++++++++++++------ vec_inf/{api => client}/_models.py | 40 +++++++--- vec_inf/{shared => client}/_utils.py | 6 +- vec_inf/{shared => client}/_vars.py | 0 vec_inf/{api/client.py => client/api.py} | 72 ++++++++++-------- vec_inf/shared/__init__.py | 1 - vec_inf/shared/_models.py | 23 ------ 17 files changed, 144 insertions(+), 171 deletions(-) rename tests/vec_inf/{api => client}/__init__.py (100%) rename tests/vec_inf/{api/test_client.py => client/test_api.py} (95%) rename tests/vec_inf/{api => client}/test_examples.py (95%) rename tests/vec_inf/{api => client}/test_models.py (95%) rename tests/vec_inf/{shared => client}/test_utils.py (95%) delete mode 100644 vec_inf/api/_helper.py rename vec_inf/{api => client}/__init__.py (81%) rename vec_inf/{shared => client}/_config.py (100%) rename vec_inf/{shared => client}/_exceptions.py (100%) rename vec_inf/{shared => client}/_helper.py (88%) rename vec_inf/{api => client}/_models.py (85%) rename vec_inf/{shared => client}/_utils.py (97%) rename vec_inf/{shared => client}/_vars.py (100%) rename vec_inf/{api/client.py => client/api.py} (82%) delete mode 100644 vec_inf/shared/__init__.py delete mode 100644 vec_inf/shared/_models.py diff --git a/examples/api/basic_usage.py b/examples/api/basic_usage.py index d50e418b..c027065f 100755 --- a/examples/api/basic_usage.py +++ b/examples/api/basic_usage.py @@ -5,7 +5,7 @@ for launching and interacting with models. 
""" -from vec_inf.api import VecInfClient +from vec_inf.client import VecInfClient # Create the API client diff --git a/tests/vec_inf/api/__init__.py b/tests/vec_inf/client/__init__.py similarity index 100% rename from tests/vec_inf/api/__init__.py rename to tests/vec_inf/client/__init__.py diff --git a/tests/vec_inf/api/test_client.py b/tests/vec_inf/client/test_api.py similarity index 95% rename from tests/vec_inf/api/test_client.py rename to tests/vec_inf/client/test_api.py index 95c69eb4..43bb5857 100644 --- a/tests/vec_inf/api/test_client.py +++ b/tests/vec_inf/client/test_api.py @@ -4,7 +4,7 @@ import pytest -from vec_inf.api import ModelStatus, ModelType, VecInfClient +from vec_inf.client import ModelStatus, ModelType, VecInfClient @pytest.fixture @@ -68,11 +68,11 @@ def test_launch_model(mock_model_config, mock_launch_output): with ( patch( - "vec_inf.shared._utils.run_bash_command", + "vec_inf.client._utils.run_bash_command", return_value=(mock_launch_output, ""), ), patch( - "vec_inf.shared._utils.parse_launch_output", return_value=("12345678", {}) + "vec_inf.client._utils.parse_launch_output", return_value=("12345678", {}) ), ): # Create a mock response diff --git a/tests/vec_inf/api/test_examples.py b/tests/vec_inf/client/test_examples.py similarity index 95% rename from tests/vec_inf/api/test_examples.py rename to tests/vec_inf/client/test_examples.py index f599a114..31fbe796 100644 --- a/tests/vec_inf/api/test_examples.py +++ b/tests/vec_inf/client/test_examples.py @@ -5,7 +5,7 @@ import pytest -from vec_inf.api import ModelStatus, ModelType, VecInfClient +from vec_inf.client import ModelStatus, ModelType, VecInfClient @pytest.fixture @@ -85,7 +85,7 @@ def test_api_usage_example(): # Mock the VecInfClient class with ( - patch("vec_inf.api.VecInfClient", return_value=mock_client), + patch("vec_inf.client.VecInfClient", return_value=mock_client), patch("builtins.print"), example_path.open() as f, ): diff --git a/tests/vec_inf/api/test_models.py b/tests/vec_inf/client/test_models.py similarity index 95% rename from tests/vec_inf/api/test_models.py rename to tests/vec_inf/client/test_models.py index d9a8abe7..bbcaacda 100644 --- a/tests/vec_inf/api/test_models.py +++ b/tests/vec_inf/client/test_models.py @@ -1,6 +1,6 @@ """Tests for the Vector Inference API data models.""" -from vec_inf.api import LaunchOptions, ModelInfo, ModelStatus, ModelType +from vec_inf.client import LaunchOptions, ModelInfo, ModelStatus, ModelType def test_model_info_creation(): diff --git a/tests/vec_inf/shared/test_utils.py b/tests/vec_inf/client/test_utils.py similarity index 95% rename from tests/vec_inf/shared/test_utils.py rename to tests/vec_inf/client/test_utils.py index 9fea9c3a..45f4df3e 100644 --- a/tests/vec_inf/shared/test_utils.py +++ b/tests/vec_inf/client/test_utils.py @@ -6,7 +6,7 @@ import pytest import requests -from vec_inf.shared._utils import ( +from vec_inf.client._utils import ( MODEL_READY_SIGNATURE, create_table, get_base_url, @@ -77,7 +77,7 @@ def test_read_slurm_log_not_found(): ) def test_is_server_running_statuses(log_content, expected): """Test that is_server_running returns the correct status.""" - with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: + with patch("vec_inf.client._utils.read_slurm_log") as mock_read: mock_read.return_value = log_content result = is_server_running("test_job", 123, None) assert result == expected @@ -86,7 +86,7 @@ def test_is_server_running_statuses(log_content, expected): def test_get_base_url_found(): """Test that get_base_url 
returns the correct base URL.""" test_dict = {"server_address": "http://localhost:8000"} - with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: + with patch("vec_inf.client._utils.read_slurm_log") as mock_read: mock_read.return_value = test_dict result = get_base_url("test_job", 123, None) assert result == "http://localhost:8000" @@ -94,7 +94,7 @@ def test_get_base_url_found(): def test_get_base_url_not_found(): """Test get_base_url when URL is not found in logs.""" - with patch("vec_inf.shared._utils.read_slurm_log") as mock_read: + with patch("vec_inf.client._utils.read_slurm_log") as mock_read: mock_read.return_value = {"random_key": "123"} result = get_base_url("test_job", 123, None) assert result == "URL NOT FOUND" @@ -110,7 +110,7 @@ def test_get_base_url_not_found(): ) def test_model_health_check(url, status_code, expected): """Test model_health_check with various scenarios.""" - with patch("vec_inf.shared._utils.get_base_url") as mock_url: + with patch("vec_inf.client._utils.get_base_url") as mock_url: mock_url.return_value = url if url.startswith("http"): with patch("requests.get") as mock_get: @@ -125,7 +125,7 @@ def test_model_health_check(url, status_code, expected): def test_model_health_check_request_exception(): """Test model_health_check when request raises an exception.""" with ( - patch("vec_inf.shared._utils.get_base_url") as mock_url, + patch("vec_inf.client._utils.get_base_url") as mock_url, patch("requests.get") as mock_get, ): mock_url.return_value = "http://localhost:8000" diff --git a/vec_inf/api/_helper.py b/vec_inf/api/_helper.py deleted file mode 100644 index 9f9dfc5d..00000000 --- a/vec_inf/api/_helper.py +++ /dev/null @@ -1,64 +0,0 @@ -"""Helper classes for the API.""" - -import json -import warnings -from pathlib import Path -from typing import Any, Optional - -from vec_inf.api._models import LaunchResponse, ModelInfo -from vec_inf.shared._helper import LaunchHelper, ListHelper -from vec_inf.shared._models import ModelType - - -class APILaunchHelper(LaunchHelper): - """API Helper class for handling inference server launch.""" - - def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): - super().__init__(model_name, kwargs) - - def _warn(self, message: str) -> None: - """Warn the user about a potential issue.""" - warnings.warn(message, UserWarning, stacklevel=2) - - def post_launch_processing(self, command_output: str) -> LaunchResponse: - """Process and display launch output.""" - slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id - job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", - ) - job_json.parent.mkdir(parents=True, exist_ok=True) - job_json.touch(exist_ok=True) - - with job_json.open("w") as file: - json.dump(self.params, file, indent=4) - - return LaunchResponse( - slurm_job_id=int(slurm_job_id), - model_name=self.model_name, - config=self.params, - raw_output=command_output, - ) - - -class APIListHelper(ListHelper): - """API Helper class for handling model listing.""" - - def __init__(self) -> None: - super().__init__() - - def get_all_models(self) -> list[ModelInfo]: - """Get all available models.""" - available_models = [] - for config in self.model_configs: - info = ModelInfo( - name=config.model_name, - family=config.model_family, - variant=config.model_variant, - type=ModelType(config.model_type), - config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), - ) - 
available_models.append(info) - return available_models diff --git a/vec_inf/api/__init__.py b/vec_inf/client/__init__.py similarity index 81% rename from vec_inf/api/__init__.py rename to vec_inf/client/__init__.py index 7f908b37..f4b5f864 100644 --- a/vec_inf/api/__init__.py +++ b/vec_inf/client/__init__.py @@ -5,16 +5,17 @@ users direct control over the lifecycle of inference servers via python scripts. """ -from vec_inf.api._models import ( +from vec_inf.client._models import ( LaunchOptions, LaunchOptionsDict, LaunchResponse, MetricsResponse, ModelInfo, + ModelStatus, + ModelType, StatusResponse, ) -from vec_inf.api.client import VecInfClient -from vec_inf.shared._models import ModelStatus, ModelType +from vec_inf.client.api import VecInfClient __all__ = [ diff --git a/vec_inf/shared/_config.py b/vec_inf/client/_config.py similarity index 100% rename from vec_inf/shared/_config.py rename to vec_inf/client/_config.py diff --git a/vec_inf/shared/_exceptions.py b/vec_inf/client/_exceptions.py similarity index 100% rename from vec_inf/shared/_exceptions.py rename to vec_inf/client/_exceptions.py diff --git a/vec_inf/shared/_helper.py b/vec_inf/client/_helper.py similarity index 88% rename from vec_inf/shared/_helper.py rename to vec_inf/client/_helper.py index 2c76ddc2..d3de5686 100644 --- a/vec_inf/shared/_helper.py +++ b/vec_inf/client/_helper.py @@ -1,24 +1,25 @@ """Helper classes for the model.""" +import json import os import time -from abc import ABC, abstractmethod +import warnings from pathlib import Path from typing import Any, Optional, Union, cast from urllib.parse import urlparse, urlunparse import requests -import vec_inf.shared._utils as utils -from vec_inf.shared._config import ModelConfig -from vec_inf.shared._exceptions import ( +import vec_inf.client._utils as utils +from vec_inf.client._config import ModelConfig +from vec_inf.client._exceptions import ( MissingRequiredFieldsError, ModelConfigurationError, ModelNotFoundError, SlurmJobError, ) -from vec_inf.shared._models import ModelStatus -from vec_inf.shared._vars import ( +from vec_inf.client._models import LaunchResponse, ModelInfo, ModelStatus, ModelType +from vec_inf.client._vars import ( BOOLEAN_FIELDS, LD_LIBRARY_PATH, REQUIRED_FIELDS, @@ -27,7 +28,7 @@ ) -class LaunchHelper(ABC): +class ModelLauncher: """Helper class for handling inference server launch.""" def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): @@ -45,10 +46,9 @@ def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): self.model_config = self._get_model_configuration() self.params = self._get_launch_params() - @abstractmethod def _warn(self, message: str) -> None: """Warn the user about a potential issue.""" - pass + warnings.warn(message, UserWarning, stacklevel=2) def _get_model_configuration(self) -> ModelConfig: """Load and validate model configuration.""" @@ -97,7 +97,7 @@ def _get_model_configuration(self) -> ModelConfig: def _get_launch_params(self) -> dict[str, Any]: """Merge config defaults with CLI overrides.""" - params = self.model_config.model_dump() + params = cast(dict[str, Any], self.model_config.model_dump()) # Process boolean fields for bool_field in BOOLEAN_FIELDS: @@ -186,9 +186,31 @@ def build_launch_command(self) -> str: command_list.append(f"{SRC_DIR}/{slurm_script}") return " ".join(command_list) + def post_launch_processing(self, command_output: str) -> LaunchResponse: + """Process and display launch output.""" + slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") + 
self.params["slurm_job_id"] = slurm_job_id + job_json = Path( + self.params["log_dir"], + f"{self.model_name}.{slurm_job_id}", + f"{self.model_name}.{slurm_job_id}.json", + ) + job_json.parent.mkdir(parents=True, exist_ok=True) + job_json.touch(exist_ok=True) + + with job_json.open("w") as file: + json.dump(self.params, file, indent=4) + + return LaunchResponse( + slurm_job_id=int(slurm_job_id), + model_name=self.model_name, + config=self.params, + raw_output=command_output, + ) -class StatusHelper: - """Helper class for handling server status information.""" + +class ModelStatusMonitor: + """Class for handling server status information and monitoring.""" def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): self.slurm_job_id = slurm_job_id @@ -267,8 +289,8 @@ def process_pending_state(self) -> None: self.status_info["pending_reason"] = "Unknown pending reason" -class MetricsHelper: - """Helper class for handling metrics information.""" +class PerformanceMetricsCollector: + """Class for handling metrics collection and processing.""" def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): self.slurm_job_id = slurm_job_id @@ -288,7 +310,7 @@ def _get_status_info(self) -> dict[str, Union[str, None]]: output, stderr = utils.run_bash_command(status_cmd) if stderr: raise SlurmJobError(f"Error: {stderr}") - status_helper = StatusHelper(self.slurm_job_id, output, self.log_dir) + status_helper = ModelStatusMonitor(self.slurm_job_id, output, self.log_dir) return status_helper.status_info def _build_metrics_url(self) -> str: @@ -434,16 +456,26 @@ def _parse_metrics(self, metrics_text: str) -> dict[str, float]: return parsed -class ListHelper: - """Helper class for handling model listing functionality.""" +class ModelRegistry: + """Class for handling model listing and configuration management.""" def __init__(self) -> None: """Initialize the model lister.""" - self.model_configs = self._get_model_configs() - - def _get_model_configs(self) -> list[ModelConfig]: - """Get all model configurations.""" - return utils.load_config() + self.model_configs = utils.load_config() + + def get_all_models(self) -> list[ModelInfo]: + """Get all available models.""" + available_models = [] + for config in self.model_configs: + info = ModelInfo( + name=config.model_name, + family=config.model_family, + variant=config.model_variant, + type=ModelType(config.model_type), + config=config.model_dump(exclude={"model_name", "venv", "log_dir"}), + ) + available_models.append(info) + return available_models def get_single_model_config(self, model_name: str) -> ModelConfig: """Get configuration for a specific model.""" diff --git a/vec_inf/api/_models.py b/vec_inf/client/_models.py similarity index 85% rename from vec_inf/api/_models.py rename to vec_inf/client/_models.py index a3f80e01..6dfce8e9 100644 --- a/vec_inf/api/_models.py +++ b/vec_inf/client/_models.py @@ -1,26 +1,35 @@ -"""Data models for Vector Inference API. +""" +Data models for Vector Inference API. This module contains the data model classes used by the Vector Inference API for both request parameters and response objects. 
""" from dataclasses import dataclass, field +from enum import Enum from typing import Any, Optional, TypedDict from typing_extensions import NotRequired -from vec_inf.shared._models import ModelStatus, ModelType +class ModelStatus(str, Enum): + """Enum representing the possible status states of a model.""" -@dataclass -class ModelInfo: - """Information about an available model.""" + PENDING = "PENDING" + LAUNCHING = "LAUNCHING" + READY = "READY" + FAILED = "FAILED" + SHUTDOWN = "SHUTDOWN" + UNAVAILABLE = "UNAVAILABLE" - name: str - family: str - variant: Optional[str] - type: ModelType - config: dict[str, Any] + +class ModelType(str, Enum): + """Enum representing the possible model types.""" + + LLM = "LLM" + VLM = "VLM" + TEXT_EMBEDDING = "Text_Embedding" + REWARD_MODELING = "Reward_Modeling" @dataclass @@ -107,3 +116,14 @@ class LaunchOptionsDict(TypedDict): pipeline_parallelism: NotRequired[Optional[bool]] compilation_config: NotRequired[Optional[str]] enforce_eager: NotRequired[Optional[bool]] + + +@dataclass +class ModelInfo: + """Information about an available model.""" + + name: str + family: str + variant: Optional[str] + type: ModelType + config: dict[str, Any] diff --git a/vec_inf/shared/_utils.py b/vec_inf/client/_utils.py similarity index 97% rename from vec_inf/shared/_utils.py rename to vec_inf/client/_utils.py index 63b8648e..f115848c 100644 --- a/vec_inf/shared/_utils.py +++ b/vec_inf/client/_utils.py @@ -10,9 +10,9 @@ import yaml from rich.table import Table -from vec_inf.shared._config import ModelConfig -from vec_inf.shared._models import ModelStatus -from vec_inf.shared._vars import ( +from vec_inf.client._config import ModelConfig +from vec_inf.client._models import ModelStatus +from vec_inf.client._vars import ( CACHED_CONFIG, MODEL_READY_SIGNATURE, ) diff --git a/vec_inf/shared/_vars.py b/vec_inf/client/_vars.py similarity index 100% rename from vec_inf/shared/_vars.py rename to vec_inf/client/_vars.py diff --git a/vec_inf/api/client.py b/vec_inf/client/api.py similarity index 82% rename from vec_inf/api/client.py rename to vec_inf/client/api.py index 1f53e8a9..47b6f3a7 100644 --- a/vec_inf/api/client.py +++ b/vec_inf/client/api.py @@ -9,24 +9,28 @@ import requests -from vec_inf.api._helper import APILaunchHelper, APIListHelper -from vec_inf.api._models import ( +from vec_inf.client._config import ModelConfig +from vec_inf.client._exceptions import ( + APIError, + ModelNotFoundError, + ServerError, + SlurmJobError, +) +from vec_inf.client._helper import ( + ModelLauncher, + ModelRegistry, + ModelStatusMonitor, + PerformanceMetricsCollector, +) +from vec_inf.client._models import ( LaunchOptions, LaunchResponse, MetricsResponse, ModelInfo, + ModelStatus, StatusResponse, ) -from vec_inf.shared._config import ModelConfig -from vec_inf.shared._exceptions import ( - APIError, - ModelNotFoundError, - ServerError, - SlurmJobError, -) -from vec_inf.shared._helper import MetricsHelper, StatusHelper -from vec_inf.shared._models import ModelStatus -from vec_inf.shared._utils import run_bash_command, shutdown_model +from vec_inf.client._utils import run_bash_command, shutdown_model class VecInfClient: @@ -68,8 +72,8 @@ def list_models(self) -> list[ModelInfo]: """ try: - list_helper = APIListHelper() - return list_helper.get_all_models() + model_registry = ModelRegistry() + return cast(list[ModelInfo], model_registry.get_all_models()) except Exception as e: raise APIError(f"Failed to list models: {str(e)}") from e @@ -95,8 +99,8 @@ def get_model_config(self, model_name: str) -> 
ModelConfig: """ try: - list_helper = APIListHelper() - return list_helper.get_single_model_config(model_name) + model_registry = ModelRegistry() + return model_registry.get_single_model_config(model_name) except ModelNotFoundError: raise except Exception as e: @@ -133,18 +137,18 @@ def launch_model( options_dict = {k: v for k, v in vars(options).items() if v is not None} # Create and use the API Launch Helper - launch_helper = APILaunchHelper(model_name, options_dict) + model_launcher = ModelLauncher(model_name, options_dict) # Set environment variables - launch_helper.set_env_vars() + model_launcher.set_env_vars() # Build and execute the launch command - launch_command = launch_helper.build_launch_command() + launch_command = model_launcher.build_launch_command() command_output, stderr = run_bash_command(launch_command) if stderr: raise SlurmJobError(f"Error: {stderr}") - return launch_helper.post_launch_processing(command_output) + return model_launcher.post_launch_processing(command_output) except ValueError as e: if "not found in configuration" in str(e): @@ -183,16 +187,16 @@ def get_status( if stderr: raise SlurmJobError(f"Error: {stderr}") - status_helper = StatusHelper(slurm_job_id, output, log_dir) - status_helper.process_job_state() + model_status_monitor = ModelStatusMonitor(slurm_job_id, output, log_dir) + model_status_monitor.process_job_state() return StatusResponse( slurm_job_id=slurm_job_id, - model_name=cast(str, status_helper.status_info["model_name"]), - status=cast(ModelStatus, status_helper.status_info["status"]), - base_url=status_helper.status_info["base_url"], - pending_reason=status_helper.status_info["pending_reason"], - failed_reason=status_helper.status_info["failed_reason"], + model_name=cast(str, model_status_monitor.status_info["model_name"]), + status=cast(ModelStatus, model_status_monitor.status_info["status"]), + base_url=model_status_monitor.status_info["base_url"], + pending_reason=model_status_monitor.status_info["pending_reason"], + failed_reason=model_status_monitor.status_info["failed_reason"], raw_output=output, ) except SlurmJobError: @@ -226,21 +230,25 @@ def get_metrics( """ try: - metrics_helper = MetricsHelper(slurm_job_id, log_dir) + performance_metrics_collector = PerformanceMetricsCollector( + slurm_job_id, log_dir + ) - if not metrics_helper.metrics_url.startswith("http"): + if not performance_metrics_collector.metrics_url.startswith("http"): raise ServerError( - f"Metrics endpoint unavailable or server not ready - {metrics_helper.metrics_url}" + f"Metrics endpoint unavailable or server not ready - {performance_metrics_collector.metrics_url}" ) - metrics = metrics_helper.fetch_metrics() + metrics = performance_metrics_collector.fetch_metrics() if isinstance(metrics, str): raise requests.RequestException(metrics) return MetricsResponse( slurm_job_id=slurm_job_id, - model_name=cast(str, metrics_helper.status_info["model_name"]), + model_name=cast( + str, performance_metrics_collector.status_info["model_name"] + ), metrics=metrics, timestamp=time.time(), ) diff --git a/vec_inf/shared/__init__.py b/vec_inf/shared/__init__.py deleted file mode 100644 index b1d18c95..00000000 --- a/vec_inf/shared/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Shared modules for vec_inf.""" diff --git a/vec_inf/shared/_models.py b/vec_inf/shared/_models.py deleted file mode 100644 index cd11d6a4..00000000 --- a/vec_inf/shared/_models.py +++ /dev/null @@ -1,23 +0,0 @@ -"""Shared data models for Vector Inference.""" - -from enum import Enum - - -class ModelStatus(str, 
Enum): - """Enum representing the possible status states of a model.""" - - PENDING = "PENDING" - LAUNCHING = "LAUNCHING" - READY = "READY" - FAILED = "FAILED" - SHUTDOWN = "SHUTDOWN" - UNAVAILABLE = "UNAVAILABLE" - - -class ModelType(str, Enum): - """Enum representing the possible model types.""" - - LLM = "LLM" - VLM = "VLM" - TEXT_EMBEDDING = "Text_Embedding" - REWARD_MODELING = "Reward_Modeling" From 2af196a667e75163980190cc3cdc8c53fc040725 Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 12:44:16 -0400 Subject: [PATCH 38/52] Move post launch processing and launch logic into a single launch function --- vec_inf/client/_helper.py | 19 +++++++++++++++---- vec_inf/client/api.py | 11 +---------- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index d3de5686..394fee8c 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -128,7 +128,7 @@ def _get_launch_params(self) -> dict[str, Any]: return params - def set_env_vars(self) -> None: + def _set_env_vars(self) -> None: """Set environment variables for the launch command.""" os.environ["MODEL_NAME"] = self.model_name os.environ["MAX_MODEL_LEN"] = self.params["max_model_len"] @@ -156,7 +156,7 @@ def set_env_vars(self) -> None: if self.params.get("enforce_eager"): os.environ["ENFORCE_EAGER"] = self.params["enforce_eager"] - def build_launch_command(self) -> str: + def _build_launch_command(self) -> str: """Construct the full launch command with parameters.""" # Base command command_list = ["sbatch"] @@ -186,10 +186,21 @@ def build_launch_command(self) -> str: command_list.append(f"{SRC_DIR}/{slurm_script}") return " ".join(command_list) - def post_launch_processing(self, command_output: str) -> LaunchResponse: - """Process and display launch output.""" + def launch(self) -> LaunchResponse: + """Launch the model.""" + # Set environment variables + self._set_env_vars() + + # Build and execute the launch command + command_output, stderr = utils.run_bash_command(self._build_launch_command()) + if stderr: + raise SlurmJobError(f"Error: {stderr}") + + # Extract slurm job id from command output slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") self.params["slurm_job_id"] = slurm_job_id + + # Create log directory and job json file job_json = Path( self.params["log_dir"], f"{self.model_name}.{slurm_job_id}", diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index 47b6f3a7..fa2ffe5f 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -139,16 +139,7 @@ def launch_model( # Create and use the API Launch Helper model_launcher = ModelLauncher(model_name, options_dict) - # Set environment variables - model_launcher.set_env_vars() - - # Build and execute the launch command - launch_command = model_launcher.build_launch_command() - command_output, stderr = run_bash_command(launch_command) - if stderr: - raise SlurmJobError(f"Error: {stderr}") - - return model_launcher.post_launch_processing(command_output) + return model_launcher.launch() except ValueError as e: if "not found in configuration" in str(e): From dcf8abf5cb88598df52329bd9370e275ef18a920 Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 13:25:05 -0400 Subject: [PATCH 39/52] Add slurm ID as class param for model launcher --- vec_inf/client/_helper.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 394fee8c..7d93844a 100644 --- a/vec_inf/client/_helper.py +++ 
b/vec_inf/client/_helper.py @@ -43,6 +43,7 @@ def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): """ self.model_name = model_name self.kwargs = kwargs or {} + self.slurm_job_id = "" self.model_config = self._get_model_configuration() self.params = self._get_launch_params() @@ -197,14 +198,13 @@ def launch(self) -> LaunchResponse: raise SlurmJobError(f"Error: {stderr}") # Extract slurm job id from command output - slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id + self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") # Create log directory and job json file job_json = Path( self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", + f"{self.model_name}.{self.slurm_job_id}", + f"{self.model_name}.{self.slurm_job_id}.json", ) job_json.parent.mkdir(parents=True, exist_ok=True) job_json.touch(exist_ok=True) @@ -213,7 +213,7 @@ def launch(self) -> LaunchResponse: json.dump(self.params, file, indent=4) return LaunchResponse( - slurm_job_id=int(slurm_job_id), + slurm_job_id=int(self.slurm_job_id), model_name=self.model_name, config=self.params, raw_output=command_output, From ebd6898b0a7d0ef63b9faf69c71ceb0b896590ff Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 14:03:45 -0400 Subject: [PATCH 40/52] Integrate status retrival code into ModelStatusMonitor class --- vec_inf/client/_helper.py | 38 +++++++++++++++++++++++++++++--------- vec_inf/client/api.py | 21 +++------------------ 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 7d93844a..2ced1f26 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -18,7 +18,13 @@ ModelNotFoundError, SlurmJobError, ) -from vec_inf.client._models import LaunchResponse, ModelInfo, ModelStatus, ModelType +from vec_inf.client._models import ( + LaunchResponse, + ModelInfo, + ModelStatus, + ModelType, + StatusResponse, +) from vec_inf.client._vars import ( BOOLEAN_FIELDS, LD_LIBRARY_PATH, @@ -223,12 +229,20 @@ def launch(self) -> LaunchResponse: class ModelStatusMonitor: """Class for handling server status information and monitoring.""" - def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): + def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): self.slurm_job_id = slurm_job_id - self.output = output + self.output = self._get_raw_status_output() self.log_dir = log_dir self.status_info = self._get_base_status_data() + def _get_raw_status_output(self) -> str: + """Get the raw server status output from slurm.""" + status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" + output, stderr = utils.run_bash_command(status_cmd) + if stderr: + raise SlurmJobError(f"Error: {stderr}") + return cast(str, output) + def _get_base_status_data(self) -> dict[str, Union[str, None]]: """Extract basic job status information from scontrol output.""" try: @@ -247,13 +261,23 @@ def _get_base_status_data(self) -> dict[str, Union[str, None]]: "failed_reason": None, } - def process_job_state(self) -> None: + def process_model_status(self) -> StatusResponse: """Process different job states and update status information.""" if self.status_info["state"] == ModelStatus.PENDING: self.process_pending_state() elif self.status_info["state"] == "RUNNING": self.process_running_state() + return StatusResponse( + slurm_job_id=self.slurm_job_id, + model_name=cast(str, self.status_info["model_name"]), + 
status=cast(ModelStatus, self.status_info["status"]), + raw_output=self.output, + base_url=self.status_info["base_url"], + pending_reason=self.status_info["pending_reason"], + failed_reason=self.status_info["failed_reason"], + ) + def check_model_health(self) -> None: """Check model health and update status accordingly.""" status, status_code = utils.model_health_check( @@ -317,11 +341,7 @@ def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): def _get_status_info(self) -> dict[str, Union[str, None]]: """Retrieve status info using existing StatusHelper.""" - status_cmd = f"scontrol show job {self.slurm_job_id} --oneliner" - output, stderr = utils.run_bash_command(status_cmd) - if stderr: - raise SlurmJobError(f"Error: {stderr}") - status_helper = ModelStatusMonitor(self.slurm_job_id, output, self.log_dir) + status_helper = ModelStatusMonitor(self.slurm_job_id, self.log_dir) return status_helper.status_info def _build_metrics_url(self) -> str: diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index fa2ffe5f..e2c1636c 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -30,7 +30,7 @@ ModelStatus, StatusResponse, ) -from vec_inf.client._utils import run_bash_command, shutdown_model +from vec_inf.client._utils import shutdown_model class VecInfClient: @@ -173,23 +173,8 @@ def get_status( Error if there was an error retrieving the status. """ try: - status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output, stderr = run_bash_command(status_cmd) - if stderr: - raise SlurmJobError(f"Error: {stderr}") - - model_status_monitor = ModelStatusMonitor(slurm_job_id, output, log_dir) - model_status_monitor.process_job_state() - - return StatusResponse( - slurm_job_id=slurm_job_id, - model_name=cast(str, model_status_monitor.status_info["model_name"]), - status=cast(ModelStatus, model_status_monitor.status_info["status"]), - base_url=model_status_monitor.status_info["base_url"], - pending_reason=model_status_monitor.status_info["pending_reason"], - failed_reason=model_status_monitor.status_info["failed_reason"], - raw_output=output, - ) + model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir) + return model_status_monitor.process_model_status() except SlurmJobError: raise except Exception as e: From 988040c997a172a404fd1fc0c4a43408702649ab Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 14:33:35 -0400 Subject: [PATCH 41/52] Add slurm ID to model launcher params to be dumped into json --- vec_inf/client/_helper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 2ced1f26..21d9ec24 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -205,6 +205,7 @@ def launch(self) -> LaunchResponse: # Extract slurm job id from command output self.slurm_job_id = command_output.split(" ")[-1].strip().strip("\n") + self.params["slurm_job_id"] = self.slurm_job_id # Create log directory and job json file job_json = Path( From 7722344507a2245bfadeb1da525d9f009f8f5635 Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 14:35:25 -0400 Subject: [PATCH 42/52] Refactor CLI to use client --- vec_inf/cli/_cli.py | 61 +++++++++++++++++------------------ vec_inf/cli/_helper.py | 72 +++++++++++++----------------------------- vec_inf/cli/_models.py | 15 +++++++++ 3 files changed, 66 insertions(+), 82 deletions(-) create mode 100644 vec_inf/cli/_models.py diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 6c50c24e..63278e48 100644 --- a/vec_inf/cli/_cli.py +++ 
b/vec_inf/cli/_cli.py @@ -7,12 +7,12 @@ from rich.console import Console from rich.live import Live -import vec_inf.shared._utils as utils +import vec_inf.client._utils as utils from vec_inf.cli._helper import ( - CLILaunchHelper, - CLIListHelper, - CLIMetricsHelper, - CLIStatusHelper, + CLIMetricsCollector, + CLIModelLauncher, + CLIModelRegistry, + CLIModelStatusMonitor, ) @@ -131,14 +131,15 @@ def launch( ) -> None: """Launch a model on the cluster.""" try: - launch_helper = CLILaunchHelper(model_name, cli_kwargs) - - launch_helper.set_env_vars() - launch_command = launch_helper.build_launch_command() - command_output, stderr = utils.run_bash_command(launch_command) - if stderr: - raise click.ClickException(f"Error: {stderr}") - launch_helper.post_launch_processing(command_output, CONSOLE) + model_launcher = CLIModelLauncher(model_name, cli_kwargs) + # Launch model inference server + model_launcher.launch() + # Display launch information + if cli_kwargs.get("json_mode"): + click.echo(model_launcher.params) + else: + launch_info_table = model_launcher.format_table_output() + CONSOLE.print(launch_info_table) except click.ClickException as e: raise e @@ -163,18 +164,14 @@ def status( ) -> None: """Get the status of a running model on the cluster.""" try: - status_cmd = f"scontrol show job {slurm_job_id} --oneliner" - output, stderr = utils.run_bash_command(status_cmd) - if stderr: - raise click.ClickException(f"Error: {stderr}") - - status_helper = CLIStatusHelper(slurm_job_id, output, log_dir) - - status_helper.process_job_state() + # Get model inference server status + model_status_monitor = CLIModelStatusMonitor(slurm_job_id, log_dir) + model_status_monitor.process_model_status() + # Display status information if json_mode: - status_helper.output_json() + model_status_monitor.output_json() else: - status_helper.output_table(CONSOLE) + model_status_monitor.output_table(CONSOLE) except click.ClickException as e: raise e @@ -200,8 +197,8 @@ def shutdown(slurm_job_id: int) -> None: def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: """List all available models, or get default setup of a specific model.""" try: - list_helper = CLIListHelper(json_mode) - list_helper.process_list_command(CONSOLE, model_name) + model_registry = CLIModelRegistry(json_mode) + model_registry.process_list_command(CONSOLE, model_name) except click.ClickException as e: raise e except Exception as e: @@ -216,28 +213,28 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: """Stream real-time performance metrics from the model endpoint.""" try: - metrics_helper = CLIMetricsHelper(slurm_job_id, log_dir) + metrics_collector = CLIMetricsCollector(slurm_job_id, log_dir) # Check if metrics URL is ready - if not metrics_helper.metrics_url.startswith("http"): + if not metrics_collector.metrics_url.startswith("http"): table = utils.create_table("Metric", "Value") - metrics_helper.display_failed_metrics( + metrics_collector.display_failed_metrics( table, - f"Metrics endpoint unavailable or server not ready - {metrics_helper.metrics_url}", + f"Metrics endpoint unavailable or server not ready - {metrics_collector.metrics_url}", ) CONSOLE.print(table) return with Live(refresh_per_second=1, console=CONSOLE) as live: while True: - metrics = metrics_helper.fetch_metrics() + metrics = metrics_collector.fetch_metrics() table = utils.create_table("Metric", "Value") if isinstance(metrics, str): # Show 
status information if metrics aren't available - metrics_helper.display_failed_metrics(table, metrics) + metrics_collector.display_failed_metrics(table, metrics) else: - metrics_helper.display_metrics(table, metrics) + metrics_collector.display_metrics(table, metrics) live.update(table) time.sleep(2) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index b2679205..8d2939a8 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -11,13 +11,19 @@ from rich.panel import Panel from rich.table import Table -import vec_inf.shared._utils as utils -from vec_inf.shared._config import ModelConfig -from vec_inf.shared._helper import LaunchHelper, ListHelper, MetricsHelper, StatusHelper +import vec_inf.client._utils as utils +from vec_inf.cli._models import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY +from vec_inf.client._config import ModelConfig +from vec_inf.client._helper import ( + ModelLauncher, + ModelRegistry, + ModelStatusMonitor, + PerformanceMetricsCollector, +) -class CLILaunchHelper(LaunchHelper): - """CLI Helper class for handling launch information.""" +class CLIModelLauncher(ModelLauncher): + """CLI Helper class for handling inference server launch.""" def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): super().__init__(model_name, kwargs) @@ -26,12 +32,12 @@ def _warn(self, message: str) -> None: """Warn the user about a potential issue.""" click.echo(click.style(f"Warning: {message}", fg="yellow"), err=True) - def _format_table_output(self, job_id: str) -> Table: + def format_table_output(self) -> Table: """Format output as rich Table.""" table = utils.create_table(key_title="Job Config", value_title="Value") # Add key information with consistent styling - table.add_row("Slurm Job ID", job_id, style="blue") + table.add_row("Slurm Job ID", self.slurm_job_id, style="blue") table.add_row("Job Name", self.model_name) # Add model details @@ -71,33 +77,12 @@ def _format_table_output(self, job_id: str) -> Table: return table - def post_launch_processing(self, output: str, console: Console) -> None: - """Process and display launch output.""" - json_mode = bool(self.kwargs.get("json_mode", False)) - slurm_job_id = output.split(" ")[-1].strip().strip("\n") - self.params["slurm_job_id"] = slurm_job_id - job_json = Path( - self.params["log_dir"], - f"{self.model_name}.{slurm_job_id}", - f"{self.model_name}.{slurm_job_id}.json", - ) - job_json.parent.mkdir(parents=True, exist_ok=True) - job_json.touch(exist_ok=True) - - with job_json.open("w") as file: - json.dump(self.params, file, indent=4) - if json_mode: - click.echo(self.params) - else: - table = self._format_table_output(slurm_job_id) - console.print(table) +class CLIModelStatusMonitor(ModelStatusMonitor): + """CLI Helper class for handling server status information and monitoring.""" -class CLIStatusHelper(StatusHelper): - """CLI Helper class for handling status information.""" - - def __init__(self, slurm_job_id: int, output: str, log_dir: Optional[str] = None): - super().__init__(slurm_job_id, output, log_dir) + def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): + super().__init__(slurm_job_id, log_dir) def output_json(self) -> None: """Format and output JSON data.""" @@ -127,7 +112,7 @@ def output_table(self, console: Console) -> None: console.print(table) -class CLIMetricsHelper(MetricsHelper): +class CLIMetricsCollector(PerformanceMetricsCollector): """CLI Helper class for streaming metrics information.""" def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): @@ 
-204,8 +189,8 @@ def display_metrics(self, table: Table, metrics: dict[str, float]) -> None: ) -class CLIListHelper(ListHelper): - """Helper class for handling model listing functionality.""" +class CLIModelRegistry(ModelRegistry): + """CLI Helper class for handling model listing functionality.""" def __init__(self, json_mode: bool = False): super().__init__() @@ -237,28 +222,15 @@ def format_all_models_output(self) -> Union[list[str], list[Panel]]: return [config.model_name for config in self.model_configs] # Sort by model type priority - type_priority = { - "LLM": 0, - "VLM": 1, - "Text_Embedding": 2, - "Reward_Modeling": 3, - } sorted_configs = sorted( self.model_configs, - key=lambda x: type_priority.get(x.model_type, 4), + key=lambda x: MODEL_TYPE_PRIORITY.get(x.model_type, 4), ) # Create panels with color coding - model_type_colors = { - "LLM": "cyan", - "VLM": "bright_blue", - "Text_Embedding": "purple", - "Reward_Modeling": "bright_magenta", - } - panels = [] for config in sorted_configs: - color = model_type_colors.get(config.model_type, "white") + color = MODEL_TYPE_COLORS.get(config.model_type, "white") variant = config.model_variant or "" display_text = f"[magenta]{config.model_family}[/magenta]" if variant: diff --git a/vec_inf/cli/_models.py b/vec_inf/cli/_models.py new file mode 100644 index 00000000..1b45df9f --- /dev/null +++ b/vec_inf/cli/_models.py @@ -0,0 +1,15 @@ +"""Data models for CLI rendering.""" + +MODEL_TYPE_PRIORITY = { + "LLM": 0, + "VLM": 1, + "Text_Embedding": 2, + "Reward_Modeling": 3, +} + +MODEL_TYPE_COLORS = { + "LLM": "cyan", + "VLM": "bright_blue", + "Text_Embedding": "purple", + "Reward_Modeling": "bright_magenta", +} From cfca24e1f58d0506ff88abe16d9e41d34179aa02 Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 14:35:39 -0400 Subject: [PATCH 43/52] Update tests --- tests/test_imports.py | 22 ++++++++-------------- tests/vec_inf/cli/test_cli.py | 26 +++++++++++++------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/tests/test_imports.py b/tests/test_imports.py index 2f91d9af..d450f6bf 100644 --- a/tests/test_imports.py +++ b/tests/test_imports.py @@ -11,25 +11,19 @@ class TestVecInfImports(unittest.TestCase): def test_imports(self): """Test that all modules can be imported.""" try: - # API imports - import vec_inf.api - import vec_inf.api._helper - import vec_inf.api._models - import vec_inf.api.client - # CLI imports import vec_inf.cli import vec_inf.cli._cli import vec_inf.cli._helper - # Shared imports - import vec_inf.shared - import vec_inf.shared._config - import vec_inf.shared._exceptions - import vec_inf.shared._helper - import vec_inf.shared._models - import vec_inf.shared._utils - import vec_inf.shared._vars # noqa: F401 + # Client imports + import vec_inf.client + import vec_inf.client._config + import vec_inf.client._exceptions + import vec_inf.client._helper + import vec_inf.client._models + import vec_inf.client._utils + import vec_inf.client._vars # noqa: F401 except ImportError as e: pytest.fail(f"Import failed: {e}") diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index 7d61f7fc..aef2295c 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -229,7 +229,7 @@ def base_patches(test_paths, mock_truediv, debug_helper): patch("pathlib.Path.iterdir", return_value=[]), # Mock empty directory listing patch("json.dump"), patch("pathlib.Path.touch"), - patch("vec_inf.shared._utils.Path", return_value=test_paths["weights_dir"]), + 
patch("vec_inf.client._utils.Path", return_value=test_paths["weights_dir"]), patch( "pathlib.Path.home", return_value=Path("/home/user") ), # Mock home directory @@ -251,7 +251,7 @@ def test_launch_command_success(runner, mock_launch_output, path_exists, debug_h test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -282,7 +282,7 @@ def test_launch_command_with_json_output( """Test JSON output format for launch command.""" test_log_dir = Path("/tmp/test_vec_inf_logs") with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.mkdir"), patch("builtins.open", debug_helper.tracked_mock_open), patch("pathlib.Path.open", debug_helper.tracked_mock_open), @@ -340,7 +340,7 @@ def test_launch_command_model_not_in_config_with_weights( for patch_obj in base_patches: stack.enter_context(patch_obj) # Apply specific patches for this test - mock_run = stack.enter_context(patch("vec_inf.shared._utils.run_bash_command")) + mock_run = stack.enter_context(patch("vec_inf.client._utils.run_bash_command")) stack.enter_context(patch("pathlib.Path.exists", new=custom_path_exists)) expected_job_id = "14933051" @@ -381,7 +381,7 @@ def custom_path_exists(p): # Mock Path to return the weights dir path stack.enter_context( - patch("vec_inf.shared._utils.Path", return_value=test_paths["weights_dir"]) + patch("vec_inf.client._utils.Path", return_value=test_paths["weights_dir"]) ) result = runner.invoke(cli, ["launch", "unknown-model"]) @@ -417,9 +417,9 @@ def test_metrics_command_pending_server( ): """Test metrics command when server is pending.""" with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.shared._utils.get_base_url", return_value="URL NOT FOUND"), + patch("vec_inf.client._utils.get_base_url", return_value="URL NOT FOUND"), ): job_id = 12345 mock_run.return_value = (mock_status_output(job_id, "PENDING"), "") @@ -441,9 +441,9 @@ def test_metrics_command_server_not_ready( ): """Test metrics command when server is running but not ready.""" with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.shared._utils.get_base_url", return_value="Server not ready"), + patch("vec_inf.client._utils.get_base_url", return_value="Server not ready"), ): job_id = 12345 mock_run.return_value = (mock_status_output(job_id, "RUNNING"), "") @@ -481,9 +481,9 @@ def test_metrics_command_server_ready( mock_response.status_code = 200 with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.shared._utils.get_base_url", return_value="http://test:8000/v1"), + patch("vec_inf.client._utils.get_base_url", return_value="http://test:8000/v1"), patch("time.sleep", side_effect=KeyboardInterrupt), # Break the infinite loop ): job_id = 12345 @@ -507,9 +507,9 @@ def test_metrics_command_request_failed( mock_get.side_effect = requests.exceptions.RequestException("Connection 
refused") with ( - patch("vec_inf.shared._utils.run_bash_command") as mock_run, + patch("vec_inf.client._utils.run_bash_command") as mock_run, patch("pathlib.Path.exists", new=path_exists), - patch("vec_inf.shared._utils.get_base_url", return_value="http://test:8000/v1"), + patch("vec_inf.client._utils.get_base_url", return_value="http://test:8000/v1"), patch("time.sleep", side_effect=KeyboardInterrupt), # Break the infinite loop ): job_id = 12345 From e39b99549403078d66861b825a904c6e95ad2a48 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Apr 2025 18:35:55 +0000 Subject: [PATCH 44/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- vec_inf/cli/_helper.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 8d2939a8..0a77e51a 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,8 +1,6 @@ """Helper classes for the CLI.""" -import json import os -from pathlib import Path from typing import Any, Optional, Union import click From 72af8b98ea93c4e0dede1b8dea0a97111975d18d Mon Sep 17 00:00:00 2001 From: XkunW Date: Tue, 8 Apr 2025 14:40:20 -0400 Subject: [PATCH 45/52] Remove redundant casts, seems like my local mypy was acting up --- vec_inf/client/_helper.py | 4 ++-- vec_inf/client/api.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 21d9ec24..b14b4cd3 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -104,7 +104,7 @@ def _get_model_configuration(self) -> ModelConfig: def _get_launch_params(self) -> dict[str, Any]: """Merge config defaults with CLI overrides.""" - params = cast(dict[str, Any], self.model_config.model_dump()) + params = self.model_config.model_dump() # Process boolean fields for bool_field in BOOLEAN_FIELDS: @@ -242,7 +242,7 @@ def _get_raw_status_output(self) -> str: output, stderr = utils.run_bash_command(status_cmd) if stderr: raise SlurmJobError(f"Error: {stderr}") - return cast(str, output) + return output def _get_base_status_data(self) -> dict[str, Union[str, None]]: """Extract basic job status information from scontrol output.""" diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index e2c1636c..e9b55c7d 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -73,7 +73,7 @@ def list_models(self) -> list[ModelInfo]: """ try: model_registry = ModelRegistry() - return cast(list[ModelInfo], model_registry.get_all_models()) + return model_registry.get_all_models() except Exception as e: raise APIError(f"Failed to list models: {str(e)}") from e From 8e2f8e63fde4cc6f220e5550ab1301cc5d708672 Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 9 Apr 2025 11:50:16 -0400 Subject: [PATCH 46/52] Refactoring client for CLI use --- vec_inf/client/_helper.py | 67 ++++++++++++++++++--------------------- vec_inf/client/_models.py | 9 +++--- vec_inf/client/api.py | 21 +++--------- 3 files changed, 39 insertions(+), 58 deletions(-) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index b14b4cd3..d7928901 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -244,42 +244,26 @@ def _get_raw_status_output(self) -> str: raise SlurmJobError(f"Error: {stderr}") return output - def _get_base_status_data(self) -> dict[str, Union[str, None]]: + def _get_base_status_data(self) -> StatusResponse: """Extract basic job status information from 
scontrol output.""" try: job_name = self.output.split(" ")[1].split("=")[1] job_state = self.output.split(" ")[9].split("=")[1] except IndexError: - job_name = ModelStatus.UNAVAILABLE + job_name = "UNAVAILABLE" job_state = ModelStatus.UNAVAILABLE - return { - "model_name": job_name, - "status": ModelStatus.UNAVAILABLE, - "base_url": ModelStatus.UNAVAILABLE, - "state": job_state, - "pending_reason": None, - "failed_reason": None, - } - - def process_model_status(self) -> StatusResponse: - """Process different job states and update status information.""" - if self.status_info["state"] == ModelStatus.PENDING: - self.process_pending_state() - elif self.status_info["state"] == "RUNNING": - self.process_running_state() - return StatusResponse( - slurm_job_id=self.slurm_job_id, - model_name=cast(str, self.status_info["model_name"]), - status=cast(ModelStatus, self.status_info["status"]), + model_name=job_name, + server_status=ModelStatus.UNAVAILABLE, + job_state=job_state, raw_output=self.output, - base_url=self.status_info["base_url"], - pending_reason=self.status_info["pending_reason"], - failed_reason=self.status_info["failed_reason"], + base_url="UNAVAILABLE", + pending_reason=None, + failed_reason=None, ) - def check_model_health(self) -> None: + def _check_model_health(self) -> None: """Check model health and update status accordingly.""" status, status_code = utils.model_health_check( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir @@ -290,40 +274,49 @@ def check_model_health(self) -> None: self.slurm_job_id, self.log_dir, ) - self.status_info["status"] = status + self.status_info["server_status"] = status else: - self.status_info["status"], self.status_info["failed_reason"] = ( + self.status_info["server_status"], self.status_info["failed_reason"] = ( status, cast(str, status_code), ) - def process_running_state(self) -> None: + def _process_running_state(self) -> None: """Process RUNNING job state and check server status.""" server_status = utils.is_server_running( cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir ) if isinstance(server_status, tuple): - self.status_info["status"], self.status_info["failed_reason"] = ( + self.status_info["server_status"], self.status_info["failed_reason"] = ( server_status ) return if server_status == "RUNNING": - self.check_model_health() + self._check_model_health() else: - self.status_info["status"] = server_status + self.status_info["server_status"] = server_status - def process_pending_state(self) -> None: + def _process_pending_state(self) -> None: """Process PENDING job state.""" try: self.status_info["pending_reason"] = self.output.split(" ")[10].split("=")[ 1 ] - self.status_info["status"] = ModelStatus.PENDING + self.status_info["server_status"] = ModelStatus.PENDING except IndexError: self.status_info["pending_reason"] = "Unknown pending reason" + def process_model_status(self) -> StatusResponse: + """Process different job states and update status information.""" + if self.status_info["job_state"] == ModelStatus.PENDING: + self._process_pending_state() + elif self.status_info["job_state"] == "RUNNING": + self._process_running_state() + + return self.status_info + class PerformanceMetricsCollector: """Class for handling metrics collection and processing.""" @@ -340,18 +333,18 @@ def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): self._last_updated: Optional[float] = None self._last_throughputs = {"prompt": 0.0, "generation": 0.0} - def _get_status_info(self) -> dict[str, 
Union[str, None]]: + def _get_status_info(self) -> StatusResponse: """Retrieve status info using existing StatusHelper.""" status_helper = ModelStatusMonitor(self.slurm_job_id, self.log_dir) - return status_helper.status_info + return status_helper.process_model_status() def _build_metrics_url(self) -> str: """Construct metrics endpoint URL from base URL with version stripping.""" - if self.status_info.get("state") == "PENDING": + if self.status_info.job_state == ModelStatus.PENDING: return "Pending resources for server initialization" base_url = utils.get_base_url( - cast(str, self.status_info["model_name"]), + self.status_info.model_name, self.slurm_job_id, self.log_dir, ) diff --git a/vec_inf/client/_models.py b/vec_inf/client/_models.py index 6dfce8e9..f53e6773 100644 --- a/vec_inf/client/_models.py +++ b/vec_inf/client/_models.py @@ -7,7 +7,7 @@ from dataclasses import dataclass, field from enum import Enum -from typing import Any, Optional, TypedDict +from typing import Any, Optional, TypedDict, Union from typing_extensions import NotRequired @@ -46,9 +46,9 @@ class LaunchResponse: class StatusResponse: """Response from checking a model's status.""" - slurm_job_id: int model_name: str - status: ModelStatus + server_status: ModelStatus + job_state: Union[str, ModelStatus] raw_output: str = field(repr=False) base_url: Optional[str] = None pending_reason: Optional[str] = None @@ -59,9 +59,8 @@ class StatusResponse: class MetricsResponse: """Response from retrieving model metrics.""" - slurm_job_id: int model_name: str - metrics: dict[str, float] + metrics: Union[dict[str, float], str] timestamp: float diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index e9b55c7d..2dad5169 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -5,9 +5,7 @@ """ import time -from typing import Any, Optional, cast - -import requests +from typing import Any, Optional from vec_inf.client._config import ModelConfig from vec_inf.client._exceptions import ( @@ -138,7 +136,6 @@ def launch_model( # Create and use the API Launch Helper model_launcher = ModelLauncher(model_name, options_dict) - return model_launcher.launch() except ValueError as e: @@ -211,20 +208,12 @@ def get_metrics( ) if not performance_metrics_collector.metrics_url.startswith("http"): - raise ServerError( - f"Metrics endpoint unavailable or server not ready - {performance_metrics_collector.metrics_url}" - ) - - metrics = performance_metrics_collector.fetch_metrics() - - if isinstance(metrics, str): - raise requests.RequestException(metrics) + metrics = performance_metrics_collector.metrics_url + else: + metrics = performance_metrics_collector.fetch_metrics() return MetricsResponse( - slurm_job_id=slurm_job_id, - model_name=cast( - str, performance_metrics_collector.status_info["model_name"] - ), + model_name=performance_metrics_collector.status_info.model_name, metrics=metrics, timestamp=time.time(), ) From 3e5e5ad1e65652ab33cd72a7a83d1c80e2b8fd98 Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 9 Apr 2025 15:38:37 -0400 Subject: [PATCH 47/52] Fix wrong var names and data access for client, removed unnecessary try excepts in api.py, update client tests accordingly --- tests/vec_inf/client/test_api.py | 6 +- tests/vec_inf/client/test_models.py | 6 +- vec_inf/client/_helper.py | 114 ++++++++++----------- vec_inf/client/_models.py | 4 +- vec_inf/client/_utils.py | 12 +-- vec_inf/client/api.py | 147 ++++++++-------------------- 6 files changed, 108 insertions(+), 181 deletions(-) diff --git a/tests/vec_inf/client/test_api.py 
b/tests/vec_inf/client/test_api.py index 43bb5857..74dc3980 100644 --- a/tests/vec_inf/client/test_api.py +++ b/tests/vec_inf/client/test_api.py @@ -113,10 +113,10 @@ def test_wait_until_ready(): with patch.object(VecInfClient, "get_status") as mock_status: # First call returns LAUNCHING, second call returns READY status1 = MagicMock() - status1.status = ModelStatus.LAUNCHING + status1.server_status = ModelStatus.LAUNCHING status2 = MagicMock() - status2.status = ModelStatus.READY + status2.server_status = ModelStatus.READY status2.base_url = "http://gpu123:8080/v1" mock_status.side_effect = [status1, status2] @@ -125,6 +125,6 @@ def test_wait_until_ready(): client = VecInfClient() result = client.wait_until_ready("12345678", timeout_seconds=5) - assert result.status == ModelStatus.READY + assert result.server_status == ModelStatus.READY assert result.base_url == "http://gpu123:8080/v1" assert mock_status.call_count == 2 diff --git a/tests/vec_inf/client/test_models.py b/tests/vec_inf/client/test_models.py index bbcaacda..fd4a0a5e 100644 --- a/tests/vec_inf/client/test_models.py +++ b/tests/vec_inf/client/test_models.py @@ -10,14 +10,14 @@ def test_model_info_creation(): family="test-family", variant="test-variant", type=ModelType.LLM, - config={"num_gpus": 1}, + config={"gpus_per_node": 1}, ) assert model.name == "test-model" assert model.family == "test-family" assert model.variant == "test-variant" assert model.type == ModelType.LLM - assert model.config["num_gpus"] == 1 + assert model.config["gpus_per_node"] == 1 def test_model_info_optional_fields(): @@ -40,7 +40,7 @@ def test_launch_options_default_values(): """Test LaunchOptions with default values.""" options = LaunchOptions() - assert options.num_gpus is None + assert options.gpus_per_node is None assert options.partition is None assert options.data_type is None assert options.num_nodes is None diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index d7928901..f4d91353 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -74,9 +74,7 @@ def _get_model_configuration(self) -> ModelConfig: ) if not model_weights_parent_dir: - raise ValueError( - f"Could not determine model_weights_parent_dir and '{self.model_name}' not found in configuration" - ) + raise ModelNotFoundError("Could not determine model weights parent directory") model_weights_path = Path(model_weights_parent_dir, self.model_name) @@ -266,53 +264,47 @@ def _get_base_status_data(self) -> StatusResponse: def _check_model_health(self) -> None: """Check model health and update status accordingly.""" status, status_code = utils.model_health_check( - cast(str, self.status_info["model_name"]), self.slurm_job_id, self.log_dir + self.status_info.model_name, self.slurm_job_id, self.log_dir ) if status == ModelStatus.READY: - self.status_info["base_url"] = utils.get_base_url( - cast(str, self.status_info["model_name"]), + self.status_info.base_url = utils.get_base_url( + self.status_info.model_name, self.slurm_job_id, self.log_dir, ) - self.status_info["server_status"] = status + self.status_info.server_status = status else: - self.status_info["server_status"], self.status_info["failed_reason"] = ( - status, - cast(str, status_code), - ) + self.status_info.server_status = status + self.status_info.failed_reason = cast(str, status_code) def _process_running_state(self) -> None: """Process RUNNING job state and check server status.""" server_status = utils.is_server_running( - cast(str, self.status_info["model_name"]), self.slurm_job_id, 
self.log_dir + self.status_info.model_name, self.slurm_job_id, self.log_dir ) if isinstance(server_status, tuple): - self.status_info["server_status"], self.status_info["failed_reason"] = ( - server_status - ) + self.status_info.server_status, self.status_info.failed_reason = server_status return if server_status == "RUNNING": self._check_model_health() else: - self.status_info["server_status"] = server_status + self.status_info.server_status = server_status def _process_pending_state(self) -> None: """Process PENDING job state.""" try: - self.status_info["pending_reason"] = self.output.split(" ")[10].split("=")[ - 1 - ] - self.status_info["server_status"] = ModelStatus.PENDING + self.status_info.pending_reason = self.output.split(" ")[10].split("=")[1] + self.status_info.server_status = ModelStatus.PENDING except IndexError: - self.status_info["pending_reason"] = "Unknown pending reason" + self.status_info.pending_reason = "Unknown pending reason" def process_model_status(self) -> StatusResponse: """Process different job states and update status information.""" - if self.status_info["job_state"] == ModelStatus.PENDING: + if self.status_info.job_state == ModelStatus.PENDING: self._process_pending_state() - elif self.status_info["job_state"] == "RUNNING": + elif self.status_info.job_state == "RUNNING": self._process_running_state() return self.status_info @@ -360,7 +352,7 @@ def _build_metrics_url(self) -> str: def _check_prefix_caching(self) -> bool: """Check if prefix caching is enabled.""" job_json = utils.read_slurm_log( - cast(str, self.status_info["model_name"]), + self.status_info.model_name, self.slurm_job_id, "json", self.log_dir, @@ -369,6 +361,43 @@ def _check_prefix_caching(self) -> bool: return False return bool(cast(dict[str, str], job_json).get("enable_prefix_caching", False)) + def _parse_metrics(self, metrics_text: str) -> dict[str, float]: + """Parse metrics with latency count and sum.""" + key_metrics = { + "vllm:prompt_tokens_total": "total_prompt_tokens", + "vllm:generation_tokens_total": "total_generation_tokens", + "vllm:e2e_request_latency_seconds_sum": "request_latency_sum", + "vllm:e2e_request_latency_seconds_count": "request_latency_count", + "vllm:request_queue_time_seconds_sum": "queue_time_sum", + "vllm:request_success_total": "successful_requests_total", + "vllm:num_requests_running": "requests_running", + "vllm:num_requests_waiting": "requests_waiting", + "vllm:num_requests_swapped": "requests_swapped", + "vllm:gpu_cache_usage_perc": "gpu_cache_usage", + "vllm:cpu_cache_usage_perc": "cpu_cache_usage", + } + + if self.enabled_prefix_caching: + key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate" + key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate" + + parsed: dict[str, float] = {} + for line in metrics_text.split("\n"): + if line.startswith("#") or not line.strip(): + continue + + parts = line.split() + if len(parts) < 2: + continue + + metric_name = parts[0].split("{")[0] + if metric_name in key_metrics: + try: + parsed[key_metrics[metric_name]] = float(parts[1]) + except (ValueError, IndexError): + continue + return parsed + def fetch_metrics(self) -> Union[dict[str, float], str]: """Fetch metrics from the endpoint.""" try: @@ -443,43 +472,6 @@ def fetch_metrics(self) -> Union[dict[str, float], str]: except requests.RequestException as e: return f"Metrics request failed, `metrics` endpoint might not be ready yet: {str(e)}" - def _parse_metrics(self, metrics_text: str) -> dict[str, float]: - """Parse metrics 
with latency count and sum.""" - key_metrics = { - "vllm:prompt_tokens_total": "total_prompt_tokens", - "vllm:generation_tokens_total": "total_generation_tokens", - "vllm:e2e_request_latency_seconds_sum": "request_latency_sum", - "vllm:e2e_request_latency_seconds_count": "request_latency_count", - "vllm:request_queue_time_seconds_sum": "queue_time_sum", - "vllm:request_success_total": "successful_requests_total", - "vllm:num_requests_running": "requests_running", - "vllm:num_requests_waiting": "requests_waiting", - "vllm:num_requests_swapped": "requests_swapped", - "vllm:gpu_cache_usage_perc": "gpu_cache_usage", - "vllm:cpu_cache_usage_perc": "cpu_cache_usage", - } - - if self.enabled_prefix_caching: - key_metrics["vllm:gpu_prefix_cache_hit_rate"] = "gpu_prefix_cache_hit_rate" - key_metrics["vllm:cpu_prefix_cache_hit_rate"] = "cpu_prefix_cache_hit_rate" - - parsed: dict[str, float] = {} - for line in metrics_text.split("\n"): - if line.startswith("#") or not line.strip(): - continue - - parts = line.split() - if len(parts) < 2: - continue - - metric_name = parts[0].split("{")[0] - if metric_name in key_metrics: - try: - parsed[key_metrics[metric_name]] = float(parts[1]) - except (ValueError, IndexError): - continue - return parsed - class ModelRegistry: """Class for handling model listing and configuration management.""" diff --git a/vec_inf/client/_models.py b/vec_inf/client/_models.py index f53e6773..df78d9e5 100644 --- a/vec_inf/client/_models.py +++ b/vec_inf/client/_models.py @@ -78,7 +78,7 @@ class LaunchOptions: max_num_batched_tokens: Optional[int] = None partition: Optional[str] = None num_nodes: Optional[int] = None - num_gpus: Optional[int] = None + gpus_per_node: Optional[int] = None qos: Optional[str] = None time: Optional[str] = None vocab_size: Optional[int] = None @@ -104,7 +104,7 @@ class LaunchOptionsDict(TypedDict): max_num_batched_tokens: NotRequired[Optional[int]] partition: NotRequired[Optional[str]] num_nodes: NotRequired[Optional[int]] - num_gpus: NotRequired[Optional[int]] + gpus_per_node: NotRequired[Optional[int]] qos: NotRequired[Optional[str]] time: NotRequired[Optional[str]] vocab_size: NotRequired[Optional[int]] diff --git a/vec_inf/client/_utils.py b/vec_inf/client/_utils.py index f115848c..c2e3ef6b 100644 --- a/vec_inf/client/_utils.py +++ b/vec_inf/client/_utils.py @@ -3,6 +3,7 @@ import json import os import subprocess +import warnings from pathlib import Path from typing import Any, Optional, Union, cast @@ -151,8 +152,9 @@ def load_config() -> list[ModelConfig]: else: config.setdefault("models", {})[name] = data else: - print( - f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}" + warnings.warn( + f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}", UserWarning, + stacklevel=2 ) return [ @@ -161,12 +163,6 @@ def load_config() -> list[ModelConfig]: ] -def shutdown_model(slurm_job_id: int) -> None: - """Shutdown a running model on the cluster.""" - shutdown_cmd = f"scancel {slurm_job_id}" - run_bash_command(shutdown_cmd) - - def parse_launch_output(output: str) -> tuple[str, dict[str, str]]: """Parse output from model launch command. 
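Taken together, the client changes in this series support an end-to-end flow roughly like the sketch below. It is illustrative only: the model name, option values, and timeout are placeholders rather than anything defined in this patch, and the calls simply mirror the VecInfClient methods touched here (launch_model, wait_until_ready, get_metrics, shutdown_model).

    from vec_inf.client import LaunchOptions, ModelStatus, VecInfClient

    client = VecInfClient()

    # Pick any configured model; available names come from client.list_models().
    model_name = "example-model"  # placeholder, not a real config entry

    # Optional per-launch overrides (placeholder values).
    options = LaunchOptions(gpus_per_node=1, num_nodes=1)
    launch = client.launch_model(model_name, options)

    # Blocks until the server reports READY; raises ServerError on FAILED or SHUTDOWN.
    status = client.wait_until_ready(launch.slurm_job_id, timeout_seconds=1800)
    print(f"Serving at {status.base_url}")

    # metrics is a dict of floats once the endpoint is up, otherwise a status string.
    metrics = client.get_metrics(launch.slurm_job_id).metrics

    # Cancel the underlying Slurm job when done.
    client.shutdown_model(launch.slurm_job_id)

This is roughly the surface the CLI itself is moved onto in the later commits of this series, minus the Rich table rendering.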
diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index 2dad5169..4a70b9f3 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -9,8 +9,6 @@ from vec_inf.client._config import ModelConfig from vec_inf.client._exceptions import ( - APIError, - ModelNotFoundError, ServerError, SlurmJobError, ) @@ -28,7 +26,7 @@ ModelStatus, StatusResponse, ) -from vec_inf.client._utils import shutdown_model +from vec_inf.client._utils import run_bash_command class VecInfClient: @@ -62,18 +60,9 @@ def list_models(self) -> list[ModelInfo]: ------- list[ModelInfo] ModelInfo objects containing information about available models. - - Raises - ------ - APIError - If there was an error retrieving model information. - """ - try: - model_registry = ModelRegistry() - return model_registry.get_all_models() - except Exception as e: - raise APIError(f"Failed to list models: {str(e)}") from e + model_registry = ModelRegistry() + return model_registry.get_all_models() def get_model_config(self, model_name: str) -> ModelConfig: """Get the configuration for a specific model. @@ -87,22 +76,9 @@ def get_model_config(self, model_name: str) -> ModelConfig: ------- ModelConfig Model configuration. - - Raises - ------ - ModelNotFoundError - Error if the specified model is not found. - APIError - Error if there was an error retrieving the model configuration. - """ - try: - model_registry = ModelRegistry() - return model_registry.get_single_model_config(model_name) - except ModelNotFoundError: - raise - except Exception as e: - raise APIError(f"Failed to get model configuration: {str(e)}") from e + model_registry = ModelRegistry() + return model_registry.get_single_model_config(model_name) def launch_model( self, model_name: str, options: Optional[LaunchOptions] = None @@ -120,30 +96,15 @@ def launch_model( ------- LaunchResponse Information about the launched model. - - Raises - ------ - ModelNotFoundError - Error if the specified model is not found. - APIError - Error if there was an error launching the model. """ - try: - # Convert LaunchOptions to dictionary if provided - options_dict: dict[str, Any] = {} - if options: - options_dict = {k: v for k, v in vars(options).items() if v is not None} - - # Create and use the API Launch Helper - model_launcher = ModelLauncher(model_name, options_dict) - return model_launcher.launch() - - except ValueError as e: - if "not found in configuration" in str(e): - raise ModelNotFoundError(str(e)) from e - raise APIError(f"Failed to launch model: {str(e)}") from e - except Exception as e: - raise APIError(f"Failed to launch model: {str(e)}") from e + # Convert LaunchOptions to dictionary if provided + options_dict: dict[str, Any] = {} + if options: + options_dict = {k: v for k, v in vars(options).items() if v is not None} + + # Create and use the API Launch Helper + model_launcher = ModelLauncher(model_name, options_dict) + return model_launcher.launch() def get_status( self, slurm_job_id: int, log_dir: Optional[str] = None @@ -161,21 +122,9 @@ def get_status( ------- StatusResponse Model status information. - - Raises - ------ - SlurmJobError - Error if the specified job is not found or there's an error with the job. - APIError - Error if there was an error retrieving the status. 
""" - try: - model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir) - return model_status_monitor.process_model_status() - except SlurmJobError: - raise - except Exception as e: - raise APIError(f"Failed to get status: {str(e)}") from e + model_status_monitor = ModelStatusMonitor(slurm_job_id, log_dir) + return model_status_monitor.process_model_status() def get_metrics( self, slurm_job_id: int, log_dir: Optional[str] = None @@ -193,34 +142,21 @@ def get_metrics( ------- MetricsResponse Object containing the model's performance metrics. + """ + performance_metrics_collector = PerformanceMetricsCollector( + slurm_job_id, log_dir + ) - Raises - ------ - SlurmJobError - If the specified job is not found or there's an error with the job. - APIError - If there was an error retrieving the metrics. + if not performance_metrics_collector.metrics_url.startswith("http"): + metrics = performance_metrics_collector.metrics_url + else: + metrics = performance_metrics_collector.fetch_metrics() - """ - try: - performance_metrics_collector = PerformanceMetricsCollector( - slurm_job_id, log_dir - ) - - if not performance_metrics_collector.metrics_url.startswith("http"): - metrics = performance_metrics_collector.metrics_url - else: - metrics = performance_metrics_collector.fetch_metrics() - - return MetricsResponse( - model_name=performance_metrics_collector.status_info.model_name, - metrics=metrics, - timestamp=time.time(), - ) - except SlurmJobError: - raise - except Exception as e: - raise APIError(f"Failed to get metrics: {str(e)}") from e + return MetricsResponse( + model_name=performance_metrics_collector.status_info.model_name, + metrics=metrics, + timestamp=time.time(), + ) def shutdown_model(self, slurm_job_id: int) -> bool: """Shutdown a running model. @@ -232,17 +168,20 @@ def shutdown_model(self, slurm_job_id: int) -> bool: Returns ------- + bool True if the model was successfully shutdown, False otherwise. Raises ------ - APIError: If there was an error shutting down the model. + SlurmJobError + If there was an error shutting down the model. 
""" - try: - shutdown_model(slurm_job_id) - return True - except Exception as e: - raise APIError(f"Failed to shutdown model: {str(e)}") from e + shutdown_cmd = f"scancel {slurm_job_id}" + _, stderr = run_bash_command(shutdown_cmd) + if stderr: + raise SlurmJobError(f"Failed to shutdown model: {stderr}") + return True + def wait_until_ready( self, @@ -282,16 +221,16 @@ def wait_until_ready( start_time = time.time() while True: - status = self.get_status(slurm_job_id, log_dir) + status_info = self.get_status(slurm_job_id, log_dir) - if status.status == ModelStatus.READY: - return status + if status_info.server_status == ModelStatus.READY: + return status_info - if status.status == ModelStatus.FAILED: - error_message = status.failed_reason or "Unknown error" + if status_info.server_status == ModelStatus.FAILED: + error_message = status_info.failed_reason or "Unknown error" raise ServerError(f"Model failed to start: {error_message}") - if status.status == ModelStatus.SHUTDOWN: + if status_info.server_status == ModelStatus.SHUTDOWN: raise ServerError("Model was shutdown before it became ready") # Check timeout From ed0a5ddf22d5bc03e2cfe89f3e1dfc7580295784 Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 9 Apr 2025 15:39:22 -0400 Subject: [PATCH 48/52] Refactor CLI logic to use client instead of inheriting client helper classes --- tests/vec_inf/cli/test_cli.py | 50 +++++--- vec_inf/cli/_cli.py | 79 +++++++----- vec_inf/cli/_helper.py | 221 ++++++++++++++++------------------ 3 files changed, 188 insertions(+), 162 deletions(-) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index aef2295c..4b19f9db 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -322,6 +322,25 @@ def test_launch_command_with_json_output( assert str(test_log_dir) in output.get("log_dir", "") +def test_launch_command_no_model_weights_parent_dir(runner, debug_helper, base_patches): + """Test handling when model weights parent dir is not set.""" + with ExitStack() as stack: + # Apply all base patches + for patch_obj in base_patches: + stack.enter_context(patch_obj) + + # Mock load_config to return empty list + stack.enter_context( + patch("vec_inf.client._utils.load_config", return_value=[]) + ) + + result = runner.invoke(cli, ["launch", "test-model"]) + debug_helper.print_debug_info(result) + + assert result.exit_code == 1 + assert "Could not determine model weights parent directory" in result.output + + def test_launch_command_model_not_in_config_with_weights( runner, mock_launch_output, path_exists, debug_helper, test_paths, base_patches ): @@ -346,18 +365,19 @@ def test_launch_command_model_not_in_config_with_weights( expected_job_id = "14933051" mock_run.return_value = mock_launch_output(expected_job_id) - result = runner.invoke(cli, ["launch", "unknown-model"]) - debug_helper.print_debug_info(result) + with pytest.warns(UserWarning) as record: + result = runner.invoke(cli, ["launch", "unknown-model"]) + debug_helper.print_debug_info(result) - assert result.exit_code == 1 - assert ( - "Could not determine model_weights_parent_dir and 'unknown-model' not found in configuration" - in result.output + assert result.exit_code == 0 + assert len(record) == 1 + assert str(record[0].message) == ( + "Warning: 'unknown-model' configuration not found in config, please ensure model configuration are properly set in command arguments" ) def test_launch_command_model_not_found( - runner, path_exists, debug_helper, test_paths, base_patches + runner, debug_helper, test_paths, 
base_patches ): """Test handling of a model that's neither in config nor has weights.""" @@ -389,7 +409,8 @@ def custom_path_exists(p): assert result.exit_code == 1 assert ( - "Could not determine model_weights_parent_dir and 'unknown-model' not found in configuration" + "'unknown-model' not found in configuration and model weights " + "not found at expected path '/model-weights/unknown-model'" in result.output ) @@ -428,10 +449,9 @@ def test_metrics_command_pending_server( debug_helper.print_debug_info(result) assert result.exit_code == 0 - assert "Server State" in result.output - assert "PENDING" in result.output + assert "ERROR" in result.output assert ( - "Metrics endpoint unavailable or server not ready - Pending" + "Pending resources for server initialization" in result.output ) @@ -452,10 +472,9 @@ def test_metrics_command_server_not_ready( debug_helper.print_debug_info(result) assert result.exit_code == 0 - assert "Server State" in result.output - assert "RUNNING" in result.output + assert "ERROR" in result.output assert ( - "Metrics endpoint unavailable or server not ready - Server not" + "Server not ready" in result.output ) @@ -519,8 +538,7 @@ def test_metrics_command_request_failed( debug_helper.print_debug_info(result) # KeyboardInterrupt is expected and ok - assert "Server State" in result.output - assert "RUNNING" in result.output + assert "ERROR" in result.output assert ( "Metrics request failed, `metrics` endpoint might not be ready" in result.output diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index 63278e48..e9f52947 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -9,11 +9,13 @@ import vec_inf.client._utils as utils from vec_inf.cli._helper import ( - CLIMetricsCollector, - CLIModelLauncher, - CLIModelRegistry, - CLIModelStatusMonitor, + LaunchResponseFormatter, + ListCmdDisplay, + MetricsResponseFormatter, + StatusResponseFormatter, ) +from vec_inf.client._models import LaunchOptions +from vec_inf.client.api import VecInfClient CONSOLE = Console() @@ -131,14 +133,19 @@ def launch( ) -> None: """Launch a model on the cluster.""" try: - model_launcher = CLIModelLauncher(model_name, cli_kwargs) - # Launch model inference server - model_launcher.launch() + # Convert cli_kwargs to LaunchOptions + launch_options = LaunchOptions(**{k: v for k, v in cli_kwargs.items() if k != "json_mode"}) + + # Start the client and launch model inference server + client = VecInfClient() + launch_response = client.launch_model(model_name, launch_options) + # Display launch information + launch_formatter = LaunchResponseFormatter(model_name, launch_response.config) if cli_kwargs.get("json_mode"): - click.echo(model_launcher.params) + click.echo(launch_response.config) else: - launch_info_table = model_launcher.format_table_output() + launch_info_table = launch_formatter.format_table_output() CONSOLE.print(launch_info_table) except click.ClickException as e: @@ -164,14 +171,16 @@ def status( ) -> None: """Get the status of a running model on the cluster.""" try: - # Get model inference server status - model_status_monitor = CLIModelStatusMonitor(slurm_job_id, log_dir) - model_status_monitor.process_model_status() + # Start the client and get model inference server status + client = VecInfClient() + status_response = client.get_status(slurm_job_id, log_dir) # Display status information + status_formatter = StatusResponseFormatter(status_response) if json_mode: - model_status_monitor.output_json() + status_formatter.output_json() else: - 
model_status_monitor.output_table(CONSOLE) + status_info_table = status_formatter.output_table() + CONSOLE.print(status_info_table) except click.ClickException as e: raise e @@ -197,8 +206,15 @@ def shutdown(slurm_job_id: int) -> None: def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> None: """List all available models, or get default setup of a specific model.""" try: - model_registry = CLIModelRegistry(json_mode) - model_registry.process_list_command(CONSOLE, model_name) + # Start the client + client = VecInfClient() + list_display = ListCmdDisplay(CONSOLE, json_mode) + if model_name: + model_config = client.get_model_config(model_name) + list_display.display_single_model_output(model_config) + else: + model_infos = client.list_models() + list_display.display_all_models_output(model_infos) except click.ClickException as e: raise e except Exception as e: @@ -213,30 +229,29 @@ def list_models(model_name: Optional[str] = None, json_mode: bool = False) -> No def metrics(slurm_job_id: int, log_dir: Optional[str] = None) -> None: """Stream real-time performance metrics from the model endpoint.""" try: - metrics_collector = CLIMetricsCollector(slurm_job_id, log_dir) - - # Check if metrics URL is ready - if not metrics_collector.metrics_url.startswith("http"): - table = utils.create_table("Metric", "Value") - metrics_collector.display_failed_metrics( - table, - f"Metrics endpoint unavailable or server not ready - {metrics_collector.metrics_url}", - ) - CONSOLE.print(table) + # Start the client and get inference server metrics + client = VecInfClient() + metrics_response = client.get_metrics(slurm_job_id, log_dir) + metrics_formatter = MetricsResponseFormatter(metrics_response.metrics) + + # Check if metrics response is ready + if isinstance(metrics_response.metrics, str): + metrics_formatter.format_failed_metrics(metrics_response.metrics) + CONSOLE.print(metrics_formatter.table) return with Live(refresh_per_second=1, console=CONSOLE) as live: while True: - metrics = metrics_collector.fetch_metrics() - table = utils.create_table("Metric", "Value") + metrics_response = client.get_metrics(slurm_job_id, log_dir) + metrics_formatter = MetricsResponseFormatter(metrics_response.metrics) - if isinstance(metrics, str): + if isinstance(metrics_response.metrics, str): # Show status information if metrics aren't available - metrics_collector.display_failed_metrics(table, metrics) + metrics_formatter.format_failed_metrics(metrics_response.metrics) else: - metrics_collector.display_metrics(table, metrics) + metrics_formatter.format_metrics() - live.update(table) + live.update(metrics_formatter.table) time.sleep(2) except click.ClickException as e: raise e diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 0a77e51a..6924f565 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -1,7 +1,7 @@ """Helper classes for the CLI.""" import os -from typing import Any, Optional, Union +from typing import Any, Union import click from rich.columns import Columns @@ -12,30 +12,22 @@ import vec_inf.client._utils as utils from vec_inf.cli._models import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY from vec_inf.client._config import ModelConfig -from vec_inf.client._helper import ( - ModelLauncher, - ModelRegistry, - ModelStatusMonitor, - PerformanceMetricsCollector, -) +from vec_inf.client._models import ModelInfo, StatusResponse -class CLIModelLauncher(ModelLauncher): - """CLI Helper class for handling inference server launch.""" +class LaunchResponseFormatter(): + """CLI 
Helper class for formatting LaunchResponse.""" - def __init__(self, model_name: str, kwargs: Optional[dict[str, Any]]): - super().__init__(model_name, kwargs) - - def _warn(self, message: str) -> None: - """Warn the user about a potential issue.""" - click.echo(click.style(f"Warning: {message}", fg="yellow"), err=True) + def __init__(self, model_name: str, params: dict[str, Any]): + self.model_name = model_name + self.params = params def format_table_output(self) -> Table: """Format output as rich Table.""" table = utils.create_table(key_title="Job Config", value_title="Value") # Add key information with consistent styling - table.add_row("Slurm Job ID", self.slurm_job_id, style="blue") + table.add_row("Slurm Job ID", self.params["slurm_job_id"], style="blue") table.add_row("Job Name", self.model_name) # Add model details @@ -76,128 +68,140 @@ def format_table_output(self) -> Table: return table -class CLIModelStatusMonitor(ModelStatusMonitor): - """CLI Helper class for handling server status information and monitoring.""" +class StatusResponseFormatter(): + """CLI Helper class for formatting StatusResponse.""" - def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): - super().__init__(slurm_job_id, log_dir) + def __init__(self, status_info: StatusResponse): + self.status_info = status_info def output_json(self) -> None: """Format and output JSON data.""" json_data = { - "model_name": self.status_info["model_name"], - "model_status": self.status_info["status"], - "base_url": self.status_info["base_url"], + "model_name": self.status_info.model_name, + "model_status": self.status_info.server_status, + "base_url": self.status_info.base_url, } - if self.status_info["pending_reason"]: - json_data["pending_reason"] = self.status_info["pending_reason"] - if self.status_info["failed_reason"]: - json_data["failed_reason"] = self.status_info["failed_reason"] + if self.status_info.pending_reason: + json_data["pending_reason"] = self.status_info.pending_reason + if self.status_info.failed_reason: + json_data["failed_reason"] = self.status_info.failed_reason click.echo(json_data) - def output_table(self, console: Console) -> None: + def output_table(self) -> Table: """Create and display rich table.""" table = utils.create_table(key_title="Job Status", value_title="Value") - table.add_row("Model Name", self.status_info["model_name"]) - table.add_row("Model Status", self.status_info["status"], style="blue") + table.add_row("Model Name", self.status_info.model_name) + table.add_row("Model Status", self.status_info.server_status, style="blue") + + if self.status_info.pending_reason: + table.add_row("Pending Reason", self.status_info.pending_reason) + if self.status_info.failed_reason: + table.add_row("Failed Reason", self.status_info.failed_reason) + + table.add_row("Base URL", self.status_info.base_url) + return table - if self.status_info["pending_reason"]: - table.add_row("Pending Reason", self.status_info["pending_reason"]) - if self.status_info["failed_reason"]: - table.add_row("Failed Reason", self.status_info["failed_reason"]) - table.add_row("Base URL", self.status_info["base_url"]) - console.print(table) +class MetricsResponseFormatter(): + """CLI Helper class for formatting MetricsResponse.""" + def __init__(self, metrics: dict[str, float]): + self.metrics = metrics + self.table = utils.create_table("Metric", "Value") + self.enabled_prefix_caching = self._check_prefix_caching() -class CLIMetricsCollector(PerformanceMetricsCollector): - """CLI Helper class for streaming metrics 
information.""" + def _check_prefix_caching(self) -> bool: + """Check if prefix caching is enabled by looking for prefix cache metrics.""" + if isinstance(self.metrics, str): + # If metrics is a string, it's an error message + return False - def __init__(self, slurm_job_id: int, log_dir: Optional[str] = None): - super().__init__(slurm_job_id, log_dir) + cache_rate = self.metrics.get("gpu_prefix_cache_hit_rate") + return cache_rate is not None - def display_failed_metrics(self, table: Table, metrics: str) -> None: - table.add_row("Server State", self.status_info["state"], style="yellow") - table.add_row("Message", metrics) + def format_failed_metrics(self, message: str) -> None: + self.table.add_row("ERROR", message) - def display_metrics(self, table: Table, metrics: dict[str, float]) -> None: + def format_metrics(self) -> None: # Throughput metrics - table.add_row( + self.table.add_row( "Prompt Throughput", - f"{metrics.get('prompt_tokens_per_sec', 0):.1f} tokens/s", + f"{self.metrics.get('prompt_tokens_per_sec', 0):.1f} tokens/s", ) - table.add_row( + self.table.add_row( "Generation Throughput", - f"{metrics.get('generation_tokens_per_sec', 0):.1f} tokens/s", + f"{self.metrics.get('generation_tokens_per_sec', 0):.1f} tokens/s", ) # Request queue metrics - table.add_row( + self.table.add_row( "Requests Running", - f"{metrics.get('requests_running', 0):.0f} reqs", + f"{self.metrics.get('requests_running', 0):.0f} reqs", ) - table.add_row( + self.table.add_row( "Requests Waiting", - f"{metrics.get('requests_waiting', 0):.0f} reqs", + f"{self.metrics.get('requests_waiting', 0):.0f} reqs", ) - table.add_row( + self.table.add_row( "Requests Swapped", - f"{metrics.get('requests_swapped', 0):.0f} reqs", + f"{self.metrics.get('requests_swapped', 0):.0f} reqs", ) # Cache usage metrics - table.add_row( + self.table.add_row( "GPU Cache Usage", - f"{metrics.get('gpu_cache_usage', 0) * 100:.1f}%", + f"{self.metrics.get('gpu_cache_usage', 0) * 100:.1f}%", ) - table.add_row( + self.table.add_row( "CPU Cache Usage", - f"{metrics.get('cpu_cache_usage', 0) * 100:.1f}%", + f"{self.metrics.get('cpu_cache_usage', 0) * 100:.1f}%", ) if self.enabled_prefix_caching: - table.add_row( + self.table.add_row( "GPU Prefix Cache Hit Rate", - f"{metrics.get('gpu_prefix_cache_hit_rate', 0) * 100:.1f}%", + f"{self.metrics.get('gpu_prefix_cache_hit_rate', 0) * 100:.1f}%", ) - table.add_row( + self.table.add_row( "CPU Prefix Cache Hit Rate", - f"{metrics.get('cpu_prefix_cache_hit_rate', 0) * 100:.1f}%", + f"{self.metrics.get('cpu_prefix_cache_hit_rate', 0) * 100:.1f}%", ) # Show average latency if available - if "avg_request_latency" in metrics: - table.add_row( + if "avg_request_latency" in self.metrics: + self.table.add_row( "Avg Request Latency", - f"{metrics['avg_request_latency']:.1f} s", + f"{self.metrics['avg_request_latency']:.1f} s", ) # Token counts - table.add_row( + self.table.add_row( "Total Prompt Tokens", - f"{metrics.get('total_prompt_tokens', 0):.0f} tokens", + f"{self.metrics.get('total_prompt_tokens', 0):.0f} tokens", ) - table.add_row( + self.table.add_row( "Total Generation Tokens", - f"{metrics.get('total_generation_tokens', 0):.0f} tokens", + f"{self.metrics.get('total_generation_tokens', 0):.0f} tokens", ) - table.add_row( + self.table.add_row( "Successful Requests", - f"{metrics.get('successful_requests_total', 0):.0f} reqs", + f"{self.metrics.get('successful_requests_total', 0):.0f} reqs", ) -class CLIModelRegistry(ModelRegistry): - """CLI Helper class for handling model listing functionality.""" 
+class ListCmdDisplay(): + """CLI Helper class for displaying model listing functionality.""" - def __init__(self, json_mode: bool = False): - super().__init__() + def __init__(self, console: Console, json_mode: bool = False): + self.console = console self.json_mode = json_mode + self.model_config = None + self.model_names: list[str] = [] - def format_single_model_output( + def _format_single_model_output( self, config: ModelConfig ) -> Union[dict[str, Any], Table]: - """Format output for a single model.""" + """Format output table for a single model.""" if self.json_mode: # Exclude non-essential fields from JSON output excluded = {"venv", "log_dir"} @@ -214,51 +218,40 @@ def format_single_model_output( table.add_row(field, str(value)) return table - def format_all_models_output(self) -> Union[list[str], list[Panel]]: - """Format output for all models.""" - if self.json_mode: - return [config.model_name for config in self.model_configs] - + def _format_all_models_output(self, model_infos: list[ModelInfo]) -> Union[list[str], list[Panel]]: + """Format output table for all models.""" # Sort by model type priority - sorted_configs = sorted( - self.model_configs, - key=lambda x: MODEL_TYPE_PRIORITY.get(x.model_type, 4), + sorted_model_infos = sorted( + model_infos, + key=lambda x: MODEL_TYPE_PRIORITY.get(x.type, 4), ) # Create panels with color coding panels = [] - for config in sorted_configs: - color = MODEL_TYPE_COLORS.get(config.model_type, "white") - variant = config.model_variant or "" - display_text = f"[magenta]{config.model_family}[/magenta]" + for model_info in sorted_model_infos: + color = MODEL_TYPE_COLORS.get(model_info.type, "white") + variant = model_info.variant or "" + display_text = f"[magenta]{model_info.family}[/magenta]" if variant: display_text += f"-{variant}" panels.append(Panel(display_text, expand=True, border_style=color)) return panels - def process_list_command( - self, console: Console, model_name: Optional[str] = None - ) -> None: - """Process the list command and display output.""" - try: - if model_name: - # Handle single model case - config = self.get_single_model_config(model_name) - output = self.format_single_model_output(config) - if self.json_mode: - click.echo(output) - else: - console.print(output) - # Handle all models case - elif self.json_mode: - # JSON output for all models is just a list of names - model_names = [config.model_name for config in self.model_configs] - click.echo(model_names) - else: - # Rich output for all models is a list of panels - panels = self.format_all_models_output() - if isinstance(panels, list): # This helps mypy understand the type - console.print(Columns(panels, equal=True)) - except Exception as e: - raise click.ClickException(str(e)) from e + def display_single_model_output(self, config: ModelConfig) -> None: + """Display the output for a single model.""" + output = self._format_single_model_output(config) + if self.json_mode: + click.echo(output) + else: + self.console.print(output) + + def display_all_models_output(self, model_infos: list[ModelInfo]) -> None: + """Display the output for all models.""" + if self.json_mode: + model_names = [info.name for info in model_infos] + click.echo(model_names) + else: + panels = self._format_all_models_output(model_infos) + self.console.print(Columns(panels, equal=True)) + From 35b96dcf14b451e069dc6c8c046c296572304656 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Apr 2025 19:40:25 +0000 Subject: [PATCH 
49/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/vec_inf/cli/test_cli.py | 21 +++++---------------- vec_inf/cli/_cli.py | 4 +++- vec_inf/cli/_helper.py | 13 +++++++------ vec_inf/client/_helper.py | 8 ++++++-- vec_inf/client/_utils.py | 5 +++-- vec_inf/client/api.py | 1 - 6 files changed, 24 insertions(+), 28 deletions(-) diff --git a/tests/vec_inf/cli/test_cli.py b/tests/vec_inf/cli/test_cli.py index 4b19f9db..e249c636 100644 --- a/tests/vec_inf/cli/test_cli.py +++ b/tests/vec_inf/cli/test_cli.py @@ -330,9 +330,7 @@ def test_launch_command_no_model_weights_parent_dir(runner, debug_helper, base_p stack.enter_context(patch_obj) # Mock load_config to return empty list - stack.enter_context( - patch("vec_inf.client._utils.load_config", return_value=[]) - ) + stack.enter_context(patch("vec_inf.client._utils.load_config", return_value=[])) result = runner.invoke(cli, ["launch", "test-model"]) debug_helper.print_debug_info(result) @@ -376,9 +374,7 @@ def test_launch_command_model_not_in_config_with_weights( ) -def test_launch_command_model_not_found( - runner, debug_helper, test_paths, base_patches -): +def test_launch_command_model_not_found(runner, debug_helper, test_paths, base_patches): """Test handling of a model that's neither in config nor has weights.""" def custom_path_exists(p): @@ -410,8 +406,7 @@ def custom_path_exists(p): assert result.exit_code == 1 assert ( "'unknown-model' not found in configuration and model weights " - "not found at expected path '/model-weights/unknown-model'" - in result.output + "not found at expected path '/model-weights/unknown-model'" in result.output ) @@ -450,10 +445,7 @@ def test_metrics_command_pending_server( assert result.exit_code == 0 assert "ERROR" in result.output - assert ( - "Pending resources for server initialization" - in result.output - ) + assert "Pending resources for server initialization" in result.output def test_metrics_command_server_not_ready( @@ -473,10 +465,7 @@ def test_metrics_command_server_not_ready( assert result.exit_code == 0 assert "ERROR" in result.output - assert ( - "Server not ready" - in result.output - ) + assert "Server not ready" in result.output @patch("requests.get") diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index e9f52947..2b50e563 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -134,7 +134,9 @@ def launch( """Launch a model on the cluster.""" try: # Convert cli_kwargs to LaunchOptions - launch_options = LaunchOptions(**{k: v for k, v in cli_kwargs.items() if k != "json_mode"}) + launch_options = LaunchOptions( + **{k: v for k, v in cli_kwargs.items() if k != "json_mode"} + ) # Start the client and launch model inference server client = VecInfClient() diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index 6924f565..be1f30c7 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -15,7 +15,7 @@ from vec_inf.client._models import ModelInfo, StatusResponse -class LaunchResponseFormatter(): +class LaunchResponseFormatter: """CLI Helper class for formatting LaunchResponse.""" def __init__(self, model_name: str, params: dict[str, Any]): @@ -68,7 +68,7 @@ def format_table_output(self) -> Table: return table -class StatusResponseFormatter(): +class StatusResponseFormatter: """CLI Helper class for formatting StatusResponse.""" def __init__(self, status_info: StatusResponse): @@ -102,7 +102,7 @@ def output_table(self) -> Table: return table -class MetricsResponseFormatter(): +class 
MetricsResponseFormatter: """CLI Helper class for formatting MetricsResponse.""" def __init__(self, metrics: dict[str, float]): @@ -189,7 +189,7 @@ def format_metrics(self) -> None: ) -class ListCmdDisplay(): +class ListCmdDisplay: """CLI Helper class for displaying model listing functionality.""" def __init__(self, console: Console, json_mode: bool = False): @@ -218,7 +218,9 @@ def _format_single_model_output( table.add_row(field, str(value)) return table - def _format_all_models_output(self, model_infos: list[ModelInfo]) -> Union[list[str], list[Panel]]: + def _format_all_models_output( + self, model_infos: list[ModelInfo] + ) -> Union[list[str], list[Panel]]: """Format output table for all models.""" # Sort by model type priority sorted_model_infos = sorted( @@ -254,4 +256,3 @@ def display_all_models_output(self, model_infos: list[ModelInfo]) -> None: else: panels = self._format_all_models_output(model_infos) self.console.print(Columns(panels, equal=True)) - diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index f4d91353..83683516 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -74,7 +74,9 @@ def _get_model_configuration(self) -> ModelConfig: ) if not model_weights_parent_dir: - raise ModelNotFoundError("Could not determine model weights parent directory") + raise ModelNotFoundError( + "Could not determine model weights parent directory" + ) model_weights_path = Path(model_weights_parent_dir, self.model_name) @@ -284,7 +286,9 @@ def _process_running_state(self) -> None: ) if isinstance(server_status, tuple): - self.status_info.server_status, self.status_info.failed_reason = server_status + self.status_info.server_status, self.status_info.failed_reason = ( + server_status + ) return if server_status == "RUNNING": diff --git a/vec_inf/client/_utils.py b/vec_inf/client/_utils.py index c2e3ef6b..14b858df 100644 --- a/vec_inf/client/_utils.py +++ b/vec_inf/client/_utils.py @@ -153,8 +153,9 @@ def load_config() -> list[ModelConfig]: config.setdefault("models", {})[name] = data else: warnings.warn( - f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}", UserWarning, - stacklevel=2 + f"WARNING: Could not find user config: {user_path}, revert to default config located at {default_path}", + UserWarning, + stacklevel=2, ) return [ diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index 4a70b9f3..97adb2a4 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ -182,7 +182,6 @@ def shutdown_model(self, slurm_job_id: int) -> bool: raise SlurmJobError(f"Failed to shutdown model: {stderr}") return True - def wait_until_ready( self, slurm_job_id: int, From 5ef4e1ff643daa5a4fc719f6344dd23fb8336a45 Mon Sep 17 00:00:00 2001 From: XkunW Date: Wed, 9 Apr 2025 16:14:28 -0400 Subject: [PATCH 50/52] mypy fixes --- examples/api/basic_usage.py | 2 +- vec_inf/cli/_cli.py | 20 ++++++++++++-------- vec_inf/cli/_helper.py | 15 +++++++-------- vec_inf/client/_helper.py | 2 +- vec_inf/client/api.py | 3 ++- 5 files changed, 23 insertions(+), 19 deletions(-) diff --git a/examples/api/basic_usage.py b/examples/api/basic_usage.py index c027065f..2c01a3be 100755 --- a/examples/api/basic_usage.py +++ b/examples/api/basic_usage.py @@ -33,7 +33,7 @@ # Get metrics print("\nRetrieving metrics...") metrics = client.get_metrics(job_id) -if metrics.metrics: +if isinstance(metrics.metrics, dict): for key, value in metrics.metrics.items(): print(f"- {key}: {value}") diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py 
index 2b50e563..bb6bf3e9 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -7,14 +7,13 @@ from rich.console import Console from rich.live import Live -import vec_inf.client._utils as utils from vec_inf.cli._helper import ( LaunchResponseFormatter, ListCmdDisplay, MetricsResponseFormatter, StatusResponseFormatter, ) -from vec_inf.client._models import LaunchOptions +from vec_inf.client._models import LaunchOptions, LaunchOptionsDict from vec_inf.client.api import VecInfClient @@ -129,14 +128,15 @@ def cli() -> None: ) def launch( model_name: str, - **cli_kwargs: Optional[Union[str, int, bool]], + **cli_kwargs: Optional[Union[str, int, float, bool]], ) -> None: """Launch a model on the cluster.""" try: # Convert cli_kwargs to LaunchOptions - launch_options = LaunchOptions( - **{k: v for k, v in cli_kwargs.items() if k != "json_mode"} - ) + kwargs = {k: v for k, v in cli_kwargs.items() if k != "json_mode"} + # Cast the dictionary to LaunchOptionsDict + options_dict: LaunchOptionsDict = kwargs # type: ignore + launch_options = LaunchOptions(**options_dict) # Start the client and launch model inference server client = VecInfClient() @@ -194,8 +194,12 @@ def status( @click.argument("slurm_job_id", type=int, nargs=1) def shutdown(slurm_job_id: int) -> None: """Shutdown a running model on the cluster.""" - utils.shutdown_model(slurm_job_id) - click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}") + try: + client = VecInfClient() + client.shutdown_model(slurm_job_id) + click.echo(f"Shutting down model with Slurm Job ID: {slurm_job_id}") + except Exception as e: + raise click.ClickException(f"Shutdown failed: {str(e)}") from e @cli.command("list") diff --git a/vec_inf/cli/_helper.py b/vec_inf/cli/_helper.py index be1f30c7..95667725 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -105,19 +105,18 @@ def output_table(self) -> Table: class MetricsResponseFormatter: """CLI Helper class for formatting MetricsResponse.""" - def __init__(self, metrics: dict[str, float]): - self.metrics = metrics + def __init__(self, metrics: Union[dict[str, float], str]): + self.metrics = self._set_metrics(metrics) self.table = utils.create_table("Metric", "Value") self.enabled_prefix_caching = self._check_prefix_caching() + def _set_metrics(self, metrics: Union[dict[str, float], str]) -> dict[str, float]: + """Set the metrics attribute.""" + return metrics if isinstance(metrics, dict) else {} + def _check_prefix_caching(self) -> bool: """Check if prefix caching is enabled by looking for prefix cache metrics.""" - if isinstance(self.metrics, str): - # If metrics is a string, it's an error message - return False - - cache_rate = self.metrics.get("gpu_prefix_cache_hit_rate") - return cache_rate is not None + return self.metrics.get("gpu_prefix_cache_hit_rate") is not None def format_failed_metrics(self, message: str) -> None: self.table.add_row("ERROR", message) diff --git a/vec_inf/client/_helper.py b/vec_inf/client/_helper.py index 83683516..d5b9481a 100644 --- a/vec_inf/client/_helper.py +++ b/vec_inf/client/_helper.py @@ -294,7 +294,7 @@ def _process_running_state(self) -> None: if server_status == "RUNNING": self._check_model_health() else: - self.status_info.server_status = server_status + self.status_info.server_status = cast(ModelStatus, server_status) def _process_pending_state(self) -> None: """Process PENDING job state.""" diff --git a/vec_inf/client/api.py b/vec_inf/client/api.py index 97adb2a4..88020d88 100644 --- a/vec_inf/client/api.py +++ b/vec_inf/client/api.py @@ 
-5,7 +5,7 @@ """ import time -from typing import Any, Optional +from typing import Any, Optional, Union from vec_inf.client._config import ModelConfig from vec_inf.client._exceptions import ( @@ -147,6 +147,7 @@ def get_metrics( slurm_job_id, log_dir ) + metrics: Union[dict[str, float], str] if not performance_metrics_collector.metrics_url.startswith("http"): metrics = performance_metrics_collector.metrics_url else: From d48afb13cca3c033f9c36c8763a8a283bebc3eb8 Mon Sep 17 00:00:00 2001 From: Marshall Wang Date: Wed, 9 Apr 2025 23:45:11 -0400 Subject: [PATCH 51/52] Remove private imports from client, move util function only used by CLI to cli/_utils.py --- tests/vec_inf/cli/test_utils.py | 17 +++++++++++++++++ tests/vec_inf/client/test_utils.py | 17 +---------------- vec_inf/cli/_cli.py | 3 +-- vec_inf/cli/_helper.py | 13 ++++++------- vec_inf/cli/_utils.py | 13 +++++++++++++ vec_inf/client/__init__.py | 2 ++ vec_inf/client/_utils.py | 11 ----------- 7 files changed, 40 insertions(+), 36 deletions(-) create mode 100644 tests/vec_inf/cli/test_utils.py create mode 100644 vec_inf/cli/_utils.py diff --git a/tests/vec_inf/cli/test_utils.py b/tests/vec_inf/cli/test_utils.py new file mode 100644 index 00000000..00149f73 --- /dev/null +++ b/tests/vec_inf/cli/test_utils.py @@ -0,0 +1,17 @@ +"""Tests for the utils functions in the vec-inf cli.""" + +from vec_inf.cli._utils import create_table + + +def test_create_table_with_header(): + """Test that create_table creates a table with the correct header.""" + table = create_table("Key", "Value") + assert table.columns[0].header == "Key" + assert table.columns[1].header == "Value" + assert table.show_header is True + + +def test_create_table_without_header(): + """Test create_table without header.""" + table = create_table(show_header=False) + assert table.show_header is False \ No newline at end of file diff --git a/tests/vec_inf/client/test_utils.py b/tests/vec_inf/client/test_utils.py index 45f4df3e..23cc631c 100644 --- a/tests/vec_inf/client/test_utils.py +++ b/tests/vec_inf/client/test_utils.py @@ -1,4 +1,4 @@ -"""Tests for the utility functions in the CLI module.""" +"""Tests for the utility functions in the vec-inf client.""" import os from unittest.mock import MagicMock, patch @@ -8,7 +8,6 @@ from vec_inf.client._utils import ( MODEL_READY_SIGNATURE, - create_table, get_base_url, is_server_running, load_config, @@ -134,20 +133,6 @@ def test_model_health_check_request_exception(): assert result == ("FAILED", "Connection error") -def test_create_table_with_header(): - """Test that create_table creates a table with the correct header.""" - table = create_table("Key", "Value") - assert table.columns[0].header == "Key" - assert table.columns[1].header == "Value" - assert table.show_header is True - - -def test_create_table_without_header(): - """Test create_table without header.""" - table = create_table(show_header=False) - assert table.show_header is False - - def test_load_config_default_only(): """Test loading the actual default configuration file from the filesystem.""" configs = load_config() diff --git a/vec_inf/cli/_cli.py b/vec_inf/cli/_cli.py index bb6bf3e9..cf220355 100644 --- a/vec_inf/cli/_cli.py +++ b/vec_inf/cli/_cli.py @@ -13,8 +13,7 @@ MetricsResponseFormatter, StatusResponseFormatter, ) -from vec_inf.client._models import LaunchOptions, LaunchOptionsDict -from vec_inf.client.api import VecInfClient +from vec_inf.client import LaunchOptions, LaunchOptionsDict, VecInfClient CONSOLE = Console() diff --git a/vec_inf/cli/_helper.py 
b/vec_inf/cli/_helper.py index 95667725..9d2872d2 100644 --- a/vec_inf/cli/_helper.py +++ b/vec_inf/cli/_helper.py @@ -9,10 +9,9 @@ from rich.panel import Panel from rich.table import Table -import vec_inf.client._utils as utils from vec_inf.cli._models import MODEL_TYPE_COLORS, MODEL_TYPE_PRIORITY -from vec_inf.client._config import ModelConfig -from vec_inf.client._models import ModelInfo, StatusResponse +from vec_inf.cli._utils import create_table +from vec_inf.client import ModelConfig, ModelInfo, StatusResponse class LaunchResponseFormatter: @@ -24,7 +23,7 @@ def __init__(self, model_name: str, params: dict[str, Any]): def format_table_output(self) -> Table: """Format output as rich Table.""" - table = utils.create_table(key_title="Job Config", value_title="Value") + table = create_table(key_title="Job Config", value_title="Value") # Add key information with consistent styling table.add_row("Slurm Job ID", self.params["slurm_job_id"], style="blue") @@ -89,7 +88,7 @@ def output_json(self) -> None: def output_table(self) -> Table: """Create and display rich table.""" - table = utils.create_table(key_title="Job Status", value_title="Value") + table = create_table(key_title="Job Status", value_title="Value") table.add_row("Model Name", self.status_info.model_name) table.add_row("Model Status", self.status_info.server_status, style="blue") @@ -107,7 +106,7 @@ class MetricsResponseFormatter: def __init__(self, metrics: Union[dict[str, float], str]): self.metrics = self._set_metrics(metrics) - self.table = utils.create_table("Metric", "Value") + self.table = create_table("Metric", "Value") self.enabled_prefix_caching = self._check_prefix_caching() def _set_metrics(self, metrics: Union[dict[str, float], str]) -> dict[str, float]: @@ -211,7 +210,7 @@ def _format_single_model_output( ) return config_dict - table = utils.create_table(key_title="Model Config", value_title="Value") + table = create_table(key_title="Model Config", value_title="Value") for field, value in config.model_dump().items(): if field not in {"venv", "log_dir"}: table.add_row(field, str(value)) diff --git a/vec_inf/cli/_utils.py b/vec_inf/cli/_utils.py new file mode 100644 index 00000000..33ad63d1 --- /dev/null +++ b/vec_inf/cli/_utils.py @@ -0,0 +1,13 @@ +"""Helper functions for the CLI.""" + +from rich.table import Table + + +def create_table( + key_title: str = "", value_title: str = "", show_header: bool = True +) -> Table: + """Create a table for displaying model status.""" + table = Table(show_header=show_header, header_style="bold magenta") + table.add_column(key_title, style="dim") + table.add_column(value_title) + return table diff --git a/vec_inf/client/__init__.py b/vec_inf/client/__init__.py index f4b5f864..e6b45824 100644 --- a/vec_inf/client/__init__.py +++ b/vec_inf/client/__init__.py @@ -5,6 +5,7 @@ users direct control over the lifecycle of inference servers via python scripts. 
""" +from vec_inf.client._config import ModelConfig from vec_inf.client._models import ( LaunchOptions, LaunchOptionsDict, @@ -28,4 +29,5 @@ "ModelType", "LaunchOptions", "LaunchOptionsDict", + "ModelConfig", ] diff --git a/vec_inf/client/_utils.py b/vec_inf/client/_utils.py index 14b858df..25882e0f 100644 --- a/vec_inf/client/_utils.py +++ b/vec_inf/client/_utils.py @@ -9,7 +9,6 @@ import requests import yaml -from rich.table import Table from vec_inf.client._config import ModelConfig from vec_inf.client._models import ModelStatus @@ -118,16 +117,6 @@ def model_health_check( return (ModelStatus.FAILED, str(e)) -def create_table( - key_title: str = "", value_title: str = "", show_header: bool = True -) -> Table: - """Create a table for displaying model status.""" - table = Table(show_header=show_header, header_style="bold magenta") - table.add_column(key_title, style="dim") - table.add_column(value_title) - return table - - def load_config() -> list[ModelConfig]: """Load the model configuration.""" default_path = ( From 1e9f8d777ff8e367eae97468a836c69bbfaddeda Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Apr 2025 03:45:28 +0000 Subject: [PATCH 52/52] [pre-commit.ci] Add auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/vec_inf/cli/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/vec_inf/cli/test_utils.py b/tests/vec_inf/cli/test_utils.py index 00149f73..c0a460cb 100644 --- a/tests/vec_inf/cli/test_utils.py +++ b/tests/vec_inf/cli/test_utils.py @@ -14,4 +14,4 @@ def test_create_table_with_header(): def test_create_table_without_header(): """Test create_table without header.""" table = create_table(show_header=False) - assert table.show_header is False \ No newline at end of file + assert table.show_header is False