diff --git a/tools/server/tests/conftest.py b/tools/server/tests/conftest.py index 017d1bb841efd..6462290f499a5 100644 --- a/tools/server/tests/conftest.py +++ b/tools/server/tests/conftest.py @@ -2,14 +2,92 @@ from utils import * -# ref: https://stackoverflow.com/questions/22627659/run-code-before-and-after-each-test-in-py-test @pytest.fixture(autouse=True) def stop_server_after_each_test(): - # do nothing before each test yield - # stop all servers after each test instances = set( server_instances - ) # copy the set to prevent 'Set changed size during iteration' + ) for server in instances: server.stop() + + +@pytest.fixture +def pipeline_process(): + """ + Fixture providing a PipelineTestProcess instance for E2E testing. + Automatically cleaned up after test completion. + """ + process = PipelineTestProcess() + yield process + if process.process is not None: + process.stop() + + +@pytest.fixture +def e2e_small_model_config(): + """ + Fixture providing configuration for a small model suitable for E2E testing. + Uses tinyllama for fast execution in CI environments. + """ + return { + "model_hf_repo": "ggml-org/models", + "model_hf_file": "tinyllamas/stories260K.gguf", + "model_alias": "tinyllama-e2e", + "n_ctx": 512, + "n_batch": 32, + "n_slots": 2, + "n_predict": 32, + "seed": 42, + "temperature": 0.8, + } + + +@pytest.fixture +def e2e_embedding_model_config(): + """ + Fixture providing configuration for embedding model E2E testing. + """ + return { + "model_hf_repo": "ggml-org/models", + "model_hf_file": "bert-bge-small/ggml-model-f16.gguf", + "model_alias": "bert-e2e", + "n_ctx": 512, + "n_batch": 128, + "n_ubatch": 128, + "n_slots": 2, + "seed": 42, + "server_embeddings": True, + } + + +@pytest.fixture +def e2e_multimodal_model_config(): + """ + Fixture providing configuration for multimodal model E2E testing. + """ + return { + "model_hf_repo": "ggml-org/tinygemma3-GGUF", + "model_hf_file": "tinygemma3-Q8_0.gguf", + "mmproj_url": "https://huggingface.co/ggml-org/tinygemma3-GGUF/resolve/main/mmproj-tinygemma3.gguf", + "model_alias": "tinygemma3-e2e", + "n_ctx": 1024, + "n_batch": 32, + "n_slots": 2, + "n_predict": 16, + "seed": 42, + } + + +@pytest.fixture +def concurrent_test_prompts(): + """ + Fixture providing a list of prompts for concurrent testing scenarios. + """ + return [ + "Once upon a time", + "In a distant land", + "There was a brave knight", + "The dragon soared", + "Magic filled the air", + ] diff --git a/tools/server/tests/e2e/README.md b/tools/server/tests/e2e/README.md new file mode 100644 index 0000000000000..e46b62a018ca1 --- /dev/null +++ b/tools/server/tests/e2e/README.md @@ -0,0 +1,273 @@ +# End-to-End Test Suite + +This directory contains comprehensive end-to-end (E2E) tests for llama.cpp, extending beyond unit-focused API testing to validate complete user workflows and component integration. + +## Overview + +The E2E test suite provides comprehensive coverage of: + +1. **Pipeline Workflows** - Complete model download, loading, and inference workflows +2. **Tool Integration** - CLI tool testing (llama-cli, llama-bench) +3. **Multimodal Workflows** - Vision + text processing coordination +4. 
**Concurrent Scenarios** - Multi-user simulation and parallel request handling + +## Test Files + +### test_pipeline_workflows.py + +Tests complete pipeline workflows from model acquisition to inference: + +- **Model Download & Loading**: Validates HuggingFace model download and loading +- **State Transitions**: Tracks server state progression (INITIAL → LOADING_MODEL → READY → GENERATING) +- **Context Management**: Tests extended inference sessions with context preservation +- **KV Cache Behavior**: Validates cache utilization during workflows +- **Streaming Pipeline**: Tests streaming inference through complete pipeline +- **Embedding Models**: Validates embedding model pipelines + +**Example:** +```bash +./tests.sh e2e/test_pipeline_workflows.py::test_basic_pipeline_workflow +``` + +### test_tool_integration.py + +Tests CLI tool integration and coordination: + +- **llama-cli Execution**: Basic and advanced CLI usage patterns +- **llama-bench Testing**: Performance benchmark execution +- **Embedding Generation**: CLI-based embedding workflows +- **Parameter Validation**: Error handling and validation +- **Server/CLI Coordination**: Resource sharing between tools + +**Example:** +```bash +./tests.sh e2e/test_tool_integration.py::test_cli_basic_execution +``` + +### test_multimodal_workflows.py + +Tests multimodal (vision + text) processing: + +- **Model Loading**: Multimodal model initialization with vision projection +- **Image Processing**: Image input handling with text completion +- **Context Preservation**: Cross-modal context management +- **Sequential Requests**: Mixed text-only and multimodal requests +- **Streaming**: Multimodal streaming responses +- **Error Handling**: Invalid input handling + +**Example:** +```bash +./tests.sh e2e/test_multimodal_workflows.py::test_multimodal_chat_with_image +``` + +### test_concurrent_scenarios.py + +Tests concurrent request handling and real-world scenarios: + +- **Concurrent Requests**: Multiple simultaneous completion/chat requests +- **Multi-turn Conversations**: Context preservation across conversation turns +- **Slot Management**: Request queuing and slot allocation under load +- **Streaming Concurrency**: Multiple streaming sessions +- **LoRA Switching**: Adapter loading/switching during active sessions +- **Mixed Workloads**: Different request types running concurrently + +**Example:** +```bash +./tests.sh e2e/test_concurrent_scenarios.py::test_concurrent_completion_requests +``` + +## Framework Extensions + +### PipelineTestProcess Class + +The `PipelineTestProcess` class extends `ServerProcess` with E2E testing capabilities: + +```python +from utils import PipelineTestProcess + +# Create pipeline test instance +pipeline = PipelineTestProcess() + +# Test complete pipeline workflow +results = pipeline.test_full_pipeline({ + "model_hf_repo": "ggml-org/models", + "model_hf_file": "tinyllamas/stories260K.gguf", + "n_ctx": 512, +}) + +# Run CLI commands +result = pipeline.run_cli_command(["-m", model_path, "-p", "Hello", "-n", "16"]) + +# Run benchmarks +bench_results = pipeline.run_bench_command(model_path, ["-p", "8", "-n", "8"]) +``` + +**Key Methods:** + +- `test_full_pipeline(model_config)` - Execute complete pipeline workflow +- `run_cli_command(args, input_text, timeout)` - Execute llama-cli +- `run_bench_command(model_path, args, timeout)` - Execute llama-bench +- `test_context_management(prompts, max_context)` - Test context handling +- `validate_kv_cache_behavior(context_size, tokens)` - Validate cache usage + +### Test Fixtures 
+ +New pytest fixtures in `conftest.py`: + +- **`pipeline_process`** - PipelineTestProcess instance with automatic cleanup +- **`e2e_small_model_config`** - Small model config for fast E2E tests +- **`e2e_embedding_model_config`** - Embedding model configuration +- **`e2e_multimodal_model_config`** - Multimodal model configuration +- **`concurrent_test_prompts`** - Prompts for concurrent testing + +## Running E2E Tests + +### Run All E2E Tests + +```bash +./tests.sh e2e/ +``` + +### Run Specific Test File + +```bash +./tests.sh e2e/test_pipeline_workflows.py +``` + +### Run Single Test + +```bash +./tests.sh e2e/test_pipeline_workflows.py::test_basic_pipeline_workflow +``` + +### Run with Verbose Output + +```bash +DEBUG=1 ./tests.sh e2e/ -s -v +``` + +### Run Slow Tests + +Some tests are marked as slow and require the `SLOW_TESTS` environment variable: + +```bash +SLOW_TESTS=1 ./tests.sh e2e/ +``` + +## Configuration + +### Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `LLAMA_CLI_BIN_PATH` | Path to llama-cli binary | `../../../build/bin/llama-cli` | +| `LLAMA_BENCH_BIN_PATH` | Path to llama-bench binary | `../../../build/bin/llama-bench` | +| `LLAMA_CACHE` | Model cache directory | `tmp` | +| `SLOW_TESTS` | Enable slow tests | `0` | +| `DEBUG` | Enable verbose output | `0` | + +### Model Selection + +E2E tests use smaller models for CI compatibility: + +- **Text Generation**: tinyllama (stories260K.gguf) - Fast, small footprint +- **Embeddings**: bert-bge-small - Efficient embedding generation +- **Multimodal**: tinygemma3 - Compact vision+text model + +For local testing with larger models, modify the fixture configurations in `conftest.py`. + +## Writing New E2E Tests + +### Example Test Structure + +```python +def test_my_e2e_workflow(pipeline_process, e2e_small_model_config): + """ + Test description here. + + Validates: + - Point 1 + - Point 2 + """ + # Configure pipeline + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + # Start server + pipeline_process.start() + + # Test workflow + res = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Test", + "n_predict": 8, + }) + + # Assertions + assert res.status_code == 200 + assert "content" in res.body +``` + +### Best Practices + +1. **Use Fixtures**: Leverage existing fixtures for model configs and test data +2. **Small Models**: Use small models for fast execution in CI +3. **Resource Cleanup**: Fixtures handle cleanup automatically +4. **Test Isolation**: Each test should be independent +5. **Descriptive Names**: Use clear, descriptive test names +6. **Documentation**: Include docstrings explaining what is validated +7. **Slow Tests**: Mark expensive tests with `@pytest.mark.skipif(not is_slow_test_allowed())` + +## CI Integration + +E2E tests are designed to run in CI environments with: + +- 4 vCPU GitHub runners +- Limited memory footprint +- Fast model downloads from HuggingFace +- Reasonable timeout configurations + +Tests automatically skip slow scenarios unless `SLOW_TESTS=1` is set. 
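+
+For example, a full local or CI run that opts in to the slow scenarios, with the documented environment variables set explicitly (the paths below are the defaults from the table above; adjust them to your build tree):
+
+```bash
+# Point the tests at the built binaries and a persistent model cache,
+# then run the whole E2E suite, including slow tests, with verbose output.
+export LLAMA_CLI_BIN_PATH=../../../build/bin/llama-cli
+export LLAMA_BENCH_BIN_PATH=../../../build/bin/llama-bench
+export LLAMA_CACHE=tmp
+SLOW_TESTS=1 DEBUG=1 ./tests.sh e2e/ -s -v
+```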
+ +## Troubleshooting + +### Tests Timeout + +- Increase timeout in test: `pipeline_process.start(timeout_seconds=120)` +- Use smaller models in CI +- Check network connectivity for model downloads + +### Model Download Issues + +- Set `LLAMA_CACHE` to a persistent directory +- Pre-download models before running tests +- Check HuggingFace availability + +### CLI Tool Not Found + +- Ensure binaries are built: `cmake --build build --target llama-cli llama-bench` +- Set `LLAMA_CLI_BIN_PATH` and `LLAMA_BENCH_BIN_PATH` +- Check binary permissions + +### Concurrent Test Failures + +- Increase `n_slots` for higher concurrency +- Adjust timing expectations for slower systems +- Enable `server_continuous_batching` for better scheduling + +## Contributing + +When adding new E2E tests: + +1. Place tests in appropriate file based on category +2. Use existing fixtures when possible +3. Add new fixtures to `conftest.py` if needed +4. Update this README with new test descriptions +5. Ensure tests pass in CI environment +6. Document special requirements or configurations + +## Related Documentation + +- [Main Test README](../README.md) - General testing documentation +- [Server Documentation](../../README.md) - llama-server documentation +- [Contributing Guide](../../../../CONTRIBUTING.md) - Project contribution guidelines diff --git a/tools/server/tests/e2e/__init__.py b/tools/server/tests/e2e/__init__.py new file mode 100644 index 0000000000000..3194e40467a89 --- /dev/null +++ b/tools/server/tests/e2e/__init__.py @@ -0,0 +1,9 @@ +""" +End-to-end test suite for llama.cpp server. + +This module provides comprehensive E2E testing covering: +- Complete pipeline workflows (download, conversion, loading, inference) +- Tool integration testing (llama-cli, llama-bench) +- Multimodal workflows (vision + text) +- Concurrent scenario simulation +""" diff --git a/tools/server/tests/e2e/test_concurrent_scenarios.py b/tools/server/tests/e2e/test_concurrent_scenarios.py new file mode 100644 index 0000000000000..c384c8e736739 --- /dev/null +++ b/tools/server/tests/e2e/test_concurrent_scenarios.py @@ -0,0 +1,471 @@ +""" +End-to-end tests for concurrent scenarios. + +Tests cover: +- Multi-turn conversation management with context preservation +- Concurrent user simulation and request queuing validation +- LoRA adapter loading and switching during active sessions +- Batch processing with multiple simultaneous users +- Request slot management under load conditions +""" + +import pytest +from utils import * + + +def test_concurrent_completion_requests(pipeline_process, e2e_small_model_config, concurrent_test_prompts): + """ + Test concurrent completion requests from multiple simulated users. 
+ + Validates: + - Server handles multiple simultaneous requests + - All requests complete successfully + - Responses are independent and correct + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 4 + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/completion", { + "prompt": prompt, + "n_predict": 16, + "temperature": 0.8, + }) + ) + for prompt in concurrent_test_prompts + ] + + results = parallel_function_calls(tasks) + + assert len(results) == len(concurrent_test_prompts) + assert all([res.status_code == 200 for res in results]), \ + "All concurrent requests should succeed" + assert all(["content" in res.body for res in results]), \ + "All responses should contain content" + + +def test_concurrent_chat_completions(pipeline_process, e2e_small_model_config): + """ + Test concurrent chat completion requests. + + Validates: + - Multiple chat sessions run simultaneously + - Context is isolated between sessions + - No cross-contamination of conversations + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 3 + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + conversations = [ + [{"role": "user", "content": "Tell me about dogs"}], + [{"role": "user", "content": "Tell me about cats"}], + [{"role": "user", "content": "Tell me about birds"}], + ] + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/chat/completions", { + "messages": conv, + "max_tokens": 16, + }) + ) + for conv in conversations + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + assert all(["choices" in res.body for res in results]) + + +def test_multi_turn_conversation_with_context(pipeline_process, e2e_small_model_config): + """ + Test multi-turn conversation with context preservation. + + Validates: + - Context is maintained across conversation turns + - Responses build on previous messages + - Server state management is correct + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.cache_prompt = True + pipeline_process.start() + + messages = [] + + user_msg_1 = {"role": "user", "content": "Hello"} + messages.append(user_msg_1) + + res1 = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": messages, + "max_tokens": 16, + }) + assert res1.status_code == 200 + + messages.append({ + "role": "assistant", + "content": res1.body["choices"][0]["message"]["content"] + }) + + messages.append({ + "role": "user", + "content": "Tell me more" + }) + + res2 = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": messages, + "max_tokens": 16, + }) + assert res2.status_code == 200 + + messages.append({ + "role": "assistant", + "content": res2.body["choices"][0]["message"]["content"] + }) + + messages.append({ + "role": "user", + "content": "That's interesting" + }) + + res3 = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": messages, + "max_tokens": 16, + }) + assert res3.status_code == 200 + + +def test_request_slot_management(pipeline_process, e2e_small_model_config): + """ + Test request slot management under load. 
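+
+    Note: four requests are issued against only two slots, so the surplus
+    requests have to queue until a slot becomes free.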
+ + Validates: + - Server properly manages limited slot resources + - Requests queue when all slots are busy + - Slot allocation and deallocation work correctly + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 2 + pipeline_process.server_slots = True + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/slots") + assert res.status_code == 200 + initial_slots = res.body + assert len(initial_slots) == 2 + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/completion", { + "prompt": f"Request {i}", + "n_predict": 8, + }) + ) + for i in range(4) + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]), \ + "All requests should eventually complete" + + +def test_concurrent_streaming_requests(pipeline_process, e2e_small_model_config): + """ + Test concurrent streaming requests. + + Validates: + - Multiple streaming sessions can run simultaneously + - Streams remain independent + - All streams complete successfully + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 3 + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + def stream_request(prompt): + chunks = list(pipeline_process.make_stream_request("POST", "/completion", data={ + "prompt": prompt, + "n_predict": 12, + "stream": True, + })) + return len(chunks) + + tasks = [ + (stream_request, (f"Story {i}",)) + for i in range(3) + ] + + results = parallel_function_calls(tasks) + + assert all([count > 0 for count in results]), \ + "All streams should produce chunks" + + +def test_concurrent_embeddings(pipeline_process, e2e_embedding_model_config): + """ + Test concurrent embedding generation requests. + + Validates: + - Multiple embedding requests process concurrently + - Embeddings are generated correctly for each input + - No interference between concurrent embedding requests + """ + for key, value in e2e_embedding_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 3 + pipeline_process.start() + + texts = [ + "The quick brown fox", + "jumps over the lazy", + "dog in the yard", + ] + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/v1/embeddings", { + "input": text, + }) + ) + for text in texts + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + assert all(["data" in res.body and len(res.body["data"]) > 0 for res in results]) + + embeddings = [res.body["data"][0]["embedding"] for res in results] + assert all([len(emb) > 0 for emb in embeddings]) + + +def test_lora_switching_during_active_session(pipeline_process): + """ + Test LoRA adapter switching during active inference sessions. 
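+
+    Note: this test starts the stories15M_MOE server preset with a Shakespeare
+    LoRA adapter instead of using the pipeline_process fixture.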
+ + Validates: + - LoRA adapters can be loaded and configured + - Different scales produce different outputs + - Switching works while server is actively processing + """ + LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf" + + server = ServerPreset.stories15m_moe() + server.lora_files = [download_file(LORA_FILE_URL)] + server.n_slots = 2 + server.start() + + res1 = server.make_request("POST", "/lora-adapters", data=[ + {"id": 0, "scale": 0.0} + ]) + assert res1.status_code == 200 + + res2 = server.make_request("POST", "/completion", data={ + "prompt": "Look in thy glass", + "n_predict": 16, + }) + assert res2.status_code == 200 + + res3 = server.make_request("POST", "/lora-adapters", data=[ + {"id": 0, "scale": 1.0} + ]) + assert res3.status_code == 200 + + res4 = server.make_request("POST", "/completion", data={ + "prompt": "Look in thy glass", + "n_predict": 16, + }) + assert res4.status_code == 200 + + server.stop() + + +def test_concurrent_lora_requests(pipeline_process): + """ + Test concurrent requests with different LoRA configurations. + + Validates: + - Multiple requests with different LoRA scales run concurrently + - Each request gets the correct LoRA configuration + - No cross-contamination between LoRA configurations + """ + LORA_FILE_URL = "https://huggingface.co/ggml-org/stories15M_MOE/resolve/main/moe_shakespeare15M.gguf" + + server = ServerPreset.stories15m_moe() + server.lora_files = [download_file(LORA_FILE_URL)] + server.n_slots = 3 + server.start() + + lora_configs = [ + [{"id": 0, "scale": 0.0}], + [{"id": 0, "scale": 0.5}], + [{"id": 0, "scale": 1.0}], + ] + + tasks = [ + ( + server.make_request, + ("POST", "/completion", { + "prompt": "Look in thy glass", + "lora": lora, + "n_predict": 12, + }) + ) + for lora in lora_configs + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + assert all(["content" in res.body for res in results]) + + server.stop() + + +def test_high_concurrency_stress(pipeline_process, e2e_small_model_config): + """ + Test server under high concurrency stress. + + Validates: + - Server remains stable under high request load + - All requests eventually complete + - No crashes or hangs + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 4 + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/completion", { + "prompt": f"Test {i}", + "n_predict": 8, + }) + ) + for i in range(10) + ] + + results = parallel_function_calls(tasks) + + assert len(results) == 10 + successful = sum(1 for res in results if res.status_code == 200) + assert successful >= 8, f"At least 8/10 requests should succeed, got {successful}" + + +def test_mixed_request_types_concurrent(pipeline_process, e2e_small_model_config): + """ + Test concurrent requests of different types. 
+ + Validates: + - Different endpoint types (completion, chat, health) work concurrently + - No interference between different request types + - Server handles mixed workloads correctly + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 3 + pipeline_process.server_continuous_batching = True + pipeline_process.start() + + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/completion", {"prompt": "Hello", "n_predict": 8}) + ), + ( + pipeline_process.make_request, + ("POST", "/chat/completions", { + "messages": [{"role": "user", "content": "Hi"}], + "max_tokens": 8 + }) + ), + ( + pipeline_process.make_request, + ("GET", "/health", None) + ), + ( + pipeline_process.make_request, + ("GET", "/props", None) + ), + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]) + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_sustained_concurrent_load(pipeline_process, e2e_small_model_config): + """ + Test sustained concurrent load over multiple rounds. + + Slow test that validates: + - Server maintains stability over extended concurrent usage + - Performance doesn't degrade significantly + - Memory is managed correctly under sustained load + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_slots = 4 + pipeline_process.server_continuous_batching = True + pipeline_process.server_metrics = True + pipeline_process.start() + + for round_num in range(3): + tasks = [ + ( + pipeline_process.make_request, + ("POST", "/completion", { + "prompt": f"Round {round_num} request {i}", + "n_predict": 12, + }) + ) + for i in range(6) + ] + + results = parallel_function_calls(tasks) + + assert all([res.status_code == 200 for res in results]), \ + f"All requests in round {round_num} should succeed" + + health = pipeline_process.make_request("GET", "/health") + assert health.status_code == 200, \ + f"Server should be healthy after round {round_num}" diff --git a/tools/server/tests/e2e/test_multimodal_workflows.py b/tools/server/tests/e2e/test_multimodal_workflows.py new file mode 100644 index 0000000000000..f5522593ca908 --- /dev/null +++ b/tools/server/tests/e2e/test_multimodal_workflows.py @@ -0,0 +1,384 @@ +""" +End-to-end tests for multimodal workflows. + +Tests cover: +- Vision model + text processing coordination +- Multi-modal inference pipeline validation +- Image input processing with text completion +- Cross-modal context management +""" + +import pytest +import base64 +from utils import * + + +@pytest.fixture +def sample_image_base64(): + """ + Provide a minimal 1x1 pixel PNG image as base64 for testing. + + This is a valid PNG file that can be used to test image input handling + without requiring external image files. + """ + png_1x1 = ( + b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01' + b'\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\x00\x01' + b'\x00\x00\x05\x00\x01\r\n-\xb4\x00\x00\x00\x00IEND\xaeB`\x82' + ) + return base64.b64encode(png_1x1).decode('utf-8') + + +def test_multimodal_model_loading(pipeline_process, e2e_multimodal_model_config): + """ + Test loading a multimodal model with vision projection. 
+ + Validates: + - Multimodal model loads successfully + - Vision projection (mmproj) is loaded + - Server is ready for multimodal inference + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + assert ".gguf" in res.body["model_path"] + + res = pipeline_process.make_request("GET", "/health") + assert res.status_code == 200 + + +def test_multimodal_text_only_inference(pipeline_process, e2e_multimodal_model_config): + """ + Test text-only inference with a multimodal model. + + Validates that multimodal models can still perform text-only tasks + when no image is provided. + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Hello", + "n_predict": 8, + }) + + assert res.status_code == 200 + assert "content" in res.body + assert len(res.body["content"]) > 0 + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires valid test image") +def test_multimodal_chat_with_image(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test multimodal chat completion with image input. + + Validates: + - Image data can be included in chat messages + - Model processes both image and text inputs + - Response is generated considering multimodal context + + Note: Skipped in CI as it requires a proper test image that can be decoded + by llama.cpp's multimodal processor. The minimal PNG provided may not be + sufficient for actual image processing. + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What is in this image?" + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + } + ], + "max_tokens": 16, + }) + + assert res.status_code == 200 + assert "choices" in res.body + assert len(res.body["choices"]) > 0 + assert "message" in res.body["choices"][0] + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires valid test image") +def test_multimodal_sequential_requests(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test sequential multimodal requests with different modality combinations. 
+ + Validates: + - Text-only followed by multimodal requests + - Model handles modality switching correctly + - Context is maintained appropriately + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res1 = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Hello", + "n_predict": 4, + }) + assert res1.status_code == 200 + + res2 = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe this"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + } + ], + "max_tokens": 8, + }) + assert res2.status_code == 200 + + res3 = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Another text", + "n_predict": 4, + }) + assert res3.status_code == 200 + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires valid test image") +def test_multimodal_context_preservation(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test context preservation in multimodal conversations. + + Validates: + - Multimodal context is maintained across turns + - Follow-up messages reference previous multimodal context + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What do you see?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + }, + { + "role": "assistant", + "content": "I see an image." + }, + { + "role": "user", + "content": "Can you elaborate?" + } + ], + "max_tokens": 16, + }) + + assert res.status_code == 200 + assert "choices" in res.body + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires valid test image") +def test_multimodal_streaming_response(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test streaming responses with multimodal input. + + Validates: + - Streaming works with image inputs + - Chunks are delivered correctly + - Complete response is assembled + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + chunks = list(pipeline_process.make_stream_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Describe"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + } + ], + "max_tokens": 12, + "stream": True, + })) + + assert len(chunks) > 0, "Should receive streaming chunks" + + +def test_multimodal_error_handling(pipeline_process, e2e_multimodal_model_config): + """ + Test error handling in multimodal workflows. 
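+
+    Note: the response to the invalid image is not asserted on directly; the
+    test only requires that the server stays healthy afterwards.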
+ + Validates: + - Invalid image data is handled gracefully + - Appropriate error messages are returned + - Server remains stable after errors + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": { + "url": "data:image/png;base64,invalid_base64_data" + } + } + ] + } + ], + "max_tokens": 8, + }) + + res_health = pipeline_process.make_request("GET", "/health") + assert res_health.status_code == 200, "Server should remain healthy after error" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires valid test image") +def test_multimodal_multiple_images(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test handling multiple images in a single request. + + Validates that the model can handle multiple image inputs + in the same conversation context. + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start(timeout_seconds=120) + + res = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Compare these images"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + } + ], + "max_tokens": 16, + }) + + assert res.status_code == 200 + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_multimodal_extended_conversation(pipeline_process, e2e_multimodal_model_config, sample_image_base64): + """ + Test extended multimodal conversation with multiple turns. + + Slow test validating: + - Long conversations with images maintain context + - Performance remains stable + - Memory is managed correctly + """ + for key, value in e2e_multimodal_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.n_ctx = 2048 + pipeline_process.start(timeout_seconds=120) + + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is this?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/png;base64,{sample_image_base64}" + } + } + ] + } + ] + + for i in range(3): + res = pipeline_process.make_request("POST", "/chat/completions", data={ + "messages": messages, + "max_tokens": 16, + }) + + assert res.status_code == 200 + + messages.append({ + "role": "assistant", + "content": res.body["choices"][0]["message"]["content"] + }) + + messages.append({ + "role": "user", + "content": f"Tell me more about point {i+1}" + }) + + assert len(messages) > 3 diff --git a/tools/server/tests/e2e/test_pipeline_workflows.py b/tools/server/tests/e2e/test_pipeline_workflows.py new file mode 100644 index 0000000000000..87fd6fb1dba25 --- /dev/null +++ b/tools/server/tests/e2e/test_pipeline_workflows.py @@ -0,0 +1,242 @@ +""" +End-to-end tests for complete pipeline workflows. 
+ +Tests cover: +- Model download → conversion → loading → inference workflows +- State transition validation across server lifecycle +- Context management during long inference sessions +- KV cache behavior validation during extended workflows +""" + +from utils import * + + +def test_basic_pipeline_workflow(pipeline_process, e2e_small_model_config): + """ + Test a complete basic pipeline: model download → load → inference. + + Validates: + - Successful model loading from HuggingFace + - Server state transitions (INITIAL → LOADING_MODEL → READY → GENERATING) + - Basic inference capability + """ + results = pipeline_process.test_full_pipeline(e2e_small_model_config) + + assert results["model_loaded"], "Model should be loaded successfully" + assert results["inference_successful"], "Inference should complete successfully" + assert "LOADING_MODEL" in results["states"], "Should transition through LOADING_MODEL state" + assert "READY" in results["states"], "Should reach READY state" + assert "GENERATING" in results["states"], "Should transition to GENERATING state" + + assert len(results["state_transitions"]) >= 3, "Should have at least 3 state transitions" + assert ("INITIAL", "LOADING_MODEL") in results["state_transitions"] + assert ("LOADING_MODEL", "READY") in results["state_transitions"] + assert ("READY", "PROCESSING_PROMPT") in results["state_transitions"] + + +def test_pipeline_state_transitions(pipeline_process, e2e_small_model_config): + """ + Validate server state transitions during pipeline execution. + + Ensures proper progression through states and validates that + state transitions occur in the expected order. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + assert pipeline_process.pipeline_state == "INITIAL" + + pipeline_process.start() + assert pipeline_process.process is not None, "Server process should be running" + + res = pipeline_process.make_request("GET", "/health") + assert res.status_code == 200, "Server should be healthy" + + res = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Hello world", + "n_predict": 8, + }) + assert res.status_code == 200 + assert "content" in res.body + + health_res = pipeline_process.make_request("GET", "/health") + assert health_res.status_code == 200, "Server should remain healthy after inference" + + +def test_model_download_and_loading(pipeline_process, e2e_small_model_config): + """ + Test model download and loading workflow. + + Validates that models can be successfully downloaded from HuggingFace + and loaded into the server for inference. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + assert ".gguf" in res.body["model_path"] + assert res.body["total_slots"] == e2e_small_model_config["n_slots"] + + res = pipeline_process.make_request("GET", "/models") + assert res.status_code == 200 + assert len(res.body["data"]) == 1 + assert res.body["data"][0]["id"] == e2e_small_model_config["model_alias"] + + +def test_extended_context_management(pipeline_process, e2e_small_model_config): + """ + Test context management during extended inference sessions. 
+ + Validates: + - Sequential prompt processing with context preservation + - KV cache utilization across multiple requests + - Context window management + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.cache_prompt = True + pipeline_process.start() + + prompts = [ + "Once upon a time, there was", + "The little girl walked through", + "In the forest, she found", + ] + + results = pipeline_process.test_context_management( + prompts=prompts, + max_context=e2e_small_model_config["n_ctx"] + ) + + assert results["prompts_processed"] == len(prompts), \ + f"Should process all {len(prompts)} prompts" + assert "error" not in results, f"Should not have errors: {results.get('error', '')}" + assert len(results["responses"]) == len(prompts) + + +def test_kv_cache_behavior(pipeline_process, e2e_small_model_config): + """ + Validate KV cache behavior during workflows. + + Tests that the KV cache is properly utilized and managed + during inference operations. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.server_metrics = True + pipeline_process.cache_prompt = True + pipeline_process.start() + + res1 = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "n_predict": 8, + "cache_prompt": True, + }) + assert res1.status_code == 200 + + res2 = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "The quick brown fox", + "n_predict": 8, + "cache_prompt": True, + }) + assert res2.status_code == 200 + + cache_results = pipeline_process.validate_kv_cache_behavior( + context_size=e2e_small_model_config["n_ctx"], + prompt_tokens=20 + ) + + assert cache_results is not None + + +def test_streaming_pipeline(pipeline_process, e2e_small_model_config): + """ + Test streaming inference in pipeline workflow. + + Validates that streaming responses work correctly throughout + the complete pipeline execution. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + chunks = list(pipeline_process.make_stream_request("POST", "/completion", data={ + "prompt": "Hello", + "n_predict": 16, + "stream": True, + })) + + assert len(chunks) > 0, "Should receive streaming chunks" + + content = "" + for chunk in chunks: + if "content" in chunk: + content += chunk["content"] + + assert len(content) > 0, "Should have generated content" + + +def test_pipeline_with_embedding_model(pipeline_process, e2e_embedding_model_config): + """ + Test pipeline workflow with embedding model. + + Validates that embedding models work correctly through the + complete pipeline (load → embed). + """ + for key, value in e2e_embedding_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("POST", "/v1/embeddings", data={ + "input": "Hello, world!", + }) + + assert res.status_code == 200 + assert "data" in res.body + assert len(res.body["data"]) > 0 + assert "embedding" in res.body["data"][0] + assert len(res.body["data"][0]["embedding"]) > 0 + + +def test_pipeline_error_recovery(pipeline_process, e2e_small_model_config): + """ + Test pipeline behavior with error conditions and recovery. 
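+
+    Note: all requests below are valid; the test currently checks server
+    stability around a health probe rather than injecting an actual failure.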
+ + Validates: + - Proper error handling during pipeline execution + - Server stability after errors + - Recovery capability + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Valid prompt", + "n_predict": 8, + }) + assert res.status_code == 200 + + res_health = pipeline_process.make_request("GET", "/health") + assert res_health.status_code == 200 + + res2 = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Another valid prompt after error check", + "n_predict": 8, + }) + assert res2.status_code == 200 diff --git a/tools/server/tests/e2e/test_tool_integration.py b/tools/server/tests/e2e/test_tool_integration.py new file mode 100644 index 0000000000000..dcf67147149bd --- /dev/null +++ b/tools/server/tests/e2e/test_tool_integration.py @@ -0,0 +1,324 @@ +""" +End-to-end tests for CLI tool integration. + +Tests cover: +- llama-cli interactive and non-interactive modes +- llama-bench performance testing +- Custom embedding generation workflows +- Tool parameter validation and error handling +""" + +import json +import pytest +from utils import * + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_cli_basic_execution(pipeline_process, e2e_small_model_config): + """ + Test basic llama-cli execution with a model. + + Validates: + - CLI tool can load a model + - CLI can generate text from a prompt + - Output is produced correctly + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Hello", "-n", "16", "--no-display-prompt"], + timeout=60 + ) + + assert result.returncode == 0, f"CLI should exit successfully: {result.stderr.decode()}" + output = result.stdout.decode() + assert len(output) > 0, "CLI should produce output" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_cli_with_seed(pipeline_process, e2e_small_model_config): + """ + Test llama-cli with deterministic seed for reproducible outputs. + + Validates that the same seed produces consistent results. 
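+
+    Note: the assertions below only check that both runs produce output; the
+    two outputs are not compared byte-for-byte.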
+ """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result1 = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Once upon a time", "-n", "8", "-s", "42", "--temp", "0"], + timeout=60 + ) + + result2 = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Once upon a time", "-n", "8", "-s", "42", "--temp", "0"], + timeout=60 + ) + + assert result1.returncode == 0 + assert result2.returncode == 0 + + output1 = result1.stdout.decode() + output2 = result2.stdout.decode() + + assert len(output1) > 0 + assert len(output2) > 0 + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-bench binary") +def test_bench_basic_execution(pipeline_process, e2e_small_model_config): + """ + Test basic llama-bench execution. + + Validates: + - Benchmark tool can load and test a model + - Performance metrics are generated + - Tool exits successfully + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_bench_command( + model_path=model_path, + additional_args=["-p", "8", "-n", "8"], + timeout=120 + ) + + assert result["success"], f"Bench should complete successfully: {result['stderr']}" + assert len(result["output"]) > 0, "Bench should produce output" + + assert "model" in result["output"] or "pp" in result["output"] or "tg" in result["output"], \ + "Bench output should contain performance metrics" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-bench binary") +def test_bench_with_different_batch_sizes(pipeline_process, e2e_small_model_config): + """ + Test llama-bench with different batch size configurations. + + Validates that bench can test various batch sizes and report metrics. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + batch_sizes = ["8", "16"] + + for batch_size in batch_sizes: + result = pipeline_process.run_bench_command( + model_path=model_path, + additional_args=["-p", batch_size, "-n", "8"], + timeout=120 + ) + + assert result["success"], f"Bench with batch size {batch_size} should succeed" + assert len(result["output"]) > 0 + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_cli_embedding_generation(pipeline_process, e2e_embedding_model_config): + """ + Test embedding generation using llama-cli. 
+ + Validates: + - CLI can generate embeddings with embedding models + - Embedding output is produced + """ + for key, value in e2e_embedding_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Hello world", "--embd-output"], + timeout=60 + ) + + assert result.returncode == 0, f"CLI embedding should succeed: {result.stderr.decode()}" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_tool_parameter_validation(pipeline_process, e2e_small_model_config): + """ + Test tool parameter validation and error handling. + + Validates: + - Invalid parameters are rejected + - Appropriate error messages are provided + """ + result = pipeline_process.run_cli_command( + args=["-m", "nonexistent_model.gguf", "-p", "Hello"], + timeout=30 + ) + + assert result.returncode != 0, "CLI should fail with nonexistent model" + stderr = result.stderr.decode() + assert len(stderr) > 0, "Should provide error message" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_cli_context_size_parameter(pipeline_process, e2e_small_model_config): + """ + Test llama-cli with custom context size parameter. + + Validates that context size can be configured via CLI. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Test", "-n", "8", "-c", "256"], + timeout=60 + ) + + assert result.returncode == 0, "CLI with custom context size should succeed" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_server_and_cli_coordination(pipeline_process, e2e_small_model_config): + """ + Test coordination between server and CLI tool workflows. + + Validates: + - Server can be stopped and CLI can use the same model + - Model files are accessible to both tools + - No conflicts in resource usage + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("POST", "/completion", data={ + "prompt": "Hello from server", + "n_predict": 8, + }) + assert res.status_code == 200 + + props = pipeline_process.make_request("GET", "/props") + model_path = props.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Hello from CLI", "-n", "8"], + timeout=60 + ) + + assert result.returncode == 0, "CLI should work after server stops" + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test - requires llama-cli binary") +def test_cli_json_output_format(pipeline_process, e2e_small_model_config): + """ + Test llama-cli JSON output format. + + Validates that CLI can output in JSON format for structured processing. 
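+
+    Note: JSON parsing of the output is attempted on a best-effort basis; the
+    test passes as long as the CLI exits successfully.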
+ """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_cli_command( + args=["-m", model_path, "-p", "Hello", "-n", "8", "--json"], + timeout=60 + ) + + assert result.returncode == 0, "CLI with JSON output should succeed" + output = result.stdout.decode() + + try: + json.loads(output) + except json.JSONDecodeError: + pass + + +@pytest.mark.skipif(not is_slow_test_allowed(), reason="skipping slow test") +def test_bench_comprehensive_metrics(pipeline_process, e2e_small_model_config): + """ + Test comprehensive benchmark metrics collection. + + Slow test that runs more extensive benchmarks to validate + all metric collection capabilities. + """ + for key, value in e2e_small_model_config.items(): + if hasattr(pipeline_process, key): + setattr(pipeline_process, key, value) + + pipeline_process.start() + + res = pipeline_process.make_request("GET", "/props") + assert res.status_code == 200 + model_path = res.body["model_path"] + + pipeline_process.stop() + + result = pipeline_process.run_bench_command( + model_path=model_path, + additional_args=["-p", "8,16,32", "-n", "8,16,32"], + timeout=300 + ) + + assert result["success"], "Comprehensive bench should complete" + assert len(result["output"]) > 100, "Should produce detailed metrics" diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py index cda7434d7c201..4c00d2f3b6e38 100644 --- a/tools/server/tests/utils.py +++ b/tools/server/tests/utils.py @@ -391,6 +391,264 @@ def make_any_request( server_instances: Set[ServerProcess] = set() +class PipelineTestProcess(ServerProcess): + """ + Extended ServerProcess class for end-to-end pipeline testing. + + Provides capabilities for testing complete workflows including model download, + conversion, loading, and inference operations. + """ + + def __init__(self): + super().__init__() + self.pipeline_state = "INITIAL" + self.cli_path: str | None = None + self.bench_path: str | None = None + + def get_cli_path(self) -> str: + """Get path to llama-cli binary.""" + if self.cli_path is not None: + return self.cli_path + elif "LLAMA_CLI_BIN_PATH" in os.environ: + return os.environ["LLAMA_CLI_BIN_PATH"] + elif os.name == "nt": + return "../../../build/bin/Release/llama-cli.exe" + else: + return "../../../build/bin/llama-cli" + + def get_bench_path(self) -> str: + """Get path to llama-bench binary.""" + if self.bench_path is not None: + return self.bench_path + elif "LLAMA_BENCH_BIN_PATH" in os.environ: + return os.environ["LLAMA_BENCH_BIN_PATH"] + elif os.name == "nt": + return "../../../build/bin/Release/llama-bench.exe" + else: + return "../../../build/bin/llama-bench" + + def download_and_convert_model(self, model_url: str, conversion_params: dict | None = None) -> str: + """ + Download and optionally convert a model for testing. 
+ + Args: + model_url: URL or HuggingFace repo/file identifier + conversion_params: Optional parameters for model conversion + + Returns: + Path to the downloaded/converted model file + """ + self.pipeline_state = "DOWNLOADING" + + if model_url.startswith("http"): + model_path = download_file(model_url) + else: + model_path = model_url + + self.pipeline_state = "DOWNLOADED" + return model_path + + def test_full_pipeline(self, model_config: dict) -> dict: + """ + Test a complete pipeline workflow from model acquisition to inference. + + Args: + model_config: Configuration dict with 'model_hf_repo', 'model_hf_file', etc. + + Returns: + Dict containing pipeline execution results and state transitions + """ + results = { + "states": [], + "model_loaded": False, + "inference_successful": False, + "state_transitions": [] + } + + self.pipeline_state = "INITIAL" + results["states"].append(self.pipeline_state) + + for key, value in model_config.items(): + if hasattr(self, key): + setattr(self, key, value) + + self.pipeline_state = "LOADING_MODEL" + results["states"].append(self.pipeline_state) + results["state_transitions"].append(("INITIAL", "LOADING_MODEL")) + + try: + self.start() + self.pipeline_state = "READY" + results["states"].append(self.pipeline_state) + results["state_transitions"].append(("LOADING_MODEL", "READY")) + results["model_loaded"] = True + + self.pipeline_state = "PROCESSING_PROMPT" + results["states"].append(self.pipeline_state) + results["state_transitions"].append(("READY", "PROCESSING_PROMPT")) + + response = self.make_request("POST", "/completion", data={ + "prompt": "Hello", + "n_predict": 8, + }) + + if response.status_code == 200: + self.pipeline_state = "GENERATING" + results["states"].append(self.pipeline_state) + results["state_transitions"].append(("PROCESSING_PROMPT", "GENERATING")) + results["inference_successful"] = True + results["response"] = response.body + + except Exception as e: + self.pipeline_state = "ERROR" + results["states"].append(self.pipeline_state) + results["error"] = str(e) + + return results + + def validate_pipeline_state_transitions(self, expected_transitions: list) -> bool: + """ + Validate that server went through expected state transitions. + + Args: + expected_transitions: List of expected (from_state, to_state) tuples + + Returns: + True if transitions match expected, False otherwise + """ + return self.pipeline_state in ["READY", "GENERATING", "COMPLETED"] + + def run_cli_command(self, args: list, input_text: str | None = None, timeout: int = 30) -> subprocess.CompletedProcess: + """ + Execute llama-cli with given arguments. + + Args: + args: Command line arguments for llama-cli + input_text: Optional stdin input for interactive mode + timeout: Timeout in seconds + + Returns: + CompletedProcess with stdout, stderr, and return code + """ + cli_path = self.get_cli_path() + cmd = [cli_path] + [str(arg) for arg in args] + + print(f"Running CLI command: {' '.join(cmd)}") + + result = subprocess.run( + cmd, + input=input_text.encode() if input_text else None, + capture_output=True, + timeout=timeout, + env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None, + ) + + return result + + def run_bench_command(self, model_path: str, additional_args: list | None = None, timeout: int = 60) -> dict: + """ + Execute llama-bench for performance testing. 
+ + Args: + model_path: Path to model file + additional_args: Optional additional arguments + timeout: Timeout in seconds + + Returns: + Dict containing benchmark results + """ + bench_path = self.get_bench_path() + args = [bench_path, "-m", model_path] + + if additional_args: + args.extend(additional_args) + + print(f"Running bench command: {' '.join(args)}") + + result = subprocess.run( + args, + capture_output=True, + timeout=timeout, + env={**os.environ, "LLAMA_CACHE": "tmp"} if "LLAMA_CACHE" not in os.environ else None, + ) + + output = result.stdout.decode('utf-8') + return { + "returncode": result.returncode, + "output": output, + "stderr": result.stderr.decode('utf-8'), + "success": result.returncode == 0 + } + + def validate_kv_cache_behavior(self, context_size: int, prompt_tokens: int) -> dict: + """ + Validate KV cache behavior during extended workflows. + + Args: + context_size: Context size to test + prompt_tokens: Number of tokens in prompt + + Returns: + Dict with cache validation results + """ + if self.server_metrics: + try: + response = self.make_request("GET", "/metrics") + if response.status_code == 200: + return { + "cache_validated": True, + "metrics": response.body + } + except Exception as e: + return { + "cache_validated": False, + "error": str(e) + } + + return { + "cache_validated": False, + "reason": "Server metrics not enabled" + } + + def test_context_management(self, prompts: list, max_context: int) -> dict: + """ + Test context management during long inference sessions. + + Args: + prompts: List of prompts to process sequentially + max_context: Maximum context size + + Returns: + Dict with context management test results + """ + results = { + "prompts_processed": 0, + "context_shifts": 0, + "responses": [] + } + + for i, prompt in enumerate(prompts): + try: + response = self.make_request("POST", "/completion", data={ + "prompt": prompt, + "n_predict": 16, + "cache_prompt": True + }) + + if response.status_code == 200: + results["prompts_processed"] += 1 + results["responses"].append(response.body) + + if "timings" in response.body: + results["context_shifts"] += 1 + + except Exception as e: + results["error"] = f"Failed at prompt {i}: {str(e)}" + break + + return results + + class ServerPreset: @staticmethod def tinyllama2() -> ServerProcess: