diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml
index cae4785c..0d23aaba 100644
--- a/.github/workflows/docker-publish.yml
+++ b/.github/workflows/docker-publish.yml
@@ -5,12 +5,12 @@ on:
   workflow_call:
     inputs:
       tag_suffix:
-        description: 'Custom tag suffix for the Docker image'
+        description: "Custom tag suffix for the Docker image"
         required: false
         type: string
-        default: ''
+        default: ""
       is_nightly:
-        description: 'Whether this is a nightly build'
+        description: "Whether this is a nightly build"
         required: false
         type: boolean
         default: false
@@ -20,7 +20,7 @@ on:
         type: boolean
         default: true
   push:
-    branches: [ "main" ]
+    branches: ["main"]
   pull_request:
     paths:
       - ".github/workflows/docker-publish.yml"
@@ -42,16 +42,32 @@ jobs:
         # Multi-architecture build strategy:
         # - AMD64: Native build on ubuntu-latest (fast)
         # - ARM64: Cross-compilation on ubuntu-latest (faster than emulation)
-#        arch: ${{ github.event_name == 'pull_request' && fromJSON('["amd64"]') || fromJSON('["amd64", "arm64"]') }}
+        #        arch: ${{ github.event_name == 'pull_request' && fromJSON('["amd64"]') || fromJSON('["amd64", "arm64"]') }}
         arch: ["amd64", "arm64"]
       fail-fast: false
 
     steps:
+      - name: Free up disk space # first step of the job, so every later step sees the reclaimed space
+        run: |
+          echo "Before cleanup:"
+          df -h # log disk usage before deleting anything
+          sudo rm -rf /usr/share/dotnet # NOTE(review): presumably preinstalled runner tooling this build never uses - confirm
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /opt/ghc
+          sudo rm -rf /opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force # drop every unused image (not only dangling ones), no prompt
+          echo "After cleanup:"
+          df -h # log disk usage again so the job log shows how much was freed
+
       - name: Check out the repo
         uses: actions/checkout@v4
 
       - name: Set up Docker Buildx
         uses: docker/setup-buildx-action@v3
+        with:
+          driver-opts: |
+            image=moby/buildkit:latest
+            network=host
 
       - name: Set up QEMU for cross-compilation
         if: matrix.arch == 'arm64'
@@ -172,6 +188,14 @@ jobs:
             fi
           fi
 
+      - name: Additional cleanup for llm-katan (large Python packages)
+        if: matrix.image == 'llm-katan'
+        run: |
+          echo "Freeing up more space for llm-katan build..."
+          sudo apt-get clean
+          sudo rm -rf /var/lib/apt/lists/*
+          df -h
+
       - name: Build and push ${{ matrix.image }} Docker image
         id: build
         uses: docker/build-push-action@v5
@@ -182,10 +206,8 @@ jobs:
           push: ${{ github.event_name != 'pull_request' }}
           load: ${{ github.event_name == 'pull_request' }}
           tags: ${{ steps.tags.outputs.tags }}
-          cache-from: |
-            type=gha
-            type=local,src=/tmp/.buildx-cache
-          cache-to: type=local,dest=/tmp/.buildx-cache,mode=max
+          cache-from: type=gha,scope=${{ matrix.image }}-${{ matrix.arch }}
+          cache-to: type=gha,mode=max,scope=${{ matrix.image }}-${{ matrix.arch }}
           build-args: |
             BUILDKIT_INLINE_CACHE=1
             CARGO_BUILD_JOBS=${{ github.event_name == 'pull_request' && '8' || '16' }}
diff --git a/.github/workflows/test-and-build.yml b/.github/workflows/test-and-build.yml
index 864c3159..d77545f5 100644
--- a/.github/workflows/test-and-build.yml
+++ b/.github/workflows/test-and-build.yml
@@ -64,6 +64,7 @@ jobs:
           key: ${{ runner.os }}-models-v1-${{ hashFiles('tools/make/models.mk') }}
           restore-keys: |
             ${{ runner.os }}-models-v1-
+        continue-on-error: true # Don't fail the job if caching fails
 
       - name: Check go mod tidy
         run: make check-go-mod-tidy
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8367a124..41c055b4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,100 +1,100 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
-# Basic hooks for Go, Rust, Python And JavaScript files only
-- repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v6.0.0
-  hooks:
-  - id: trailing-whitespace
-    files: \.(go|rs|py|js)$
-  - id: end-of-file-fixer
-    files: \.(go|rs|py|js)$
-  - id: check-added-large-files
-    args: ['--maxkb=500']
-    files: \.(go|rs|py|js)$
+  # Basic hooks for Go, Rust, Python And JavaScript files only
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+      - id: trailing-whitespace
+        files: \.(go|rs|py|js)$
+      - id: end-of-file-fixer
+        files: \.(go|rs|py|js)$
+      - id: check-added-large-files
+        args: ["--maxkb=500"]
+        files: \.(go|rs|py|js)$
 
-# Go specific hooks
-- repo: local
-  hooks:
-  - id: go-fmt
-    name: go fmt
-    entry: gofmt -w
-    language: system
-    files: \.go$
+  # Go specific hooks
+  - repo: local
+    hooks:
+      - id: go-fmt
+        name: go fmt
+        entry: gofmt -w
+        language: system
+        files: \.go$
 
-- repo: local
-  hooks:
-    - id: golang-lint
-      name: go lint
-      entry: make go-lint
-      language: system
-      files: \.go$
-      pass_filenames: false
+  - repo: local
+    hooks:
+      - id: golang-lint
+        name: go lint
+        entry: make go-lint
+        language: system
+        files: \.go$
+        pass_filenames: false
 
-# Markdown specific hooks
-- repo: local
-  hooks:
-  - id: md-fmt
-    name: md fmt
-    entry: bash -c "make markdown-lint"
-    language: system
-    files: \.md$
-    exclude: ^(\node_modules/|CLAUDE\.md)
+  # Markdown specific hooks (node_modules/ and CLAUDE.md are excluded)
+  - repo: local
+    hooks:
+      - id: md-fmt
+        name: md fmt
+        entry: bash -c "make markdown-lint"
+        language: system
+        files: \.md$
+        exclude: ^(node_modules/|CLAUDE\.md)
 
-# Yaml specific hooks
-- repo: local
-  hooks:
-  - id: yaml-and-yml-fmt
-    name: yaml/yml fmt
-    entry: bash -c "make markdown-lint"
-    language: system
-    files: \.(yaml|yml)$
-    exclude: ^(\node_modules/)
+  # Yaml specific hooks (node_modules/ is excluded)
+  - repo: local
+    hooks:
+      - id: yaml-and-yml-fmt
+        name: yaml/yml fmt
+        entry: bash -c "make markdown-lint" # NOTE(review): YAML hook runs the markdown target - confirm this is intended
+        language: system
+        files: \.(yaml|yml)$
+        exclude: ^(node_modules/)
 
-# JavaScript and TypeScript specific hooks
-- repo: local
-  hooks:
-  - id: js-ts-lint
-    name: js/ts lint
-    entry: bash -c 'cd website && npm install 2>/dev/null || true && npm run lint'
-    language: system
-    files: \.(js|ts|tsx)$
-    exclude: ^(\node_modules/)
-    pass_filenames: false
+  # JavaScript and TypeScript specific hooks (node_modules/ is excluded)
+  - repo: local
+    hooks:
+      - id: js-ts-lint
+        name: js/ts lint
+        entry: bash -c 'cd website && npm install 2>/dev/null || true && npm run lint'
+        language: system
+        files: \.(js|ts|tsx)$
+        exclude: ^(node_modules/)
+        pass_filenames: false
 
-# Rust specific hooks
-- repo: local
-  hooks:
-  - id: cargo-fmt
-    name: cargo fmt
-    entry: bash -c 'cd candle-binding && rustup component add rustfmt 2>/dev/null || true && cargo fmt'
-    language: system
-    files: \.rs$
-    pass_filenames: false
-  - id: cargo-check
-    name: cargo check
-    entry: bash -c 'cd candle-binding && cargo check'
-    language: system
-    files: \.rs$
-    pass_filenames: false
+  # Rust specific hooks
+  - repo: local
+    hooks:
+      - id: cargo-fmt
+        name: cargo fmt
+        entry: bash -c 'cd candle-binding && rustup component add rustfmt 2>/dev/null || true && cargo fmt'
+        language: system
+        files: \.rs$
+        pass_filenames: false
+      - id: cargo-check
+        name: cargo check
+        entry: bash -c 'cd candle-binding && cargo check'
+        language: system
+        files: \.rs$
+        pass_filenames: false
 
-# Python specific hooks
-- repo: https://github.com/psf/black
-  rev: 25.1.0
-  hooks:
-  - id: black
-    language_version: python3
-    files: \.py$
-    exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
-
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
-    args: ["--profile", "black"]
-    files: \.py$
-    exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
+  # Python specific hooks
+  # isort must run before black
+  - repo: https://github.com/PyCQA/isort
+    rev: 6.0.1
+    hooks:
+      - id: isort
+        args: ["--profile", "black", "--line-length", "88"]
+        files: \.py$
+        exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 
+  - repo: https://github.com/psf/black
+    rev: 25.1.0
+    hooks:
+      - id: black
+        language_version: python3
+        files: \.py$
+        exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 # Commented out flake8 - only reports issues, doesn't auto-fix
 # -   repo: https://github.com/PyCQA/flake8
 #     rev: 7.3.0
diff --git a/config/config-mcp-classifier-example.yaml b/config/config-mcp-classifier-example.yaml
index 22468df6..4d7f6530 100644
--- a/config/config-mcp-classifier-example.yaml
+++ b/config/config-mcp-classifier-example.yaml
@@ -14,7 +14,7 @@
 
 # BERT model for semantic caching and tool selection
 bert_model:
-  model_id: "sentence-transformers/all-MiniLM-L6-v2"
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.85
   use_cpu: true
 
diff --git a/config/config.development.yaml b/config/config.development.yaml
index fa7afdef..31051e7c 100644
--- a/config/config.development.yaml
+++ b/config/config.development.yaml
@@ -3,7 +3,7 @@
 # for local development and debugging.
 
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
diff --git a/config/config.e2e.yaml b/config/config.e2e.yaml
index 42167503..b588849f 100644
--- a/config/config.e2e.yaml
+++ b/config/config.e2e.yaml
@@ -1,5 +1,5 @@
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 semantic_cache:
diff --git a/config/config.production.yaml b/config/config.production.yaml
index edd049a3..9c4dd4f8 100644
--- a/config/config.production.yaml
+++ b/config/config.production.yaml
@@ -3,7 +3,7 @@
 # for production deployment with Jaeger or other OTLP-compatible backends.
 
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
diff --git a/config/config.recipe-accuracy.yaml b/config/config.recipe-accuracy.yaml
index 18f2751d..584b0291 100644
--- a/config/config.recipe-accuracy.yaml
+++ b/config/config.recipe-accuracy.yaml
@@ -13,7 +13,7 @@
 # - Jailbreak protection enabled
 
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.7  # Higher threshold for better precision
   use_cpu: true
 
diff --git a/config/config.recipe-latency.yaml b/config/config.recipe-latency.yaml
index 00b3ae00..ce31a36f 100644
--- a/config/config.recipe-latency.yaml
+++ b/config/config.recipe-latency.yaml
@@ -13,7 +13,7 @@
 # - Minimal observability overhead
 
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.4  # Very low threshold for fast matching
   use_cpu: true
 
diff --git a/config/config.recipe-token-efficiency.yaml b/config/config.recipe-token-efficiency.yaml
index b76aeec4..49008db5 100644
--- a/config/config.recipe-token-efficiency.yaml
+++ b/config/config.recipe-token-efficiency.yaml
@@ -13,7 +13,7 @@
 # - Larger batch sizes for efficient processing
 
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.5  # Lower threshold for faster matching
   use_cpu: true
 
diff --git a/config/config.testing.yaml b/config/config.testing.yaml
index 9dc59e5c..91722f56 100644
--- a/config/config.testing.yaml
+++ b/config/config.testing.yaml
@@ -1,5 +1,5 @@
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
diff --git a/config/config.yaml b/config/config.yaml
index 5ad29d5a..1e2c43d7 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,15 +1,15 @@
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
 semantic_cache:
   enabled: true
-  backend_type: "memory"  # Options: "memory" or "milvus"
+  backend_type: "memory" # Options: "memory" or "milvus"
   similarity_threshold: 0.8
-  max_entries: 1000  # Only applies to memory backend
+  max_entries: 1000 # Only applies to memory backend
   ttl_seconds: 3600
-  eviction_policy: "fifo"  
+  eviction_policy: "fifo"
 
 tools:
   enabled: true
@@ -32,13 +32,13 @@ prompt_guard:
 # NOT supported: domain names (example.com), protocol prefixes (http://), paths (/api), ports in address (use 'port' field)
 vllm_endpoints:
   - name: "endpoint1"
-    address: "172.28.0.20"  # Static IPv4 of llm-katan within docker compose network
+    address: "172.28.0.20" # Static IPv4 of llm-katan within docker compose network
     port: 8002
     weight: 1
 
 model_config:
   "qwen3":
-    reasoning_family: "qwen3"  # This model uses Qwen-3 reasoning syntax
+    reasoning_family: "qwen3" # This model uses Qwen-3 reasoning syntax
     preferred_endpoints: ["endpoint1"]
     pii_policy:
       allow_by_default: true
@@ -65,7 +65,7 @@ categories:
     model_scores:
       - model: qwen3
         score: 0.7
-        use_reasoning: false  # Business performs better without reasoning
+        use_reasoning: false # Business performs better without reasoning
   - name: law
     system_prompt: "You are a knowledgeable legal expert with comprehensive understanding of legal principles, case law, statutory interpretation, and legal procedures across multiple jurisdictions. Provide accurate legal information and analysis while clearly stating that your responses are for informational purposes only and do not constitute legal advice. Always recommend consulting with qualified legal professionals for specific legal matters."
     model_scores:
@@ -89,7 +89,7 @@ categories:
     model_scores:
       - model: qwen3
         score: 0.6
-        use_reasoning: true  # Enable reasoning for complex chemistry
+        use_reasoning: true # Enable reasoning for complex chemistry
   - name: history
     system_prompt: "You are a historian with expertise across different time periods and cultures. Provide accurate historical context and analysis."
     model_scores:
@@ -119,13 +119,13 @@ categories:
     model_scores:
       - model: qwen3
         score: 1.0
-        use_reasoning: true  # Enable reasoning for complex math
+        use_reasoning: true # Enable reasoning for complex math
   - name: physics
     system_prompt: "You are a physics expert with deep understanding of physical laws and phenomena. Provide clear explanations with mathematical derivations when appropriate."
     model_scores:
       - model: qwen3
         score: 0.7
-        use_reasoning: true  # Enable reasoning for physics
+        use_reasoning: true # Enable reasoning for physics
   - name: computer science
     system_prompt: "You are a computer science expert with knowledge of algorithms, data structures, programming languages, and software engineering. Provide clear, practical solutions with code examples when helpful."
     model_scores:
@@ -178,23 +178,23 @@ api:
       detailed_goroutine_tracking: true
       high_resolution_timing: false
       sample_rate: 1.0
-      duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
+      duration_buckets:
+        [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
       size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
 
 # Observability Configuration
 observability:
   tracing:
-    enabled: false  # Enable distributed tracing (default: false)
-    provider: "opentelemetry"  # Provider: opentelemetry, openinference, openllmetry
+    enabled: false # Enable distributed tracing (default: false)
+    provider: "opentelemetry" # Provider: opentelemetry, openinference, openllmetry
     exporter:
-      type: "stdout"  # Exporter: otlp, jaeger, zipkin, stdout
-      endpoint: "localhost:4317"  # OTLP endpoint (when type: otlp)
-      insecure: true  # Use insecure connection (no TLS)
+      type: "stdout" # Exporter: otlp, jaeger, zipkin, stdout
+      endpoint: "localhost:4317" # OTLP endpoint (when type: otlp)
+      insecure: true # Use insecure connection (no TLS)
     sampling:
-      type: "always_on"  # Sampling: always_on, always_off, probabilistic
-      rate: 1.0  # Sampling rate for probabilistic (0.0-1.0)
+      type: "always_on" # Sampling: always_on, always_off, probabilistic
+      rate: 1.0 # Sampling rate for probabilistic (0.0-1.0)
     resource:
       service_name: "vllm-semantic-router"
       service_version: "v0.1.0"
       deployment_environment: "development"
-
diff --git a/e2e-tests/llm-katan/Dockerfile b/e2e-tests/llm-katan/Dockerfile
index 9e29080e..303fc016 100644
--- a/e2e-tests/llm-katan/Dockerfile
+++ b/e2e-tests/llm-katan/Dockerfile
@@ -17,7 +17,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
 
 # Copy requirements first for better layer caching
 COPY requirements.txt ./
-RUN pip install --no-cache-dir -r requirements.txt
+# Install PyTorch CPU-only version to save space (no CUDA for testing server)
+RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu && \
+    pip install --no-cache-dir -r requirements.txt
 
 # Copy the llm_katan package
 COPY llm_katan/ ./llm_katan/
diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
index 783de39a..ba7c0ab6 100644
--- a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
+++ b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
@@ -69,13 +69,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_qwen3_generative_lora.py b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_qwen3_generative_lora.py
index 01378b03..147d564f 100644
--- a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_qwen3_generative_lora.py
+++ b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_qwen3_generative_lora.py
@@ -53,13 +53,7 @@
 
 import torch
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
index e9147caf..a48c4d1d 100644
--- a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
+++ b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
@@ -70,13 +70,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
index 408792dc..da5007cd 100644
--- a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
+++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
@@ -77,13 +77,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/tools/make/docker.mk b/tools/make/docker.mk
index 975d91f4..9437354a 100644
--- a/tools/make/docker.mk
+++ b/tools/make/docker.mk
@@ -130,9 +130,24 @@ docker-compose-rebuild-llm-katan: docker-compose-up-llm-katan
 
 docker-compose-down:
 	@$(LOG_TARGET)
-	@echo "Stopping docker-compose services..."
+	@echo "Stopping docker-compose services (default includes llm-katan)..."
+	@docker compose --profile llm-katan down
+
+docker-compose-down-core:
+	@$(LOG_TARGET)
+	@echo "Stopping core services only (no llm-katan)..."
 	@docker compose down
 
+docker-compose-down-testing:
+	@$(LOG_TARGET)
+	@echo "Stopping services with testing profile..."
+	@docker compose --profile testing down
+
+docker-compose-down-llm-katan:
+	@$(LOG_TARGET)
+	@echo "Stopping services with llm-katan profile..."
+	@docker compose --profile llm-katan down
+
 # Help target for Docker commands
 docker-help:
 	@echo "Docker Make Targets:"
@@ -152,7 +167,10 @@ docker-help:
 	@echo "  docker-compose-rebuild               - Force rebuild then start"
 	@echo "  docker-compose-rebuild-testing       - Force rebuild (testing profile)"
 	@echo "  docker-compose-rebuild-llm-katan     - Force rebuild (llm-katan profile)"
-	@echo "  docker-compose-down                  - Stop docker-compose services"
+	@echo "  docker-compose-down                  - Stop services (default includes llm-katan)"
+	@echo "  docker-compose-down-core             - Stop core services only (no llm-katan)"
+	@echo "  docker-compose-down-testing          - Stop services with testing profile"
+	@echo "  docker-compose-down-llm-katan        - Stop services with llm-katan profile"
 	@echo ""
 	@echo "Environment Variables:"
 	@echo "  DOCKER_REGISTRY - Docker registry (default: ghcr.io/vllm-project/semantic-router)"
diff --git a/tools/make/models.mk b/tools/make/models.mk
index 500b8031..a22828e0 100644
--- a/tools/make/models.mk
+++ b/tools/make/models.mk
@@ -28,6 +28,9 @@ download-models-minimal:
 	@if [ ! -f "models/Qwen/Qwen3-0.6B/.downloaded" ] || [ ! -d "models/Qwen/Qwen3-0.6B" ]; then \
 		hf download Qwen/Qwen3-0.6B --local-dir models/Qwen/Qwen3-0.6B && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/Qwen/Qwen3-0.6B/.downloaded; \
 	fi
+	@if [ ! -f "models/all-MiniLM-L12-v2/.downloaded" ] || [ ! -d "models/all-MiniLM-L12-v2" ]; then \
+		hf download sentence-transformers/all-MiniLM-L12-v2 --local-dir models/all-MiniLM-L12-v2 && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/all-MiniLM-L12-v2/.downloaded; \
+	fi
 	@if [ ! -f "models/category_classifier_modernbert-base_model/.downloaded" ] || [ ! -d "models/category_classifier_modernbert-base_model" ]; then \
 		hf download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir models/category_classifier_modernbert-base_model && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/category_classifier_modernbert-base_model/.downloaded; \
 	fi
@@ -49,6 +52,9 @@ download-models-full:
 	@if [ ! -f "models/Qwen/Qwen3-0.6B/.downloaded" ] || [ ! -d "models/Qwen/Qwen3-0.6B" ]; then \
 		hf download Qwen/Qwen3-0.6B --local-dir models/Qwen/Qwen3-0.6B && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/Qwen/Qwen3-0.6B/.downloaded; \
 	fi
+	@if [ ! -f "models/all-MiniLM-L12-v2/.downloaded" ] || [ ! -d "models/all-MiniLM-L12-v2" ]; then \
+		hf download sentence-transformers/all-MiniLM-L12-v2 --local-dir models/all-MiniLM-L12-v2 && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/all-MiniLM-L12-v2/.downloaded; \
+	fi
 	@if [ ! -f "models/category_classifier_modernbert-base_model/.downloaded" ] || [ ! -d "models/category_classifier_modernbert-base_model" ]; then \
 		hf download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir models/category_classifier_modernbert-base_model && printf '%s\n' "$$(date -u +%Y-%m-%dT%H:%M:%SZ)" > models/category_classifier_modernbert-base_model/.downloaded; \
 	fi