From 211b547bad4ced9ee78c3496427877291220f0b3 Mon Sep 17 00:00:00 2001
From: FeiDaLI <1607741781@qq.com>
Date: Sun, 5 Oct 2025 19:47:03 +0800
Subject: [PATCH 1/3] feat(mock_and_cuda): add cuda support and llm-katan
 support

Signed-off-by: FeiDaLI <1607741781@qq.com>
---
 candle-binding/Cargo.toml                     |   6 +-
 candle-binding/src/lib.rs                     |   4 +-
 .../dual_classifier/dual_classifier.py        |   2 +-
 src/training/dual_classifier/trainer.py       |   2 +-
 .../jailbreak_bert_finetuning.py              |   2 +-
 tools/llm-katan-server/Dockerfile             |  21 +++
 tools/llm-katan-server/README.md              | 101 ++++++++++++++
 tools/llm-katan-server/app.py                 | 132 ++++++++++++++++++
 tools/llm-katan-server/requirements.txt       |   4 +
 website/docs/installation/installation.md     |  32 +++++
 10 files changed, 298 insertions(+), 8 deletions(-)
 create mode 100644 tools/llm-katan-server/Dockerfile
 create mode 100644 tools/llm-katan-server/README.md
 create mode 100644 tools/llm-katan-server/app.py
 create mode 100644 tools/llm-katan-server/requirements.txt

diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
index 9b9364f4..4159a9aa 100644
--- a/candle-binding/Cargo.toml
+++ b/candle-binding/Cargo.toml
@@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = "0.8.4"
-candle-nn = "0.8.4"
-candle-transformers = "0.8.4"
+candle-core = { version = "0.8.4", features = ["cuda"] }
+candle-nn = { version= "0.8.4", features = ["cuda"] }    
+candle-transformers = { version= "0.8.4", features = ["cuda"] }    
 tokenizers = { version = "0.21.0", features = ["http"] }
 hf-hub = "0.4.1"
 safetensors = "0.4.1"
diff --git a/candle-binding/src/lib.rs b/candle-binding/src/lib.rs
index d778c3fb..87380f96 100644
--- a/candle-binding/src/lib.rs
+++ b/candle-binding/src/lib.rs
@@ -495,7 +495,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,
@@ -517,7 +517,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,
diff --git a/src/training/dual_classifier/dual_classifier.py b/src/training/dual_classifier/dual_classifier.py
index 6cdb30ef..e2088060 100644
--- a/src/training/dual_classifier/dual_classifier.py
+++ b/src/training/dual_classifier/dual_classifier.py
@@ -19,7 +19,7 @@ def __init__(
         self,
         num_categories: int,
         model_name: str = "distilbert-base-uncased",
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         super().__init__()
 
diff --git a/src/training/dual_classifier/trainer.py b/src/training/dual_classifier/trainer.py
index 773a6bab..04b5b876 100644
--- a/src/training/dual_classifier/trainer.py
+++ b/src/training/dual_classifier/trainer.py
@@ -22,7 +22,7 @@ def __init__(
         category_labels: List[int],
         pii_labels: List[List[int]],  # Token-level PII labels
         tokenizer,
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         self.texts = texts
         self.category_labels = category_labels
diff --git a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
index a795f12a..ee66d405 100644
--- a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
+++ b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
@@ -641,7 +641,7 @@ def _analyze_sequence_lengths(self, texts: List[str], tokenizer) -> Dict[str, in
         }
 
     def optimize_sequence_length(
-        self, texts: List[str], tokenizer, default_max_length: int = 512
+        self, texts: List[str], tokenizer, default_max_length: int = 100000
     ) -> int:
         """Find optimal sequence length based on dataset characteristics."""
         logger.info("Analyzing sequence length distribution...")
diff --git a/tools/llm-katan-server/Dockerfile b/tools/llm-katan-server/Dockerfile
new file mode 100644
index 00000000..88b76c74
--- /dev/null
+++ b/tools/llm-katan-server/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+# Install dependencies before copying app.py so edits to app.py keep the pip layer cached.
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app.py ./
+
+EXPOSE 8000
+# Defaults read by app.py via os.getenv; override with `docker run -e` or compose `environment:`.
+# NOTE: inside the container "localhost" is this container, so LLM_KATAN_URL must be overridden.
+ENV MODEL=Qwen/Qwen2-0.5B-Instruct
+ENV SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
+ENV LLM_KATAN_URL=http://localhost:8001
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/tools/llm-katan-server/README.md b/tools/llm-katan-server/README.md
new file mode 100644
index 00000000..4475767b
--- /dev/null
+++ b/tools/llm-katan-server/README.md
@@ -0,0 +1,101 @@
+# LLM Katan Server
+
+A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that provides the same API design as mock-vllm but uses real LLM functionality.
+
+## Architecture
+
+This server acts as a proxy that:
+1. Receives OpenAI-compatible API requests
+2. Forwards them to a running `llm-katan` instance
+3. Returns the responses with proper model name mapping
+4. Falls back to a canned echo response if the `llm-katan` backend cannot be reached (connection error or timeout)
+
+## Features
+
+- Same API design as mock-vllm (FastAPI-based)
+- Proxies requests to real `llm-katan` backend
+- OpenAI-compatible API endpoints:
+  - GET /health
+  - GET /v1/models  
+  - POST /v1/chat/completions
+- Fallback behavior when backend is unavailable
+- Configurable via environment variables
+
+## Environment Variables
+
+- `MODEL`: HuggingFace model name for llm-katan (default: `Qwen/Qwen2-0.5B-Instruct`)
+- `SERVED_MODEL_NAME`: Model name to expose in API (default: same as MODEL)
+- `LLM_KATAN_URL`: URL of the llm-katan backend (default: `http://localhost:8001`)
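+- `HUGGINGFACE_HUB_TOKEN`: HuggingFace authentication token (optional; this wrapper only prints a warning when it is unset — pass it to the llm-katan backend for gated models)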
+
+## Setup
+
+### 1. Start llm-katan backend
+
+```bash
+# Install llm-katan
+pip install llm-katan
+
+# Start llm-katan server on port 8001
+llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
+```
+
+### 2. Start this FastAPI server
+
+```bash
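+# Using Docker (build the image first; the default LLM_KATAN_URL of localhost would point at the container itself)
+docker build -t llm-katan-server . && docker run -p 8000:8000 --add-host=host.docker.internal:host-gateway -e LLM_KATAN_URL=http://host.docker.internal:8001 llm-katan-server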
+
+# Or directly with Python
+pip install -r requirements.txt
+python app.py
+```
+
+## Usage
+
+### Docker Compose (Recommended)
+
+```yaml
+services:
+  llm-katan-backend:
+    image: python:3.11-slim
+    command: >
+      sh -c "pip install llm-katan && 
+             llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 --host 0.0.0.0"
+    ports:
+      - "8001:8001"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN}
+
+  llm-katan-server:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - MODEL=Qwen/Qwen2-0.5B-Instruct
+      - SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
+      - LLM_KATAN_URL=http://llm-katan-backend:8001
+    depends_on:
+      - llm-katan-backend
+```
+
+### Testing
+
+```bash
+# Health check
+curl http://localhost:8000/health
+
+# List models
+curl http://localhost:8000/v1/models
+
+# Chat completion (uses real LLM)
+curl -X POST http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen2-0.5B-Instruct",
+    "messages": [{"role": "user", "content": "Hello!"}],
+    "max_tokens": 50
+  }'
+```
+
+Intended for local testing with Docker Compose profile `testing`.
diff --git a/tools/llm-katan-server/app.py b/tools/llm-katan-server/app.py
new file mode 100644
index 00000000..05e8f5b2
--- /dev/null
+++ b/tools/llm-katan-server/app.py
@@ -0,0 +1,132 @@
+import math
+import time
+import os
+import requests
+from typing import List, Optional
+
+import uvicorn
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+
+app = FastAPI()
+
+# Configuration
+MODEL = os.getenv("MODEL", "Qwen/Qwen2-0.5B-Instruct")
+SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", MODEL)
+LLM_KATAN_URL = os.getenv("LLM_KATAN_URL", "http://localhost:8001")
+
+# Check if HuggingFace token is set
+hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
+if not hf_token:
+    print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.")
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatRequest(BaseModel):
+    model: str
+    messages: List[ChatMessage]
+    temperature: Optional[float] = 0.2
+    max_tokens: Optional[int] = None
+
+
+@app.get("/health")
+async def health():
+    return {"status": "ok"}
+
+
+@app.get("/v1/models")
+async def models():
+    return {"data": [{"id": SERVED_MODEL_NAME, "object": "model"}]}
+
+
+@app.post("/v1/chat/completions")
+async def chat_completions(req: ChatRequest):
+    try:
+        # Forward request to llm-katan backend
+        llm_katan_request = {
+            "model": MODEL,
+            "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages],
+            "temperature": req.temperature,
+        }
+        
+        if req.max_tokens:
+            llm_katan_request["max_tokens"] = req.max_tokens
+        
+        # Make request to llm-katan
+        response = requests.post(
+            f"{LLM_KATAN_URL}/v1/chat/completions",
+            json=llm_katan_request,
+            timeout=30
+        )
+        
+        if response.status_code != 200:
+            raise HTTPException(
+                status_code=response.status_code,
+                detail=f"LLM Katan error: {response.text}"
+            )
+        
+        result = response.json()
+        
+        # Update the model name in response to match our served model name
+        result["model"] = req.model
+        
+        return result
+        
+    except requests.exceptions.RequestException as e:
+        # Fallback to simple echo behavior if llm-katan is not available
+        print(f"Warning: LLM Katan not available ({e}), using fallback response")
+        
+        # Simple echo-like behavior as fallback
+        last_user = next(
+            (m.content for m in reversed(req.messages) if m.role == "user"), ""
+        )
+        content = f"[katan-{req.model}] You said: {last_user}"
+
+        # Rough token estimation: ~1 token per 4 characters (ceil)
+        def estimate_tokens(text: str) -> int:
+            if not text:
+                return 0
+            return max(1, math.ceil(len(text) / 4))
+
+        prompt_text = "\n".join(
+            m.content for m in req.messages if isinstance(m.content, str)
+        )
+        prompt_tokens = estimate_tokens(prompt_text)
+        completion_tokens = estimate_tokens(content)
+        total_tokens = prompt_tokens + completion_tokens
+
+        created_ts = int(time.time())
+
+        usage = {
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "prompt_tokens_details": {"cached_tokens": 0},
+            "completion_tokens_details": {"reasoning_tokens": 0},
+        }
+
+        return {
+            "id": "cmpl-katan-123",
+            "object": "chat.completion",
+            "created": created_ts,
+            "model": req.model,
+            "system_fingerprint": "llm-katan-server",
+            "choices": [
+                {
+                    "index": 0,
+                    "message": {"role": "assistant", "content": content},
+                    "finish_reason": "stop",
+                    "logprobs": None,
+                }
+            ],
+            "usage": usage,
+            "token_usage": usage,
+        }
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/tools/llm-katan-server/requirements.txt b/tools/llm-katan-server/requirements.txt
new file mode 100644
index 00000000..6c9f1068
--- /dev/null
+++ b/tools/llm-katan-server/requirements.txt
@@ -0,0 +1,4 @@
+fastapi==0.115.0
+uvicorn==0.30.6
+pydantic==2.9.2
+requests==2.31.0
diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md
index a96c683b..7dd685e0 100644
--- a/website/docs/installation/installation.md
+++ b/website/docs/installation/installation.md
@@ -122,6 +122,38 @@ model_config:
     preferred_endpoints: ["your-endpoint"]
 ```
 
+:::tip[**No vLLM Backend? Use Mock Services for Testing**]
+If you don't have a vLLM backend set up, you can use the provided mock services for testing:
+
+**Option 1: Mock vLLM (Simple Echo Service)**
+```bash
+# Start a simple mock service that echoes back responses
+python tools/mock-vllm/app.py
+```
+
+**Option 2: LLM Katan Server (Real LLM with Lightweight Backend)**
+```bash
+# First, start llm-katan backend (requires pip install llm-katan)
+llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
+
+# Then start the FastAPI wrapper (requires pip install -r tools/llm-katan-server/requirements.txt)
+python tools/llm-katan-server/app.py
+```
+
+For the mock services, update your `config/config.yaml`:
+```yaml
+vllm_endpoints:
+  - name: "mock-endpoint"
+    address: "127.0.0.1"
+    port: 8000                    # Mock service port
+    models:
+      - "openai/gpt-oss-20b"      # For mock-vllm
+      # OR
+      - "Qwen/Qwen2-0.5B-Instruct"  # For llm-katan-server
+    weight: 1
+```
+:::
+
 :::note[**Important: Address Format Requirements**]
 The `address` field **must** contain a valid IP address (IPv4 or IPv6). Domain names are not supported.
 

From 6cb1b5506886267dc902babb7cf4dcf396a7d46e Mon Sep 17 00:00:00 2001
From: FeiDaLI <1607741781@qq.com>
Date: Sun, 5 Oct 2025 20:13:39 +0800
Subject: [PATCH 2/3] feat(mock): llm-katan support

Signed-off-by: FeiDaLI <1607741781@qq.com>
---
 .pre-commit-config.yaml                       |  6 ++--
 e2e-tests/06-pii-detection-test.py            |  8 ++---
 .../classifier_model_fine_tuning/ft_linear.py | 22 +++++++-----
 .../jailbreak_bert_finetuning.py              | 28 +++++++++------
 .../ft_linear_lora.py                         |  8 +----
 .../pii_bert_finetuning_lora.py               |  8 +----
 .../jailbreak_bert_finetuning_lora.py         |  8 +----
 tools/llm-katan-server/README.md              | 22 +-----------
 tools/llm-katan-server/app.py                 | 34 ++++++++++---------
 website/docs/installation/installation.md     |  4 +++
 10 files changed, 64 insertions(+), 84 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9438abb6..8b1ca784 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,7 +3,7 @@
 repos:
 # Basic hooks for Go, Rust, Python And JavaScript files only
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v6.0.0
+  rev: v4.4.0
   hooks:
   - id: trailing-whitespace
     files: \.(go|rs|py|js)$
@@ -71,7 +71,7 @@ repos:
 
 # Python specific hooks
 - repo: https://github.com/psf/black
-  rev: 25.1.0
+  rev: 22.12.0
   hooks:
   - id: black
     language_version: python3
@@ -79,7 +79,7 @@ repos:
     exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 
 - repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
+  rev: 5.12.0
   hooks:
   - id: isort
     args: ["--profile", "black"]
diff --git a/e2e-tests/06-pii-detection-test.py b/e2e-tests/06-pii-detection-test.py
index 3a7a6167..5cef1302 100644
--- a/e2e-tests/06-pii-detection-test.py
+++ b/e2e-tests/06-pii-detection-test.py
@@ -652,7 +652,9 @@ def test_extproc_pii_detection_comprehensive(self):
             status = (
                 "🔒"
                 if result["is_blocked"]
-                else "✅" if result["request_allowed"] else "❌"
+                else "✅"
+                if result["request_allowed"]
+                else "❌"
             )
             print(f"  {status} {result['test_case']}")
             print(f"      Content: {result['content']}")
@@ -671,9 +673,7 @@ def test_extproc_pii_detection_comprehensive(self):
                 passed=False,
                 message="⚠️ No clear evidence of ExtProc PII detection in production pipeline",
             )
-            print(
-                "📝 NOTE: This may indicate PII detection is not active in ExtProc or"
-            )
+            print("📝 NOTE: This may indicate PII detection is not active in ExtProc or")
             print("         PII policies are configured to allow all content through")
 
     def test_multiple_pii_types_analysis(self):
diff --git a/src/training/classifier_model_fine_tuning/ft_linear.py b/src/training/classifier_model_fine_tuning/ft_linear.py
index 3c736202..740aed8c 100644
--- a/src/training/classifier_model_fine_tuning/ft_linear.py
+++ b/src/training/classifier_model_fine_tuning/ft_linear.py
@@ -508,19 +508,25 @@ def tokenize_function(examples):
 
     # Evaluate on validation set
     logger.info("Evaluating on validation set...")
-    val_accuracy, val_report, val_conf_matrix, val_predictions = (
-        evaluate_category_classifier(
-            model, tokenizer, val_texts, val_categories, idx_to_category, device
-        )
+    (
+        val_accuracy,
+        val_report,
+        val_conf_matrix,
+        val_predictions,
+    ) = evaluate_category_classifier(
+        model, tokenizer, val_texts, val_categories, idx_to_category, device
     )
     logger.info(f"Validation accuracy: {val_accuracy:.4f}")
 
     # Evaluate on test set
     logger.info("Evaluating on test set...")
-    test_accuracy, test_report, test_conf_matrix, test_predictions = (
-        evaluate_category_classifier(
-            model, tokenizer, test_texts, test_categories, idx_to_category, device
-        )
+    (
+        test_accuracy,
+        test_report,
+        test_conf_matrix,
+        test_predictions,
+    ) = evaluate_category_classifier(
+        model, tokenizer, test_texts, test_categories, idx_to_category, device
     )
     logger.info(f"Test accuracy: {test_accuracy:.4f}")
 
diff --git a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
index ee66d405..deacec2f 100644
--- a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
+++ b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py
@@ -100,9 +100,9 @@
 import torch
 
 # Suppress common non-critical warnings
-os.environ["TOKENIZERS_PARALLELISM"] = (
-    "false"  # Suppress tokenizer parallelism warnings
-)
+os.environ[
+    "TOKENIZERS_PARALLELISM"
+] = "false"  # Suppress tokenizer parallelism warnings
 warnings.filterwarnings(
     "ignore", message=".*TensorFloat32.*"
 )  # Suppress TF32 performance hints
@@ -2343,19 +2343,25 @@ def tokenize_function(examples):
 
     # Evaluate on validation set
     logger.info("Evaluating on validation set...")
-    val_accuracy, val_report, val_conf_matrix, val_predictions = (
-        evaluate_jailbreak_classifier(
-            model, tokenizer, val_texts, val_categories, idx_to_category, device
-        )
+    (
+        val_accuracy,
+        val_report,
+        val_conf_matrix,
+        val_predictions,
+    ) = evaluate_jailbreak_classifier(
+        model, tokenizer, val_texts, val_categories, idx_to_category, device
     )
     logger.info(f"Validation accuracy: {val_accuracy:.4f}")
 
     # Evaluate on test set
     logger.info("Evaluating on test set...")
-    test_accuracy, test_report, test_conf_matrix, test_predictions = (
-        evaluate_jailbreak_classifier(
-            model, tokenizer, test_texts, test_categories, idx_to_category, device
-        )
+    (
+        test_accuracy,
+        test_report,
+        test_conf_matrix,
+        test_predictions,
+    ) = evaluate_jailbreak_classifier(
+        model, tokenizer, test_texts, test_categories, idx_to_category, device
     )
     logger.info(f"Test accuracy: {test_accuracy:.4f}")
 
diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
index 3a955a46..0d855c61 100644
--- a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
+++ b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py
@@ -69,13 +69,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
index f182499b..a730eef6 100644
--- a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
+++ b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py
@@ -70,13 +70,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
index 76b6df02..bdce39c6 100644
--- a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
+++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py
@@ -77,13 +77,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (
diff --git a/tools/llm-katan-server/README.md b/tools/llm-katan-server/README.md
index 4475767b..e2b04d60 100644
--- a/tools/llm-katan-server/README.md
+++ b/tools/llm-katan-server/README.md
@@ -5,6 +5,7 @@ A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that p
 ## Architecture
 
 This server acts as a proxy that:
+
 1. Receives OpenAI-compatible API requests
 2. Forwards them to a running `llm-katan` instance
 3. Returns the responses with proper model name mapping
@@ -78,24 +79,3 @@ services:
     depends_on:
       - llm-katan-backend
 ```
-
-### Testing
-
-```bash
-# Health check
-curl http://localhost:8000/health
-
-# List models
-curl http://localhost:8000/v1/models
-
-# Chat completion (uses real LLM)
-curl -X POST http://localhost:8000/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "Qwen/Qwen2-0.5B-Instruct",
-    "messages": [{"role": "user", "content": "Hello!"}],
-    "max_tokens": 50
-  }'
-```
-
-Intended for local testing with Docker Compose profile `testing`.
diff --git a/tools/llm-katan-server/app.py b/tools/llm-katan-server/app.py
index 05e8f5b2..c3bf5aee 100644
--- a/tools/llm-katan-server/app.py
+++ b/tools/llm-katan-server/app.py
@@ -1,9 +1,9 @@
 import math
-import time
 import os
-import requests
+import time
 from typing import List, Optional
 
+import requests
 import uvicorn
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
@@ -18,7 +18,9 @@
 # Check if HuggingFace token is set
 hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
 if not hf_token:
-    print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.")
+    print(
+        "Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication."
+    )
 
 
 class ChatMessage(BaseModel):
@@ -49,37 +51,37 @@ async def chat_completions(req: ChatRequest):
         # Forward request to llm-katan backend
         llm_katan_request = {
             "model": MODEL,
-            "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages],
+            "messages": [
+                {"role": msg.role, "content": msg.content} for msg in req.messages
+            ],
             "temperature": req.temperature,
         }
-        
+
         if req.max_tokens:
             llm_katan_request["max_tokens"] = req.max_tokens
-        
+
         # Make request to llm-katan
         response = requests.post(
-            f"{LLM_KATAN_URL}/v1/chat/completions",
-            json=llm_katan_request,
-            timeout=30
+            f"{LLM_KATAN_URL}/v1/chat/completions", json=llm_katan_request, timeout=30
         )
-        
+
         if response.status_code != 200:
             raise HTTPException(
                 status_code=response.status_code,
-                detail=f"LLM Katan error: {response.text}"
+                detail=f"LLM Katan error: {response.text}",
             )
-        
+
         result = response.json()
-        
+
         # Update the model name in response to match our served model name
         result["model"] = req.model
-        
+
         return result
-        
+
     except requests.exceptions.RequestException as e:
         # Fallback to simple echo behavior if llm-katan is not available
         print(f"Warning: LLM Katan not available ({e}), using fallback response")
-        
+
         # Simple echo-like behavior as fallback
         last_user = next(
             (m.content for m in reversed(req.messages) if m.role == "user"), ""
diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md
index 7dd685e0..b90ad64c 100644
--- a/website/docs/installation/installation.md
+++ b/website/docs/installation/installation.md
@@ -126,12 +126,14 @@ model_config:
 If you don't have a vLLM backend set up, you can use the provided mock services for testing:
 
 **Option 1: Mock vLLM (Simple Echo Service)**
+
 ```bash
 # Start a simple mock service that echoes back responses
 python tools/mock-vllm/app.py
 ```
 
 **Option 2: LLM Katan Server (Real LLM with Lightweight Backend)**
+
 ```bash
 # First, start llm-katan backend (requires pip install llm-katan)
 llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
@@ -141,6 +143,7 @@ python tools/llm-katan-server/app.py
 ```
 
 For the mock services, update your `config/config.yaml`:
+
 ```yaml
 vllm_endpoints:
   - name: "mock-endpoint"
@@ -152,6 +155,7 @@ vllm_endpoints:
       - "Qwen/Qwen2-0.5B-Instruct"  # For llm-katan-server
     weight: 1
 ```
+
 :::
 
 :::note[**Important: Address Format Requirements**]

From 25290b928fc68e15eec16987e21c86be60ad2f7f Mon Sep 17 00:00:00 2001
From: FeiDaLI <1607741781@qq.com>
Date: Sun, 5 Oct 2025 20:19:24 +0800
Subject: [PATCH 3/3] feat(mock): llm-katan support

Signed-off-by: FeiDaLI <1607741781@qq.com>
---
 candle-binding/Cargo.toml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml
index 4159a9aa..9b9364f4 100644
--- a/candle-binding/Cargo.toml
+++ b/candle-binding/Cargo.toml
@@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = { version = "0.8.4", features = ["cuda"] }
-candle-nn = { version= "0.8.4", features = ["cuda"] }    
-candle-transformers = { version= "0.8.4", features = ["cuda"] }    
+candle-core = "0.8.4"
+candle-nn = "0.8.4"
+candle-transformers = "0.8.4"
 tokenizers = { version = "0.21.0", features = ["http"] }
 hf-hub = "0.4.1"
 safetensors = "0.4.1"