import math
import os
import time
from typing import List, Optional

import requests
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Configuration
MODEL = os.getenv("MODEL", "Qwen/Qwen2-0.5B-Instruct")
SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", MODEL)
LLM_KATAN_URL = os.getenv("LLM_KATAN_URL", "http://localhost:8001")

# Check if HuggingFace token is set
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if not hf_token:
    print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.")


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = None


@app.get("/health")
async def health():
    return {"status": "ok"}


@app.get("/v1/models")
async def models():
    return {"data": [{"id": SERVED_MODEL_NAME, "object": "model"}]}

@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest):
    # A plain `def` on purpose: requests.post below is blocking, and FastAPI
    # runs sync endpoints in a worker thread instead of stalling the event loop.
    try:
        # Forward request to llm-katan backend
        llm_katan_request = {
            "model": MODEL,
            "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages],
            "temperature": req.temperature,
        }

        # `is not None` so an explicit max_tokens=0 is still forwarded
        if req.max_tokens is not None:
            llm_katan_request["max_tokens"] = req.max_tokens

        # Make request to llm-katan
        response = requests.post(
            f"{LLM_KATAN_URL}/v1/chat/completions",
            json=llm_katan_request,
            timeout=30,
        )

        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"LLM Katan error: {response.text}",
            )

        result = response.json()

        # Rewrite the model name in the response to the one the client requested
        result["model"] = req.model

        return result

    except requests.exceptions.RequestException as e:
        # Fall back to simple echo behavior if llm-katan is not available
        print(f"Warning: LLM Katan not available ({e}), using fallback response")

        # Simple echo-like behavior as fallback
        last_user = next(
            (m.content for m in reversed(req.messages) if m.role == "user"), ""
        )
        content = f"[katan-{req.model}] You said: {last_user}"

        # Rough token estimation: ~1 token per 4 characters (ceil)
        def estimate_tokens(text: str) -> int:
            if not text:
                return 0
            return max(1, math.ceil(len(text) / 4))
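        # Worked example: estimate_tokens("hello") == 2, i.e. ceil(5 / 4);
        # estimate_tokens("") == 0 via the early return.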

        prompt_text = "\n".join(
            m.content for m in req.messages if isinstance(m.content, str)
        )
        prompt_tokens = estimate_tokens(prompt_text)
        completion_tokens = estimate_tokens(content)
        total_tokens = prompt_tokens + completion_tokens

        created_ts = int(time.time())

        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
            "prompt_tokens_details": {"cached_tokens": 0},
            "completion_tokens_details": {"reasoning_tokens": 0},
        }

        return {
            "id": "cmpl-katan-123",
            "object": "chat.completion",
            "created": created_ts,
            "model": req.model,
            "system_fingerprint": "llm-katan-server",
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                    "logprobs": None,
                }
            ],
            "usage": usage,
            "token_usage": usage,
        }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
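
A quick smoke test (a sketch: it assumes the shim is running on localhost:8000, and it works whether llm-katan answers or the echo fallback kicks in):

    import requests

    resp = requests.post(
        "http://localhost:8000/v1/chat/completions",
        json={
            "model": "Qwen/Qwen2-0.5B-Instruct",
            "messages": [{"role": "user", "content": "ping"}],
            "max_tokens": 32,
        },
        timeout=35,  # a little above the shim's own 30 s upstream timeout
    )
    resp.raise_for_status()
    body = resp.json()
    print(body["model"], body["choices"][0]["message"]["content"])
    print(body["usage"]["total_tokens"], "total tokens (estimated in fallback mode)")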