Commit 211b547

feat(mock_and_cuda): add cuda support and llm-katan support

Signed-off-by: FeiDaLI <[email protected]>
1 parent e54d751 · commit 211b547

File tree: 10 files changed (+298, -8 lines)


candle-binding/Cargo.toml (3 additions, 3 deletions)

@@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"]
 
 [dependencies]
 anyhow = { version = "1", features = ["backtrace"] }
-candle-core = "0.8.4"
-candle-nn = "0.8.4"
-candle-transformers = "0.8.4"
+candle-core = { version = "0.8.4", features = ["cuda"] }
+candle-nn = { version = "0.8.4", features = ["cuda"] }
+candle-transformers = { version = "0.8.4", features = ["cuda"] }
 tokenizers = { version = "0.21.0", features = ["http"] }
 hf-hub = "0.4.1"
 safetensors = "0.4.1"

candle-binding/src/lib.rs (2 additions, 2 deletions)

@@ -495,7 +495,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,
@@ -517,7 +517,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,

src/training/dual_classifier/dual_classifier.py (1 addition, 1 deletion)

@@ -19,7 +19,7 @@ def __init__(
         self,
         num_categories: int,
         model_name: str = "distilbert-base-uncased",
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         super().__init__()

src/training/dual_classifier/trainer.py (1 addition, 1 deletion)

@@ -22,7 +22,7 @@ def __init__(
         category_labels: List[int],
         pii_labels: List[List[int]],  # Token-level PII labels
         tokenizer,
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         self.texts = texts
         self.category_labels = category_labels

src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py (1 addition, 1 deletion)

@@ -641,7 +641,7 @@ def _analyze_sequence_lengths(self, texts: List[str], tokenizer) -> Dict[str, int
         }
 
     def optimize_sequence_length(
-        self, texts: List[str], tokenizer, default_max_length: int = 512
+        self, texts: List[str], tokenizer, default_max_length: int = 100000
     ) -> int:
         """Find optimal sequence length based on dataset characteristics."""
        logger.info("Analyzing sequence length distribution...")

tools/llm-katan-server/Dockerfile (new file, 21 additions)

FROM python:3.11-slim

WORKDIR /app

RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

COPY requirements.txt ./
RUN pip install --no-cache-dir -r requirements.txt

COPY app.py ./

EXPOSE 8000

# Environment variables for configuration
ENV MODEL=Qwen/Qwen2-0.5B-Instruct
ENV SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
ENV LLM_KATAN_URL=http://localhost:8001

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

tools/llm-katan-server/README.md (new file, 101 additions)

# LLM Katan Server

A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that provides the same API design as mock-vllm but uses real LLM functionality.

## Architecture

This server acts as a proxy that:

1. Receives OpenAI-compatible API requests
2. Forwards them to a running `llm-katan` instance
3. Returns the responses with proper model name mapping
4. Falls back to echo behavior if `llm-katan` is unavailable

## Features

- Same API design as mock-vllm (FastAPI-based)
- Proxies requests to a real `llm-katan` backend
- OpenAI-compatible API endpoints:
  - GET /health
  - GET /v1/models
  - POST /v1/chat/completions
- Fallback behavior when the backend is unavailable
- Configurable via environment variables

## Environment Variables

- `MODEL`: HuggingFace model name for llm-katan (default: `Qwen/Qwen2-0.5B-Instruct`)
- `SERVED_MODEL_NAME`: Model name to expose in the API (default: same as `MODEL`)
- `LLM_KATAN_URL`: URL of the llm-katan backend (default: `http://localhost:8001`)
- `HUGGINGFACE_HUB_TOKEN`: HuggingFace authentication token

## Setup

### 1. Start the llm-katan backend

```bash
# Install llm-katan
pip install llm-katan

# Start the llm-katan server on port 8001
llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
```

### 2. Start this FastAPI server

```bash
# Using Docker
docker run -p 8000:8000 llm-katan-server

# Or directly with Python
pip install -r requirements.txt
python app.py
```

## Usage

### Docker Compose (Recommended)

```yaml
services:
  llm-katan-backend:
    image: python:3.11-slim
    command: >
      sh -c "pip install llm-katan &&
             llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 --host 0.0.0.0"
    ports:
      - "8001:8001"
    environment:
      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN}

  llm-katan-server:
    build: .
    ports:
      - "8000:8000"
    environment:
      - MODEL=Qwen/Qwen2-0.5B-Instruct
      - SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
      - LLM_KATAN_URL=http://llm-katan-backend:8001
    depends_on:
      - llm-katan-backend
```

### Testing

```bash
# Health check
curl http://localhost:8000/health

# List models
curl http://localhost:8000/v1/models

# Chat completion (uses the real LLM)
curl -X POST http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Qwen/Qwen2-0.5B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 50
  }'
```

Intended for local testing with the Docker Compose profile `testing`.
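Because the wrapper exposes OpenAI-compatible routes, any OpenAI-style client can talk to it as well. A minimal sketch using the `openai` Python package (an assumption of this example, not a dependency of this commit), pointed at the wrapper on port 8000:

```python
# Hypothetical client example; assumes `pip install openai` (not in requirements.txt).
from openai import OpenAI

# The wrapper does not check API keys, so any placeholder value works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="Qwen/Qwen2-0.5B-Instruct",
    messages=[{"role": "user", "content": "Hello!"}],
    max_tokens=50,
)
print(resp.choices[0].message.content)
```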

tools/llm-katan-server/app.py (new file, 132 additions)

import math
import time
import os
import requests
from typing import List, Optional

import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

app = FastAPI()

# Configuration
MODEL = os.getenv("MODEL", "Qwen/Qwen2-0.5B-Instruct")
SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", MODEL)
LLM_KATAN_URL = os.getenv("LLM_KATAN_URL", "http://localhost:8001")

# Check if HuggingFace token is set
hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
if not hf_token:
    print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.")


class ChatMessage(BaseModel):
    role: str
    content: str


class ChatRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = 0.2
    max_tokens: Optional[int] = None


@app.get("/health")
async def health():
    return {"status": "ok"}


@app.get("/v1/models")
async def models():
    return {"data": [{"id": SERVED_MODEL_NAME, "object": "model"}]}


@app.post("/v1/chat/completions")
async def chat_completions(req: ChatRequest):
    try:
        # Forward request to llm-katan backend
        llm_katan_request = {
            "model": MODEL,
            "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages],
            "temperature": req.temperature,
        }

        if req.max_tokens:
            llm_katan_request["max_tokens"] = req.max_tokens

        # Make request to llm-katan
        response = requests.post(
            f"{LLM_KATAN_URL}/v1/chat/completions",
            json=llm_katan_request,
            timeout=30,
        )

        if response.status_code != 200:
            raise HTTPException(
                status_code=response.status_code,
                detail=f"LLM Katan error: {response.text}",
            )

        result = response.json()

        # Update the model name in the response to match our served model name
        result["model"] = req.model

        return result

    except requests.exceptions.RequestException as e:
        # Fallback to simple echo behavior if llm-katan is not available
        print(f"Warning: LLM Katan not available ({e}), using fallback response")

        # Simple echo-like behavior as fallback
        last_user = next(
            (m.content for m in reversed(req.messages) if m.role == "user"), ""
        )
        content = f"[katan-{req.model}] You said: {last_user}"

        # Rough token estimation: ~1 token per 4 characters (ceil)
        def estimate_tokens(text: str) -> int:
            if not text:
                return 0
            return max(1, math.ceil(len(text) / 4))

        prompt_text = "\n".join(
            m.content for m in req.messages if isinstance(m.content, str)
        )
        prompt_tokens = estimate_tokens(prompt_text)
        completion_tokens = estimate_tokens(content)
        total_tokens = prompt_tokens + completion_tokens

        created_ts = int(time.time())

        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens,
            "total_tokens": total_tokens,
            "prompt_tokens_details": {"cached_tokens": 0},
            "completion_tokens_details": {"reasoning_tokens": 0},
        }

        return {
            "id": "cmpl-katan-123",
            "object": "chat.completion",
            "created": created_ts,
            "model": req.model,
            "system_fingerprint": "llm-katan-server",
            "choices": [
                {
                    "index": 0,
                    "message": {"role": "assistant", "content": content},
                    "finish_reason": "stop",
                    "logprobs": None,
                }
            ],
            "usage": usage,
            "token_usage": usage,
        }


if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)
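To see the proxy-with-fallback behavior of app.py from the client side, the sketch below (illustrative, not part of the commit) posts a chat completion and checks the fallback markers set above (the `[katan-...]` content prefix and the `llm-katan-server` system fingerprint) to report whether the reply came from the real llm-katan backend or from the echo fallback:

```python
# Illustrative client for the llm-katan-server wrapper above; the URL and
# prompt are assumptions, and the checks mirror app.py's fallback response.
import requests

BASE_URL = "http://localhost:8000"

resp = requests.post(
    f"{BASE_URL}/v1/chat/completions",
    json={
        "model": "Qwen/Qwen2-0.5B-Instruct",
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 50,
    },
    timeout=60,
)
resp.raise_for_status()
body = resp.json()

content = body["choices"][0]["message"]["content"]
is_fallback = (
    body.get("system_fingerprint") == "llm-katan-server"
    and content.startswith("[katan-")
)

print("fallback echo" if is_fallback else "real llm-katan backend")
print(content)
print(body.get("usage"))
```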
tools/llm-katan-server/requirements.txt (new file, 4 additions)

fastapi==0.115.0
uvicorn==0.30.6
pydantic==2.9.2
requests==2.31.0

website/docs/installation/installation.md (32 additions, 0 deletions)

@@ -122,6 +122,38 @@ model_config:
     preferred_endpoints: ["your-endpoint"]
 ```
 
+:::tip[**No vLLM Backend? Use Mock Services for Testing**]
+If you don't have a vLLM backend set up, you can use the provided mock services for testing:
+
+**Option 1: Mock vLLM (Simple Echo Service)**
+```bash
+# Start a simple mock service that echoes back responses
+python tools/mock-vllm/app.py
+```
+
+**Option 2: LLM Katan Server (Real LLM with Lightweight Backend)**
+```bash
+# First, start the llm-katan backend (requires pip install llm-katan)
+llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
+
+# Then start the FastAPI wrapper
+python tools/llm-katan-server/app.py
+```
+
+For the mock services, update your `config/config.yaml`:
+```yaml
+vllm_endpoints:
+  - name: "mock-endpoint"
+    address: "127.0.0.1"
+    port: 8000  # Mock service port
+    models:
+      - "openai/gpt-oss-20b"  # For mock-vllm
+      # OR
+      - "Qwen/Qwen2-0.5B-Instruct"  # For llm-katan-server
+    weight: 1
+```
+:::
+
 :::note[**Important: Address Format Requirements**]
 The `address` field **must** contain a valid IP address (IPv4 or IPv6). Domain names are not supported.
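Before wiring the router to a mock endpoint, it can help to confirm the service actually answers on the configured address and port. A small, hypothetical smoke test matching the config snippet above (the expected model id would be `openai/gpt-oss-20b` when testing mock-vllm instead):

```python
# Hypothetical smoke test for the mock endpoint configured above; not part
# of the docs change. Address, port, and model id mirror the YAML snippet.
import requests

ADDRESS, PORT = "127.0.0.1", 8000
EXPECTED_MODEL = "Qwen/Qwen2-0.5B-Instruct"

base = f"http://{ADDRESS}:{PORT}"

# Both mock services expose /health and /v1/models (see app.py above).
requests.get(f"{base}/health", timeout=5).raise_for_status()

models = requests.get(f"{base}/v1/models", timeout=5).json()
served = [m["id"] for m in models.get("data", [])]

if EXPECTED_MODEL in served:
    print(f"OK: {EXPECTED_MODEL} is served at {base}")
else:
    print(f"Model mismatch: endpoint serves {served}")
```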
