From 211b547bad4ced9ee78c3496427877291220f0b3 Mon Sep 17 00:00:00 2001 From: FeiDaLI <1607741781@qq.com> Date: Sun, 5 Oct 2025 19:47:03 +0800 Subject: [PATCH 1/3] feat(mock_and_cuda): add cuda support and llm-katan support Signed-off-by: FeiDaLI <1607741781@qq.com> --- candle-binding/Cargo.toml | 6 +- candle-binding/src/lib.rs | 4 +- .../dual_classifier/dual_classifier.py | 2 +- src/training/dual_classifier/trainer.py | 2 +- .../jailbreak_bert_finetuning.py | 2 +- tools/llm-katan-server/Dockerfile | 21 +++ tools/llm-katan-server/README.md | 101 ++++++++++++++ tools/llm-katan-server/app.py | 132 ++++++++++++++++++ tools/llm-katan-server/requirements.txt | 4 + website/docs/installation/installation.md | 32 +++++ 10 files changed, 298 insertions(+), 8 deletions(-) create mode 100644 tools/llm-katan-server/Dockerfile create mode 100644 tools/llm-katan-server/README.md create mode 100644 tools/llm-katan-server/app.py create mode 100644 tools/llm-katan-server/requirements.txt diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml index 9b9364f4..4159a9aa 100644 --- a/candle-binding/Cargo.toml +++ b/candle-binding/Cargo.toml @@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"] [dependencies] anyhow = { version = "1", features = ["backtrace"] } -candle-core = "0.8.4" -candle-nn = "0.8.4" -candle-transformers = "0.8.4" +candle-core = { version = "0.8.4", features = ["cuda"] } +candle-nn = { version= "0.8.4", features = ["cuda"] } +candle-transformers = { version= "0.8.4", features = ["cuda"] } tokenizers = { version = "0.21.0", features = ["http"] } hf-hub = "0.4.1" safetensors = "0.4.1" diff --git a/candle-binding/src/lib.rs b/candle-binding/src/lib.rs index d778c3fb..87380f96 100644 --- a/candle-binding/src/lib.rs +++ b/candle-binding/src/lib.rs @@ -495,7 +495,7 @@ impl BertSimilarity { let mut tokenizer = self.tokenizer.clone(); tokenizer .with_truncation(Some(TruncationParams { - max_length: max_length.unwrap_or(512), + max_length: max_length.unwrap_or(100000), strategy: TruncationStrategy::LongestFirst, stride: 0, direction: TruncationDirection::Right, @@ -517,7 +517,7 @@ impl BertSimilarity { let mut tokenizer = self.tokenizer.clone(); tokenizer .with_truncation(Some(TruncationParams { - max_length: max_length.unwrap_or(512), + max_length: max_length.unwrap_or(100000), strategy: TruncationStrategy::LongestFirst, stride: 0, direction: TruncationDirection::Right, diff --git a/src/training/dual_classifier/dual_classifier.py b/src/training/dual_classifier/dual_classifier.py index 6cdb30ef..e2088060 100644 --- a/src/training/dual_classifier/dual_classifier.py +++ b/src/training/dual_classifier/dual_classifier.py @@ -19,7 +19,7 @@ def __init__( self, num_categories: int, model_name: str = "distilbert-base-uncased", - max_length: int = 512, + max_length: int = 100000, ): super().__init__() diff --git a/src/training/dual_classifier/trainer.py b/src/training/dual_classifier/trainer.py index 773a6bab..04b5b876 100644 --- a/src/training/dual_classifier/trainer.py +++ b/src/training/dual_classifier/trainer.py @@ -22,7 +22,7 @@ def __init__( category_labels: List[int], pii_labels: List[List[int]], # Token-level PII labels tokenizer, - max_length: int = 512, + max_length: int = 100000, ): self.texts = texts self.category_labels = category_labels diff --git a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py index a795f12a..ee66d405 100644 --- a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py +++ b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py @@ -641,7 +641,7 @@ def _analyze_sequence_lengths(self, texts: List[str], tokenizer) -> Dict[str, in } def optimize_sequence_length( - self, texts: List[str], tokenizer, default_max_length: int = 512 + self, texts: List[str], tokenizer, default_max_length: int = 100000 ) -> int: """Find optimal sequence length based on dataset characteristics.""" logger.info("Analyzing sequence length distribution...") diff --git a/tools/llm-katan-server/Dockerfile b/tools/llm-katan-server/Dockerfile new file mode 100644 index 00000000..88b76c74 --- /dev/null +++ b/tools/llm-katan-server/Dockerfile @@ -0,0 +1,21 @@ +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY app.py ./ + +EXPOSE 8000 + +# Environment variables for configuration +ENV MODEL=Qwen/Qwen2-0.5B-Instruct +ENV SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct +ENV LLM_KATAN_URL=http://localhost:8001 + +CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/tools/llm-katan-server/README.md b/tools/llm-katan-server/README.md new file mode 100644 index 00000000..4475767b --- /dev/null +++ b/tools/llm-katan-server/README.md @@ -0,0 +1,101 @@ +# LLM Katan Server + +A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that provides the same API design as mock-vllm but uses real LLM functionality. + +## Architecture + +This server acts as a proxy that: +1. Receives OpenAI-compatible API requests +2. Forwards them to a running `llm-katan` instance +3. Returns the responses with proper model name mapping +4. Falls back to echo behavior if `llm-katan` is unavailable + +## Features + +- Same API design as mock-vllm (FastAPI-based) +- Proxies requests to real `llm-katan` backend +- OpenAI-compatible API endpoints: + - GET /health + - GET /v1/models + - POST /v1/chat/completions +- Fallback behavior when backend is unavailable +- Configurable via environment variables + +## Environment Variables + +- `MODEL`: HuggingFace model name for llm-katan (default: `Qwen/Qwen2-0.5B-Instruct`) +- `SERVED_MODEL_NAME`: Model name to expose in API (default: same as MODEL) +- `LLM_KATAN_URL`: URL of the llm-katan backend (default: `http://localhost:8001`) +- `HUGGINGFACE_HUB_TOKEN`: HuggingFace authentication token + +## Setup + +### 1. Start llm-katan backend + +```bash +# Install llm-katan +pip install llm-katan + +# Start llm-katan server on port 8001 +llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 +``` + +### 2. Start this FastAPI server + +```bash +# Using Docker +docker run -p 8000:8000 llm-katan-server + +# Or directly with Python +pip install -r requirements.txt +python app.py +``` + +## Usage + +### Docker Compose (Recommended) + +```yaml +services: + llm-katan-backend: + image: python:3.11-slim + command: > + sh -c "pip install llm-katan && + llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 --host 0.0.0.0" + ports: + - "8001:8001" + environment: + - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN} + + llm-katan-server: + build: . + ports: + - "8000:8000" + environment: + - MODEL=Qwen/Qwen2-0.5B-Instruct + - SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct + - LLM_KATAN_URL=http://llm-katan-backend:8001 + depends_on: + - llm-katan-backend +``` + +### Testing + +```bash +# Health check +curl http://localhost:8000/health + +# List models +curl http://localhost:8000/v1/models + +# Chat completion (uses real LLM) +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2-0.5B-Instruct", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' +``` + +Intended for local testing with Docker Compose profile `testing`. diff --git a/tools/llm-katan-server/app.py b/tools/llm-katan-server/app.py new file mode 100644 index 00000000..05e8f5b2 --- /dev/null +++ b/tools/llm-katan-server/app.py @@ -0,0 +1,132 @@ +import math +import time +import os +import requests +from typing import List, Optional + +import uvicorn +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel + +app = FastAPI() + +# Configuration +MODEL = os.getenv("MODEL", "Qwen/Qwen2-0.5B-Instruct") +SERVED_MODEL_NAME = os.getenv("SERVED_MODEL_NAME", MODEL) +LLM_KATAN_URL = os.getenv("LLM_KATAN_URL", "http://localhost:8001") + +# Check if HuggingFace token is set +hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN") +if not hf_token: + print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.") + + +class ChatMessage(BaseModel): + role: str + content: str + + +class ChatRequest(BaseModel): + model: str + messages: List[ChatMessage] + temperature: Optional[float] = 0.2 + max_tokens: Optional[int] = None + + +@app.get("/health") +async def health(): + return {"status": "ok"} + + +@app.get("/v1/models") +async def models(): + return {"data": [{"id": SERVED_MODEL_NAME, "object": "model"}]} + + +@app.post("/v1/chat/completions") +async def chat_completions(req: ChatRequest): + try: + # Forward request to llm-katan backend + llm_katan_request = { + "model": MODEL, + "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages], + "temperature": req.temperature, + } + + if req.max_tokens: + llm_katan_request["max_tokens"] = req.max_tokens + + # Make request to llm-katan + response = requests.post( + f"{LLM_KATAN_URL}/v1/chat/completions", + json=llm_katan_request, + timeout=30 + ) + + if response.status_code != 200: + raise HTTPException( + status_code=response.status_code, + detail=f"LLM Katan error: {response.text}" + ) + + result = response.json() + + # Update the model name in response to match our served model name + result["model"] = req.model + + return result + + except requests.exceptions.RequestException as e: + # Fallback to simple echo behavior if llm-katan is not available + print(f"Warning: LLM Katan not available ({e}), using fallback response") + + # Simple echo-like behavior as fallback + last_user = next( + (m.content for m in reversed(req.messages) if m.role == "user"), "" + ) + content = f"[katan-{req.model}] You said: {last_user}" + + # Rough token estimation: ~1 token per 4 characters (ceil) + def estimate_tokens(text: str) -> int: + if not text: + return 0 + return max(1, math.ceil(len(text) / 4)) + + prompt_text = "\n".join( + m.content for m in req.messages if isinstance(m.content, str) + ) + prompt_tokens = estimate_tokens(prompt_text) + completion_tokens = estimate_tokens(content) + total_tokens = prompt_tokens + completion_tokens + + created_ts = int(time.time()) + + usage = { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "prompt_tokens_details": {"cached_tokens": 0}, + "completion_tokens_details": {"reasoning_tokens": 0}, + } + + return { + "id": "cmpl-katan-123", + "object": "chat.completion", + "created": created_ts, + "model": req.model, + "system_fingerprint": "llm-katan-server", + "choices": [ + { + "index": 0, + "message": {"role": "assistant", "content": content}, + "finish_reason": "stop", + "logprobs": None, + } + ], + "usage": usage, + "token_usage": usage, + } + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/tools/llm-katan-server/requirements.txt b/tools/llm-katan-server/requirements.txt new file mode 100644 index 00000000..6c9f1068 --- /dev/null +++ b/tools/llm-katan-server/requirements.txt @@ -0,0 +1,4 @@ +fastapi==0.115.0 +uvicorn==0.30.6 +pydantic==2.9.2 +requests==2.31.0 diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md index a96c683b..7dd685e0 100644 --- a/website/docs/installation/installation.md +++ b/website/docs/installation/installation.md @@ -122,6 +122,38 @@ model_config: preferred_endpoints: ["your-endpoint"] ``` +:::tip[**No vLLM Backend? Use Mock Services for Testing**] +If you don't have a vLLM backend set up, you can use the provided mock services for testing: + +**Option 1: Mock vLLM (Simple Echo Service)** +```bash +# Start a simple mock service that echoes back responses +python tools/mock-vllm/app.py +``` + +**Option 2: LLM Katan Server (Real LLM with Lightweight Backend)** +```bash +# First, start llm-katan backend (requires pip install llm-katan) +llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 + +# Then start the FastAPI wrapper +python tools/llm-katan-server/app.py +``` + +For the mock services, update your `config/config.yaml`: +```yaml +vllm_endpoints: + - name: "mock-endpoint" + address: "127.0.0.1" + port: 8000 # Mock service port + models: + - "openai/gpt-oss-20b" # For mock-vllm + # OR + - "Qwen/Qwen2-0.5B-Instruct" # For llm-katan-server + weight: 1 +``` +::: + :::note[**Important: Address Format Requirements**] The `address` field **must** contain a valid IP address (IPv4 or IPv6). Domain names are not supported. From 6cb1b5506886267dc902babb7cf4dcf396a7d46e Mon Sep 17 00:00:00 2001 From: FeiDaLI <1607741781@qq.com> Date: Sun, 5 Oct 2025 20:13:39 +0800 Subject: [PATCH 2/3] feat(mock): llm-katan support Signed-off-by: FeiDaLI <1607741781@qq.com> --- .pre-commit-config.yaml | 6 ++-- e2e-tests/06-pii-detection-test.py | 8 ++--- .../classifier_model_fine_tuning/ft_linear.py | 22 +++++++----- .../jailbreak_bert_finetuning.py | 28 +++++++++------ .../ft_linear_lora.py | 8 +---- .../pii_bert_finetuning_lora.py | 8 +---- .../jailbreak_bert_finetuning_lora.py | 8 +---- tools/llm-katan-server/README.md | 22 +----------- tools/llm-katan-server/app.py | 34 ++++++++++--------- website/docs/installation/installation.md | 4 +++ 10 files changed, 64 insertions(+), 84 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9438abb6..8b1ca784 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -3,7 +3,7 @@ repos: # Basic hooks for Go, Rust, Python And JavaScript files only - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v6.0.0 + rev: v4.4.0 hooks: - id: trailing-whitespace files: \.(go|rs|py|js)$ @@ -71,7 +71,7 @@ repos: # Python specific hooks - repo: https://github.com/psf/black - rev: 25.1.0 + rev: 22.12.0 hooks: - id: black language_version: python3 @@ -79,7 +79,7 @@ repos: exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/) - repo: https://github.com/PyCQA/isort - rev: 6.0.1 + rev: 5.12.0 hooks: - id: isort args: ["--profile", "black"] diff --git a/e2e-tests/06-pii-detection-test.py b/e2e-tests/06-pii-detection-test.py index 3a7a6167..5cef1302 100644 --- a/e2e-tests/06-pii-detection-test.py +++ b/e2e-tests/06-pii-detection-test.py @@ -652,7 +652,9 @@ def test_extproc_pii_detection_comprehensive(self): status = ( "🔒" if result["is_blocked"] - else "✅" if result["request_allowed"] else "❌" + else "✅" + if result["request_allowed"] + else "❌" ) print(f" {status} {result['test_case']}") print(f" Content: {result['content']}") @@ -671,9 +673,7 @@ def test_extproc_pii_detection_comprehensive(self): passed=False, message="⚠️ No clear evidence of ExtProc PII detection in production pipeline", ) - print( - "📝 NOTE: This may indicate PII detection is not active in ExtProc or" - ) + print("📝 NOTE: This may indicate PII detection is not active in ExtProc or") print(" PII policies are configured to allow all content through") def test_multiple_pii_types_analysis(self): diff --git a/src/training/classifier_model_fine_tuning/ft_linear.py b/src/training/classifier_model_fine_tuning/ft_linear.py index 3c736202..740aed8c 100644 --- a/src/training/classifier_model_fine_tuning/ft_linear.py +++ b/src/training/classifier_model_fine_tuning/ft_linear.py @@ -508,19 +508,25 @@ def tokenize_function(examples): # Evaluate on validation set logger.info("Evaluating on validation set...") - val_accuracy, val_report, val_conf_matrix, val_predictions = ( - evaluate_category_classifier( - model, tokenizer, val_texts, val_categories, idx_to_category, device - ) + ( + val_accuracy, + val_report, + val_conf_matrix, + val_predictions, + ) = evaluate_category_classifier( + model, tokenizer, val_texts, val_categories, idx_to_category, device ) logger.info(f"Validation accuracy: {val_accuracy:.4f}") # Evaluate on test set logger.info("Evaluating on test set...") - test_accuracy, test_report, test_conf_matrix, test_predictions = ( - evaluate_category_classifier( - model, tokenizer, test_texts, test_categories, idx_to_category, device - ) + ( + test_accuracy, + test_report, + test_conf_matrix, + test_predictions, + ) = evaluate_category_classifier( + model, tokenizer, test_texts, test_categories, idx_to_category, device ) logger.info(f"Test accuracy: {test_accuracy:.4f}") diff --git a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py index ee66d405..deacec2f 100644 --- a/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py +++ b/src/training/prompt_guard_fine_tuning/jailbreak_bert_finetuning.py @@ -100,9 +100,9 @@ import torch # Suppress common non-critical warnings -os.environ["TOKENIZERS_PARALLELISM"] = ( - "false" # Suppress tokenizer parallelism warnings -) +os.environ[ + "TOKENIZERS_PARALLELISM" +] = "false" # Suppress tokenizer parallelism warnings warnings.filterwarnings( "ignore", message=".*TensorFloat32.*" ) # Suppress TF32 performance hints @@ -2343,19 +2343,25 @@ def tokenize_function(examples): # Evaluate on validation set logger.info("Evaluating on validation set...") - val_accuracy, val_report, val_conf_matrix, val_predictions = ( - evaluate_jailbreak_classifier( - model, tokenizer, val_texts, val_categories, idx_to_category, device - ) + ( + val_accuracy, + val_report, + val_conf_matrix, + val_predictions, + ) = evaluate_jailbreak_classifier( + model, tokenizer, val_texts, val_categories, idx_to_category, device ) logger.info(f"Validation accuracy: {val_accuracy:.4f}") # Evaluate on test set logger.info("Evaluating on test set...") - test_accuracy, test_report, test_conf_matrix, test_predictions = ( - evaluate_jailbreak_classifier( - model, tokenizer, test_texts, test_categories, idx_to_category, device - ) + ( + test_accuracy, + test_report, + test_conf_matrix, + test_predictions, + ) = evaluate_jailbreak_classifier( + model, tokenizer, test_texts, test_categories, idx_to_category, device ) logger.info(f"Test accuracy: {test_accuracy:.4f}") diff --git a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py index 3a955a46..0d855c61 100644 --- a/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py +++ b/src/training/training_lora/classifier_model_fine_tuning_lora/ft_linear_lora.py @@ -69,13 +69,7 @@ import torch import torch.nn as nn from datasets import Dataset, load_dataset -from peft import ( - LoraConfig, - PeftConfig, - PeftModel, - TaskType, - get_peft_model, -) +from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support from sklearn.model_selection import train_test_split from transformers import ( diff --git a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py index f182499b..a730eef6 100644 --- a/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py +++ b/src/training/training_lora/pii_model_fine_tuning_lora/pii_bert_finetuning_lora.py @@ -70,13 +70,7 @@ import torch import torch.nn as nn from datasets import Dataset, load_dataset -from peft import ( - LoraConfig, - PeftConfig, - PeftModel, - TaskType, - get_peft_model, -) +from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support from sklearn.model_selection import train_test_split from transformers import ( diff --git a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py index 76b6df02..bdce39c6 100644 --- a/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py +++ b/src/training/training_lora/prompt_guard_fine_tuning_lora/jailbreak_bert_finetuning_lora.py @@ -77,13 +77,7 @@ import torch import torch.nn as nn from datasets import Dataset, load_dataset -from peft import ( - LoraConfig, - PeftConfig, - PeftModel, - TaskType, - get_peft_model, -) +from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support from sklearn.model_selection import train_test_split from transformers import ( diff --git a/tools/llm-katan-server/README.md b/tools/llm-katan-server/README.md index 4475767b..e2b04d60 100644 --- a/tools/llm-katan-server/README.md +++ b/tools/llm-katan-server/README.md @@ -5,6 +5,7 @@ A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that p ## Architecture This server acts as a proxy that: + 1. Receives OpenAI-compatible API requests 2. Forwards them to a running `llm-katan` instance 3. Returns the responses with proper model name mapping @@ -78,24 +79,3 @@ services: depends_on: - llm-katan-backend ``` - -### Testing - -```bash -# Health check -curl http://localhost:8000/health - -# List models -curl http://localhost:8000/v1/models - -# Chat completion (uses real LLM) -curl -X POST http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "Qwen/Qwen2-0.5B-Instruct", - "messages": [{"role": "user", "content": "Hello!"}], - "max_tokens": 50 - }' -``` - -Intended for local testing with Docker Compose profile `testing`. diff --git a/tools/llm-katan-server/app.py b/tools/llm-katan-server/app.py index 05e8f5b2..c3bf5aee 100644 --- a/tools/llm-katan-server/app.py +++ b/tools/llm-katan-server/app.py @@ -1,9 +1,9 @@ import math -import time import os -import requests +import time from typing import List, Optional +import requests import uvicorn from fastapi import FastAPI, HTTPException from pydantic import BaseModel @@ -18,7 +18,9 @@ # Check if HuggingFace token is set hf_token = os.getenv("HUGGINGFACE_HUB_TOKEN") if not hf_token: - print("Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication.") + print( + "Warning: HUGGINGFACE_HUB_TOKEN not set. Some models may require authentication." + ) class ChatMessage(BaseModel): @@ -49,37 +51,37 @@ async def chat_completions(req: ChatRequest): # Forward request to llm-katan backend llm_katan_request = { "model": MODEL, - "messages": [{"role": msg.role, "content": msg.content} for msg in req.messages], + "messages": [ + {"role": msg.role, "content": msg.content} for msg in req.messages + ], "temperature": req.temperature, } - + if req.max_tokens: llm_katan_request["max_tokens"] = req.max_tokens - + # Make request to llm-katan response = requests.post( - f"{LLM_KATAN_URL}/v1/chat/completions", - json=llm_katan_request, - timeout=30 + f"{LLM_KATAN_URL}/v1/chat/completions", json=llm_katan_request, timeout=30 ) - + if response.status_code != 200: raise HTTPException( status_code=response.status_code, - detail=f"LLM Katan error: {response.text}" + detail=f"LLM Katan error: {response.text}", ) - + result = response.json() - + # Update the model name in response to match our served model name result["model"] = req.model - + return result - + except requests.exceptions.RequestException as e: # Fallback to simple echo behavior if llm-katan is not available print(f"Warning: LLM Katan not available ({e}), using fallback response") - + # Simple echo-like behavior as fallback last_user = next( (m.content for m in reversed(req.messages) if m.role == "user"), "" diff --git a/website/docs/installation/installation.md b/website/docs/installation/installation.md index 7dd685e0..b90ad64c 100644 --- a/website/docs/installation/installation.md +++ b/website/docs/installation/installation.md @@ -126,12 +126,14 @@ model_config: If you don't have a vLLM backend set up, you can use the provided mock services for testing: **Option 1: Mock vLLM (Simple Echo Service)** + ```bash # Start a simple mock service that echoes back responses python tools/mock-vllm/app.py ``` **Option 2: LLM Katan Server (Real LLM with Lightweight Backend)** + ```bash # First, start llm-katan backend (requires pip install llm-katan) llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 @@ -141,6 +143,7 @@ python tools/llm-katan-server/app.py ``` For the mock services, update your `config/config.yaml`: + ```yaml vllm_endpoints: - name: "mock-endpoint" @@ -152,6 +155,7 @@ vllm_endpoints: - "Qwen/Qwen2-0.5B-Instruct" # For llm-katan-server weight: 1 ``` + ::: :::note[**Important: Address Format Requirements**] From 25290b928fc68e15eec16987e21c86be60ad2f7f Mon Sep 17 00:00:00 2001 From: FeiDaLI <1607741781@qq.com> Date: Sun, 5 Oct 2025 20:19:24 +0800 Subject: [PATCH 3/3] feat(mock): llm-katan support Signed-off-by: FeiDaLI <1607741781@qq.com> --- candle-binding/Cargo.toml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/candle-binding/Cargo.toml b/candle-binding/Cargo.toml index 4159a9aa..9b9364f4 100644 --- a/candle-binding/Cargo.toml +++ b/candle-binding/Cargo.toml @@ -11,9 +11,9 @@ crate-type = ["staticlib", "cdylib"] [dependencies] anyhow = { version = "1", features = ["backtrace"] } -candle-core = { version = "0.8.4", features = ["cuda"] } -candle-nn = { version= "0.8.4", features = ["cuda"] } -candle-transformers = { version= "0.8.4", features = ["cuda"] } +candle-core = "0.8.4" +candle-nn = "0.8.4" +candle-transformers = "0.8.4" tokenizers = { version = "0.21.0", features = ["http"] } hf-hub = "0.4.1" safetensors = "0.4.1"