vllm-project · FeiDaLI · Oct 5, 2025 · Oct 5, 2025 · Oct 5, 2025 · Oct 5, 2025
@@ -3,7 +3,7 @@
 repos:
 # Basic hooks for Go, Rust, Python And JavaScript files only
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v6.0.0
+  rev: v4.4.0
   hooks:
   - id: trailing-whitespace
     files: \.(go|rs|py|js)$
@@ -71,15 +71,15 @@ repos:
 
 # Python specific hooks
 - repo: https://github.com/psf/black
-  rev: 25.1.0
+  rev: 22.12.0
   hooks:
   - id: black
     language_version: python3
     files: \.py$
     exclude: ^(\.venv/|venv/|env/|__pycache__/|\.git/|site/)
 
 - repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
+  rev: 5.12.0
   hooks:
   - id: isort
     args: ["--profile", "black"]

@@ -495,7 +495,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,
@@ -517,7 +517,7 @@ impl BertSimilarity {
         let mut tokenizer = self.tokenizer.clone();
         tokenizer
             .with_truncation(Some(TruncationParams {
-                max_length: max_length.unwrap_or(512),
+                max_length: max_length.unwrap_or(100000),
                 strategy: TruncationStrategy::LongestFirst,
                 stride: 0,
                 direction: TruncationDirection::Right,

@@ -652,7 +652,9 @@ def test_extproc_pii_detection_comprehensive(self):
             status = (
                 "🔒"
                 if result["is_blocked"]
-                else "✅" if result["request_allowed"] else "❌"
+                else "✅"
+                if result["request_allowed"]
+                else "❌"
             )
             print(f"  {status} {result['test_case']}")
             print(f"      Content: {result['content']}")
@@ -671,9 +673,7 @@ def test_extproc_pii_detection_comprehensive(self):
                 passed=False,
                 message="⚠️ No clear evidence of ExtProc PII detection in production pipeline",
             )
-            print(
-                "📝 NOTE: This may indicate PII detection is not active in ExtProc or"
-            )
+            print("📝 NOTE: This may indicate PII detection is not active in ExtProc or")
             print("         PII policies are configured to allow all content through")
 
     def test_multiple_pii_types_analysis(self):

@@ -508,19 +508,25 @@ def tokenize_function(examples):
 
     # Evaluate on validation set
     logger.info("Evaluating on validation set...")
-    val_accuracy, val_report, val_conf_matrix, val_predictions = (
-        evaluate_category_classifier(
-            model, tokenizer, val_texts, val_categories, idx_to_category, device
-        )
+    (
+        val_accuracy,
+        val_report,
+        val_conf_matrix,
+        val_predictions,
+    ) = evaluate_category_classifier(
+        model, tokenizer, val_texts, val_categories, idx_to_category, device
     )
     logger.info(f"Validation accuracy: {val_accuracy:.4f}")
 
     # Evaluate on test set
     logger.info("Evaluating on test set...")
-    test_accuracy, test_report, test_conf_matrix, test_predictions = (
-        evaluate_category_classifier(
-            model, tokenizer, test_texts, test_categories, idx_to_category, device
-        )
+    (
+        test_accuracy,
+        test_report,
+        test_conf_matrix,
+        test_predictions,
+    ) = evaluate_category_classifier(
+        model, tokenizer, test_texts, test_categories, idx_to_category, device
     )
     logger.info(f"Test accuracy: {test_accuracy:.4f}")
 

@@ -19,7 +19,7 @@ def __init__(
         self,
         num_categories: int,
         model_name: str = "distilbert-base-uncased",
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         super().__init__()
 

@@ -22,7 +22,7 @@ def __init__(
         category_labels: List[int],
         pii_labels: List[List[int]],  # Token-level PII labels
         tokenizer,
-        max_length: int = 512,
+        max_length: int = 100000,
     ):
         self.texts = texts
         self.category_labels = category_labels

@@ -100,9 +100,9 @@
 import torch
 
 # Suppress common non-critical warnings
-os.environ["TOKENIZERS_PARALLELISM"] = (
-    "false"  # Suppress tokenizer parallelism warnings
-)
+os.environ[
+    "TOKENIZERS_PARALLELISM"
+] = "false"  # Suppress tokenizer parallelism warnings
 warnings.filterwarnings(
     "ignore", message=".*TensorFloat32.*"
 )  # Suppress TF32 performance hints
@@ -641,7 +641,7 @@ def _analyze_sequence_lengths(self, texts: List[str], tokenizer) -> Dict[str, in
         }
 
     def optimize_sequence_length(
-        self, texts: List[str], tokenizer, default_max_length: int = 512
+        self, texts: List[str], tokenizer, default_max_length: int = 100000
     ) -> int:
         """Find optimal sequence length based on dataset characteristics."""
         logger.info("Analyzing sequence length distribution...")
@@ -2343,19 +2343,25 @@ def tokenize_function(examples):
 
     # Evaluate on validation set
     logger.info("Evaluating on validation set...")
-    val_accuracy, val_report, val_conf_matrix, val_predictions = (
-        evaluate_jailbreak_classifier(
-            model, tokenizer, val_texts, val_categories, idx_to_category, device
-        )
+    (
+        val_accuracy,
+        val_report,
+        val_conf_matrix,
+        val_predictions,
+    ) = evaluate_jailbreak_classifier(
+        model, tokenizer, val_texts, val_categories, idx_to_category, device
     )
     logger.info(f"Validation accuracy: {val_accuracy:.4f}")
 
     # Evaluate on test set
     logger.info("Evaluating on test set...")
-    test_accuracy, test_report, test_conf_matrix, test_predictions = (
-        evaluate_jailbreak_classifier(
-            model, tokenizer, test_texts, test_categories, idx_to_category, device
-        )
+    (
+        test_accuracy,
+        test_report,
+        test_conf_matrix,
+        test_predictions,
+    ) = evaluate_jailbreak_classifier(
+        model, tokenizer, test_texts, test_categories, idx_to_category, device
     )
     logger.info(f"Test accuracy: {test_accuracy:.4f}")
 

@@ -69,13 +69,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (

@@ -70,13 +70,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (

@@ -77,13 +77,7 @@
 import torch
 import torch.nn as nn
 from datasets import Dataset, load_dataset
-from peft import (
-    LoraConfig,
-    PeftConfig,
-    PeftModel,
-    TaskType,
-    get_peft_model,
-)
+from peft import LoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model
 from sklearn.metrics import accuracy_score, f1_score, precision_recall_fscore_support
 from sklearn.model_selection import train_test_split
 from transformers import (

@@ -0,0 +1,21 @@
+FROM python:3.11-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY app.py ./  
+
+EXPOSE 8000
+
+# Environment variables for configuration
+ENV MODEL=Qwen/Qwen2-0.5B-Instruct
+ENV SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
+ENV LLM_KATAN_URL=http://localhost:8001
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
@@ -0,0 +1,81 @@
+# LLM Katan Server
+
+A FastAPI wrapper around [llm-katan](https://pypi.org/project/llm-katan/) that exposes the same API as mock-vllm but serves responses from a real LLM backend instead of canned output.
+
+## Architecture
+
+This server acts as a proxy that:
+
+1. Receives OpenAI-compatible API requests
+2. Forwards them to a running `llm-katan` instance
+3. Returns the responses with proper model name mapping
+4. Falls back to echo behavior if `llm-katan` is unavailable
+
+## Features
+
+- Same API design as mock-vllm (FastAPI-based)
+- Proxies requests to real `llm-katan` backend
+- OpenAI-compatible API endpoints:
+  - `GET /health`
+  - `GET /v1/models`
+  - `POST /v1/chat/completions`
+- Fallback behavior when backend is unavailable
+- Configurable via environment variables
+
+## Environment Variables
+
+- `MODEL`: HuggingFace model name for llm-katan (default: `Qwen/Qwen2-0.5B-Instruct`)
+- `SERVED_MODEL_NAME`: Model name to expose in the API (default: same as `MODEL`)
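+- `LLM_KATAN_URL`: URL of the llm-katan backend (default: `http://localhost:8001`). When this server runs inside a container, `localhost` refers to the container itself, so set this to an address the container can reach (e.g. `http://llm-katan-backend:8001` in the Compose example below).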
+- `HUGGINGFACE_HUB_TOKEN`: HuggingFace authentication token
+
+## Setup
+
+### 1. Start llm-katan backend
+
+```bash
+# Install llm-katan
+pip install llm-katan
+
+# Start llm-katan server on port 8001
+llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001
+```
+
+### 2. Start this FastAPI server
+
+```bash
+# Using Docker (build the image first: docker build -t llm-katan-server .)
+docker run -p 8000:8000 llm-katan-server
+
+# Or directly with Python
+pip install -r requirements.txt
+python app.py
+```
+
+## Usage
+
+### Docker Compose (Recommended)
+
+```yaml
+services:
+  llm-katan-backend:
+    image: python:3.11-slim
+    command: >
+      sh -c "pip install llm-katan && 
+             llm-katan --model Qwen/Qwen2-0.5B-Instruct --port 8001 --host 0.0.0.0"
+    ports:
+      - "8001:8001"
+    environment:
+      - HUGGINGFACE_HUB_TOKEN=${HUGGINGFACE_HUB_TOKEN}
+
+  llm-katan-server:
+    build: .
+    ports:
+      - "8000:8000"
+    environment:
+      - MODEL=Qwen/Qwen2-0.5B-Instruct
+      - SERVED_MODEL_NAME=Qwen/Qwen2-0.5B-Instruct
+      - LLM_KATAN_URL=http://llm-katan-backend:8001
+    depends_on:
+      - llm-katan-backend
+```