Commit cd7e05a

working solution
Signed-off-by: Ryan Cook <[email protected]>
1 parent: ba8d60a

File tree

6 files changed: +141 additions, −66 deletions


deploy/kserve/QUICKSTART.md

Lines changed: 16 additions & 4 deletions
@@ -81,9 +81,9 @@ Just need semantic routing with defaults:
 ./deploy.sh -n myproject -i mymodel -m mymodel
 ```
 
-### Scenario 2: Custom Storage
+### Scenario 2: Custom Storage and Embedding Model
 
-Using a specific storage class or larger PVCs:
+Using a specific storage class, larger PVCs, and custom embedding model:
 
 ```bash
 ./deploy.sh \
@@ -92,9 +92,16 @@ Using a specific storage class or larger PVCs:
   -m mymodel \
   -s gp3-csi \
   --models-pvc-size 20Gi \
-  --cache-pvc-size 10Gi
+  --cache-pvc-size 10Gi \
+  --embedding-model all-mpnet-base-v2
 ```
 
+**Available Embedding Models:**
+- `all-MiniLM-L12-v2` (default) - Balanced speed/quality (~90MB)
+- `all-mpnet-base-v2` - Higher quality, larger (~420MB)
+- `all-MiniLM-L6-v2` - Faster, smaller (~80MB)
+- `paraphrase-multilingual-MiniLM-L12-v2` - Multilingual support
+
 ### Scenario 3: Preview Before Deploying
 
 Want to see what will be created first:
@@ -235,7 +242,12 @@ Simply redeploy:
 
 1. **Run validation tests**:
    ```bash
-   NAMESPACE=<ns> MODEL_NAME=<model> ./test-semantic-routing.sh
+   # Set namespace and model name
+   NAMESPACE=<namespace> MODEL_NAME=<model> ./test-semantic-routing.sh
+
+   # Or let the script auto-detect from your deployment
+   cd deploy/kserve
+   ./test-semantic-routing.sh
    ```
 
 2. **Customize configuration**: See [README.md](./README.md) for detailed configuration options:
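
To confirm which embedding model a rendered deployment actually picked up, the substituted value can be read back from the router ConfigMap. A minimal sketch; the ConfigMap name `semantic-router-config` is an assumption, so check `metadata.name` in configmap-router-config.yaml for the real one:

```bash
# Read the rendered config back out of the cluster and look for model_id.
# "semantic-router-config" is an assumed ConfigMap name - adjust as needed.
kubectl -n myproject get configmap semantic-router-config \
  -o jsonpath='{.data.config\.yaml}' | grep model_id
# expected for the default: model_id: models/all-MiniLM-L12-v2
```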

deploy/kserve/README.md

Lines changed: 5 additions & 0 deletions
@@ -417,7 +417,12 @@ time curl -k -s "https://$ROUTER_URL/v1/chat/completions" \
 Run comprehensive validation tests:
 
 ```bash
+# Set environment variables and run tests
 NAMESPACE=$NAMESPACE MODEL_NAME=my-model-name ./test-semantic-routing.sh
+
+# Or let the script auto-detect from config
+cd deploy/kserve
+./test-semantic-routing.sh
 ```
 
 ## Configuration Deep Dive
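
The inline `VAR=value command` form scopes the variables to a single run; exporting them once is equivalent shell behavior and saves retyping across repeated test runs. A small sketch with placeholder values:

```bash
# Equivalent to the inline form, but the variables persist for the session.
export NAMESPACE=semantic        # placeholder namespace
export MODEL_NAME=granite32-8b   # placeholder model name
cd deploy/kserve
./test-semantic-routing.sh
```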

deploy/kserve/configmap-router-config.yaml

Lines changed: 15 additions & 7 deletions
@@ -8,7 +8,7 @@ metadata:
 data:
   config.yaml: |
     bert_model:
-      model_id: models/all-MiniLM-L12-v2
+      model_id: models/{{EMBEDDING_MODEL}}
       threshold: 0.6
       use_cpu: true
 
@@ -25,7 +25,7 @@ data:
       embedding_model: "bert"
 
     tools:
-      enabled: true
+      enabled: false # Disabled - tools_db.json not included in KServe deployment
       top_k: 3
       similarity_threshold: 0.2
       tools_db_path: "config/tools_db.json"
@@ -203,11 +203,19 @@ data:
       duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10, 30]
       size_buckets: [1, 2, 5, 10, 20, 50, 100, 200]
 
-    # Embedding Models Configuration
-    embedding_models:
-      qwen3_model_path: "models/Qwen3-Embedding-0.6B"
-      gemma_model_path: "models/embeddinggemma-300m"
-      use_cpu: true
+    # Embedding Models Configuration (Optional)
+    # These are SEPARATE from the bert_model above and are used for the /v1/embeddings API endpoint.
+    # The bert_model (configured above) is used for semantic caching and tools similarity.
+    #
+    # To enable the embeddings API with Qwen3/Gemma models:
+    # 1. Uncomment the section below
+    # 2. Update the deployment init container to download these models
+    # 3. Note: These models are large (~600MB each) and not required for routing functionality
+    #
+    # embedding_models:
+    #   qwen3_model_path: "models/Qwen3-Embedding-0.6B"
+    #   gemma_model_path: "models/embeddinggemma-300m"
+    #   use_cpu: true
 
     # Observability Configuration
     observability:
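
The new comments draw a line between the always-on `bert_model` (semantic caching and tools similarity) and the optional `embedding_models` block that backs the `/v1/embeddings` endpoint. If that block is uncommented and the models are downloaded, a request along OpenAI-style lines should exercise it; the host, port, and body fields below are assumptions rather than a confirmed API shape:

```bash
# Hedged sketch: probe the optional /v1/embeddings endpoint.
# Router host/port and the request fields are assumptions - adjust
# to the router's actual service address and API contract.
curl -s "http://<router-host>:<port>/v1/embeddings" \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3", "input": "What is semantic routing?"}'
```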

deploy/kserve/deploy.sh

Lines changed: 17 additions & 3 deletions
@@ -22,6 +22,13 @@ MODEL_NAME=""
 STORAGE_CLASS=""
 MODELS_PVC_SIZE="10Gi"
 CACHE_PVC_SIZE="5Gi"
+# Embedding model for semantic caching and tools similarity
+# Common options from sentence-transformers:
+#   - all-MiniLM-L12-v2 (default, balanced speed/quality)
+#   - all-mpnet-base-v2 (higher quality, slower)
+#   - all-MiniLM-L6-v2 (faster, lower quality)
+#   - paraphrase-multilingual-MiniLM-L12-v2 (multilingual)
+EMBEDDING_MODEL="all-MiniLM-L12-v2"
 DRY_RUN=false
 SKIP_VALIDATION=false
 
@@ -41,6 +48,7 @@ Optional:
   -s, --storage-class CLASS  StorageClass for PVCs (default: cluster default)
   --models-pvc-size SIZE     Size for models PVC (default: 10Gi)
   --cache-pvc-size SIZE      Size for cache PVC (default: 5Gi)
+  --embedding-model MODEL    BERT embedding model (default: all-MiniLM-L12-v2)
   --dry-run                  Generate manifests without applying
   --skip-validation          Skip pre-deployment validation
   -h, --help                 Show this help message
@@ -49,8 +57,8 @@ Examples:
   # Deploy to namespace 'semantic' with granite32-8b model
   $0 -n semantic -i granite32-8b -m granite32-8b
 
-  # Deploy with custom storage class
-  $0 -n myproject -i llama3-70b -m llama3-70b -s gp3-csi
+  # Deploy with custom storage class and embedding model
+  $0 -n myproject -i llama3-70b -m llama3-70b -s gp3-csi --embedding-model all-mpnet-base-v2
 
   # Dry run to see what will be deployed
   $0 -n semantic -i granite32-8b -m granite32-8b --dry-run
@@ -93,6 +101,10 @@ while [[ $# -gt 0 ]]; do
       CACHE_PVC_SIZE="$2"
       shift 2
       ;;
+    --embedding-model)
+      EMBEDDING_MODEL="$2"
+      shift 2
+      ;;
     --dry-run)
      DRY_RUN=true
       shift
@@ -129,6 +141,7 @@ echo -e "${BLUE}Configuration:${NC}"
 echo "  Namespace: $NAMESPACE"
 echo "  InferenceService: $INFERENCESERVICE_NAME"
 echo "  Model Name: $MODEL_NAME"
+echo "  Embedding Model: $EMBEDDING_MODEL"
 echo "  Storage Class: ${STORAGE_CLASS:-<cluster default>}"
 echo "  Models PVC Size: $MODELS_PVC_SIZE"
 echo "  Cache PVC Size: $CACHE_PVC_SIZE"
@@ -237,7 +250,7 @@ fi
 echo -e "${BLUE}Step 2: Generating manifests...${NC}"
 
 TEMP_DIR=$(mktemp -d)
-trap "rm -rf $TEMP_DIR" EXIT
+trap 'rm -rf "$TEMP_DIR"' EXIT
 
 # Function to substitute variables in a file
 substitute_vars() {
@@ -247,6 +260,7 @@ substitute_vars() {
   sed -e "s/{{NAMESPACE}}/$NAMESPACE/g" \
     -e "s/{{INFERENCESERVICE_NAME}}/$INFERENCESERVICE_NAME/g" \
     -e "s/{{MODEL_NAME}}/$MODEL_NAME/g" \
+    -e "s|{{EMBEDDING_MODEL}}|$EMBEDDING_MODEL|g" \
    -e "s/{{PREDICTOR_SERVICE_IP}}/${PREDICTOR_SERVICE_IP:-10.0.0.1}/g" \
     -e "s/{{MODELS_PVC_SIZE}}/$MODELS_PVC_SIZE/g" \
     -e "s/{{CACHE_PVC_SIZE}}/$CACHE_PVC_SIZE/g" \

deploy/kserve/deployment.yaml

Lines changed: 20 additions & 12 deletions
@@ -63,7 +63,7 @@ spec:
             ("LLM-Semantic-Router/pii_classifier_modernbert-base_model", "pii_classifier_modernbert-base_model"),
             ("LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model", "jailbreak_classifier_modernbert-base_model"),
             ("LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model", "pii_classifier_modernbert-base_presidio_token_model"),
-            ("sentence-transformers/all-MiniLM-L12-v2", "all-MiniLM-L12-v2")
+            ("sentence-transformers/{{EMBEDDING_MODEL}}", "{{EMBEDDING_MODEL}}")
         ]
 
         cache_dir = "/app/cache/hf"
@@ -72,17 +72,25 @@ spec:
         for repo_id, local_dir_name in models:
             local_dir = os.path.join(base_dir, local_dir_name)
 
-            # Check if model already exists
+            # Check if model weights actually exist (not just the directory)
             has_model = False
             if os.path.exists(local_dir):
-                for ext in ['.safetensors', '.bin', 'pytorch_model.']:
-                    for root, dirs, files in os.walk(local_dir):
-                        if any(ext in f for f in files):
+                # Look specifically for model weight files
+                for root, dirs, files in os.walk(local_dir):
+                    for f in files:
+                        if f.endswith('.safetensors') or f.endswith('.bin') or f.startswith('pytorch_model.'):
                             has_model = True
+                            print(f"Found model weights: {f}")
                             break
                     if has_model:
                         break
 
+            # Clean up incomplete downloads
+            if os.path.exists(local_dir) and not has_model:
+                print(f"Removing incomplete download: {local_dir_name}")
+                import shutil
+                shutil.rmtree(local_dir, ignore_errors=True)
+
             if not has_model:
                 print(f"Downloading {repo_id}...")
                 snapshot_download(
@@ -124,11 +132,11 @@ spec:
           value: /tmp/python_user/bin:/usr/local/bin:/usr/bin:/bin
         resources:
           requests:
-            memory: "512Mi"
-            cpu: "250m"
-          limits:
             memory: "1Gi"
             cpu: "500m"
+          limits:
+            memory: "2Gi"
+            cpu: "1"
         volumeMounts:
         - name: models-volume
           mountPath: /app/models
@@ -194,11 +202,11 @@ spec:
           failureThreshold: 3
         resources:
           requests:
-            memory: "3Gi"
-            cpu: "1"
+            memory: "4Gi"
+            cpu: "1500m"
           limits:
-            memory: "6Gi"
-            cpu: "2"
+            memory: "8Gi"
+            cpu: "3"
 
       # Envoy proxy container - routes to KServe endpoints
       - name: envoy-proxy
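
The init-container change stops treating a bare directory as a finished download: only real weight files (`*.safetensors`, `*.bin`, `pytorch_model.*`) count, and a partial directory is removed and re-pulled. The same check is handy when inspecting the models PVC from a debug pod; a minimal sketch, assuming the deployment's `/app/models/<model>` layout. On the resource bumps: `1500m` means 1.5 CPU cores, and requests are what the scheduler reserves while limits are the hard cap.

```bash
# Hedged sketch: verify a model directory on the PVC really contains weights,
# mirroring the init container's new check. The path assumes the /app/models
# layout used by this deployment.
MODEL_DIR="/app/models/all-MiniLM-L12-v2"
if find "$MODEL_DIR" -type f \
     \( -name '*.safetensors' -o -name '*.bin' -o -name 'pytorch_model.*' \) \
     2>/dev/null | grep -q .; then
  echo "weights present"
else
  echo "no weight files - incomplete download, safe to delete and re-pull"
fi
```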
