vllm-project
diff --git a/deploy/kubernetes/config.yaml b/deploy/kubernetes/config.yaml
Lines changed: 1 addition & 1 deletion
diff --git a/deploy/kubernetes/deployment.yaml b/deploy/kubernetes/deployment.yaml
Lines changed: 151 additions & 109 deletions
diff --git a/deploy/kubernetes/kustomization.yaml b/deploy/kubernetes/kustomization.yaml
Lines changed: 1 addition & 7 deletions
diff --git a/deploy/kubernetes/pv-models.yaml b/deploy/kubernetes/pv-models.yaml
Lines changed: 34 additions & 0 deletions
diff --git a/deploy/kubernetes/pvc.yaml b/deploy/kubernetes/pvc.yaml
Lines changed: 0 additions & 13 deletions
@@ -1,5 +1,5 @@
 bert_model:
-  model_id: sentence-transformers/all-MiniLM-L12-v2
+  model_id: models/all-MiniLM-L12-v2
   threshold: 0.6
   use_cpu: true
 
 
@@ -16,121 +16,163 @@ spec:
         app: semantic-router
     spec:
       initContainers:
-      - name: model-downloader
-        image: python:3.11-slim
-        securityContext:
-          runAsNonRoot: false
-          allowPrivilegeEscalation: false
-        command: ["/bin/bash", "-c"]
-        args:
-        - |
-          set -e
-          echo "Installing Hugging Face CLI..."
-          pip install --no-cache-dir huggingface_hub[cli]
+        - name: model-downloader
+          image: python:3.11-slim
+          securityContext:
+            runAsNonRoot: false
+            allowPrivilegeEscalation: false
+          command: ["/bin/bash", "-c"]
+          args:
+            - |
+              set -e
+              # Check if all required models already exist in PVC; if yes, skip downloads entirely
+              REQUIRED_DIRS=(
+                "all-MiniLM-L12-v2"
+                "category_classifier_modernbert-base_model"
+                "pii_classifier_modernbert-base_model"
+                "jailbreak_classifier_modernbert-base_model"
+                "pii_classifier_modernbert-base_presidio_token_model"
+              )
+              mkdir -p /app/models
+              cd /app/models
+              MISSING=false
+              for d in "${REQUIRED_DIRS[@]}"; do
+                if [ ! -d "$d" ]; then
+                  MISSING=true
+                  break
+                fi
+              done
+              if [ "$MISSING" = false ]; then
+                echo "All required models already present in PVC. Skipping download."
+                exit 0
+              fi
 
-          echo "Downloading models to persistent volume..."
-          cd /app/models
+              echo "Installing Hugging Face CLI..."
+              pip install --no-cache-dir huggingface_hub[cli]
 
-          # Download category classifier model
-          if [ ! -d "category_classifier_modernbert-base_model" ]; then
-            echo "Downloading category classifier model..."
-            huggingface-cli download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model
-          else
-            echo "Category classifier model already exists, skipping..."
-          fi
+              echo "Downloading missing models to persistent volume..."
 
-          # Download PII classifier model
-          if [ ! -d "pii_classifier_modernbert-base_model" ]; then
-            echo "Downloading PII classifier model..."
-            huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model
-          else
-            echo "PII classifier model already exists, skipping..."
-          fi
+              # Download all-MiniLM-L12-v2 model
+              if [ ! -d "all-MiniLM-L12-v2" ]; then
+                echo "Downloading all-MiniLM-L12-v2 model..."
+                hf download sentence-transformers/all-MiniLM-L12-v2 --local-dir all-MiniLM-L12-v2
+              else
+                echo "all-MiniLM-L12-v2 model already exists, skipping..."
+              fi
 
-          # Download jailbreak classifier model
-          if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then
-            echo "Downloading jailbreak classifier model..."
-            huggingface-cli download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model
-          else
-            echo "Jailbreak classifier model already exists, skipping..."
-          fi
+              # Download category classifier model
+              if [ ! -d "category_classifier_modernbert-base_model" ]; then
+                echo "Downloading category classifier model..."
+                hf download LLM-Semantic-Router/category_classifier_modernbert-base_model --local-dir category_classifier_modernbert-base_model
+              else
+                echo "Category classifier model already exists, skipping..."
+              fi
 
-          # Download PII token classifier model
-          if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then
-            echo "Downloading PII token classifier model..."
-            huggingface-cli download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model
-          else
-            echo "PII token classifier model already exists, skipping..."
-          fi
+              # Download PII classifier model
+              if [ ! -d "pii_classifier_modernbert-base_model" ]; then
+                echo "Downloading PII classifier model..."
+                hf download LLM-Semantic-Router/pii_classifier_modernbert-base_model --local-dir pii_classifier_modernbert-base_model
+              else
+                echo "PII classifier model already exists, skipping..."
+              fi
 
-          echo "All models downloaded successfully!"
-          ls -la /app/models/
-        env:
-        - name: HF_HUB_CACHE
-          value: /tmp/hf_cache
-        # Reduced resource requirements for init container
-        resources:
-          requests:
-            memory: "512Mi"
-            cpu: "250m"
-          limits:
-            memory: "1Gi"
-            cpu: "500m"
-        volumeMounts:
-        - name: models-volume
-          mountPath: /app/models
+              # Download jailbreak classifier model
+              if [ ! -d "jailbreak_classifier_modernbert-base_model" ]; then
+                echo "Downloading jailbreak classifier model..."
+                hf download LLM-Semantic-Router/jailbreak_classifier_modernbert-base_model --local-dir jailbreak_classifier_modernbert-base_model
+              else
+                echo "Jailbreak classifier model already exists, skipping..."
+              fi
+
+              # Download PII token classifier model
+              if [ ! -d "pii_classifier_modernbert-base_presidio_token_model" ]; then
+                echo "Downloading PII token classifier model..."
+                hf download LLM-Semantic-Router/pii_classifier_modernbert-base_presidio_token_model --local-dir pii_classifier_modernbert-base_presidio_token_model
+              else
+                echo "PII token classifier model already exists, skipping..."
+              fi
+
+              echo "All missing models downloaded successfully!"
+              ls -la /app/models/
+          env:
+            - name: HF_HUB_CACHE
+              value: /tmp/hf_cache
+            # China mirror. huggingface_hub reads HF_ENDPOINT and HF_HUB_* names only;
+            # HUGGINGFACE_HUB_PROXY_URL / _DOWNLOAD_TIMEOUT are not recognized by it.
+            - name: HF_ENDPOINT
+              value: "https://hf-mirror.com"
+            - name: HF_HUB_DOWNLOAD_TIMEOUT
+              value: "300"
+            # HF_HUB_ENABLE_HF_TRANSFER stays unset: the pip step does not install hf_transfer.
+            # Package index used by the `pip install` step of the script above.
+            - name: PIP_INDEX_URL
+              value: https://pypi.tuna.tsinghua.edu.cn/simple
+            # Hosts and ranges that should bypass any configured HTTP(S) proxy.
+            - name: NO_PROXY
+              value: localhost,127.0.0.1,10.0.0.0/8,172.16.0.0/12,192.168.0.0/16,.svc,.svc.cluster.local
+          # Reduced resource requirements for init container
+          resources:
+            requests:
+              memory: "512Mi"
+              cpu: "250m"
+            limits:
+              memory: "1Gi"
+              cpu: "500m"
+          volumeMounts:
+            - name: models-volume
+              mountPath: /app/models
       containers:
-      - name: semantic-router
-        image: ghcr.io/vllm-project/semantic-router/extproc:latest
-        args: ["--secure=true"]
-        securityContext:
-          runAsNonRoot: false
-          allowPrivilegeEscalation: false
-        ports:
-        - containerPort: 50051
-          name: grpc
-          protocol: TCP
-        - containerPort: 9190
-          name: metrics
-          protocol: TCP
-        - containerPort: 8080
-          name: classify-api
-          protocol: TCP
-        env:
-        - name: LD_LIBRARY_PATH
-          value: "/app/lib"
-        volumeMounts:
+        - name: semantic-router
+          image: ghcr.io/vllm-project/semantic-router/extproc:latest # NOTE(review): mutable tag; consider pinning
+          args: ["--secure=true"]
+          securityContext:
+            runAsNonRoot: false # root is permitted; NOTE(review): confirm the image needs it
+            allowPrivilegeEscalation: false
+          ports:
+            - containerPort: 50051
+              name: grpc
+              protocol: TCP
+            - containerPort: 9190
+              name: metrics
+              protocol: TCP
+            - containerPort: 8080
+              name: classify-api
+              protocol: TCP
+          env:
+            - name: LD_LIBRARY_PATH
+              value: "/app/lib"
+          volumeMounts:
+            - name: config-volume # backed by the semantic-router-config ConfigMap
+              mountPath: /app/config
+              readOnly: true
+            - name: models-volume # same volume the model-downloader init container populates
+              mountPath: /app/models
+          livenessProbe:
+            tcpSocket:
+              port: 50051 # the grpc port declared above
+            initialDelaySeconds: 60
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 3
+          readinessProbe:
+            tcpSocket:
+              port: 50051 # the grpc port declared above
+            initialDelaySeconds: 90 # first readiness check runs later than the first liveness check (60s)
+            periodSeconds: 30
+            timeoutSeconds: 10
+            failureThreshold: 3
+          # Significantly reduced resource requirements for kind cluster
+          resources:
+            requests:
+              memory: "3Gi" # Reduced from 8Gi
+              cpu: "1" # Reduced from 2
+            limits:
+              memory: "6Gi" # Reduced from 12Gi
+              cpu: "2" # Reduced from 4
+      volumes:
         - name: config-volume
-          mountPath: /app/config
-          readOnly: true
+          configMap:
+            name: semantic-router-config
         - name: models-volume
-          mountPath: /app/models
-        livenessProbe:
-          tcpSocket:
-            port: 50051
-          initialDelaySeconds: 60
-          periodSeconds: 30
-          timeoutSeconds: 10
-          failureThreshold: 3
-        readinessProbe:
-          tcpSocket:
-            port: 50051
-          initialDelaySeconds: 90
-          periodSeconds: 30
-          timeoutSeconds: 10
-          failureThreshold: 3
-        # Significantly reduced resource requirements for kind cluster
-        resources:
-          requests:
-            memory: "3Gi"    # Reduced from 8Gi
-            cpu: "1"         # Reduced from 2
-          limits:
-            memory: "6Gi"    # Reduced from 12Gi
-            cpu: "2"         # Reduced from 4
-      volumes:
-      - name: config-volume
-        configMap:
-          name: semantic-router-config
-      - name: models-volume
-        persistentVolumeClaim:
-          claimName: semantic-router-models
+          persistentVolumeClaim:
+            claimName: semantic-router-models
@@ -6,7 +6,7 @@ metadata:
 
 resources:
   - namespace.yaml
-  - pvc.yaml
+  - pv-models.yaml
   - deployment.yaml
   - service.yaml
 
@@ -17,10 +17,4 @@ configMapGenerator:
       - config.yaml
       - tools_db.json
 
-# Namespace for all resources
 namespace: vllm-semantic-router-system
-
-images:
-  - name: ghcr.io/vllm-project/semantic-router/extproc
-    newName: semantic-router-extproc
-    newTag: local
@@ -0,0 +1,34 @@
+# PersistentVolume for the model files, backed by a hostPath directory on the node.
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: semantic-router-models-pv
+  labels:
+    app: semantic-router
+spec:
+  capacity:
+    storage: 20Gi
+  accessModes:
+    - ReadWriteOnce
+  storageClassName: standard  # same class name as the claim below
+  persistentVolumeReclaimPolicy: Retain  # volume and its data are kept after the claim is deleted
+  hostPath:
+    path: /mnt/models  # NOTE(review): node-local path with no nodeAffinity; contents differ per node
+    type: DirectoryOrCreate  # the directory is created on the node if it does not exist
+---
+# PersistentVolumeClaim for the model files, bound by name to the PersistentVolume above.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: semantic-router-models
+  namespace: vllm-semantic-router-system
+  labels:
+    app: semantic-router
+spec:
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 20Gi  # equal to the PV capacity
+  storageClassName: standard
+  volumeName: semantic-router-models-pv  # static binding to the PV defined in this file