Skip to content

Commit dfe2f01

Browse files
committed
Enhance local deployment pipeline with chat interface integration and Kubernetes configuration updates
1 parent dd6ab06 commit dfe2f01

File tree

9 files changed

+369
-17
lines changed

9 files changed

+369
-17
lines changed
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
11
from steps.bento_builder import bento_builder
22
from steps.bento_deployment import bento_deployment
3+
from steps.visualize_chat import create_chat_interface
34
from zenml import pipeline
45

56

67
@pipeline(enable_cache=False)
78
def local_deployment():
89
bento = bento_builder()
910
bento_deployment(bento)
11+
create_chat_interface()
1012

1113
#vllm_model_deployer_step()

llm-complete-guide/run.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
rag_deployment,
5151
llm_index_and_evaluate,
5252
local_deployment,
53+
production_deployment,
5354
)
5455
from structures import Document
5556
from zenml.materializers.materializer_registry import materializer_registry
@@ -144,6 +145,12 @@
144145
default=None,
145146
help="Path to config",
146147
)
148+
@click.option(
149+
"--env",
150+
"env",
151+
default="local",
152+
help="The environment to use for the completion.",
153+
)
147154
def main(
148155
pipeline: str,
149156
query_text: Optional[str] = None,
@@ -154,6 +161,7 @@ def main(
154161
use_argilla: bool = False,
155162
use_reranker: bool = False,
156163
config: Optional[str] = None,
164+
env: str = "local",
157165
):
158166
"""Main entry point for the pipeline execution.
159167
@@ -167,6 +175,7 @@ def main(
167175
use_argilla (bool): If True, Argilla annotations will be used
168176
use_reranker (bool): If True, rerankers will be used
169177
config (Optional[str]): Path to config file
178+
env (str): The environment to use for the deployment (local, huggingface space, k8s etc.)
170179
"""
171180
pipeline_args = {"enable_cache": not no_cache}
172181
embeddings_finetune_args = {
@@ -259,9 +268,18 @@ def main(
259268
)()
260269

261270
elif pipeline == "deploy":
262-
#rag_deployment.with_options(model=zenml_model, **pipeline_args)()
263-
local_deployment.with_options(model=zenml_model, **pipeline_args)()
264-
271+
if env == "local":
272+
local_deployment.with_options(
273+
model=zenml_model, config_path=config_path, **pipeline_args
274+
)()
275+
elif env == "huggingface":
276+
rag_deployment.with_options(
277+
model=zenml_model, config_path=config_path, **pipeline_args
278+
)()
279+
elif env == "k8s":
280+
production_deployment.with_options(
281+
model=zenml_model, config_path=config_path, **pipeline_args
282+
)()
265283
elif pipeline == "evaluation":
266284
pipeline_args["enable_cache"] = False
267285
llm_eval.with_options(model=zenml_model, config_path=config_path)()

llm-complete-guide/service.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,18 @@
2626
"timeout": 300,
2727
"concurrency": 256,
2828
},
29+
http={
30+
"cors": {
31+
"enabled": True,
32+
"access_control_allow_origins": ["https://cloud.zenml.io"], # Add your allowed origins
33+
"access_control_allow_methods": ["GET", "OPTIONS", "POST", "HEAD", "PUT"],
34+
"access_control_allow_credentials": True,
35+
"access_control_allow_headers": ["*"],
36+
# "access_control_allow_origin_regex": "https://.*\.my_org\.com", # Optional regex
37+
"access_control_max_age": 1200,
38+
"access_control_expose_headers": ["Content-Length"],
39+
}
40+
}
2941
)
3042
class RAGService:
3143
"""RAG service for generating responses using LLM and RAG."""

llm-complete-guide/steps/bento_builder.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
)
3232
from zenml.integrations.bentoml.steps import bento_builder_step
3333
from zenml.logger import get_logger
34+
from zenml.orchestrators.utils import get_config_environment_vars
3435
from zenml.utils import source_utils
3536

3637
logger = get_logger(__name__)
@@ -64,6 +65,7 @@ def bento_builder() -> (
6465
if Client().active_stack.orchestrator.flavor == "local":
6566
model = get_step_context().model
6667
version_to_deploy = Model(name=model.name, version="production")
68+
logger.info(f"Building BentoML bundle for model: {version_to_deploy.name}")
6769
# Build the BentoML bundle
6870
bento = bentos.build(
6971
service="service.py:RAGService",

llm-complete-guide/steps/bento_dockerizer.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
logger = get_logger(__name__)
3030

31-
@step
31+
@step(enable_cache=False)
3232
def bento_dockerizer() -> (
3333
Annotated[
3434
str,
@@ -40,12 +40,11 @@ def bento_dockerizer() -> (
4040
This step is responsible for dockerizing the BentoML model.
4141
"""
4242
### ADD YOUR OWN CODE HERE - THIS IS JUST AN EXAMPLE ###
43+
zenml_client = Client()
4344
model = get_step_context().model
44-
version_to_deploy = Model(name=model.name, version="production")
45-
bentoml_deployment = version_to_deploy.get_model_artifact(name="bentoml_rag_deployment")
45+
version_to_deploy = Model(name=model.name)
46+
bentoml_deployment = zenml_client.get_artifact_version(name_id_or_prefix="bentoml_rag_deployment")
4647
bento_tag = f'{bentoml_deployment.run_metadata["bento_tag_name"]}:{bentoml_deployment.run_metadata["bento_info_version"]}'
47-
48-
zenml_client = Client()
4948
container_registry = zenml_client.active_stack.container_registry
5049
assert container_registry, "Container registry is not configured."
5150
image_name = f"{container_registry.config.uri}/{bento_tag}"

llm-complete-guide/steps/k8s_deployment.py

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,21 @@
1111
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
1212
# or implied. See the License for the specific language governing
1313
# permissions and limitations under the License.
14-
from pathlib import Path
15-
from typing import Dict, Optional
1614
import re
15+
from pathlib import Path
16+
from typing import Dict, Optional, cast
17+
1718
import yaml
1819
from kubernetes import client, config
1920
from kubernetes.client.rest import ApiException
2021
from zenml import get_step_context, step
2122
from zenml.client import Client
23+
from zenml.integrations.bentoml.services.bentoml_local_deployment import (
24+
BentoMLLocalDeploymentConfig,
25+
BentoMLLocalDeploymentService,
26+
)
2227
from zenml.logger import get_logger
28+
from zenml.orchestrators.utils import get_config_environment_vars
2329

2430
logger = get_logger(__name__)
2531

@@ -93,7 +99,7 @@ def apply_kubernetes_configuration(k8s_configs: list) -> None:
9399
logger.error(f"Error applying {kind} {name}: {e}")
94100
raise e
95101

96-
@step
102+
@step(enable_cache=False)
97103
def k8s_deployment(
98104
docker_image_tag: str,
99105
namespace: str = "default"
@@ -103,6 +109,17 @@ def k8s_deployment(
103109
# Sanitize the model name
104110
model_name = sanitize_name(raw_model_name)
105111

112+
# Get environment variables
113+
environment_vars = get_config_environment_vars()
114+
115+
# Get current deployment
116+
zenml_client = Client()
117+
model_deployer = zenml_client.active_stack.model_deployer
118+
services = model_deployer.find_model_server(
119+
model_name=model_name,
120+
model_version="production",
121+
)
122+
106123
# Read the K8s template
107124
template_path = Path(__file__).parent / "k8s_template.yaml"
108125
with open(template_path, "r") as f:
@@ -120,6 +137,23 @@ def k8s_deployment(
120137
if config["kind"] == "Service":
121138
# Update service selector
122139
config["spec"]["selector"]["app"] = model_name
140+
141+
# Update metadata annotations with SSL certificate ARN
142+
config["metadata"]["annotations"] = {
143+
"service.beta.kubernetes.io/aws-load-balancer-ssl-cert": "arn:aws:acm:eu-central-1:339712793861:certificate/0426ace8-5fa3-40dd-bd81-b0fb1064bd85",
144+
"service.beta.kubernetes.io/aws-load-balancer-backend-protocol": "http",
145+
"service.beta.kubernetes.io/aws-load-balancer-ssl-ports": "443",
146+
"service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout": "3600"
147+
}
148+
149+
# Update ports
150+
config["spec"]["ports"] = [
151+
{
152+
"name": "https",
153+
"port": 443,
154+
"targetPort": 3000
155+
}
156+
]
123157

124158
elif config["kind"] == "Deployment":
125159
# Update deployment selector and template
@@ -131,6 +165,12 @@ def k8s_deployment(
131165
for container in containers:
132166
container["name"] = model_name
133167
container["image"] = docker_image_tag
168+
169+
# Add environment variables to the container
170+
env_vars = []
171+
for key, value in environment_vars.items():
172+
env_vars.append({"name": key, "value": value})
173+
container["env"] = env_vars
134174

135175
# Apply the configurations
136176
try:
@@ -149,9 +189,22 @@ def k8s_deployment(
149189
"namespace": namespace,
150190
"status": deployment_status,
151191
"service_port": 3000,
152-
"configurations": k8s_configs
192+
"configurations": k8s_configs,
193+
"url": "chat-rag.staging.cloudinfra.zenml.io"
153194
}
154195

196+
if services:
197+
bentoml_deployment= cast(BentoMLLocalDeploymentService, services[0])
198+
zenml_client.update_service(
199+
id=bentoml_deployment.uuid,
200+
prediction_url="https://chat-rag.staging.cloudinfra.zenml.io",
201+
health_check_url="https://chat-rag.staging.cloudinfra.zenml.io/healthz",
202+
labels={
203+
"docker_image": docker_image_tag,
204+
"namespace": namespace,
205+
}
206+
)
207+
155208
return deployment_info
156209

157210

llm-complete-guide/k8s_template.yaml renamed to llm-complete-guide/steps/k8s_template.yaml

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,22 @@
11
apiVersion: v1
22
kind: Service
33
metadata:
4+
name: placeholder
45
labels:
56
app: placeholder
6-
name: placeholder
7+
annotations:
8+
service.beta.kubernetes.io/aws-load-balancer-ssl-cert: arn:aws:acm:region:account-id:certificate/certificate-id
9+
service.beta.kubernetes.io/aws-load-balancer-backend-protocol: http
10+
service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "443"
11+
service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
712
spec:
8-
ports:
9-
- name: http # Changed from 'predict' to 'http' for clarity
10-
port: 80 # External port exposed by LoadBalancer
11-
targetPort: 3000 # Internal container port
1213
selector:
1314
app: placeholder
1415
type: LoadBalancer
16+
ports:
17+
- name: https
18+
port: 443 # External port exposed by LoadBalancer (HTTPS)
19+
targetPort: 3000 # Internal container port
1520
---
1621
apiVersion: apps/v1
1722
kind: Deployment

0 commit comments

Comments
 (0)