diff --git a/agentic_rag/gradio_app.py b/agentic_rag/gradio_app.py index 7c39376..916b4b8 100644 --- a/agentic_rag/gradio_app.py +++ b/agentic_rag/gradio_app.py @@ -177,8 +177,16 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, response = agent.process_query(message) print("Query processed successfully") + # Handle string responses from Ollama models + if isinstance(response, str): + response = { + "answer": response, + "reasoning_steps": [response] if use_cot else [], + "context": [] + } + # Format response with reasoning steps if CoT is enabled - if use_cot and "reasoning_steps" in response: + if use_cot and isinstance(response, dict) and "reasoning_steps" in response: formatted_response = "🤔 Let me think about this step by step:\n\n" print("\nChain of Thought Reasoning Steps:") print("-" * 50) @@ -195,7 +203,7 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, # Add final answer print("\nFinal Answer:") print("-" * 50) - final_answer = "\n🎯 Final Answer:\n" + response["answer"] + final_answer = "\n🎯 Final Answer:\n" + response.get("answer", "No answer provided") formatted_response += final_answer print(final_answer) @@ -208,27 +216,28 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, print(sources_text) for ctx in response["context"]: - source = ctx["metadata"].get("source", "Unknown") - if "page_numbers" in ctx["metadata"]: - pages = ctx["metadata"].get("page_numbers", []) - source_line = f"- {source} (pages: {pages})\n" - else: - file_path = ctx["metadata"].get("file_path", "Unknown") - source_line = f"- {source} (file: {file_path})\n" - formatted_response += source_line - print(source_line) + if isinstance(ctx, dict) and "metadata" in ctx: + source = ctx["metadata"].get("source", "Unknown") + if "page_numbers" in ctx["metadata"]: + pages = ctx["metadata"].get("page_numbers", []) + source_line = f"- {source} (pages: {pages})\n" + else: + file_path = ctx["metadata"].get("file_path", "Unknown") + source_line = f"- {source} (file: {file_path})\n" + formatted_response += source_line + print(source_line) # Add final formatted response to history history.append([message, formatted_response]) else: # For standard response (no CoT) - formatted_response = response["answer"] + formatted_response = response.get("answer", "No answer provided") if isinstance(response, dict) else str(response) print("\nStandard Response:") print("-" * 50) print(formatted_response) # Add sources if available - if response.get("context"): + if isinstance(response, dict) and response.get("context"): print("\nSources Used:") print("-" * 50) sources_text = "\n\n📚 Sources used:\n" @@ -236,15 +245,16 @@ def chat(message: str, history: List[List[str]], agent_type: str, use_cot: bool, print(sources_text) for ctx in response["context"]: - source = ctx["metadata"].get("source", "Unknown") - if "page_numbers" in ctx["metadata"]: - pages = ctx["metadata"].get("page_numbers", []) - source_line = f"- {source} (pages: {pages})\n" - else: - file_path = ctx["metadata"].get("file_path", "Unknown") - source_line = f"- {source} (file: {file_path})\n" - formatted_response += source_line - print(source_line) + if isinstance(ctx, dict) and "metadata" in ctx: + source = ctx["metadata"].get("source", "Unknown") + if "page_numbers" in ctx["metadata"]: + pages = ctx["metadata"].get("page_numbers", []) + source_line = f"- {source} (pages: {pages})\n" + else: + file_path = ctx["metadata"].get("file_path", "Unknown") + source_line = f"- {source} 
(file: {file_path})\n" + formatted_response += source_line + print(source_line) history.append([message, formatted_response]) diff --git a/agentic_rag/local_rag_agent.py b/agentic_rag/local_rag_agent.py index a64f4c7..dbbc79b 100644 --- a/agentic_rag/local_rag_agent.py +++ b/agentic_rag/local_rag_agent.py @@ -358,8 +358,17 @@ def _process_query_with_cot(self, query: str) -> Dict[str, Any]: logger.info("Falling back to general response") return self._generate_general_response(query) + # Handle string response from synthesis + if isinstance(synthesis_result, str): + return { + "answer": synthesis_result, + "reasoning_steps": reasoning_steps, + "context": context + } + + # Handle dictionary response return { - "answer": synthesis_result["answer"], + "answer": synthesis_result.get("answer", synthesis_result) if isinstance(synthesis_result, dict) else synthesis_result, "reasoning_steps": reasoning_steps, "context": context } diff --git a/agentic_rag/tests/test_cot_chat.py b/agentic_rag/tests/test_cot_chat.py new file mode 100644 index 0000000..17e4eae --- /dev/null +++ b/agentic_rag/tests/test_cot_chat.py @@ -0,0 +1,132 @@ +import sys +import logging +import json +from pathlib import Path + +# Add parent directory to path to import modules +sys.path.append(str(Path(__file__).parent.parent)) + +from gradio_app import chat +from store import VectorStore +from local_rag_agent import LocalRAGAgent + +# Configure logging +logging.basicConfig( + level=logging.DEBUG, + format='%(asctime)s | %(name)s | %(levelname)s | %(message)s', + handlers=[ + logging.StreamHandler(sys.stdout) + ] +) + +logger = logging.getLogger(__name__) + +def debug_response_structure(response, prefix=""): + """Helper function to debug response structure""" + logger.debug(f"{prefix}Response type: {type(response)}") + if isinstance(response, dict): + logger.debug(f"{prefix}Response keys: {list(response.keys())}") + for key, value in response.items(): + logger.debug(f"{prefix}Key '{key}' type: {type(value)}") + if isinstance(value, list): + logger.debug(f"{prefix}List length: {len(value)}") + if value and isinstance(value[0], dict): + logger.debug(f"{prefix}First item keys: {list(value[0].keys())}") + elif isinstance(response, str): + logger.debug(f"{prefix}String length: {len(response)}") + logger.debug(f"{prefix}First 100 chars: {response[:100]}") + +def test_cot_chat(): + """Test the CoT chat interface with detailed logging""" + try: + # Initialize components + logger.info("Initializing vector store...") + vector_store = VectorStore() + + logger.info("Initializing local agent...") + agent = LocalRAGAgent(vector_store, model_name="ollama:phi3", use_cot=True) + + # Test message + test_message = "What is self-instruct in AI?" 
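+        # NOTE: this query assumes documents about self-instruct have already been
+        # ingested into the vector store; swap in a query that matches your own data.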
+ logger.info(f"Test message: {test_message}") + + # Initialize empty chat history + history = [] + + # Log initial state + logger.info("Initial state:") + logger.info(f"History type: {type(history)}") + logger.info(f"History length: {len(history)}") + + # Process the chat + logger.info("Processing chat...") + try: + # Get raw response from agent + logger.info("Getting raw response from agent...") + raw_response = agent.process_query(test_message) + logger.info("Raw response received") + debug_response_structure(raw_response, "Raw response: ") + + # Verify response structure + if not isinstance(raw_response, dict): + logger.error(f"Unexpected response type: {type(raw_response)}") + raise TypeError(f"Expected dict response, got {type(raw_response)}") + + required_keys = ["answer", "reasoning_steps", "context"] + missing_keys = [key for key in required_keys if key not in raw_response] + if missing_keys: + logger.error(f"Missing required keys in response: {missing_keys}") + raise KeyError(f"Response missing required keys: {missing_keys}") + + # Process through chat function + logger.info("Processing through chat function...") + result = chat( + message=test_message, + history=history, + agent_type="ollama:phi3", + use_cot=True, + collection="PDF Collection" + ) + logger.info("Chat processing completed") + debug_response_structure(result, "Final result: ") + + except Exception as e: + logger.error(f"Error during processing: {str(e)}", exc_info=True) + raise + + # Log final state + logger.info("Final state:") + logger.info(f"Result type: {type(result)}") + logger.info(f"Result length: {len(result)}") + + # Save debug information to file + debug_info = { + "test_message": test_message, + "raw_response": { + "type": str(type(raw_response)), + "keys": list(raw_response.keys()) if isinstance(raw_response, dict) else None, + "content": str(raw_response) + }, + "final_result": { + "type": str(type(result)), + "length": len(result) if isinstance(result, list) else None, + "content": str(result) + }, + "history": { + "type": str(type(history)), + "length": len(history), + "content": str(history) + } + } + + with open("cot_chat_debug.json", "w") as f: + json.dump(debug_info, f, indent=2) + + logger.info("Debug information saved to cot_chat_debug.json") + + except Exception as e: + logger.error(f"Test failed: {str(e)}", exc_info=True) + raise + +if __name__ == "__main__": + test_cot_chat() \ No newline at end of file diff --git a/nvidia-nim-oke/README.md b/nvidia-nim-oke/README.md index dbc41e3..fea1450 100644 --- a/nvidia-nim-oke/README.md +++ b/nvidia-nim-oke/README.md @@ -2,460 +2,992 @@ [![License: UPL](https://img.shields.io/badge/license-UPL-green)](https://img.shields.io/badge/license-UPL-green) -## Introduction +This guide provides step-by-step instructions for deploying NVIDIA NIM (NVIDIA Inference Microservices) on Oracle Cloud Infrastructure (OCI) using Oracle Kubernetes Engine (OKE) and GPU instances. NIM allows you to easily deploy and serve AI models like LLaMA 3 with production-ready APIs, scalability, and GPU optimization. -NVIDIA NIM is a set of easy-to-use microservices designed to accelerate the deployment of generative AI models across the cloud, data center, and workstations. NIM packages are categorized by model family and a per model basis. For example, NVIDIA NIM for large language models (LLMs) brings the power of state-of-the-art LLMs to enterprise applications, providing unmatched natural language processing and understanding capabilities. 
+--- -In this solution, we demonstrate how quickly you can get started with NVIDIA NIM (as a Large Language Model inference framework) and Oracle Container Engine for Kubernetes (as the deployment and orchestration system) on OCI. +## 📋 Prerequisites -NIM makes it easy for IT and DevOps teams to self-host large language models (LLMs) in their own managed environments while still providing developers with industry standard APIs that allow them to build powerful copilots, chatbots, and AI assistants that can transform their business. Leveraging NVIDIA’s cutting-edge GPU acceleration and scalable deployment, NIM offers the fastest path to inference with unparalleled performance. +Before starting the deployment process, ensure you have the following: -OCI provides managed Kubernetes, together with some NVIDIA A10 Tensor Core GPUs as Kubernetes nodes, to rapidly accelerate your onboarding into AI. Thanks to NVIDIA NIM, you will be able to explore from a vast collection of models, containers, Helm charts and AI projects in many fields (bioengineering, speech recognition, object detection...), and easily pull these models into your Kubernetes cluster. Following a few steps, you'll be ready to perform inference, invoke and manipulate these models (training, finetuning, testing and using them) in your environment. -The application has the following components: +- An active OCI account with appropriate permissions +- OCI CLI installed and configured on your local machine +- NVIDIA NGC API key (from [NGC](https://ngc.nvidia.com)) to access NVIDIA's container registry +- Helm (version 3.x) installed on your local machine for deploying Kubernetes applications +- Access to the [`nim-deploy`](https://github.com/NVIDIA/nim-deploy) GitHub repo for reference materials -- Oracle Container Engine for Kubernetes (OKE) -- NVIDIA NIM for inferencing LLMs -- Python code to invoke these inference endpoints +--- -This is an illustration of how NIM works whenever we invoke a supported model on the NVIDIA NGC Catalog: +### 🛡️ IAM Policy Requirements -![ngc architecture](./img/ngc_arch.PNG) +The deployment requires specific OCI Identity and Access Management (IAM) permissions. Ensure your user/group has the following permissions (either directly or via dynamic groups): -[You can watch the solution's video here.]() +```text +Allow group to manage instance-family in compartment +Allow group to manage cluster-family in compartment +Allow group to manage virtual-network-family in compartment +Allow group to use subnets in compartment +Allow group to manage secret-family in compartment +Allow group to use instance-configurations in compartment +``` + +You can assign these permissions through OCI IAM policies or by using predefined roles like "OKE Cluster Administrator" combined with "Network Administrator" and "Compute Instance Administrator" for your compartment. + +--- + +## 🧱 Infrastructure Setup + +This section covers the steps to prepare your OCI infrastructure for running NIM. Oracle Cloud offers various GPU options that provide the compute power needed for efficient AI model inference. + +### 1. Create a Virtual Cloud Network (VCN) +**Setting up the network foundation for your OKE cluster** -## 0. 
Prerequisites and docs +First, set up the networking infrastructure to support your OKE cluster: -### Prerequisites +- Public Subnet: For OKE worker nodes to allow management access +- Private Subnet (optional): For internal services that don't need direct internet access +- NAT Gateway or Internet Gateway (if using public IPs): For outbound internet connectivity +- Ensure ports `443` and `8000` are open in your NSG or security list for specific trusted IP ranges -- An Oracle Cloud Infrastructure (OCI) Account -- An [NVIDIA AI Enterprise](https://www.nvidia.com/en-us/data-center/products/ai-enterprise/) License -- Access to some NVIDIA **A10** or **A100** Tensor Core GPUs on your OCI tenancy -- An Operating System with `x86_64` architecture compatible with the **NVIDIA Container toolkit**. Find [the available list here.](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/supported-platforms.html) -- CUDA Drivers installed -- Docker installed +These network components establish the foundation for your cluster's connectivity. -### Docs +When configuring security lists or network security groups, use restricted CIDR blocks instead of opening to all IPs: -- [NVIDIA NIM Introduction](https://docs.nvidia.com/nim/large-language-models/latest/introduction.html) -- [llama-3-8b on NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nim/teams/meta/containers/llama3-8b-instruct) -- [Installing the NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-the-nvidia-container-toolkit) -- [NIM for LLMs - Prerequisites](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html#ngc-authentication) -- [Arto Bendiken - Alpine CUDA](https://arto.s3.amazonaws.com/notes/cuda) -- [Pulling images from a Private Registry - Kubernetes](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/) -- [OKE Node Resizing for very large container images](https://blogs.oracle.com/ateam/post/oke-node-sizing-for-very-large-container-images) -- [Kubernetes - Setting resource quotas on namespaces](https://kubernetes.io/docs/concepts/policy/resource-quotas/) -- [OCI - cloud-init scripts for OKE](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengusingcustomcloudinitscripts.htm) -- [Kubernetes - Unexpected EOF](https://stackoverflow.com/questions/53677592/docker-pull-unexpected-eof) -- [OCI Compute - Microk8s installation for local compute](https://microk8s.io/docs/getting-started) -- [OCI Compute - Microk8s official repository](https://github.com/canonical/microk8s) -- [nim-deploy official repository](https://github.com/NVIDIA/nim-deploy) -- [Launching a NIM with a minimal configuration](https://github.com/NVIDIA/nim-deploy/tree/main/helm) -- [NIM LLMs - Getting started](https://docs.nvidia.com/nim/large-language-models/latest/getting-started.html) -- [NVIDIA NGC CLI - Getting started](https://docs.ngc.nvidia.com/cli/cmd.html) -- [Running Docker inside a container](https://stackoverflow.com/questions/76150514/running-docker-inside-a-container) +```bash +# Example of restricting access to specific trusted IPs or corporate network +oci network security-list update \ + --security-list-id \ + --ingress-security-rules '[ + { + "protocol":"6", + "source":"10.0.0.0/8", + "tcpOptions":{"destinationPortRange":{"min":8000,"max":8000}}, + "isStateless":false + }, + { + "protocol":"6", + "source":"192.168.0.0/16", + "tcpOptions":{"destinationPortRange":{"min":443,"max":443}}, + 
"isStateless":false + } + ]' +``` -## 1. Set up OKE Instance on OCI +Replace the CIDR blocks (`10.0.0.0/8`, `192.168.0.0/16`) with your specific corporate network ranges or trusted IP addresses. -First, let's create an OKE instance in our OCI tenancy. To create an OKE Cluster, we can perform this step through the OCI Console: +--- -![k8s creation](./img/creation_1.PNG) +### 2. Create a NAT Gateway (Recommended) +**Enabling secure outbound internet access for private resources** -![k8s creation](./img/creation_2.PNG) +A NAT Gateway provides outbound internet access for resources in private subnets while maintaining security: -![k8s creation](./img/creation_3.PNG) +```bash +oci network route-rule add --route-table-id \ + --destination 0.0.0.0/0 \ + --network-entity-id +``` -> Note you can specify whichever GPU shape available in OCI, each node will be created with the selected shape (you can have multiple OKE nodes in the cluster, so be mindful of resource spending.) +This allows your cluster nodes to download containers and model weights while maintaining a secure network posture. -Make sure you select a custom boot volume size. For this solution, this is especially important. OKE nodes in a node pool use their boot volume for pod storage. The default size for the boot volume is `46.6GB`. On a typical, Oracle Linux 8 based node, the root filesystem has a capacity around 38GB, and available space for pod images of around 12GB. This, in our case, is not enough, as we'll be operating with Large Language Models, which typically are comprised of several files which can amount up to 100GB-500GB in some extreme cases. Therefore, we need to select a custom boot size during creation: +--- -![custom boot size](./img/custom_volume_size.PNG) +### 3. Setup Internet Gateway (Alternative) +**Providing direct internet connectivity for public-facing resources** -Also, since we want to change the original boot volume size into something bigger, and we're working with an Oracle Linux Operating System, we shall add an SSH key in advanced options, which will allow us to manually debug and connect to the nodes in our node pool: +If you prefer to use public subnets with direct internet access, you can set up an Internet Gateway instead: -![ssh key during creation](./img/custom_ssh_key.PNG) +```bash +# Create Internet Gateway +oci network internet-gateway create \ + --compartment-id \ + --vcn-id \ + --is-enabled true \ + --display-name "NIM-InternetGateway" + +# Get the Internet Gateway OCID +INTERNET_GATEWAY_OCID=$(oci network internet-gateway list \ + --compartment-id \ + --vcn-id \ + --query "data[?contains(\"display-name\",'NIM-InternetGateway')].id" \ + --raw-output) + +# Add route rule to the public subnet's route table +oci network route-table update \ + --rt-id \ + --route-rules '[{"destination": "0.0.0.0/0", "destinationType": "CIDR_BLOCK", "networkEntityId": "'$INTERNET_GATEWAY_OCID'"}]' +``` -And wait for the creation of the cluster, it'll take around 5 minutes. +Using an Internet Gateway provides: +- Direct inbound and outbound internet connectivity +- Simplifies access to external resources +- Eliminates the need for proxies in many cases +- Useful for development environments or when security requirements allow direct connectivity -> You will be able to access this cluster however you want. It's recommended to use OCI Cloud Shell to access and connect to the cluster, as all OCI configuration is performed automatically. 
If you still want to use a Compute Instance or your own local machine, you will need to set up authentication to your OCI tenancy. Also, you must have downloaded and installed `OCI CLI version 2.24.0` (or later) and configured it for use. If your version of the OCI CLI is earlier than version `2.24.0`, download and install a newer version from here. +However, Internet Gateways expose your nodes to the public internet, so ensure proper security groups and network security lists are configured with restricted CIDR blocks. -## 2. Expand node boot volume sizes with cloud-init +--- -Assuming a typical, Oracle Linux, node instance, what we did during the last step will adjust the boot volume size. +### 4. Create an Instance Configuration +**Defining the VM template for your GPU nodes** -However, the file system will not automatically grow to fill the additional space. In order to fill the additional space, let's change the init script to add a `growfs` command. The default init script for a Oracle Linux node will look something like the following: +This step creates a template that defines the hardware and software configuration for your GPU instances, including shape, image, network settings, and SSH access: ```bash -#!/bin/bash -curl --fail -H "Authorization: Bearer Oracle" -L0 http://169.254.169.254/opc/v2/instance/metadata/oke_init_script | base64 --decode >/var/run/oke-init.sh -bash /var/run/oke-init.sh +oci compute-management instance-configuration create \ +--compartment-id \ +--instance-details '{ + "instanceType": "compute", + "launchDetails": { + "availabilityDomain": "", + "compartmentId": "", + "shape": "BM.GPU.A100-v2.8", + "sourceDetails": { + "sourceType": "image", + "imageId": "" + }, + "metadata": { + "ssh_authorized_keys": "" + }, + "createVnicDetails": { + "subnetId": "", + "assignPublicIp": true + } + } +}' \ +--profile OCI ``` -We need to append a line to this script: +--- + +### 5. Create the Cluster Network with GPU Nodes +**Creating your compute cluster with GPU resources in one step** + +This step creates a cluster network with the specified GPU nodes, which will be used to run your NIM deployment: ```bash -sudo /usr/libexec/oci-growfs -y +oci compute-management cluster-network create \ +--compartment-id \ +--instance-pools file://instance_pools.json \ +--placement-configuration file://placement_config.json \ +--display-name "A100-Cluster-NIM" \ +--profile OCI ``` -which will grow the file system to the size we specified during creation. You can do this by editing the node pool itself, after the cluster has been created: +#### 📄 `instance_pools.json` + +```json +[ + { + "instanceConfigurationId": "", + "size": 1, + "displayName": "NIM-Pool", + "availabilityDomain": "", + "faultDomain": "FAULT-DOMAIN-1" + } +] +``` -![updating node pool](./img/cloud-init.PNG) +#### 📄 `placement_config.json` -If the nodes are already running before you set the updated init script, simply **cycle** the nodes to get new ones to run the init script. If you don't want to use the init script to run oci-growfs, we can also SSH into each node in the node pool - using the previously inserted SSH key - and insert it manually to run prior to kubelet initialization. +```json +{ + "availabilityDomain": "", + "placementConstraint": "PACKED_DISTRIBUTION_MULTI_BLOCK", + "primaryVnicSubnets": { + "subnetId": "" + } +} +``` -## 3. Access OKE cluster +This creates GPU-equipped nodes in a cluster configuration. 
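+
+Before moving on, it can help to confirm that the cluster network actually reached a running state. The snippet below is a minimal check, assuming the `cluster-network get` subcommand of the same OCI CLI used above and a placeholder `<cluster-network-ocid>`:
+
+```bash
+# Check the cluster network's lifecycle state (re-run until it reports RUNNING)
+oci compute-management cluster-network get \
+  --cluster-network-id <cluster-network-ocid> \
+  --query 'data."lifecycle-state"' \
+  --raw-output \
+  --profile OCI
+```
+
+Once this returns `RUNNING`, the GPU nodes are being provisioned and will be available to the cluster in the following steps.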
-After the cluster has been provisioned, to get access into the OKE cluster, follow these steps: +--- -1. Click Access Cluster on the `Cluster details` page: +## 6. Connect to Your Cluster +**Streamlining authentication with direct kubectl configuration** - ![cluster1](img/AccessCluster.png) +There are two ways to connect to your OKE cluster: -2. Accept the default Cloud Shell Access and click Copy to copy the `oci ce cluster create-kubeconfig ...` command. +### Option A: Direct kubectl Configuration (Recommended) +**Setting up persistent access to your cluster** -3. To access the cluster, paste the command into your Cloud Shell session and hit Enter. +For a more seamless experience that allows using `kubectl` directly: -4. Verify that the `kubectl` is working by using the `get nodes` command: +```bash +# Configure kubectl with OCI authentication +oci ce cluster create-kubeconfig --cluster-id --file $HOME/.kube/config --region --token-version 2.0.0 --kube-endpoint PUBLIC_ENDPOINT --profile oci --auth security_token +``` - ```bash - kubectl get nodes - ``` +This command: +- Creates a persistent kubeconfig file with token-based authentication +- Adds the necessary authentication parameters to the kubeconfig +- Sets up secure access using your OCI security token +- Allows direct use of standard kubectl commands + +> **Note on OCI Authentication:** The OCI security token has a maximum lifetime of 60 minutes. When your token expires (usually after closing your laptop or leaving it idle for too long), you will need to re-authenticate. +> +> To refresh your authentication: +> +> **Quick refresh command:** +> +> ```bash +> oci session authenticate --profile oci && oci ce cluster create-kubeconfig --cluster-id --file $HOME/.kube/config --region --token-version 2.0.0 --kube-endpoint PUBLIC_ENDPOINT --profile oci --auth security_token +> ``` +> +> 1. Run `oci session authenticate --profile oci` +> 2. Then recreate the kubeconfig with the same command as above +> +> Unfortunately, OCI does not support tokens with lifetimes longer than 60 minutes, so periodic re-authentication is required. + + +Test your connection to verify it works: - ![get nodes](./img/get_nodes.PNG) +```bash +kubectl get nodes +``` -5. Repeat this command multiple times until all nodes show `Ready` in the `STATUS` column: +**Expected Output:** +``` +NAME STATUS ROLES AGE VERSION +10.0.10.12 Ready node 12h v1.32.1 +10.0.10.40 Ready node 17h v1.32.1 +``` - When all nodes are `Ready`, the OKE installation has finished successfully. +You should see a list of your cluster's nodes. This confirms that the configuration is working correctly and you have direct access to your cluster. -## 4. Authenticate with NVIDIA NGC +### Option B: Wrapper Script (Legacy Method) +**Using a helper script for temporary access** -![nvidia nim](./img/nvidia_nim.PNG) +If you prefer using a wrapper script (not recommended for ongoing use): -1. Now, we need to authenticate against **NVIDIA NGC**, a portal of services that allows us to download and use LLMs and other types of AI solutions (basically, this is like an LLM and AI catalog, and all related resources (like Helm charts to automatically deploy these)). 
An **NGC API key** is required to access NGC resources and a key can be generated [in this URL.](https://org.ngc.nvidia.com/setup/personal-keys): +```bash +# Create a wrapper script for kubectl with OCI authentication +cat > oke-connect.sh << 'EOF' +#!/bin/bash - ![creating API key](./img/api_key.PNG) +# Generate token +oci ce cluster generate-token \ + --cluster-id \ + --region \ + --profile oci \ + --auth security_token > /tmp/k8s_token.json - > It is required that permission `NGC Catalog` is enabled on the key. +# Extract token +TOKEN=$(cat /tmp/k8s_token.json | grep -o '"token": "[^"]*' | cut -d'"' -f4) -2. Let's make note of this API as we will use it in the next step, and put it into our **OCI Cloud Shell**, in order to automatically authenticate with NVIDIA's Container Registry (`nvcr`): +# Use token with kubectl +kubectl --token=$TOKEN "$@" +EOF - ```bash - export NGC_API_KEY= - ``` +# Make the script executable +chmod +x oke-connect.sh +``` -3. Run the following command to make the environment variable available at startup, every time we launch our **OCI Cloud Shell** instance: +Test with: +```bash +./oke-connect.sh get nodes +``` - ```bash - echo "export NGC_API_KEY=" >> /home//.bashrc - ``` +## 7. Set Up the NIM Namespace and NGC API Key +**Creating a dedicated namespace and securing your NGC credentials** - This command will append this line into the end of the `.bashrc` file. +Create a dedicated namespace for NIM and store your NGC API key as a Kubernetes secret: -4. Authenticate against `nvcr.io` by running the following command (make sure your environment variable is set on the previous step): +```bash +# Create namespace +kubectl create namespace nim +``` - ```bash - echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin - ``` +**Expected Output:** +``` +namespace/nim created +``` -Now that our OKE environment has been created, and we're correctly authenticated on NVIDIA's Container Registry, we can deploy the inference server and many other things! +```bash +# Get your NGC API key +NGC_API_KEY= + +# Create a secret for pulling images from NGC +kubectl create secret docker-registry ngc-registry \ + --docker-server=nvcr.io \ + --docker-username='$oauthtoken' \ + --docker-password=$NGC_API_KEY \ + -n nim +``` -## 5. Deploy the inference server +**Expected Output:** +``` +secret/ngc-registry created +``` -1. Before we begin with all our K8s resources being created, we shall create a new namespace for our K8s resources: +This isolates your NIM deployment from other applications in the cluster and securely stores your NGC API key, which is needed to pull NVIDIA's container images. - ```bash - kubectl create namespace nim # we create a new namespace for NIM-associated resources - ``` +## 8. Install Node Feature Discovery (NFD) (Optional but recommended for an easier setup) +**Enabling Kubernetes to identify GPU-equipped nodes** -In this repository, we offer three ways to deploy inference: with Helm, with Kserve, or with Kubernetes directly. Check out their respective directories, `helm/`, `kserve/` and `pod` (for Kubernetes). +NFD is a critical component that allows Kubernetes to identify and label nodes with their hardware capabilities, particularly GPUs: -### (Recommended) Deploy with Kubernetes with official `nvcr.io` image +```bash +# Add the NFD Helm repository +helm repo add nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts +``` -1. First, we need to make sure we have access to pull from `nvcr.io`. 
Note that in file `pod/llama3-pod.yaml`, we reference a secret called `registry-secret`. Let's create the secret with our credentials so Kubernetes knows we have permission to pull the image from `nvcr`: +**Expected Output:** +``` +"nfd" has been added to your repositories +``` - ```bash - kubectl -n nim create secret docker-registry registry-secret --docker-server=nvcr.io --docker-username='$oauthtoken' --docker-password=$NGC_API_KEY - ``` +```bash +helm repo update +``` - > More information on other methods to create a secret [here.](https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/) +**Expected Output:** +``` +Hang tight while we grab the latest from your chart repositories... +...Successfully got an update from the "nfd" chart repository +Update Complete. ⎈Happy Helming!⎈ +``` -2. From the file `pod/llama3-pod.yaml`, let's create the pod with K8s: +```bash +# Install NFD using Helm +helm install nfd nfd/node-feature-discovery --namespace kube-system +``` - ```bash - # make sure you are on the repository root directory - kubectl create -f pod/llama3-pod.yaml -n nim - ``` +**Expected Output:** +``` +NAME: nfd +LAST DEPLOYED: Thu Jun 4 12:23:45 2023 +NAMESPACE: kube-system +STATUS: deployed +REVISION: 1 +TEST SUITE: None +NOTES: +The Node Feature Discovery has been installed. Check its status by running: + kubectl --namespace kube-system get pods -l "app.kubernetes.io/instance=nfd" +``` - Wait until it's running. To check the status of the pod, you can run the following command: +```bash +# Verify NFD is running +kubectl get pods -n kube-system | grep nfd +``` - ```bash - kubectl get pods -n nim - ``` +**Expected Output:** +``` +nfd-master-85b844d55-zxj7p 1/1 Running 0 3m22s +nfd-worker-4hk8f 1/1 Running 0 3m22s +nfd-worker-6zrwj 1/1 Running 0 3m22s +nfd-worker-nczl8 1/1 Running 0 3m22s +``` - > Note that `pod/llama3-pod.yaml` needs to have privileged mode in the security context, when you're trying to run a docker image inside a Kubernetes pod, as its docs instruct. +NFD automatically detects hardware capabilities—including NVIDIA GPUs—and labels nodes accordingly. While Kubernetes can still detect GPUs without NFD, these labels are essential for automated scheduling. Without NFD, the NIM Operator cannot use label-based node selection, and you must manually configure scheduling (e.g., using node selectors or tolerations). -3. Wait until it's running, then run `sh` inside it: - ```bash - kubectl exec -it docker sh -n nim - ``` -4. Once connected, you will be able to make requests as described in the next chapter. +## 9. Install NVIDIA NIM Operator +**Deploying the custom resource controller for NIM services** -### Deploy with Kubernetes with fresh `ubuntu` image +The NIM Operator manages the lifecycle of NIM services in your cluster: -1. From the file `pod/ubuntu-pod.yaml`, let's create the pod with K8s: +```bash +# Add NVIDIA Helm repository +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +``` + +**Expected Output:** +``` +"nvidia" has been added to your repositories +``` - ```bash - # make sure you are on the repository root directory - kubectl create -f pod/ubuntu-pod.yaml -n nim - ``` +```bash +helm repo update +``` -2. Wait until it's running, then run `bash` inside it: +**Expected Output:** +``` +Hang tight while we grab the latest from your chart repositories... +...Successfully got an update from the "nvidia" chart repository +Update Complete. 
⎈Happy Helming!⎈ +``` - ```bash - kubectl exec -it docker bash -n nim - ``` +```bash +# Install NVIDIA NIM Operator +helm install --namespace nim nvidia-nim-operator nvidia/k8s-nim-operator +``` -3. Once connected, we need to install required dependencies. For this, we have prepared `scripts/fresh_ubuntu_install.sh` with all required steps. You need to modify this file to include your NVIDIA NGC API key: +**Expected Output:** +``` +NAME: nvidia-nim-operator +LAST DEPLOYED: Thu Jun 4 12:34:56 2023 +NAMESPACE: nim +STATUS: deployed +REVISION: 1 +TEST SUITE: None +NOTES: +The NVIDIA NIM Operator has been installed. Check its status by running: + kubectl --namespace nim get pods -l "app.kubernetes.io/instance=nvidia-nim-operator" +``` - ```bash - touch fresh_ubuntu_install.sh # create empty file first - # paste the contents of the original script - # change line 8, from: - export NGC_API_KEY= - # replace it with your API key and save the file. - ``` +This operator extends Kubernetes with custom resources for NIM deployments, making it easier to manage model deployments and their configurations. -4. Change permissions to be able to run the shell script: +## 10. Enable Internet Access via Proxy (Optional) +**Deploying a proxy solution for restricted network environments** - ```bash - chmod a+x fresh_ubuntu_install.sh - ``` +In enterprise environments, OKE clusters often lack direct internet access. If needed, set up a proxy to allow model downloads: -5. Run the installation script: +```bash +# Navigate to the Helm chart directory +cd nim-deploy/helm - ```bash - bash fresh_ubuntu_install.sh - ``` +# Install Squid proxy using Helm +helm install squid-proxy ./squid-proxy --namespace nim +``` -6. After this, a Docker container will be created from the image, and you'll be able to make local requests to the Kubernetes node's public IP address. We will explain inference further in the next chapter. +**Expected Output:** +``` +NAME: squid-proxy +LAST DEPLOYED: Thu Jun 4 12:45:12 2023 +NAMESPACE: nim +STATUS: deployed +REVISION: 1 +TEST SUITE: None +NOTES: +The Squid proxy has been deployed to your cluster. +``` -We have also included an initial script for you to install with the official `docker` and `alpine` images, which are very popular Docker images. However, at the time of writing, NVIDIA Container Toolkit (or NVIDIA drivers as a whole) don't officially support `musl`-based Operating Systems, as their drivers are compiled with `glibc`; while Alpine uses `musl-libc`, part of the reason why the Operating System is so lightweight. If you're particularly interested in these Docker images, check out [this script](scripts/alpine_cuda.txt) which illustrates all steps required to manually compile Alpine-CUDA. +This deploys a Squid proxy in your cluster that uses hostNetwork to bypass network restrictions. The NIM services will be configured to use this proxy for downloading model files from NVIDIA's servers. -### Deploy with Helm +## 11. Deploy LLaMA 3-8B Model Using Helm +**Installing and configuring the LLaMA model with persistent storage** -To perform the deployment of our inference server, we will use `Helm`, a solution that allows us to manage and install Kubernetes applications with configuration files (it's like the Terraform equivalent of Cloud deployments). We will use NVIDIA's official `nim-deploy` repository to perform this deployment. +Now it's time to deploy the actual LLaMA 3-8B model using Helm: -1. 
We can launch `llama3-8b-instruct` using a default configuration while only setting the NGC API key and persistence in one line with no extra files. For this, set `persistence.enabled` to `true` to ensure that permissions are set correctly and the container runtime filesystem isn't filled by downloading models: +```bash +# Add NVIDIA Helm repository if not already added +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +helm repo update +``` - ```bash - # clone the repository - git clone https://github.com/NVIDIA/nim-deploy.git - # cd into helm charts directory - cd helm/ - helm --namespace nim install my-nim nim-llm/ --set model.ngcAPIKey=$NGC_API_KEY --set persistence.enabled=true - ``` +**Expected Output:** +``` +"nvidia" already exists with the same configuration, skipping +Hang tight while we grab the latest from your chart repositories... +...Successfully got an update from the "nvidia" chart repository +Update Complete. ⎈Happy Helming!⎈ +``` - You can modify the file `helm/nim-llm/values.yaml` to select which model you want to deploy, and many other options (whether you want persistent volumes or you're fine with `ephemeral-storage`, etc.). I have included my own `values.yaml` file as a guidance for those of you trying to replicate. +```bash +# Install the NIM service using Helm with the provided values.yaml +helm --namespace nim install llama3-8b nvidia/nim-llm -f values.yaml +``` - > Note we're referencing the environment variable `$NGC_API_KEY` which we set on step 1.6. Make sure the variable is set on your environment before running this installation., +**Expected Output:** +``` +NAME: llama3-8b +LAST DEPLOYED: Thu Jun 4 13:01:23 2023 +NAMESPACE: nim +STATUS: deployed +REVISION: 1 +TEST SUITE: None +NOTES: +The LLaMA 3-8B model has been deployed to your cluster. +It might take several minutes for the model to download and initialize. +Check the status with: kubectl --namespace nim get pods -l "app=llama3-8b" +``` -2. To learn more about this installation (and what got installed where, like the persistence volumes automatically created...), run: +> **Note:** You'll need to create a `values.yaml` file with appropriate configuration for your deployment. A sample `values.yaml` file is provided below: +> +> ```yaml +> # Sample values.yaml for LLaMA 3-8B deployment +> image: +> repository: nvcr.io/nim/meta/llama3-8b-instruct +> tag: "1.0.0" +> +> resources: +> limits: +> nvidia.com/gpu: 1 +> requests: +> nvidia.com/gpu: 1 +> memory: "16Gi" +> cpu: "4" +> +> persistence: +> size: "50Gi" +> storageClass: "oci-bv" +> +> service: +> type: LoadBalancer +> port: 8000 +> ``` +> +> Adjust the resource requests and limits according to your specific GPU type and model requirements. + +The `values.yaml` file includes: +- Model configuration (LLaMA 3-8B by default) +- GPU resource allocation based on your selected GPU type +- Persistent storage for model files +- Health probes for monitoring +- Service exposure via LoadBalancer + +The persistence configuration is critical to ensure that model weights are stored persistently. For larger models like LLaMA 3-70B, increase the size to at least 100Gi or more. + +The deployment will take several minutes as it downloads the model weights and initializes the service. + +## 12. Monitor Deployment Status +**Verifying the successful deployment of your model** + +Monitor the deployment to ensure everything is running correctly: - ```bash - helm status my-nim - helm get all my-nim - ``` +```bash +# Check the pods +kubectl get pods -n nim +``` -3. 
You can check the status of all pods in the `nim` namespace, to check when it's ready: +**Expected Output (initial state):** +``` +NAME READY STATUS RESTARTS AGE +llama3-8b-76c9f6b5f-8x4jz 0/1 ContainerCreating 0 2m15s +``` - ```bash - kubectl get pods -n nim - ``` +**Expected Output (after model download):** +``` +NAME READY STATUS RESTARTS AGE +llama3-8b-76c9f6b5f-8x4jz 1/1 Running 0 12m45s +``` - > Wait until the status changes to `READY`. +```bash +# Check the services +kubectl get svc -n nim +``` -4. Use `kubectl` to see the status of this Helm deployment, and wait until the inference server pods are running (the first pull might take a few minutes). Once the container is created, loading the model also takes a few minutes. You can monitor the pod with these commands: +**Expected Output:** +``` +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +llama3-8b LoadBalancer 10.96.157.218 8000:30450/TCP 15m +``` -5. You can check the specific logs of pods, and debug them, with the following commands: +```bash +# Check deployment status +kubectl describe pod -n nim -l app=llama3-8b +``` - ```bash - kubectl describe pods - kubectl logs - ``` +**Expected Output (abbreviated):** +``` +Name: llama3-8b-76c9f6b5f-8x4jz +Namespace: nim +Priority: 0 +Node: 10.0.10.17/10.0.10.17 +Start Time: Thu, 04 Jun 2023 13:01:45 -0500 +... +Status: Running +... +Conditions: + Type Status + Initialized True + Ready True + ContainersReady True + PodScheduled True +... +Events: + Type Reason Age Message + ---- ------ ---- ------- + Normal Scheduled 14m Successfully assigned nim/llama3-8b-76c9f6b5f-8x4jz to 10.0.10.17 + Normal Pulling 14m Pulling image "nvcr.io/nim/meta/llama3-8b-instruct:1.0.0" + Normal Pulled 12m Successfully pulled image "nvcr.io/nim/meta/llama3-8b-instruct:1.0.0" + Normal Created 12m Created container llama3-8b + Normal Started 12m Started container llama3-8b +``` -Once it's ready, we can begin with **inference** (making requests to the model). +These commands help you verify that: +1. The pod is running correctly +2. The service has been created with the correct configuration +3. There are no errors in the deployment -## 6. Run Inference +The pod may initially show a status of "ContainerCreating" as it downloads the large model files. -1. We can check the logs by running the following command: +## 13. Accessing the Model via LoadBalancer +**Establishing external access for production use** - ```bash - helm -n nim test my-nim --logs - ``` +The LoadBalancer service provides a stable, externally accessible endpoint for your model: - > This will run some simple inference requests. If the three tests pass, you'll know the deployment was successful. Avoid setting up external ingress without adding an authentication layer. This is because NIM doesn't provide authentication on its own. The chart provides options for basic ingress. +```bash +# Get the LoadBalancer service details +kubectl get svc -n nim +``` -2. To test the inference server on OKE, we need to set up port forwarding on the service (or the pod), so we can try it from an external IP address (outside of the K8s node) and still be able to access the exposed port on the node: +**Expected Output:** +``` +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +llama3-8b LoadBalancer 10.96.157.218 8000:30450/TCP 15m +``` - ```bash - kubectl -n nim port-forward service/my-nim-nim-llm 8000:8000 - ``` +Look for the `llama3-8b` service with a TYPE of `LoadBalancer`. The `EXTERNAL-IP` column will show the assigned IP address. 
Once available, you can access the model directly through this IP: -3. Let's make a request to our LLM using `curl`: +```bash +# Test the health endpoint +curl http://:8000/v1/health/ready +``` - ```bash - Then try a request: +**Expected Output:** +``` +{"status":"ready"} +``` - curl -X 'POST' \ - 'http://localhost:8000/v1/chat/completions' \ - -H 'accept: application/json' \ - -H 'Content-Type: application/json' \ - -d '{ +```bash +# Test a chat completion +curl -X POST http://:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ "messages": [ - { - "content": "You are a polite and respectful chatbot helping people plan a vacation.", - "role": "system" - }, - { - "content": "What should I do for a 4 day vacation in Spain?", - "role": "user" - } + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, tell me briefly about NVIDIA."} ], "model": "meta/llama3-8b-instruct", - "max_tokens": 16, - "top_p": 1, - "n": 1, - "stream": false, - "stop": "\n", - "frequency_penalty": 0.0 - }' - ``` + "max_tokens": 150 + }' +``` - ![inference](./img/inference.PNG) +**Expected Output (abbreviated):** +```json +{ + "id": "cdee0dec-a12c-4de3-9061-285def95f4b1", + "object": "chat.completion", + "created": 1717710883, + "model": "meta/llama3-8b-instruct", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "NVIDIA is a leading technology company specializing in designing and manufacturing graphics processing units (GPUs) and artificial intelligence systems. Founded in 1993, NVIDIA initially focused on producing GPUs for gaming but has since expanded into various fields including data centers, autonomous vehicles, robotics, and AI computing. The company's innovations have been crucial for advancements in deep learning, scientific computing, and visual computing applications." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 28, + "completion_tokens": 84, + "total_tokens": 112 + } +} +``` -4. We have also prepared a Python script called `scripts/invoke_llama_3.py` which uses the *OpenAI completions* module to communicate with the model and allows us to automate querying the LLM and receiving responses programatically. To run this script, you will need to create an environment, activate it, install the requirements in `scripts/requirements.txt` and run the application. +The LoadBalancer provides several benefits for production use: +1. Manages traffic distribution to your pods +2. Provides a consistent access point +3. Handles pod failures and restarts transparently +4. Offers better stability than port forwarding - ![openai completions](./img/completions_openai.PNG) +Note that it may take several minutes for the LoadBalancer to provision and for the external IP to become accessible. If the readiness probe is failing, the LoadBalancer might not route traffic to the pod until it's ready. -5. If you don't have a virtual environment, create a new one (you can do this with `conda` too): +## 14. Alternative: Test the Model via Port Forwarding +**Creating a secure tunnel to access your model during development** - ```bash - python3 -m venv .demo - ``` +> **Note:** This alternative method is only needed if your LoadBalancer is not yet provisioned or if you're working in an environment where LoadBalancer services aren't available. -6. 
Activate the virtual environment we just created: +You can test the model locally using port forwarding, which creates a secure tunnel between your local machine and the pod: - ```bash - source .demo/bin/activate - ``` +```bash +# Set up port forwarding +kubectl port-forward -n nim pod/$(kubectl get pods -n nim -l app=llama3-8b -o jsonpath='{.items[0].metadata.name}') 8000:8000 & +``` -7. Install Python requirements into the environment: +**Expected Output:** +``` +Forwarding from 127.0.0.1:8000 -> 8000 +Forwarding from [::1]:8000 -> 8000 +``` - ```bash - pip install -r scripts/requirements.txt - ``` +```bash +# Test the health endpoint +curl http://localhost:8000/v1/health/ready +``` -8. Run the Python script: +**Expected Output:** +``` +{"status":"ready"} +``` - ```bash - python scripts/invoke_llama_3.py - ``` +```bash +# Test a chat completion +curl -X POST http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "messages": [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "Hello, tell me about NVIDIA."} + ], + "model": "meta/llama3-8b-instruct" + }' +``` + +**Expected Output (abbreviated):** +```json +{ + "id": "a57b41f6-8321-4e7c-9cb3-4851df7a6d22", + "object": "chat.completion", + "created": 1717710985, + "model": "meta/llama3-8b-instruct", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "NVIDIA is a technology company that specializes in designing and manufacturing graphics processing units (GPUs) and other computing hardware. Founded in 1993, NVIDIA initially focused on creating graphics cards for gaming and professional visualization. Over time, they've expanded their focus to include artificial intelligence, high-performance computing, autonomous vehicles, and robotics.\n\nSome key aspects of NVIDIA:\n\n1. GPU Technology: NVIDIA is best known for their GPUs, which were originally designed for rendering graphics but have become essential for parallel processing tasks like AI training and inference.\n\n2. CUDA Platform: They developed CUDA, a parallel computing platform that allows developers to use NVIDIA GPUs for general-purpose processing.\n\n3. Data Center Solutions: NVIDIA provides hardware and software for data centers, including their DGX systems for AI research and HGX platforms for cloud computing.\n\n4. Autonomous Vehicles: Their DRIVE platform is used for developing self-driving car technology.\n\n5. Professional Visualization: Their Quadro/RTX line serves professionals in fields like design, animation, and scientific visualization.\n\nNVIDIA has become particularly important in the AI revolution, as their GPUs have proven essential for training and running large AI models." + }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 24, + "completion_tokens": 231, + "total_tokens": 255 + } +} +``` + +> **Troubleshooting Note:** Port forwarding can be unstable and may disconnect unexpectedly. If you experience "lost connection to pod" errors or "address already in use" messages, you may need to kill existing port-forward processes (`pkill -f "port-forward"`) before trying again. + +## Conclusion +**Summary of your NIM deployment on OKE** -> Make sure to change the IP address where requests are made if you're working with OKE. If you're using OCI Compute, make sure you choose the instance's public IP address or invoke locally. +You now have a working NIM deployment on OKE serving the LLaMA 3-8B model. 
The deployment is accessible both via port forwarding for development/testing purposes and through a LoadBalancer service for more permanent external access. -## 7. (Bonus) NGC CLI +This setup provides you with: +1. A scalable, production-ready LLM deployment +2. GPU-accelerated inference for fast responses +3. An OpenAI-compatible API for easy integration with applications +4. Persistent storage for model files -With NVIDIA GPU Cloud (NGC) command-line interface (CLI), you can perform many of the same operations that are available from the NGC website, such as running jobs, viewing Docker repositories and downloading AI models within your organization and team space. +Remember to regularly check your OCI authentication status if you encounter connection issues, as session tokens expire after a period of time. -To do this with the CLI, follow these steps: +## Infrastructure Creation Summary -1. Install the CLI from [this link](https://org.ngc.nvidia.com/setup/installers/cli), or [with this official script for Ubuntu](https://github.com/NVIDIA/nim-deploy/blob/main/helm/nim-llm/files/ngc_pull.sh). +In this guide, the infrastructure setup process follows these key steps: - > Note the mentioned installation script doesn't work on busybox or alpine Linux. +1. Create the networking components (VCN, subnets, gateways) +2. Define an instance configuration template for the GPU-equipped VMs +3. Create a cluster network that instantiates the GPU nodes based on the template +4. Deploy NIM services using Helm charts onto the cluster +5. Configure persistent storage for model weights -2. The first time that you use the CLI, you must set your configuration. Configure and authenticate with NVIDIA NGC: +--- - ```bash - ngc config set - ``` +## ✅ Deployment Checklist -3. Explore all possible options in the CLI by running the help command: +Ensure the following are complete before proceeding with inference: - ```bash - ngc -h - ``` +- [ ] **OKE cluster** is active and accessible +- [ ] **GPU node pool** (e.g., A100, L40S) is ready and healthy +- [ ] **NAT Gateway** or other outbound internet access is configured +- [ ] **NGC secret** is created in the `nim` namespace +- [ ] **Helm chart** deployed successfully (`helm list -n nim`) +- [ ] **NIM service** is reachable on `port 8000` -## 7. Conclusions +### Verification Steps -Using `LLaMA-3` is just one of many examples you can find on the *NVIDIA NGC* catalog. Check out all their solutions in [the official NGC website](https://catalog.ngc.nvidia.com), where you will be able to find: +Use these commands to verify your deployment is correctly configured: + +```bash +# Check if your OKE cluster is accessible +kubectl get nodes +``` + +```bash +# Verify GPU node labels are detected +kubectl describe nodes | grep nvidia.com/gpu +``` -- **Collections** of solutions, industry solutions and use cases from several publishers, in all AI fields, including video analytics, speech recognition and transcription, protein structure folding (bioengineering), and much more. -- A collection of **containers** for AI/ML, the metaverse and HPC-related workloads -- **Helm Charts** to easily deploy the deployment of software, like the NVIDIA GPU Operator, on Kubernetes clusters. -- **Pre-trained models** on Computer Vision, Speech recognition and transcription, NLP, Drug discovery, TTS... -- Lots of **resources** and documentation for you to get started. 
+```bash +# Confirm NGC secret exists +kubectl get secret -n nim ngc-registry +``` -Get started today developing on OCI and NVIDIA with these tools! +```bash +# Verify Helm chart is installed +helm list -n nim +``` -## Annex: OCI Compute: Install NVIDIA Container Toolkit +```bash +# Check NIM service health +SERVICE_IP=$(kubectl get svc -n nim llama3-8b -o jsonpath='{.status.loadBalancer.ingress[0].ip}') +curl http://${SERVICE_IP}:8000/v1/health/ready +``` -If you're planning on running any NIM solution directly into an OCI Compute instead of OKE, you will need to set up the NVIDIA Container Toolkit in your environment (wherever you're planning on launching these Docker images), as the images will run with the NVIDIA Docker runtime. +These verification steps should all return successful responses to confirm your deployment is ready. -1. Configure the production repository: +--- - ```bash - curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - ``` +## 🎮 GPU Compatibility -2. Update packages list from repository and install the NVIDIA Container Toolkit required packages: +### 🔢 Recommended GPU Shapes - ```bash - sudo apt-get update -y && sudo apt-get install -y nvidia-container-toolkit - ``` +| Model Size | Recommended Shapes | +|------------|--------------------| +| 8B | `BM.GPU.A10G.2`, `BM.GPU.L40.2` | +| 13B | `BM.GPU.L40.2`, `BM.GPU.A100-v2.8` | +| 30B | `BM.GPU.A100-v2.8`, `BM.GPU.H100.8` | +| 70B+ | `BM.GPU.H100.8`, `BM.GPU.H200.8` | -3. Configure Docker's container runtime and restart the Docker daemon: +### ✅ Supported OCI GPUs for NIM - ```bash - sudo nvidia-ctk runtime configure --runtime=docker - sudo systemctl restart docker - ``` +| GPU Model | Memory | Architecture | NIM Compatibility | Best For | +|--------------|----------|--------------|-------------------|--------------------------------| +| H200 | 141 GB | Hopper | ✅ Excellent | Max throughput, large models | +| H100 | 80 GB | Hopper | ✅ Excellent | 30B–70B models, production use | +| A100 | 80 GB | Ampere | ✅ Excellent | Most models, stable baseline | +| L40S | 48 GB | Lovelace | ✅ Good | Mid-size models (7B–30B) | +| A10G | 24 GB | Ampere | ✅ Limited | Small models (7B–13B) | - > The `nvidia-ctk` command modifies the /etc/docker/daemon.json file on the host. The file is updated so that Docker can use the NVIDIA Container Runtime. +--- -4. To ensure things are working, run the following command: - ```bash - docker run --rm --runtime=nvidia --gpus all ubuntu nvidia-smi - ``` +## 🚨 Troubleshooting - > This will produce an output for your own system, where you can confirm CUDA driver version, and available GPUs: +This section outlines the most common issues you might encounter when deploying or running NIM on OKE, along with actionable steps to resolve them. - ![similar output](./img/similar_output.PNG) +--- -5. Now that we have the proper runtime installed, we can either authenticate with our OKE cluster as described in chapter 3.1. However, if you have decided to only use OCI Compute for this solution and not Kubernetes, you can emulate a Kubernetes cluster with `microk8s`. 
+### 🔧 Common Problems & Fixes - To install microk8s, run: +#### 🔁 Pod in CrashLoopBackOff - ```bash - sudo snap install microk8s --classic - ``` +```bash +kubectl logs -n nim +``` - Join the `microk8s` group with your current user: +**Expected Output (for NGC API key issues):** +``` +Error: Failed to download model files: Authentication failed. Please check your NGC API key. +``` - ```bash - sudo usermod -a -G microk8s $USER - mkdir -p ~/.kube - chmod 0700 ~/.kube - ``` +**Possible causes:** - Restart your shell and check the status of microk8s: +* Invalid NGC API key +* No outbound internet access +* Insufficient GPU resources - ```bash - microk8s kubectl get nodes - ``` +--- + +#### 🌐 Hanging curl / No Response + +```bash +kubectl run curl-test -n nim --image=ghcr.io/curl/curlimages/curl:latest \ + -it --rm --restart=Never -- \ + curl https://api.ngc.nvidia.com +``` + +**Expected Output (successful connection):** +``` + +301 Moved Permanently + +

+<center><h1>301 Moved Permanently</h1></center>
+<hr><center>nginx</center>
+</body>
+</html>
+ + +``` + +**If this fails:** Outbound internet is blocked. Set up a NAT Gateway or configure proxy settings. + +--- + +#### 🎮 GPU Not Detected + +```bash +kubectl describe nodes | grep nvidia.com/gpu +``` + +**Expected Output:** +``` + nvidia.com/gpu: 8 + nvidia.com/gpu.memory: 81920M + nvidia.com/gpu.product: A100-SXM4-80GB +``` + +**Possible causes:** + +* Node Feature Discovery (NFD) not installed +* GPU drivers not properly installed on nodes +* Incorrect GPU shape configuration + +--- + +#### 🔐 Authentication Issues + +```bash +# Method 1: Session authentication +oci session authenticate +``` + +**Expected Output:** +``` +Enter a password or web browser will be opened to https://login.us-ashburn-1.oraclecloud.com/... +``` + +```bash +# Method 2: Validate existing session +oci session validate --profile oci +``` + +**Expected Output (valid session):** +``` +Session is valid +``` + +**Expected Output (expired session):** +``` +Session is invalid or expired +``` + +Session tokens typically expire after a few hours. Refresh if needed. + +--- + +#### 🌍 Internet Connectivity Problems + +```bash +# Test general internet connectivity from within the cluster +kubectl run test-connectivity --image=alpine -n nim --rm -it -- sh -c "apk add curl && curl -I https://ngc.nvidia.com" +``` + +**Expected Output:** +``` +HTTP/1.1 200 OK +Date: Mon, 01 Jun 2023 12:34:56 GMT +Server: nginx +Content-Type: text/html; charset=UTF-8 +Connection: keep-alive +Cache-Control: no-cache +``` -6. From here, now that `microk8s` is installed in your OCI Compute instance, proceed to **chapter 5** to continue with deploying your Docker images within a Kubernetes pod. +**Troubleshooting steps:** + +1. **If DNS resolution fails:** Check DNS server configuration in the cluster + ```bash + kubectl run dns-test -n nim --rm -it --image=alpine -- nslookup ngc.nvidia.com + ``` + +2. **If proxy is needed:** Configure HTTP_PROXY and HTTPS_PROXY environment variables in your pod spec + ```yaml + env: + - name: HTTP_PROXY + value: "http://proxy.example.com:8080" + - name: HTTPS_PROXY + value: "http://proxy.example.com:8080" + - name: NO_PROXY + value: "localhost,127.0.0.1,10.96.0.0/12,192.168.0.0/16" + ``` + +3. **If NAT Gateway isn't working:** Verify your route table configurations + ```bash + oci network route-table get --rt-id + ``` + +4. **For security group issues:** Ensure outbound traffic is allowed on ports 443 and 80 + ```bash + oci network security-list list --subnet-id + ``` ## Contributing diff --git a/nvidia-nim-oke/helm/Chart.yaml b/nvidia-nim-oke/helm/Chart.yaml deleted file mode 100644 index 58f256c..0000000 --- a/nvidia-nim-oke/helm/Chart.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: v2 -name: nim-llm -description: A Helm chart for NVIDIA NIM for LLMs -type: application -kubeVersion: ">=v1.23.0-0" -# This is the chart version. This version number should be incremented each time you make changes -# to the chart and its templates, including the app version. -# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.2.1 - -# This is the version number of the application being deployed. This version number should be -# incremented each time you make changes to the application. Versions are not expected to -# follow Semantic Versioning. They should reflect the version the application is using. -# It is recommended to use it with quotes. 
-appVersion: "1.0.0" \ No newline at end of file diff --git a/nvidia-nim-oke/helm/values.yaml b/nvidia-nim-oke/helm/values.yaml deleted file mode 100644 index a0511d3..0000000 --- a/nvidia-nim-oke/helm/values.yaml +++ /dev/null @@ -1,27 +0,0 @@ -image: - repository: nvcr.io/nim/meta/llama3-8b-instruct:latest - pullPolicy: IfNotPresent - model: llama3-8b-instruct - numGpus: 1 - -service: - type: ClusterIP - openaiPort: 8000 - annotations: {} - labels: {} - name: "" # override the default service name - # below options are deprecated - # http_port: 8000 # exposes http interface used in healthchecks to the service - # grpc_port: 8001 # exposes the triton grpc interface - # metrics_port: 8002 # expose metrics through the main service - # openai_port: 8005 - # nemo_port: 8006 - -mount: - name: /opt/nim/cache - -imageCredentials: - registry: nvcr.io - username: $oauthtoken - password: - email: \ No newline at end of file diff --git a/nvidia-nim-oke/helm/values_triton.yaml b/nvidia-nim-oke/helm/values_triton.yaml deleted file mode 100644 index d3a924b..0000000 --- a/nvidia-nim-oke/helm/values_triton.yaml +++ /dev/null @@ -1,17 +0,0 @@ -image: - imageName: nvcr.io/nvidia/tritonserver:latest - pullPolicy: IfNotPresent - model: llama3-8b-instruct - numGpus: 1 - -service: - type: LoadBalancer - -mount: - name: /opt/nim/cache - -imageCredentials: - registry: nvcr.io - username: $oauthtoken - password: - email: \ No newline at end of file diff --git a/nvidia-nim-oke/img/1.PNG b/nvidia-nim-oke/img/1.PNG deleted file mode 100644 index dadf0e0..0000000 Binary files a/nvidia-nim-oke/img/1.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/AccessCluster.png b/nvidia-nim-oke/img/AccessCluster.png deleted file mode 100644 index e515a36..0000000 Binary files a/nvidia-nim-oke/img/AccessCluster.png and /dev/null differ diff --git a/nvidia-nim-oke/img/api_key.PNG b/nvidia-nim-oke/img/api_key.PNG deleted file mode 100644 index 2673506..0000000 Binary files a/nvidia-nim-oke/img/api_key.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/cloud-init.PNG b/nvidia-nim-oke/img/cloud-init.PNG deleted file mode 100644 index 90a3e84..0000000 Binary files a/nvidia-nim-oke/img/cloud-init.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/completions_openai.PNG b/nvidia-nim-oke/img/completions_openai.PNG deleted file mode 100644 index f2cb0a0..0000000 Binary files a/nvidia-nim-oke/img/completions_openai.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/creation_1.PNG b/nvidia-nim-oke/img/creation_1.PNG deleted file mode 100644 index ab7c72c..0000000 Binary files a/nvidia-nim-oke/img/creation_1.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/creation_2.PNG b/nvidia-nim-oke/img/creation_2.PNG deleted file mode 100644 index ffb516b..0000000 Binary files a/nvidia-nim-oke/img/creation_2.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/creation_3.PNG b/nvidia-nim-oke/img/creation_3.PNG deleted file mode 100644 index b669151..0000000 Binary files a/nvidia-nim-oke/img/creation_3.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/custom_ssh_key.PNG b/nvidia-nim-oke/img/custom_ssh_key.PNG deleted file mode 100644 index 497f2a4..0000000 Binary files a/nvidia-nim-oke/img/custom_ssh_key.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/custom_volume_size.PNG b/nvidia-nim-oke/img/custom_volume_size.PNG deleted file mode 100644 index 09b9ad3..0000000 Binary files a/nvidia-nim-oke/img/custom_volume_size.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/get_nodes.PNG 
b/nvidia-nim-oke/img/get_nodes.PNG deleted file mode 100644 index 7f5b2a1..0000000 Binary files a/nvidia-nim-oke/img/get_nodes.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/inference.PNG b/nvidia-nim-oke/img/inference.PNG deleted file mode 100644 index afc5a4d..0000000 Binary files a/nvidia-nim-oke/img/inference.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/invoke_model_python.PNG b/nvidia-nim-oke/img/invoke_model_python.PNG deleted file mode 100644 index 03dc692..0000000 Binary files a/nvidia-nim-oke/img/invoke_model_python.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/ngc_arch.PNG b/nvidia-nim-oke/img/ngc_arch.PNG deleted file mode 100644 index e650981..0000000 Binary files a/nvidia-nim-oke/img/ngc_arch.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/nvidia_nim.PNG b/nvidia-nim-oke/img/nvidia_nim.PNG deleted file mode 100644 index 96393f7..0000000 Binary files a/nvidia-nim-oke/img/nvidia_nim.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/similar_output.PNG b/nvidia-nim-oke/img/similar_output.PNG deleted file mode 100644 index 40d6be8..0000000 Binary files a/nvidia-nim-oke/img/similar_output.PNG and /dev/null differ diff --git a/nvidia-nim-oke/img/token_generations.PNG b/nvidia-nim-oke/img/token_generations.PNG deleted file mode 100644 index 8fdca39..0000000 Binary files a/nvidia-nim-oke/img/token_generations.PNG and /dev/null differ diff --git a/nvidia-nim-oke/kserve/kserve_llama3.yaml b/nvidia-nim-oke/kserve/kserve_llama3.yaml deleted file mode 100644 index cdb563e..0000000 --- a/nvidia-nim-oke/kserve/kserve_llama3.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: serving.kserve.io/v1beta1 -kind: InferenceService -metadata: - annotations: - autoscaling.knative.dev/target: "10" - name: llama3-8b-instruct-1xgpu -spec: - predictor: - minReplicas: 1 - model: - modelFormat: - name: nvidia-nim-llama3-8b-instruct - resources: - limits: - nvidia.com/gpu: "1" - requests: - nvidia.com/gpu: "1" - runtime: nvidia-nim-llama3-8b-instruct-24.05 - storageUri: pvc://nvidia-nim-pvc/ \ No newline at end of file diff --git a/nvidia-nim-oke/pod/docker-pod.yaml b/nvidia-nim-oke/pod/docker-pod.yaml deleted file mode 100644 index 8ae3784..0000000 --- a/nvidia-nim-oke/pod/docker-pod.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: docker - labels: - name: docker -spec: - containers: - - name: docker - image: docker:latest - securityContext: - privileged: true - command: ["tail", "-f", "/dev/null"] - resources: - limits: - nvidia.com/gpu: 1 - hostNetwork: true \ No newline at end of file diff --git a/nvidia-nim-oke/pod/llama3-pod.yaml b/nvidia-nim-oke/pod/llama3-pod.yaml deleted file mode 100644 index 2ffcc91..0000000 --- a/nvidia-nim-oke/pod/llama3-pod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: docker - labels: - name: docker -spec: - containers: - - name: docker - image: nvcr.io/nim/meta/llama3-8b-instruct:latest - securityContext: - privileged: true - command: ["tail", "-f", "/dev/null"] - resources: - limits: - nvidia.com/gpu: 1 - #ephemeral-storage: "100Gi" - hostNetwork: true - - imagePullSecrets: - - name: registry-secret \ No newline at end of file diff --git a/nvidia-nim-oke/pod/testpod.yaml b/nvidia-nim-oke/pod/testpod.yaml deleted file mode 100644 index 2be0665..0000000 --- a/nvidia-nim-oke/pod/testpod.yaml +++ /dev/null @@ -1,21 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: docker - labels: - name: docker -spec: - containers: - - name: docker - #image: ubuntu - image: 
nvcr.io/nim/meta/llama3-8b-instruct:latest - securityContext: - privileged: true - command: ["tail", "-f", "/dev/null"] - resources: - limits: - nvidia.com/gpu: 1 - hostNetwork: true - - imagePullSecrets: - - name: registry-secret \ No newline at end of file diff --git a/nvidia-nim-oke/pod/time-slicing-config-all.yaml b/nvidia-nim-oke/pod/time-slicing-config-all.yaml deleted file mode 100644 index d549212..0000000 --- a/nvidia-nim-oke/pod/time-slicing-config-all.yaml +++ /dev/null @@ -1,15 +0,0 @@ -# configure GPU time-slicing if you have fewer than four GPUs. -apiVersion: v1 -kind: ConfigMap -metadata: - name: time-slicing-config-all -data: - any: |- - version: v1 - flags: - migStrategy: none - sharing: - timeSlicing: - resources: - - name: nvidia.com/gpu - replicas: 4 \ No newline at end of file diff --git a/nvidia-nim-oke/pod/ubuntu-pod.yaml b/nvidia-nim-oke/pod/ubuntu-pod.yaml deleted file mode 100644 index 9b6bd11..0000000 --- a/nvidia-nim-oke/pod/ubuntu-pod.yaml +++ /dev/null @@ -1,18 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: docker - labels: - name: docker -spec: - containers: - - name: docker - image: ubuntu:latest - securityContext: - privileged: true - command: ["tail", "-f", "/dev/null"] - resources: - limits: - nvidia.com/gpu: 1 - #ephemeral-storage: "100Gi" - hostNetwork: true \ No newline at end of file diff --git a/nvidia-nim-oke/scripts/alpine_cuda.txt b/nvidia-nim-oke/scripts/alpine_cuda.txt deleted file mode 100644 index 5c0c062..0000000 --- a/nvidia-nim-oke/scripts/alpine_cuda.txt +++ /dev/null @@ -1,128 +0,0 @@ -Author of this guide is Arto Bendiken from https://arto.s3.amazonaws.com/ - -Drivers - - https://developer.nvidia.com/vulkan-driver - -$ lsmod | fgrep nvidia - -$ nvidia-smi - -Driver Installation - - https://us.download.nvidia.com/XFree86/Linux-x86_64/390.77/README/ - https://github.com/NVIDIA/nvidia-installer - -Driver Installation on Alpine Linux - - https://github.com/sgerrand/alpine-pkg-glibc - https://github.com/sgerrand/alpine-pkg-glibc/releases - https://wiki.alpinelinux.org/wiki/Running_glibc_programs - -$ apk add sudo bash ca-certificates wget xz make gcc linux-headers - -$ wget -q -O /etc/apk/keys/sgerrand.rsa.pub https://raw.githubusercontent.com/sgerrand/alpine-pkg-glibc/master/sgerrand.rsa.pub - -$ wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.27-r0/glibc-2.27-r0.apk -$ wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.27-r0/glibc-bin-2.27-r0.apk -$ wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.27-r0/glibc-dev-2.27-r0.apk -$ wget https://github.com/sgerrand/alpine-pkg-glibc/releases/download/2.27-r0/glibc-i18n-2.27-r0.apk - -$ apk add glibc-2.27-r0.apk glibc-bin-2.27-r0.apk glibc-dev-2.27-r0.apk glibc-i18n-2.27-r0.apk - -$ /usr/glibc-compat/bin/localedef -i en_US -f UTF-8 en_US.UTF-8 - -$ bash NVIDIA-Linux-x86_64-390.77.run --check - -$ bash NVIDIA-Linux-x86_64-390.77.run --extract-only - -$ cd NVIDIA-Linux-x86_64-390.77 && ./nvidia-installer - -Driver Uninstallation - -$ nvidia-uninstall - -Driver Troubleshooting - - Uncompressing NVIDIA Accelerated Graphics Driver for Linux-x86_64 390.77NVIDIA-Linux-x86_64-390.77.run: line 998: /tmp/makeself.XXX/xz: No such file or directory\nExtraction failed. - - $ apk add xz # Alpine Linux - - bash: ./nvidia-installer: No such file or directory - - Install the glibc compatibility layer package for Alpine Linux. - - ERROR: You do not appear to have libc header files installed on your system. 
Please install your distribution's libc development package. - - $ apk add musl-dev # Alpine Linux - - ERROR: Unable to find the kernel source tree for the currently running kernel. Please make sure you have installed the kernel source files for your kernel and that they are properly configured - - $ apk add linux-vanilla-dev # Alpine Linux - - ERROR: Failed to execute `/sbin/ldconfig`: The installer has encountered the following error during installation: 'Failed to execute `/sbin/ldconfig`'. Would you like to continue installation anyway? - - Continue installation. - -Toolkit - - https://developer.nvidia.com/cuda-toolkit - https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/ - -Toolkit Download - - https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&target_distro=Ubuntu&target_version=1604&target_type=runfilelocal - -$ wget -c https://developer.nvidia.com/compute/cuda/9.2/Prod2/local_installers/cuda_9.2.148_396.37_linux - -Toolkit Installation - - https://docs.nvidia.com/cuda/cuda-installation-guide-linux/ - -Toolkit Installation on Alpine Linux - -$ apk add sudo bash - -$ sudo bash cuda_9.2.148_396.37_linux - -# You are attempting to install on an unsupported configuration. Do you wish to continue? y -# Install NVIDIA Accelerated Graphics Driver for Linux-x86_64 396.37? y -# Do you want to install the OpenGL libraries? y -# Do you want to run nvidia-xconfig? n -# Install the CUDA 9.2 Toolkit? y -# Enter Toolkit Location: /opt/cuda-9.2 -# Do you want to install a symbolic link at /usr/local/cuda? y -# Install the CUDA 9.2 Samples? y -# Enter CUDA Samples Location: /opt/cuda-9.2/samples - -$ sudo ln -s cuda-9.2 /opt/cuda - -$ export PATH="/opt/cuda/bin:$PATH" - -Toolkit Uninstallation - -$ sudo /opt/cuda-9.2/bin/uninstall_cuda_9.2.pl - -Toolkit Troubleshooting - - Cannot find termcap: Can't find a valid termcap file at /usr/share/perl5/core_perl/Term/ReadLine.pm line 377. - - $ export PERL_RL="Perl o=0" - - gcc: error trying to exec 'cc1plus': execvp: No such file or directory - - $ apk add g++ # Alpine Linux - - cicc: Relink `/usr/lib/libgcc_s.so.1' with `/usr/glibc-compat/lib/libc.so.6' for IFUNC symbol `memset' - - https://github.com/sgerrand/alpine-pkg-glibc/issues/58 - - $ scp /lib/x86_64-linux-gnu/libgcc_s.so.1 root@alpine:/usr/glibc-compat/lib/libgcc_s.so.1 - - $ sudo /usr/glibc-compat/sbin/ldconfig /usr/glibc-compat/lib /lib /usr/lib - -Compiler - - https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/ - -$ nvcc -V \ No newline at end of file diff --git a/nvidia-nim-oke/scripts/fresh_ubuntu_install.sh b/nvidia-nim-oke/scripts/fresh_ubuntu_install.sh deleted file mode 100644 index ea89fd2..0000000 --- a/nvidia-nim-oke/scripts/fresh_ubuntu_install.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -# author @jasperan -# this script installs all required dependencies on a fresh ubuntu image, which allows you to run NVIDIA container runtime workloads on docker. 
- -# install sudo, curl (to download docker), gnupg2 -apt-get update -y && apt-get install sudo curl gnupg2 -y - -# declare environment variable -export NGC_API_KEY= - -# download and install docker -curl -fsSL https://get.docker.com -o get-docker.sh -sh get-docker.sh - -# install nvidia container toolkit (required to run their images on NVIDIA GPUs) - -curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \ - && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \ - sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \ - sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list - - -sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list - -sudo apt-get update -y -sudo apt-get install -y nvidia-container-toolkit -sudo apt install nvidia-cuda-toolkit -y -sudo apt install nvidia-driver-525 -y # for ubuntu 22.04, change this to your recommended driver -# you can find your recommended driver for your specific docker image by running: ~ sudo ubuntu-drivers devices ~ - -# run the docker image inside the container. - -# Choose a container name for bookkeeping -export CONTAINER_NAME=llama3-8b-instruct -export IMG_NAME="nvcr.io/nim/meta/llama3-8b-instruct:latest" -export LOCAL_NIM_CACHE="/home/ubuntu/nim/cache" -mkdir -p "$LOCAL_NIM_CACHE" - -# login to NVIDIA NGC and run any image. - -echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin - -# launch dockerd if it wasn't previously launched on the background. -nohup dockerd & -# Start the LLM NIM -docker run -it --privileged --rm --name=$CONTAINER_NAME --runtime=nvidia --gpus 1 --env NGC_API_KEY="$NGC_API_KEY" -v "$LOCAL_NIM_CACHE:/opt/nim/cache" -u $(id -u) -p 8000:8000 $IMG_NAME \ No newline at end of file diff --git a/nvidia-nim-oke/scripts/invoke_llama_3.py b/nvidia-nim-oke/scripts/invoke_llama_3.py deleted file mode 100644 index f09b5de..0000000 --- a/nvidia-nim-oke/scripts/invoke_llama_3.py +++ /dev/null @@ -1,37 +0,0 @@ -''' -@author jasperan - -This code does the following: - - Imports the requests library. - Defines the URL for the POST request. - Defines the request body (payload) as a dictionary containing the model, prompt, - max_tokens, and other hyperparameters. - Sends a POST request using requests.post with the URL, headers, and payload as JSON. - Otherwise, it prints an error message with the status code. 
-''' - -from openai import OpenAI - -# Define the URL (assuming that you will call the endpoint locally, but can be changed to a public IP address) -client = OpenAI( - base_url = "http://0.0.0.0:8000/v1", - api_key="no-key-required" -) - -messages=[ - {"role":"user","content":"What is a GPU?"}, - {"role": "user", "content": "Write a short limerick about the wonders of GPU computing."} -] - -completion = client.chat.completions.create( - model="meta/llama3-8b-instruct", - messages=messages, - temperature=0.5, - top_p=1, - max_tokens=1024, - stream=True) - -for chunk in completion: - if chunk.choices[0].delta.content is not None: - print(chunk.choices[0].delta.content, end="") \ No newline at end of file diff --git a/nvidia-nim-oke/scripts/launch_compute_llama-3-8b-instruct.sh b/nvidia-nim-oke/scripts/launch_compute_llama-3-8b-instruct.sh deleted file mode 100644 index 5fcb30f..0000000 --- a/nvidia-nim-oke/scripts/launch_compute_llama-3-8b-instruct.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - - -# modify this environment variable -export NGC_API_KEY= - -# Choose a container name for bookkeeping -export CONTAINER_NAME=llama3-8b-instruct - -echo "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin -# Choose a LLM NIM Image from the NGC catalog -export IMG_NAME="nvcr.io/nim/meta/llama3-8b-instruct:latest" -#export IMG_NAME="nvcr.io/nvidia/aiworkflows/genai-llm-playground:latest" - -# Choose a path on your system to cache the downloaded models -export LOCAL_NIM_CACHE="/home/$USER/nim/cache" -mkdir /home/$USER/nim -mkdir /home/$USER/nim/cache -mkdir -p "$LOCAL_NIM_CACHE" - -# Start the LLM NIM -# here, you can specify --gpus all (if you have more than 1 node in your OKE cluster). -# specify port forwarding with -p. -docker run -it --rm --name=$CONTAINER_NAME \ - --privileged \ - --runtime=nvidia \ - --gpus 1 \ - --env NGC_API_KEY="$NGC_API_KEY" \ - -v "$LOCAL_NIM_CACHE:/opt/nim/cache" \ - -u $(id -u) \ - -p 8000:8000 \ - $IMG_NAME \ No newline at end of file diff --git a/nvidia-nim-oke/scripts/requirements.txt b/nvidia-nim-oke/scripts/requirements.txt deleted file mode 100644 index f0dd0ae..0000000 --- a/nvidia-nim-oke/scripts/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -openai \ No newline at end of file diff --git a/nvidia-nim-oke/values.yaml b/nvidia-nim-oke/values.yaml new file mode 100644 index 0000000..4bc0aed --- /dev/null +++ b/nvidia-nim-oke/values.yaml @@ -0,0 +1,298 @@ +# NVIDIA NIM Helm Values for Oracle Kubernetes Engine (OKE) +# This is a comprehensive values.yaml for deploying NIM on OKE + +#----------------------- +# Image Configuration +#----------------------- +image: + # Model image repository - uncomment the desired model + repository: nvcr.io/nim/meta/llama3-8b-instruct + # repository: nvcr.io/nim/meta/llama3-70b-instruct # For larger models + # repository: nvcr.io/nim/mistralai/mistral-7b-instruct-v0.2 # For Mistral models + + # Image tag (version) + tag: 1.0.0 + + # Image pull policy + pullPolicy: IfNotPresent + +#----------------------- +# Image Pull Secrets +#----------------------- +imagePullSecrets: + - name: ngc-registry # Secret created for NGC container registry + +#----------------------- +# Model Configuration +#----------------------- +model: + # Model name in NGC catalog - should match the repository above + name: meta/llama3-8b-instruct + # name: meta/llama3-70b-instruct # For larger models + # name: mistralai/mistral-7b-instruct-v0.2 # For Mistral models + + # NGC API key secret name - Created with: kubectl create secret generic 
ngc-api -n nim --from-literal=NGC_API_KEY=your_key + ngcAPISecret: ngc-api + +#----------------------- +# Persistence Configuration +#----------------------- +persistence: + # Enable persistent storage for model weights + enabled: true + + # Storage size (increase for larger models) + size: 50Gi # For 8B models + # size: 150Gi # For 70B models + + # OCI Block Volume storage class + storageClass: "oci-bv" + + # Access mode for the volume + accessMode: ReadWriteOnce + +#----------------------- +# StatefulSet Configuration +#----------------------- +statefulSet: + # Use Deployment instead of StatefulSet + enabled: false + +#----------------------- +# Resource Configuration - Adjust based on GPU availability and model size +#----------------------- +resources: + limits: + # --- GPU Configurations --- + # A10G (24GB) - Good for smaller models + nvidia.com/gpu: 1 + memory: "24Gi" + cpu: "8" + + # --- Uncomment for larger models/GPUs --- + # A100 (80GB) - Good for most models + # nvidia.com/gpu: 1 + # memory: "80Gi" + # cpu: "12" + + # H100 (80GB) - Excellent for large models + # nvidia.com/gpu: 1 + # memory: "80Gi" + # cpu: "16" + + # For 70B models on A100/H100 - Use 2 or more GPUs + # nvidia.com/gpu: 2 + # memory: "160Gi" + # cpu: "24" + + requests: + # Keep requests slightly lower than limits + nvidia.com/gpu: 1 + memory: "16Gi" + cpu: "4" + + # For larger configs, adjust accordingly + # nvidia.com/gpu: 2 + # memory: "120Gi" + # cpu: "16" + +#----------------------- +# Proxy Configuration - Uncomment if you need a proxy for outbound connections +#----------------------- +env: + # Uncomment proxy settings if needed + # - name: HTTP_PROXY + # value: "http://squid-proxy.nim.svc.cluster.local:3128" + # - name: HTTPS_PROXY + # value: "http://squid-proxy.nim.svc.cluster.local:3128" + # - name: NO_PROXY + # value: "localhost,127.0.0.1,10.0.0.0/8,10.96.0.0/16" + + # Model parameters + - name: CONTEXT_WINDOW_SIZE + value: "4096" + - name: MAX_TOKENS + value: "4096" + # - name: TIMEOUT # Uncomment to increase timeout for large generations + # value: "300" + +#----------------------- +# Health Probes - Adjust timeouts based on model size +#----------------------- +probes: + startup: + enabled: true + httpGet: + path: /v1/health/ready + port: 8000 + failureThreshold: 240 # Increase for larger models (up to 360 for 70B) + initialDelaySeconds: 240 # Increase for larger models (up to 480 for 70B) + periodSeconds: 30 + + liveness: + enabled: true + httpGet: + path: /v1/health/live + port: 8000 + failureThreshold: 3 + initialDelaySeconds: 60 # Give time for the model to load + periodSeconds: 30 + + readiness: + enabled: true + httpGet: + path: /v1/health/ready + port: 8000 + failureThreshold: 3 + initialDelaySeconds: 60 # Give time for the model to load + periodSeconds: 30 + +#----------------------- +# Service Configuration +#----------------------- +service: + # LoadBalancer for external access + type: LoadBalancer + # Use ClusterIP if you prefer to access via port-forwarding or ingress + # type: ClusterIP + + # Service port + port: 8000 + + # Additional annotations if needed for OCI LoadBalancer + annotations: {} + # service.beta.kubernetes.io/oci-load-balancer-security-list-management-mode: "None" + # service.beta.kubernetes.io/oci-load-balancer-shape: "flexible" + # service.beta.kubernetes.io/oci-load-balancer-shape-flex-min: "10" + # service.beta.kubernetes.io/oci-load-balancer-shape-flex-max: "100" + +#----------------------- +# Security Context +#----------------------- +securityContext: + runAsUser: 
1000 + runAsGroup: 1000 + fsGroup: 1000 + +#----------------------- +# Affinity Configuration - Ensure pods are scheduled on GPU nodes +#----------------------- +affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu.present + operator: In + values: + - "true" + +#----------------------- +# Topology Spread Constraints - Uncomment for multi-node clusters +#----------------------- +# topologySpreadConstraints: +# - maxSkew: 1 +# topologyKey: kubernetes.io/hostname +# whenUnsatisfiable: DoNotSchedule +# labelSelector: +# matchLabels: +# app: nim-llm + +#----------------------- +# Autoscaling - Optional for scaling based on GPU utilization +#----------------------- +# autoscaling: +# enabled: false +# minReplicas: 1 +# maxReplicas: 3 +# targetCPUUtilizationPercentage: 80 +# targetMemoryUtilizationPercentage: 80 + +#----------------------- +# Additional Parameters - Uncomment if needed +#----------------------- +# serviceAccount: +# create: true +# name: "" + +# podSecurityContext: +# runAsNonRoot: true + +# initContainers: [] + +# tolerations: [] + +# nodeSelector: {} + +#----------------------- +# Model-Specific Quick-Configuration Templates +#----------------------- +# UNCOMMENT ONE OF THESE BLOCKS TO QUICKLY CONFIGURE FOR SPECIFIC MODELS + +# --- LLaMA 3-8B on A10G GPU --- +# image: +# repository: nvcr.io/nim/meta/llama3-8b-instruct +# tag: 1.0.0 +# model: +# name: meta/llama3-8b-instruct +# resources: +# limits: +# nvidia.com/gpu: 1 +# memory: "16Gi" +# requests: +# nvidia.com/gpu: 1 +# memory: "12Gi" +# persistence: +# size: 30Gi + +# --- LLaMA 3-8B on A100/L40S GPU --- +# image: +# repository: nvcr.io/nim/meta/llama3-8b-instruct +# tag: 1.0.0 +# model: +# name: meta/llama3-8b-instruct +# resources: +# limits: +# nvidia.com/gpu: 1 +# memory: "40Gi" +# requests: +# nvidia.com/gpu: 1 +# memory: "32Gi" +# persistence: +# size: 50Gi + +# --- LLaMA 3-70B on A100/H100 GPUs --- +# image: +# repository: nvcr.io/nim/meta/llama3-70b-instruct +# tag: 1.0.0 +# model: +# name: meta/llama3-70b-instruct +# resources: +# limits: +# nvidia.com/gpu: 2 +# memory: "160Gi" +# requests: +# nvidia.com/gpu: 2 +# memory: "120Gi" +# persistence: +# size: 150Gi +# probes: +# startup: +# failureThreshold: 360 +# initialDelaySeconds: 480 + +# --- Mistral 7B on A10G/L40S --- +# image: +# repository: nvcr.io/nim/mistralai/mistral-7b-instruct-v0.2 +# tag: 1.0.0 +# model: +# name: mistralai/mistral-7b-instruct-v0.2 +# resources: +# limits: +# nvidia.com/gpu: 1 +# memory: "16Gi" +# requests: +# nvidia.com/gpu: 1 +# memory: "12Gi" +# persistence: +# size: 30Gi
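+#-----------------------
+# Usage note (sketch only — the chart reference and release name below are
+# assumptions; adjust them to the chart you are actually deploying from)
+#-----------------------
+# helm upgrade --install llama3-8b <nim-llm-chart> -n nim -f values.yaml
+# helm list -n nim              # confirm the release is deployed
+# kubectl get pods -n nim -w    # watch the NIM pod start up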