diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore b/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore
new file mode 100644
index 00000000000..990c18de229
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD b/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD
new file mode 100644
index 00000000000..db46e8d6c97
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile b/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile
new file mode 100644
index 00000000000..b9eab05aa37
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+# Every target below is a command, not a file to build. Declaring them phony
+# keeps a stray file or directory with the same name from making a target look
+# up to date and silently skipping its recipe.
+.PHONY: help format lint test watch-docs
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/README.md b/llama-index-integrations/llms/llama-index-llms-nvidia/README.md
new file mode 100644
index 00000000000..45ba0b5e533
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/README.md
@@ -0,0 +1 @@
+# LlamaIndex LLMs Integration: NVIDIA AI Playground
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py
new file mode 100644
index 00000000000..1ac86372297
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.llms.nvidia.base import NVIDIA
+
+__all__ = ["NVIDIA"]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
new file mode 100644
index 00000000000..f426c67c97d
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/base.py
@@ -0,0 +1,306 @@
+from typing import Any, Callable, Dict, Optional, Sequence, Awaitable
+
+from llama_index.core.base.llms.types import (
+    ChatMessage,
+    ChatResponse,
+    ChatResponseAsyncGen,
+    ChatResponseGen,
+    CompletionResponse,
+    CompletionResponseAsyncGen,
+    CompletionResponseGen,
+    LLMMetadata,
+    MessageRole,
+)
+from llama_index.core.bridge.pydantic import Field, PrivateAttr
+from llama_index.core.callbacks import CallbackManager
+from llama_index.core.constants import DEFAULT_TEMPERATURE
+from llama_index.core.llms.callbacks import (
+    llm_chat_callback,
+    llm_completion_callback,
+)
+from llama_index.core.base.llms.generic_utils import (
+    achat_to_completion_decorator,
+    astream_chat_to_completion_decorator,
+    chat_to_completion_decorator,
+    get_from_param_or_env,
+    stream_chat_to_completion_decorator,
+)
+from llama_index.core.llms.llm import LLM
+from llama_index.core.types import BaseOutputParser, PydanticProgramMode
+
+from llama_index.llms.nvidia.utils import (
+    playground_modelname_to_contextsize,
+)
+
+from llama_index.llms.openai.utils import (
+    from_openai_message,
+    to_openai_message_dicts,
+)
+
+from openai import OpenAI as SyncOpenAI
+from openai import AsyncOpenAI
+
+# Model name used when the caller does not pass ``model`` to ``NVIDIA``.
+DEFAULT_PLAYGROUND_MODEL = "mistralai/mistral-7b-instruct-v0.2"
+# Base URL handed to both the sync and the async OpenAI client.
+BASE_PLAYGROUND_URL = "https://integrate.api.nvidia.com/v1/"
+# Default for ``max_tokens`` (the maximum number of tokens to generate).
+DEFAULT_PLAYGROUND_MAX_TOKENS = 512
+
+
+class NVIDIA(LLM):
+    """NVIDIA's API Catalog Connector."""
+
+    model: str = Field(
+        default=DEFAULT_PLAYGROUND_MODEL,
+        description="The NVIDIA API Catalog model to use.",
+    )
+    temperature: float = Field(
+        default=DEFAULT_TEMPERATURE,
+        description="The temperature to use for sampling.",
+        gte=0.0,
+        lte=1.0,
+    )
+    max_tokens: int = Field(
+        default=DEFAULT_PLAYGROUND_MAX_TOKENS,
+        description="The maximum number of tokens to generate.",
+        gte=0,
+    )
+
+    timeout: float = Field(
+        default=120, description="The timeout for the API request in seconds.", gte=0
+    )
+
+    max_retries: int = Field(
+        default=5,
+        description="The maximum number of retries for the API request.",
+        gte=0,
+    )
+
+    _client: Any = PrivateAttr()
+    _aclient: Any = PrivateAttr()
+
+    def __init__(
+        self,
+        model: str = DEFAULT_PLAYGROUND_MODEL,
+        temperature: float = DEFAULT_TEMPERATURE,
+        max_tokens: int = DEFAULT_PLAYGROUND_MAX_TOKENS,
+        timeout: float = 120,
+        max_retries: int = 5,
+        api_key: Optional[str] = None,
+        callback_manager: Optional[CallbackManager] = None,
+        system_prompt: Optional[str] = None,
+        messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None,
+        complettion_to_prompt: Optional[Callable[[str], str]] = None,
+        pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT,
+        output_parser: Optional[BaseOutputParser] = None,
+    ) -> None:
+        callback_manager = callback_manager or CallbackManager([])
+
+        api_key = get_from_param_or_env("api_key", api_key, "NVIDIA_API_KEY", "")
+
+        if not api_key:
+            raise ValueError(
+                "The NVIDIA API key must be provided as an environment variable or as a parameter."
+            )
+
+        self._client = SyncOpenAI(
+            api_key=api_key,
+            base_url=BASE_PLAYGROUND_URL,
+            timeout=timeout,
+            max_retries=max_retries,
+        )
+        self._client._custom_headers = {"User-Agent": "llama-index-llms-nvidia"}
+        self._aclient = AsyncOpenAI(
+            api_key=api_key,
+            base_url=BASE_PLAYGROUND_URL,
+            timeout=timeout,
+            max_retries=max_retries,
+        )
+        self._aclient._custom_headers = {"User-Agent": "llama-index-llms-nvidia"}
+
+        super().__init__(
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            timeout=timeout,
+            max_retries=max_retries,
+            callback_manager=callback_manager,
+            system_prompt=system_prompt,
+            messages_to_prompt=messages_to_prompt,
+            complettion_to_prompt=complettion_to_prompt,
+            pydantic_program_mode=pydantic_program_mode,
+            output_parser=output_parser,
+        )
+
+    @classmethod
+    def class_name(cls) -> str:
+        return "NVIDIA"
+
+    @property
+    def metadata(self) -> LLMMetadata:
+        return LLMMetadata(
+            context_window=playground_modelname_to_contextsize(self.model),
+            num_output=self.max_tokens,
+            is_chat_model=True,
+            model_name=self.model,
+        )
+
+    @property
+    def _model_kwargs(self) -> Dict[str, Any]:
+        return {
+            "model": self.model,
+            "temperature": self.temperature,
+            "max_tokens": self.max_tokens,
+        }
+
+    # === Helper Methods ===
+
+    def _get_all_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
+        return {
+            **self._model_kwargs,
+            **kwargs,
+        }
+
+    def _get_response_token_counts(self, raw_response: Any) -> dict:
+        """Get the token usage reported by the response."""
+        if not isinstance(raw_response, dict):
+            return {}
+
+        usage = raw_response.get("usage", {})
+        if usage is None:
+            return {}
+
+        return {
+            "prompt_tokens": usage.get("prompt_tokens", 0),
+            "completion_tokens": usage.get("completion_tokens", 0),
+            "total_tokens": usage.get("total_tokens", 0),
+        }
+
+    # === Sync Methods ===
+
+    @llm_chat_callback()
+    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
+        message_dicts = to_openai_message_dicts(messages)
+
+        response = self._client.chat.completions.create(
+            messages=message_dicts,
+            stream=False,
+            **self._get_all_kwargs(**kwargs),
+        )
+        playground_openai_message = response.choices[0].message
+        message = from_openai_message(playground_openai_message)
+
+        return ChatResponse(
+            message=message,
+            raw=response,
+            additional_kwargs=self._get_response_token_counts(response),
+        )
+
+    @llm_completion_callback()
+    def complete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponse:
+        complete_fn = chat_to_completion_decorator(self.chat)
+        return complete_fn(prompt, **kwargs)
+
+    @llm_chat_callback()
+    def stream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseGen:
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        message_dicts = to_openai_message_dicts(messages)
+
+        response = self._client.chat.completions.create(
+            messages=message_dicts, stream=True, **all_kwargs
+        )
+
+        def gen() -> ChatResponseGen:
+            content = ""
+            role = MessageRole.ASSISTANT
+            for chunk in response:
+                content_delta = chunk.choices[0].delta.content
+                if content_delta is None:
+                    continue
+                content += content_delta
+                yield ChatResponse(
+                    message=ChatMessage(role=role, content=content),
+                    delta=content_delta,
+                    raw=chunk,
+                )
+
+        return gen()
+
+    @llm_completion_callback()
+    def stream_complete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponseGen:
+        stream_complete_fn = stream_chat_to_completion_decorator(self.stream_chat)
+        return stream_complete_fn(prompt, **kwargs)
+
+    # === Async Methods ===
+
+    @llm_chat_callback()
+    async def achat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponse:
+        achat_fn: Callable[..., Awaitable[ChatResponse]]
+        achat_fn = self._achat
+        return await achat_fn(messages, **kwargs)
+
+    @llm_chat_callback()
+    async def astream_chat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponseAsyncGen:
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        message_dicts = to_openai_message_dicts(messages)
+
+        response = await self._aclient.chat.completions.create(
+            messages=message_dicts, stream=True, **all_kwargs
+        )
+
+        async def gen() -> ChatResponseAsyncGen:
+            content = ""
+            role = MessageRole.ASSISTANT
+            async for chunk in response:
+                content_delta = chunk.choices[0].delta.content
+                if content_delta is None:
+                    continue
+                content += content_delta
+                yield ChatResponse(
+                    message=ChatMessage(role=role, content=content),
+                    delta=content_delta,
+                    raw=chunk,
+                )
+
+        return gen()
+
+    @llm_completion_callback()
+    async def acomplete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponse:
+        acomplete_fn = achat_to_completion_decorator(self._achat)
+        return await acomplete_fn(prompt, **kwargs)
+
+    @llm_completion_callback()
+    async def astream_complete(
+        self, prompt: str, formatted: bool = False, **kwargs: Any
+    ) -> CompletionResponseAsyncGen:
+        astream_complete_fn = astream_chat_to_completion_decorator(self.astream_chat)
+        return await astream_complete_fn(prompt, **kwargs)
+
+    async def _achat(
+        self, messages: Sequence[ChatMessage], **kwargs: Any
+    ) -> ChatResponse:
+        all_kwargs = self._get_all_kwargs(**kwargs)
+        message_dicts = to_openai_message_dicts(messages)
+        response = await self._aclient.chat.completions.create(
+            messages=message_dicts,
+            stream=False,
+            **all_kwargs,
+        )
+        message_dict = response.choices[0].message
+        message = from_openai_message(message_dict)
+
+        return ChatResponse(
+            message=message,
+            raw=response,
+            additional_kwargs=self._get_response_token_counts(response),
+        )
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py
new file mode 100644
index 00000000000..98787ab5568
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/llama_index/llms/nvidia/utils.py
@@ -0,0 +1,21 @@
+from typing import Dict
+
+# Known API Catalog model names, each mapped to the integer that
+# ``playground_modelname_to_contextsize`` returns for it (the NVIDIA LLM
+# reports that value as its ``context_window``).
+# NOTE(review): the sizes are hard-coded here — confirm them against the
+# model cards when adding or updating an entry.
+API_CATALOG_MODELS: Dict[str, int] = {
+    "mistralai/mistral-7b-instruct-v0.2": 16384,
+    "mistralai/mixtral-8x7b-instruct-v0.1": 16384,
+    "google/gemma-7b": 4096,
+    "google/gemma-2b": 4096,
+    "google/codegemma-7b": 4096,
+    "meta/codellama-70b": 1024,
+    "meta/llama2-70b": 1024,
+}
+
+
+def playground_modelname_to_contextsize(modelname: str) -> int:
+    if modelname not in API_CATALOG_MODELS:
+        raise ValueError(
+            f"Unknown model: {modelname}. Please provide a valid AI Playground model name."
+            "Known models are: " + ", ".join(API_CATALOG_MODELS.keys())
+        )
+
+    return API_CATALOG_MODELS[modelname]
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
new file mode 100644
index 00000000000..49195b3a1e1
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/pyproject.toml
@@ -0,0 +1,50 @@
+[build-system]
+build-backend = "poetry.core.masonry.api"
+requires = ["poetry-core"]
+
+[tool.codespell]
+check-filenames = true
+check-hidden = true
+# Feel free to un-skip examples, and experimental, you will just need to
+# work through many typos (--write-changes and --interactive will help)
+skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb"
+
+[tool.mypy]
+disallow_untyped_defs = true
+# Remove venv skip when integrated with pre-commit
+exclude = ["_static", "build", "examples", "notebooks", "venv"]
+ignore_missing_imports = true
+python_version = "3.8"
+
+[tool.poetry]
+authors = ["Chris Alexiuk <calexiuk@nvidia.com>"]
+description = "llama-index llms nvidia api catalog integration"
+license = "MIT"
+name = "llama-index-llms-nvidia"
+packages = [{include = "llama_index/"}]
+readme = "README.md"
+version = "0.1.0"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+llama-index-core = "^0.10.0"
+llama-index-llms-openai = "^0.1.15"
+
+[tool.poetry.group.dev.dependencies]
+black = {extras = ["jupyter"], version = "<=23.9.1,>=23.7.0"}
+codespell = {extras = ["toml"], version = ">=v2.2.6"}
+ipython = "8.10.0"
+jupyter = "^1.0.0"
+mypy = "0.991"
+pre-commit = "3.2.0"
+pylint = "2.15.10"
+pytest = "7.2.1"
+pytest-mock = "3.11.1"
+ruff = "0.0.292"
+tree-sitter-languages = "^1.8.0"
+types-Deprecated = ">=0.1.0"
+types-PyYAML = "^6.0.12.12"
+types-protobuf = "^4.24.0.4"
+types-redis = "4.5.5.0"
+types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
+types-setuptools = "67.1.0.0"
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/default_functionality_tests.ipynb b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/default_functionality_tests.ipynb
new file mode 100644
index 00000000000..6dfa00f9d4d
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/default_functionality_tests.ipynb
@@ -0,0 +1,837 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Using NVIDIA's LLM API Catalog Connector\n",
+    "\n",
+    "This notebook will guide you through understanding the basic usage of the `NVIDIA` connector.\n",
+    "\n",
+    "With this connector, you'll be able to connect to and generate from compatible models available at the NVIDIA [API Catalog](https://build.nvidia.com/explore/discover), such as:\n",
+    "\n",
+    "- Google's [gemma-7b](https://build.nvidia.com/google/gemma-7b)\n",
+    "- Mistral AI's [mistral-7b-instruct-v0.2](https://build.nvidia.com/mistralai/mistral-7b-instruct-v2)\n",
+    "- And more!\n",
+    "\n",
+    "We'll begin by ensuring `llama-index` and associated packages are installed.\n",
+    "\n",
+    "> NOTE: Only models that have a base URL of `https://integrate.api.nvidia.com/v1` are compatible with this connector at this time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting llama-index-embeddings-openai\n",
+      "  Using cached llama_index_embeddings_openai-0.1.7-py3-none-any.whl.metadata (603 bytes)\n",
+      "Requirement already satisfied: llama-index-core<0.11.0,>=0.10.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-embeddings-openai) (0.10.30)\n",
+      "Requirement already satisfied: PyYAML>=6.0.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (6.0.1)\n",
+      "Requirement already satisfied: SQLAlchemy>=1.4.49 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from SQLAlchemy[asyncio]>=1.4.49->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.0.29)\n",
+      "Requirement already satisfied: aiohttp<4.0.0,>=3.8.6 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.9.5)\n",
+      "Requirement already satisfied: dataclasses-json in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.4)\n",
+      "Requirement already satisfied: deprecated>=1.2.9.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.2.14)\n",
+      "Requirement already satisfied: dirtyjson<2.0.0,>=1.0.8 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.8)\n",
+      "Requirement already satisfied: fsspec>=2023.5.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.3.1)\n",
+      "Requirement already satisfied: httpx in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.27.0)\n",
+      "Requirement already satisfied: llamaindex-py-client<0.2.0,>=0.1.18 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.1.18)\n",
+      "Requirement already satisfied: nest-asyncio<2.0.0,>=1.5.8 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.6.0)\n",
+      "Requirement already satisfied: networkx>=3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.1)\n",
+      "Requirement already satisfied: nltk<4.0.0,>=3.8.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.8.1)\n",
+      "Requirement already satisfied: numpy in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.24.4)\n",
+      "Requirement already satisfied: openai>=1.1.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.22.0)\n",
+      "Requirement already satisfied: pandas in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.0.3)\n",
+      "Requirement already satisfied: pillow>=9.0.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (10.3.0)\n",
+      "Requirement already satisfied: requests>=2.31.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.31.0)\n",
+      "Requirement already satisfied: tenacity<9.0.0,>=8.2.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (8.2.3)\n",
+      "Requirement already satisfied: tiktoken>=0.3.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.0)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.66.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.66.2)\n",
+      "Requirement already satisfied: typing-extensions>=4.5.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.11.0)\n",
+      "Requirement already satisfied: typing-inspect>=0.8.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.9.0)\n",
+      "Requirement already satisfied: wrapt in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.16.0)\n",
+      "Requirement already satisfied: aiosignal>=1.1.2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.3.1)\n",
+      "Requirement already satisfied: attrs>=17.3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (23.2.0)\n",
+      "Requirement already satisfied: frozenlist>=1.1.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.4.1)\n",
+      "Requirement already satisfied: multidict<7.0,>=4.5 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (6.0.5)\n",
+      "Requirement already satisfied: yarl<2.0,>=1.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from aiohttp<4.0.0,>=3.8.6->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.9.4)\n",
+      "Requirement already satisfied: pydantic>=1.10 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.7.0)\n",
+      "Requirement already satisfied: anyio in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (4.3.0)\n",
+      "Requirement already satisfied: certifi in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.2.2)\n",
+      "Requirement already satisfied: httpcore==1.* in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.5)\n",
+      "Requirement already satisfied: idna in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.7)\n",
+      "Requirement already satisfied: sniffio in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.3.1)\n",
+      "Requirement already satisfied: h11<0.15,>=0.13 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from httpcore==1.*->httpx->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.14.0)\n",
+      "Requirement already satisfied: click in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (8.1.7)\n",
+      "Requirement already satisfied: joblib in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.4.0)\n",
+      "Requirement already satisfied: regex>=2021.8.3 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from nltk<4.0.0,>=3.8.1->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.4.16)\n",
+      "Requirement already satisfied: distro<2,>=1.7.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from openai>=1.1.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.9.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from requests>=2.31.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.3.2)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from requests>=2.31.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.2.1)\n",
+      "Requirement already satisfied: greenlet!=0.4.17 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from SQLAlchemy>=1.4.49->SQLAlchemy[asyncio]>=1.4.49->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.0.3)\n",
+      "Requirement already satisfied: mypy-extensions>=0.3.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from typing-inspect>=0.8.0->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.0.0)\n",
+      "Requirement already satisfied: marshmallow<4.0.0,>=3.18.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from dataclasses-json->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (3.21.1)\n",
+      "Requirement already satisfied: python-dateutil>=2.8.2 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.9.0.post0)\n",
+      "Requirement already satisfied: pytz>=2020.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.1)\n",
+      "Requirement already satisfied: tzdata>=2022.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2024.1)\n",
+      "Requirement already satisfied: packaging>=17.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from marshmallow<4.0.0,>=3.18.0->dataclasses-json->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (24.0)\n",
+      "Requirement already satisfied: annotated-types>=0.4.0 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (0.6.0)\n",
+      "Requirement already satisfied: pydantic-core==2.18.1 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from pydantic>=1.10->llamaindex-py-client<0.2.0,>=0.1.18->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (2.18.1)\n",
+      "Requirement already satisfied: six>=1.5 in /home/chris/anaconda3/envs/nvidia-llama-index-api/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas->llama-index-core<0.11.0,>=0.10.1->llama-index-embeddings-openai) (1.16.0)\n",
+      "Using cached llama_index_embeddings_openai-0.1.7-py3-none-any.whl (6.0 kB)\n",
+      "Installing collected packages: llama-index-embeddings-openai\n",
+      "Successfully installed llama-index-embeddings-openai-0.1.7\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install llama-index-embeddings-openai"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API Keys and Boilerplate\n",
+    "\n",
+    "In the next cell we'll run some boilerplate to allow the examples to be executed smoothly in a notebook environment. \n",
+    "\n",
+    "We'll also provide our API keys. \n",
+    "\n",
+    "> NOTE: You can create your NVIDIA API key using the `Get API Key` button in the code example window."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Running async code in a notebook requires the use of nest_asyncio\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "\n",
+    "import os\n",
+    "\n",
+    "# Using OpenAI API for embeddings\n",
+    "os.environ[\"OPENAI_API_KEY\"] = \"sk-\"\n",
+    "\n",
+    "# Using NVIDIA API Playground API Key for LLM\n",
+    "os.environ[\"NVIDIA_API_KEY\"] = \"nvapi-\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading the NVIDIA LLM\n",
+    "\n",
+    "Now we can load our `NVIDIA` LLM by passing in the model name, as found in the docs - located [here](https://docs.api.nvidia.com/nim/reference/)\n",
+    "\n",
+    "> NOTE: The default model is `mistralai/mistral-7b-instruct-v0.2`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.nvidia import NVIDIA\n",
+    "from llama_index.core import VectorStoreIndex\n",
+    "from llama_index.core import Settings\n",
+    "\n",
+    "llm = NVIDIA(model=\"mistralai/mistral-7b-instruct-v0.2\")\n",
+    "\n",
+    "Settings.llm = llm"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can observe which model our `llm` object is currently associated with by checking the `.model` attribute."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'mistralai/mistral-7b-instruct-v0.2'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Loading an API Catalog LLM\n",
+    "\n",
+    "We can also load models using their API Catalog address.\n",
+    "\n",
+    "Let's use `gemma-7b` as an example!\n",
+    "\n",
+    "1. Navigate to the [model page](https://build.nvidia.com/google/gemma-7b)\n",
+    "2. Find the address in the `model` parameter (e.g. `\"google/gemma-7b\"`)\n",
+    "3. Verify it has the `base_url` of `\"https://integrate.api.nvidia.com/v1\"`\n",
+    "4. Use `NVIDIA(model=\"model_name_here\")` to point the connector at that model (e.g. `NVIDIA(model=\"google/gemma-7b\")`)\n",
+    "\n",
+    "Let's see this in the code."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = NVIDIA(model=\"google/gemma-7b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's confirm we've associated our `NVIDIA` LLM with the correct model!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'google/gemma-7b'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic Functionality\n",
+    "\n",
+    "Now we can explore the different ways you can use the connector within the LlamaIndex ecosystem!\n",
+    "\n",
+    "Before we begin, let's set up a list of `ChatMessage` objects - which is the expected input for some of the methods."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.llms import ChatMessage, MessageRole\n",
+    "\n",
+    "chat_messages = [\n",
+    "    ChatMessage(role=MessageRole.SYSTEM, content=(\"You are a helpful assistant.\")),\n",
+    "    ChatMessage(\n",
+    "        role=MessageRole.USER,\n",
+    "        content=(\"What are the most popular house pets in North America?\"),\n",
+    "    ),\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll follow the same basic pattern for each example: \n",
+    "\n",
+    "1. We'll point our `NVIDIA` LLM to our desired model\n",
+    "2. We'll examine how to use the endpoint to achieve the desired task!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Complete: `.complete()`\n",
+    "\n",
+    "We can use `.complete()`/`.acomplete()` (which takes a string) to prompt a response from the selected model.\n",
+    "\n",
+    "Let's use our default model for this task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "completion_llm = NVIDIA()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can verify this is the expected default by checking the `.model` attribute."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'mistralai/mistral-7b-instruct-v0.2'"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completion_llm.model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's call `.complete()` on our model with a string, in this case `\"Hello!\"`, and observe the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CompletionResponse(text=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So, feel free to ask me anything!\\n\\nIf you're looking for some general information, I can help you with that too. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. 
I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some\", additional_kwargs={}, raw={'id': 'chatcmpl-f6906079-51e7-44bf-aaea-a9478397dfbf', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So, feel free to ask me anything!\\n\\nIf you're looking for some general information, I can help you with that too. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. 
I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some general information, I can provide you with that as well. For example, I can tell you about the weather, current events, or provide definitions for various words and concepts. I can also help you with math problems, translate words and phrases, and even tell you a joke or two!\\n\\nSo, what would you like to know? Let me know and I'll do my best to help you out!\\n\\nIf you have any specific question or topic in mind, please let me know and I'll be glad to help you out. If you want some\", role='assistant', function_call=None, tool_calls=None))], 'created': 1713474670, 'model': 'mistralai/mistral-7b-instruct-v0.2', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=512, prompt_tokens=11, total_tokens=523)}, logprobs=None, delta=None)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "completion_llm.complete(\"Hello!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As is expected by LlamaIndex - we get a `CompletionResponse` in response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Complete: `.acomplete()`\n",
+    "\n",
+    "There is also an async implementation which can be leveraged in the same way!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "CompletionResponse(text=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So feel free to ask me anything!\\n\\nIf you're looking for a specific topic, just let me know and I'll do my best to provide you with accurate and up-to-date information. And if you have any requests for fun facts or trivia, I'm happy to oblige!\\n\\nSo, what would you like to know today? Let me help make your day a little brighter! 😊\", additional_kwargs={}, raw={'id': 'chatcmpl-8ce881c1-a47b-43aa-afd8-9e9addf26ce9', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" Hello there! How can I help you today? I'm here to answer any questions you might have or provide information on a wide range of topics. So feel free to ask me anything!\\n\\nIf you're looking for a specific topic, just let me know and I'll do my best to provide you with accurate and up-to-date information. And if you have any requests for fun facts or trivia, I'm happy to oblige!\\n\\nSo, what would you like to know today? Let me help make your day a little brighter! 😊\", role='assistant', function_call=None, tool_calls=None))], 'created': 1712175910, 'model': 'mistralai/mistral-7b-instruct-v0.2', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=123, prompt_tokens=11, total_tokens=134)}, logprobs=None, delta=None)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await completion_llm.acomplete(\"Hello!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Chat: `.chat()`\n",
+    "\n",
+    "Now we can try the same thing using the `.chat()` method. This method expects a list of chat messages - so we'll use the one we created above.\n",
+    "\n",
+    "We'll use the `mistralai/mixtral-8x7b-instruct-v0.1` model for the example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_llm = NVIDIA(model=\"mistralai/mixtral-8x7b-instruct-v0.1\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "All we need to do now is call `.chat()` with our list of `ChatMessage` objects and observe our response.\n",
+    "\n",
+    "You'll also notice that we can pass in a few additional keyword arguments that can influence the generation - in this case, we've used the `seed` parameter to influence our generation and the `stop` parameter to indicate we want the model to stop generating once it reaches a certain token!\n",
+    "\n",
+    "> NOTE: You can find information about what additional kwargs are supported by the model's endpoint by referencing the API documentation for the selected model. Mixtral's is located [here](https://docs.api.nvidia.com/nim/reference/mistralai-mixtral-8x7b-instruct-infer) as an example!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=\" In North America, the most popular types of house pets are:\\n\\n1. Dogs: Man's best friend is the most popular pet in North America. They are known for their loyalty, companionship, and the variety of breeds that cater to different lifestyles and preferences.\\n\\n2. Cats\", additional_kwargs={}), raw={'id': 'chatcmpl-b6ef95ca-e023-4dc8-8ee9-843f214169e9', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=\" In North America, the most popular types of house pets are:\\n\\n1. Dogs: Man's best friend is the most popular pet in North America. They are known for their loyalty, companionship, and the variety of breeds that cater to different lifestyles and preferences.\\n\\n2. Cats\", role='assistant', function_call=None, tool_calls=None))], 'created': 1713474655, 'model': 'mistralai/mixtral-8x7b-instruct-v0.1', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=66, prompt_tokens=26, total_tokens=92)}, delta=None, logprobs=None, additional_kwargs={})"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "chat_llm.chat(chat_messages, seed=4, stop=[\"cat\", \"cats\", \"Cat\", \"Cats\"])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As expected, we receive a `ChatResponse` in response."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Chat: `.achat()`\n",
+    "\n",
+    "We also have an async implementation of the `.chat()` method which can be called in the following way."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "ChatResponse(message=ChatMessage(role=<MessageRole.ASSISTANT: 'assistant'>, content=' The most popular house pets in North America are dogs and cats. According to the American Pet Products Association (APPA), as of 2021, approximately 69 million homes in the United States own a pet, and 63.4 million of those households have a dog, while 42.7 million have a cat. Birds, small mammals, reptiles, and fish are also popular pets, but to a lesser extent.', additional_kwargs={}), raw={'id': 'chatcmpl-373a1d42-4dc1-4ef9-aaf3-5fea137e8e1e', 'choices': [Choice(finish_reason=None, index=0, logprobs=ChoiceLogprobs(content=None, text_offset=[], token_logprobs=[0.0, 0.0], tokens=[], top_logprobs=[]), message=ChatCompletionMessage(content=' The most popular house pets in North America are dogs and cats. According to the American Pet Products Association (APPA), as of 2021, approximately 69 million homes in the United States own a pet, and 63.4 million of those households have a dog, while 42.7 million have a cat. Birds, small mammals, reptiles, and fish are also popular pets, but to a lesser extent.', role='assistant', function_call=None, tool_calls=None))], 'created': 1712177472, 'model': 'mistralai/mixtral-8x7b-instruct-v0.1', 'object': 'chat.completion', 'system_fingerprint': None, 'usage': CompletionUsage(completion_tokens=95, prompt_tokens=59, total_tokens=154)}, delta=None, logprobs=None, additional_kwargs={})"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "await chat_llm.achat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Stream: `.stream_chat()`\n",
+    "\n",
+    "We can also use the models found on `build.nvidia.com` for streaming use-cases!\n",
+    "\n",
+    "Let's select another model and observe this behaviour. We'll use Google's `gemma-7b` model for this task."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream_llm = NVIDIA(model=\"google/gemma-7b\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's call our model with `.stream_chat()`, which again expects a list of `ChatMessage` objects, and capture the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streamed_response = stream_llm.stream_chat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<generator object llm_chat_callback.<locals>.wrap.<locals>.wrapped_llm_chat.<locals>.wrapped_gen at 0x7dd89853e320>"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "streamed_response"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As we can see, the response is a generator that yields the streamed response.\n",
+    "\n",
+    "Let's take a look at the final response once the generation is complete."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "assistant: **Top Popular House Pets in North America:**\n",
+      "\n",
+      "**1. Dogs:**\n",
+      "* Estimated 63.4 million pet dogs in households (2023)\n",
+      "* Known for their loyalty, companionship, and trainability\n",
+      "\n",
+      "**2. Cats:**\n",
+      "* Estimated 38.4 million pet cats in households (2023)\n",
+      "* Known for their independence, affection, and low-maintenance nature\n",
+      "\n",
+      "**3. Fish:**\n",
+      "* Estimated 14.5 million pet fish in households (2023)\n",
+      "* Popular for their tranquility, beauty, and variety of species\n",
+      "\n",
+      "**4. Small mammals (guinea pigs, hamsters, rabbits):**\n",
+      "* Estimated 14.4 million pet small mammals in households (2023)\n",
+      "* Known for their playful and affectionate nature\n",
+      "\n",
+      "**5. Birds:**\n",
+      "* Estimated 13.3 million pet birds in households (2023)\n",
+      "* Known for their beauty, song, and intelligence\n",
+      "\n",
+      "**Other popular pets:**\n",
+      "\n",
+      "* Tortoises and reptiles\n",
+      "* Hamsters and rodents\n",
+      "* Invertebrates (such as spiders and hermit crabs)\n",
+      "\n",
+      "**Factors influencing pet popularity:**\n",
+      "\n",
+      "* **Lifestyle and living situation:** Urban dwellers are more likely to have cats, while suburban and rural residents are more likely to have dogs.\n",
+      "* **Cost:** Dogs tend to be more expensive to own than cats.\n",
+      "* **Personality and preferences:** Some people prefer the companionship of dogs, while others prefer the independence of cats.\n",
+      "* **Availability:** Certain pets are easier to find or adopt than others.\n",
+      "* **Trend and cultural influences:** Some pets become more popular than others due to trends or cultural preferences.\n"
+     ]
+    }
+   ],
+   "source": [
+    "last_element = None\n",
+    "for last_element in streamed_response:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Async Stream: `.astream_chat()`\n",
+    "\n",
+    "We have the equivalent async method for streaming as well, which can be used in a similar way to the sync implementation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streamed_response = await stream_llm.astream_chat(chat_messages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<async_generator object llm_chat_callback.<locals>.wrap.<locals>.wrapped_async_llm_chat.<locals>.wrapped_gen at 0x787709eea460>"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "streamed_response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "assistant: Sure, here are the most popular house pets in North America:\n",
+      "\n",
+      "1. Dogs\n",
+      "2. Cats\n",
+      "3. Fish\n",
+      "4. Small Mammals\n",
+      "5. Birds\n"
+     ]
+    }
+   ],
+   "source": [
+    "last_element = None\n",
+    "async for last_element in streamed_response:\n",
+    "    pass\n",
+    "\n",
+    "print(last_element)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Streaming Query Engine Responses\n",
+    "\n",
+    "Let's look at a slightly more involved example using a query engine!\n",
+    "\n",
+    "We'll start by loading some data (we'll be using the [Hitchhiker's Guide to the Galaxy](https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt))."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Loading Data\n",
+    "\n",
+    "Let's first create a directory where our data can live."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'data/hhgttg'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll download our data from the above source."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--2024-04-01 14:39:38--  https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt\n",
+      "Resolving web.eecs.utk.edu (web.eecs.utk.edu)... 160.36.127.165\n",
+      "Connecting to web.eecs.utk.edu (web.eecs.utk.edu)|160.36.127.165|:443... connected.\n",
+      "HTTP request sent, awaiting response... 200 OK\n",
+      "Length: 1534289 (1.5M) [text/plain]\n",
+      "Saving to: ‘data/hhgttg/hhgttg.txt’\n",
+      "\n",
+      "data/hhgttg/hhgttg. 100%[===================>]   1.46M  6.75MB/s    in 0.2s    \n",
+      "\n",
+      "2024-04-01 14:39:39 (6.75 MB/s) - ‘data/hhgttg/hhgttg.txt’ saved [1534289/1534289]\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "!wget 'https://web.eecs.utk.edu/~hqi/deeplearning/project/hhgttg.txt' -O 'data/hhgttg/hhgttg.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We'll need an embedding model for this step! We'll use OpenAI's `text-embedding-3-small` model to achieve this, and save it in our `Settings`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.embeddings.openai import OpenAIEmbedding\n",
+    "\n",
+    "openai_embedding = OpenAIEmbedding(model=\"text-embedding-3-small\")\n",
+    "\n",
+    "Settings.embed_model = openai_embedding"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can load our document and build an index using the `OpenAIEmbedding()` we created above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import VectorStoreIndex, SimpleDirectoryReader\n",
+    "\n",
+    "documents = SimpleDirectoryReader(\"data/hhgttg\").load_data()\n",
+    "index = VectorStoreIndex.from_documents(documents)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we can create a simple query engine and set our `streaming` parameter to `True`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streaming_qe = index.as_query_engine(streaming=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's send a query to our query engine, and then stream the response."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "streaming_response = streaming_qe.query(\n",
+    "    \"What is the significance of the number 42?\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The significance of the number 42 is a central theme in \"The Hitchhiker's Guide to the Galaxy\" by Douglas Adams. The book is a comedic science fiction satire that follows the adventures of two intergalactic travelers, Arthur Dent and Ford Prefect, as they try to escape the destruction of Earth and uncover the true meaning of the number 42.\n",
+      "\n",
+      "Throughout the book, the number 42 is presented as the ultimate answer to the ultimate question of life, the universe, and everything. The question itself is never explicitly stated, but it is implied to be a deeply profound and existential one that has been sought after by philosophers, scientists, and thinkers throughout history.\n",
+      "\n",
+      "The idea of the number 42 as the ultimate answer is a playful jab at the idea of seeking ultimate knowledge and understanding, which is often seen as an impossible task. The number 42 is also a reference to the famous \"42\" answer in the \"The Hitchhiker's Guide to the Galaxy\" by Douglas Adams, which is a comedic science fiction satire that follows the adventures of two intergalactic travelers, Arthur Dent and Ford Prefect, as they try to escape the destruction of Earth and uncover the true meaning of the number 42.\n",
+      "\n",
+      "In the book, the supercomputer Deep Thought is asked to find the answer to the ultimate question, and after billions of years of computation, it determines that the answer is 42. The answer is so profound that it causes Deep Thought to become obsolete, as it is no longer needed to answer questions.\n",
+      "\n",
+      "The significance of the number 42 in \"The Hitchhiker's Guide to the Galaxy\" is a commentary on the nature of knowledge and the quest for ultimate understanding. It is a reminder that there are limits to what can be known and that the pursuit of knowledge should be done with a sense of humor and a willingness to accept the unknown."
+     ]
+    }
+   ],
+   "source": [
+    "streaming_response.print_response_stream()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "nvidia-llama-index-playground-connector",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py
new file mode 100644
index 00000000000..6bb8adfd1b9
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-nvidia/tests/test_nvidia.py
@@ -0,0 +1,231 @@
+import os
+from typing import Any, AsyncGenerator, Generator, Optional
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from llama_index.core.base.llms.types import ChatMessage
+from llama_index.llms.nvidia import NVIDIA
+
+from openai.types.chat.chat_completion import (
+    ChatCompletion,
+    ChatCompletionMessage,
+    Choice,
+    ChoiceLogprobs,
+)
+from openai.types.chat.chat_completion_chunk import ChatCompletionChunk, ChoiceDelta
+from openai.types.chat.chat_completion_chunk import Choice as ChunkChoice
+from openai.types.completion import CompletionUsage
+
+# Placeholder key shared by the tests below; it is not a real credential.
+FAKE_NVIDIA_API_KEY = "nvai-" + "x" * 9 + "-" + "x" * 54
+
+
+class CachedNVIDIApiKeys:
+    """Context manager that overrides ``NVIDIA_API_KEY`` and restores it on exit.
+
+    Args:
+        set_env_key_to: Value exported while the context is active; ``None``
+            leaves the variable unset.
+        set_fake_key: When true, export ``FAKE_NVIDIA_API_KEY`` instead of
+            ``set_env_key_to``.
+    """
+
+    def __init__(self, set_env_key_to: Optional[str] = "", set_fake_key: bool = False):
+        self.set_env_key_to = set_env_key_to
+        self.set_fake_key = set_fake_key
+
+    def __enter__(self) -> None:
+        # ``None`` records "was unset", which is distinct from "was set to ''".
+        self.api_env_was = os.environ.pop("NVIDIA_API_KEY", None)
+
+        if self.set_fake_key:
+            os.environ["NVIDIA_API_KEY"] = FAKE_NVIDIA_API_KEY
+        elif self.set_env_key_to is not None:
+            os.environ["NVIDIA_API_KEY"] = self.set_env_key_to
+
+    def __exit__(self, *exc: object) -> None:
+        # Discard whatever the body left behind; ``pop`` tolerates a deleted key.
+        os.environ.pop("NVIDIA_API_KEY", None)
+        if self.api_env_was is not None:
+            os.environ["NVIDIA_API_KEY"] = self.api_env_was
+
+
+def mock_chat_completion_v1(*args: Any, **kwargs: Any) -> ChatCompletion:
+    """Return a canned, non-streaming ``ChatCompletion``; arguments are ignored."""
+    return ChatCompletion(
+        id="chatcmpl-4162e407-e121-42b4-8590-1c173380be7d",
+        object="chat.completion",
+        created=1713474384,
+        model="mistralai/mistral-7b-instruct-v0.2",
+        usage=CompletionUsage(
+            completion_tokens=304, prompt_tokens=11, total_tokens=315
+        ),
+        choices=[
+            Choice(
+                finish_reason="stop",
+                index=0,
+                logprobs=ChoiceLogprobs(
+                    content=None,
+                    text_offset=[],
+                    token_logprobs=[0.0, 0.0],
+                    tokens=[],
+                    top_logprobs=[],
+                ),
+                message=ChatCompletionMessage(
+                    content="Cool Test Message",
+                    role="assistant",
+                    function_call=None,
+                    tool_calls=None,
+                ),
+            )
+        ],
+    )
+
+
+async def mock_async_chat_completion_v1(*args: Any, **kwargs: Any) -> ChatCompletion:
+    """Async counterpart of ``mock_chat_completion_v1``; returns the same reply."""
+    return mock_chat_completion_v1(*args, **kwargs)
+
+
+def _mock_chunk(content: str) -> ChatCompletionChunk:
+    """Build one canned streaming chunk whose delta carries ``content``."""
+    return ChatCompletionChunk(
+        id="chatcmpl-998d9b96-0b71-41f5-b910-dd3bc00f38c6",
+        object="chat.completion.chunk",
+        created=1713474736,
+        model="google/gemma-7b",
+        choices=[
+            ChunkChoice(
+                finish_reason="stop",
+                index=0,
+                delta=ChoiceDelta(
+                    content=content,
+                    function_call=None,
+                    role="assistant",
+                    tool_calls=None,
+                ),
+            )
+        ],
+    )
+
+
+def mock_chat_completion_stream_v1(
+    *args: Any, **kwargs: Any
+) -> Generator[ChatCompletionChunk, None, None]:
+    """Yield two canned chunks ("Test", then "Second Test"); arguments are ignored."""
+    for content in ("Test", "Second Test"):
+        yield _mock_chunk(content)
+
+
+async def mock_async_chat_completion_stream_v1(
+    *args: Any, **kwargs: Any
+) -> AsyncGenerator[ChatCompletionChunk, None]:
+    """Return an async generator over the canned streaming chunks.
+
+    This coroutine *returns* the generator instead of being one itself, so the
+    tests can ``await`` the mocked ``create`` call and then ``async for`` over
+    its result.
+    """
+
+    async def gen() -> AsyncGenerator[ChatCompletionChunk, None]:
+        for response in mock_chat_completion_stream_v1(*args, **kwargs):
+            yield response
+
+    return gen()
+
+
+@patch("llama_index.llms.nvidia.base.SyncOpenAI")
+def test_chat_model_basic(MockSyncOpenAI: MagicMock) -> None:
+    """``complete`` and ``chat`` both surface the canned completion text."""
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockSyncOpenAI.return_value
+        mock_instance.chat.completions.create.return_value = mock_chat_completion_v1()
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response = llm.complete(prompt)
+        assert response.text == "Cool Test Message"
+
+        chat_response = llm.chat([message])
+        assert chat_response.message.content == "Cool Test Message"
+
+
+@patch("llama_index.llms.nvidia.base.SyncOpenAI")
+def test_chat_model_streaming(MockSyncOpenAI: MagicMock) -> None:
+    """``stream_complete`` and ``stream_chat`` accumulate both canned deltas."""
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockSyncOpenAI.return_value
+        # ``side_effect`` gives every ``create`` call a fresh generator; a single
+        # ``return_value`` generator would be exhausted after the first stream.
+        mock_instance.chat.completions.create.side_effect = (
+            mock_chat_completion_stream_v1
+        )
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response_gen = llm.stream_complete(prompt)
+        responses = list(response_gen)
+        assert responses[-1].text == "TestSecond Test"
+
+        chat_response_gen = llm.stream_chat([message])
+        chat_responses = list(chat_response_gen)
+        assert chat_responses[-1].message.content == "TestSecond Test"
+        assert chat_responses[-1].message.role == "assistant"
+
+
+@pytest.mark.asyncio()
+@patch("llama_index.llms.nvidia.base.AsyncOpenAI")
+async def test_async_chat_model_basic(MockAsyncOpenAI: MagicMock) -> None:
+    """``acomplete`` and ``achat`` both surface the canned completion text."""
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockAsyncOpenAI.return_value
+        create_fn = AsyncMock()
+        create_fn.side_effect = mock_async_chat_completion_v1
+        mock_instance.chat.completions.create = create_fn
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response = await llm.acomplete(prompt)
+        assert response.text == "Cool Test Message"
+
+        chat_response = await llm.achat([message])
+        assert chat_response.message.content == "Cool Test Message"
+
+
+@pytest.mark.asyncio()
+@patch("llama_index.llms.nvidia.base.AsyncOpenAI")
+async def test_async_streaming_chat_model(MockAsyncOpenAI: MagicMock) -> None:
+    """``astream_complete`` and ``astream_chat`` accumulate both canned deltas."""
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        mock_instance = MockAsyncOpenAI.return_value
+        create_fn = AsyncMock()
+        create_fn.side_effect = mock_async_chat_completion_stream_v1
+        mock_instance.chat.completions.create = create_fn
+
+        llm = NVIDIA()
+        prompt = "test prompt"
+        message = ChatMessage(role="user", content="test message")
+
+        response_gen = await llm.astream_complete(prompt)
+        responses = [response async for response in response_gen]
+        assert responses[-1].text == "TestSecond Test"
+
+        chat_response_gen = await llm.astream_chat([message])
+        chat_responses = [response async for response in chat_response_gen]
+        assert chat_responses[-1].message.content == "TestSecond Test"
+
+
+def test_validates_api_key_is_present() -> None:
+    """``NVIDIA`` accepts a key from the environment or from ``api_key``."""
+    with CachedNVIDIApiKeys(set_fake_key=True):
+        assert NVIDIA()
+
+        os.environ["NVIDIA_API_KEY"] = ""
+
+        assert NVIDIA(api_key=FAKE_NVIDIA_API_KEY)