
Commit f837c47

NVIDIARerank add http_client parameter to pass custom clients (#17832)
1 parent a59dce8 commit f837c47

File tree

8 files changed: +190 -89 lines changed

llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/README.md

Lines changed: 17 additions & 0 deletions
@@ -103,3 +103,20 @@ nodes = parser.get_nodes_from_documents(documents)
 # rerank
 rerank.postprocess_nodes(nodes, query_str=query)
 ```
+
+### Custom HTTP Client
+
+If you need more control over HTTP settings (e.g., timeouts, proxies, retries), you can pass your own `httpx.Client` instance to the `NVIDIARerank` initializer:
+
+```python
+import httpx
+from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
+
+# Create a custom httpx client with a 10-second timeout
+custom_client = httpx.Client(timeout=10.0)
+
+# Pass the custom client to the reranker
+rerank = NVIDIARerank(
+    base_url="http://localhost:1976/v1", http_client=custom_client
+)
+```
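The README example above configures only a timeout. As a hedged sketch, the other settings it mentions (proxies, retries) can live on the same client via a custom transport; the proxy URL and retry count below are illustrative placeholders, and the `proxy`/`retries` arguments to `httpx.HTTPTransport` assume a reasonably recent httpx release:

```python
import httpx
from llama_index.postprocessor.nvidia_rerank import NVIDIARerank

# Illustrative values only; substitute your real proxy URL and limits.
custom_client = httpx.Client(
    # Overall 10-second limit, but allow only 5 seconds to connect
    timeout=httpx.Timeout(10.0, connect=5.0),
    # Route traffic through a proxy and retry failed connection attempts
    transport=httpx.HTTPTransport(
        retries=3,
        proxy="http://proxy.example.com:8080",
    ),
)

rerank = NVIDIARerank(
    base_url="http://localhost:1976/v1", http_client=custom_client
)
```

Note that `retries` here covers connection errors only, not failed responses; response-level retry policies would need a wrapper or a third-party transport.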

llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/llama_index/postprocessor/nvidia_rerank/base.py

Lines changed: 99 additions & 26 deletions
@@ -1,6 +1,7 @@
 from typing import Any, List, Optional, Generator, Literal
 import os
 from urllib.parse import urlparse, urlunparse
+import httpx
 
 from llama_index.core.bridge.pydantic import Field, PrivateAttr, ConfigDict
 from llama_index.core.callbacks import CBEventType, EventPayload
@@ -11,10 +12,16 @@
 )
 from llama_index.core.postprocessor.types import BaseNodePostprocessor
 from llama_index.core.schema import MetadataMode, NodeWithScore, QueryBundle
-import requests
 import warnings
 from llama_index.core.base.llms.generic_utils import get_from_param_or_env
 
+from .utils import (
+    RANKING_MODEL_TABLE,
+    BASE_URL,
+    DEFAULT_MODEL,
+    Model,
+    determine_model,
+)
 from .utils import (
     RANKING_MODEL_TABLE,
     BASE_URL,
@@ -56,13 +63,15 @@ class NVIDIARerank(BaseNodePostprocessor):
     _mode: str = PrivateAttr("nvidia")
     _is_hosted: bool = PrivateAttr(True)
     base_url: Optional[str] = None
+    _http_client: Optional[httpx.Client] = PrivateAttr(None)
 
     def __init__(
         self,
         model: Optional[str] = None,
         nvidia_api_key: Optional[str] = None,
         api_key: Optional[str] = None,
         base_url: Optional[str] = os.getenv("NVIDIA_BASE_URL", BASE_URL),
+        http_client: Optional[httpx.Client] = None,
         **kwargs: Any,
     ):
         """
@@ -75,6 +84,7 @@ def __init__(
             nvidia_api_key (str, optional): The NVIDIA API key. Defaults to None.
             api_key (str, optional): The API key. Defaults to None.
             base_url (str, optional): The base URL of the on-premises NIM. Defaults to None.
+            http_client (httpx.Client, optional): Custom HTTP client for making requests.
             truncate (str): "NONE", "END", truncate input text if it exceeds
                 the model's context length. Default is model dependent and
                 is likely to raise an error if an input is too long.
@@ -87,6 +97,8 @@ def __init__(
         model = model or DEFAULT_MODEL
         super().__init__(model=model, **kwargs)
 
+        self._is_hosted = base_url in KNOWN_URLS
+        self.base_url = base_url
         self._is_hosted = base_url in KNOWN_URLS
         self.base_url = base_url
         self._api_key = get_from_param_or_env(
@@ -95,12 +107,11 @@ def __init__(
             "NVIDIA_API_KEY",
             "NO_API_KEY_PROVIDED",
         )
-
         if self._is_hosted:  # hosted on API Catalog (build.nvidia.com)
             if (not self._api_key) or (self._api_key == "NO_API_KEY_PROVIDED"):
                 raise ValueError("An API key is required for hosted NIM.")
         else:  # not hosted
-            self.base_url = self._validate_url(base_url)
+            self.base_url = self._validate_url(self.base_url)
 
         self.model = model
         if not self.model:
@@ -110,10 +121,9 @@ def __init__(
             self.__get_default_model()
 
         if not self.model.startswith("nvdev/"):
-            # allow internal models
-            # TODO: add test case for this
             self._validate_model(self.model)  ## validate model
-        self.base_url = base_url
+
+        self._http_client = http_client
 
     def __get_default_model(self):
         """Set default model."""
@@ -136,24 +146,30 @@ def __get_default_model(self):
         else:
             self.model = DEFAULT_MODEL
 
+    @property
+    def normalized_base_url(self) -> str:
+        """Return the normalized base URL (without trailing slashes)."""
+        return self.base_url.rstrip("/")
+
+    def _get_headers(self, auth_required: bool = False) -> dict:
+        """Return default headers for HTTP requests.
+
+        If auth_required is True or the client is hosted, includes an Authorization header.
+        """
+        headers = {"Accept": "application/json"}
+        if auth_required or self._is_hosted:
+            headers["Authorization"] = f"Bearer {self._api_key}"
+        return headers
+
     def _get_models(self) -> List[Model]:
-        session = requests.Session()
-        self.base_url = self.base_url.rstrip("/") + "/"
-        if self._is_hosted:
-            _headers = {
-                "Authorization": f"Bearer {self._api_key}",
-                "Accept": "application/json",
-            }
-        else:
-            _headers = {
-                "Accept": "application/json",
-            }
+        client = self.client
+        _headers = self._get_headers(auth_required=self._is_hosted)
         url = (
             "https://integrate.api.nvidia.com/v1/models"
             if self._is_hosted
-            else self.base_url.rstrip("/") + "/models"
+            else self.normalized_base_url + "/models"
         )
-        response = session.get(url, headers=_headers)
+        response = client.get(url, headers=_headers)
         response.raise_for_status()
 
         assert (
@@ -181,6 +197,18 @@ def _get_models(self) -> List[Model]:
             ]
         else:
             return RANKING_MODEL_TABLE
+        # TODO: hosted now has a model listing, need to merge known and listed models
+        # TODO: parse model config for local models
+        if not self._is_hosted:
+            return [
+                Model(
+                    id=model["id"],
+                    base_model=getattr(model, "params", {}).get("root", None),
+                )
+                for model in response.json()["data"]
+            ]
+        else:
+            return RANKING_MODEL_TABLE
 
     def _validate_url(self, base_url):
         """
@@ -190,10 +218,37 @@ def _validate_url(self, base_url):
         emit a warning. old documentation told users to pass in the full
         inference url, which is incorrect and prevents model listing from working.
         normalize base_url to end in /v1.
+        validate the base_url.
+        if the base_url is not a url, raise an error
+        if the base_url does not end in /v1, e.g. /embeddings
+        emit a warning. old documentation told users to pass in the full
+        inference url, which is incorrect and prevents model listing from working.
+        normalize base_url to end in /v1.
         """
         if base_url is not None:
             parsed = urlparse(base_url)
 
+            # Ensure scheme and netloc (domain name) are present
+            if not (parsed.scheme and parsed.netloc):
+                expected_format = "Expected format is: http://host:port"
+                raise ValueError(
+                    f"Invalid base_url format. {expected_format} Got: {base_url}"
+                )
+
+            normalized_path = parsed.path.rstrip("/")
+            if not normalized_path.endswith("/v1"):
+                warnings.warn(
+                    f"{base_url} does not end in /v1, you may "
+                    "have inference and listing issues"
+                )
+                normalized_path += "/v1"
+
+            base_url = urlunparse(
+                (parsed.scheme, parsed.netloc, normalized_path, None, None, None)
+            )
+        if base_url is not None:
+            parsed = urlparse(base_url)
+
             # Ensure scheme and netloc (domain name) are present
             if not (parsed.scheme and parsed.netloc):
                 expected_format = "Expected format is: http://host:port"
@@ -228,6 +283,15 @@ def _validate_model(self, model_name: str) -> None:
         model = determine_model(model_name)
         available_model_ids = [model.id for model in self.available_models]
 
+        if not model:
+            if self._is_hosted:
+                warnings.warn(f"Unable to determine validity of {model_name}")
+            else:
+                if model_name not in available_model_ids:
+                    raise ValueError(f"No locally hosted {model_name} was found.")
+        model = determine_model(model_name)
+        available_model_ids = [model.id for model in self.available_models]
+
         if not model:
             if self._is_hosted:
                 warnings.warn(f"Unable to determine validity of {model_name}")
@@ -238,16 +302,29 @@ def _validate_model(self, model_name: str) -> None:
         if model and model.endpoint:
             self.base_url = model.endpoint
 
+        if model and model.endpoint:
+            self.base_url = model.endpoint
+
     @property
     def available_models(self) -> List[Model]:
         """Get available models."""
         # all available models are in the map
         ids = RANKING_MODEL_TABLE.keys()
+        ids = RANKING_MODEL_TABLE.keys()
         if not self._is_hosted:
             return self._get_models()
         else:
             return [Model(id=id) for id in ids]
 
+    @property
+    def client(self) -> httpx.Client:
+        """
+        Lazy initialization of the HTTP client.
+        """
+        if self._http_client is None:
+            self._http_client = httpx.Client()
+        return self._http_client
+
     @classmethod
     def class_name(cls) -> str:
         return "NVIDIARerank"
@@ -273,12 +350,8 @@ def _postprocess_nodes(
         if len(nodes) == 0:
             return []
 
-        session = requests.Session()
-
-        _headers = {
-            "Authorization": f"Bearer {self._api_key}",
-            "Accept": "application/json",
-        }
+        client = self.client
+        _headers = self._get_headers(auth_required=True)
 
         # TODO: replace with itertools.batched in python 3.12
         def batched(ls: list, size: int) -> Generator[List[NodeWithScore], None, None]:
@@ -305,7 +378,7 @@ def batched(ls: list, size: int) -> Generator[List[NodeWithScore], None, None]:
                     for n in batch
                 ],
             }
-            response = session.post(self.base_url, headers=_headers, json=payloads)
+            response = client.post(self.base_url, headers=_headers, json=payloads)
             response.raise_for_status()
             # expected response format:
             # {
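For orientation: the change above replaces ad-hoc `requests.Session()` objects with a single `httpx.Client` that is created lazily and reused. A minimal standalone sketch of that pattern, with an illustrative class name (this is not the real `NVIDIARerank`):

```python
from typing import Optional

import httpx


class LazyClientSketch:
    """Illustrative reduction of the lazy-client pattern in the diff above."""

    def __init__(self, http_client: Optional[httpx.Client] = None) -> None:
        # None means "build a default client on first use";
        # a caller-supplied client is stored and used as-is.
        self._http_client = http_client

    @property
    def client(self) -> httpx.Client:
        # Created at most once, then reused, so connection pooling and any
        # caller-supplied settings apply to every subsequent request.
        if self._http_client is None:
            self._http_client = httpx.Client()
        return self._http_client


sketch = LazyClientSketch()
assert sketch.client is sketch.client  # the same instance is reused
```

Because `http_client=` merely pre-populates the private attribute, the property returns the caller's client untouched, which is what lets the README's timeout and proxy settings take effect on every request.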

llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/pyproject.toml

Lines changed: 5 additions & 1 deletion
@@ -30,7 +30,7 @@ license = "MIT"
 name = "llama-index-postprocessor-nvidia-rerank"
 packages = [{include = "llama_index/"}]
 readme = "README.md"
-version = "0.4.1"
+version = "0.4.2"
 
 [tool.poetry.dependencies]
 python = ">=3.9,<4.0"
@@ -56,6 +56,10 @@ types-redis = "4.5.5.0"
 types-requests = "2.28.11.8"  # TODO: unpin when mypy>0.991
 types-setuptools = "67.1.0.0"
 
+[tool.poetry.group.test-integration.dependencies]
+responses = "^0.25.6"
+respx = {extras = ["pytest"], version = "^0.22.0"}
+
 [tool.poetry.group.test_integration.dependencies]
 pytest-httpx = "*"
 requests-mock = "^1.12.1"
llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_api_key.py

Lines changed: 4 additions & 10 deletions
@@ -1,23 +1,17 @@
 import os
 
 import pytest
-
+import respx
 from llama_index.postprocessor.nvidia_rerank import NVIDIARerank as Interface
 from llama_index.core.schema import NodeWithScore, Document
 
 from typing import Any
-from requests_mock import Mocker
 
 
 @pytest.fixture()
-def mock_local_models(requests_mock: Mocker) -> None:
-    requests_mock.get(
-        "https://test_url/v1/models",
-        json={
-            "data": [
-                {"id": "model1"},
-            ]
-        },
+def mock_local_models(respx_mock: respx.MockRouter) -> None:
+    respx_mock.get("https://test_url/v1/models").respond(
+        json={"data": [{"id": "model1"}]}
     )
 
 
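The tests migrate from `requests-mock` to respx because the implementation now issues requests through httpx, which `requests-mock` cannot intercept. A self-contained sketch of the same mocking in respx's context-manager style, outside any test suite (URL and payload mirror the fixture above):

```python
import httpx
import respx

# respx intercepts httpx traffic; register a route, then call it.
with respx.mock:
    respx.get("https://test_url/v1/models").respond(
        json={"data": [{"id": "model1"}]}
    )
    response = httpx.get("https://test_url/v1/models")
    assert response.json() == {"data": [{"id": "model1"}]}
```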

llama-index-integrations/postprocessor/llama-index-postprocessor-nvidia-rerank/tests/test_available_models.py

Lines changed: 3 additions & 3 deletions
@@ -1,12 +1,12 @@
 import pytest
 
 from llama_index.postprocessor.nvidia_rerank import NVIDIARerank
-from requests_mock import Mocker
+import respx
 
 
 @pytest.fixture(autouse=True)
-def mock_local_models(requests_mock: Mocker) -> None:
-    requests_mock.get(
+def mock_local_models(respx_mock: respx.MockRouter) -> None:
+    respx_mock.get(
         "https://test_url/v1/models",
         json={
             "data": [
