
Commit 9614828

Add support for AsyncPage

Signed-off-by: Derek Higgins <[email protected]>
1 parent 707fd0b

35 files changed: +15701 -9 lines
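For context, the behavior this commit teaches the inference recorder to handle is the OpenAI client's paginated models.list() call, which the newly added vLLM CI jobs appear to exercise. A minimal sketch of that call, assuming a locally running OpenAI-compatible server (the base_url and printed model ids are illustrative, not part of the commit):

    import asyncio

    from openai import AsyncOpenAI


    async def main() -> None:
        # Illustrative endpoint: a local vLLM server exposes an OpenAI-compatible API.
        client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

        # models.list() returns an AsyncPaginator; awaiting it yields an AsyncPage[Model] ...
        page = await client.models.list()
        print([m.id for m in page.data])

        # ... and the same paginator can be consumed directly with `async for`.
        async for model in client.models.list():
            print(model.id)


    asyncio.run(main())

Both consumption styles matter below: the recorder has to preserve them when it wraps the client.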

.github/actions/run-and-record-tests/action.yml

Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ runs:
         shell: bash
         run: |
           sudo docker logs ollama > ollama-${{ inputs.inference-mode }}.log || true
+          sudo docker logs vllm > vllm-${{ inputs.inference-mode }}.log || true

     - name: Upload logs
       if: ${{ always() }}

.github/workflows/integration-tests.yml

Lines changed: 1 addition & 2 deletions

@@ -20,7 +20,6 @@ on:
   schedule:
     # If changing the cron schedule, update the provider in the test-matrix job
     - cron: '0 0 * * *' # (test latest client) Daily at 12 AM UTC
-    - cron: '1 0 * * 0' # (test vllm) Weekly on Sunday at 1 AM UTC
   workflow_dispatch:
     inputs:
       test-all-client-versions:
@@ -47,7 +46,7 @@ jobs:
       matrix:
         client-type: [library, server]
         # Use vllm on weekly schedule, otherwise use test-provider input (defaults to ollama)
-        provider: ${{ (github.event.schedule == '1 0 * * 0') && fromJSON('["vllm"]') || fromJSON(format('["{0}"]', github.event.inputs.test-provider || 'ollama')) }}
+        provider: [ollama, vllm]
         # Use Python 3.13 only on nightly schedule (daily latest client test), otherwise use 3.12
         python-version: ${{ github.event.schedule == '0 0 * * *' && fromJSON('["3.12", "3.13"]') || fromJSON('["3.12"]') }}
         client-version: ${{ (github.event.schedule == '0 0 * * *' || github.event.inputs.test-all-client-versions == 'true') && fromJSON('["published", "latest"]') || fromJSON('["latest"]') }}

.github/workflows/record-integration-tests.yml

Lines changed: 13 additions & 6 deletions

@@ -31,6 +31,9 @@ jobs:
     if: contains(github.event.pull_request.labels.*.name, 're-record-tests') ||
       contains(github.event.pull_request.labels.*.name, 're-record-vision-tests')
     runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        provider: [ollama, vllm]
     outputs:
       test-types: ${{ steps.generate-test-types.outputs.test-types }}
       matrix-modes: ${{ steps.generate-test-types.outputs.matrix-modes }}
@@ -42,17 +45,21 @@ jobs:
       - name: Generate test types
         id: generate-test-types
         run: |
-          # Get test directories dynamically, excluding non-test directories
-          TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
-            grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
-            sort | jq -R -s -c 'split("\n")[:-1]')
-          echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+          if [ "${{ matrix.provider }}" == "vllm" ]; then
+            echo "test-types=[\"inference\"]" >> $GITHUB_OUTPUT
+          else
+            # Get test directories dynamically, excluding non-test directories
+            TEST_TYPES=$(find tests/integration -maxdepth 1 -mindepth 1 -type d -printf "%f\n" |
+              grep -Ev "^(__pycache__|fixtures|test_cases|recordings|post_training)$" |
+              sort | jq -R -s -c 'split("\n")[:-1]')
+            echo "test-types=$TEST_TYPES" >> $GITHUB_OUTPUT
+          fi

           labels=$(gh pr view ${{ github.event.pull_request.number }} --json labels --jq '.labels[].name')
           echo "labels=$labels"

           modes_array=()
-          if [[ $labels == *"re-record-vision-tests"* ]]; then
+          if [[ $labels == *"re-record-vision-tests"* ]] && [[ "${{ matrix.provider }}" == "ollama" ]]; then
             modes_array+=("vision")
           fi
           if [[ $labels == *"re-record-tests"* ]]; then

llama_stack/testing/inference_recorder.py

Lines changed: 80 additions & 1 deletion

@@ -108,13 +108,29 @@ def _deserialize_response(data: dict[str, Any]) -> Any:
     try:
         # Import the original class and reconstruct the object
         module_path, class_name = data["__type__"].rsplit(".", 1)
+
+        # Handle generic types (e.g. AsyncPage[Model]) by removing the generic part
+        if "[" in class_name and "]" in class_name:
+            class_name = class_name.split("[")[0]
+
         module = __import__(module_path, fromlist=[class_name])
         cls = getattr(module, class_name)

         if not hasattr(cls, "model_validate"):
             raise ValueError(f"Pydantic class {cls} does not support model_validate?")

-        return cls.model_validate(data["__data__"])
+        # Special handling for AsyncPage - convert nested model dicts to proper model objects
+        validate_data = data["__data__"]
+        if class_name == "AsyncPage" and isinstance(validate_data, dict) and "data" in validate_data:
+            # Convert model dictionaries to objects with attributes so they work with .id access
+            from types import SimpleNamespace
+
+            validate_data = dict(validate_data)
+            validate_data["data"] = [
+                SimpleNamespace(**item) if isinstance(item, dict) else item for item in validate_data["data"]
+            ]
+
+        return cls.model_validate(validate_data)
     except (ImportError, AttributeError, TypeError, ValueError) as e:
         logger.warning(f"Failed to deserialize object of type {data['__type__']}: {e}")
         return data["__data__"]
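To make the hunk above concrete, here is a small, self-contained sketch of the transformation it performs. The recorded `__type__` string and payload shape are assumptions inferred from the code, not shown in the diff:

    from types import SimpleNamespace

    # Assumed shape of a recorded models.list() response (illustrative).
    data = {
        "__type__": "openai.pagination.AsyncPage[Model]",
        "__data__": {"object": "list", "data": [{"id": "llama-3.1-8b", "object": "model"}]},
    }

    # The generic suffix is stripped so the class can be imported by name.
    module_path, class_name = data["__type__"].rsplit(".", 1)
    class_name = class_name.split("[")[0]  # "AsyncPage[Model]" -> "AsyncPage"
    print(module_path, class_name)  # openai.pagination AsyncPage

    # Nested dicts become attribute-accessible objects, so callers can use `.id`.
    items = [SimpleNamespace(**item) for item in data["__data__"]["data"]]
    print(items[0].id)  # llama-3.1-8b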
@@ -332,9 +348,11 @@ def patch_inference_clients():
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
     from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

     # Store original methods for both OpenAI and Ollama clients
     _original_methods = {
+        "model_list": AsyncModels.list,
         "chat_completions_create": AsyncChatCompletions.create,
         "completions_create": AsyncCompletions.create,
         "embeddings_create": AsyncEmbeddings.create,
@@ -347,6 +365,64 @@ def patch_inference_clients():
     }

     # Create patched methods for OpenAI client
+    def patched_model_list(self, *args, **kwargs):
+        # The original models.list() returns an AsyncPaginator that can be used with async for
+        # We need to create a wrapper that preserves this behavior
+        class PatchedAsyncPaginator:
+            def __init__(self, original_method, instance, client_type, endpoint, args, kwargs):
+                self.original_method = original_method
+                self.instance = instance
+                self.client_type = client_type
+                self.endpoint = endpoint
+                self.args = args
+                self.kwargs = kwargs
+                self._result = None
+
+            def __await__(self):
+                # Make it awaitable like the original AsyncPaginator
+                async def _await():
+                    self._result = await _patched_inference_method(
+                        self.original_method, self.instance, self.client_type, self.endpoint, *self.args, **self.kwargs
+                    )
+                    return self._result
+
+                return _await().__await__()
+
+            def __aiter__(self):
+                # Make it async iterable like the original AsyncPaginator
+                return self
+
+            async def __anext__(self):
+                # Get the result if we haven't already
+                if self._result is None:
+                    self._result = await _patched_inference_method(
+                        self.original_method, self.instance, self.client_type, self.endpoint, *self.args, **self.kwargs
+                    )
+
+                # Initialize iteration on first call
+                if not hasattr(self, "_iter_index"):
+                    # Extract the data list from the result
+                    if hasattr(self._result, "data") and isinstance(self._result.data, list):
+                        self._data_list = self._result.data
+                    elif isinstance(self._result, list):
+                        self._data_list = self._result
+                    else:
+                        # Not a list-like response, return it once
+                        if hasattr(self, "_returned"):
+                            raise StopAsyncIteration
+                        self._returned = True
+                        return self._result
+                    self._iter_index = 0
+
+                # Return next item from the list
+                if self._iter_index >= len(self._data_list):
+                    raise StopAsyncIteration
+                item = self._data_list[self._iter_index]
+                self._iter_index += 1
+                return item
+
+        return PatchedAsyncPaginator(_original_methods["model_list"], self, "openai", "/v1/models", args, kwargs)

     async def patched_chat_completions_create(self, *args, **kwargs):
         return await _patched_inference_method(
             _original_methods["chat_completions_create"], self, "openai", "/v1/chat/completions", *args, **kwargs
@@ -363,6 +439,7 @@ async def patched_embeddings_create(self, *args, **kwargs):
         )

     # Apply OpenAI patches
+    AsyncModels.list = patched_model_list
     AsyncChatCompletions.create = patched_chat_completions_create
     AsyncCompletions.create = patched_completions_create
     AsyncEmbeddings.create = patched_embeddings_create
@@ -419,8 +496,10 @@ def unpatch_inference_clients():
     from openai.resources.chat.completions import AsyncCompletions as AsyncChatCompletions
     from openai.resources.completions import AsyncCompletions
     from openai.resources.embeddings import AsyncEmbeddings
+    from openai.resources.models import AsyncModels

     # Restore OpenAI client methods
+    AsyncModels.list = _original_methods["model_list"]
     AsyncChatCompletions.create = _original_methods["chat_completions_create"]
     AsyncCompletions.create = _original_methods["completions_create"]
     AsyncEmbeddings.create = _original_methods["embeddings_create"]
(A 12 KB binary file is also changed; contents not shown.)
