Skip to content

Commit 1813d24

Browse files
authored
Merge pull request #61 from dmaniloff/tests_overhaul
Tests overhaul
2 parents 95882df + ba148d2 commit 1813d24

File tree

9 files changed

+653
-533
lines changed

9 files changed

+653
-533
lines changed

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,4 +32,4 @@ repos:
3232
language: system
3333
pass_filenames: false
3434
always_run: true
35-
args: [-c, 'KUBEFLOW_BASE_IMAGE=dummy uv run pytest -v -m "not lls_integration and not kfp_integration" --tb=short --maxfail=3; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret']
35+
args: [-c, 'KUBEFLOW_BASE_IMAGE=dummy uv run pytest -v -m "unit or lls_integration" --tb=short --maxfail=3; ret=$?; [ $ret = 5 ] && exit 0 || exit $ret']

pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,9 @@ addopts = "-v"
6464
log_cli = true
6565
log_cli_level = "INFO"
6666
markers = [
67+
"unit: Unit tests for wrapper classes (mocked client by default)",
6768
"lls_integration: Llama Stack integration tests",
68-
"kfp_integration: Kubeflow Pipelines integration tests",
69+
"e2e: End-to-end tests against a deployed Llama Stack distribution on OpenShift",
6970
]
7071

7172
[tool.ruff]

tests/TESTING.md

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
# Testing
2+
3+
All test files live under `tests/`. Shared evaluation logic (smoke checks, eval job polling) is factored into `base_eval_tests.py`, which is not collected by pytest directly.
4+
5+
## Unit tests (`test_remote_wrappers.py`, pytest marker `unit`)
6+
7+
Tests the LangChain-compatible wrapper classes (`LlamaStackRemoteLLM` and `LlamaStackRemoteEmbeddings`) that the remote provider uses for inference. By default, the `LlamaStackClient` is mocked — no running server is required.
8+
9+
```bash
10+
uv run pytest tests/test_remote_wrappers.py
11+
```
12+
13+
Pass `--no-mock-client` to use a real `LlamaStackClient` against a running Llama Stack server (defaults to `http://localhost:8321`). Model IDs can be overridden with `INFERENCE_MODEL` and `EMBEDDING_MODEL`.
14+
15+
```bash
16+
uv run pytest tests/test_remote_wrappers.py --no-mock-client
17+
```
18+
19+
## Integration tests (`test_inline_evaluation.py`, pytest marker `lls_integration`)
20+
21+
Tests the eval providers through an in-process Llama Stack server using `LlamaStackAsLibraryClient`. The stack configuration (providers, models, storage) is built entirely in fixtures. By default, Ollama connectivity and inference are mocked.
22+
23+
```bash
24+
uv run pytest tests/test_inline_evaluation.py
25+
```
26+
27+
Pass `--no-mock-inference` to use a real Ollama instance for inference:
28+
29+
```bash
30+
INFERENCE_MODEL=ollama/granite3.3:2b \
31+
EMBEDDING_MODEL=ollama/all-minilm:latest \
32+
uv run pytest tests/test_inline_evaluation.py --no-mock-inference
33+
```
34+
35+
## End-to-end tests (`test_e2e.py`, pytest marker `e2e`)
36+
37+
Tests against a fully deployed Llama Stack distribution on an OpenShift cluster. Requires the cluster environment from `cluster-deployment/` to be set up and a port-forward to the Llama Stack service:
38+
39+
```bash
40+
oc port-forward -n ragas-test svc/lsd-ragas-test-service 8321:8321
41+
uv run pytest tests/test_e2e.py
42+
```
43+
44+
These tests exercise both the inline and remote eval providers through the Llama Stack eval API, including dataset registration, benchmark creation, and eval job execution with result polling.
45+
46+
## Model configuration
47+
48+
Each test module defines its own `inference_model` and `embedding_model` fixtures with defaults appropriate to its backend:
49+
50+
| Module | Inference default | Embedding default | Backend |
51+
|--------|-------------------|-------------------|---------|
52+
| `test_inline_evaluation.py` | `ollama/granite3.3:2b` | `ollama/all-minilm:latest` | In-process Ollama (library client) |
53+
| `test_remote_wrappers.py` | `litellm/Mistral-Small-24B-W8A8` | `nomic-ai/nomic-embed-text-v1.5` | Mocked `LlamaStackClient` |
54+
| `test_e2e.py` | `Mistral-Small-24B-W8A8` | `nomic-ai/nomic-embed-text-v1.5` | OpenShift cluster (see `cluster-deployment/manifests/configmap-and-secrets.yaml`) |
55+
56+
The `INFERENCE_MODEL` and `EMBEDDING_MODEL` environment variables override these defaults across all suites. When overriding, ensure the values match the models registered in the target environment — e.g. e2e defaults must match the OpenShift configmap, and inline defaults must use the `ollama/` prefix expected by the library client config.
57+
58+
## Cluster deployment (`cluster-deployment/`)
59+
60+
Contains the Containerfile, deployment/teardown scripts, and Kubernetes manifests needed to stand up the e2e test environment on OpenShift. See `cluster-deployment/deploy-e2e.sh` to deploy.

tests/base_eval_tests.py

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
"""Shared test helpers for Llama Stack eval provider tests.
2+
3+
Provides ``SmokeTester`` and ``EvalTester``, plain helper classes that
4+
encapsulate common assertions (model/dataset/benchmark registration) and
5+
eval-job execution logic (run, poll, verify scores). Test modules
6+
instantiate them via fixtures, supplying the appropriate client and
7+
configuration for each environment (in-process library client or remote
8+
``LlamaStackClient``).
9+
"""
10+
11+
import time
12+
13+
from rich import print as pprint
14+
15+
16+
class SmokeTester:
    """Quick registration sanity checks against a Llama Stack client.

    Verifies that the providers, models, dataset, and benchmarks the eval
    tests depend on are actually registered on the target stack before any
    eval job is launched.
    """

    def __init__(self, client, dataset_id, inline_benchmark_id, remote_benchmark_id):
        self.client = client
        self.dataset_id = dataset_id
        self.inline_benchmark_id = inline_benchmark_id
        self.remote_benchmark_id = remote_benchmark_id

    def test_providers_registered(self):
        """At least one provider exists, and one of them serves the eval API."""
        registered = self.client.providers.list()
        assert len(registered) > 0
        assert any(provider.api == "eval" for provider in registered)
        pprint("Providers:", registered)

    def test_models_registered(self):
        """At least one model is registered on the stack."""
        available = self.client.models.list()
        pprint("Models:", available)
        assert len(available) > 0, "No models registered"

    def test_datasets_registered(self):
        """The configured dataset id appears among the registered datasets."""
        registered = self.client.beta.datasets.list()
        pprint("Datasets:", registered)
        known_ids = [entry.identifier for entry in registered]
        assert self.dataset_id in known_ids, (
            f"Dataset '{self.dataset_id}' not found. Available: {known_ids}"
        )

    def test_benchmarks_registered(self):
        """Both the inline and remote benchmark ids are registered."""
        registered = self.client.alpha.benchmarks.list()
        pprint("Benchmarks:", registered)
        known_ids = [entry.identifier for entry in registered]
        assert self.inline_benchmark_id in known_ids, (
            f"Benchmark '{self.inline_benchmark_id}' not found. Available: {known_ids}"
        )
        assert self.remote_benchmark_id in known_ids, (
            f"Benchmark '{self.remote_benchmark_id}' not found. Available: {known_ids}"
        )
52+
53+
54+
class EvalTester:
55+
"""Base evaluation test class."""
56+
57+
def __init__(
58+
self,
59+
client,
60+
inference_model,
61+
dataset_id,
62+
inline_benchmark_id,
63+
remote_benchmark_id,
64+
poll_interval: int = 5,
65+
poll_timeout: int = 300,
66+
):
67+
self.client = client
68+
self.inference_model = inference_model
69+
self.dataset_id = dataset_id
70+
self.inline_benchmark_id = inline_benchmark_id
71+
self.remote_benchmark_id = remote_benchmark_id
72+
self.poll_interval = poll_interval
73+
self.poll_timeout = poll_timeout
74+
75+
def run_eval(
76+
self,
77+
benchmark_id: str,
78+
inference_model: str,
79+
num_examples: int | None = None,
80+
):
81+
"""Run an evaluation job and verify it completes with scores."""
82+
benchmark_config = self._build_benchmark_config(
83+
inference_model, num_examples=num_examples
84+
)
85+
job = self.client.alpha.eval.run_eval(
86+
benchmark_id=benchmark_id,
87+
benchmark_config=benchmark_config,
88+
)
89+
assert job.job_id is not None
90+
assert job.status == "in_progress"
91+
92+
completed = self._wait_for_job(self.client, benchmark_id, job.job_id)
93+
assert completed.status == "completed", (
94+
f"Job finished with status '{completed.status}'"
95+
)
96+
97+
results = self.client.alpha.eval.jobs.retrieve(
98+
benchmark_id=benchmark_id, job_id=job.job_id
99+
)
100+
pprint(f"[{self.__class__.__name__}] Results:", results)
101+
assert results.scores, "Expected non-empty scores"
102+
103+
# -- helpers --------------------------------------------------------
104+
105+
def _build_benchmark_config(
106+
self, inference_model: str, num_examples: int | None = None
107+
) -> dict:
108+
"""Build the ``benchmark_config`` dict for ``run_eval``."""
109+
config: dict = {
110+
"eval_candidate": {
111+
"type": "model",
112+
"model": inference_model,
113+
"sampling_params": {
114+
"temperature": 0.1,
115+
"max_tokens": 100,
116+
},
117+
},
118+
"scoring_params": {},
119+
}
120+
if num_examples is not None:
121+
config["num_examples"] = num_examples
122+
return config
123+
124+
def _wait_for_job(
125+
self, client, benchmark_id: str, job_id: str, timeout: int | None = None
126+
):
127+
"""Poll until the eval job reaches a terminal state."""
128+
timeout = timeout if timeout is not None else self.poll_timeout
129+
deadline = time.time() + timeout
130+
while time.time() < deadline:
131+
job = client.alpha.eval.jobs.status(
132+
benchmark_id=benchmark_id, job_id=job_id
133+
)
134+
pprint(f"[{self.__class__.__name__}] Job status:", job)
135+
if job.status in ("completed", "failed"):
136+
return job
137+
time.sleep(self.poll_interval)
138+
raise TimeoutError(
139+
f"Job {job_id} for benchmark {benchmark_id} "
140+
f"did not complete within {timeout}s"
141+
)

0 commit comments

Comments
 (0)