-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Expand file tree
/
Copy pathtry_evals_iterator_async_span.py
More file actions
82 lines (61 loc) · 2.2 KB
/
try_evals_iterator_async_span.py
File metadata and controls
82 lines (61 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Manual smoke test: async + span-level metric.
python try_evals_iterator_async_span.py
Metric is declared on @observe(metrics=[...]) and evaluated on the span.
"""
from __future__ import annotations
import asyncio
import random
import time
from typing import List
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.tracing import observe
AGENT_DELAY_SEC = 0.6
METRIC_DELAY_SEC = 0.4
class RandomScoreMetric(BaseMetric):
    """Toy metric that assigns a uniform-random score after a short delay.

    Declares INPUT as its only required test-case parameter and succeeds
    whenever the drawn score meets ``threshold``. Both sync and async
    measurement paths sleep for METRIC_DELAY_SEC to simulate real work.
    """

    threshold: float = 0.5
    async_mode: bool = True
    _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _finalize(self) -> float:
        # Draw the score once and derive success/reason from it in one place.
        drawn = random.random()
        self.score = drawn
        self.success = drawn >= self.threshold
        self.reason = f"random score {drawn:.3f}"
        return drawn

    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        # Synchronous path: blocking sleep stands in for metric computation.
        time.sleep(METRIC_DELAY_SEC)
        return self._finalize()

    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        # Async path: non-blocking sleep so concurrent evaluation overlaps.
        await asyncio.sleep(METRIC_DELAY_SEC)
        return self._finalize()

    def is_successful(self) -> bool:
        return bool(self.success)

    @property
    def __name__(self):
        # deepeval reads __name__ for display purposes.
        return "RandomScore"
# Five fixed questions, each prefixed with its index so traces are easy to match.
_BASE_QUESTIONS = [
    "What is the capital of France?",
    "Who wrote Hamlet?",
    "What is 2 + 2?",
    "Define entropy.",
    "What is the speed of light?",
]
QUESTIONS = [f"[{idx}] {text}" for idx, text in enumerate(_BASE_QUESTIONS)]
@observe(type="agent", name="span_metric_agent", metrics=[RandomScoreMetric()])
def agent(question: str) -> str:
    """Fake agent: waits AGENT_DELAY_SEC, then returns a canned answer.

    The span-level RandomScoreMetric declared on @observe is evaluated
    against this span's input/output.
    """
    time.sleep(AGENT_DELAY_SEC)
    answer = f"Answer to {question!r} is 42."
    return answer
if __name__ == "__main__":
dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
for golden in dataset.evals_iterator(
async_config=AsyncConfig(run_async=True),
display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
):
agent(golden.input)