-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Expand file tree
/
Copy pathtry_evals_iterator_async_span.py
More file actions
82 lines (61 loc) · 2.2 KB
/
try_evals_iterator_async_span.py
File metadata and controls
82 lines (61 loc) · 2.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Manual smoke test: async + span-level metric.
python try_evals_iterator_async_span.py
Metric is declared on @observe(metrics=[...]) and evaluated on the span.
"""
from __future__ import annotations
import asyncio
import random
import time
from typing import List
from deepeval.dataset import EvaluationDataset, Golden
from deepeval.evaluate.configs import AsyncConfig, DisplayConfig
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams
from deepeval.tracing import observe
AGENT_DELAY_SEC = 0.6
METRIC_DELAY_SEC = 0.4
class RandomScoreMetric(BaseMetric):
    """Toy metric that assigns a uniform-random score after a short delay.

    Declares INPUT as its only required test-case parameter and succeeds
    whenever the drawn score meets ``threshold``. Both sync and async
    measurement paths sleep for METRIC_DELAY_SEC to simulate real work.
    """

    threshold: float = 0.5
    async_mode: bool = True
    _required_params: List[LLMTestCaseParams] = [LLMTestCaseParams.INPUT]

    def __init__(self, threshold: float = 0.5):
        self.threshold = threshold

    def _finalize(self) -> float:
        # Draw the score once and derive success/reason from it in one place.
        drawn = random.random()
        self.score = drawn
        self.success = drawn >= self.threshold
        self.reason = f"random score {drawn:.3f}"
        return drawn

    def measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        # Synchronous path: blocking sleep stands in for metric computation.
        time.sleep(METRIC_DELAY_SEC)
        return self._finalize()

    async def a_measure(self, test_case: LLMTestCase, *args, **kwargs) -> float:
        # Async path: non-blocking sleep so concurrent evaluation overlaps.
        await asyncio.sleep(METRIC_DELAY_SEC)
        return self._finalize()

    def is_successful(self) -> bool:
        return bool(self.success)

    @property
    def __name__(self):
        # deepeval reads __name__ for display purposes.
        return "RandomScore"
# Five fixed questions, each prefixed with its index so traces are easy to match.
_BASE_QUESTIONS = [
    "What is the capital of France?",
    "Who wrote Hamlet?",
    "What is 2 + 2?",
    "Define entropy.",
    "What is the speed of light?",
]
QUESTIONS = [f"[{idx}] {text}" for idx, text in enumerate(_BASE_QUESTIONS)]
@observe(type="agent", name="span_metric_agent", metrics=[RandomScoreMetric()])
def agent(question: str) -> str:
    """Fake agent: waits AGENT_DELAY_SEC, then returns a canned answer.

    The span-level RandomScoreMetric declared on @observe is evaluated
    against this span's input/output.
    """
    time.sleep(AGENT_DELAY_SEC)
    answer = f"Answer to {question!r} is 42."
    return answer
if __name__ == "__main__":
dataset = EvaluationDataset(goldens=[Golden(input=q) for q in QUESTIONS])
for golden in dataset.evals_iterator(
async_config=AsyncConfig(run_async=True),
display_config=DisplayConfig(show_indicator=True, verbose_mode=False),
):
agent(golden.input)