
Commit 05319b6

wrisazhirafovod authored and committed
Added evaluation span and event. Added log api for event
1 parent ddba8fc commit 05319b6

9 files changed: 281 additions, 22 deletions


instrumentation-genai/opentelemetry-genai-sdk/src/opentelemetry/genai/sdk/api.py

Lines changed: 7 additions & 1 deletion
@@ -25,6 +25,7 @@
 from opentelemetry.metrics import get_meter
 from opentelemetry.trace import get_tracer
 from opentelemetry._events import get_event_logger
+from opentelemetry._logs import get_logger
 from opentelemetry.semconv.schemas import Schemas
 
 
@@ -49,8 +50,13 @@ def __init__(self, exporter_type_full: bool = True, **kwargs):
             __name__, __version__, event_logger_provider=event_logger_provider, schema_url=Schemas.V1_28_0.value
         )
 
+        logger_provider = kwargs.get("logger_provider")
+        self._logger = get_logger(
+            __name__, __version__, logger_provider=logger_provider, schema_url=Schemas.V1_28_0.value
+        )
+
         self._exporter = (
-            SpanMetricEventExporter(tracer=self._tracer, meter=self._meter, event_logger=self._event_logger)
+            SpanMetricEventExporter(tracer=self._tracer, meter=self._meter, event_logger=self._event_logger, logger=self._event_logger)
             if exporter_type_full
             else SpanMetricExporter(tracer=self._tracer, meter=self._meter)
         )
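
The new logger_provider keyword is read from **kwargs and handed to get_logger; when callers omit it, get_logger resolves the globally configured provider. A minimal sketch of that resolution, using only the stable opentelemetry._logs API (the variable names and version string here are illustrative, not part of this SDK):

from opentelemetry._logs import get_logger, set_logger_provider
from opentelemetry.sdk._logs import LoggerProvider

# Configure a global provider once at startup; a provider passed via kwargs wins over it.
provider = LoggerProvider()
set_logger_provider(provider)

# What api.py now does internally: explicit provider if given, else the global one.
explicit = get_logger(__name__, "0.1.0", logger_provider=provider)
implicit = get_logger(__name__, "0.1.0")  # falls back to the provider set above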

instrumentation-genai/opentelemetry-genai-sdk/src/opentelemetry/genai/sdk/evals.py

Lines changed: 74 additions & 9 deletions
@@ -1,5 +1,15 @@
 from abc import ABC, abstractmethod
+from opentelemetry._events import Event
+
 from .types import LLMInvocation
+from opentelemetry import trace
+from opentelemetry.trace import (
+    Tracer,
+)
+from opentelemetry import _events
+from .deepeval import evaluate_answer_relevancy_metric
+from opentelemetry.trace import SpanContext, Span
+from opentelemetry.trace.span import NonRecordingSpan
 
 
 class EvaluationResult:
@@ -22,20 +32,75 @@ def evaluate(self, invocation: LLMInvocation) -> EvaluationResult:
         """
         pass
 
-class DeepEvalsEvaluator(Evaluator):
+class DeepEvalEvaluator(Evaluator):
     """
     Uses DeepEvals library for LLM-as-judge evaluations.
     """
-    def __init__(self, config: dict = None):
+    def __init__(self, event_logger, tracer: Tracer = None, config: dict = None):
         # e.g. load models, setup API keys
         self.config = config or {}
+        self._tracer = tracer or trace.get_tracer(__name__)
+        self._event_logger = event_logger or _events.get_event_logger(__name__)
 
-    def evaluate(self, invocation: LLMInvocation) -> EvaluationResult:
+    def evaluate(self, invocation: LLMInvocation):
         # stub: integrate with deepevals SDK
         # result = deepevals.judge(invocation.prompt, invocation.response, **self.config)
-        score = 0.0  # placeholder
-        details = {"method": "deepevals"}
-        return EvaluationResult(score=score, details=details)
+        human_message = next((msg for msg in invocation.messages if msg.type == "human"), None)
+        content = invocation.chat_generations[0].content
+        if content is not None and content != "":
+            eval_arm = evaluate_answer_relevancy_metric(human_message.content, invocation.chat_generations[0].content, [])
+            self._do_telemetry(invocation.messages[1].content, invocation.chat_generations[0].content,
+                               invocation.span_id, invocation.trace_id, eval_arm)
+
+    def _do_telemetry(self, query, output, parent_span_id, parent_trace_id, eval_arm):
+
+        # emit event
+        body = {
+            "content": f"query: {query} output: {output}",
+        }
+        attributes = {
+            "gen_ai.evaluation.name": "relevance",
+            "gen_ai.evaluation.score": eval_arm.score,
+            "gen_ai.evaluation.reasoning": eval_arm.reason,
+            "gen_ai.evaluation.cost": eval_arm.evaluation_cost,
+        }
+
+        event = Event(
+            name="gen_ai.evaluation.message",
+            attributes=attributes,
+            body=body if body else None,
+            span_id=parent_span_id,
+            trace_id=parent_trace_id,
+        )
+        self._event_logger.emit(event)
+
+        # create span
+        span_context = SpanContext(
+            trace_id=parent_trace_id,
+            span_id=parent_span_id,
+            is_remote=False,
+        )
+
+        span = NonRecordingSpan(
+            context=span_context,
+        )
+
+        tracer = trace.get_tracer(__name__)
+
+        with tracer.start_as_current_span("evaluation relevance") as span:
+            # do evaluation
+
+            span.add_link(span_context, attributes={
+                "gen_ai.operation.name": "evaluation",
+            })
+            span.set_attribute("gen_ai.operation.name", "evaluation")
+            span.set_attribute("gen_ai.evaluation.name", "relevance")
+            span.set_attribute("gen_ai.evaluation.score", eval_arm.score)
+            span.set_attribute("gen_ai.evaluation.label", "Pass")
+            span.set_attribute("gen_ai.evaluation.reasoning", eval_arm.reason)
+            span.set_attribute("gen_ai.evaluation.model", eval_arm.evaluation_model)
+            span.set_attribute("gen_ai.evaluation.cost", eval_arm.evaluation_cost)
+            #span.set_attribute("gen_ai.evaluation.verdict", eval_arm.verdicts)
 
 
 class OpenLitEvaluator(Evaluator):
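
In _do_telemetry the correlation with the originating LLM call happens twice: the Event carries the parent span_id/trace_id directly, while the evaluation span is started under the ambient context and then linked back to the LLM span with add_link. Since start_as_current_span also accepts an explicit parent context and links up front, a hedged sketch of equivalent wiring with only public opentelemetry-api calls might look like this (the function name and ids are stand-ins, not part of this commit):

from opentelemetry import trace
from opentelemetry.trace import Link, SpanContext, TraceFlags
from opentelemetry.trace.span import NonRecordingSpan

def record_evaluation_span(tracer, parent_trace_id: int, parent_span_id: int, score: float):
    # Rebuild the LLM call's SpanContext from the ids captured on the invocation.
    parent_ctx = SpanContext(
        trace_id=parent_trace_id,
        span_id=parent_span_id,
        is_remote=False,
        trace_flags=TraceFlags(TraceFlags.SAMPLED),
    )
    # Parent the evaluation span under the LLM span instead of the ambient context,
    # and declare the link at span start rather than via add_link afterwards.
    parent = trace.set_span_in_context(NonRecordingSpan(parent_ctx))
    with tracer.start_as_current_span(
        "evaluation relevance",
        context=parent,
        links=[Link(parent_ctx, {"gen_ai.operation.name": "evaluation"})],
    ) as span:
        span.set_attribute("gen_ai.evaluation.name", "relevance")
        span.set_attribute("gen_ai.evaluation.score", score)
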
@@ -54,16 +119,16 @@ def evaluate(self, invocation: LLMInvocation) -> EvaluationResult:
 
 # Registry for easy lookup
 EVALUATORS = {
-    "deepevals": DeepEvalsEvaluator,
+    "deepeval": DeepEvalEvaluator,
     "openlit": OpenLitEvaluator,
 }
 
 
-def get_evaluator(name: str, config: dict = None) -> Evaluator:
+def get_evaluator(name: str, event_logger = None, tracer: Tracer = None, config: dict = None) -> Evaluator:
     """
     Factory: return an evaluator by name.
     """
     cls = EVALUATORS.get(name.lower())
     if not cls:
         raise ValueError(f"Unknown evaluator: {name}")
-    return cls(config)
+    return cls(event_logger, tracer, config)
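
With the widened factory signature, the event logger and tracer are forwarded straight into the evaluator's constructor, so a call site would look roughly like the following (the config shape and the invocation variable are illustrative; only get_evaluator's parameters come from the diff above):

from opentelemetry import trace, _events

evaluator = get_evaluator(
    "deepeval",
    event_logger=_events.get_event_logger(__name__),
    tracer=trace.get_tracer(__name__),
    config={},  # evaluator-specific settings; shape not defined by this commit
)
# invocation is an LLMInvocation produced by the instrumented LLM call.
evaluator.evaluate(invocation)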
