Skip to content

Commit 22fd071

Browse files
committed
fix(tracing): enforce span_kwargs precedence for tool spans
ToolSpan now filters observe_kwargs to model fields and drops any keys that would collide with explicit span_kwargs so reserved fields always win. - Rename tool span in component-level doc tests to avoid name collisions - Assert observe(name) overrides function name, and update_current_span overrides observe - Add checklist coverage for parent/child UUID relationships in nested spans - Add regression test to ensure @observe(type="tool", name=...) does not crash due to name collision
1 parent 90733ff commit 22fd071

File tree

2 files changed

+201
-20
lines changed

2 files changed

+201
-20
lines changed

deepeval/tracing/tracing.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,11 +1039,8 @@ def create_span_instance(self):
10391039

10401040
elif self.span_type == SpanType.TOOL.value:
10411041
kwargs = filter_model_kwargs(ToolSpan, dict(self.observe_kwargs))
1042-
1043-
# Only drop keys that would collide with explicit span_kwargs
1044-
collisions = set(kwargs).intersection(span_kwargs)
1045-
for k in collisions:
1046-
kwargs.pop(k, None)
1042+
# explicit span_kwargs always win over observe_kwargs
1043+
kwargs = {k: v for k, v in kwargs.items() if k not in span_kwargs}
10471044
return ToolSpan(**span_kwargs, **kwargs)
10481045
else:
10491046
return BaseSpan(**span_kwargs)

tests/test_docs/test_deepeval/test_llm_evals/test_component_level_evals.py

Lines changed: 199 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,6 @@
2424

2525

2626
# Apps
27-
28-
2927
def your_llm_app(input: str):
3028
@observe(type="retriever")
3129
def retriever(input: str):
@@ -349,7 +347,7 @@ def your_llm_app_with_tools_called(input: str):
349347
Docs: LLMTestCase supports tools_called and expected_tools fields.
350348
"""
351349

352-
@observe(type="tool", name="search_tool")
350+
@observe(type="tool", name="observe_search_tool")
353351
def search_tool(query: str):
354352
return "search result"
355353

@@ -755,12 +753,6 @@ def test_evals_iterator_emits_span_with_matching_input_per_golden():
755753
assert gen.get("input") == golden.input
756754

757755

758-
###############################
759-
# Additional coverage tests #
760-
###############################
761-
762-
763-
# Consider deleting this test
764756
def test_update_current_trace_sets_trace_level_test_case():
765757
"""
766758
Docs: update_current_trace can be used to set end-to-end test cases for the trace.
@@ -785,7 +777,6 @@ def test_update_current_trace_sets_trace_level_test_case():
785777
)
786778

787779

788-
# consider deleting this test: not asserting anything of value
789780
def test_update_current_span_with_individual_params():
790781
"""
791782
Docs: update_current_span can take individual LLMTestCase params
@@ -808,7 +799,6 @@ def test_update_current_span_with_individual_params():
808799
assert gen.get("output") == "MOCK_RESPONSE"
809800

810801

811-
# consider deleting this test: Low value
812802
def test_observe_name_parameter_customizes_span_name():
813803
"""
814804
Docs: The @observe decorator accepts a `name` parameter to customize
@@ -894,15 +884,49 @@ def test_llm_test_case_with_tools_called():
894884
assert gen is not None
895885

896886
# Also verify tool span was created
897-
tool_span = find_span_by_name(trace_dict, "search_tool")
887+
tool_span = find_span_by_name(trace_dict, "observe_search_tool")
898888
tool_names = span_names_by_key(trace_dict, "toolSpans")
899889
base_names = span_names_by_key(trace_dict, "baseSpans")
900890

901891
assert (
902892
tool_span is not None
903-
or "search_tool" in tool_names
904-
or "search_tool" in base_names
905-
), f"Expected search_tool span. Got: {debug_span_names(trace_dict)}"
893+
or "observe_search_tool" in tool_names
894+
or "observe_search_tool" in base_names
895+
), f"Expected observe_search_tool span. Got: {debug_span_names(trace_dict)}"
896+
897+
assert find_span_by_name(trace_dict, "search_tool") is None
898+
899+
900+
def test_update_current_span_name_overrides_observer_name():
901+
@observe(type="tool", name="observer_name")
902+
def tool_fn(x: str):
903+
update_current_span(name="update_name") # if supported in your API
904+
return "ok"
905+
906+
out = tool_fn("x")
907+
assert out == "ok"
908+
909+
trace_dict = get_latest_trace_dict()
910+
span = find_span_by_name(trace_dict, "update_name")
911+
assert span is not None, (
912+
"Expected update_current_span(name=...) to override @observe(name=...). "
913+
f"Got: {debug_span_names(trace_dict)}"
914+
)
915+
assert find_span_by_name(trace_dict, "observer_name") is None
916+
917+
918+
def test_update_current_span_output_not_overridden_by_observer_kwargs():
919+
@observe(name="tool_span", type="tool", output="SHOULD_NOT_WIN")
920+
def tool_fn(x: str):
921+
update_current_span(output="SHOULD_WIN")
922+
return "ok"
923+
924+
tool_fn("x")
925+
926+
trace_dict = get_latest_trace_dict()
927+
span = find_span_by_name(trace_dict, "tool_span")
928+
assert span is not None
929+
assert span.get("output") == "SHOULD_WIN"
906930

907931

908932
def test_golden_with_expected_output():
@@ -1068,3 +1092,163 @@ def agent_func(input: str):
10681092
"test_agent" in agent_names
10691093
or find_span_by_name(trace_dict, "test_agent") is not None
10701094
), f"Expected test_agent span. Got: {debug_span_names(trace_dict)}"
1095+
1096+
1097+
###############################################################
1098+
# Checklist: Nested execution contexts produce parent/child #
1099+
# span relationships with explicit UUID edges #
1100+
###############################################################
1101+
1102+
1103+
def test_nested_spans_parent_child_uuid_relationships():
1104+
"""
1105+
Checklist item 1: Nested execution contexts correctly produce parent and
1106+
child span relationships.
1107+
1108+
This test verifies:
1109+
- All spans have a uuid field
1110+
- Parent-child relationships are explicitly linked via parentUuid == parent.uuid
1111+
- The tree structure is: nested_app (root) -> outer_retriever -> inner_retriever
1112+
-> generator
1113+
"""
1114+
out = your_llm_app_nested_spans("How are you?")
1115+
assert out == "MOCK_RESPONSE"
1116+
1117+
trace_dict = get_latest_trace_dict()
1118+
assert trace_dict is not None
1119+
1120+
spans = all_spans(trace_dict)
1121+
1122+
def by_name(n: str):
1123+
return next((s for s in spans if s.get("name") == n), None)
1124+
1125+
app_span = by_name("nested_app")
1126+
outer_span = by_name("outer_retriever")
1127+
inner_span = by_name("inner_retriever")
1128+
gen_span = by_name("generator")
1129+
1130+
# All spans must exist
1131+
assert (
1132+
app_span is not None
1133+
), f"Missing nested_app. Available: {[s.get('name') for s in spans]}"
1134+
assert (
1135+
outer_span is not None
1136+
), f"Missing outer_retriever. Available: {[s.get('name') for s in spans]}"
1137+
assert (
1138+
inner_span is not None
1139+
), f"Missing inner_retriever. Available: {[s.get('name') for s in spans]}"
1140+
assert (
1141+
gen_span is not None
1142+
), f"Missing generator. Available: {[s.get('name') for s in spans]}"
1143+
1144+
# All spans must have a uuid
1145+
assert app_span.get("uuid"), "nested_app must have uuid"
1146+
assert outer_span.get("uuid"), "outer_retriever must have uuid"
1147+
assert inner_span.get("uuid"), "inner_retriever must have uuid"
1148+
assert gen_span.get("uuid"), "generator must have uuid"
1149+
1150+
# Verify explicit parent-child UUID relationships
1151+
# nested_app is root (parentUuid is None or missing)
1152+
assert (
1153+
app_span.get("parentUuid") is None
1154+
), f"nested_app should be root span with no parent, got parentUuid={app_span.get('parentUuid')}"
1155+
1156+
# outer_retriever.parentUuid == nested_app.uuid
1157+
assert outer_span.get("parentUuid") == app_span.get("uuid"), (
1158+
f"outer_retriever.parentUuid should equal nested_app.uuid. "
1159+
f"Got parentUuid={outer_span.get('parentUuid')}, expected={app_span.get('uuid')}"
1160+
)
1161+
1162+
# inner_retriever.parentUuid == outer_retriever.uuid
1163+
assert inner_span.get("parentUuid") == outer_span.get("uuid"), (
1164+
f"inner_retriever.parentUuid should equal outer_retriever.uuid. "
1165+
f"Got parentUuid={inner_span.get('parentUuid')}, expected={outer_span.get('uuid')}"
1166+
)
1167+
1168+
# generator.parentUuid == nested_app.uuid
1169+
assert gen_span.get("parentUuid") == app_span.get("uuid"), (
1170+
f"generator.parentUuid should equal nested_app.uuid. "
1171+
f"Got parentUuid={gen_span.get('parentUuid')}, expected={app_span.get('uuid')}"
1172+
)
1173+
1174+
1175+
###############################################################
1176+
# Checklist: Component-level outputs convert to serialized #
1177+
# structure (TraceApi) with expected keys #
1178+
###############################################################
1179+
1180+
1181+
def test_trace_serialization_contains_expected_top_level_keys():
1182+
"""
1183+
Checklist item 2: Component-level outputs can be converted into a test run
1184+
or serialized structure with the expected keys.
1185+
1186+
Verifies the trace_dict (TraceApi serialized output) contains:
1187+
- Top-level keys: uuid, startTime, endTime, status
1188+
- Typed span bucket keys: baseSpans, agentSpans, llmSpans, retrieverSpans, toolSpans
1189+
- Each bucket is a list
1190+
- Spans in buckets have required keys: uuid, name, status, startTime, endTime
1191+
"""
1192+
out = your_llm_app_rooted("How are you?")
1193+
assert out == "MOCK_RESPONSE"
1194+
1195+
trace_dict = get_latest_trace_dict()
1196+
assert trace_dict is not None
1197+
1198+
# Top-level trace keys
1199+
assert "uuid" in trace_dict, "Trace must have 'uuid' key"
1200+
assert "startTime" in trace_dict, "Trace must have 'startTime' key"
1201+
assert "endTime" in trace_dict, "Trace must have 'endTime' key"
1202+
assert "status" in trace_dict, "Trace must have 'status' key"
1203+
1204+
# Typed span bucket keys must exist as lists
1205+
expected_buckets = [
1206+
"baseSpans",
1207+
"agentSpans",
1208+
"llmSpans",
1209+
"retrieverSpans",
1210+
"toolSpans",
1211+
]
1212+
for bucket in expected_buckets:
1213+
assert bucket in trace_dict, f"Trace must have '{bucket}' key"
1214+
assert isinstance(
1215+
trace_dict[bucket], list
1216+
), f"'{bucket}' must be a list"
1217+
1218+
# Verify spans have required per-span keys
1219+
required_span_keys = {"uuid", "name", "status", "startTime", "endTime"}
1220+
for bucket in expected_buckets:
1221+
for span in trace_dict[bucket]:
1222+
missing = required_span_keys - set(span.keys())
1223+
assert (
1224+
not missing
1225+
), f"Span '{span.get('name')}' in {bucket} missing keys: {missing}"
1226+
1227+
1228+
###############################################################
1229+
# Regression: ToolSpan kwargs collision fix #
1230+
###############################################################
1231+
1232+
1233+
def test_observe_tool_with_name_kwarg_does_not_crash():
1234+
"""
1235+
Regression test: @observe(type="tool", name="...") previously crashed due to
1236+
kwargs collision when 'name' was passed both in observe_kwargs and span_kwargs.
1237+
The fix filters observe_kwargs to ToolSpan model fields and drops colliding keys.
1238+
"""
1239+
1240+
@observe(type="tool", name="my_named_tool")
1241+
def named_tool_func(arg: str) -> str:
1242+
return f"tool output: {arg}"
1243+
1244+
result = named_tool_func("test_input")
1245+
assert result == "tool output: test_input"
1246+
1247+
trace_dict = get_latest_trace_dict()
1248+
assert trace_dict is not None
1249+
1250+
# The tool span should be in toolSpans with the custom name
1251+
tool_names = span_names_by_key(trace_dict, "toolSpans")
1252+
assert (
1253+
"my_named_tool" in tool_names
1254+
), f"Expected 'my_named_tool' in toolSpans. Got: {tool_names}"

0 commit comments

Comments
 (0)