@@ -116,11 +116,11 @@ def fetch_traces_for_evaluation(
116116) -> list[dict ]:
117117 """
118118 Fetch traces from Langfuse for evaluation.
119-
119+
120120 Args:
121121 limit: Maximum number of traces to fetch
122122 tags: Optional tag filter
123-
123+
124124 Returns:
125125 List of trace dictionaries
126126 """
@@ -129,7 +129,7 @@ def fetch_traces_for_evaluation(
129129 limit = limit,
130130 tags = tags,
131131 )
132-
132+
133133 result = []
134134 for trace in response.data:
135135 # Only include traces with input and output
@@ -140,13 +140,13 @@ def fetch_traces_for_evaluation(
140140 " output" : trace.output,
141141 " metadata" : trace.metadata or {},
142142 }
143-
143+
144144 # Add expected output if available in metadata
145145 if trace.metadata and " expected" in trace.metadata:
146146 trace_dict[" expected" ] = trace.metadata[" expected" ]
147-
147+
148148 result.append(trace_dict)
149-
149+
150150 return result
151151```
152152
@@ -164,23 +164,23 @@ def fetch_recent_traces(
164164) -> list[dict ]:
165165 """
166166 Fetch traces from the last N hours.
167-
167+
168168 Args:
169169 hours_back: Number of hours to look back
170170 limit: Maximum number of traces to fetch
171171 tags: Optional tag filter
172-
172+
173173 Returns:
174174 List of trace dictionaries
175175 """
176176 from_timestamp = datetime.now() - timedelta(hours = hours_back)
177-
177+
178178 response = langfuse.api.trace.list(
179179 limit = limit,
180180 tags = tags,
181181 from_timestamp = from_timestamp,
182182 )
183-
183+
184184 result = []
185185 for trace in response.data:
186186 if trace.input and trace.output:
@@ -190,7 +190,7 @@ def fetch_recent_traces(
190190 " output" : trace.output,
191191 " metadata" : trace.metadata or {},
192192 })
193-
193+
194194 return result
195195```
196196
@@ -225,14 +225,14 @@ from openjudge.graders.schema import GraderScore, GraderError
225225
226226async def evaluate_single_trace ():
227227 """ Evaluate traces using a single grader"""
228-
228+
229229 # Initialize model and grader
230230 model = OpenAIChatModel(model = " qwen3-32b" )
231231 grader = RelevanceGrader(model = model)
232-
232+
233233 # Fetch traces
234234 traces = fetch_traces_for_evaluation(limit = 10 )
235-
235+
236236 for trace in traces:
237237 try :
238238 # Run evaluation
@@ -241,7 +241,7 @@ async def evaluate_single_trace():
241241 query = trace[" input" ],
242242 response = trace[" output" ],
243243 )
244-
244+
245245 # Process result and send to Langfuse
246246 if isinstance (result, GraderScore):
247247 langfuse.create_score(
@@ -253,10 +253,10 @@ async def evaluate_single_trace():
253253 print (f " ✓ Trace { trace[' id' ][:8 ]} ... scored: { result.score} " )
254254 elif isinstance (result, GraderError):
255255 print (f " ✗ Trace { trace[' id' ][:8 ]} ... error: { result.error} " )
256-
256+
257257 except Exception as e:
258258 print (f " ✗ Error evaluating trace { trace[' id' ]} : { e} " )
259-
259+
260260 # Ensure all scores are sent
261261 langfuse.flush()
262262
@@ -280,10 +280,10 @@ from openjudge.runner.aggregator.weighted_sum_aggregator import WeightedSumAggre
280280
281281async def batch_evaluate_traces ():
282282 """ Batch evaluate traces using GradingRunner"""
283-
283+
284284 # Initialize model
285285 model = OpenAIChatModel(model = " qwen3-32b" )
286-
286+
287287 # Configure multiple graders with field mappers
288288 # Map trace fields to grader expected parameters
289289 runner = GradingRunner(
@@ -310,18 +310,18 @@ async def batch_evaluate_traces():
310310 )
311311 ],
312312 )
313-
313+
314314 # Fetch traces
315315 traces = fetch_traces_for_evaluation(limit = 50 )
316-
316+
317317 if not traces:
318318 print (" No traces to evaluate" )
319319 return
320-
320+
321321 # Prepare evaluation data
322322 evaluation_data = []
323323 trace_id_mapping = {} # Map index to trace_id
324-
324+
325325 for i, trace in enumerate (traces):
326326 eval_item = {
327327 " input" : trace[" input" ],
@@ -330,14 +330,14 @@ async def batch_evaluate_traces():
330330 # Add expected output as reference if available
331331 if trace.get(" expected" ):
332332 eval_item[" expected" ] = trace[" expected" ]
333-
333+
334334 evaluation_data.append(eval_item)
335335 trace_id_mapping[i] = trace[" id" ]
336-
336+
337337 # Run batch evaluation
338338 try :
339339 results = await runner.arun(evaluation_data)
340-
340+
341341 # Send results back to Langfuse
342342 # results contains individual grader scores + aggregated "overall_quality" score
343343 scores_sent = 0
@@ -347,18 +347,18 @@ async def batch_evaluate_traces():
347347 print (f " Sending { grader_name} score for trace { trace_id} " )
348348 send_result_to_langfuse(trace_id, grader_name, result)
349349 scores_sent += 1
350-
351-
350+
351+
352352 print (f " ✓ Successfully sent { scores_sent} scores for { len (traces)} traces " )
353-
353+
354354 except Exception as e:
355355 print (f " ✗ Batch evaluation failed: { e} " )
356-
356+
357357 # Ensure all scores are sent
358358 langfuse.flush()
359359def send_result_to_langfuse (trace_id : str , grader_name : str , result ) -> None :
360360 """ Send evaluation result to Langfuse"""
361-
361+
362362 if isinstance (result, GraderScore):
363363 langfuse.create_score(
364364 trace_id = trace_id,
0 commit comments