Use correct resultReason method when fetching eval reason (#2089)

learningbizz · web-flow · commit 444ed2d1e76a · 2026-01-07T17:05:39.000+01:00
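Previously, the reason was extracted with a quick fix that read it directly from the evaluation result's metadata. This change replaces that with the proper approach: looking up the evaluation's metric specification and calling its resultReason method.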
diff --git a/.gitignore b/.gitignore
@@ -94,6 +94,8 @@ __testing__
 # New spans
 apps/web/ingest
 apps/workers/workspaces/*/traces
+provider-logs/*
+workspaces/*
 
 # Helm chart secrets (do not commit)
 charts/latitude/values.secrets.yaml
diff --git a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts
@@ -221,6 +221,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -256,6 +257,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -283,6 +285,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -307,6 +310,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -337,6 +341,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -366,6 +371,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(false)
@@ -379,6 +385,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -426,6 +433,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
       workspace,
       commit,
       issue,
+      existingEvaluations: [evaluation],
     })
 
     expect(result.ok).toBe(true)
@@ -535,6 +543,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
         workspace,
         commit: commit2,
         issue,
+        existingEvaluations: [evaluation],
       })
 
       expect(result.ok).toBe(true)
@@ -642,6 +651,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
         workspace,
         commit: draftCommit,
         issue,
+        existingEvaluations: [evaluation],
       })
 
       expect(result.ok).toBe(true)
@@ -733,6 +743,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => {
         workspace,
         commit,
         issue,
+        existingEvaluations: [evaluation],
       })
 
       expect(result.ok).toBe(true)
diff --git a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts
@@ -5,13 +5,18 @@ import {
   EvaluationResultsV2Repository,
 } from '../../repositories'
 import { Result } from '../../lib/Result'
-import { EvaluationResultV2 } from '../../constants'
+import {
+  EvaluationResultSuccessValue,
+  EvaluationResultV2,
+  EvaluationV2,
+} from '../../constants'
 import { Issue } from '../../schema/models/types/Issue'
 import { Message as LegacyMessage } from '@latitude-data/constants/legacyCompiler'
 import { assembleTraceWithMessages } from '../../services/tracing/traces/assemble'
 import { adaptCompletionSpanMessagesToLegacy } from '../../services/tracing/spans/fetching/findCompletionSpanFromTrace'
 import { UnprocessableEntityError } from '../../lib/errors'
 import { getHITLSpansByIssue } from './getHITLSpansByIssue'
+import { getEvaluationMetricSpecification } from '../../services/evaluationsV2/specifications'
 
 export type SpanMessagesWithEvalResultReason = {
   messages: LegacyMessage[]
@@ -29,10 +34,12 @@ export async function getSpanMessagesAndEvaluationResultsByIssue({
   workspace,
   commit,
   issue,
+  existingEvaluations,
 }: {
   workspace: Workspace
   commit: Commit
   issue: Issue
+  existingEvaluations: EvaluationV2[]
 }) {
   // Three is enough, as we don't want to overfit or add too many tokens to the prompt
   const spansResult = await getHITLSpansByIssue({
@@ -79,30 +86,37 @@ export async function getSpanMessagesAndEvaluationResultsByIssue({
     }
 
     // There will always be exactly one evaluation result for a span and trace id
-    const evaluationResults = evaluationResultsResult.filter(
+    const evaluationResult = evaluationResultsResult.find(
       (result) =>
         result.evaluatedSpanId === span.id &&
         result.evaluatedTraceId === span.traceId,
-    )[0]
+    )
+
+    const evaluation = evaluationResult
+      ? existingEvaluations.find(
+          (e) => e.uuid === evaluationResult.evaluationUuid,
+        )
+      : undefined
 
     messagesAndEvaluationResults.push({
       messages: adaptCompletionSpanMessagesToLegacy(completionSpan),
-      reason: getReasonFromEvaluationResult(evaluationResults),
+      reason: getReasonFromEvaluationResult(evaluationResult, evaluation),
     })
   }
 
   return Result.ok(messagesAndEvaluationResults)
 }
 
-// We need an efficient way of extracting reasons directly from metadata without fetching evaluations
-function getReasonFromEvaluationResult(result: EvaluationResultV2) {
-  if (result.error || !result.metadata) {
+function getReasonFromEvaluationResult(
+  result: EvaluationResultV2 | undefined,
+  evaluation: EvaluationV2 | undefined,
+): string {
+  if (!result || !evaluation || result.error) {
     return ''
   }
-
-  // LLM, Rule, and Human evaluations all have a reason field (required for LLM, optional for Rule/Human)
-  if ('reason' in result.metadata && result.metadata.reason) {
-    return result.metadata.reason ?? ''
-  }
-  return ''
+  const specification = getEvaluationMetricSpecification(evaluation)
+  const resultReason = specification.resultReason as (
+    result: EvaluationResultSuccessValue,
+  ) => string | undefined
+  return resultReason(result) ?? ''
 }
diff --git a/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts b/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts
@@ -99,24 +99,28 @@ export async function generateEvaluationConfigFromIssueWithCopilot(
   const copilot = copilotResult.unwrap()
 
   // Get the existing evaluation names for the same commit and document to avoid generating evals with the same name (unique key)
-  const existingEvaluationNamesResult = await getExistingEvaluationNames({
-    workspace: workspace,
-    commit: commit,
-    issue: issue,
-  })
-
-  if (!Result.isOk(existingEvaluationNamesResult)) {
-    return existingEvaluationNamesResult
+  const evaluationsRepository = new EvaluationsV2Repository(workspace.id)
+  const evaluationsFromSameCommitAndDocumentResult =
+    await evaluationsRepository.listAtCommitByDocument({
+      projectId: commit.projectId,
+      commitUuid: commit.uuid,
+      documentUuid: issue.documentUuid,
+    })
+  if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) {
+    return evaluationsFromSameCommitAndDocumentResult
   }
+  const existingEvaluations =
+    evaluationsFromSameCommitAndDocumentResult.unwrap()
 
-  const existingEvaluationNames = existingEvaluationNamesResult.unwrap()
+  const existingEvaluationNames = existingEvaluations.map((e) => e.name)
 
   // Getting failed examples (evaluation results with the issue attached) to feed the copilot
   const messagesAndReasonWhyFailedForIssueResult =
     await getSpanMessagesAndEvaluationResultsByIssue({
       workspace: workspace,
       commit: commit,
       issue: issue,
+      existingEvaluations,
     })
 
   if (!Result.isOk(messagesAndReasonWhyFailedForIssueResult)) {
@@ -197,31 +201,6 @@ export async function generateEvaluationConfigFromIssueWithCopilot(
   return Result.ok(evaluationConfigWithProviderAndModel)
 }
 
-async function getExistingEvaluationNames({
-  workspace,
-  commit,
-  issue,
-}: {
-  workspace: Workspace
-  commit: Commit
-  issue: Issue
-}) {
-  const evaluationsRepository = new EvaluationsV2Repository(workspace.id)
-  const evaluationsFromSameCommitAndDocumentResult =
-    await evaluationsRepository.listAtCommitByDocument({
-      projectId: commit.projectId,
-      commitUuid: commit.uuid,
-      documentUuid: issue.documentUuid,
-    })
-  if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) {
-    return evaluationsFromSameCommitAndDocumentResult
-  }
-  const existingEvaluations =
-    evaluationsFromSameCommitAndDocumentResult.unwrap()
-
-  return Result.ok(existingEvaluations.map((e) => e.name))
-}
-
 export async function getSpansFromSpanAndTraceIdPairs({
   spanAndTraceIdPairs,
   workspace,