2 changes: 2 additions & 0 deletions .gitignore
@@ -94,6 +94,8 @@ __testing__
# New spans
apps/web/ingest
apps/workers/workspaces/*/traces
provider-logs/*
Contributor (author) commented:
I might have gotten the "missing completion metadata" issue because I didn't have the FILES_STORAGE_PATH, FILE_PUBLIC_PATH, and PUBLIC_FILES_STORAGE_PATH env vars set. Adding them generated lots of new files in these folders, so I'm adding the folders to the .gitignore.
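For anyone who hits the same "missing completion metadata" error locally, this is the kind of env config I mean. A minimal sketch — every value below is a hypothetical placeholder, not a documented default:

# .env — hypothetical placeholder values; adjust to your local setup
FILES_STORAGE_PATH=/tmp/latitude/files
FILE_PUBLIC_PATH=files
PUBLIC_FILES_STORAGE_PATH=/tmp/latitude/public-files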

workspaces/*

# Helm chart secrets (do not commit)
charts/latitude/values.secrets.yaml
packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts
@@ -122,8 +122,8 @@
provider: 'openai',
model: 'gpt-4o',
configuration: {},
input: input as any,
output: output as any,
}

const disk = diskFactory('private')
@@ -167,7 +167,7 @@
span: {
...span,
type: SpanType.Prompt,
} as any,
hasPassed,
...(error ? { error } : {}),
...(metadata
@@ -178,10 +178,10 @@
expectedOutput: metadata.expectedOutput ?? 'expected output',
datasetLabel: metadata.datasetLabel ?? 'default',
reason: metadata.reason,
} as any,
}
: {}),
} as any)

await createIssueEvaluationResult({
workspace,
@@ -221,6 +221,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -256,6 +257,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -283,6 +285,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -307,6 +310,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -327,7 +331,7 @@
})

await createEvaluationResultForIssue({
span: httpSpan as any,
metadata: {
reason: 'This should be skipped',
},
@@ -337,6 +341,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -366,6 +371,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(false)
@@ -379,6 +385,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -426,6 +433,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -535,6 +543,7 @@
workspace,
commit: commit2,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -642,6 +651,7 @@
workspace,
commit: draftCommit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -713,14 +723,14 @@
span: {
...span2,
type: SpanType.Prompt,
} as any,
hasPassed: false,
metadata: {
configuration: llmEvaluation.configuration,
actualOutput: 'actual output',
expectedOutput: 'expected output',
reason: 'LLM evaluation reason',
} as any,
} as any)

await createIssueEvaluationResult({
Expand All @@ -733,6 +743,7 @@
workspace,
commit,
issue,
existingEvaluations: [evaluation],
})

expect(result.ok).toBe(true)
@@ -5,13 +5,18 @@ import {
EvaluationResultsV2Repository,
} from '../../repositories'
import { Result } from '../../lib/Result'
import { EvaluationResultV2 } from '../../constants'
import {
EvaluationResultSuccessValue,
EvaluationResultV2,
EvaluationV2,
} from '../../constants'
import { Issue } from '../../schema/models/types/Issue'
import { Message as LegacyMessage } from '@latitude-data/constants/legacyCompiler'
import { assembleTraceWithMessages } from '../../services/tracing/traces/assemble'
import { adaptCompletionSpanMessagesToLegacy } from '../../services/tracing/spans/fetching/findCompletionSpanFromTrace'
import { UnprocessableEntityError } from '../../lib/errors'
import { getHITLSpansByIssue } from './getHITLSpansByIssue'
import { getEvaluationMetricSpecification } from '../../services/evaluationsV2/specifications'

export type SpanMessagesWithEvalResultReason = {
messages: LegacyMessage[]
@@ -29,10 +34,12 @@ export async function getSpanMessagesAndEvaluationResultsByIssue({
workspace,
commit,
issue,
existingEvaluations,
}: {
workspace: Workspace
commit: Commit
issue: Issue
existingEvaluations: EvaluationV2[]
}) {
// Three is enough, as we don't want to overfit or add too many tokens to the prompt
const spansResult = await getHITLSpansByIssue({
@@ -79,30 +86,37 @@
}

// There will always be exactly one evaluation result for a span and trace id
const evaluationResults = evaluationResultsResult.filter(
const evaluationResult = evaluationResultsResult.find(
(result) =>
result.evaluatedSpanId === span.id &&
result.evaluatedTraceId === span.traceId,
)[0]
)

const evaluation = evaluationResult
? existingEvaluations.find(
(e) => e.uuid === evaluationResult.evaluationUuid,
)
: undefined

messagesAndEvaluationResults.push({
messages: adaptCompletionSpanMessagesToLegacy(completionSpan),
reason: getReasonFromEvaluationResult(evaluationResults),
reason: getReasonFromEvaluationResult(evaluationResult, evaluation),
})
}

return Result.ok(messagesAndEvaluationResults)
}

// We need an efficient way of extracting reasons directly from metadata without fetching evaluations
function getReasonFromEvaluationResult(result: EvaluationResultV2) {
if (result.error || !result.metadata) {
function getReasonFromEvaluationResult(
result: EvaluationResultV2 | undefined,
evaluation: EvaluationV2 | undefined,
): string {
if (!result || !evaluation || result.error) {
return ''
}

// LLM, Rule, and Human evaluations all have a reason field (required for LLM, optional for Rule/Human)
if ('reason' in result.metadata && result.metadata.reason) {
return result.metadata.reason ?? ''
}
return ''
const specification = getEvaluationMetricSpecification(evaluation)
const resultReason = specification.resultReason as (
result: EvaluationResultSuccessValue,
) => string | undefined
return resultReason(result) ?? ''
Contributor commented on lines +118 to +121:
Suggested change
const resultReason = specification.resultReason as (
result: EvaluationResultSuccessValue,
) => string | undefined
return resultReason(result) ?? ''
const resultReason = specification.resultReason(result as EvaluationResultSuccessValue)
return resultReason ?? ''

Contributor (author) replied:
TS type complaint with this:

rule.ts(78, 5): 'pattern' is declared here.

}
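To make the type complaint concrete, here is a minimal standalone sketch — the types are made-up stand-ins, not the real metric specifications. With a union of specifications, TypeScript intersects the parameter types of resultReason, so a plain success value is rejected with an error pointing at a variant-specific property such as pattern; the explicit cast in the diff widens the signature to work around that.

// Made-up stand-in types; not the real evaluation metric specifications.
type RuleResult = { score: number; pattern: string }
type LlmResult = { score: number; reason: string }

type RuleSpec = { resultReason: (r: RuleResult) => string | undefined }
type LlmSpec = { resultReason: (r: LlmResult) => string | undefined }

const ruleSpec: RuleSpec = { resultReason: (r) => `matched ${r.pattern}` }
const llmSpec: LlmSpec = { resultReason: (r) => r.reason }

// Which specification applies is only known at runtime.
const spec: RuleSpec | LlmSpec = Math.random() > 0.5 ? ruleSpec : llmSpec
const result = { score: 1 }

// spec.resultReason(result)
// ^ Error: the union's call signature expects RuleResult & LlmResult,
//   so TypeScript reports that 'pattern' is missing ("'pattern' is declared here").

// The workaround used above: widen the method's signature explicitly.
const resultReason = spec.resultReason as (r: { score: number }) => string | undefined
console.log(resultReason(result) ?? '')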
@@ -99,24 +99,28 @@ export async function generateEvaluationConfigFromIssueWithCopilot(
const copilot = copilotResult.unwrap()

// Get the existing evaluation names for the same commit and document to avoid generating evals with the same name (unique key)
const existingEvaluationNamesResult = await getExistingEvaluationNames({
workspace: workspace,
commit: commit,
issue: issue,
})

if (!Result.isOk(existingEvaluationNamesResult)) {
return existingEvaluationNamesResult
const evaluationsRepository = new EvaluationsV2Repository(workspace.id)
const evaluationsFromSameCommitAndDocumentResult =
await evaluationsRepository.listAtCommitByDocument({
projectId: commit.projectId,
commitUuid: commit.uuid,
documentUuid: issue.documentUuid,
})
if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) {
return evaluationsFromSameCommitAndDocumentResult
}
const existingEvaluations =
evaluationsFromSameCommitAndDocumentResult.unwrap()

const existingEvaluationNames = existingEvaluationNamesResult.unwrap()
const existingEvaluationNames = existingEvaluations.map((e) => e.name)

// Getting failed examples (evaluation results with the issue attached) to feed the copilot
const messagesAndReasonWhyFailedForIssueResult =
await getSpanMessagesAndEvaluationResultsByIssue({
workspace: workspace,
commit: commit,
issue: issue,
existingEvaluations,
})

if (!Result.isOk(messagesAndReasonWhyFailedForIssueResult)) {
@@ -197,31 +201,6 @@
return Result.ok(evaluationConfigWithProviderAndModel)
}

async function getExistingEvaluationNames({
workspace,
commit,
issue,
}: {
workspace: Workspace
commit: Commit
issue: Issue
}) {
const evaluationsRepository = new EvaluationsV2Repository(workspace.id)
const evaluationsFromSameCommitAndDocumentResult =
await evaluationsRepository.listAtCommitByDocument({
projectId: commit.projectId,
commitUuid: commit.uuid,
documentUuid: issue.documentUuid,
})
if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) {
return evaluationsFromSameCommitAndDocumentResult
}
const existingEvaluations =
evaluationsFromSameCommitAndDocumentResult.unwrap()

return Result.ok(existingEvaluations.map((e) => e.name))
}

export async function getSpansFromSpanAndTraceIdPairs({
spanAndTraceIdPairs,
workspace,
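Net effect of this file's change: the document's evaluations are now fetched once and reused, both for the name-uniqueness check and as the new existingEvaluations input to getSpanMessagesAndEvaluationResultsByIssue, whose reason extraction can then consult each evaluation's metric specification without a second repository query. A condensed sketch of the flow — the wrapper function name is mine, import paths are approximate, and Result error handling is elided:

// Condensed sketch only; names and paths outside the diff are assumptions.
import { EvaluationsV2Repository } from '../../repositories'
import { getSpanMessagesAndEvaluationResultsByIssue } from '../../data-access/issues/getSpanMessagesAndEvaluationResultsByIssue'
import type { Issue } from '../../schema/models/types/Issue'
import type { Commit, Workspace } from '../../schema/types' // approximate path

async function collectCopilotContext(workspace: Workspace, commit: Commit, issue: Issue) {
  // Fetch the document's evaluations once.
  const repo = new EvaluationsV2Repository(workspace.id)
  const existingEvaluations = (
    await repo.listAtCommitByDocument({
      projectId: commit.projectId,
      commitUuid: commit.uuid,
      documentUuid: issue.documentUuid,
    })
  ).unwrap()

  // Derive the names already taken (unique-name check for generated evals).
  const existingEvaluationNames = existingEvaluations.map((e) => e.name)

  // Thread the same list into the span/reason helper instead of refetching.
  const failedExamples = (
    await getSpanMessagesAndEvaluationResultsByIssue({
      workspace,
      commit,
      issue,
      existingEvaluations,
    })
  ).unwrap()

  return { existingEvaluationNames, failedExamples }
}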