diff --git a/.gitignore b/.gitignore index e0f6eeaee1..3d23bc50b1 100644 --- a/.gitignore +++ b/.gitignore @@ -94,6 +94,8 @@ __testing__ # New spans apps/web/ingest apps/workers/workspaces/*/traces +provider-logs/* +workspaces/* # Helm chart secrets (do not commit) charts/latitude/values.secrets.yaml diff --git a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts index 9bcbeafab2..36a4cffee8 100644 --- a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts +++ b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.test.ts @@ -221,6 +221,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -256,6 +257,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -283,6 +285,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -307,6 +310,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -337,6 +341,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -366,6 +371,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(false) @@ -379,6 +385,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -426,6 +433,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -535,6 +543,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit: commit2, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -642,6 +651,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit: draftCommit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) @@ -733,6 +743,7 @@ describe('getSpanMessagesAndEvaluationResultsByIssue', () => { workspace, commit, issue, + existingEvaluations: [evaluation], }) expect(result.ok).toBe(true) diff --git a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts index 22ed670a17..898fdde4b2 100644 --- a/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts +++ b/packages/core/src/data-access/issues/getSpanMessagesAndEvaluationResultsByIssue.ts @@ -5,13 +5,18 @@ import { EvaluationResultsV2Repository, } from '../../repositories' import { Result } from '../../lib/Result' -import { EvaluationResultV2 } from '../../constants' +import { + EvaluationResultSuccessValue, + EvaluationResultV2, + EvaluationV2, +} from '../../constants' import { Issue } from '../../schema/models/types/Issue' import { Message as LegacyMessage } from '@latitude-data/constants/legacyCompiler' import { assembleTraceWithMessages } from '../../services/tracing/traces/assemble' import { adaptCompletionSpanMessagesToLegacy } from '../../services/tracing/spans/fetching/findCompletionSpanFromTrace' import { UnprocessableEntityError } from '../../lib/errors' import { getHITLSpansByIssue } from './getHITLSpansByIssue' +import { getEvaluationMetricSpecification } from '../../services/evaluationsV2/specifications' export type SpanMessagesWithEvalResultReason = { messages: LegacyMessage[] @@ -29,10 +34,12 @@ export async function getSpanMessagesAndEvaluationResultsByIssue({ workspace, commit, issue, + existingEvaluations, }: { workspace: Workspace commit: Commit issue: Issue + existingEvaluations: EvaluationV2[] }) { // Three is enough, as we don't want to overfit or add too many tokens to the prompt const spansResult = await getHITLSpansByIssue({ @@ -79,30 +86,37 @@ export async function getSpanMessagesAndEvaluationResultsByIssue({ } // There will always be exactly one evaluation result for a span and trace id - const evaluationResults = evaluationResultsResult.filter( + const evaluationResult = evaluationResultsResult.find( (result) => result.evaluatedSpanId === span.id && result.evaluatedTraceId === span.traceId, - )[0] + ) + + const evaluation = evaluationResult + ? existingEvaluations.find( + (e) => e.uuid === evaluationResult.evaluationUuid, + ) + : undefined messagesAndEvaluationResults.push({ messages: adaptCompletionSpanMessagesToLegacy(completionSpan), - reason: getReasonFromEvaluationResult(evaluationResults), + reason: getReasonFromEvaluationResult(evaluationResult, evaluation), }) } return Result.ok(messagesAndEvaluationResults) } -// We need an efficient way of extracting reasons directly from metadata without fetching evaluations -function getReasonFromEvaluationResult(result: EvaluationResultV2) { - if (result.error || !result.metadata) { +function getReasonFromEvaluationResult( + result: EvaluationResultV2 | undefined, + evaluation: EvaluationV2 | undefined, +): string { + if (!result || !evaluation || result.error) { return '' } - - // LLM, Rule, and Human evaluations all have a reason field (required for LLM, optional for Rule/Human) - if ('reason' in result.metadata && result.metadata.reason) { - return result.metadata.reason ?? '' - } - return '' + const specification = getEvaluationMetricSpecification(evaluation) + const resultReason = specification.resultReason as ( + result: EvaluationResultSuccessValue, + ) => string | undefined + return resultReason(result) ?? '' } diff --git a/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts b/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts index 9700a917ec..7ce49b39d1 100644 --- a/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts +++ b/packages/core/src/services/evaluationsV2/generateFromIssue/generateFromIssue.ts @@ -99,17 +99,20 @@ export async function generateEvaluationConfigFromIssueWithCopilot( const copilot = copilotResult.unwrap() // Get the existing evaluation names for the same commit and document to avoid generating evals with the same name (unique key) - const existingEvaluationNamesResult = await getExistingEvaluationNames({ - workspace: workspace, - commit: commit, - issue: issue, - }) - - if (!Result.isOk(existingEvaluationNamesResult)) { - return existingEvaluationNamesResult + const evaluationsRepository = new EvaluationsV2Repository(workspace.id) + const evaluationsFromSameCommitAndDocumentResult = + await evaluationsRepository.listAtCommitByDocument({ + projectId: commit.projectId, + commitUuid: commit.uuid, + documentUuid: issue.documentUuid, + }) + if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) { + return evaluationsFromSameCommitAndDocumentResult } + const existingEvaluations = + evaluationsFromSameCommitAndDocumentResult.unwrap() - const existingEvaluationNames = existingEvaluationNamesResult.unwrap() + const existingEvaluationNames = existingEvaluations.map((e) => e.name) // Getting failed examples (evaluation results with the issue attached) to feed the copilot const messagesAndReasonWhyFailedForIssueResult = @@ -117,6 +120,7 @@ export async function generateEvaluationConfigFromIssueWithCopilot( workspace: workspace, commit: commit, issue: issue, + existingEvaluations, }) if (!Result.isOk(messagesAndReasonWhyFailedForIssueResult)) { @@ -197,31 +201,6 @@ export async function generateEvaluationConfigFromIssueWithCopilot( return Result.ok(evaluationConfigWithProviderAndModel) } -async function getExistingEvaluationNames({ - workspace, - commit, - issue, -}: { - workspace: Workspace - commit: Commit - issue: Issue -}) { - const evaluationsRepository = new EvaluationsV2Repository(workspace.id) - const evaluationsFromSameCommitAndDocumentResult = - await evaluationsRepository.listAtCommitByDocument({ - projectId: commit.projectId, - commitUuid: commit.uuid, - documentUuid: issue.documentUuid, - }) - if (!Result.isOk(evaluationsFromSameCommitAndDocumentResult)) { - return evaluationsFromSameCommitAndDocumentResult - } - const existingEvaluations = - evaluationsFromSameCommitAndDocumentResult.unwrap() - - return Result.ok(existingEvaluations.map((e) => e.name)) -} - export async function getSpansFromSpanAndTraceIdPairs({ spanAndTraceIdPairs, workspace,