
Commit c3b345e

Factuality and BinaryNdcgAtK
1 parent 9bb69f2 commit c3b345e

6 files changed: 253 additions, 20 deletions


packages/compass-assistant/test/assistant.eval.ts

Lines changed: 63 additions & 20 deletions
@@ -1,27 +1,41 @@
 /* eslint-disable no-console */
+import { createOpenAI } from '@ai-sdk/openai';
+import { streamText } from 'ai';
+import { init, Factuality as _Factuality } from 'autoevals';
 import { Eval } from 'braintrust';
 import type { EvalCase, EvalScorer } from 'braintrust';
-import { Levenshtein } from 'autoevals';
-import { streamText } from 'ai';
-import { createOpenAI } from '@ai-sdk/openai';
+import { OpenAI } from 'openai';
 import { evalCases } from './eval-cases';
+import { fuzzyLinkMatch } from './fuzzylinkmatch';
+import { binaryNdcgAtK } from './binaryndcgatk';
+
+const client = new OpenAI({
+  baseURL: 'https://api.braintrust.dev/v1/proxy',
+  apiKey: process.env.BRAINTRUST_API_KEY,
+});
+
+init({ client });

 export type SimpleEvalCase = {
   name?: string;
   input: string;
   expected: string;
+  expectedSources?: string[];
 };

 type Message = {
   text: string;
 };
+type InputMessage = Message;
+type OutputMessage = Message & { sources: string[] };
+type ExpectedMessage = OutputMessage;

 type ConversationEvalCaseInput = {
-  messages: Message[];
+  messages: InputMessage[];
 };

 type ConversationEvalCaseExpected = {
-  messages: Message[];
+  messages: OutputMessage[];
 };

 type ConversationEvalCase = EvalCase<
@@ -36,7 +50,7 @@ type ConversationTaskOutput = {
   // again this could also be an array of messages and each message could be an
   // object for future-proofing. But we're probably just going to be taking the
   // result from the chatbot as a block of text for test purposes
-  messages: Message[];
+  messages: ExpectedMessage[];
 };

 type ConversationEvalScorer = EvalScorer<
@@ -57,7 +71,7 @@ function makeEvalCases(): ConversationEvalCase[] {
         messages: [{ text: c.input }],
       },
       expected: {
-        messages: [{ text: c.expected }],
+        messages: [{ text: c.expected, sources: c.expectedSources || [] }],
       },
       metadata: {},
     };
@@ -78,6 +92,7 @@ async function makeAssistantCall(

   const result = streamText({
     model: openai.responses('mongodb-chat-latest'),
+    temperature: 0,
     prompt,
   });

@@ -90,19 +105,50 @@ async function makeAssistantCall(
     }
   }
   const text = chunks.join('');
+
+  // TODO: something up with this type
+  const resolvedSources = (await result.sources) as { url: string }[];
+
+  const sources = resolvedSources
+    .map((source) => {
+      console.log(source);
+      return source.url;
+    })
+    .filter((url) => !!url);
+
   return {
-    messages: [{ text }],
+    messages: [{ text, sources }],
   };
 }

-function makeLevenshtein(): ConversationEvalScorer {
-  return ({ output, expected }) => {
-    return Levenshtein({
-      output: allText(output.messages),
-      expected: allText(expected.messages),
-    });
-  };
-}
+const Factuality: ConversationEvalScorer = ({ input, output, expected }) => {
+  return _Factuality({
+    input: allText(input.messages),
+    output: allText(output.messages),
+    expected: allText(expected.messages),
+    model: 'gpt-4.1',
+    temperature: 0,
+  });
+};
+
+const BinaryNdcgAt5: ConversationEvalScorer = ({ output, expected }) => {
+  const name = 'BinaryNdcgAt5';
+  const k = 5;
+  const outputLinks = output.messages[0].sources ?? [];
+  const expectedLinks = expected.messages[0].sources;
+  if (expectedLinks) {
+    return {
+      name,
+      score: binaryNdcgAtK(expectedLinks, outputLinks, fuzzyLinkMatch, k),
+    };
+  } else {
+    // If there are no expected links, return null
+    return {
+      name,
+      score: null,
+    };
+  }
+};

 void Eval<
   ConversationEvalCaseInput,
@@ -111,8 +157,5 @@ void Eval<
 >('Compass Assistant', {
   data: makeEvalCases,
   task: makeAssistantCall,
-  // if input, output and expected were all just text we could have just stuck
-  // scorers from autoevals straight in here like [Levenshtein]. But because
-  // our types are custom we need to wrap them.
-  scores: [makeLevenshtein()],
+  scores: [Factuality, BinaryNdcgAt5],
 });
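
Note that both scorers call an allText helper that lives outside the changed hunks, so it does not appear in this diff. Presumably it just flattens the message array into one string for autoevals; a minimal sketch of that assumption (not the committed implementation):

// Sketch (assumption): flatten an array of { text } messages into a single
// string so the autoevals scorers can consume it.
function allText(messages: { text: string }[]): string {
  return messages.map((message) => message.text).join('\n');
}

To run the eval, Braintrust's CLI is typically invoked along the lines of `npx braintrust eval packages/compass-assistant/test/assistant.eval.ts` with BRAINTRUST_API_KEY set; the exact npm script this repo uses is not part of this commit.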

packages/compass-assistant/test/binaryndcgatk.ts

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+import { strict as assert } from 'assert';
+
+type MatchFunc<T> = (expected: T, actual: T) => boolean;
+
+type Primitive = string | number | boolean | null | undefined;
+
+const assertKIsValid = (k: number) =>
+  assert(k > 0 && Number.isInteger(k), 'k must be a positive integer');
+
+/**
+  Taken from https://github.com/mongodb/chatbot/blob/004a61464c2c25d6b61ad943d1ad9b2fc934eb73/packages/mongodb-rag-core/src/eval/retrievalMetrics/binaryNdcgAtK.ts#L17
+
+  Calculate binary Normalized Discounted Cumulative Gain (NDCG) at rank K.
+  NDCG is a measure of ranking quality that evaluates how well the retrieved
+  results are ordered by relevance, considering the position of each result.
+  For binary relevance (relevant or not relevant), relevance scores are 1 or 0.
+
+  @param relevantItems - List of expected relevant items (all with relevance score 1).
+  @param retrievedItems - List of retrieved items to evaluate.
+  @param matchFunc - Function to compare items for equality.
+  @param k - Cutoff rank (top-k results to consider).
+  @returns Binary NDCG at rank K.
+ */
+export function binaryNdcgAtK<T extends Primitive>(
+  relevantItems: T[],
+  retrievedItems: T[],
+  matchFunc: MatchFunc<T>,
+  k: number
+): number {
+  assertKIsValid(k);
+
+  const limit = Math.min(k, retrievedItems.length);
+
+  const deduplicatedRetrievedItems = removeDuplicates(retrievedItems, limit);
+
+  const relevanceScores = calculateRelevanceScores(
+    deduplicatedRetrievedItems,
+    relevantItems,
+    matchFunc
+  );
+
+  // Use the ndcg function to calculate NDCG
+  return ndcg(relevanceScores, relevantItems.length, k);
+}
+
+function removeDuplicates<T extends Primitive>(
+  items: T[],
+  limit: number
+): (T | null)[] {
+  const itemsInLimit = items.slice(0, limit);
+  const seen = new Set<T>();
+  return itemsInLimit.map((item) => {
+    if (seen.has(item)) {
+      return null;
+    } else {
+      seen.add(item);
+      return item;
+    }
+  });
+}
+
+function calculateRelevanceScores<T extends Primitive>(
+  retrievedItems: (T | null)[],
+  relevantItems: T[],
+  matchFunc: MatchFunc<T>
+): number[] {
+  return retrievedItems.map((item) => {
+    // handle duplicate items
+    if (item === null) {
+      return 0;
+    }
+    return relevantItems.some((relevantItem) => matchFunc(relevantItem, item))
+      ? 1
+      : 0;
+  });
+}
+
+/**
+  Normalized Discounted Cumulative Gain (NDCG)
+ */
+export function ndcg(realScores: number[], idealNum: number, k: number) {
+  const actualDcg = dcg(realScores);
+  const idealDcg = dcg(ideal(idealNum, k));
+  return idealDcg === 0 ? 0 : actualDcg / idealDcg;
+}
+
+function dcg(scores: number[]) {
+  return scores.reduce((sum, gain, i) => sum + gain / Math.log2(i + 2), 0);
+}
+
+function ideal(n: number, k: number) {
+  return Array.from({ length: k }, (_, i) => (i < n ? 1 : 0));
+}
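
A quick worked example of the metric, using made-up link fragments rather than anything from this commit, shows how the rank discount plays out:

import { binaryNdcgAtK } from './binaryndcgatk';
import { fuzzyLinkMatch } from './fuzzylinkmatch';

// Hypothetical data: two expected links, three retrieved links, the first of
// which is irrelevant.
const expectedLinks = ['docs/manual/a', 'docs/manual/b'];
const retrievedLinks = ['docs/manual/c', 'docs/manual/a', 'docs/manual/b'];

// Relevance of the retrieved list is [0, 1, 1], so:
//   DCG       = 0/log2(2) + 1/log2(3) + 1/log2(4) ~= 1.131
//   ideal DCG = 1/log2(2) + 1/log2(3)             ~= 1.631 (relevant links ranked first)
//   NDCG@5    ~= 1.131 / 1.631 ~= 0.69
console.log(binaryNdcgAtK(expectedLinks, retrievedLinks, fuzzyLinkMatch, 5));

A score of 1 would mean every expected link was retrieved and ranked at the top; missing or late-ranked links pull the score toward 0.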

packages/compass-assistant/test/eval-cases/aggregation-pipeline.ts

Lines changed: 4 additions & 0 deletions
@@ -30,6 +30,10 @@ db.orders.aggregate([
   { $unset: ["_id"] }
 ])
 `,
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/core/aggregation-pipeline/',
+    'https://www.mongodb.com/docs/compass/create-agg-pipeline/',
+  ],
 };
 
 export default evalCase;

packages/compass-assistant/test/eval-cases/filter-docs-before-search.ts

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,10 @@ const evalCase: SimpleEvalCase = {
   input: 'How can I filter docs before running a $search query?',
   expected:
     'Because the $search stage must be the first stage in an aggregation pipeline, you cannot pre-filter documents with a preceding $match stage. Instead, filtering should be performed within the $search stage using the filter clause of the compound operator. This allows you to apply predicate queries (e.g., on ranges, dates, or specific terms) to narrow down the dataset before the main query clauses (must or should) are executed. Alternatively, you can filter documents by creating a View—a partial index of your collection that pre-queries and filters out unwanted documents. Note that users need createCollection privileges to build views.',
+  expectedSources: [
+    'https://www.mongodb.com/docs/atlas/atlas-search/compound/#options',
+    'https://www.mongodb.com/docs/atlas/atlas-search/transform-documents-collections/#example--filter-documents',
+  ],
 };
 
 export default evalCase;

packages/compass-assistant/test/eval-cases/model-data.ts

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ Map relationships: Identify the relationships in your application's data and dec
 Apply design patterns: Apply schema design patterns to optimize reads and writes.
 Create indexes: Create indexes to support common query patterns.
 `,
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/data-modeling/#plan-your-schema',
+    'https://www.mongodb.com/docs/manual/data-modeling/schema-design-process/#designing-your-schema',
+  ],
 };
 
 export default evalCase;
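
Taken together with the SimpleEvalCase type above, a complete eval case that exercises the new scorer would look roughly like the following sketch; the question, answer, docs URL, and import path are illustrative assumptions, not part of this commit:

import type { SimpleEvalCase } from '../assistant.eval'; // assumed relative path

// Hypothetical eval case showing the full shape, including expectedSources.
const evalCase: SimpleEvalCase = {
  input: 'How do I create an index in Compass?',
  expected:
    'Open the Indexes tab for the collection, click Create Index, and choose the fields to index.',
  expectedSources: ['https://www.mongodb.com/docs/compass/current/indexes/'],
};

export default evalCase;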

packages/compass-assistant/test/fuzzylinkmatch.ts

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+/**
+  Taken from https://github.com/mongodb/chatbot/blob/004a61464c2c25d6b61ad943d1ad9b2fc934eb73/packages/chatbot-server-mongodb-public/src/eval/fuzzyLinkMatch.ts#L16
+
+  Performs a case-insensitive match between two URLs or URL fragments.
+
+  First attempts to match based on paths:
+  - Removes trailing slashes
+  - Checks if actual path ends with expected path (ignoring domain, query, and fragment)
+
+  If either path is empty/invalid, falls back to exact match of normalized URLs.
+
+  @param expected - The expected URL or URL fragment
+  @param actual - The actual URL or URL fragment to compare against
+  @returns true if URLs match according to above rules, false otherwise
+ */
+
+type NormalizeUrlParams = {
+  url: string;
+  removeHash?: boolean;
+  removeQueryString?: boolean;
+};
+
+// Regex used to get just the "front part" of a URL
+const optionalRegex = {
+  REMOVE_HASH: /^[^#]+/,
+  REMOVE_QUERY: /^[^?]+/,
+  REMOVE_BOTH: /^[^?#]+/,
+};
+
+/**
+  Utility function that normalizes a URL.
+  Removes http/s protocol, www, trailing backslashes.
+  Optionally removes query string and hash fragment.
+ */
+export function normalizeUrl({
+  url,
+  removeHash = true,
+  removeQueryString = true,
+}: NormalizeUrlParams): string {
+  if (removeHash && removeQueryString) {
+    url = (url.match(optionalRegex.REMOVE_BOTH) ?? [url])[0];
+  } else if (removeHash) {
+    url = (url.match(optionalRegex.REMOVE_HASH) ?? [url])[0];
+  } else if (removeQueryString) {
+    // Splitting on hash so we retain the hash fragment
+    const [frontUrl, hashFragment] = url.split('#');
+    url = (frontUrl.match(optionalRegex.REMOVE_QUERY) ?? [url])[0];
+    url += hashFragment ? `#${hashFragment}` : '';
+  }
+  return url
+    .replace(/^https?:\/\//i, '')
+    .replace(/^www\./, '')
+    .replace(/\/+$/, '');
+}
+
+export function fuzzyLinkMatch(expected: string, actual: string) {
+  const cleanActualPath = getCleanPath(actual);
+  const cleanExpectedPath = getCleanPath(expected);
+
+  // if cleaned path is not an empty string, compare cleaned paths
+  if (cleanActualPath && cleanExpectedPath) {
+    return cleanActualPath.endsWith(cleanExpectedPath);
+  } else {
+    // compare normalized full URLs
+    const normalizedActual = normalizeUrl({ url: actual });
+    const normalizedExpected = normalizeUrl({ url: expected });
+    return normalizedActual === normalizedExpected;
+  }
+}
+
+function cleanPath(path: string) {
+  return path.toLowerCase().replace(/\/$/, '');
+}
+
+function getCleanPath(maybeUrl: string) {
+  let out = '';
+  try {
+    const url = new URL(maybeUrl);
+    out = cleanPath(url.pathname);
+  } catch {
+    // If it's not a valid URL, return the input string as is
+    out = cleanPath(maybeUrl);
+  }
+  return out;
+}
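
To make the matching behaviour concrete, a few illustrative calls (the URLs are examples, not an exhaustive spec):

import { fuzzyLinkMatch } from './fuzzylinkmatch';

// Path-suffix match: protocol, www, and a trailing slash are ignored.
fuzzyLinkMatch(
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/',
  'https://mongodb.com/docs/compass/create-agg-pipeline'
); // true

// The expected value can be a bare path fragment; it only needs to be a
// suffix of the retrieved URL's path.
fuzzyLinkMatch(
  'create-agg-pipeline',
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/'
); // true

// Different pages do not match.
fuzzyLinkMatch(
  'https://www.mongodb.com/docs/manual/core/aggregation-pipeline/',
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/'
); // false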
