
Commit c3b345e

Factuality and BinaryNdcgAtK
1 parent 9bb69f2 commit c3b345e

6 files changed: 253 additions, 20 deletions


packages/compass-assistant/test/assistant.eval.ts

Lines changed: 63 additions & 20 deletions
@@ -1,27 +1,41 @@
 /* eslint-disable no-console */
+import { createOpenAI } from '@ai-sdk/openai';
+import { streamText } from 'ai';
+import { init, Factuality as _Factuality } from 'autoevals';
 import { Eval } from 'braintrust';
 import type { EvalCase, EvalScorer } from 'braintrust';
-import { Levenshtein } from 'autoevals';
-import { streamText } from 'ai';
-import { createOpenAI } from '@ai-sdk/openai';
+import { OpenAI } from 'openai';
 import { evalCases } from './eval-cases';
+import { fuzzyLinkMatch } from './fuzzylinkmatch';
+import { binaryNdcgAtK } from './binaryndcgatk';
+
+const client = new OpenAI({
+  baseURL: 'https://api.braintrust.dev/v1/proxy',
+  apiKey: process.env.BRAINTRUST_API_KEY,
+});
+
+init({ client });

 export type SimpleEvalCase = {
   name?: string;
   input: string;
   expected: string;
+  expectedSources?: string[];
 };

 type Message = {
   text: string;
 };
+type InputMessage = Message;
+type OutputMessage = Message & { sources: string[] };
+type ExpectedMessage = OutputMessage;

 type ConversationEvalCaseInput = {
-  messages: Message[];
+  messages: InputMessage[];
 };

 type ConversationEvalCaseExpected = {
-  messages: Message[];
+  messages: OutputMessage[];
 };

 type ConversationEvalCase = EvalCase<
@@ -36,7 +50,7 @@ type ConversationTaskOutput = {
   // again this could also be an array of messages and each message could be an
   // object for future-proofing. But we're probably just going to be taking the
   // result from the chatbot as a block of text for test purposes
-  messages: Message[];
+  messages: ExpectedMessage[];
 };

 type ConversationEvalScorer = EvalScorer<
@@ -57,7 +71,7 @@ function makeEvalCases(): ConversationEvalCase[] {
         messages: [{ text: c.input }],
       },
       expected: {
-        messages: [{ text: c.expected }],
+        messages: [{ text: c.expected, sources: c.expectedSources || [] }],
       },
       metadata: {},
     };
@@ -78,6 +92,7 @@ async function makeAssistantCall(

   const result = streamText({
     model: openai.responses('mongodb-chat-latest'),
+    temperature: 0,
     prompt,
   });

@@ -90,19 +105,50 @@ async function makeAssistantCall(
     }
   }
   const text = chunks.join('');
+
+  // TODO: something up with this type
+  const resolvedSources = (await result.sources) as { url: string }[];
+
+  const sources = resolvedSources
+    .map((source) => {
+      console.log(source);
+      return source.url;
+    })
+    .filter((url) => !!url);
+
   return {
-    messages: [{ text }],
+    messages: [{ text, sources }],
   };
 }

-function makeLevenshtein(): ConversationEvalScorer {
-  return ({ output, expected }) => {
-    return Levenshtein({
-      output: allText(output.messages),
-      expected: allText(expected.messages),
-    });
-  };
-}
+const Factuality: ConversationEvalScorer = ({ input, output, expected }) => {
+  return _Factuality({
+    input: allText(input.messages),
+    output: allText(output.messages),
+    expected: allText(expected.messages),
+    model: 'gpt-4.1',
+    temperature: 0,
+  });
+};
+
+const BinaryNdcgAt5: ConversationEvalScorer = ({ output, expected }) => {
+  const name = 'BinaryNdcgAt5';
+  const k = 5;
+  const outputLinks = output.messages[0].sources ?? [];
+  const expectedLinks = expected.messages[0].sources;
+  if (expectedLinks) {
+    return {
+      name,
+      score: binaryNdcgAtK(expectedLinks, outputLinks, fuzzyLinkMatch, k),
+    };
+  } else {
+    // If there are no expected links, return null
+    return {
+      name,
+      score: null,
+    };
+  }
+};

 void Eval<
   ConversationEvalCaseInput,
@@ -111,8 +157,5 @@ void Eval<
 >('Compass Assistant', {
   data: makeEvalCases,
   task: makeAssistantCall,
-  // if input, output and expected were all just text we could have just stuck
-  // scorers from autoevals straight in here like [Levenshtein]. But because
-  // our types are custom we need to wrap them.
-  scores: [makeLevenshtein()],
+  scores: [Factuality, BinaryNdcgAt5],
 });
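
Note that both scorers call an allText helper that lives outside the changed hunks, so it does not appear in this diff. Presumably it just flattens the message array into one string for autoevals; a minimal sketch of that assumption (not the committed implementation):

// Sketch (assumption): flatten an array of { text } messages into a single
// string so the autoevals scorers can consume it.
function allText(messages: { text: string }[]): string {
  return messages.map((message) => message.text).join('\n');
}

To run the eval, Braintrust's CLI is typically invoked along the lines of `npx braintrust eval packages/compass-assistant/test/assistant.eval.ts` with BRAINTRUST_API_KEY set; the exact npm script this repo uses is not part of this commit.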

packages/compass-assistant/test/binaryndcgatk.ts

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+import { strict as assert } from 'assert';
+
+type MatchFunc<T> = (expected: T, actual: T) => boolean;
+
+type Primitive = string | number | boolean | null | undefined;
+
+const assertKIsValid = (k: number) =>
+  assert(k > 0 && Number.isInteger(k), 'k must be a positive integer');
+
+/**
+  Taken from https://github.com/mongodb/chatbot/blob/004a61464c2c25d6b61ad943d1ad9b2fc934eb73/packages/mongodb-rag-core/src/eval/retrievalMetrics/binaryNdcgAtK.ts#L17
+
+  Calculate binary Normalized Discounted Cumulative Gain (NDCG) at rank K.
+  NDCG is a measure of ranking quality that evaluates how well the retrieved
+  results are ordered by relevance, considering the position of each result.
+  For binary relevance (relevant or not relevant), relevance scores are 1 or 0.
+
+  @param relevantItems - List of expected relevant items (all with relevance score 1).
+  @param retrievedItems - List of retrieved items to evaluate.
+  @param matchFunc - Function to compare items for equality.
+  @param k - Cutoff rank (top-k results to consider).
+  @returns Binary NDCG at rank K.
+ */
+export function binaryNdcgAtK<T extends Primitive>(
+  relevantItems: T[],
+  retrievedItems: T[],
+  matchFunc: MatchFunc<T>,
+  k: number
+): number {
+  assertKIsValid(k);
+
+  const limit = Math.min(k, retrievedItems.length);
+
+  const deduplicatedRetrievedItems = removeDuplicates(retrievedItems, limit);
+
+  const relevanceScores = calculateRelevanceScores(
+    deduplicatedRetrievedItems,
+    relevantItems,
+    matchFunc
+  );
+
+  // Use the ndcg function to calculate NDCG
+  return ndcg(relevanceScores, relevantItems.length, k);
+}
+
+function removeDuplicates<T extends Primitive>(
+  items: T[],
+  limit: number
+): (T | null)[] {
+  const itemsInLimit = items.slice(0, limit);
+  const seen = new Set<T>();
+  return itemsInLimit.map((item) => {
+    if (seen.has(item)) {
+      return null;
+    } else {
+      seen.add(item);
+      return item;
+    }
+  });
+}
+
+function calculateRelevanceScores<T extends Primitive>(
+  retrievedItems: (T | null)[],
+  relevantItems: T[],
+  matchFunc: MatchFunc<T>
+): number[] {
+  return retrievedItems.map((item) => {
+    // handle duplicate items
+    if (item === null) {
+      return 0;
+    }
+    return relevantItems.some((relevantItem) => matchFunc(relevantItem, item))
+      ? 1
+      : 0;
+  });
+}
+
+/**
+  Normalized Discounted Cumulative Gain (NDCG)
+ */
+export function ndcg(realScores: number[], idealNum: number, k: number) {
+  const actualDcg = dcg(realScores);
+  const idealDcg = dcg(ideal(idealNum, k));
+  return idealDcg === 0 ? 0 : actualDcg / idealDcg;
+}
+
+function dcg(scores: number[]) {
+  return scores.reduce((sum, gain, i) => sum + gain / Math.log2(i + 2), 0);
+}
+
+function ideal(n: number, k: number) {
+  return Array.from({ length: k }, (_, i) => (i < n ? 1 : 0));
+}
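
A quick worked example of the metric, using made-up link fragments rather than anything from this commit, shows how the rank discount plays out:

import { binaryNdcgAtK } from './binaryndcgatk';
import { fuzzyLinkMatch } from './fuzzylinkmatch';

// Hypothetical data: two expected links, three retrieved links, the first of
// which is irrelevant.
const expectedLinks = ['docs/manual/a', 'docs/manual/b'];
const retrievedLinks = ['docs/manual/c', 'docs/manual/a', 'docs/manual/b'];

// Relevance of the retrieved list is [0, 1, 1], so:
//   DCG       = 0/log2(2) + 1/log2(3) + 1/log2(4) ~= 1.131
//   ideal DCG = 1/log2(2) + 1/log2(3)             ~= 1.631 (relevant links ranked first)
//   NDCG@5    ~= 1.131 / 1.631 ~= 0.69
console.log(binaryNdcgAtK(expectedLinks, retrievedLinks, fuzzyLinkMatch, 5));

A score of 1 would mean every expected link was retrieved and ranked at the top; missing or late-ranked links pull the score toward 0.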

packages/compass-assistant/test/eval-cases/aggregation-pipeline.ts

Lines changed: 4 additions & 0 deletions
@@ -30,6 +30,10 @@ db.orders.aggregate([
   { $unset: ["_id"] }
 ])
 `,
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/core/aggregation-pipeline/',
+    'https://www.mongodb.com/docs/compass/create-agg-pipeline/',
+  ],
 };
 
 export default evalCase;

packages/compass-assistant/test/eval-cases/filter-docs-before-search.ts

Lines changed: 4 additions & 0 deletions
@@ -4,6 +4,10 @@ const evalCase: SimpleEvalCase = {
   input: 'How can I filter docs before running a $search query?',
   expected:
     'Because the $search stage must be the first stage in an aggregation pipeline, you cannot pre-filter documents with a preceding $match stage. Instead, filtering should be performed within the $search stage using the filter clause of the compound operator. This allows you to apply predicate queries (e.g., on ranges, dates, or specific terms) to narrow down the dataset before the main query clauses (must or should) are executed. Alternatively, you can filter documents by creating a View—a partial index of your collection that pre-queries and filters out unwanted documents. Note that users need createCollection privileges to build views.',
+  expectedSources: [
+    'https://www.mongodb.com/docs/atlas/atlas-search/compound/#options',
+    'https://www.mongodb.com/docs/atlas/atlas-search/transform-documents-collections/#example--filter-documents',
+  ],
 };
 
 export default evalCase;

packages/compass-assistant/test/eval-cases/model-data.ts

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ Map relationships: Identify the relationships in your application's data and dec
 Apply design patterns: Apply schema design patterns to optimize reads and writes.
 Create indexes: Create indexes to support common query patterns.
 `,
+  expectedSources: [
+    'https://www.mongodb.com/docs/manual/data-modeling/#plan-your-schema',
+    'https://www.mongodb.com/docs/manual/data-modeling/schema-design-process/#designing-your-schema',
+  ],
 };
 
 export default evalCase;
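
Taken together with the SimpleEvalCase type above, a complete eval case that exercises the new scorer would look roughly like the following sketch; the question, answer, docs URL, and import path are illustrative assumptions, not part of this commit:

import type { SimpleEvalCase } from '../assistant.eval'; // assumed relative path

// Hypothetical eval case showing the full shape, including expectedSources.
const evalCase: SimpleEvalCase = {
  input: 'How do I create an index in Compass?',
  expected:
    'Open the Indexes tab for the collection, click Create Index, and choose the fields to index.',
  expectedSources: ['https://www.mongodb.com/docs/compass/current/indexes/'],
};

export default evalCase;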

packages/compass-assistant/test/fuzzylinkmatch.ts

Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
+/**
+  Taken from https://github.com/mongodb/chatbot/blob/004a61464c2c25d6b61ad943d1ad9b2fc934eb73/packages/chatbot-server-mongodb-public/src/eval/fuzzyLinkMatch.ts#L16
+
+  Performs a case-insensitive match between two URLs or URL fragments.
+
+  First attempts to match based on paths:
+  - Removes trailing slashes
+  - Checks if actual path ends with expected path (ignoring domain, query, and fragment)
+
+  If either path is empty/invalid, falls back to exact match of normalized URLs.
+
+  @param expected - The expected URL or URL fragment
+  @param actual - The actual URL or URL fragment to compare against
+  @returns true if URLs match according to above rules, false otherwise
+ */
+
+type NormalizeUrlParams = {
+  url: string;
+  removeHash?: boolean;
+  removeQueryString?: boolean;
+};
+
+// Regex used to get just the "front part" of a URL
+const optionalRegex = {
+  REMOVE_HASH: /^[^#]+/,
+  REMOVE_QUERY: /^[^?]+/,
+  REMOVE_BOTH: /^[^?#]+/,
+};
+
+/**
+  Utility function that normalizes a URL.
+  Removes http/s protocol, www, trailing backslashes.
+  Optionally removes query string and hash fragment.
+ */
+export function normalizeUrl({
+  url,
+  removeHash = true,
+  removeQueryString = true,
+}: NormalizeUrlParams): string {
+  if (removeHash && removeQueryString) {
+    url = (url.match(optionalRegex.REMOVE_BOTH) ?? [url])[0];
+  } else if (removeHash) {
+    url = (url.match(optionalRegex.REMOVE_HASH) ?? [url])[0];
+  } else if (removeQueryString) {
+    // Splitting on hash so we retain the hash fragment
+    const [frontUrl, hashFragment] = url.split('#');
+    url = (frontUrl.match(optionalRegex.REMOVE_QUERY) ?? [url])[0];
+    url += hashFragment ? `#${hashFragment}` : '';
+  }
+  return url
+    .replace(/^https?:\/\//i, '')
+    .replace(/^www\./, '')
+    .replace(/\/+$/, '');
+}
+
+export function fuzzyLinkMatch(expected: string, actual: string) {
+  const cleanActualPath = getCleanPath(actual);
+  const cleanExpectedPath = getCleanPath(expected);
+
+  // if cleaned path is not an empty string, compare cleaned paths
+  if (cleanActualPath && cleanExpectedPath) {
+    return cleanActualPath.endsWith(cleanExpectedPath);
+  } else {
+    // compare normalized full URLs
+    const normalizedActual = normalizeUrl({ url: actual });
+    const normalizedExpected = normalizeUrl({ url: expected });
+    return normalizedActual === normalizedExpected;
+  }
+}
+
+function cleanPath(path: string) {
+  return path.toLowerCase().replace(/\/$/, '');
+}
+
+function getCleanPath(maybeUrl: string) {
+  let out = '';
+  try {
+    const url = new URL(maybeUrl);
+    out = cleanPath(url.pathname);
+  } catch {
+    // If it's not a valid URL, return the input string as is
+    out = cleanPath(maybeUrl);
+  }
+  return out;
+}
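
To make the matching behaviour concrete, a few illustrative calls (the URLs are examples, not an exhaustive spec):

import { fuzzyLinkMatch } from './fuzzylinkmatch';

// Path-suffix match: protocol, www, and a trailing slash are ignored.
fuzzyLinkMatch(
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/',
  'https://mongodb.com/docs/compass/create-agg-pipeline'
); // true

// The expected value can be a bare path fragment; it only needs to be a
// suffix of the retrieved URL's path.
fuzzyLinkMatch(
  'create-agg-pipeline',
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/'
); // true

// Different pages do not match.
fuzzyLinkMatch(
  'https://www.mongodb.com/docs/manual/core/aggregation-pipeline/',
  'https://www.mongodb.com/docs/compass/create-agg-pipeline/'
); // false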
