Skip to content

Commit 9db296e

Browse files
chore: refactor to make writing tests simpler, plus other QOL improvements.
1. Removes the unnecessary suite description from tests 2. Removes the test suite name from the storage as well 3. Centralizes the constants used everywhere in the SDK 4. Adds clarifying comments and docs wherever necessary 5. Writes tests for accuracy-scorer
1 parent cb46c43 commit 9db296e

33 files changed

+739
-556
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,5 @@ state.json
1111

1212
tests/tmp
1313
coverage
14-
.accuracy-snapshots
14+
# Assets generated by accuracy runs
15+
.accuracy

scripts/update-accuracy-run-status.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,18 @@
11
import { getAccuracySnapshotStorage } from "../tests/accuracy/sdk/accuracy-snapshot-storage/get-snapshot-storage.js";
2-
import {
3-
AccuracyRunStatus,
4-
AccuracyRunStatuses,
5-
} from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
2+
import { AccuracyRunStatus } from "../tests/accuracy/sdk/accuracy-snapshot-storage/snapshot-storage.js";
63

74
const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID;
85
const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS;
96

10-
let status: AccuracyRunStatuses | undefined;
117
if (
128
!envAccuracyRunId ||
139
(envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed)
1410
) {
1511
process.exit(1);
1612
}
1713

18-
console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`);
14+
console.time(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`);
1915
const storage = await getAccuracySnapshotStorage();
2016
await storage.updateAccuracyRunStatus(envAccuracyRunId, envAccuracyRunStatus);
2117
await storage.close();
22-
console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${status} in`);
18+
console.timeEnd(`Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`);

tests/accuracy/aggregate.test.ts

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,28 +1,16 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
3-
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
43

5-
function callsAggregate(prompt: string, pipeline: Record<string, unknown>[]): AccuracyTestConfig {
6-
return {
7-
injectConnectedAssumption: true,
8-
prompt: prompt,
9-
mockedTools: {},
4+
describeAccuracyTests(getAvailableModels(), [
5+
{
6+
prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
107
expectedToolCalls: [
118
{
129
toolName: "aggregate",
1310
parameters: {
14-
pipeline: pipeline,
11+
pipeline: { $group: { _id: "$release_year", count: { $sum: 1 } } },
1512
},
1613
},
1714
],
18-
};
19-
}
20-
21-
describeAccuracyTests(getAvailableModels(), {
22-
...describeSuite("should call 'aggregate' tool", [
23-
callsAggregate(
24-
"Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them",
25-
[{ $group: { _id: "$release_year", count: { $sum: 1 } } }]
26-
),
27-
]),
28-
});
15+
},
16+
]);
Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44

55
function callsCollectionIndexes(prompt: string): AccuracyTestConfig {
66
return {
7-
injectConnectedAssumption: true,
87
prompt: prompt,
9-
mockedTools: {},
108
expectedToolCalls: [
119
{
1210
toolName: "collection-indexes",
@@ -19,12 +17,10 @@ function callsCollectionIndexes(prompt: string): AccuracyTestConfig {
1917
};
2018
}
2119

22-
describeAccuracyTests(getAvailableModels(), {
23-
...describeSuite("should call 'collection-indexes' tool", [
24-
callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"),
25-
callsCollectionIndexes("List all the indexes in movies collection in mflix database"),
26-
callsCollectionIndexes(
27-
`Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`
28-
),
29-
]),
30-
});
20+
describeAccuracyTests(getAvailableModels(), [
21+
callsCollectionIndexes("How many indexes do I have in 'mflix.movies' namespace?"),
22+
callsCollectionIndexes("List all the indexes in movies collection in mflix database"),
23+
callsCollectionIndexes(
24+
`Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`
25+
),
26+
]);
Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44

55
function callsCollectionSchema(prompt: string): AccuracyTestConfig {
66
return {
7-
injectConnectedAssumption: true,
87
prompt: prompt,
9-
mockedTools: {},
108
expectedToolCalls: [
119
{
1210
toolName: "collection-schema",
@@ -19,9 +17,7 @@ function callsCollectionSchema(prompt: string): AccuracyTestConfig {
1917
};
2018
}
2119

22-
describeAccuracyTests(getAvailableModels(), {
23-
...describeSuite("should call 'collection-schema' tool", [
24-
callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"),
25-
callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"),
26-
]),
27-
});
20+
describeAccuracyTests(getAvailableModels(), [
21+
callsCollectionSchema("Is there a title field in 'db1.coll1' namespace?"),
22+
callsCollectionSchema("What is the type of value stored in title field in coll1 collection in db1 database?"),
23+
]);
Lines changed: 13 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,22 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
3-
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
4-
import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
53

6-
function callsCollectionStorageSize(prompt: string, expectedToolCalls: ExpectedToolCall[]): AccuracyTestConfig {
7-
return {
8-
injectConnectedAssumption: true,
9-
prompt: prompt,
10-
mockedTools: {},
11-
expectedToolCalls: expectedToolCalls,
12-
};
13-
}
14-
15-
describeAccuracyTests(getAvailableModels(), {
16-
...describeSuite("should only call 'collection-storage-size' tool", [
17-
callsCollectionStorageSize("What is the size of 'mflix.movies' namespace", [
4+
describeAccuracyTests(getAvailableModels(), [
5+
{
6+
prompt: "What is the size of 'mflix.movies' namespace",
7+
expectedToolCalls: [
188
{
199
toolName: "collection-storage-size",
2010
parameters: {
2111
database: "mflix",
2212
collection: "movies",
2313
},
2414
},
25-
]),
26-
]),
27-
...describeSuite("should call 'collection-storage-size' tool after another tool/s", [
28-
callsCollectionStorageSize("How much size is each collection in comics database", [
15+
],
16+
},
17+
{
18+
prompt: "How much size is each collection in comics database",
19+
expectedToolCalls: [
2920
{
3021
toolName: "list-collections",
3122
parameters: {
@@ -46,6 +37,6 @@ describeAccuracyTests(getAvailableModels(), {
4637
collection: "characters",
4738
},
4839
},
49-
]),
50-
]),
51-
});
40+
],
41+
},
42+
]);

tests/accuracy/count.test.ts

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44

55
function callsCountToolWithEmptyQuery(prompt: string, database = "mflix", collection = "movies"): AccuracyTestConfig {
66
return {
7-
injectConnectedAssumption: true,
87
prompt: prompt,
9-
mockedTools: {},
108
expectedToolCalls: [
119
{
1210
toolName: "count",
@@ -26,9 +24,7 @@ function callsCountToolWithQuery(
2624
query: Record<string, unknown> = {}
2725
): AccuracyTestConfig {
2826
return {
29-
injectConnectedAssumption: true,
3027
prompt: prompt,
31-
mockedTools: {},
3228
expectedToolCalls: [
3329
{
3430
toolName: "count",
@@ -42,19 +38,17 @@ function callsCountToolWithQuery(
4238
};
4339
}
4440

45-
describeAccuracyTests(getAvailableModels(), {
46-
...describeSuite("should only call 'count' tool", [
47-
callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."),
48-
callsCountToolWithEmptyQuery(
49-
"How many documents are there in 'characters' collection in 'comics' database?",
50-
"comics",
51-
"characters"
52-
),
53-
callsCountToolWithQuery(
54-
"Count all the documents in 'mflix.movies' namespace with runtime less than 100?",
55-
"mflix",
56-
"movies",
57-
{ runtime: { $lt: 100 } }
58-
),
59-
]),
60-
});
41+
describeAccuracyTests(getAvailableModels(), [
42+
callsCountToolWithEmptyQuery("Count number of documents in 'mflix.movies' namespace."),
43+
callsCountToolWithEmptyQuery(
44+
"How many documents are there in 'characters' collection in 'comics' database?",
45+
"comics",
46+
"characters"
47+
),
48+
callsCountToolWithQuery(
49+
"Count all the documents in 'mflix.movies' namespace with runtime less than 100?",
50+
"mflix",
51+
"movies",
52+
{ runtime: { $lt: 100 } }
53+
),
54+
]);
Lines changed: 21 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44
import { ExpectedToolCall } from "./sdk/accuracy-snapshot-storage/snapshot-storage.js";
55

66
function callsCreateCollection(prompt: string, database: string, collection: string): AccuracyTestConfig {
77
return {
8-
injectConnectedAssumption: true,
98
prompt: prompt,
10-
mockedTools: {},
119
expectedToolCalls: [
1210
{
1311
toolName: "create-collection",
@@ -29,29 +27,25 @@ function callsCreateCollectionWithListCollections(prompt: string, expectedToolCa
2927
};
3028
}
3129

32-
describeAccuracyTests(getAvailableModels(), {
33-
...describeSuite("should only call 'create-collection' tool", [
34-
callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"),
35-
callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"),
36-
]),
37-
...describeSuite("should call 'create-collection' alongside other required tools", [
38-
callsCreateCollectionWithListCollections(
39-
"If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
40-
[
41-
{
42-
toolName: "list-collections",
43-
parameters: {
44-
database: "mflix",
45-
},
30+
describeAccuracyTests(getAvailableModels(), [
31+
callsCreateCollection("Create a new namespace 'mflix.documentaries'", "mflix", "documentaries"),
32+
callsCreateCollection("Create a new collection villains in comics database", "comics", "villains"),
33+
callsCreateCollectionWithListCollections(
34+
"If and only if, the namespace 'mflix.documentaries' does not exist, then create it",
35+
[
36+
{
37+
toolName: "list-collections",
38+
parameters: {
39+
database: "mflix",
4640
},
47-
{
48-
toolName: "create-collection",
49-
parameters: {
50-
database: "mflix",
51-
collection: "documentaries",
52-
},
41+
},
42+
{
43+
toolName: "create-collection",
44+
parameters: {
45+
database: "mflix",
46+
collection: "documentaries",
5347
},
54-
]
55-
),
56-
]),
57-
});
48+
},
49+
]
50+
),
51+
]);

tests/accuracy/create-index.test.ts

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44

55
function callsCreateIndex(prompt: string, indexKeys: Record<string, unknown>): AccuracyTestConfig {
66
return {
7-
injectConnectedAssumption: true,
87
prompt: prompt,
9-
mockedTools: {},
108
expectedToolCalls: [
119
{
1210
toolName: "create-index",
@@ -20,16 +18,14 @@ function callsCreateIndex(prompt: string, indexKeys: Record<string, unknown>): A
2018
};
2119
}
2220

23-
describeAccuracyTests(getAvailableModels(), {
24-
...describeSuite("should call 'create-index' tool", [
25-
callsCreateIndex(
26-
"Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }",
27-
{
28-
release_year: 1,
29-
}
30-
),
31-
callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", {
32-
title: "text",
33-
}),
34-
]),
35-
});
21+
describeAccuracyTests(getAvailableModels(), [
22+
callsCreateIndex(
23+
"Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }",
24+
{
25+
release_year: 1,
26+
}
27+
),
28+
callsCreateIndex("Create a text index on title field in 'mflix.movies' namespace", {
29+
title: "text",
30+
}),
31+
]);

tests/accuracy/db-stats.test.ts

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
import { describeAccuracyTests, describeSuite } from "./sdk/describe-accuracy-tests.js";
1+
import { describeAccuracyTests } from "./sdk/describe-accuracy-tests.js";
22
import { getAvailableModels } from "./sdk/models.js";
33
import { AccuracyTestConfig } from "./sdk/describe-accuracy-tests.js";
44

55
function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestConfig {
66
return {
7-
injectConnectedAssumption: true,
87
prompt: prompt,
9-
mockedTools: {},
108
expectedToolCalls: [
119
{
1210
toolName: "db-stats",
@@ -18,8 +16,4 @@ function callsListDatabases(prompt: string, database = "mflix"): AccuracyTestCon
1816
};
1917
}
2018

21-
describeAccuracyTests(getAvailableModels(), {
22-
...describeSuite("should only call 'db-stats' tool", [
23-
callsListDatabases("What is the size occupied by database mflix?"),
24-
]),
25-
});
19+
describeAccuracyTests(getAvailableModels(), [callsListDatabases("What is the size occupied by database mflix?")]);

0 commit comments

Comments
 (0)