diff --git a/.github/workflows/accuracy-tests.yml b/.github/workflows/accuracy-tests.yml new file mode 100644 index 00000000..e5f08d51 --- /dev/null +++ b/.github/workflows/accuracy-tests.yml @@ -0,0 +1,55 @@ +name: Accuracy Tests + +on: + workflow_dispatch: + push: + branches: + - main + pull_request: + types: + - labeled + +jobs: + run-accuracy-tests: + name: Run Accuracy Tests + runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write + if: | + github.event_name == 'workflow_dispatch' || + (github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests') + env: + MDB_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_OPEN_AI_API_KEY }} + MDB_GEMINI_API_KEY: ${{ secrets.ACCURACY_GEMINI_API_KEY }} + MDB_AZURE_OPEN_AI_API_KEY: ${{ secrets.ACCURACY_AZURE_OPEN_AI_API_KEY }} + MDB_AZURE_OPEN_AI_API_URL: ${{ vars.ACCURACY_AZURE_OPEN_AI_API_URL }} + MDB_ACCURACY_MDB_URL: ${{ secrets.ACCURACY_MDB_CONNECTION_STRING }} + MDB_ACCURACY_MDB_DB: ${{ vars.ACCURACY_MDB_DB }} + MDB_ACCURACY_MDB_COLLECTION: ${{ vars.ACCURACY_MDB_COLLECTION }} + MDB_ACCURACY_BASELINE_COMMIT: ${{ github.event.pull_request.base.sha || '' }} + steps: + - uses: GitHubSecurityLab/actions-permissions/monitor@v1 + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version-file: package.json + cache: "npm" + - name: Install dependencies + run: npm ci + - name: Run accuracy tests + run: npm run test:accuracy + - name: Upload accuracy test summary + if: always() + uses: actions/upload-artifact@v4 + with: + name: accuracy-test-summary + path: .accuracy/test-summary.html + - name: Comment summary on PR + if: github.event_name == 'pull_request' && github.event.label.name == 'accuracy-tests' + uses: marocchino/sticky-pull-request-comment@d2ad0de260ae8b0235ce059e63f2949ba9e05943 # v2 + with: + # Hides the previous comment and add a comment at the end + hide_and_recreate: true + hide_classify: "OUTDATED" + path: .accuracy/test-brief.md diff --git a/.gitignore 
b/.gitignore index 4e3f7a54..49550e27 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,5 @@ state.json tests/tmp coverage +# Generated assets by accuracy runs +.accuracy diff --git a/package-lock.json b/package-lock.json index a80dcb27..7b092c89 100644 --- a/package-lock.json +++ b/package-lock.json @@ -31,24 +31,33 @@ "mongodb-mcp-server": "dist/index.js" }, "devDependencies": { + "@ai-sdk/azure": "^1.3.24", + "@ai-sdk/google": "^1.2.22", + "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", + "@types/proper-lockfile": "^4.1.4", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", + "ai": "^4.3.17", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": "^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "proper-lockfile": "^4.1.2", + "simple-git": "^3.28.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", + "uuid": "^11.1.0", "vitest": "^3.2.4", "yaml": "^2.8.0" }, @@ -56,6 +65,135 @@ "node": "^20.19.0 || ^22.12.0 || >= 23.0.0" } }, + "@himanshusinghs/ai-sdk-google": { + "extraneous": true + }, + "node_modules/@ai-sdk/azure": { + "version": "1.3.24", + "resolved": "https://registry.npmjs.org/@ai-sdk/azure/-/azure-1.3.24.tgz", + "integrity": "sha512-6zOG8mwmd8esSL/L9oYFZSyZWORRTxuG6on9A3RdPe7MRJ607Q6BWsuvul79kecbLf5xQ4bfP7LzXaBizsd8OA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/openai": "1.3.23", + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/google": { + "version": "1.2.22", + "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.2.22.tgz", + "integrity": 
"sha512-Ppxu3DIieF1G9pyQ5O1Z646GYR0gkC57YdBqXJ82qvCdhEhZHu0TWhmnOoeIWe2olSbuDeoOY+MfJrW8dzS3Hw==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/openai": { + "version": "1.3.23", + "resolved": "https://registry.npmjs.org/@ai-sdk/openai/-/openai-1.3.23.tgz", + "integrity": "sha512-86U7rFp8yacUAOE/Jz8WbGcwMCqWvjK33wk5DXkfnAOEn3mx2r7tNSJdjukQFZbAK97VMXGPPHxF+aEARDXRXQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.0.0" + } + }, + "node_modules/@ai-sdk/provider": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider/-/provider-1.1.3.tgz", + "integrity": "sha512-qZMxYJ0qqX/RfnuIaab+zp8UAeJn/ygXXAffR5I4N0n1IrvA6qBsjc8hXLmBiMV2zoXlifkacF7sEFnYnjBcqg==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "json-schema": "^0.4.0" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/@ai-sdk/provider-utils": { + "version": "2.2.8", + "resolved": "https://registry.npmjs.org/@ai-sdk/provider-utils/-/provider-utils-2.2.8.tgz", + "integrity": "sha512-fqhG+4sCVv8x7nFzYnFo19ryhAa3w096Kmc3hWxMQfW/TubPOmt3A6tYZhl4mUfQWWQMsuSkLrtjlWuXBVSGQA==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "nanoid": "^3.3.8", + "secure-json-parse": "^2.7.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, + "node_modules/@ai-sdk/react": { + "version": "1.2.12", + "resolved": "https://registry.npmjs.org/@ai-sdk/react/-/react-1.2.12.tgz", + "integrity": "sha512-jK1IZZ22evPZoQW3vlkZ7wvjYGYF+tRBKXtrcolduIkQ/m/sOAVcVeVDUDvh1T91xCnWCdUGCPZg2avZ90mv3g==", + "dev": true, + "license": "Apache-2.0", + 
"dependencies": { + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/ui-utils": "1.2.11", + "swr": "^2.2.5", + "throttleit": "2.1.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, + "node_modules/@ai-sdk/ui-utils": { + "version": "1.2.11", + "resolved": "https://registry.npmjs.org/@ai-sdk/ui-utils/-/ui-utils-1.2.11.tgz", + "integrity": "sha512-3zcwCc8ezzFlwp3ZD15wAPjf2Au4s3vAbKsXQVyhxODHcmu0iyPO2Eua6D/vicq/AUm/BAo60r97O6HU+EI0+w==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "zod-to-json-schema": "^3.24.1" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "zod": "^3.23.8" + } + }, "node_modules/@ampproject/remapping": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/@ampproject/remapping/-/remapping-2.3.0.tgz", @@ -1842,6 +1980,23 @@ "jsep": "^0.4.0||^1.0.0" } }, + "node_modules/@kwsites/file-exists": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/file-exists/-/file-exists-1.1.1.tgz", + "integrity": "sha512-m9/5YGR18lIwxSFDwfE3oA7bWuq9kdau6ugN4H2rJeyhFQZcG9AgSHkQtSD15a8WvTgfz9aikZMrKPHvbpqFiw==", + "dev": true, + "license": "MIT", + "dependencies": { + "debug": "^4.1.1" + } + }, + "node_modules/@kwsites/promise-deferred": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@kwsites/promise-deferred/-/promise-deferred-1.1.1.tgz", + "integrity": "sha512-GaHYm+c0O9MjZRu0ongGBRbinu8gVAMd2UZjji6jVmqKtZluZnptXGWhz1E8j8D2HJ3f/yMxKAUC0b+57wncIw==", + "dev": true, + "license": "MIT" + }, "node_modules/@modelcontextprotocol/inspector": { "version": "0.16.1", "resolved": "https://registry.npmjs.org/@modelcontextprotocol/inspector/-/inspector-0.16.1.tgz", @@ -4762,6 +4917,19 @@ "node": ">=18.0.0" } }, + "node_modules/@smithy/middleware-retry/node_modules/uuid": { + 
"version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "license": "MIT", + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/@smithy/middleware-serde": { "version": "4.0.3", "resolved": "https://registry.npmjs.org/@smithy/middleware-serde/-/middleware-serde-4.0.3.tgz", @@ -5205,6 +5373,13 @@ "devOptional": true, "license": "MIT" }, + "node_modules/@types/diff-match-patch": { + "version": "1.0.36", + "resolved": "https://registry.npmjs.org/@types/diff-match-patch/-/diff-match-patch-1.0.36.tgz", + "integrity": "sha512-xFdR6tkm0MWvBfO8xXCSsinYxHcqkQUlcHeSpMC2ukzOb6lwQAfDmW+Qt0AvlGd8HpsS28qKsB+oPeJn9I39jg==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/estree": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.8.tgz", @@ -5227,6 +5402,23 @@ "undici-types": "~7.8.0" } }, + "node_modules/@types/proper-lockfile": { + "version": "4.1.4", + "resolved": "https://registry.npmjs.org/@types/proper-lockfile/-/proper-lockfile-4.1.4.tgz", + "integrity": "sha512-uo2ABllncSqg9F1D4nugVl9v93RmjxF6LJzQLMLDdPaXCUIDPeOJ21Gbqi43xNKzBi/WQ0Q0dICqufzQbMjipQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/retry": "*" + } + }, + "node_modules/@types/retry": { + "version": "0.12.5", + "resolved": "https://registry.npmjs.org/@types/retry/-/retry-0.12.5.tgz", + "integrity": "sha512-3xSjTp3v03X/lSQLkczaN9UIEwJMoMCA1+Nb5HfbJEQWogdeQIyVtTvxPXDQjZ5zws8rFQfVfRdz03ARihPJgw==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/simple-oauth2": { "version": "5.0.7", "resolved": "https://registry.npmjs.org/@types/simple-oauth2/-/simple-oauth2-5.0.7.tgz", @@ -5739,6 +5931,33 @@ "node": ">= 14" } }, + "node_modules/ai": { + "version": "4.3.17", + "resolved": 
"https://registry.npmjs.org/ai/-/ai-4.3.17.tgz", + "integrity": "sha512-uWqIQ94Nb1GTYtYElGHegJMOzv3r2mCKNFlKrqkft9xrfvIahTI5OdcnD5U9612RFGuUNGmSDTO1/YRNFXobaQ==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "1.1.3", + "@ai-sdk/provider-utils": "2.2.8", + "@ai-sdk/react": "1.2.12", + "@ai-sdk/ui-utils": "1.2.11", + "@opentelemetry/api": "1.9.0", + "jsondiffpatch": "0.6.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "react": "^18 || ^19 || ^19.0.0-rc", + "zod": "^3.23.8" + }, + "peerDependenciesMeta": { + "react": { + "optional": true + } + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -7113,6 +7332,16 @@ "node": ">= 0.8" } }, + "node_modules/dequal": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/dequal/-/dequal-2.0.3.tgz", + "integrity": "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + }, "node_modules/destroy": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/destroy/-/destroy-1.2.0.tgz", @@ -7150,6 +7379,13 @@ "node": ">=0.3.1" } }, + "node_modules/diff-match-patch": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/diff-match-patch/-/diff-match-patch-1.0.5.tgz", + "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", + "dev": true, + "license": "Apache-2.0" + }, "node_modules/diff-sequences": { "version": "29.6.3", "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz", @@ -9025,6 +9261,13 @@ "foreach": "^2.0.4" } }, + "node_modules/json-schema": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz", + "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA==", + "dev": true, + 
"license": "(AFL-2.1 OR BSD-3-Clause)" + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", @@ -9038,6 +9281,37 @@ "integrity": "sha512-Bdboy+l7tA3OGW6FjyFHWkP5LuByj1Tk33Ljyq0axyzdk9//JSi2u3fP1QSmd1KNwq6VOKYGlAu87CisVir6Pw==", "license": "MIT" }, + "node_modules/jsondiffpatch": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/jsondiffpatch/-/jsondiffpatch-0.6.0.tgz", + "integrity": "sha512-3QItJOXp2AP1uv7waBkao5nCvhEv+QmJAd38Ybq7wNI74Q+BBmnLn4EDKz6yI9xGAIQoUF87qHt+kc1IVxB4zQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/diff-match-patch": "^1.0.36", + "chalk": "^5.3.0", + "diff-match-patch": "^1.0.5" + }, + "bin": { + "jsondiffpatch": "bin/jsondiffpatch.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + } + }, + "node_modules/jsondiffpatch/node_modules/chalk": { + "version": "5.4.1", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", + "integrity": "sha512-zgVZuo2WcZgfUEmsn6eO3kINexW8RAE4maiQ8QNs8CtpPCSyMiYsULR3HQYkm3w8FIA3SberyMJMSldGsW+U3w==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.17.0 || ^14.13 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/chalk/chalk?sponsor=1" + } + }, "node_modules/jsonpath-plus": { "version": "10.3.0", "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz", @@ -10177,6 +10451,29 @@ "node": "^10.13.0 || >=12.0.0" } }, + "node_modules/ollama-ai-provider": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/ollama-ai-provider/-/ollama-ai-provider-1.2.0.tgz", + "integrity": "sha512-jTNFruwe3O/ruJeppI/quoOUxG7NA6blG3ZyQj3lei4+NnJo7bi3eIRWqlVpRlu/mbzbFXeJSBuYQWF6pzGKww==", + "dev": true, + "license": "Apache-2.0", + "dependencies": { + "@ai-sdk/provider": "^1.0.0", + "@ai-sdk/provider-utils": "^2.0.0", + "partial-json": "0.1.7" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + 
"zod": "^3.0.0" + }, + "peerDependenciesMeta": { + "zod": { + "optional": true + } + } + }, "node_modules/on-finished": { "version": "2.4.1", "resolved": "https://registry.npmjs.org/on-finished/-/on-finished-2.4.1.tgz", @@ -10493,6 +10790,13 @@ "node": ">= 0.8" } }, + "node_modules/partial-json": { + "version": "0.1.7", + "resolved": "https://registry.npmjs.org/partial-json/-/partial-json-0.1.7.tgz", + "integrity": "sha512-Njv/59hHaokb/hRUjce3Hdv12wd60MtM9Z5Olmn+nehe0QDAsRtRbJPvJ0Z91TusF0SuZRIvnM+S4l6EIP8leA==", + "dev": true, + "license": "MIT" + }, "node_modules/path-browserify": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", @@ -10869,6 +11173,18 @@ "dev": true, "license": "MIT" }, + "node_modules/proper-lockfile": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/proper-lockfile/-/proper-lockfile-4.1.2.tgz", + "integrity": "sha512-TjNPblN4BwAWMXU8s9AEz4JmQxnD1NNL7bNOY/AKUzyamc379FWASUhc/K1pL2noVb+XmZKLL68cjzLsiOAMaA==", + "dev": true, + "license": "MIT", + "dependencies": { + "graceful-fs": "^4.2.4", + "retry": "^0.12.0", + "signal-exit": "^3.0.2" + } + }, "node_modules/protobufjs": { "version": "7.5.0", "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.0.tgz", @@ -11290,6 +11606,16 @@ "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" } }, + "node_modules/retry": { + "version": "0.12.0", + "resolved": "https://registry.npmjs.org/retry/-/retry-0.12.0.tgz", + "integrity": "sha512-9LkiTwjUh6rT555DtE9rTX+BKByPfrMzEAtnlEtdEwr3Nkffwiihqe2bWADg+OQRjt9gl6ICdmB/ZFDCGAtSow==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, "node_modules/reusify": { "version": "1.1.0", "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", @@ -11435,6 +11761,13 @@ "loose-envify": "^1.1.0" } }, + "node_modules/secure-json-parse": { + "version": "2.7.0", + "resolved": 
"https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", + "integrity": "sha512-6aU+Rwsezw7VR8/nyvKTx8QpWH9FrcYiXXlqC4z5d5XQBDRqtbfsRjnwGyqbi3gddNtWHuEk9OANUotL26qKUw==", + "dev": true, + "license": "BSD-3-Clause" + }, "node_modules/seek-bzip": { "version": "1.0.6", "resolved": "https://registry.npmjs.org/seek-bzip/-/seek-bzip-1.0.6.tgz", @@ -11869,6 +12202,22 @@ "simple-concat": "^1.0.0" } }, + "node_modules/simple-git": { + "version": "3.28.0", + "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-3.28.0.tgz", + "integrity": "sha512-Rs/vQRwsn1ILH1oBUy8NucJlXmnnLeLCfcvbSehkPzbv3wwoFWIdtfd6Ndo6ZPhlPsCZ60CPI4rxurnwAa+a2w==", + "dev": true, + "license": "MIT", + "dependencies": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.4.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/steveukx/git-js?sponsor=1" + } + }, "node_modules/simple-oauth2": { "version": "5.1.0", "resolved": "https://registry.npmjs.org/simple-oauth2/-/simple-oauth2-5.1.0.tgz", @@ -12345,6 +12694,20 @@ "node": ">= 6" } }, + "node_modules/swr": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/swr/-/swr-2.3.3.tgz", + "integrity": "sha512-dshNvs3ExOqtZ6kJBaAsabhPdHyeY4P2cKwRCniDVifBMoG/SVI7tfLWqPXriVspf2Rg4tPzXJTnwaihIeFw2A==", + "dev": true, + "license": "MIT", + "dependencies": { + "dequal": "^2.0.3", + "use-sync-external-store": "^1.4.0" + }, + "peerDependencies": { + "react": "^16.11.0 || ^17.0.0 || ^18.0.0 || ^19.0.0" + } + }, "node_modules/synckit": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", @@ -12588,6 +12951,19 @@ "node": ">=16 || 14 >=14.17" } }, + "node_modules/throttleit": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/throttleit/-/throttleit-2.1.0.tgz", + "integrity": "sha512-nt6AMGKW1p/70DF/hGBdJB57B8Tspmbp5gfJ8ilhLnt7kkr2ye7hzD6NVG8GGErk2HWF34igrL2CXmNIkzKqKw==", + "dev": true, + "license": "MIT", + 
"engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/through": { "version": "2.3.8", "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", @@ -13093,16 +13469,17 @@ } }, "node_modules/uuid": { - "version": "9.0.1", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", - "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-11.1.0.tgz", + "integrity": "sha512-0/A9rDy9P7cJ+8w1c9WD9V//9Wj15Ce2MPz8Ri6032usz+NfePxx5AcN3bN+r6ZL6jEo066/yNYB3tn4pQEx+A==", + "dev": true, "funding": [ "https://github.com/sponsors/broofa", "https://github.com/sponsors/ctavan" ], "license": "MIT", "bin": { - "uuid": "dist/bin/uuid" + "uuid": "dist/esm/bin/uuid" } }, "node_modules/v8-compile-cache-lib": { diff --git a/package.json b/package.json index 5973a804..91acf2b0 100644 --- a/package.json +++ b/package.json @@ -29,29 +29,40 @@ "check:types": "tsc --noEmit --project tsconfig.json", "reformat": "prettier --write .", "generate": "./scripts/generate.sh", - "test": "vitest --coverage" + "test": "vitest --project unit-and-integration --coverage", + "pretest:accuracy": "npm run build:compile", + "test:accuracy": "sh ./scripts/accuracy/runAccuracyTests.sh" }, "license": "Apache-2.0", "devDependencies": { + "@ai-sdk/azure": "^1.3.24", + "@ai-sdk/google": "^1.2.22", + "@ai-sdk/openai": "^1.3.23", "@eslint/js": "^9.30.1", "@modelcontextprotocol/inspector": "^0.16.0", "@redocly/cli": "^1.34.4", "@types/node": "^24.0.12", + "@types/proper-lockfile": "^4.1.4", "@types/simple-oauth2": "^5.0.7", "@types/yargs-parser": "^21.0.3", "@vitest/coverage-v8": "^3.2.4", + "ai": "^4.3.17", "eslint": "^9.30.1", "eslint-config-prettier": "^10.1.5", "eslint-plugin-prettier": "^5.5.1", "globals": "^16.3.0", "mongodb-runner": "^5.9.2", + "ollama-ai-provider": "^1.2.0", "openapi-types": 
"^12.1.3", "openapi-typescript": "^7.8.0", "prettier": "^3.6.2", + "proper-lockfile": "^4.1.2", + "simple-git": "^3.28.0", "tsx": "^4.20.3", "typescript": "^5.8.3", "typescript-eslint": "^8.36.0", "vitest": "^3.2.4", + "uuid": "^11.1.0", "yaml": "^2.8.0" }, "dependencies": { diff --git a/resources/test-summary-template.html b/resources/test-summary-template.html new file mode 100644 index 00000000..2fa0498f --- /dev/null +++ b/resources/test-summary-template.html @@ -0,0 +1,415 @@ + + + + + + MongoDB MCP Server - Accuracy Test Summary + + + +
+

📊 MongoDB MCP Server - Accuracy Test Summary

+
+

📊 Current Run Information

+
+
+
Commit SHA
+
{{commitSHA}}
+
+
+
Accuracy Run ID
+
{{accuracyRunId}}
+
+
+
Accuracy Run Status
+
{{accuracyRunStatus}}
+
+
+
Run Created On
+
{{createdOn}}
+
+
+
Report Generated On
+
{{reportGeneratedOn}}
+
+
+
+ +
+

📈 Test Results Summary

+
+
+
Total Prompts Evaluated
+
{{totalPrompts}}
+
+
+
Models Tested
+
{{totalModels}}
+
+
+
Responses with 0% Accuracy
+
{{responsesWithZeroAccuracy}}
+
+
+
Average Accuracy
+
{{averageAccuracy}}
+
+
+
+ +
+

🔄 Baseline Comparison

+
+
+
Baseline Commit SHA
+
{{baselineCommitSHA}}
+
+
+
Baseline Accuracy Run ID
+
{{baselineAccuracyRunId}}
+
+
+
Baseline Accuracy Run Status
+
{{baselineAccuracyRunStatus}}
+
+
+
Baseline Run Created On
+
{{baselineCreatedOn}}
+
+
+
Responses Improved vs Baseline
+
{{responsesImproved}}
+
+
+
Responses Regressed vs Baseline
+
{{responsesRegressed}}
+
+
+
+ + + + + + + + + + + + + + + {{tableRows}} + +
PromptModelExpected Tool CallsLLM Tool CallsAccuracyBaseline AccuracyLLM Response Time (ms)Total Tokens Used
+
+ + + diff --git a/scripts/accuracy/generateTestSummary.ts b/scripts/accuracy/generateTestSummary.ts new file mode 100644 index 00000000..6b9092f1 --- /dev/null +++ b/scripts/accuracy/generateTestSummary.ts @@ -0,0 +1,335 @@ +import path from "path"; +import { readFile, writeFile, mkdir } from "fs/promises"; +import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.js"; +import { + AccuracyResult, + AccuracyRunStatuses, + ExpectedToolCall, + LLMToolCall, + ModelResponse, +} from "../../tests/accuracy/sdk/accuracyResultStorage/resultStorage.js"; +import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js"; +import { + HTML_TEST_SUMMARY_FILE, + HTML_TESTS_SUMMARY_TEMPLATE, + MARKDOWN_TEST_BRIEF_FILE, +} from "../../tests/accuracy/sdk/constants.js"; + +type ComparableAccuracyResult = Omit & { + promptAndModelResponses: PromptAndModelResponse[]; +}; + +interface PromptAndModelResponse extends ModelResponse { + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + baselineToolAccuracy?: number; +} + +interface BaselineRunInfo { + commitSHA: string; + accuracyRunId: string; + accuracyRunStatus: AccuracyRunStatuses; + createdOn: string; +} + +function populateTemplate(template: string, data: Record): string { + return template.replace(/\{\{(\w+)\}\}/g, (_, key: string) => data[key] ?? 
""); +} + +function formatRunStatus(status: AccuracyRunStatuses) { + const statusClasses = ["chip", "run-status"]; + if (status === "done") { + statusClasses.push("perfect"); + } else if (status === "in-progress" || status === "failed") { + statusClasses.push("poor"); + } + return `${status}`; +} + +function formatAccuracy(accuracy: number): string { + return (accuracy * 100).toFixed(1) + "%"; +} + +function getAccuracyClass(accuracy: number): string { + if (accuracy === 1) return "chip perfect"; + if (accuracy >= 0.75) return "chip good"; + return "chip poor"; +} + +function formatToolCallsWithTooltip(toolCalls: ExpectedToolCall[] | LLMToolCall[]): string { + return toolCalls + .map((call) => { + const params = JSON.stringify(call.parameters, null, 2); + return `${call.toolName}`; + }) + .join(", "); +} + +function formatTokenUsage(tokensUsage: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +}): string { + const total = tokensUsage.totalTokens || "-"; + const prompt = tokensUsage.promptTokens || "-"; + const completion = tokensUsage.completionTokens || "-"; + + const tooltip = [`Prompt: ${prompt}`, `Completion: ${completion}`, `Total: ${total}`].join("\n"); + return `${total}`; +} + +function formatMessages(messages: Array>): string { + return messages.map((msg) => JSON.stringify(msg, null, 2)).join("\n\n"); +} + +function formatCurrentAccuracy(response: PromptAndModelResponse): string { + const currentAccuracyText = formatAccuracy(response.toolCallingAccuracy); + const comparisonClass = getAccuracyClass(response.toolCallingAccuracy); + let comparisonIcon = ""; + + if (typeof response.baselineToolAccuracy === "number") { + if (response.toolCallingAccuracy > response.baselineToolAccuracy) { + comparisonIcon = " ↗"; + } else if (response.toolCallingAccuracy < response.baselineToolAccuracy) { + comparisonIcon = " ā†˜"; + } else { + comparisonIcon = " →"; + } + } + + return `${currentAccuracyText}${comparisonIcon}`; +} + +function 
formatBaselineAccuracy(response: PromptAndModelResponse): string { + if (response.baselineToolAccuracy === null || response.baselineToolAccuracy === undefined) { + return 'N/A'; + } + return `${formatAccuracy(response.baselineToolAccuracy)}`; +} + +function getTestSummary(comparableResult: ComparableAccuracyResult) { + const responses = comparableResult.promptAndModelResponses; + return { + totalPrompts: new Set(responses.map((r) => r.prompt)).size, + totalModels: new Set(responses.map((r) => `${r.provider} ${r.requestedModel}`)).size, + responsesWithZeroAccuracy: responses.filter((r) => r.toolCallingAccuracy === 0), + responsesWith75Accuracy: responses.filter((r) => r.toolCallingAccuracy === 0.75), + responsesWith100Accuracy: responses.filter((r) => r.toolCallingAccuracy === 1), + averageAccuracy: + responses.length > 0 ? responses.reduce((sum, r) => sum + r.toolCallingAccuracy, 0) / responses.length : 0, + responsesImproved: responses.filter( + (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy > r.baselineToolAccuracy + ).length, + responsesRegressed: responses.filter( + (r) => typeof r.baselineToolAccuracy === "number" && r.toolCallingAccuracy < r.baselineToolAccuracy + ).length, + reportGeneratedOn: new Date().toLocaleString(), + resultCreatedOn: new Date(comparableResult.createdOn).toLocaleString(), + }; +} + +async function generateHtmlReport( + comparableResult: ComparableAccuracyResult, + testSummary: ReturnType, + baselineInfo: BaselineRunInfo | null +): Promise { + const responses = comparableResult.promptAndModelResponses; + const tableRows = responses + .map( + (response, index) => ` + + + ā–¶ + ${response.prompt} + + ${response.provider} - ${response.requestedModel} + ${formatToolCallsWithTooltip(response.expectedToolCalls)} + ${formatToolCallsWithTooltip(response.llmToolCalls)} + ${formatCurrentAccuracy(response)} + ${formatBaselineAccuracy(response)} + ${response.llmResponseTime.toFixed(2)} + 
${formatTokenUsage(response.tokensUsed || {})} + + + +
+
+

šŸ¤– LLM Response

+
${response.text || "N/A"}
+
+
+

šŸ’¬ Conversation Messages

+
${formatMessages(response.messages || [])}
+
+
+ + + ` + ) + .join(""); + + const template = await readFile(HTML_TESTS_SUMMARY_TEMPLATE, "utf8"); + return populateTemplate(template, { + commitSHA: comparableResult.commitSHA, + accuracyRunId: comparableResult.runId, + accuracyRunStatus: formatRunStatus(comparableResult.runStatus), + reportGeneratedOn: testSummary.reportGeneratedOn, + createdOn: testSummary.resultCreatedOn, + totalPrompts: String(testSummary.totalPrompts), + totalModels: String(testSummary.totalModels), + responsesWithZeroAccuracy: String(testSummary.responsesWithZeroAccuracy.length), + averageAccuracy: formatAccuracy(testSummary.averageAccuracy), + baselineCommitSHA: baselineInfo?.commitSHA || "-", + baselineAccuracyRunId: baselineInfo?.accuracyRunId || "-", + baselineAccuracyRunStatus: baselineInfo?.accuracyRunStatus + ? formatRunStatus(baselineInfo?.accuracyRunStatus) + : "-", + baselineCreatedOn: baselineInfo?.createdOn || "-", + responsesImproved: baselineInfo ? String(testSummary.responsesImproved) : "-", + responsesRegressed: baselineInfo ? 
String(testSummary.responsesRegressed) : "-", + tableRows, + }); +} + +/** + * Builds the short markdown brief for an accuracy run: a summary table, an + * optional baseline-comparison table (emitted only when baselineInfo is + * provided) and a pointer to the full HTML report. + * + * @param comparableResult - Current run result (commit SHA, run id, run status). + * @param testSummary - Aggregates computed by getTestSummary for the current run. + * @param baselineInfo - Baseline run details, or null when there is no baseline. + * @returns The markdown document as a single newline-joined string. + */ +function generateMarkdownBrief( + comparableResult: ComparableAccuracyResult, + testSummary: ReturnType<typeof getTestSummary>, + baselineInfo: BaselineRunInfo | null +): string { + const markdownTexts = [ + "# 📊 Accuracy Test Results", + "## 📈 Summary", + "| Metric | Value |", + "|--------|-------|", + `| **Commit SHA** | \`${comparableResult.commitSHA}\` |`, + `| **Run ID** | \`${comparableResult.runId}\` |`, + `| **Status** | ${comparableResult.runStatus} |`, + `| **Total Prompts Evaluated** | ${testSummary.totalPrompts} |`, + `| **Models Tested** | ${testSummary.totalModels} |`, + `| **Average Accuracy** | ${formatAccuracy(testSummary.averageAccuracy)} |`, + `| **Responses with 0% Accuracy** | ${testSummary.responsesWithZeroAccuracy.length} |`, + `| **Responses with 75% Accuracy** | ${testSummary.responsesWith75Accuracy.length} |`, + `| **Responses with 100% Accuracy** | ${testSummary.responsesWith100Accuracy.length} |`, + "", + ]; + + if (baselineInfo) { + markdownTexts.push( + ...[ + "## 📊 Baseline Comparison", + // A markdown table needs a header row before the delimiter row, + // otherwise the rows below render as plain text. + "| Metric | Value |", + "|--------|-------|", + `| **Baseline Commit** | \`${baselineInfo.commitSHA}\` |`, + `| **Baseline Run ID** | \`${baselineInfo.accuracyRunId}\` |`, + `| **Baseline Run Status** | \`${baselineInfo.accuracyRunStatus}\` |`, + `| **Responses Improved** | ${testSummary.responsesImproved} |`, + `| **Responses Regressed** | ${testSummary.responsesRegressed} |`, + "", + ] + ); + } + + // The link to the workflow run can only be built inside GitHub Actions, + // where these three variables are set. + const { GITHUB_SERVER_URL, GITHUB_REPOSITORY, GITHUB_RUN_ID } = process.env; + const githubRunUrl = + GITHUB_SERVER_URL && GITHUB_REPOSITORY && GITHUB_RUN_ID + ? `${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}` + : null; + + const reportLinkText = githubRunUrl + ? 
/**
 * Builds the HTML report and the markdown brief for the current accuracy run.
 *
 * Inputs are taken from the environment and from git:
 * - `MDB_ACCURACY_RUN_ID` selects the run to summarise (for the commit
 *   returned by `getCommitSHA()`).
 * - `MDB_ACCURACY_BASELINE_COMMIT`, when set, selects a baseline commit whose
 *   stored result is used for comparison.
 *
 * Outputs are written to `HTML_TEST_SUMMARY_FILE` and
 * `MARKDOWN_TEST_BRIEF_FILE`, and a short summary is printed to stdout.
 *
 * On failure the error is logged and the process exit code is set to 1. The
 * storage is always closed before the function settles.
 */
async function generateTestSummary(): Promise<void> {
    const storage = getAccuracyResultStorage();
    try {
        const baselineCommit = process.env.MDB_ACCURACY_BASELINE_COMMIT;
        const accuracyRunCommit = await getCommitSHA();
        const accuracyRunId = process.env.MDB_ACCURACY_RUN_ID;

        if (!accuracyRunCommit) {
            throw new Error("Cannot generate summary without accuracyRunCommit");
        }

        const accuracyRunResult = await storage.getAccuracyResult(accuracyRunCommit, accuracyRunId);
        if (!accuracyRunResult) {
            throw new Error(
                `No accuracy run result found for commitSHA - ${accuracyRunCommit}, runId - ${accuracyRunId}`
            );
        }

        // Without a run id the storage is asked for the result associated
        // with the baseline commit alone; a missing baseline simply disables
        // the comparison instead of failing the summary.
        const baselineAccuracyRunResult = baselineCommit ? await storage.getAccuracyResult(baselineCommit) : null;
        const baselineInfo: BaselineRunInfo | null =
            baselineCommit && baselineAccuracyRunResult
                ? {
                      commitSHA: baselineCommit,
                      accuracyRunId: baselineAccuracyRunResult.runId,
                      accuracyRunStatus: baselineAccuracyRunResult.runStatus,
                      createdOn: new Date(baselineAccuracyRunResult.createdOn).toLocaleString(),
                  }
                : null;

        // Flatten "prompt -> model responses" into one entry per
        // (prompt, model response) pair, attaching the baseline accuracy of
        // the same prompt/provider/model combination when one exists.
        const comparableAccuracyResult: ComparableAccuracyResult = {
            ...accuracyRunResult,
            promptAndModelResponses: accuracyRunResult.promptResults.flatMap((currentPromptResult) => {
                const baselinePromptResult = baselineAccuracyRunResult?.promptResults.find(
                    (baselineResult) => baselineResult.prompt === currentPromptResult.prompt
                );

                return currentPromptResult.modelResponses.map((currentModelResponse) => {
                    const baselineModelResponse = baselinePromptResult?.modelResponses.find(
                        (candidate) =>
                            candidate.provider === currentModelResponse.provider &&
                            candidate.requestedModel === currentModelResponse.requestedModel
                    );
                    return {
                        ...currentModelResponse,
                        prompt: currentPromptResult.prompt,
                        expectedToolCalls: currentPromptResult.expectedToolCalls,
                        baselineToolAccuracy: baselineModelResponse?.toolCallingAccuracy,
                    };
                });
            }),
        };

        // Ensure that the directory we write the reports into actually exists.
        // NOTE(review): only the HTML file's directory is created; this
        // assumes the markdown brief lives in the same directory - confirm.
        await mkdir(path.dirname(HTML_TEST_SUMMARY_FILE), { recursive: true });

        console.log(`\n📊 Generating test summary for accuracy run: ${accuracyRunId}\n`);
        const testSummary = getTestSummary(comparableAccuracyResult);

        const htmlReport = await generateHtmlReport(comparableAccuracyResult, testSummary, baselineInfo);
        await writeFile(HTML_TEST_SUMMARY_FILE, htmlReport, "utf8");
        console.log(`✅ HTML report generated: ${HTML_TEST_SUMMARY_FILE}`);

        const markdownBrief = generateMarkdownBrief(comparableAccuracyResult, testSummary, baselineInfo);
        await writeFile(MARKDOWN_TEST_BRIEF_FILE, markdownBrief, "utf8");
        console.log(`✅ Markdown brief generated: ${MARKDOWN_TEST_BRIEF_FILE}`);

        console.log(`\n📈 Summary:`);
        console.log(` Total prompts evaluated: ${testSummary.totalPrompts}`);
        console.log(` Models tested: ${testSummary.totalModels}`);
        console.log(` Responses with 0% accuracy: ${testSummary.responsesWithZeroAccuracy.length}`);

        if (baselineCommit) {
            console.log(` Baseline commit: ${baselineCommit}`);
            console.log(` Responses improved vs baseline: ${testSummary.responsesImproved}`);
            console.log(` Responses regressed vs baseline: ${testSummary.responsesRegressed}`);
        }
    } catch (error) {
        console.error("Error generating test summary:", error);
        // Do not call process.exit() here: it terminates the process
        // synchronously, which would skip the `finally` block below and leave
        // the storage unclosed. Setting the exit code lets cleanup run first.
        process.exitCode = 1;
    } finally {
        await storage.close();
    }
}

void generateTestSummary();
MDB_ACCURACY_MDB_URL="" +# export MDB_ACCURACY_MDB_DB="" +# export MDB_ACCURACY_MDB_COLLECTION="" + +# By default we run all the tests under tests/accuracy folder unless a path is +# specified in the command line. Such as: +# npm run test:accuracy -- tests/accuracy/some-test.test.ts +echo "Running accuracy tests with MDB_ACCURACY_RUN_ID '$MDB_ACCURACY_RUN_ID'" +vitest --config vitest.config.ts --project=accuracy --coverage=false --run "$@" + +# Preserving the exit code from test run to correctly notify in the CI +# environments when the tests fail. +TEST_EXIT_CODE=$? + +# Each test run submits an accuracy result with the accuracyRunStatus: +# "in-progress". When all the tests are done and vitest exits with an exit code of +# 0, we can safely mark accuracy run as finished otherwise failed. + +# This "outside-the-test-status-update" is arising out of the fact that each +# test suite stores their own accuracy run data in the storage and this setup +# might lead to data inconsistency when the tests fail. To overcome that each +# accuracy result entry has a status which by default is "in-progress" and is +# updated when the tests either pass (all our accuracy tests are supposed to +# pass unless some error occurs during the test runs), or fail. + +# This is necessary when comparing one accuracy run with another as we wouldn't +# want to compare against an incomplete run. +export MDB_ACCURACY_RUN_STATUS=$([ $TEST_EXIT_CODE -eq 0 ] && echo "done" || echo "failed") +npx tsx scripts/accuracy/updateAccuracyRunStatus.ts || echo "Warning: Failed to update accuracy run status to '$MDB_ACCURACY_RUN_STATUS'" + +# This is optional but we do it anyway to generate a readable summary of report.
import { getAccuracyResultStorage } from "../../tests/accuracy/sdk/accuracyResultStorage/getAccuracyResultStorage.js";
import { AccuracyRunStatus } from "../../tests/accuracy/sdk/accuracyResultStorage/resultStorage.js";
import { getCommitSHA } from "../../tests/accuracy/sdk/gitInfo.js";

// Marks the accuracy run identified by MDB_ACCURACY_RUN_ID (for the commit
// returned by getCommitSHA()) with the final status given in
// MDB_ACCURACY_RUN_STATUS. Only the "done" and "failed" statuses are accepted.
const envAccuracyRunId = process.env.MDB_ACCURACY_RUN_ID;
const envAccuracyRunStatus = process.env.MDB_ACCURACY_RUN_STATUS;
const commitSHA = await getCommitSHA();

if (
    !envAccuracyRunId ||
    !commitSHA ||
    (envAccuracyRunStatus !== AccuracyRunStatus.Done && envAccuracyRunStatus !== AccuracyRunStatus.Failed)
) {
    // Say why we are bailing out; a bare non-zero exit is hard to diagnose
    // from CI logs.
    console.error(
        "Cannot update accuracy run status: a run id, a commit SHA and a final run status are all required.",
        { runId: envAccuracyRunId, commitSHA, runStatus: envAccuracyRunStatus }
    );
    process.exit(1);
}

const timerLabel = `Marked accuracy run id - ${envAccuracyRunId} as ${envAccuracyRunStatus} in`;
console.time(timerLabel);
const storage = getAccuracyResultStorage();
try {
    await storage.updateRunStatus(commitSHA, envAccuracyRunId, envAccuracyRunStatus);
} finally {
    // Release the storage even when the update fails, so a failed update
    // cannot leave it open.
    await storage.close();
}
console.timeEnd(timerLabel);
a/src/tools/mongodb/create/insertMany.ts b/src/tools/mongodb/create/insertMany.ts index 4744e344..25ecba17 100644 --- a/src/tools/mongodb/create/insertMany.ts +++ b/src/tools/mongodb/create/insertMany.ts @@ -9,7 +9,7 @@ export class InsertManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, documents: z - .array(z.record(z.string(), z.unknown()).describe("An individual MongoDB document")) + .array(z.object({}).passthrough().describe("An individual MongoDB document")) .describe( "The array of documents to insert, matching the syntax of the document argument of db.collection.insertMany()" ), diff --git a/src/tools/mongodb/delete/deleteMany.ts b/src/tools/mongodb/delete/deleteMany.ts index aa135512..8440a25c 100644 --- a/src/tools/mongodb/delete/deleteMany.ts +++ b/src/tools/mongodb/delete/deleteMany.ts @@ -10,7 +10,8 @@ export class DeleteManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "The query filter, specifying the deletion criteria. 
Matches the syntax of the filter argument of db.collection.deleteMany()" diff --git a/src/tools/mongodb/metadata/explain.ts b/src/tools/mongodb/metadata/explain.ts index a686d9cc..ae9eb822 100644 --- a/src/tools/mongodb/metadata/explain.ts +++ b/src/tools/mongodb/metadata/explain.ts @@ -16,7 +16,7 @@ export class ExplainTool extends MongoDBToolBase { ...DbOperationArgs, method: z .array( - z.union([ + z.discriminatedUnion("name", [ z.object({ name: z.literal("aggregate"), arguments: z.object(AggregateArgs), diff --git a/src/tools/mongodb/read/aggregate.ts b/src/tools/mongodb/read/aggregate.ts index f9868dba..b74dd786 100644 --- a/src/tools/mongodb/read/aggregate.ts +++ b/src/tools/mongodb/read/aggregate.ts @@ -6,7 +6,7 @@ import { EJSON } from "bson"; import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const AggregateArgs = { - pipeline: z.array(z.record(z.string(), z.unknown())).describe("An array of aggregation stages to execute"), + pipeline: z.array(z.object({}).passthrough()).describe("An array of aggregation stages to execute"), }; export class AggregateTool extends MongoDBToolBase { diff --git a/src/tools/mongodb/read/count.ts b/src/tools/mongodb/read/count.ts index df3664b5..5f5f44c0 100644 --- a/src/tools/mongodb/read/count.ts +++ b/src/tools/mongodb/read/count.ts @@ -6,7 +6,8 @@ import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const CountArgs = { query: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "A filter/query parameter. Allows users to filter the documents to count. Matches the syntax of the filter argument of db.collection.count()." 
diff --git a/src/tools/mongodb/read/find.ts b/src/tools/mongodb/read/find.ts index 02c337ed..0649e62d 100644 --- a/src/tools/mongodb/read/find.ts +++ b/src/tools/mongodb/read/find.ts @@ -8,18 +8,23 @@ import { checkIndexUsage } from "../../../helpers/indexCheck.js"; export const FindArgs = { filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe("The query filter, matching the syntax of the query argument of db.collection.find()"), projection: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe("The projection, matching the syntax of the projection argument of db.collection.find()"), limit: z.number().optional().default(10).describe("The maximum number of documents to return"), sort: z - .record(z.string(), z.custom()) + .object({}) + .catchall(z.custom()) .optional() - .describe("A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()"), + .describe( + "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort(). The keys of the object are the fields to sort on, while the values are the sort directions (1 for ascending, -1 for descending)." 
+ ), }; export class FindTool extends MongoDBToolBase { diff --git a/src/tools/mongodb/update/updateMany.ts b/src/tools/mongodb/update/updateMany.ts index b31a843e..49dd2099 100644 --- a/src/tools/mongodb/update/updateMany.ts +++ b/src/tools/mongodb/update/updateMany.ts @@ -10,13 +10,15 @@ export class UpdateManyTool extends MongoDBToolBase { protected argsShape = { ...DbOperationArgs, filter: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .optional() .describe( "The selection criteria for the update, matching the syntax of the filter argument of db.collection.updateOne()" ), update: z - .record(z.string(), z.unknown()) + .object({}) + .passthrough() .describe("An update document describing the modifications to apply using update operator expressions"), upsert: z .boolean() diff --git a/tests/accuracy/aggregate.test.ts b/tests/accuracy/aggregate.test.ts new file mode 100644 index 00000000..08b1ca61 --- /dev/null +++ b/tests/accuracy/aggregate.test.ts @@ -0,0 +1,27 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Group all the movies in 'mflix.movies' namespace by 'release_year' and give me a count of them", + expectedToolCalls: [ + { + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [ + { $group: { _id: "$release_year", count: { $sum: 1 } } }, + // For the sake of accuracy, we allow any sort order + Matcher.anyOf( + Matcher.undefined, + Matcher.value({ + $sort: Matcher.anyValue, + }) + ), + ], + }, + }, + ], + }, +]); diff --git a/tests/accuracy/collectionIndexes.test.ts b/tests/accuracy/collectionIndexes.test.ts new file mode 100644 index 00000000..5db4de1e --- /dev/null +++ b/tests/accuracy/collectionIndexes.test.ts @@ -0,0 +1,40 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "How many indexes do I have in 
'mflix.movies' namespace?", + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: "List all the indexes in movies collection in mflix database", + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: `Is the following query: ${JSON.stringify({ runtime: { $lt: 100 } })} on the namespace 'mflix.movies' indexed?`, + expectedToolCalls: [ + { + toolName: "collection-indexes", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/collectionSchema.test.ts b/tests/accuracy/collectionSchema.test.ts new file mode 100644 index 00000000..8c9039bd --- /dev/null +++ b/tests/accuracy/collectionSchema.test.ts @@ -0,0 +1,28 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "Is there a title field in 'mflix.movies' namespace?", + expectedToolCalls: [ + { + toolName: "collection-schema", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: "What is the type of value stored in title field in movies collection in mflix database?", + expectedToolCalls: [ + { + toolName: "collection-schema", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/collectionStorageSize.test.ts b/tests/accuracy/collectionStorageSize.test.ts new file mode 100644 index 00000000..8180341e --- /dev/null +++ b/tests/accuracy/collectionStorageSize.test.ts @@ -0,0 +1,41 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "What is the size of 'mflix.movies' namespace", + expectedToolCalls: [ + { + toolName: "collection-storage-size", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: 
"How much size is each collection in comics database", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "books", + }, + }, + { + toolName: "collection-storage-size", + parameters: { + database: "comics", + collection: "characters", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/count.test.ts b/tests/accuracy/count.test.ts new file mode 100644 index 00000000..7716aa65 --- /dev/null +++ b/tests/accuracy/count.test.ts @@ -0,0 +1,44 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Count number of documents in 'mflix.movies' namespace.", + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database: "mflix", + collection: "movies", + query: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "How many documents are there in 'characters' collection in 'comics' database?", + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database: "comics", + collection: "characters", + query: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "Count all the documents in 'mflix.movies' namespace with runtime less than 100?", + expectedToolCalls: [ + { + toolName: "count", + parameters: { + database: "mflix", + collection: "movies", + query: { runtime: { $lt: 100 } }, + }, + }, + ], + }, +]); diff --git a/tests/accuracy/createCollection.test.ts b/tests/accuracy/createCollection.test.ts new file mode 100644 index 00000000..75c32e01 --- /dev/null +++ b/tests/accuracy/createCollection.test.ts @@ -0,0 +1,46 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "Create a new namespace 'mflix.documentaries'", + expectedToolCalls: [ + { + toolName: "create-collection", + parameters: { + database: "mflix", + 
collection: "documentaries", + }, + }, + ], + }, + { + prompt: "Create a new collection villains in comics database", + expectedToolCalls: [ + { + toolName: "create-collection", + parameters: { + database: "comics", + collection: "villains", + }, + }, + ], + }, + { + prompt: "If and only if, the namespace 'mflix.documentaries' does not exist, then create it", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "create-collection", + parameters: { + database: "mflix", + collection: "documentaries", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/createIndex.test.ts b/tests/accuracy/createIndex.test.ts new file mode 100644 index 00000000..08326ce3 --- /dev/null +++ b/tests/accuracy/createIndex.test.ts @@ -0,0 +1,37 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Create an index that covers the following query on 'mflix.movies' namespace - { \"release_year\": 1992 }", + expectedToolCalls: [ + { + toolName: "create-index", + parameters: { + database: "mflix", + collection: "movies", + name: Matcher.anyOf(Matcher.undefined, Matcher.string()), + keys: { + release_year: 1, + }, + }, + }, + ], + }, + { + prompt: "Create a text index on title field in 'mflix.movies' namespace", + expectedToolCalls: [ + { + toolName: "create-index", + parameters: { + database: "mflix", + collection: "movies", + name: Matcher.anyOf(Matcher.undefined, Matcher.string()), + keys: { + title: "text", + }, + }, + }, + ], + }, +]); diff --git a/tests/accuracy/dbStats.test.ts b/tests/accuracy/dbStats.test.ts new file mode 100644 index 00000000..f32d3495 --- /dev/null +++ b/tests/accuracy/dbStats.test.ts @@ -0,0 +1,15 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "What is the size occupied by database mflix?", + expectedToolCalls: [ + { + 
toolName: "db-stats", + parameters: { + database: "mflix", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/deleteMany.test.ts b/tests/accuracy/deleteMany.test.ts new file mode 100644 index 00000000..a5ab1f09 --- /dev/null +++ b/tests/accuracy/deleteMany.test.ts @@ -0,0 +1,44 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Delete all the documents from 'mflix.movies' namespace", + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "Purge the collection 'movies' in database 'mflix'", + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "Remove all the documents from namespace 'mflix.movies' where runtime is less than 100", + expectedToolCalls: [ + { + toolName: "delete-many", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, + }, + ], + }, +]); diff --git a/tests/accuracy/dropCollection.test.ts b/tests/accuracy/dropCollection.test.ts new file mode 100644 index 00000000..77fe06b8 --- /dev/null +++ b/tests/accuracy/dropCollection.test.ts @@ -0,0 +1,74 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "Remove mflix.movies namespace from my cluster.", + expectedToolCalls: [ + { + toolName: "drop-collection", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: "Drop movies collection from mflix database.", + expectedToolCalls: [ + { + toolName: "drop-collection", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ], + }, + { + prompt: "Remove books collection from which ever database contains it.", + 
expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { + database: "admin", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "comics", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "config", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "local", + }, + }, + { + toolName: "list-collections", + parameters: { + database: "mflix", + }, + }, + { + toolName: "drop-collection", + parameters: { + database: "comics", + collection: "books", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/dropDatabase.test.ts b/tests/accuracy/dropDatabase.test.ts new file mode 100644 index 00000000..3010e83a --- /dev/null +++ b/tests/accuracy/dropDatabase.test.ts @@ -0,0 +1,41 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "Remove mflix database from my cluster.", + expectedToolCalls: [ + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }, + { + prompt: "Drop database named mflix.", + expectedToolCalls: [ + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }, + { + prompt: "If there is a mflix database in my cluster then drop it.", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "drop-database", + parameters: { + database: "mflix", + }, + }, + ], + }, +]); diff --git a/tests/accuracy/explain.test.ts b/tests/accuracy/explain.test.ts new file mode 100644 index 00000000..cb9ac0c1 --- /dev/null +++ b/tests/accuracy/explain.test.ts @@ -0,0 +1,73 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +/** + * None of these tests score a parameter match on any of the models, likely + * because we are using Zod.union, when we probably should've used + * Zod.discriminatedUnion + */ +describeAccuracyTests([ + { + prompt: `Will fetching 
documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "find", + arguments: { + filter: { release_year: 2020 }, + }, + }, + ], + }, + }, + ], + }, + { + prompt: `Will aggregating documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "aggregate", + arguments: { + pipeline: [ + { + $match: { release_year: 2020 }, + }, + ], + }, + }, + ], + }, + }, + ], + }, + { + prompt: `Will counting documents, where release_year is 2020, from 'mflix.movies' namespace perform a collection scan?`, + expectedToolCalls: [ + { + toolName: "explain", + parameters: { + database: "mflix", + collection: "movies", + method: [ + { + name: "count", + arguments: { + query: { release_year: 2020 }, + }, + }, + ], + }, + }, + ], + }, +]); diff --git a/tests/accuracy/find.test.ts b/tests/accuracy/find.test.ts new file mode 100644 index 00000000..f291c46b --- /dev/null +++ b/tests/accuracy/find.test.ts @@ -0,0 +1,114 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "List all the movies in 'mflix.movies' namespace.", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "List all the documents in 'comics.books' namespace.", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "comics", + collection: "books", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "Find all the movies in 'mflix.movies' namespace with runtime less than 100.", + expectedToolCalls: [ + { + 
toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { + runtime: { $lt: 100 }, + }, + }, + }, + ], + }, + { + prompt: "Find all movies in 'mflix.movies' collection where director is 'Christina Collins'", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { + director: "Christina Collins", + }, + }, + }, + ], + }, + { + prompt: "Give me all the movie titles available in 'mflix.movies' namespace", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + projection: { + title: 1, + _id: Matcher.anyOf( + Matcher.undefined, + Matcher.number((value) => value === 0) + ), + }, + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ], + }, + { + prompt: "Use 'mflix.movies' namespace to answer who were casted in the movie 'Certain Fish'", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { title: "Certain Fish" }, + projection: { + cast: 1, + _id: Matcher.anyOf(Matcher.undefined, Matcher.number()), + }, + limit: Matcher.number((value) => value > 0), + }, + }, + ], + }, + { + prompt: "From the mflix.movies namespace, give me first 2 movies of Horror genre sorted ascending by their runtime", + expectedToolCalls: [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { genres: "Horror" }, + sort: { runtime: 1 }, + limit: 2, + }, + }, + ], + }, +]); diff --git a/tests/accuracy/insertMany.test.ts b/tests/accuracy/insertMany.test.ts new file mode 100644 index 00000000..159072bb --- /dev/null +++ b/tests/accuracy/insertMany.test.ts @@ -0,0 +1,48 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: [ + "In my namespace 'mflix.movies', insert 3 documents each with the following fields:", + "- id: an incremental 
number starting from 1", + "- name: a string of format 'name'", + ].join("\n"), + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [ + { + id: 1, + name: "name1", + }, + { + id: 2, + name: "name2", + }, + { + id: 3, + name: "name3", + }, + ], + }, + }, + ], + }, + { + prompt: "Add three empty documents in collection 'movies' inside database 'mflix'", + expectedToolCalls: [ + { + toolName: "insert-many", + parameters: { + database: "mflix", + collection: "movies", + documents: [{ _id: Matcher.anyValue }, { _id: Matcher.anyValue }, { _id: Matcher.anyValue }], + }, + }, + ], + }, +]); diff --git a/tests/accuracy/listCollections.test.ts b/tests/accuracy/listCollections.test.ts new file mode 100644 index 00000000..f3361d80 --- /dev/null +++ b/tests/accuracy/listCollections.test.ts @@ -0,0 +1,60 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "How many collections do I have in database mflix?", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, + { + prompt: "List all the collections in my MongoDB database mflix.", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, + { + prompt: "Is there a shows collection in my MongoDB database mflix?", + expectedToolCalls: [ + { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, + { + prompt: "List all the collections that I have in total on my cluster?", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + { + toolName: "list-collections", + parameters: { database: "admin" }, + }, + { + toolName: "list-collections", + parameters: { database: "comics" }, + }, + { + toolName: "list-collections", + parameters: { database: "config" }, + }, + { + toolName: "list-collections", + parameters: { database: "local" }, + }, 
+ { + toolName: "list-collections", + parameters: { database: "mflix" }, + }, + ], + }, +]); diff --git a/tests/accuracy/listDatabases.test.ts b/tests/accuracy/listDatabases.test.ts new file mode 100644 index 00000000..4681fd7c --- /dev/null +++ b/tests/accuracy/listDatabases.test.ts @@ -0,0 +1,31 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "How many databases do I have?", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + ], + }, + { + prompt: "List all the databases that I have in my clusters", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + ], + }, + { + prompt: "Is there a mflix database in my cluster?", + expectedToolCalls: [ + { + toolName: "list-databases", + parameters: {}, + }, + ], + }, +]); diff --git a/tests/accuracy/logs.test.ts b/tests/accuracy/logs.test.ts new file mode 100644 index 00000000..83c9179b --- /dev/null +++ b/tests/accuracy/logs.test.ts @@ -0,0 +1,28 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Were there any startup warnings for my MongoDB server?", + expectedToolCalls: [ + { + toolName: "mongodb-logs", + parameters: { + type: "startupWarnings", + }, + }, + ], + }, + { + prompt: "Retrieve first 10 logs for my MongoDB server?", + expectedToolCalls: [ + { + toolName: "mongodb-logs", + parameters: { + type: Matcher.anyOf(Matcher.undefined, Matcher.value("global")), + limit: 10, + }, + }, + ], + }, +]); diff --git a/tests/accuracy/renameCollection.test.ts b/tests/accuracy/renameCollection.test.ts new file mode 100644 index 00000000..9b2c9dac --- /dev/null +++ b/tests/accuracy/renameCollection.test.ts @@ -0,0 +1,31 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; + +describeAccuracyTests([ + { + prompt: "Rename my 'mflix.movies' namespace to 'mflix.new_movies'", + 
/**
 * Stores accuracy results as JSON files on disk, one file per
 * (commit, run id) pair under `ACCURACY_RESULTS_DIR/<commitSHA>/<runId>.json`.
 *
 * Reads and writes are serialised through `proper-lockfile`. For every commit
 * an extra `<LATEST_ACCURACY_RUN_NAME>.json` hard link points at the last run
 * that was marked as done, which is what baseline comparisons read.
 */
export class DiskBasedResultStorage implements AccuracyResultStorage {
    /**
     * Returns the stored result for a commit.
     *
     * @param commitSHA - Commit the accuracy run was executed for.
     * @param runId - Specific run to load. When omitted, the latest run that
     * was marked as done for the commit is loaded instead.
     * @returns The stored result, or `null` when no such result exists.
     */
    async getAccuracyResult(commitSHA: string, runId?: string): Promise<AccuracyResult | null> {
        const filePath = runId
            ? // Commit and run id together identify one specific result file.
              this.getAccuracyResultFilePath(commitSHA, runId)
            : // With only a commit we read the link to its latest successful run.
              this.getLatestResultFilePath(commitSHA);

        try {
            return await this.withFileLock(filePath, () => this.getAccuracyResultWithoutLock(filePath));
        } catch (error) {
            // Locking a file that does not exist rejects with ENOENT before
            // the read is even attempted. A missing file just means "no
            // result", which callers expect to be reported as null.
            if (this.hasErrorCode(error, "ENOENT")) {
                return null;
            }
            throw error;
        }
    }

    /**
     * Persists a new status on an existing run and, when the run is done,
     * makes it the latest run of its commit.
     *
     * @throws When the run's result file cannot be found or locked.
     */
    async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise<void> {
        const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId);
        await this.withFileLock(resultFilePath, async () => {
            const accuracyResult = await this.getAccuracyResultWithoutLock(resultFilePath);
            if (!accuracyResult) {
                throw new Error("Results not found!");
            }

            await fs.writeFile(resultFilePath, JSON.stringify({ ...accuracyResult, runStatus: status }, null, 2), {
                encoding: "utf8",
            });
        });

        // This bit is important to mark the current run as the latest run for a
        // commit so that we can use that during baseline comparison.
        if (status === AccuracyRunStatus.Done) {
            const latestResultFilePath = this.getLatestResultFilePath(commitSHA);
            // The lock can only be taken on an existing file, so make sure a
            // placeholder is there before swapping in the hard link.
            await this.ensureFileWithInitialData(latestResultFilePath, JSON.stringify({}));
            await this.withFileLock(latestResultFilePath, async () => {
                await fs.unlink(latestResultFilePath);
                await fs.link(resultFilePath, latestResultFilePath);
            });
        }
    }

    /**
     * Appends a model response for a prompt to the run's result file,
     * creating the file (with status "in-progress") on first use.
     */
    async saveModelResponseForPrompt({
        commitSHA,
        runId,
        prompt,
        expectedToolCalls,
        modelResponse,
    }: {
        commitSHA: string;
        runId: string;
        prompt: string;
        expectedToolCalls: ExpectedToolCall[];
        modelResponse: ModelResponse;
    }): Promise<void> {
        const initialData: AccuracyResult = {
            runId,
            runStatus: AccuracyRunStatus.InProgress,
            createdOn: Date.now(),
            commitSHA,
            promptResults: [
                {
                    prompt,
                    expectedToolCalls,
                    modelResponses: [modelResponse],
                },
            ],
        };
        const resultFilePath = this.getAccuracyResultFilePath(commitSHA, runId);
        const { fileCreatedWithInitialData } = await this.ensureFileWithInitialData(
            resultFilePath,
            JSON.stringify(initialData, null, 2)
        );

        // The exclusive create already stored this response.
        if (fileCreatedWithInitialData) {
            return;
        }

        await this.withFileLock(resultFilePath, async () => {
            let accuracyResult = await this.getAccuracyResultWithoutLock(resultFilePath);
            if (!accuracyResult) {
                throw new Error("Expected at-least initial accuracy result to be present");
            }

            const existingPromptIdx = accuracyResult.promptResults.findIndex((result) => result.prompt === prompt);
            const promptResult = accuracyResult.promptResults[existingPromptIdx];
            if (promptResult) {
                // Known prompt: add this response next to the earlier ones.
                accuracyResult.promptResults.splice(existingPromptIdx, 1, {
                    prompt: promptResult.prompt,
                    expectedToolCalls: promptResult.expectedToolCalls,
                    modelResponses: [...promptResult.modelResponses, modelResponse],
                });
            } else {
                // First response for this prompt in this run.
                accuracyResult = {
                    ...accuracyResult,
                    promptResults: [
                        ...accuracyResult.promptResults,
                        {
                            prompt,
                            expectedToolCalls,
                            modelResponses: [modelResponse],
                        },
                    ],
                };
            }

            await fs.writeFile(resultFilePath, JSON.stringify(accuracyResult, null, 2));
        });
    }

    /** Nothing to release for disk storage; present to satisfy the interface. */
    close(): Promise<void> {
        return Promise.resolve();
    }

    /**
     * Reads and parses a result file without taking the lock.
     *
     * @returns The parsed content, or `null` when the file does not exist.
     */
    private async getAccuracyResultWithoutLock(filePath: string): Promise<AccuracyResult | null> {
        try {
            const raw = await fs.readFile(filePath, "utf8");
            // NOTE(review): the parsed JSON is trusted to have the
            // AccuracyResult shape; it is not validated here.
            return JSON.parse(raw) as AccuracyResult;
        } catch (error) {
            if (this.hasErrorCode(error, "ENOENT")) {
                return null;
            }
            throw error;
        }
    }

    /**
     * Creates `filePath` (and its parent directories) with `initialData`
     * unless the file already exists.
     *
     * @returns Whether this call created the file.
     */
    private async ensureFileWithInitialData(
        filePath: string,
        initialData: string
    ): Promise<{
        fileCreatedWithInitialData: boolean;
    }> {
        try {
            await fs.mkdir(path.dirname(filePath), { recursive: true });
            // "wx" fails with EEXIST instead of overwriting an existing file.
            await fs.writeFile(filePath, initialData, { flag: "wx" });
            return {
                fileCreatedWithInitialData: true,
            };
        } catch (error) {
            if (this.hasErrorCode(error, "EEXIST")) {
                return {
                    fileCreatedWithInitialData: false,
                };
            }
            throw error;
        }
    }

    /**
     * Runs `callback` while holding the lock on `filePath` and releases the
     * lock afterwards, whether the callback succeeds or throws.
     *
     * @throws Whatever lock acquisition or the callback throws. A missing
     * file surfaces as an `ENOENT` error from lock acquisition.
     */
    private async withFileLock<R>(filePath: string, callback: () => Promise<R>): Promise<R> {
        let releaseLock: () => Promise<void>;
        try {
            releaseLock = await lock(filePath, { retries: 10 });
        } catch (error) {
            // A missing file is an expected condition that callers handle
            // themselves; only real locking problems deserve a warning.
            if (!this.hasErrorCode(error, "ENOENT")) {
                console.warn(`Could not acquire lock for file - ${filePath}.`, error);
            }
            throw error;
        }

        try {
            return await callback();
        } finally {
            await releaseLock();
        }
    }

    /** Tells whether `error` is a Node.js system error carrying `code`. */
    private hasErrorCode(error: unknown, code: string): boolean {
        return (error as NodeJS.ErrnoException | null | undefined)?.code === code;
    }

    private getAccuracyResultFilePath(commitSHA: string, runId: string): string {
        return path.join(ACCURACY_RESULTS_DIR, commitSHA, `${runId}.json`);
    }

    private getLatestResultFilePath(commitSHA: string): string {
        return path.join(ACCURACY_RESULTS_DIR, commitSHA, `${LATEST_ACCURACY_RUN_NAME}.json`);
    }
}
Since +// so far, the responses are not too big, we do not omit any fields, but if we decide to do so in the future, +// we could add `"messages"` and `"text"` to this list. +const OMITTED_MODEL_RESPONSE_FIELDS: (keyof ModelResponse)[] = []; + +export class MongoDBBasedResultStorage implements AccuracyResultStorage { + private client: MongoClient; + private resultCollection: Collection; + + constructor(connectionString: string, database: string, collection: string) { + this.client = new MongoClient(connectionString); + this.resultCollection = this.client.db(database).collection(collection); + } + + async getAccuracyResult(commitSHA: string, runId?: string): Promise { + const filters: Partial = runId + ? { commitSHA, runId } + : // Note that we use the `Done` status filter only when asked for + // a commit. That is because the one use case of asking for a run + // for commit is when you want the last successful run of that + // particular commit. + { commitSHA, runStatus: AccuracyRunStatus.Done }; + + return await this.resultCollection.findOne(filters, { + sort: { + createdOn: -1, + }, + }); + } + + async updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise { + await this.resultCollection.updateOne( + { commitSHA, runId }, + { + $set: { + runStatus: status, + }, + } + ); + } + + async saveModelResponseForPrompt({ + commitSHA, + runId, + prompt, + expectedToolCalls, + modelResponse, + }: { + commitSHA: string; + runId: string; + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise { + const modelResponseToSave: ModelResponse = { + ...modelResponse, + }; + + for (const field of OMITTED_MODEL_RESPONSE_FIELDS) { + delete modelResponseToSave[field]; + } + + await this.resultCollection.updateOne( + { commitSHA, runId }, + [ + { + $set: { + runStatus: { $ifNull: ["$runStatus", AccuracyRunStatus.InProgress] }, + createdOn: { $ifNull: ["$createdOn", Date.now()] }, + commitSHA: { $ifNull: 
["$commitSHA", commitSHA] }, + runId: { $ifNull: ["$runId", runId] }, + promptResults: { + $ifNull: ["$promptResults", []], + }, + }, + }, + { + $set: { + promptResults: { + $let: { + vars: { + existingPromptIndex: { + $indexOfArray: ["$promptResults.prompt", prompt], + }, + }, + in: { + $cond: [ + { $eq: ["$$existingPromptIndex", -1] }, + { + $concatArrays: [ + "$promptResults", + [ + { + $literal: { + prompt, + expectedToolCalls, + modelResponses: [modelResponseToSave], + }, + }, + ], + ], + }, + { + $map: { + input: "$promptResults", + as: "promptResult", + in: { + $cond: [ + { $eq: ["$$promptResult.prompt", prompt] }, + { + prompt: "$$promptResult.prompt", + expectedToolCalls: { + $literal: expectedToolCalls, + }, + modelResponses: { + $concatArrays: [ + "$$promptResult.modelResponses", + [{ $literal: modelResponseToSave }], + ], + }, + }, + "$$promptResult", + ], + }, + }, + }, + ], + }, + }, + }, + }, + }, + ], + { upsert: true } + ); + } + + async close(): Promise { + await this.client.close(); + } +} diff --git a/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts new file mode 100644 index 00000000..845af8a0 --- /dev/null +++ b/tests/accuracy/sdk/accuracyResultStorage/resultStorage.ts @@ -0,0 +1,117 @@ +export interface LLMToolCall { + toolCallId: string; + toolName: string; + parameters: Record; +} + +export type ExpectedToolCall = Omit; + +export const AccuracyRunStatus = { + Done: "done", + Failed: "failed", + InProgress: "in-progress", +} as const; + +export type AccuracyRunStatuses = (typeof AccuracyRunStatus)[keyof typeof AccuracyRunStatus]; + +export interface AccuracyResult { + /** + * A unique id for each accuracy run. Should either be generated by the + * script triggering the accuracy run or provided via environment variables. + * */ + runId: string; + /** + * Represents the status of accuracy run. 
Each test completion, during an + * accuracy run, is supposed to submit an accuracy result entry with + * InProgress status which then later, after completion of accuracy run, is + * updated to either Done or Failed, depending on whether there were errors + * during the run or not. */ + runStatus: AccuracyRunStatuses; + /** + * Timestamp of when this result entry was generated. */ + createdOn: number; + /** + * The commit SHA for which the accuracy run was triggered. */ + commitSHA: string; + /** + * A list of results for different prompts tested in the accuracy run. */ + promptResults: PromptResult[]; +} + +export interface PromptResult { + /** + * The actual prompt that was provided to LLM as test */ + prompt: string; + /** + * A list of tools, along with their parameters, that are expected to be + * called by the LLM in test. */ + expectedToolCalls: ExpectedToolCall[]; + /** + * The responses from the LLMs tested, when provided with the prompt. */ + modelResponses: ModelResponse[]; +} + +export interface ModelResponse { + /** + * The LLM provider providing the LLM APIs */ + provider: string; + /** + * The LLM which was requested to respond to our test prompts */ + requestedModel: string; + /** + * The ID of the model that actually responded to our prompt request. */ + respondingModel: string; + /** + * The total time taken by LLM to respond to our prompt. */ + llmResponseTime: number; + /** + * A number between 0 and 1, representing how accurately the expected tools + * were called by LLM when responding to the provided prompts. To know more + * about how this number is generated, check - toolCallingAccuracy.ts */ + toolCallingAccuracy: number; + /** + * A list of tools, along with their parameters, that were actually called + * by the LLM in test. */ + llmToolCalls: LLMToolCall[]; + /** + * Token usage data, returned as part of LLM prompt response. 
*/ + tokensUsed?: TokensUsed; + /** + * The final response text generated by the LLM, in response to our prompt + * request. */ + text?: string; + /** + * A list of messages, exchanged between LLM and our testing agent, in + * response to our prompt request. This is particularly helpful for + * debugging. */ + messages?: Record[]; +} + +interface TokensUsed { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; +} + +export interface AccuracyResultStorage { + /** + * Retrieves the accuracy result for the provided commit SHA and optionally + * the run id. When the run id is omitted, the implementation fetches the + * result for the last successful accuracy run otherwise it fetches the + * result regardless of the run status. */ + getAccuracyResult(commitSHA: string, runId?: string): Promise; + /** + * Updates the status of the run */ + updateRunStatus(commitSHA: string, runId: string, status: AccuracyRunStatuses): Promise; + /** + * Attempts to atomically insert the model response for the prompt in the + * stored accuracy result. */ + saveModelResponseForPrompt(data: { + commitSHA: string; + runId: string; + prompt: string; + expectedToolCalls: ExpectedToolCall[]; + modelResponse: ModelResponse; + }): Promise; + close(): Promise; +} diff --git a/tests/accuracy/sdk/accuracyScorer.ts b/tests/accuracy/sdk/accuracyScorer.ts new file mode 100644 index 00000000..92c18853 --- /dev/null +++ b/tests/accuracy/sdk/accuracyScorer.ts @@ -0,0 +1,93 @@ +import { ExpectedToolCall, LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; +import { Matcher } from "./matcher.js"; + +/** + * Tool calling accuracy is a single number calculated based on two dimensions. + * 1. Did LLM call the right tool? + * 2. Did LLM call the tool with correct and required parameters? 
+ * + * The number can be one of: + * - 0: When the LLM: + * - did not call the right tool + * - did not call the tool with correct parameters + * - 0.75: When the LLM: + * - called the right tool but hallucinated and called some extra tools as + * well or called the same tool but with different parameters + * - called the right tool but hallucinated and called it with some + * non-required parameters + * - 1: When the LLM: + * - called exactly the tools that were expected + * - called the expected tools exactly with the expected parameters + * + * To calculate this number we must have: + * 1. a list of expected tool calls with their expected parameters + * 2. a list of LLM tool calls with their parameters + * + * For each expected tool call we find the best matching LLM tool call. The + * best matching LLM tool call will have: + * 1. the same name as that of the expected tool call + * 2. the highest parameter similarity score, with at least 0.75 to ensure an + * actual match. In case of competing scores, we take the first one that + * appears in the LLM tool calls. + * + * Using the above logic we establish pairs between expected and actual tool + * calls. + * + * 1. If we could not pair some LLM tool calls with expected tool calls, that + * means the LLM hallucinated the extra tool calls. For that reason we + * will cap the maximum achievable accuracy to 0.75. + * + * 2. If we could not pair some expected tool calls with LLM tool calls, that + * means the LLM did not call one of the expected tools required to solve the + * problem. For that reason we will mark the accuracy as 0 and exit early. + * + * 3. Now for each of the established tool call pairs, we will determine how + * correctly the parameters were called using the parameter similarity score.
+ * The parameter similarity score follow the same accuracy number pattern + * described above: + * - 0 : for missing parameters, incorrect parameter values + * - 0.75 : for additional parameters + * - 1 : for a perfect match + * + * The final accuracy score is then calculated as the least of: + * - Maximum achievable accuracy from #1 + * - The least of parameter similarity score from the established pairs in #3 + * + * For examples: see the test cases in - tests/unit/accuracy-scorer.test.ts + */ +export function calculateToolCallingAccuracy( + expectedToolCalls: ExpectedToolCall[], + actualToolCalls: LLMToolCall[] +): number { + if (expectedToolCalls.length === 0) { + return actualToolCalls.length === 0 ? 1 : 0.75; + } + + let currentScore = actualToolCalls.length > expectedToolCalls.length ? 0.75 : 1; + const checkedActualToolCallIndexes = new Set(); + + for (const expectedCall of expectedToolCalls) { + const candidates = actualToolCalls + .map((call, index) => ({ call, index })) + .filter( + ({ call, index }) => !checkedActualToolCallIndexes.has(index) && call.toolName === expectedCall.toolName + ) + .map(({ call, index }) => ({ + call, + index, + score: Matcher.value(expectedCall.parameters).match(call.parameters), + })) + .filter(({ score }) => score >= 0.75) + .sort((a, b) => b.score - a.score || a.index - b.index); + + const bestMatch = candidates[0]; + if (!bestMatch || bestMatch.score === 0) { + return 0; // No matching tool call found, return 0 + } + + checkedActualToolCallIndexes.add(bestMatch.index); + currentScore = Math.min(currentScore, bestMatch.score); + } + + return currentScore; +} diff --git a/tests/accuracy/sdk/accuracyTestingClient.ts b/tests/accuracy/sdk/accuracyTestingClient.ts new file mode 100644 index 00000000..e07a5146 --- /dev/null +++ b/tests/accuracy/sdk/accuracyTestingClient.ts @@ -0,0 +1,94 @@ +import { v4 as uuid } from "uuid"; +import { experimental_createMCPClient as createMCPClient, tool as createVercelTool } from "ai"; +import 
{ CallToolResult } from "@modelcontextprotocol/sdk/types.js"; +import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"; + +import { MCP_SERVER_CLI_SCRIPT } from "./constants.js"; +import { LLMToolCall } from "./accuracyResultStorage/resultStorage.js"; +import { VercelMCPClient, VercelMCPClientTools } from "./agent.js"; + +type ToolResultGeneratorFn = (...parameters: unknown[]) => CallToolResult | Promise; +export type MockedTools = Record; + +/** + * AccuracyTestingClient is a bridge between actual MCP client connected to our + * MCP server and our Tool calling agent. Its serves the following purposes: + * 1. Captures actual tools provided by our MCP server + * 2. Translates captured MCP tools to tool definitions that can be consumed by + * Tool Calling agent (Ref: `vercelTools`) + * 3. Allow dynamic mocking and resetting of mocks of individual tool calls. + * 4. Records and provides tool calls made by LLMs with their parameters. + */ +export class AccuracyTestingClient { + private mockedTools: MockedTools = {}; + private llmToolCalls: LLMToolCall[] = []; + + private constructor(private readonly vercelMCPClient: VercelMCPClient) {} + + async close(): Promise { + await this.vercelMCPClient?.close(); + } + + async vercelTools(): Promise { + const vercelTools = (await this.vercelMCPClient?.tools()) ?? {}; + const rewrappedVercelTools: VercelMCPClientTools = {}; + for (const [toolName, tool] of Object.entries(vercelTools)) { + rewrappedVercelTools[toolName] = createVercelTool({ + ...tool, + execute: async (args, options) => { + this.llmToolCalls.push({ + toolCallId: uuid(), + toolName: toolName, + parameters: args as Record, + }); + try { + const toolResultGeneratorFn = this.mockedTools[toolName]; + if (toolResultGeneratorFn) { + return await toolResultGeneratorFn(args); + } + + return await tool.execute(args, options); + } catch (error) { + // There are cases when LLM calls the tools incorrectly + // and the schema definition check fails. 
In production, + // the tool calling agents are deployed with this fail + // safe to allow LLM to course correct themselves. That + // is exactly what we do here as well. + return { + isError: true, + content: JSON.stringify(error), + }; + } + }, + }); + } + + return rewrappedVercelTools; + } + + getLLMToolCalls(): LLMToolCall[] { + return this.llmToolCalls; + } + + mockTools(mockedTools: MockedTools): void { + this.mockedTools = mockedTools; + } + + resetForTests(): void { + this.mockTools({}); + this.llmToolCalls = []; + } + + static async initializeClient(mdbConnectionString: string): Promise { + const clientTransport = new StdioClientTransport({ + command: process.execPath, + args: [MCP_SERVER_CLI_SCRIPT, "--connectionString", mdbConnectionString], + }); + + const client = await createMCPClient({ + transport: clientTransport, + }); + + return new AccuracyTestingClient(client); + } +} diff --git a/tests/accuracy/sdk/agent.ts b/tests/accuracy/sdk/agent.ts new file mode 100644 index 00000000..ee0b5f7f --- /dev/null +++ b/tests/accuracy/sdk/agent.ts @@ -0,0 +1,56 @@ +import { generateText, LanguageModelV1, experimental_createMCPClient } from "ai"; +import { Model } from "./models.js"; + +const systemPrompt = [ + 'The keywords "MUST", "MUST NOT", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119', + "You are an expert AI assistant with access to a set of tools for MongoDB database operations.", + "You MUST use the most relevant tool to answer the user's request", + "When calling a tool, you MUST strictly follow its input schema and MUST provide all required arguments", + "If a task requires multiple tool calls, you MUST call all the necessary tools in sequence, following the requirements mentioned above for each tool called.", + 'If you do not know the answer or the request cannot be fulfilled, you MUST reply with "I don\'t know"', +]; + +// These types are not exported 
by Vercel SDK so we derive them here to be +// re-used again. +export type VercelMCPClient = Awaited>; +export type VercelMCPClientTools = Awaited>; +export type VercelAgent = ReturnType; + +export interface VercelAgentPromptResult { + respondingModel: string; + tokensUsage?: { + promptTokens?: number; + completionTokens?: number; + totalTokens?: number; + }; + text: string; + messages: Record[]; +} + +// Generic interface for Agent, in case we need to switch to some other agent +// development SDK +export interface Agent { + prompt(prompt: string, model: Model, tools: Tools): Promise; +} + +export function getVercelToolCallingAgent( + requestedSystemPrompt?: string +): Agent, VercelMCPClientTools, VercelAgentPromptResult> { + return { + async prompt(prompt: string, model: Model, tools: VercelMCPClientTools) { + const result = await generateText({ + model: model.getModel(), + system: [...systemPrompt, requestedSystemPrompt].filter(Boolean).join("\n"), + prompt, + tools, + maxSteps: 100, + }); + return { + text: result.text, + messages: result.response.messages, + respondingModel: result.response.modelId, + tokensUsage: result.usage, + }; + }, + }; +} diff --git a/tests/accuracy/sdk/constants.ts b/tests/accuracy/sdk/constants.ts new file mode 100644 index 00000000..c59534e3 --- /dev/null +++ b/tests/accuracy/sdk/constants.ts @@ -0,0 +1,26 @@ +import path from "path"; +import { fileURLToPath } from "url"; + +const __dirname = fileURLToPath(import.meta.url); + +export const ROOT_DIR = path.join(__dirname, "..", "..", "..", ".."); + +export const DIST_DIR = path.join(ROOT_DIR, "dist"); + +export const RESOURCES_DIR = path.join(ROOT_DIR, "resources"); + +export const MCP_SERVER_CLI_SCRIPT = path.join(DIST_DIR, "index.js"); + +export const TEST_DATA_DUMPS_DIR = path.join(__dirname, "test-data-dumps"); + +export const GENERATED_ASSETS_DIR = path.join(ROOT_DIR, ".accuracy"); + +export const ACCURACY_RESULTS_DIR = path.join(GENERATED_ASSETS_DIR, "results"); + +export const 
LATEST_ACCURACY_RUN_NAME = "latest-run"; + +export const HTML_TEST_SUMMARY_FILE = path.join(GENERATED_ASSETS_DIR, "test-summary.html"); + +export const MARKDOWN_TEST_BRIEF_FILE = path.join(GENERATED_ASSETS_DIR, "test-brief.md"); + +export const HTML_TESTS_SUMMARY_TEMPLATE = path.join(RESOURCES_DIR, "test-summary-template.html"); diff --git a/tests/accuracy/sdk/describeAccuracyTests.ts b/tests/accuracy/sdk/describeAccuracyTests.ts new file mode 100644 index 00000000..a10d46ef --- /dev/null +++ b/tests/accuracy/sdk/describeAccuracyTests.ts @@ -0,0 +1,126 @@ +import { describe, it, beforeAll, beforeEach, afterAll } from "vitest"; +import { getAvailableModels } from "./models.js"; +import { calculateToolCallingAccuracy } from "./accuracyScorer.js"; +import { getVercelToolCallingAgent, VercelAgent } from "./agent.js"; +import { prepareTestData, setupMongoDBIntegrationTest } from "../../integration/tools/mongodb/mongodbHelpers.js"; +import { AccuracyTestingClient, MockedTools } from "./accuracyTestingClient.js"; +import { AccuracyResultStorage, ExpectedToolCall } from "./accuracyResultStorage/resultStorage.js"; +import { getAccuracyResultStorage } from "./accuracyResultStorage/getAccuracyResultStorage.js"; +import { getCommitSHA } from "./gitInfo.js"; + +export interface AccuracyTestConfig { + /** The prompt to be provided to LLM for evaluation. */ + prompt: string; + + /** + * A list of tools and their parameters that we expect LLM to call based on + * how vague or detailed the prompt is. Ideally this should be a list of + * bare minimum and critical tool calls that are required to solve the + * problem mentioned in the prompt but because, for even a slightly vague + * prompt, LLM might decide to do additional confirmation by calling other + * tools, its fine to include those other tool calls as well to get a + * perfect 1 on the tool calling accuracy score. 
*/ + expectedToolCalls: ExpectedToolCall[]; + + /** + * The additional system prompt to be appended to already injected system + * prompt. */ + systemPrompt?: string; + + /** + * A small hint appended to the actual prompt in test, which is supposed to + * hint LLM to assume that the MCP server is already connected so that it + * does not call the connect tool. + * By default it is assumed to be true */ + injectConnectedAssumption?: boolean; + + /** + * A map of tool names to their mocked implementation. When the mocked + * implementations are available, the testing client will prefer those over + * actual MCP tool calls. */ + mockedTools?: MockedTools; +} + +export function describeAccuracyTests(accuracyTestConfigs: AccuracyTestConfig[]) { + if (!process.env.MDB_ACCURACY_RUN_ID) { + throw new Error("MDB_ACCURACY_RUN_ID env variable is required for accuracy test runs!"); + } + + const models = getAvailableModels(); + if (!models.length) { + throw new Error("No models available to test. Ensure that the API keys are properly setup!"); + } + + const eachModel = describe.each(models); + + eachModel(`$displayName`, function (model) { + const accuracyRunId = `${process.env.MDB_ACCURACY_RUN_ID}`; + const mdbIntegration = setupMongoDBIntegrationTest(); + const { populateTestData, cleanupTestDatabases } = prepareTestData(mdbIntegration); + + let commitSHA: string; + let accuracyResultStorage: AccuracyResultStorage; + let testMCPClient: AccuracyTestingClient; + let agent: VercelAgent; + + beforeAll(async () => { + const retrievedCommitSHA = await getCommitSHA(); + if (!retrievedCommitSHA) { + throw new Error("Could not derive commitSHA, exiting accuracy tests!"); + } + commitSHA = retrievedCommitSHA; + + accuracyResultStorage = getAccuracyResultStorage(); + testMCPClient = await AccuracyTestingClient.initializeClient(mdbIntegration.connectionString()); + agent = getVercelToolCallingAgent(); + }); + + beforeEach(async () => { + await cleanupTestDatabases(mdbIntegration); + 
await populateTestData(); + testMCPClient.resetForTests(); + }); + + afterAll(async () => { + await accuracyResultStorage?.close(); + await testMCPClient?.close(); + }); + + const eachTest = it.each(accuracyTestConfigs); + + eachTest("$prompt", async function (testConfig) { + testMCPClient.mockTools(testConfig.mockedTools ?? {}); + const toolsForModel = await testMCPClient.vercelTools(); + const promptForModel = + testConfig.injectConnectedAssumption === false + ? testConfig.prompt + : [testConfig.prompt, "(Assume that you are already connected to a MongoDB cluster!)"].join(" "); + + const timeBeforePrompt = Date.now(); + const result = await agent.prompt(promptForModel, model, toolsForModel); + const timeAfterPrompt = Date.now(); + + const llmToolCalls = testMCPClient.getLLMToolCalls(); + const toolCallingAccuracy = calculateToolCallingAccuracy(testConfig.expectedToolCalls, llmToolCalls); + + const responseTime = timeAfterPrompt - timeBeforePrompt; + await accuracyResultStorage.saveModelResponseForPrompt({ + commitSHA, + runId: accuracyRunId, + prompt: testConfig.prompt, + expectedToolCalls: testConfig.expectedToolCalls, + modelResponse: { + provider: model.provider, + requestedModel: model.modelName, + respondingModel: result.respondingModel, + llmResponseTime: responseTime, + toolCallingAccuracy: toolCallingAccuracy, + llmToolCalls: llmToolCalls, + tokensUsed: result.tokensUsage, + text: result.text, + messages: result.messages, + }, + }); + }); + }); +} diff --git a/tests/accuracy/sdk/gitInfo.ts b/tests/accuracy/sdk/gitInfo.ts new file mode 100644 index 00000000..03e34a7d --- /dev/null +++ b/tests/accuracy/sdk/gitInfo.ts @@ -0,0 +1,7 @@ +import { simpleGit } from "simple-git"; + +export async function getCommitSHA(): Promise { + const commitLogs = await simpleGit().log(); + const lastCommit = commitLogs.latest; + return lastCommit?.hash; +} diff --git a/tests/accuracy/sdk/matcher.ts b/tests/accuracy/sdk/matcher.ts new file mode 100644 index 00000000..06999a02 
--- /dev/null +++ b/tests/accuracy/sdk/matcher.ts @@ -0,0 +1,193 @@ +const MATCHER_SYMBOL = Symbol("match"); + +export abstract class Matcher { + [MATCHER_SYMBOL] = true; + public abstract match(actual: unknown): number; + + public static get emptyObjectOrUndefined(): Matcher { + return new EmptyObjectOrUndefinedMatcher(); + } + + public static get anyValue(): Matcher { + return new AnyValueMatcher(); + } + + public static number(additionalFilter: (value: number) => boolean = () => true): Matcher { + return new NumberMatcher(additionalFilter); + } + + public static anyOf(...matchers: Matcher[]): Matcher { + return new CompositeMatcher(matchers); + } + + public static get undefined(): Matcher { + return new UndefinedMatcher(); + } + + public static boolean(expected?: boolean): Matcher { + return new BooleanMatcher(expected); + } + + public static string(): Matcher { + return new StringMatcher(); + } + + public static value(expected: unknown): Matcher { + if (typeof expected === "object" && expected !== null && MATCHER_SYMBOL in expected) { + return expected as Matcher; + } + + return new ValueMatcher(expected); + } +} + +export const PARAMETER_SCORER_SYMBOL = Symbol("parameterScorer"); + +class EmptyObjectOrUndefinedMatcher extends Matcher { + public match(actual: unknown): number { + if ( + actual === undefined || + actual === null || + (typeof actual === "object" && Object.keys(actual).length === 0) + ) { + return 1; // Match if actual is undefined, null, or an empty object + } + + return 0; // No match + } +} + +class AnyValueMatcher extends Matcher { + public match(): number { + return 1; + } +} + +class NumberMatcher extends Matcher { + constructor(private additionalFilter: (value: number) => boolean = () => true) { + super(); + } + public match(actual: unknown): number { + return typeof actual === "number" && this.additionalFilter(actual) ? 
1 : 0; + } +} + +class UndefinedMatcher extends Matcher { + public match(actual: unknown): number { + return actual === undefined ? 1 : 0; + } +} + +class CompositeMatcher extends Matcher { + constructor(private matchers: Matcher[]) { + super(); + } + + public match(actual: unknown): number { + let currentScore = 0; + + for (const matcher of this.matchers) { + const score = matcher.match(actual); + if (score === 1) { + return 1; // If one of the matchers is perfect score, return immediately + } + currentScore = Math.max(currentScore, score); + } + + return currentScore; + } +} + +class BooleanMatcher extends Matcher { + constructor(private expected?: boolean) { + super(); + } + + public match(actual: unknown): number { + return typeof actual === "boolean" && (this.expected === undefined || this.expected === actual) ? 1 : 0; + } +} + +class StringMatcher extends Matcher { + public match(actual: unknown): number { + return typeof actual === "string" ? 1 : 0; + } +} + +class ValueMatcher extends Matcher { + constructor(private expected: unknown) { + super(); + } + + public match(actual: unknown): number { + if (this.expected === actual) { + // If both are the same, just return immediately. + return 1; + } + + if (this.expected === undefined || this.expected === null) { + // We expect null/undefined - return 1 if actual is also null/undefined + return actual === undefined || actual === null ? 1 : 0; + } + + let currentScore = 1; + + if (Array.isArray(this.expected)) { + if (!Array.isArray(actual)) { + // One is an array, the other is not + return 0; + } + + if (actual.length > this.expected.length) { + // Actual array has more elements - this is likely an error (e.g. 
an aggregation pipeline with extra stages) + // If we want to allow extra elements, we should add matchers to the array + return 0; + } + + for (let i = 0; i < this.expected.length; i++) { + currentScore = Math.min(currentScore, Matcher.value(this.expected[i]).match(actual[i])); + if (currentScore === 0) { + // If we already found a mismatch, we can stop early + return 0; + } + } + } else if (typeof this.expected === "object") { + if (MATCHER_SYMBOL in this.expected) { + return (this.expected as Matcher).match(actual); + } + + if (typeof actual !== "object" || actual === null) { + // One is an object, the other is not + return 0; + } + + const expectedKeys = Object.keys(this.expected); + const actualKeys = Object.keys(actual); + + if (actualKeys.length > expectedKeys.length) { + // The model provided more keys than expected - this should not happen. + // If we want to allow some extra keys, we should specify that in the test definition + // by adding matchers for those keys. + return 0; + } + + for (const key of expectedKeys) { + currentScore = Math.min( + currentScore, + Matcher.value((this.expected as Record)[key]).match( + (actual as Record)[key] + ) + ); + + if (currentScore === 0) { + // If we already found a mismatch, we can stop early + return 0; + } + } + } else { + return 0; + } + + return currentScore; + } +} diff --git a/tests/accuracy/sdk/models.ts b/tests/accuracy/sdk/models.ts new file mode 100644 index 00000000..02d2739b --- /dev/null +++ b/tests/accuracy/sdk/models.ts @@ -0,0 +1,95 @@ +import { LanguageModelV1 } from "ai"; +import { createGoogleGenerativeAI } from "@ai-sdk/google"; +import { createAzure } from "@ai-sdk/azure"; +import { createOpenAI } from "@ai-sdk/openai"; +import { ollama } from "ollama-ai-provider"; + +export interface Model { + readonly modelName: string; + readonly provider: string; + readonly displayName: string; + isAvailable(): boolean; + getModel(): VercelModel; +} + +export class OpenAIModel implements Model { + readonly 
provider = "OpenAI"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_OPEN_AI_API_KEY; + } + + getModel() { + return createOpenAI({ + apiKey: process.env.MDB_OPEN_AI_API_KEY, + })(this.modelName); + } +} + +export class AzureOpenAIModel implements Model { + readonly provider = "Azure"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_AZURE_OPEN_AI_API_KEY && !!process.env.MDB_AZURE_OPEN_AI_API_URL; + } + + getModel() { + return createAzure({ + baseURL: process.env.MDB_AZURE_OPEN_AI_API_URL, + apiKey: process.env.MDB_AZURE_OPEN_AI_API_KEY, + apiVersion: "2024-12-01-preview", + })(this.modelName); + } +} + +export class GeminiModel implements Model { + readonly provider = "Google"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return !!process.env.MDB_GEMINI_API_KEY; + } + + getModel() { + return createGoogleGenerativeAI({ + apiKey: process.env.MDB_GEMINI_API_KEY, + })(this.modelName); + } +} + +export class OllamaModel implements Model { + readonly provider = "Ollama"; + readonly displayName: string; + + constructor(readonly modelName: string) { + this.displayName = `${this.provider} - ${modelName}`; + } + + isAvailable(): boolean { + return true; + } + + getModel() { + return ollama(this.modelName); + } +} + +const ALL_TESTABLE_MODELS: Model[] = [new AzureOpenAIModel("gpt-4o")]; + +export function getAvailableModels(): Model[] { + return ALL_TESTABLE_MODELS.filter((model) => model.isAvailable()); +} diff --git a/tests/accuracy/test-data-dumps/comics.books.json b/tests/accuracy/test-data-dumps/comics.books.json new file mode 100644 index 00000000..f605f031 --- 
/dev/null +++ b/tests/accuracy/test-data-dumps/comics.books.json @@ -0,0 +1,417 @@ +[ + { + "_id": "fa53ead3-36f3-414c-9b3a-53aa9cf5038a", + "title": "Configurable dedicated project", + "publisher": "Dark Horse Comics", + "release_date": "2007-03-02T00:00:00", + "issues": 118, + "main_characters": ["Stephen Shaw"], + "genre": ["Sci-Fi"] + }, + { + "_id": "b2e993fb-2688-4ab0-9512-f8ada5faa948", + "title": "Focused intangible service-desk", + "publisher": "Image Comics", + "release_date": "1998-12-07T00:00:00", + "issues": 137, + "main_characters": ["Margaret Hogan"], + "genre": ["Adventure", "Horror"] + }, + { + "_id": "f674a05a-12c8-4344-875c-6cd1fcba8f9d", + "title": "Expanded secondary system engine", + "publisher": "DC Comics", + "release_date": "2012-12-01T00:00:00", + "issues": 227, + "main_characters": ["Joseph Cook", "Tammy Bishop"], + "genre": ["Superhero"] + }, + { + "_id": "bb72b493-2a61-41d7-9406-dfaf6e51a425", + "title": "Customizable zero-defect Graphic Interface", + "publisher": "DC Comics", + "release_date": "2011-02-24T00:00:00", + "issues": 270, + "main_characters": ["Sandra Moss"], + "genre": ["Fantasy"] + }, + { + "_id": "ea85131f-dfc8-4997-b3b0-996138185d73", + "title": "Reduced eco-centric help-desk", + "publisher": "Dark Horse Comics", + "release_date": "2021-03-12T00:00:00", + "issues": 202, + "main_characters": [ + "Margaret Hogan", + "Angelica Stein", + "Tammy Murphy", + "Larry Hensley" + ], + "genre": ["Adventure", "Horror"] + }, + { + "_id": "fdd56270-eb31-4456-8bf4-df81371eb290", + "title": "Triple-buffered dedicated help-desk", + "publisher": "Image Comics", + "release_date": "1964-09-20T00:00:00", + "issues": 36, + "main_characters": [ + "Richard Cooper", + "James Sanchez", + "Micheal Brown", + "Jeremy Rice" + ], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "6de66ba4-3975-4055-824c-cda5caf517d2", + "title": "Operative logistical secured line", + "publisher": "Marvel Comics", + "release_date": "2007-11-19T00:00:00", + "issues": 
55, + "main_characters": ["Joseph Bowman", "Robert Logan", "Ashley Watkins"], + "genre": ["Sci-Fi", "Horror"] + }, + { + "_id": "e3cafdbf-e97a-47c9-a848-bdd82e12f8f7", + "title": "Multi-lateral multi-state framework", + "publisher": "IDW Publishing", + "release_date": "2011-09-14T00:00:00", + "issues": 250, + "main_characters": [ + "Ashley Watkins", + "Virginia Watts", + "Lindsay Anderson", + "Scott Garcia" + ], + "genre": ["Action", "Horror"] + }, + { + "_id": "547190cd-5c9e-44c5-b8f9-afeefd039001", + "title": "Re-engineered encompassing standardization", + "publisher": "Marvel Comics", + "release_date": "1987-04-16T00:00:00", + "issues": 235, + "main_characters": ["Julie Goodwin"], + "genre": ["Sci-Fi"] + }, + { + "_id": "ba3d82f7-8edc-408c-8212-c0d6634624ee", + "title": "Fully-configurable local success", + "publisher": "Dark Horse Comics", + "release_date": "1979-09-13T00:00:00", + "issues": 239, + "main_characters": ["Chad Pham", "Lindsay Anderson", "Carlos Burton"], + "genre": ["Adventure"] + }, + { + "_id": "a6bc8677-22ab-415a-bfe2-731a9f887cb9", + "title": "Realigned zero-defect capability", + "publisher": "Marvel Comics", + "release_date": "2023-10-01T00:00:00", + "issues": 163, + "main_characters": ["Kevin Humphrey", "Maria Wright", "Virginia Watts"], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "fb986790-df22-4db4-8168-c76e9e9471f8", + "title": "Sharable bottom-line frame", + "publisher": "IDW Publishing", + "release_date": "2016-09-28T00:00:00", + "issues": 14, + "main_characters": ["Brian Vincent"], + "genre": ["Sci-Fi", "Fantasy"] + }, + { + "_id": "700aa115-dc5a-4be6-b275-bfb943c95ee0", + "title": "Centralized next generation middleware", + "publisher": "Image Comics", + "release_date": "1970-04-16T00:00:00", + "issues": 5, + "main_characters": ["Joseph Cook"], + "genre": ["Fantasy"] + }, + { + "_id": "7959187e-9693-43a1-ae2d-c168431fceb2", + "title": "Re-engineered heuristic array", + "publisher": "IDW Publishing", + "release_date": 
"2019-02-15T00:00:00", + "issues": 121, + "main_characters": ["Angelica Stein", "Benjamin Morris", "Jeremy Rice"], + "genre": ["Fantasy", "Action"] + }, + { + "_id": "d6018445-5149-42e7-9d87-eb1b181ce20c", + "title": "Programmable transitional collaboration", + "publisher": "DC Comics", + "release_date": "1999-08-10T00:00:00", + "issues": 235, + "main_characters": [ + "Joseph Cook", + "Cynthia Brown", + "Carlos Burton", + "Micheal Brown" + ], + "genre": ["Adventure"] + }, + { + "_id": "055507ff-7a48-4df8-9ba9-7b6c10e11836", + "title": "Object-based dynamic knowledgebase", + "publisher": "Image Comics", + "release_date": "1993-02-24T00:00:00", + "issues": 189, + "main_characters": [ + "Cristian Oneal", + "Brian Vincent", + "Holly Green", + "James Sanchez" + ], + "genre": ["Sci-Fi", "Fantasy"] + }, + { + "_id": "1add2da3-68e6-48a3-9703-b593c9e0bf2e", + "title": "Enhanced asynchronous matrices", + "publisher": "DC Comics", + "release_date": "2001-03-01T00:00:00", + "issues": 176, + "main_characters": ["Justin Martinez", "Tammy Murphy"], + "genre": ["Action", "Fantasy"] + }, + { + "_id": "c0fe2869-eb7d-4f09-a773-028387a54969", + "title": "Synergized maximized artificial intelligence", + "publisher": "DC Comics", + "release_date": "1976-09-05T00:00:00", + "issues": 68, + "main_characters": ["Christopher Elliott", "Maria Wright"], + "genre": ["Superhero", "Adventure"] + }, + { + "_id": "c2fafbf6-5f71-4f31-9775-803e8c77e467", + "title": "Switchable bottom-line complexity", + "publisher": "Marvel Comics", + "release_date": "2012-08-12T00:00:00", + "issues": 156, + "main_characters": [ + "Lindsay Anderson", + "Virginia Watts", + "Robert Logan", + "Margaret Hogan" + ], + "genre": ["Adventure"] + }, + { + "_id": "f72be3a7-d4be-40a1-ad66-370b44759047", + "title": "Triple-buffered impactful customer loyalty", + "publisher": "Marvel Comics", + "release_date": "1976-09-18T00:00:00", + "issues": 275, + "main_characters": ["Sandra Moss", "Charles Blair", "Justin Martinez"], + 
"genre": ["Fantasy", "Action"] + }, + { + "_id": "da5be16e-13e8-42d5-8954-bd89919395af", + "title": "Programmable 24/7 website", + "publisher": "DC Comics", + "release_date": "2023-11-06T00:00:00", + "issues": 278, + "main_characters": [ + "Luis Callahan", + "Carlos Burton", + "Cristian Oneal", + "Michelle Valdez" + ], + "genre": ["Horror", "Fantasy"] + }, + { + "_id": "92afc1e6-f703-4aa7-9866-3b62f2784fec", + "title": "Advanced incremental framework", + "publisher": "Image Comics", + "release_date": "2008-07-21T00:00:00", + "issues": 109, + "main_characters": ["Holly Green", "Diana Mata", "Julie Goodwin"], + "genre": ["Horror", "Sci-Fi"] + }, + { + "_id": "fec61fdd-bddb-431a-b14a-d81601a47cf8", + "title": "Front-line coherent system engine", + "publisher": "DC Comics", + "release_date": "2012-04-27T00:00:00", + "issues": 297, + "main_characters": ["Joshua Hicks"], + "genre": ["Action", "Horror"] + }, + { + "_id": "9d37d0d7-1adc-4f54-8790-30f13472520c", + "title": "Progressive systematic superstructure", + "publisher": "Image Comics", + "release_date": "1996-02-20T00:00:00", + "issues": 295, + "main_characters": ["Margaret Hogan", "Christopher Elliott", "Joseph Cook"], + "genre": ["Fantasy", "Adventure"] + }, + { + "_id": "338a83ad-06fc-42e1-a605-60a192ce5643", + "title": "Implemented national help-desk", + "publisher": "DC Comics", + "release_date": "2015-05-11T00:00:00", + "issues": 257, + "main_characters": [ + "Lindsay Anderson", + "James Sanchez", + "Julie Goodwin", + "Charles Blair" + ], + "genre": ["Action"] + }, + { + "_id": "5b07c17b-4df9-4b72-9c3e-b51d93def1fb", + "title": "Down-sized impactful workforce", + "publisher": "IDW Publishing", + "release_date": "2024-06-19T00:00:00", + "issues": 259, + "main_characters": ["Debbie Green"], + "genre": ["Sci-Fi", "Superhero"] + }, + { + "_id": "625b11a5-bb45-4837-9cd6-50bfe2e3396c", + "title": "Re-engineered leadingedge structure", + "publisher": "DC Comics", + "release_date": "2011-04-14T00:00:00", + "issues": 
282, + "main_characters": [ + "Larry Hensley", + "Joseph Cook", + "Brian Vincent", + "Sandra Moss" + ], + "genre": ["Adventure"] + }, + { + "_id": "71b845f3-4416-430a-81eb-8c208f824365", + "title": "Cloned 3rdgeneration contingency", + "publisher": "Dark Horse Comics", + "release_date": "2002-07-11T00:00:00", + "issues": 238, + "main_characters": [ + "Larry Hensley", + "Margaret Hogan", + "Holly Green", + "Joseph Bowman" + ], + "genre": ["Superhero", "Fantasy"] + }, + { + "_id": "14dbf3a6-d258-4c96-8883-336b60bc2112", + "title": "Secured zero tolerance monitoring", + "publisher": "DC Comics", + "release_date": "1969-11-30T00:00:00", + "issues": 104, + "main_characters": ["Micheal Brown"], + "genre": ["Horror", "Superhero"] + }, + { + "_id": "091e16d8-d50c-4e7d-9b3a-545cf2596738", + "title": "Automated bifurcated access", + "publisher": "Image Comics", + "release_date": "1990-01-24T00:00:00", + "issues": 74, + "main_characters": ["Robert Logan"], + "genre": ["Sci-Fi"] + }, + { + "_id": "c47ec96a-4d6e-43ea-9bb5-00e4c8058b53", + "title": "Universal high-level pricing structure", + "publisher": "DC Comics", + "release_date": "1971-04-21T00:00:00", + "issues": 135, + "main_characters": ["Jeremy Rice", "Elizabeth Robinson", "James Sanchez"], + "genre": ["Action", "Sci-Fi"] + }, + { + "_id": "d446a8ca-5d01-4be9-a061-027ef1f7bfc6", + "title": "Reduced optimizing strategy", + "publisher": "Dark Horse Comics", + "release_date": "1984-06-24T00:00:00", + "issues": 111, + "main_characters": ["Joshua Hicks", "Jeremy Rice", "Micheal Brown"], + "genre": ["Fantasy", "Superhero"] + }, + { + "_id": "09c734ff-2bf0-4cb6-bd42-4232209c00c9", + "title": "Virtual non-volatile groupware", + "publisher": "DC Comics", + "release_date": "2013-05-22T00:00:00", + "issues": 13, + "main_characters": ["Luis Callahan", "Tammy Bishop", "Cynthia Brown"], + "genre": ["Action"] + }, + { + "_id": "691034fa-ad52-413e-96a2-a9a319fffe7b", + "title": "Horizontal disintermediate extranet", + "publisher": "DC 
Comics", + "release_date": "2021-12-03T00:00:00", + "issues": 129, + "main_characters": ["Margaret Hogan"], + "genre": ["Action"] + }, + { + "_id": "07942b5a-f7c4-4fc1-bdeb-7eb46b0d57f8", + "title": "Cross-platform discrete framework", + "publisher": "Dark Horse Comics", + "release_date": "2001-08-02T00:00:00", + "issues": 38, + "main_characters": ["James Sanchez", "Larry Hensley"], + "genre": ["Superhero"] + }, + { + "_id": "05d637ed-3942-4276-a885-7b3363dd48e2", + "title": "Cross-platform regional info-mediaries", + "publisher": "Image Comics", + "release_date": "2005-03-30T00:00:00", + "issues": 150, + "main_characters": ["Carlos Burton"], + "genre": ["Superhero", "Fantasy"] + }, + { + "_id": "88904f06-50a6-44f1-bccc-f379a9788611", + "title": "Mandatory 6thgeneration secured line", + "publisher": "Image Comics", + "release_date": "2021-06-27T00:00:00", + "issues": 262, + "main_characters": ["Luis Callahan"], + "genre": ["Sci-Fi", "Superhero"] + }, + { + "_id": "fc961fd6-2ec6-43e5-beae-7f58a6c25d9c", + "title": "Exclusive interactive concept", + "publisher": "IDW Publishing", + "release_date": "1969-06-03T00:00:00", + "issues": 264, + "main_characters": ["Scott Garcia", "Joseph Bowman"], + "genre": ["Fantasy", "Superhero"] + }, + { + "_id": "481a3ea6-9629-4fe6-8a5a-eba846f0e62c", + "title": "Focused intermediate methodology", + "publisher": "DC Comics", + "release_date": "2004-03-19T00:00:00", + "issues": 210, + "main_characters": [ + "Justin Martinez", + "Julie Goodwin", + "Benjamin Morris", + "Virginia Watts" + ], + "genre": ["Adventure", "Action"] + }, + { + "_id": "6bab6bcd-2f6b-4dfb-a030-d63b32fc6250", + "title": "Right-sized contextually-based toolset", + "publisher": "IDW Publishing", + "release_date": "2007-12-27T00:00:00", + "issues": 117, + "main_characters": ["Debbie Green", "Christopher Elliott", "Joshua Hicks"], + "genre": ["Sci-Fi", "Action"] + } +] diff --git a/tests/accuracy/test-data-dumps/comics.characters.json 
b/tests/accuracy/test-data-dumps/comics.characters.json new file mode 100644 index 00000000..4a255f48 --- /dev/null +++ b/tests/accuracy/test-data-dumps/comics.characters.json @@ -0,0 +1,402 @@ +[ + { + "_id": "d7047787-abea-40fa-b78e-939925fd3589", + "name": "Elizabeth Robinson", + "alias": "ashley62", + "powers": ["Shapeshifting", "Telepathy", "Flight"], + "first_appearance": "1961-06-23T00:00:00", + "affiliations": ["Fantastic Four", "X-Men"], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "06ac8173-51a6-404c-8f9a-628de889b1de", + "name": "Joshua Wang", + "alias": "paulasmith", + "powers": ["Telekinesis"], + "first_appearance": "1987-04-16T00:00:00", + "affiliations": ["Fantastic Four", "Justice League"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "252c203a-0271-4ee7-a3d9-34c9f922b959", + "name": "Stephen Shaw", + "alias": "adamskenneth", + "powers": ["Super Speed", "Flight"], + "first_appearance": "2004-07-26T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "bf5b7d04-fe71-4969-84a3-0eb9ed5d2197", + "name": "Joseph Bowman", + "alias": "amysalazar", + "powers": ["Time Manipulation"], + "first_appearance": "1961-07-03T00:00:00", + "affiliations": ["Teen Titans", "Avengers"], + "origin": "Atlantis", + "is_villain": true + }, + { + "_id": "c6271161-bd78-4338-b6ca-88d91f7b853e", + "name": "Debbie Green", + "alias": "steventodd", + "powers": ["Energy Blasts", "Regeneration"], + "first_appearance": "2021-12-05T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "60223f4c-5908-4f82-a2a3-a5dad1771f7f", + "name": "Christopher Elliott", + "alias": "barajasmitchell", + "powers": ["Flight", "Invisibility", "Telekinesis"], + "first_appearance": "1947-03-23T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "f66a8f7a-9ca3-431a-9ece-aba96be18220", + "name": "Tammy Murphy", + "alias": "jessicagill", + "powers": 
["Super Strength", "Telekinesis"], + "first_appearance": "2000-07-06T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "817c0b11-3eac-4a3a-b55f-203126db060f", + "name": "Scott Garcia", + "alias": "whitechristie", + "powers": ["Telepathy", "Energy Blasts"], + "first_appearance": "2000-11-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "1ee6789f-d774-43b8-87e2-9f6dbac6230a", + "name": "Julie Goodwin", + "alias": "robertsmith", + "powers": ["Telepathy", "Super Speed"], + "first_appearance": "1953-08-09T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3ab9b55d-94ab-449e-bda9-63b2c633494a", + "name": "Joshua Hicks", + "alias": "cynthia32", + "powers": ["Super Strength", "Invisibility", "Telekinesis"], + "first_appearance": "1967-07-17T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "51adf385-1f8e-4290-bcc6-ce2808dc461e", + "name": "Justin Martinez", + "alias": "janicebrown", + "powers": ["Super Speed", "Super Strength"], + "first_appearance": "1973-09-19T00:00:00", + "affiliations": ["Avengers"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "3a3d934e-f5bb-4238-b8a5-74669a937a14", + "name": "Holly Green", + "alias": "ystanley", + "powers": ["Shapeshifting", "Energy Blasts"], + "first_appearance": "2013-08-05T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": true + }, + { + "_id": "f044b9fb-82c6-48b3-b8b2-806b0be66466", + "name": "Margaret Hogan", + "alias": "wendyconway", + "powers": ["Super Speed", "Telepathy"], + "first_appearance": "1944-08-13T00:00:00", + "affiliations": ["Justice League", "X-Men"], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "fd50880a-9d0e-43e1-8b20-2830eba8c7dc", + "name": "Ashley Watkins", + "alias": "cjohnson", + "powers": ["Shapeshifting"], + "first_appearance": "1940-09-13T00:00:00", + 
"affiliations": ["Fantastic Four", "Guardians of the Galaxy"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "68036d6b-1780-4352-98ea-2c68cb5c7bff", + "name": "Tammy Bishop", + "alias": "geoffreyryan", + "powers": ["Regeneration"], + "first_appearance": "1984-11-04T00:00:00", + "affiliations": ["Fantastic Four", "X-Men"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "dbfa84f2-e598-4e67-99a9-5e8c34e5606f", + "name": "Michelle Valdez", + "alias": "manuelcobb", + "powers": ["Regeneration", "Energy Blasts"], + "first_appearance": "2014-08-04T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "ae85885c-13d0-4ae2-b82c-fa53859665d7", + "name": "Joseph Cook", + "alias": "scott40", + "powers": ["Telepathy", "Telekinesis"], + "first_appearance": "1976-04-01T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "0738b98f-4699-4609-9156-fb6a1085a503", + "name": "Jeremy Rice", + "alias": "james82", + "powers": ["Invisibility"], + "first_appearance": "1977-09-22T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "a072c5df-cc65-4044-ba24-fcc8eaa71b4a", + "name": "Chad Pham", + "alias": "smithjennifer", + "powers": ["Telepathy"], + "first_appearance": "2001-05-26T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "d545ec48-680c-4493-8650-d759bedabb7e", + "name": "Diana Mata", + "alias": "zwilliamson", + "powers": ["Super Speed", "Energy Blasts", "Invisibility"], + "first_appearance": "2010-11-21T00:00:00", + "affiliations": [], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "e6bfb576-d65c-40f8-a547-90719578e03c", + "name": "Maria Wright", + "alias": "yraymond", + "powers": ["Flight", "Telepathy"], + "first_appearance": "1971-04-15T00:00:00", + "affiliations": ["Avengers", "Teen Titans"], + "origin": "Asgard", + "is_villain": true + }, + { + "_id": 
"a2e7b056-0c79-4a2e-83ff-1774b6e186ea", + "name": "Carlos Burton", + "alias": "rperkins", + "powers": ["Super Speed", "Time Manipulation", "Telekinesis"], + "first_appearance": "1970-01-20T00:00:00", + "affiliations": ["Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { + "_id": "ec7f8d60-3fef-4329-a7d2-6d89805d758c", + "name": "Lindsay Anderson", + "alias": "amycox", + "powers": ["Super Strength", "Telekinesis"], + "first_appearance": "1976-04-30T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "cdc66356-a438-4989-b4d1-315609ec6d91", + "name": "Larry Hensley", + "alias": "ylester", + "powers": ["Super Strength", "Invisibility", "Shapeshifting"], + "first_appearance": "2019-01-21T00:00:00", + "affiliations": ["Guardians of the Galaxy", "Avengers"], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "0952b684-f887-446f-afcb-71d2ace3fd32", + "name": "Sandra Moss", + "alias": "alexandra81", + "powers": ["Telekinesis", "Super Speed"], + "first_appearance": "1989-07-28T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "9a63c787-3b44-46c2-b927-ffdde6ee10bc", + "name": "Cynthia Brown", + "alias": "freed", + "powers": ["Super Strength", "Energy Blasts"], + "first_appearance": "2015-06-19T00:00:00", + "affiliations": ["Fantastic Four"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "2b058c3e-e795-4ecd-b5d7-dba6f1a831f6", + "name": "Brian Vincent", + "alias": "ghowell", + "powers": ["Invisibility", "Flight", "Super Speed"], + "first_appearance": "2012-05-12T00:00:00", + "affiliations": [], + "origin": "Asgard", + "is_villain": false + }, + { + "_id": "7a1e38ae-0bc6-41dd-ad61-e7542e6e9d4f", + "name": "Kevin Humphrey", + "alias": "mary44", + "powers": ["Super Strength", "Super Speed", "Telepathy"], + "first_appearance": "1993-05-10T00:00:00", + "affiliations": ["Justice League", "Teen Titans"], + "origin": "Mutant", + "is_villain": true + }, + { 
+ "_id": "c147036a-ab66-4023-a950-1fb81acf7dca", + "name": "Luis Callahan", + "alias": "ashleyreeves", + "powers": ["Telekinesis"], + "first_appearance": "1943-11-02T00:00:00", + "affiliations": ["X-Men"], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "c42cec2b-156d-481e-993b-aa93637ae76e", + "name": "Micheal Brown", + "alias": "lisa85", + "powers": ["Telepathy", "Flight", "Time Manipulation"], + "first_appearance": "1983-11-04T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "5bd85192-926b-42f3-bc18-afd40a53753e", + "name": "James Sanchez", + "alias": "mary95", + "powers": ["Energy Blasts", "Telekinesis"], + "first_appearance": "1999-05-20T00:00:00", + "affiliations": ["Justice League"], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "4b41e8f8-2cea-4d50-b7b0-ec59fca45367", + "name": "Richard Cooper", + "alias": "james85", + "powers": ["Telekinesis", "Energy Blasts", "Super Speed"], + "first_appearance": "2021-11-27T00:00:00", + "affiliations": ["Justice League", "Fantastic Four"], + "origin": "Mars", + "is_villain": true + }, + { + "_id": "8fd8c7b5-fabd-4021-9aeb-114e64ad06e0", + "name": "Charles Blair", + "alias": "barbara60", + "powers": ["Super Strength"], + "first_appearance": "2012-05-03T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "830eaa54-4397-4344-8964-2abdd7e2d86d", + "name": "Virginia Watts", + "alias": "klane", + "powers": ["Telekinesis"], + "first_appearance": "2016-04-27T00:00:00", + "affiliations": [], + "origin": "Earth", + "is_villain": false + }, + { + "_id": "495f64a9-123e-46d4-9ddb-21692353a849", + "name": "Robert Logan", + "alias": "griffinsean", + "powers": ["Telepathy"], + "first_appearance": "2003-07-16T00:00:00", + "affiliations": [], + "origin": "Krypton", + "is_villain": false + }, + { + "_id": "e3a96aac-bd9f-49f0-a9ea-efa7d6baf3e9", + "name": "Cheyenne Powell", + "alias": "laurenolsen", + "powers": ["Time 
Manipulation", "Energy Blasts"], + "first_appearance": "1964-02-05T00:00:00", + "affiliations": [], + "origin": "Atlantis", + "is_villain": false + }, + { + "_id": "2688321c-f5b0-43c8-b95c-060e748ba73b", + "name": "Benjamin Morris", + "alias": "sierra18", + "powers": ["Telekinesis", "Regeneration", "Shapeshifting"], + "first_appearance": "1964-09-27T00:00:00", + "affiliations": ["X-Men", "Avengers"], + "origin": "Mars", + "is_villain": false + }, + { + "_id": "98c4ca66-c7a7-44ad-ad16-5395905a011e", + "name": "Cristian Oneal", + "alias": "harrellamy", + "powers": ["Super Speed"], + "first_appearance": "1965-01-29T00:00:00", + "affiliations": [], + "origin": "Mutant", + "is_villain": false + }, + { + "_id": "e2999d26-1a93-4355-b04f-44f27a3c7f36", + "name": "Jessica Vargas", + "alias": "chadherrera", + "powers": ["Energy Blasts", "Super Strength", "Telekinesis"], + "first_appearance": "1974-03-29T00:00:00", + "affiliations": ["X-Men", "Teen Titans"], + "origin": "Earth", + "is_villain": true + }, + { + "_id": "f3fa712d-2124-433a-b405-c02757fa1503", + "name": "Angelica Stein", + "alias": "reedjason", + "powers": ["Invisibility"], + "first_appearance": "1981-01-02T00:00:00", + "affiliations": ["Avengers"], + "origin": "Earth", + "is_villain": true + } +] diff --git a/tests/accuracy/test-data-dumps/mflix.movies.json b/tests/accuracy/test-data-dumps/mflix.movies.json new file mode 100644 index 00000000..3c492185 --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.movies.json @@ -0,0 +1,496 @@ +[ + { + "_id": "bf96c9f7-17be-467c-9f5e-3f19dc2e9ed4", + "title": "Human sell", + "release_year": 1993, + "genres": ["Sci-Fi"], + "director": "Christina Collins", + "cast": ["Jeremy Marks", "Matthew Moore", "Erica Miller", "Beth Morales"], + "runtime": 139, + "rating": 9.3 + }, + { + "_id": "ab338dcb-c541-4d39-ba3d-58e4ebcac16c", + "title": "Trial we much", + "release_year": 2020, + "genres": ["Horror", "Comedy"], + "director": "Steven Miles", + "cast": [ + "Patrick Huynh", + 
"Darrell Thompson", + "Lindsay Thompson", + "Brandi Cooper" + ], + "runtime": 149, + "rating": 5.0 + }, + { + "_id": "2bd3ed9f-cbeb-4c44-bec7-01d51c3dd7db", + "title": "Someone", + "release_year": 1996, + "genres": ["Action", "Horror"], + "director": "Steven Miles", + "cast": [ + "Carrie Cummings", + "Patricia Rice", + "Suzanne Collins", + "April Murray", + "Kimberly Shaw" + ], + "runtime": 153, + "rating": 2.6 + }, + { + "_id": "fb35d6f3-bda5-450f-8873-56e035e76c42", + "title": "Without our", + "release_year": 2012, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": [ + "Rodney Gray", + "Mr. Joseph Allen", + "Heather Robles", + "Eric Edwards", + "James Wilson" + ], + "runtime": 143, + "rating": 9.1 + }, + { + "_id": "4b0d5f7a-c551-4995-aece-a5a585d238a7", + "title": "Cost anything", + "release_year": 2002, + "genres": ["Romance", "Action"], + "director": "Bryan Andrews", + "cast": ["Gregory Mullins", "Jillian Arroyo", "Angela Reed"], + "runtime": 112, + "rating": 3.8 + }, + { + "_id": "797e4ee5-eff4-45f4-a0d7-40f62f7bd138", + "title": "Hold green energy their", + "release_year": 1989, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": [ + "Eduardo Carey", + "Jodi Miller", + "Ronald Johnson", + "Lindsay Hernandez" + ], + "runtime": 126, + "rating": 7.4 + }, + { + "_id": "1b81c45b-1d09-47dc-871f-ace109107446", + "title": "Choose ability start", + "release_year": 1990, + "genres": ["Drama", "Comedy"], + "director": "Bryan Andrews", + "cast": [ + "Tyler Daniels", + "Gregory Harris", + "Whitney Swanson", + "Pamela Ramirez" + ], + "runtime": 141, + "rating": 5.6 + }, + { + "_id": "400a08be-f07b-416a-8cdc-46c9886b812b", + "title": "Cover perhaps", + "release_year": 2022, + "genres": ["Drama"], + "director": "Daniel Wallace", + "cast": ["Victoria Price", "Holly Ross", "Michele Jones"], + "runtime": 173, + "rating": 4.3 + }, + { + "_id": "4d4b5420-83e1-4ecd-9c86-238394a1fd0f", + "title": "Policy particularly", + "release_year": 2003, + 
"genres": ["Comedy"], + "director": "Brittany Parker", + "cast": ["Emily Haynes", "Crystal Johnson", "Ernest Jones"], + "runtime": 154, + "rating": 6.6 + }, + { + "_id": "9a489559-ab9d-4dbb-b3e7-d65895b27704", + "title": "Store care", + "release_year": 2017, + "genres": ["Romance", "Sci-Fi"], + "director": "Sara Stewart", + "cast": [ + "Katherine Matthews", + "Stacey Wolf", + "Laurie Blackwell", + "Luis Ortiz", + "Christopher Vasquez" + ], + "runtime": 168, + "rating": 7.7 + }, + { + "_id": "99e75e60-6466-4314-92c3-00c433a06600", + "title": "Section close bad", + "release_year": 2024, + "genres": ["Drama", "Comedy"], + "director": "Bryan Andrews", + "cast": [ + "Heather Marshall", + "Alexander Austin", + "Stephanie Villarreal MD", + "Ryan Marquez" + ], + "runtime": 180, + "rating": 7.7 + }, + { + "_id": "726d0c12-4bab-4684-b8e4-5ba795c88273", + "title": "Become stand", + "release_year": 2001, + "genres": ["Sci-Fi", "Thriller"], + "director": "Brian Martinez", + "cast": ["Robert Ross", "Kimberly Williamson", "Pam Wyatt"], + "runtime": 162, + "rating": 1.5 + }, + { + "_id": "aad23b4b-ddb9-48bd-9b48-b63da1874bb0", + "title": "I case", + "release_year": 2012, + "genres": ["Drama", "Comedy"], + "director": "Brittany Parker", + "cast": [ + "Justin Davis", + "Karen Doyle", + "Daniel Jackson", + "Courtney Mcdonald" + ], + "runtime": 122, + "rating": 3.1 + }, + { + "_id": "0d1ce099-18f1-4608-9c5b-5eb8b5870760", + "title": "No organization style", + "release_year": 2013, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": ["Benjamin Whitney", "Joseph Bush", "Barbara Griffin"], + "runtime": 167, + "rating": 9.6 + }, + { + "_id": "15855c7b-ece2-4238-b995-57f6207509ea", + "title": "Computer garden", + "release_year": 2012, + "genres": ["Horror"], + "director": "Steven Miles", + "cast": ["Darlene Lee", "Tina Wang", "Nathan Mayo"], + "runtime": 146, + "rating": 6.5 + }, + { + "_id": "e8a6ff98-1e7e-4481-a467-39ebbfc79f67", + "title": "Trip information feel", + 
"release_year": 2008, + "genres": ["Action", "Thriller"], + "director": "Brittany Parker", + "cast": ["Kelly Walsh", "Michael Rocha"], + "runtime": 148, + "rating": 9.8 + }, + { + "_id": "ef95e7a5-7f73-462e-bd03-c924a8876a7b", + "title": "It project low part", + "release_year": 1992, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": [ + "Sheena Murphy", + "Amanda Miller", + "Erica Curtis", + "Roger Jones", + "Andrew Simpson" + ], + "runtime": 161, + "rating": 2.4 + }, + { + "_id": "efd2f4f4-1004-4b4e-8bc9-390466a6f77a", + "title": "Near attorney discuss", + "release_year": 1983, + "genres": ["Comedy"], + "director": "Christina Collins", + "cast": [ + "Chase Myers", + "Benjamin Kelly", + "Thomas Summers MD", + "Jessica Woods" + ], + "runtime": 174, + "rating": 9.5 + }, + { + "_id": "07f2cb6e-819e-4ff4-b3ba-134d3d9af549", + "title": "Whether know", + "release_year": 2009, + "genres": ["Comedy", "Thriller"], + "director": "Bryan Andrews", + "cast": ["Amy Reed", "William Williams", "Steven Lawrence"], + "runtime": 134, + "rating": 9.6 + }, + { + "_id": "ab5948c9-088b-42d6-89d9-42c4603c8b19", + "title": "Against place", + "release_year": 2017, + "genres": ["Drama", "Romance"], + "director": "Daniel Wallace", + "cast": [ + "Brittany Thompson", + "Clinton Bishop", + "Terri Meyer", + "Stacey Phillips", + "Alexander Hunt" + ], + "runtime": 152, + "rating": 5.0 + }, + { + "_id": "ef7f63fa-b25f-4aea-98e2-d7bdecc26ef5", + "title": "Return yard", + "release_year": 1994, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": ["Mason Lara", "Taylor Salinas", "Tim Foster", "Erin Sharp"], + "runtime": 99, + "rating": 8.8 + }, + { + "_id": "b532e3c8-6292-4f9d-879f-1f070b1a6992", + "title": "Certain fish", + "release_year": 2009, + "genres": ["Romance"], + "director": "Steven Miles", + "cast": [ + "Jonathan King", + "Caitlyn Costa DDS", + "Steve Davis", + "Perry Anderson" + ], + "runtime": 130, + "rating": 8.6 + }, + { + "_id": 
"c95e74b0-e47e-4d10-b847-8caa20b94b32", + "title": "Agreement like program", + "release_year": 2004, + "genres": ["Sci-Fi"], + "director": "Daniel Jackson", + "cast": [ + "Ashley Green", + "Rebecca Osborne", + "Robert Williams", + "Breanna Dunn", + "Philip Vargas" + ], + "runtime": 110, + "rating": 8.1 + }, + { + "_id": "791688be-4358-45ab-956e-71fe3fd35d19", + "title": "Floor seven then", + "release_year": 2009, + "genres": ["Horror"], + "director": "Daniel Wallace", + "cast": ["Dustin Wright", "Crystal Young"], + "runtime": 143, + "rating": 4.8 + }, + { + "_id": "488fd79d-dde6-4462-9b90-339d1f3d7474", + "title": "Like rather paper", + "release_year": 2006, + "genres": ["Drama"], + "director": "Spencer Gillespie", + "cast": ["Sean Moyer", "James Edwards", "Tara Lee", "Robert Scott"], + "runtime": 175, + "rating": 9.1 + }, + { + "_id": "3da68e4d-ef14-4fab-9243-19075262e5ca", + "title": "Argue hospital", + "release_year": 1994, + "genres": ["Romance", "Sci-Fi"], + "director": "Amanda Young", + "cast": [ + "Carolyn Williams", + "Jasmin Sampson", + "Phillip Levy", + "Brenda Clark", + "Lauren Perry" + ], + "runtime": 149, + "rating": 9.5 + }, + { + "_id": "f5206a16-4dca-4c1e-b3aa-0d09f2082601", + "title": "Become after card", + "release_year": 1986, + "genres": ["Sci-Fi", "Horror"], + "director": "Brian Martinez", + "cast": ["Rhonda Ochoa", "Charlene Castillo"], + "runtime": 100, + "rating": 8.5 + }, + { + "_id": "fbf30e42-ae6d-4775-bb3e-c5c127ddea06", + "title": "Born authority attention", + "release_year": 1994, + "genres": ["Romance"], + "director": "Brian Martinez", + "cast": ["Matthew Thomas", "Carly Perkins"], + "runtime": 131, + "rating": 4.9 + }, + { + "_id": "4b85a220-8a09-46a7-bea3-a2dad8130311", + "title": "Local seven media", + "release_year": 1998, + "genres": ["Sci-Fi", "Drama"], + "director": "Amanda Young", + "cast": ["Jessica Perez", "Larry Atkinson"], + "runtime": 95, + "rating": 2.0 + }, + { + "_id": "498597d2-3254-46ef-a800-f322a86fbd55", + "title": 
"Keep employee", + "release_year": 1981, + "genres": ["Horror"], + "director": "Christina Collins", + "cast": ["Alexis Carlson", "Andrew Stewart"], + "runtime": 161, + "rating": 6.0 + }, + { + "_id": "788d9343-6908-4762-88ee-b04aba1e58b5", + "title": "American question generation", + "release_year": 1986, + "genres": ["Romance"], + "director": "Daniel Jackson", + "cast": ["Troy Carter", "Peter Hernandez", "Christine Brown"], + "runtime": 176, + "rating": 8.0 + }, + { + "_id": "74bcf255-df91-40c0-85c0-d7b85ff84f9a", + "title": "Maintain out", + "release_year": 2000, + "genres": ["Sci-Fi", "Action"], + "director": "Brian Martinez", + "cast": ["Nancy Evans", "Michael Gill", "Justin Carroll"], + "runtime": 179, + "rating": 10.0 + }, + { + "_id": "61ddf1d4-17b7-4c63-9bf4-5315e740dc7f", + "title": "Ten box study", + "release_year": 2011, + "genres": ["Horror", "Romance"], + "director": "Steven Miles", + "cast": [ + "Mark Hicks", + "Michelle Dean", + "John Buchanan", + "Veronica Johnson" + ], + "runtime": 147, + "rating": 2.5 + }, + { + "_id": "ab7d8067-f0ff-4955-bc0c-baca4e56e9a4", + "title": "Production operation", + "release_year": 2014, + "genres": ["Horror", "Romance"], + "director": "Sara Stewart", + "cast": ["Ashley Mata", "Mark Kelly", "John West", "Harold Day"], + "runtime": 125, + "rating": 4.1 + }, + { + "_id": "ccd27288-a496-447d-b01c-1f0b42edcc92", + "title": "What language", + "release_year": 2004, + "genres": ["Sci-Fi"], + "director": "Sara Stewart", + "cast": [ + "Scott Mckenzie", + "Jason Lee", + "Nathan Gardner", + "Jamie Greene", + "Angela Garner" + ], + "runtime": 177, + "rating": 3.7 + }, + { + "_id": "b32dd176-938b-4ded-823a-311423fdc2ea", + "title": "Up usually central", + "release_year": 2011, + "genres": ["Sci-Fi", "Comedy"], + "director": "Daniel Jackson", + "cast": ["Jennifer Carlson", "Jonathan Stewart DDS", "Amy Lester"], + "runtime": 159, + "rating": 5.6 + }, + { + "_id": "4aa5f384-3a05-49ff-aa9d-a0e4256c422f", + "title": "For boy only", + 
"release_year": 1987, + "genres": ["Thriller", "Action"], + "director": "Sara Stewart", + "cast": ["Gene Smith", "Robert Osborne Jr.", "Laura Fox", "Alexis Lowe"], + "runtime": 95, + "rating": 3.6 + }, + { + "_id": "1c858ca4-d6e9-435c-8e25-d8b05a4e825c", + "title": "Site win including your", + "release_year": 2008, + "genres": ["Sci-Fi"], + "director": "Spencer Gillespie", + "cast": [ + "John Williams", + "Jason Huang", + "Karen Klein", + "Gary Tran", + "Jessica Murphy" + ], + "runtime": 178, + "rating": 6.2 + }, + { + "_id": "bc5e5766-e998-4ec2-a40c-62ce5d39b972", + "title": "Sell huge hair", + "release_year": 1997, + "genres": ["Thriller", "Action"], + "director": "Bryan Andrews", + "cast": ["Thomas Johnson", "Ryan Morrow"], + "runtime": 157, + "rating": 4.4 + }, + { + "_id": "090215c8-29e8-4d38-ae9b-ceb78408b982", + "title": "Guy rest", + "release_year": 1997, + "genres": ["Sci-Fi", "Horror"], + "director": "Steven Miles", + "cast": ["Michael Fox", "Tyler Acosta", "Tracy Adams"], + "runtime": 122, + "rating": 7.8 + } +] diff --git a/tests/accuracy/test-data-dumps/mflix.shows.json b/tests/accuracy/test-data-dumps/mflix.shows.json new file mode 100644 index 00000000..2edc7fa7 --- /dev/null +++ b/tests/accuracy/test-data-dumps/mflix.shows.json @@ -0,0 +1,572 @@ +[ + { + "_id": "b586e37c-6b32-417d-a53c-2a4c1121b11b", + "title": "Object-based analyzing architecture", + "seasons": 8, + "episodes": 62, + "platform": "Amazon Prime", + "genres": ["Comedy"], + "cast": [ + "Roger Gomez", + "Sandra Williams", + "Matthew Rodriguez", + "Scott Brown", + "Kristie Horn", + "Nicole Avila" + ], + "start_year": 2014, + "end_year": null + }, + { + "_id": "c28471ea-336f-4060-9b18-0bbff3de6622", + "title": "Customer-focused encompassing architecture", + "seasons": 4, + "episodes": 108, + "platform": "Hulu", + "genres": ["Thriller"], + "cast": ["Joseph Holmes", "Patrick Smith", "Charles Delacruz"], + "start_year": 2001, + "end_year": null + }, + { + "_id": 
"93f0969b-2377-4531-9c4e-45d2593015cd", + "title": "User-centric background approach", + "seasons": 6, + "episodes": 49, + "platform": "HBO", + "genres": ["Comedy", "Documentary"], + "cast": [ + "Jason Castillo", + "Jessica Burke", + "Philip Lewis", + "Philip Goodman", + "Corey Lee" + ], + "start_year": 2016, + "end_year": 2018 + }, + { + "_id": "a0b76db0-99a1-49fe-a5ea-fe802a66bde9", + "title": "Networked directional budgetary management", + "seasons": 5, + "episodes": 23, + "platform": "Amazon Prime", + "genres": ["Comedy", "Thriller"], + "cast": ["Mark Allen", "Anthony Snyder", "Kimberly Jones"], + "start_year": 2002, + "end_year": null + }, + { + "_id": "fbdef9b9-1ad4-4a6b-a39a-2e0b90423cb5", + "title": "Enterprise-wide dynamic intranet", + "seasons": 1, + "episodes": 12, + "platform": "Amazon Prime", + "genres": ["Crime", "Documentary"], + "cast": ["Matthew Green", "Kelly Wright", "Tonya Sullivan", "Daniel Brown"], + "start_year": 2009, + "end_year": 2020 + }, + { + "_id": "db54ab5c-bf6b-48ea-8272-1b1a4a76b848", + "title": "Exclusive real-time access", + "seasons": 10, + "episodes": 76, + "platform": "Amazon Prime", + "genres": ["Drama"], + "cast": ["Stacey Shaw", "Zachary Steele", "Laurie Martinez"], + "start_year": 2011, + "end_year": 2020 + }, + { + "_id": "53869b62-c8c7-48b3-86c9-17c935b43ff6", + "title": "Persevering leadingedge application", + "seasons": 5, + "episodes": 73, + "platform": "HBO", + "genres": ["Thriller"], + "cast": ["Diane Boyd", "Anna Rubio", "Cheryl Fisher", "Tyler Villa"], + "start_year": 2008, + "end_year": 2020 + }, + { + "_id": "3be07c4d-5275-4181-b2f6-5b1a1e46aa7b", + "title": "Multi-lateral analyzing model", + "seasons": 2, + "episodes": 114, + "platform": "Amazon Prime", + "genres": ["Fantasy"], + "cast": [ + "Kathleen Marshall", + "Kimberly Quinn", + "Steven Parker", + "Adrienne Green", + "Justin Hughes", + "Jean Smith" + ], + "start_year": 2017, + "end_year": 2023 + }, + { + "_id": "50cb455b-5ec0-4e68-8601-43e58defb762", + 
"title": "User-centric tangible monitoring", + "seasons": 3, + "episodes": 55, + "platform": "Disney+", + "genres": ["Drama"], + "cast": [ + "Barbara Clark", + "Carolyn Scott", + "Timothy Reed", + "Cory Burton", + "Jacob Hill" + ], + "start_year": 2006, + "end_year": 2012 + }, + { + "_id": "bab2dba4-88bd-4b24-afce-8781eb280d53", + "title": "Persevering background monitoring", + "seasons": 4, + "episodes": 61, + "platform": "Amazon Prime", + "genres": ["Comedy", "Fantasy"], + "cast": ["Adam Lin", "Evan Smith", "Christine Howard", "Ruben Hopkins"], + "start_year": 2006, + "end_year": 2023 + }, + { + "_id": "518f2ad9-bb65-4228-8d4c-7a62b9f88599", + "title": "Cross-group intangible architecture", + "seasons": 1, + "episodes": 90, + "platform": "HBO", + "genres": ["Comedy"], + "cast": [ + "Eric Ryan", + "Ashley Ball", + "Douglas Barton", + "Brian Whitehead", + "Michael Greer" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "d5f9304d-567d-4335-b43c-ec4034d7009f", + "title": "Programmable bottom-line monitoring", + "seasons": 10, + "episodes": 69, + "platform": "Hulu", + "genres": ["Documentary", "Fantasy"], + "cast": [ + "Mrs. 
Olivia Booth", + "William Murphy", + "Patricia Payne", + "Lisa Estes", + "Jason Martin", + "Jeff Greene" + ], + "start_year": 2011, + "end_year": 2024 + }, + { + "_id": "27718a30-6e42-47ad-8adf-1533b9b8a419", + "title": "Multi-lateral multi-tasking contingency", + "seasons": 3, + "episodes": 89, + "platform": "Disney+", + "genres": ["Crime"], + "cast": ["Elizabeth Lambert", "Corey Hughes", "Melissa Stephens"], + "start_year": 2006, + "end_year": null + }, + { + "_id": "defc7620-3b4e-46ff-a949-bec1af753812", + "title": "Focused zero administration migration", + "seasons": 9, + "episodes": 73, + "platform": "Disney+", + "genres": ["Documentary", "Drama"], + "cast": ["Shane Richardson", "Lisa Cooper", "Samantha Perkins"], + "start_year": 2008, + "end_year": null + }, + { + "_id": "9d6781fb-d095-4a00-932d-3f1fac1b0049", + "title": "Horizontal methodical encoding", + "seasons": 8, + "episodes": 40, + "platform": "Netflix", + "genres": ["Crime"], + "cast": ["Patricia Barrett", "Scott Gonzalez", "Michaela Johnson"], + "start_year": 2006, + "end_year": null + }, + { + "_id": "ac19b1b1-2bf9-4093-83fa-60411aa3f80f", + "title": "Enterprise-wide analyzing product", + "seasons": 8, + "episodes": 61, + "platform": "Hulu", + "genres": ["Drama"], + "cast": ["Christie Waters", "Casey Allen", "Nicole Frank"], + "start_year": 2001, + "end_year": 2005 + }, + { + "_id": "2dfd2240-dc9f-439f-9e06-b1ec8de397bf", + "title": "Compatible well-modulated extranet", + "seasons": 10, + "episodes": 89, + "platform": "Hulu", + "genres": ["Drama"], + "cast": [ + "Pedro Butler", + "Christian Hall", + "Dawn Gregory", + "Shannon Russell", + "Omar Mullins", + "Ian Ramos" + ], + "start_year": 2012, + "end_year": 2013 + }, + { + "_id": "94db1534-7163-430e-83e3-6a75bc6aec0f", + "title": "User-centric tangible infrastructure", + "seasons": 5, + "episodes": 11, + "platform": "Hulu", + "genres": ["Drama"], + "cast": [ + "Deborah Garcia", + "Michelle Barajas", + "Melissa Reynolds", + "Douglas Wilson" + ], + 
"start_year": 2001, + "end_year": null + }, + { + "_id": "65b2213f-a606-42d8-b845-0199ba2e9b82", + "title": "Inverse optimal circuit", + "seasons": 1, + "episodes": 29, + "platform": "Amazon Prime", + "genres": ["Fantasy", "Documentary"], + "cast": [ + "Grace Rodriguez", + "Alison Greene", + "Michael Allen", + "Steven Hayden" + ], + "start_year": 2013, + "end_year": null + }, + { + "_id": "5a8a2745-e57c-4086-aa09-84131f40149f", + "title": "Public-key discrete alliance", + "seasons": 9, + "episodes": 111, + "platform": "Disney+", + "genres": ["Documentary"], + "cast": [ + "Emily Irwin", + "Olivia Gibson", + "Jean Hernandez", + "Michael Cummings" + ], + "start_year": 2013, + "end_year": 2022 + }, + { + "_id": "51326558-2080-4615-a583-b4f2fbd15600", + "title": "Managed zero administration groupware", + "seasons": 8, + "episodes": 108, + "platform": "Hulu", + "genres": ["Drama", "Crime"], + "cast": [ + "Karen Phillips", + "Kelly Marsh", + "Daniel Hamilton", + "Abigail Smith" + ], + "start_year": 2018, + "end_year": 2019 + }, + { + "_id": "87a2cd5f-75ee-4650-b2a4-a56384c97137", + "title": "Reverse-engineered static initiative", + "seasons": 6, + "episodes": 66, + "platform": "Amazon Prime", + "genres": ["Crime", "Documentary"], + "cast": [ + "Bradley Chavez", + "Catherine Horn", + "Joseph Bryant", + "Tara Rodriguez" + ], + "start_year": 2003, + "end_year": 2006 + }, + { + "_id": "0f647458-d09f-4be8-b1dc-49be1ba1e104", + "title": "Fundamental tangible matrices", + "seasons": 9, + "episodes": 22, + "platform": "Hulu", + "genres": ["Drama"], + "cast": ["Eric Lee", "Patrick Estrada", "Kelsey Brown", "Jeffrey Lewis"], + "start_year": 2001, + "end_year": null + }, + { + "_id": "53d34237-0e86-4a5e-922b-0589c2e65458", + "title": "Self-enabling homogeneous infrastructure", + "seasons": 5, + "episodes": 35, + "platform": "Hulu", + "genres": ["Crime"], + "cast": [ + "Chad Torres", + "Mark Williams", + "Terry Mcguire", + "Kathleen Cantu", + "Harold Knapp" + ], + "start_year": 2006, 
+ "end_year": null + }, + { + "_id": "71cc1515-ba84-4df6-92db-55af3cfa91f0", + "title": "Horizontal web-enabled application", + "seasons": 2, + "episodes": 94, + "platform": "Netflix", + "genres": ["Thriller", "Fantasy"], + "cast": [ + "Catherine Davila", + "Jessica James", + "Cory Miller", + "Alexis Sanchez", + "Andrew Miller" + ], + "start_year": 2002, + "end_year": 2017 + }, + { + "_id": "200556f7-10c6-4414-83f7-24ef74bff12a", + "title": "User-friendly bi-directional data-warehouse", + "seasons": 2, + "episodes": 87, + "platform": "Hulu", + "genres": ["Drama", "Fantasy"], + "cast": [ + "Tiffany Brown", + "Christina Morales", + "Samuel Blake", + "Stephanie Johnson", + "Wesley Deleon" + ], + "start_year": 2020, + "end_year": null + }, + { + "_id": "613832c9-5307-4c80-9dde-3eab4e5aa770", + "title": "Pre-emptive leadingedge capacity", + "seasons": 5, + "episodes": 56, + "platform": "Netflix", + "genres": ["Comedy"], + "cast": ["James Durham", "Jessica Myers", "Rachel King"], + "start_year": 2005, + "end_year": null + }, + { + "_id": "f9cb1076-3eaf-41d2-84df-057d27c1a544", + "title": "Fundamental intangible contingency", + "seasons": 4, + "episodes": 99, + "platform": "Disney+", + "genres": ["Crime", "Fantasy"], + "cast": [ + "Robert Foster", + "Jill Barton", + "Kimberly Simmons", + "Tracey Gomez" + ], + "start_year": 2017, + "end_year": 2020 + }, + { + "_id": "f96b112f-943e-43cd-90f0-56725cfa7e59", + "title": "Diverse asymmetric forecast", + "seasons": 9, + "episodes": 24, + "platform": "Amazon Prime", + "genres": ["Drama", "Crime"], + "cast": [ + "Carl Johnson", + "Douglas Beck", + "Kevin Guerra", + "Taylor Wilson", + "Eric Jarvis", + "Sarah Charles MD" + ], + "start_year": 2007, + "end_year": null + }, + { + "_id": "78eb682f-a03d-4cbf-bbfc-0e899e5f50d0", + "title": "Profit-focused solution-oriented Graphical User Interface", + "seasons": 10, + "episodes": 117, + "platform": "HBO", + "genres": ["Crime", "Fantasy"], + "cast": ["Carol Miller", "Jennifer Bass", 
"Melanie Leblanc"], + "start_year": 2002, + "end_year": null + }, + { + "_id": "ebb6d3c9-3c98-4799-94bc-aadd0bf2974c", + "title": "Reduced leadingedge system engine", + "seasons": 1, + "episodes": 58, + "platform": "Hulu", + "genres": ["Crime", "Drama"], + "cast": [ + "James Warren", + "Kelly Carter", + "Sarah Jones", + "Aaron Castaneda", + "Katherine Manning" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "4ffd32a7-0bf4-4c95-a7c8-19002c2eb83c", + "title": "Switchable 24/7 website", + "seasons": 6, + "episodes": 71, + "platform": "Netflix", + "genres": ["Documentary"], + "cast": [ + "Sarah Brown", + "Patrick Beck", + "Angela Herrera MD", + "Steven Mcconnell" + ], + "start_year": 2018, + "end_year": null + }, + { + "_id": "37267325-4337-4912-992f-a162f9014569", + "title": "Synergized asymmetric adapter", + "seasons": 4, + "episodes": 16, + "platform": "Hulu", + "genres": ["Fantasy"], + "cast": ["Gabrielle Meyer", "Madison Matthews", "Taylor Martinez"], + "start_year": 2010, + "end_year": null + }, + { + "_id": "ea2abd77-c7da-443e-89fd-6f410f5d697e", + "title": "Extended contextually-based customer loyalty", + "seasons": 1, + "episodes": 79, + "platform": "Hulu", + "genres": ["Fantasy"], + "cast": ["Michael Lewis", "Cassandra Hicks", "Sydney Garcia"], + "start_year": 2015, + "end_year": 2023 + }, + { + "_id": "b568dd56-c083-4431-a740-4f4b5f4e1b21", + "title": "Versatile grid-enabled application", + "seasons": 7, + "episodes": 82, + "platform": "Hulu", + "genres": ["Crime", "Fantasy"], + "cast": ["Keith Brown", "Annette Johnson", "Joseph Carroll", "Derek Lewis"], + "start_year": 2006, + "end_year": 2008 + }, + { + "_id": "b6f2e1c3-6915-4e02-b1c2-44b5bec8fd68", + "title": "Operative optimizing encryption", + "seasons": 2, + "episodes": 52, + "platform": "Amazon Prime", + "genres": ["Fantasy", "Drama"], + "cast": [ + "Garrett Mcgrath", + "Craig Jackson", + "Michael Sullivan", + "Andrew Boyer" + ], + "start_year": 2011, + "end_year": null + }, + { + 
"_id": "51c225d5-aa67-4b14-aca5-33757cef6bf4", + "title": "Business-focused 24/7 collaboration", + "seasons": 1, + "episodes": 113, + "platform": "Netflix", + "genres": ["Thriller", "Comedy"], + "cast": ["Matthew Hill", "Andrew White", "Grant Young", "John Mathews"], + "start_year": 2015, + "end_year": 2020 + }, + { + "_id": "7465e69f-341e-4234-8ffb-400622442a40", + "title": "Organized bi-directional application", + "seasons": 3, + "episodes": 40, + "platform": "Netflix", + "genres": ["Comedy"], + "cast": [ + "Matthew Gordon", + "Mark Allen", + "Amanda Webb", + "Jeffrey Horton", + "Sheila Lewis", + "Marcus Gilbert" + ], + "start_year": 2011, + "end_year": null + }, + { + "_id": "90570eac-f923-4c30-a5b0-661b28a8e4a5", + "title": "Configurable bottom-line success", + "seasons": 10, + "episodes": 106, + "platform": "HBO", + "genres": ["Fantasy", "Drama"], + "cast": [ + "Elizabeth Taylor", + "Melissa Mullins", + "Alan Nguyen", + "Carolyn Kidd", + "Michael Pope" + ], + "start_year": 2015, + "end_year": null + }, + { + "_id": "06d70791-5487-4dab-8b84-a91b3376e396", + "title": "Organic dedicated analyzer", + "seasons": 3, + "episodes": 88, + "platform": "HBO", + "genres": ["Thriller", "Drama"], + "cast": ["Amy Aguilar", "James Williams", "Kevin Kirby"], + "start_year": 2010, + "end_year": 2025 + } +] diff --git a/tests/accuracy/updateMany.test.ts b/tests/accuracy/updateMany.test.ts new file mode 100644 index 00000000..12b36f89 --- /dev/null +++ b/tests/accuracy/updateMany.test.ts @@ -0,0 +1,42 @@ +import { describeAccuracyTests } from "./sdk/describeAccuracyTests.js"; +import { Matcher } from "./sdk/matcher.js"; + +describeAccuracyTests([ + { + prompt: "Update all the documents in 'mflix.movies' namespace with a new field 'new_field' set to 1", + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + update: { + $set: { + new_field: 1, + }, + }, + upsert: Matcher.anyOf(Matcher.undefined, Matcher.boolean()), + }, 
+ }, + ], + }, + { + prompt: "Update all the documents in 'mflix.movies' namespace, where runtime is less than 100, with a new field 'new_field' set to 1", + expectedToolCalls: [ + { + toolName: "update-many", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + update: { + $set: { + new_field: 1, + }, + }, + upsert: Matcher.anyOf(Matcher.undefined, Matcher.boolean()), + }, + }, + ], + }, +]); diff --git a/tests/integration/tools/mongodb/mongodbHelpers.ts b/tests/integration/tools/mongodb/mongodbHelpers.ts index 86ecdd70..05cee212 100644 --- a/tests/integration/tools/mongodb/mongodbHelpers.ts +++ b/tests/integration/tools/mongodb/mongodbHelpers.ts @@ -2,13 +2,38 @@ import { MongoCluster } from "mongodb-runner"; import path from "path"; import { fileURLToPath } from "url"; import fs from "fs/promises"; -import { MongoClient, ObjectId } from "mongodb"; +import { Document, MongoClient, ObjectId } from "mongodb"; import { getResponseContent, IntegrationTest, setupIntegrationTest, defaultTestConfig } from "../../helpers.js"; import { UserConfig } from "../../../../src/common/config.js"; import { afterAll, afterEach, beforeAll, beforeEach, describe, expect, it } from "vitest"; const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const testDataDumpPath = path.join(__dirname, "..", "..", "..", "accuracy", "test-data-dumps"); + +const testDataPaths = [ + { + db: "comics", + collection: "books", + path: path.join(testDataDumpPath, "comics.books.json"), + }, + { + db: "comics", + collection: "characters", + path: path.join(testDataDumpPath, "comics.characters.json"), + }, + { + db: "mflix", + collection: "movies", + path: path.join(testDataDumpPath, "mflix.movies.json"), + }, + { + db: "mflix", + collection: "shows", + path: path.join(testDataDumpPath, "mflix.shows.json"), + }, +]; + interface MongoDBIntegrationTest { mongoClient: () => MongoClient; connectionString: () => string; @@ -170,3 +195,41 @@ export function 
validateAutoConnectBehavior( }); }); } + +export function prepareTestData(integration: MongoDBIntegrationTest) { + const NON_TEST_DBS = ["admin", "config", "local"]; + const testData: { + db: string; + collection: string; + data: Document[]; + }[] = []; + + beforeAll(async () => { + for (const { db, collection, path } of testDataPaths) { + testData.push({ + db, + collection, + data: JSON.parse(await fs.readFile(path, "utf8")) as Document[], + }); + } + }); + + return { + async populateTestData(this: void) { + const client = integration.mongoClient(); + for (const { db, collection, data } of testData) { + await client.db(db).collection(collection).insertMany(data); + } + }, + async cleanupTestDatabases(this: void, integration: MongoDBIntegrationTest) { + const client = integration.mongoClient(); + const admin = client.db().admin(); + const databases = await admin.listDatabases(); + await Promise.all( + databases.databases + .filter(({ name }) => !NON_TEST_DBS.includes(name)) + .map(({ name }) => client.db(name).dropDatabase()) + ); + }, + }; +} diff --git a/tests/integration/tools/mongodb/read/find.test.ts b/tests/integration/tools/mongodb/read/find.test.ts index 5aa378c8..fef79793 100644 --- a/tests/integration/tools/mongodb/read/find.test.ts +++ b/tests/integration/tools/mongodb/read/find.test.ts @@ -34,7 +34,7 @@ describeWithMongoDB("find tool", (integration) => { { name: "sort", description: - "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort()", + "A document, describing the sort order, matching the syntax of the sort argument of cursor.sort(). 
The keys of the object are the fields to sort on, while the values are the sort directions (1 for ascending, -1 for descending).", type: "object", required: false, }, diff --git a/tests/unit/accuracyScorer.test.ts b/tests/unit/accuracyScorer.test.ts new file mode 100644 index 00000000..231251b7 --- /dev/null +++ b/tests/unit/accuracyScorer.test.ts @@ -0,0 +1,390 @@ +import { describe, expect, it } from "vitest"; +import { calculateToolCallingAccuracy } from "../accuracy/sdk/accuracyScorer.js"; +import { ExpectedToolCall, LLMToolCall } from "../accuracy/sdk/accuracyResultStorage/resultStorage.js"; +import { Matcher } from "../accuracy/sdk/matcher.js"; + +describe("calculateToolCallingAccuracy", () => { + describe("edge cases", () => { + it("should return 1 when both expected and actual are empty", () => { + const result = calculateToolCallingAccuracy([], []); + expect(result).toBe(1); + }); + + it("should return 0.75 when expected is empty but actual has tool calls", () => { + const actualToolCalls: LLMToolCall[] = [{ toolCallId: "1", toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy([], actualToolCalls); + expect(result).toBe(0.75); + }); + + it("should return 0 when expected has tool calls but actual is empty", () => { + const expectedToolCalls: ExpectedToolCall[] = [{ toolName: "find", parameters: { db: "test" } }]; + const result = calculateToolCallingAccuracy(expectedToolCalls, []); + expect(result).toBe(0); + }); + }); + + describe("perfect matches", () => { + it("should return 1 for exact match with nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { age: { $gte: 18 }, status: "active" } }, + }, + ]; + const result = 
calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for exact match with multiple diverse tool calls", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + { toolCallId: "3", toolName: "count", parameters: { db: "test", collection: "products" } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + }); + + describe("additional parameters", () => { + it("should return 0 when tool call has additional nested parameters (default behavior)", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + db: "test", + collection: "users", + filter: { status: "active", age: { $gte: 18 } }, + limit: 10, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 1 when expected has no filter but actual has empty filter", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + 
parameters: { + database: "mflix", + collection: "movies", + filter: {}, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 when expected has no filter and actual has no filter", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 0 when expected has no filter but actual has non-empty filter", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: Matcher.emptyObjectOrUndefined, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { genre: "Horror" }, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when there are additional nested fields", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 }, genre: "Horror" }, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 1 when ignored additional fields are provided", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { + database: "mflix", 
+ collection: "movies", + filter: { runtime: { $lt: 100 } }, + limit: Matcher.number(), + sort: Matcher.anyValue, + }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { + database: "mflix", + collection: "movies", + filter: { runtime: { $lt: 100 } }, + limit: 10, + sort: { title: 1 }, + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for array where additional elements are allowed", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }, Matcher.anyOf(Matcher.undefined, Matcher.anyValue)], + }, + }, + ]; + + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }, { $sort: { title: 1 } }], + }, + }, + ]; + + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + + it("should return 1 for array where additional elements are allowed but not provided", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }, Matcher.anyOf(Matcher.undefined, Matcher.anyValue)], + }, + }, + ]; + + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { + database: "mflix", + collection: "movies", + pipeline: [{ $match: { genre: "Horror" } }], + }, + }, + ]; + + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(1); + }); + }); + + describe("missing or incorrect parameters", () => { + it("should return 0 when tool call has missing nested parameters", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "find", + parameters: { db: "test", 
collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + + it("should return 0 when aggregate tool call has incorrect pipeline", () => { + const expected: ExpectedToolCall[] = [ + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $lt: 50 } } }] }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); + }); + }); + + describe("additional tool calls", () => { + it("should cap accuracy at 0.75 when LLM calls extra tools", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { toolCallId: "2", toolName: "count", parameters: { db: "test", collection: "orders" } }, + { + toolCallId: "3", + toolName: "aggregate", + parameters: { + db: "test", + collection: "products", + pipeline: [{ $group: { _id: "$category", total: { $sum: 1 } } }], + }, + }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + + it("should cap accuracy at 0.75 when LLM calls same tool multiple times with variations", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + ]; + const actual: LLMToolCall[] = [ + 
{ + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + { + toolCallId: "2", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active", age: { $gte: 18 } } }, + }, + { toolCallId: "3", toolName: "find", parameters: { db: "test", collection: "users", limit: 10 } }, + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0.75); + }); + }); + + describe("missing tool calls", () => { + it("should return 0 if any expected tool call was not called", () => { + const expected: ExpectedToolCall[] = [ + { toolName: "find", parameters: { db: "test", collection: "users", filter: { status: "active" } } }, + { + toolName: "aggregate", + parameters: { db: "test", collection: "orders", pipeline: [{ $match: { total: { $gt: 100 } } }] }, + }, + ]; + const actual: LLMToolCall[] = [ + { + toolCallId: "1", + toolName: "find", + parameters: { db: "test", collection: "users", filter: { status: "active" } }, + }, + // Missing the aggregate tool call + ]; + const result = calculateToolCallingAccuracy(expected, actual); + expect(result).toBe(0); // One expected tool call was not called + }); + }); +}); diff --git a/vitest.config.ts b/vitest.config.ts index 31090929..239650ac 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -1,15 +1,41 @@ import { defineConfig } from "vitest/config"; +// Shared exclusions for all projects +// Ref: https://vitest.dev/config/#exclude +const vitestDefaultExcludes = [ + "**/node_modules/**", + "**/dist/**", + "**/cypress/**", + "**/.{idea,git,cache,output,temp}/**", + "**/{karma,rollup,webpack,vite,vitest,jest,ava,babel,nyc,cypress,tsup,build,eslint,prettier}.config.*", +]; + export default defineConfig({ test: { environment: "node", testTimeout: 3600000, hookTimeout: 3600000, - include: ["**/*.test.ts"], setupFiles: ["./tests/setup.ts"], coverage: { - exclude: ["node_modules", "tests", "dist"], + exclude: 
["node_modules", "tests", "dist", "vitest.config.ts", "scripts"], reporter: ["lcov"], }, + projects: [ + { + extends: true, + test: { + name: "unit-and-integration", + include: ["**/*.test.ts"], + exclude: [...vitestDefaultExcludes, "tests/accuracy/**"], + }, + }, + { + extends: true, + test: { + name: "accuracy", + include: ["**/accuracy/*.test.ts"], + }, + }, + ], }, });