Minor internal tests fixes/documentation (#64)

toptobes · web-flow · commit 416c84db841b · 2024-07-18T13:58:33.000-05:00
* patched case of auth method not present in provider info

* updated vectorize_credentials.example.json

* updated test spec structure

* vectorize_credentials => vectorize_test_spec

* fixed couple bugs in test filtering

* updated test script helpstring a bit

* updated vectorize_test_spec structure in the DEVGUIDE

* fixed minor typo in DEVGUIDE

* fixed another minor typo in DEVGUIDE

* fixed another minor typo in DEVGUIDE

* holy crap I can't stop making typos

* updated test script helpstring a bit in DEVGUIDE

* updated devguide to talk about vectorize whitelist

* minor addition to running vectorize tests in the DEVGUIDE

* sdafkljdsal;kfjsdaklf

* so many commits...

* running out of things to say... may be time to use whatthecommit

* not like anyone's gonna read these anyways...

* pointless limitation

* [FIX] asdf

* I __ a word
diff --git a/.env.example b/.env.example
@@ -18,7 +18,7 @@ ASTRA_RUN_VECTORIZE_TESTS=1
 # - where dimension := 'specified' | 'default' | a specific number
 # - where authType := 'header' | 'providerKey' | 'none'
 # Only needs to match part of the test name to whitelist (use ^$ as necessary)
-# VECTORIZE_WHITELIST=^.*@(header|none)@default
+# VECTORIZE_WHITELIST=^.*@(header|none)@(default|specified)
 VECTORIZE_WHITELIST=.*
 
 # Set this to some value to enable running long-running tests
diff --git a/.gitignore b/.gitignore
@@ -137,4 +137,4 @@ build.zip
 temp
 tsdoc-metadata.json
 
-vectorize_credentials.json
+vectorize_test_spec.json
diff --git a/DEVGUIDE.md b/DEVGUIDE.md
@@ -7,11 +7,13 @@
 ## Running the tests
 Prerequisites:
 - A JS package manager (npm, bun, etc.)
-- A clean AstraDB instance with two keyspaces—`default_keyspace` and `other_keyspace`
+- A clean Data API instance with two keyspaces—`default_keyspace` and `other_keyspace`
 - Copy the `.env.example` file and create a new `.env` file following the example template
 
+The library comes with a small custom test script, whose usage is shown below:
+
 ```shell
-npm run test -- [--all | --light | --coverage | --prerelease] [-f <filter>] [-b] [--args <raw_args>]
+npm run test -- [--all | --light | --coverage | --prerelease] [-f <filter>] [-w <vectorize_whitelist>] [-b] [--args <raw_args>]
 # or
 npm run test -- <--types>
 ```
@@ -39,19 +41,22 @@ npm run test -- --light -f 'integration.'
 npm run test -- --types
 ```
 
-(bun does not need the extra initial `--` like npm does)
+(bun does not need the extra initial `--` like npm does).
 
 ### Running the tests on local Stargate
 You can do `sh scripts/start-stargate-4-tests.sh` to spin up an ephemeral Data API on DSE instance which automatically
 creates the required keyspaces and destroys itself on exit.
 
-Then, be sure to set the following vars in `.env` exactly, then run the tests as usual.
+Then, be sure to set the following vars in `.env` exactly.
 ```dotenv
 APPLICATION_URI=http://localhost:8181
 APPLICATION_TOKEN=Cassandra:Y2Fzc2FuZHJh:Y2Fzc2FuZHJh
 APPLICATION_ENVIRONMENT=dse
 ```
 
+Once the local Data API instance is ready (you see the output for the created namespaces and everything), you can
+run the tests.
+
 ### Running tagged tests
 Tests can be given certain tags to allow for more granular control over which tests are run. These tags currently include:
 - `[long]`/`'LONG'`: Longer running tests that take more than a few seconds to run
@@ -97,38 +102,65 @@ test suite harder to manage.
 
 ### Running vectorize tests
 To run vectorize tests, you need to have a vectorize-enabled kube running, with the correct tags enabled.
-You must create a file, `vectorize_tests.json`, in the root folder, with the following format:
+
+Ensure `ASTRA_RUN_VECTORIZE_TESTS` and `ASTRA_RUN_LONG_TESTS` are enabled as well (or just pass the `--all` flag to
+the test script).
+
+Lastly, you must create a file, `vectorize_test_spec.json`, in the root folder, with the following format:
 
 ```ts
-interface VectorizeTestSpec {
+type VectorizeTestSpec = {
   [providerName: string]: {
-    apiKey?: string,
-    providerKey?: string,
+    headers?: {
+      [header: `x-${string}`]: string,
+    },
+    sharedSecret?: {
+      providerKey?: string,
+    },
     dimension?: {
       [modelNameRegex: string]: number,
     },
     parameters?: {
-      [modelNameRegex: string]: Record<string, string>
+      [modelNameRegex: string]: Record<string, string>,
     },
-  }
+  },
 }
 ```
 
 where:
 - `providerName` is the name of the provider (e.g. `nvidia`, `openai`, etc.) as found in `findEmbeddingProviders`.
-- `apiKey` is the API key for the provider (which will be passed in through the header) .
+- `headers` sets the embedding headers to be used for header auth.
+  - resolves to an `EmbeddingHeadersProvider` under the hood—throws error if no corresponding one found.
   - optional if no header auth test wanted.
-- `providerKey` is the provider key for the provider (which will be passed in @ collection creation) .
+- `sharedSecret` is the block for KMS auth (isomorphic to `providerKey`, but it's an object for future-compatibility).
+  - `providerKey` is the provider key for the provider (which will be passed in @ collection creation).
   - optional if no KMS auth test wanted.
 - `parameters` is a mapping of model names to their corresponding parameters. The model name can be some regex that partially matches the full model name.
   - `"text-embedding-3-small"`, `"3-small"`, and `".*"` will all match `"text-embedding-3-small"`.
   - optional if not required. `azureOpenAI`, for example, will need this.
-- `dimension` is a also a mapping of model name regex to their corresponding dimensions, like the `parameters` field.
+- `dimension` is also a mapping of model name regex to their corresponding dimensions, like the `parameters` field.
   - optional if not required. `huggingfaceDedicated`, for example, will need this.
 
 This file is gitignored by default and will not be checked into VCS.
 
-See `vectorize_credentials.example.json` for—guess what—an example.
+See `vectorize_test_spec.example.json` for, guess what, an example.
+
+This spec is cross-referenced with `findEmbeddingProviders` to create a suite of tests branching off each possible
+parameter, with test names of the format `providerName@modelName@authType@dimension`, where each section is another
+potential branch.
+
+These branches can be narrowed down with the `VECTORIZE_WHITELIST` env var (or pass `-w <vectorize_whitelist>` to
+the test script). It's a regex parameter which only needs to match part of the test name to whitelist (so use `^$` as 
+necessary). 
+
+An example would be `VECTORIZE_WHITELIST=^.*@(header|none)@(default|specified)` to only run the vectorize tests using
+the header auth (or no-auth for nvidia), and only using the default/specified version of the dimension. This
+avoids branching any further on authentication and vector dimension, which reduces the number of near-duplicate
+tests run.
+
+Defaults to just `.*`.
+
+To run *only* the vectorize tests, a common pattern I use is `bun run test --all -f vectorize [-w <vectorize_whitelist>]`.
 
 ### Coverage testing
 
diff --git a/scripts/test.sh b/scripts/test.sh
@@ -3,9 +3,9 @@
 # Define necessary commands
 test_cmd="npx ts-mocha --paths -p tsconfig.json --recursive tests/prelude.test.ts tests/unit tests/integration --extension .test.ts -t 60000"
 
-all_tests_cmd="env ASTRA_RUN_LONG_TESTS=1 ASTRA_RUN_ADMIN_TESTS=1 ASTRA_RUN_VECTORIZE_TESTS=1 $test_cmd"
+all_tests_cmd="ASTRA_RUN_LONG_TESTS=1 ASTRA_RUN_ADMIN_TESTS=1 ASTRA_RUN_VECTORIZE_TESTS=1 $test_cmd"
 
-light_tests_cmd="env ASTRA_RUN_LONG_TESTS=0 ASTRA_RUN_ADMIN_TESTS=0 ASTRA_RUN_VECTORIZE_TESTS=0 $test_cmd"
+light_tests_cmd="ASTRA_RUN_LONG_TESTS= ASTRA_RUN_ADMIN_TESTS= ASTRA_RUN_VECTORIZE_TESTS= $test_cmd"
 
 run_lint_cmd="npm run lint"
 
@@ -48,6 +48,10 @@ while [ $# -gt 0 ]; do
     "-b")
       bail_early=1
       ;;
+    "-w")
+      shift
+      whitelist="$1"
+      ;;
     "--args")
       shift
       raw_args="$1"
@@ -56,7 +60,7 @@ while [ $# -gt 0 ]; do
       echo "Invalid flag $1"
       echo ""
       echo "Usage:"
-      echo "npm run test -- [--all | --light | --coverage | --prerelease] [-f <filter>] [-b] [--args <raw_args>]"
+      echo "npm run test -- [--all | --light | --coverage | --prerelease] [-f <filter>] [-w <vectorize_whitelist>] [-b] [--args <raw_args>]"
       echo "or"
       echo "npm run test -- <--types>"
       exit
@@ -66,8 +70,8 @@ while [ $# -gt 0 ]; do
 done
 
 # Ensure the flags are compatible with each other
-if [ "$test_type" = '--types' ] && { [ -n "$bail_early" ] || [ -n "$filter" ] || [ -n "$raw_args" ]; }; then
-  echo "Can't use a filter, bail, or args flag when typechecking"
+if [ "$test_type" = '--types' ] && { [ -n "$bail_early" ] || [ -n "$filter" ] || [ -n "$raw_args" ] || [ -n "$whitelist" ]; }; then
+  echo "Can't use a filter, bail, whitelist, or args flag when typechecking"
   exit
 fi
 
@@ -105,5 +109,10 @@ if [ -n "$raw_args" ]; then
   cmd_to_run="$cmd_to_run $raw_args"
 fi
 
+if [ -n "$whitelist" ]; then
+  cmd_to_run="VECTORIZE_WHITELIST='$whitelist' $cmd_to_run"
+fi
+
 # Run it
+echo "$cmd_to_run"
 eval "$cmd_to_run"
diff --git a/tests/integration/data-api/vectorize.test.ts b/tests/integration/data-api/vectorize.test.ts
@@ -29,8 +29,12 @@ import {
 
 type VectorizeTestSpec = {
   [providerName: string]: {
-    [header: `x-${string}`]: string,
-    providerKey?: string,
+    headers?: {
+      [header: `x-${string}`]: string,
+    }
+    sharedSecret?: {
+      providerKey?: string,
+    }
     dimension?: {
       [modelNameRegex: string]: number,
     },
@@ -84,7 +88,7 @@ describe('integration.data-api.vectorize', () => {
 });
 
 const initVectorTests = async (db: Db) => {
-  const spec = JSON.parse(fs.readFileSync('vectorize_credentials.json', 'utf8')) as VectorizeTestSpec;
+  const spec = JSON.parse(fs.readFileSync('vectorize_test_spec.json', 'utf8')) as VectorizeTestSpec;
 
   const { embeddingProviders } = await (
     (ENVIRONMENT === 'astra')
@@ -156,15 +160,15 @@ const branchOnAuth = (spec: VectorizeTestSpec[string], providerInfo: EmbeddingPr
 
   const ehp = resolveHeaderProvider(spec);
 
-  if (auth['HEADER'].enabled && ehp) {
+  if (auth['HEADER']?.enabled && ehp) {
     tests.push({ ...test, authType: 'header', header: ehp, testName: `${test.testName}@header` });
   }
 
-  if (auth['SHARED_SECRET'].enabled && spec.providerKey && ENVIRONMENT === 'astra') {
-    tests.push({ ...test, authType: 'providerKey', providerKey: spec.providerKey, testName: `${test.testName}@providerKey` });
+  if (auth['SHARED_SECRET']?.enabled && spec.sharedSecret?.providerKey && ENVIRONMENT === 'astra') {
+    tests.push({ ...test, authType: 'providerKey', providerKey: spec.sharedSecret?.providerKey, testName: `${test.testName}@providerKey` });
   }
 
-  if (auth['NONE'].enabled && ENVIRONMENT === 'astra') {
+  if (auth['NONE']?.enabled && ENVIRONMENT === 'astra') {
     tests.push({ ...test, authType: 'none', testName: `${test.testName}@none` });
   }
 
@@ -174,7 +178,7 @@ const branchOnAuth = (spec: VectorizeTestSpec[string], providerInfo: EmbeddingPr
 }
 
 const resolveHeaderProvider = (spec: VectorizeTestSpec[string]) => {
-  const headers = Object.entries(spec).filter(([k]) => k.startsWith('x-')).sort() as [string, string][];
+  const headers = Object.entries(spec?.headers ?? []).sort();
 
   if (headers.length === 0) {
     return null;
@@ -219,7 +223,9 @@ const branchOnDimension = (spec: VectorizeTestSpec[string], modelInfo: Embedding
 type VectorizeTest = DimensionBranch;
 
 const createVectorizeProvidersTest = (db: Db, test: VectorizeTest, name: string) => {
-  it(`[vectorize] [dev] has a working lifecycle (${test.testName})`, async () => {
+  it(`[vectorize] [long] has a working lifecycle (${test.testName})`, async function () {
+    assertTestsEnabled(this, 'VECTORIZE', 'LONG');
+
     const collection = await db.createCollection(name, {
       vector: {
         dimension: test.dimension,
@@ -289,12 +295,14 @@ const createVectorizeProvidersTest = (db: Db, test: VectorizeTest, name: string)
 };
 
 const createVectorizeParamTests = function (db: Db, test: VectorizeTest, name: string) {
-  describe('[vectorize] [dev] $vectorize/vectorize params', () => {
+  describe('[vectorize] $vectorize/vectorize params', () => {
     const collection = db.collection(name, {
       embeddingApiKey: test.header,
     });
 
     before(async function () {
+      assertTestsEnabled(this, 'VECTORIZE');
+
       if (!await db.listCollections({ nameOnly: true }).then(cs => cs.some((c) => c === name))) {
         this.skip();
       }
diff --git a/tests/prelude.test.ts b/tests/prelude.test.ts
@@ -12,11 +12,22 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-import { DEFAULT_COLLECTION_NAME, initTestObjects, OTHER_NAMESPACE } from '@/tests/fixtures';
+import { DEFAULT_COLLECTION_NAME, ENVIRONMENT, initTestObjects, OTHER_NAMESPACE } from '@/tests/fixtures';
+import { DEFAULT_NAMESPACE } from '@/src/api';
 
 before(async () => {
   const [, db] = await initTestObjects();
 
+  const admin = (ENVIRONMENT === 'astra')
+    ? db.admin({ environment: ENVIRONMENT })
+    : db.admin({ environment: ENVIRONMENT });
+
+  const namespaces = await admin.listNamespaces();
+
+  if (!namespaces.includes(DEFAULT_NAMESPACE) || !namespaces.includes(OTHER_NAMESPACE)) {
+    throw new Error(`Missing required namespace(s)... make sure you have both ${DEFAULT_NAMESPACE} and ${OTHER_NAMESPACE}`);
+  }
+
   await db.createCollection(DEFAULT_COLLECTION_NAME, { vector: { dimension: 5, metric: 'cosine' }, checkExists: false, namespace: OTHER_NAMESPACE })
     .then(c => c.deleteMany({}));
 
diff --git a/vectorize_credentials.example.json b/vectorize_credentials.example.json
diff --git a/vectorize_test_spec.example.json b/vectorize_test_spec.example.json