
Commit cf10c23

Merge branch 'main' into multi-index-support
2 parents 9a07724 + 472ba19 commit cf10c23

File tree: 12 files changed (+269 −17 lines)

.evergreen/config.yml

Lines changed: 43 additions & 1 deletion
```diff
@@ -7,7 +7,7 @@
 # Actual testing tasks are marked with `type: test`
 command_type: system
 
-# Protect ourself against rogue test case, or curl gone wild, that runs forever
+# Protect yourself against rogue test case, or curl gone wild, that runs forever
 # Good rule of thumb: the averageish length a task takes, times 5
 # That roughly accounts for variable system performance for various buildvariants
 exec_timeout_secs:
@@ -36,8 +36,14 @@ functions:
           echo '${REPO_NAME} could not be found' 1>&2
           exit 1
         fi
+        # Apply patches to upstream repo if desired.
         cd ${DIR}
         git clone ${CLONE_URL}
+        if [ -d "patches" ]; then
+          cd ${REPO_NAME}
+          echo "Applying patches."
+          git apply ../patches/*
+        fi
 
   "execute tests":
     - command: subprocess.exec
@@ -84,6 +90,16 @@ tasks:
       - func: "fetch repo"
       - func: "execute tests"
 
+  - name: test-chatgpt-retrieval-plugin
+    commands:
+      - func: "fetch repo"
+      - func: "execute tests"
+
+  - name: test-llama-index
+    commands:
+      - func: "fetch repo"
+      - func: "execute tests"
+
 buildvariants:
   - name: test-semantic-kernel-python-rhel
     display_name: Semantic-Kernel RHEL Python
@@ -119,3 +135,29 @@ buildvariants:
       - rhel87-small
     tasks:
       - name: test-langchain-python
+
+  - name: test-chatgpt-retrieval-plugin-rhel
+    display_name: ChatGPT Retrieval Plugin
+    expansions:
+      DIR: chatgpt-retrieval-plugin
+      REPO_NAME: chatgpt-retrieval-plugin
+      # TODO - Update CLONE_URL: [PYTHON-4291] [PYTHON-4129]
+      CLONE_URL: -b feature/mongodb-datastore --single-branch https://github.com/caseyclements/chatgpt-retrieval-plugin.git
+      DATABASE: chatgpt_retrieval_plugin_test_db
+    run_on:
+      - rhel87-small
+    tasks:
+      - name: test-chatgpt-retrieval-plugin
+
+  - name: test-llama-index-rhel
+    display_name: LlamaIndex RHEL
+    expansions:
+      DIR: llama_index
+      REPO_NAME: llama_index
+      # TODO - Update CLONE_URL once pull-request is merged
+      CLONE_URL: -b feature/mongodb-datastore --single-branch https://github.com/caseyclements/llama_index.git
+      DATABASE: llama_index_test_db
+    run_on:
+      - rhel87-small
+    tasks:
+      - name: test-llama-index
```
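Note that the `CLONE_URL` expansions above are not bare URLs. Because the `fetch repo` function interpolates the value directly into `git clone ${CLONE_URL}`, extra flags ride along; the ChatGPT Retrieval Plugin variant, for example, effectively runs:

```sh
# Effective clone command for the test-chatgpt-retrieval-plugin-rhel variant,
# after Evergreen substitutes the CLONE_URL expansion:
git clone -b feature/mongodb-datastore --single-branch \
  https://github.com/caseyclements/chatgpt-retrieval-plugin.git
```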

.evergreen/provision-atlas.sh

Lines changed: 2 additions & 2 deletions
```diff
@@ -13,9 +13,9 @@ DEPLOYMENT_NAME=$DIR
 
 # Download the mongodb tar and extract the binary into the atlas directory
 set -ex
-curl https://fastdl.mongodb.org/mongocli/mongodb-atlas-cli_1.14.0_linux_x86_64.tar.gz -o atlas.tgz
+curl https://fastdl.mongodb.org/mongocli/mongodb-atlas-cli_1.16.0_linux_x86_64.tar.gz -o atlas.tgz
 tar zxf atlas.tgz
-mv mongodb-atlas-cli_1.14.0* atlas
+mv mongodb-atlas-cli_1.16.0* atlas
 
 # Create a local atlas deployment and store the connection string as an env var
 $atlas deployments setup $DIR --type local --force
```
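For local debugging, the same Atlas CLI binary the script downloads can inspect what `deployments setup` created. A minimal sketch using standard Atlas CLI subcommands (not part of this script; shown only as a debugging aid):

```sh
# List local deployments created by `atlas deployments setup`
./atlas deployments list
# Print the connection string for a given deployment
./atlas deployments connect <deployment-name> --connectWith connectionString
```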

.gitignore

Lines changed: 48 additions & 0 deletions
```diff
@@ -0,0 +1,48 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+*.dylib
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Cython
+*.c
+*.cpp
+
+# PyCharm
+*.idea/
+
+# Sphinx documentation
+docs/_build/
+
+# libbson files
+libbson
+mongo-c-driver-*
+
+
+# Benchmark and test output files
+results/*
+xunit-results/
```

README.md

Lines changed: 53 additions & 5 deletions
```diff
@@ -2,7 +2,7 @@
 
 ## What is it?
 
-This repository exists to test our integrations in Third-Party AI/ML testing libraries.
+This repository exists to test our integrations in Third-Party AI/ML libraries.
 
 ## Motivation
 
@@ -95,7 +95,55 @@ Test execution flow is defined in `.evergreen/config.yml`. The test pipeline's c
 
 **[Functions](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-Configuration-Files#functions)** -- We've defined some common functions that will be used. See the `.evergreen/config.yml` for example cases. The standard procedure is to fetch the repository, provision Atlas as needed, and then execute the tests specified in the `run.sh` script you create. Ensure that the expansions are provided for these functions; otherwise the tests will run improperly and most likely fail.
 
-- [`fetch repo`](https://github.com/mongodb-labs/ai-ml-pipeline-testing/blob/main/.evergreen/config.yml#L30) -- Clones the library's git repository; make sure to provide the expansion CLONE_URL
-- [`execute tests`](https://github.com/mongodb-labs/ai-ml-pipeline-testing/blob/main/.evergreen/config.yml#L51) -- Uses [subprocess.exec](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-Commands#subprocessexec) to run the provided `run.sh` file. `run.sh` must be within the specified `DIR` path.
-- `fetch source` -- Retrieves the current (`ai-ml-pipeline-testing`) repo
-- `setup atlas cli` -- Sets up the local Atlas deployment
+- [`fetch repo`](https://github.com/mongodb-labs/ai-ml-pipeline-testing/blob/main/.evergreen/config.yml#L30) -- Clones the library's git repository; make sure to provide the expansion CLONE_URL
+- [`execute tests`](https://github.com/mongodb-labs/ai-ml-pipeline-testing/blob/main/.evergreen/config.yml#L51) -- Uses [subprocess.exec](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-Commands#subprocessexec) to run the provided `run.sh` file. `run.sh` must be within the specified `DIR` path.
+- `fetch source` -- Retrieves the current (`ai-ml-pipeline-testing`) repo
+- `setup atlas cli` -- Sets up the local Atlas deployment
+
+## Upstream Repo Considerations
+
+For better or worse, we do not maintain the AI/ML libraries with which we integrate.
+We provide workarounds for a few common issues that we encounter.
+
+### Third-party AI/ML library maintainers have not merged our changes
+
+As we develop a testing infrastructure, we commonly make changes to our integrations with the third-party library.
+This is the case, in particular, when we add a new integration.
+Over time, we may make bug fixes, add new features, and update the API.
+At the start, we will hopefully add the integration tests themselves.
+
+The bad news is that the maintainers of the AI/ML packages may take considerable
+time to review and merge our changes. The good news is that we can begin testing
+without pointing to the main branch of the upstream repo.
+The parameter value of `CLONE_URL` is very flexible.
+We literally just call `git clone $CLONE_URL`.
+As such, we can point to an arbitrary branch on an arbitrary repo.
+While developing, we encourage developers to point to a feature branch
+on their own fork, and to add a TODO with the JIRA ticket to update the URL
+once the pull request has been merged.
+
+### Patching upstream repos
+
+We provide a simple mechanism for making changes to third-party packages
+without requiring a pull request (and acceptance by the upstream maintainers).
+This is done via Git patch files.
+
+Patch files are created very simply: `git diff > mypatch.patch`.
+If you can believe it, this was the primary mechanism for sharing code with another maintainer
+before pull requests existed!
+To apply patches, add them to a `patches` directory within the `$DIR` of your build variant.
+As of this writing, `chatgpt-retrieval-plugin` contains an example that you may use as a reference.
+You can create a number of different patch files, which will be applied in sequence.
+This is useful for documenting rationale, or for separating out patches that will be removed
+once a pull request is merged upstream.
+
+During the ChatGPT Retrieval Plugin integration, we ran into build issues on Evergreen hosts.
+In this case, the package failed to build from source.
+It required a library that wasn't available on the host and had no wheel on PyPI.
+As it turned out, the package was actually an optional requirement,
+and so a one-line change to `pyproject.toml` solved our problem.
+
+We realized that we could easily get this working without changing the upstream repo,
+simply by applying a git patch file.
+This is a standard practice among conda package maintainers,
+as they often have to build for a broader set of scenarios than the original authors intended.
```
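To make the `patches` mechanism described above concrete, a build variant's directory in this repo might look like the sketch below (the layout mirrors the `chatgpt-retrieval-plugin` example; the patch file name is illustrative), and the `fetch repo` function in `.evergreen/config.yml` applies everything it finds:

```sh
# Layout in this repo (patch file name illustrative):
#   chatgpt-retrieval-plugin/
#     run.sh
#     indexConfig.json
#     patches/
#       make-psycopg2-optional.patch
#
# What CI does after cloning the upstream repo into $DIR:
cd ${REPO_NAME}
git apply ../patches/*   # patches are applied in glob order, one file at a time
```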
Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+[]
```
Lines changed: 14 additions & 0 deletions
```diff
@@ -0,0 +1,14 @@
+{
+  "fields": [
+    {
+      "numDimensions": 1536,
+      "path": "embedding",
+      "similarity": "cosine",
+      "type": "vector"
+    }
+  ],
+  "name": "vector_index",
+  "type": "vectorSearch",
+  "database": "chatgpt_retrieval_plugin_test_db",
+  "collectionName": "chatgpt_retrieval_plugin_test_vectorstore"
+}
```
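This index definition is presumably consumed during provisioning. With the Atlas CLI that `provision-atlas.sh` installs, creating a vector search index from such a file looks roughly like the following; the exact invocation is an assumption, as it is not shown in this commit:

```sh
# Assumed provisioning step (not part of this diff): create the vector
# search index on the local deployment from the JSON definition above.
atlas deployments search indexes create \
  --deploymentName chatgpt-retrieval-plugin \
  --file indexConfig.json
```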
Lines changed: 13 additions & 0 deletions
```diff
@@ -0,0 +1,13 @@
+diff --git a/pyproject.toml b/pyproject.toml
+index e41676e..a6ca72d 100644
+--- a/pyproject.toml
++++ b/pyproject.toml
+@@ -28,7 +28,7 @@ pymilvus = "^2.2.2"
+ qdrant-client = {version = "^1.0.4", python = "<3.12"}
+ redis = "4.5.4"
+ supabase = "^1.0.2"
+-psycopg2 = "^2.9.5"
++psycopg2 = {version = "^2.9.5", optional = true}
+ llama-index = "0.5.4"
+ azure-identity = "^1.12.0"
+ azure-search-documents = "11.4.0b8"
```
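A patch like the one above is produced exactly as the README describes: check out the upstream repo, make the edit, and capture the diff. A minimal sketch, assuming a local checkout of `chatgpt-retrieval-plugin`:

```sh
# In a checkout of the upstream repo, after editing pyproject.toml:
git diff > pyproject.patch
# Then commit pyproject.patch under chatgpt-retrieval-plugin/patches/
# in this repo so CI picks it up via `git apply ../patches/*`.
```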

chatgpt-retrieval-plugin/run.sh

Lines changed: 33 additions & 0 deletions
```diff
@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# chatgpt-retrieval-plugin is a poetry-run project
+
+set -x
+
+. $workdir/src/.evergreen/utils.sh
+
+PYTHON_BINARY=$(find_python3)
+$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"
+
+# Install Poetry into base python
+$PYTHON_BINARY -m pip install -U pip poetry
+# Create a package specific poetry environment
+$PYTHON_BINARY -m poetry env use $PYTHON_BINARY
+# Activate the poetry env, which itself does not include poetry
+source $($PYTHON_BINARY -m poetry env info --path)/bin/activate
+# Recreate the poetry lock file
+$PYTHON_BINARY -m poetry lock --no-update
+# Install from pyproject.toml into package specific environment
+$PYTHON_BINARY -m poetry install --with dev
+
+# Run tests. Sensitive variables come from the Evergreen project: ai-ml-pipeline-testing/
+OPENAI_API_KEY=$openai_api_key \
+DATASTORE="mongodb" \
+BEARER_TOKEN="staylowandkeepmoving" \
+MONGODB_URI=$chatgpt_retrieval_plugin_mongodb_uri \
+MONGODB_DATABASE="chatgpt_retrieval_plugin_test_db" \
+MONGODB_COLLECTION="chatgpt_retrieval_plugin_test_vectorstore" \
+MONGODB_INDEX="vector_index" \
+EMBEDDING_MODEL="text-embedding-3-small" \
+EMBEDDING_DIMENSION="1536" \
+$PYTHON_BINARY -m poetry run pytest -v tests/datastore/providers/mongodb_atlas/
```

llama_index/indexConfig.json

Lines changed: 14 additions & 0 deletions
```diff
@@ -0,0 +1,14 @@
+{
+  "fields": [
+    {
+      "numDimensions": 1536,
+      "path": "embedding",
+      "similarity": "cosine",
+      "type": "vector"
+    }
+  ],
+  "name": "vector_index",
+  "type": "vectorSearch",
+  "database": "llama_index_test_db",
+  "collectionName": "llama_index_test_vectorstore"
+}
```

llama_index/run.sh

Lines changed: 30 additions & 0 deletions
```diff
@@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -x
+
+. $workdir/src/.evergreen/utils.sh
+
+PYTHON_BINARY=$(find_python3)
+$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"
+
+# cd to the MongoDB integration. It has its own project
+cd llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb
+
+# Install Poetry into base python
+$PYTHON_BINARY -m pip install -U pip poetry
+# Create a package specific poetry environment
+$PYTHON_BINARY -m poetry env use $PYTHON_BINARY
+# Activate the poetry env, which itself does not include poetry
+source $($PYTHON_BINARY -m poetry env info --path)/bin/activate
+# Recreate the poetry lock file
+$PYTHON_BINARY -m poetry lock --no-update
+# Install from pyproject.toml into package specific environment
+$PYTHON_BINARY -m poetry install --with dev
+
+# Run tests. Sensitive variables come from the Evergreen project: ai-ml-pipeline-testing/
+OPENAI_API_KEY=$openai_api_key \
+MONGO_URI=$llama_index_mongodb_uri \
+MONGODB_DATABASE="llama_index_test_db" \
+MONGODB_COLLECTION="llama_index_test_vectorstore" \
+MONGODB_INDEX="vector_index" \
+$PYTHON_BINARY -m poetry run pytest -v tests
```
