
Commit 9626cb1

Merge branch 'main' into update-langchain-test

2 parents: 7fb57f0 + 472ba19

File tree

12 files changed: +265 and -13 lines

.evergreen/config.yml

Lines changed: 43 additions & 1 deletion

@@ -7,7 +7,7 @@
 # Actual testing tasks are marked with `type: test`
 command_type: system

-# Protect ourself against rogue test case, or curl gone wild, that runs forever
+# Protect yourself against rogue test case, or curl gone wild, that runs forever
 # Good rule of thumb: the averageish length a task takes, times 5
 # That roughly accounts for variable system performance for various buildvariants
 exec_timeout_secs:
@@ -36,8 +36,14 @@ functions:
         echo '${REPO_NAME} could not be found' 1>&2
         exit 1
       fi
+      # Apply patches to upstream repo if desired.
       cd ${DIR}
       git clone ${CLONE_URL}
+      if [ -d "patches" ]; then
+        cd ${REPO_NAME}
+        echo "Applying patches."
+        git apply ../patches/*
+      fi

  "execute tests":
    - command: subprocess.exec
@@ -84,6 +90,16 @@ tasks:
     - func: "fetch repo"
     - func: "execute tests"

+  - name: test-chatgpt-retrieval-plugin
+    commands:
+      - func: "fetch repo"
+      - func: "execute tests"
+
+  - name: test-llama-index
+    commands:
+      - func: "fetch repo"
+      - func: "execute tests"
+
 buildvariants:
   - name: test-semantic-kernel-python-rhel
     display_name: Semantic-Kernel RHEL Python
@@ -120,3 +136,29 @@ buildvariants:
       - rhel87-small
     tasks:
       - name: test-langchain-python
+
+  - name: test-chatgpt-retrieval-plugin-rhel
+    display_name: ChatGPT Retrieval Plugin
+    expansions:
+      DIR: chatgpt-retrieval-plugin
+      REPO_NAME: chatgpt-retrieval-plugin
+      # TODO - Update CLONE_URL: [PYTHON-4291] [PYTHON-4129]
+      CLONE_URL: -b feature/mongodb-datastore --single-branch https://github.com/caseyclements/chatgpt-retrieval-plugin.git
+      DATABASE: chatgpt_retrieval_plugin_test_db
+    run_on:
+      - rhel87-small
+    tasks:
+      - name: test-chatgpt-retrieval-plugin
+
+  - name: test-llama-index-rhel
+    display_name: LlamaIndex RHEL
+    expansions:
+      DIR: llama_index
+      REPO_NAME: llama_index
+      # TODO - Update CLONE_URL once pull-request is merged
+      CLONE_URL: -b feature/mongodb-datastore --single-branch https://github.com/caseyclements/llama_index.git
+      DATABASE: llama_index_test_db
+    run_on:
+      - rhel87-small
+    tasks:
+      - name: test-llama-index
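Note that the `CLONE_URL` expansion is handed to `git` unmodified, so a buildvariant can point at any branch of any fork. A minimal local sketch of what the updated "fetch repo" step effectively runs for the new ChatGPT Retrieval Plugin variant (values copied from the expansions above; the standalone variable assignment is only illustrative):

    # the expansion is word-split by the shell, so extra flags pass straight through to git
    CLONE_URL='-b feature/mongodb-datastore --single-branch https://github.com/caseyclements/chatgpt-retrieval-plugin.git'
    git clone $CLONE_URL
    # if the variant ships a patches/ directory, each patch is applied on top of the clone
    cd chatgpt-retrieval-plugin && git apply ../patches/*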

.evergreen/provision-atlas.sh

Lines changed: 2 additions & 2 deletions

@@ -13,9 +13,9 @@ DEPLOYMENT_NAME=$DIR

 # Download the mongodb tar and extract the binary into the atlas directory
 set -ex
-curl https://fastdl.mongodb.org/mongocli/mongodb-atlas-cli_1.14.0_linux_x86_64.tar.gz -o atlas.tgz
+curl https://fastdl.mongodb.org/mongocli/mongodb-atlas-cli_1.16.0_linux_x86_64.tar.gz -o atlas.tgz
 tar zxf atlas.tgz
-mv mongodb-atlas-cli_1.14.0* atlas
+mv mongodb-atlas-cli_1.16.0* atlas

 # Create a local atlas deployment and store the connection string as an env var
 $atlas deployments setup $DIR --type local --force
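When the pinned Atlas CLI version is bumped again, a quick local check of the new tarball can confirm the directory name that the `mv` glob relies on. A hedged sketch (the `bin/atlas` location inside the extracted directory is an assumption about the tarball layout, not something this script asserts):

    VERSION=1.16.0
    curl -sSf "https://fastdl.mongodb.org/mongocli/mongodb-atlas-cli_${VERSION}_linux_x86_64.tar.gz" -o atlas.tgz
    tar ztf atlas.tgz | head -n 3                           # inspect the top-level directory name
    tar zxf atlas.tgz
    ./mongodb-atlas-cli_${VERSION}*/bin/atlas --version     # assumed path to the extracted binary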

.gitignore

Lines changed: 48 additions & 0 deletions

@@ -0,0 +1,48 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+*.dylib
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# Cython
+*.c
+*.cpp
+
+# PyCharm
+*.idea/
+
+# Sphinx documentation
+docs/_build/
+
+# libbson files
+libbson
+mongo-c-driver-*
+
+
+# Benchmark and test output files
+results/*
+xunit-results/

README.md

Lines changed: 49 additions & 1 deletion

@@ -2,7 +2,7 @@

 ## What is it?

-This repository exists to test our integrations in Third-Party AI/ML testing libraries.
+This repository exists to test our integrations in Third-Party AI/ML libraries.

 ## Motivation

@@ -90,3 +90,51 @@ Test execution flow is defined in `.evergreen/config.yml`. The test pipeline's c
 - [`execute tests`](https://github.com/mongodb-labs/ai-ml-pipeline-testing/blob/main/.evergreen/config.yml#L51) -- Uses [subprocess.exec](https://docs.devprod.prod.corp.mongodb.com/evergreen/Project-Configuration/Project-Commands#subprocessexec) to run the provided `run.sh` file. `run.sh` must be within the specified `DIR` path.
 - `fetch source` -- Retrieves the current (`ai-ml-pipeline-testing`) repo
 - `setup atlas cli` -- Sets up the local Atlas deployment
+
+## Upstream Repo Considerations
+
+For better or worse, we do not maintain the AI/ML libraries with which we integrate.
+We provide workarounds for a few common issues that we encounter.
+
+### Third-party AI/ML library maintainers have not merged our changes
+
+As we develop the testing infrastructure, we commonly make changes to our integrations with the third-party library.
+This is the case, in particular, when we add a new integration.
+Over time, we may fix bugs, add new features, and update the API.
+At the start, we will often be adding the integration tests themselves.
+
+The bad news is that the maintainers of the AI/ML packages may take considerable
+time to review and merge our changes. The good news is that we can begin testing
+without pointing to the main branch of the upstream repo.
+The value of the `CLONE_URL` expansion is very flexible:
+it is passed directly to `git clone $CLONE_URL`,
+so we can point to an arbitrary branch on an arbitrary repo.
+While developing, we encourage developers to point to a feature branch
+on their own fork, and to add a TODO with the JIRA ticket to update the URL
+once the pull-request has been merged.
+
+### Patching upstream repos
+
+We provide a simple mechanism to make changes to the third-party packages
+without requiring a pull-request (and acceptance by the upstream maintainers).
+This is done via Git patch files.
+
+Patch files are created very simply: `git diff > mypatch.patch`.
+If you can believe it, this was the primary mechanism for sharing code with other maintainers
+before pull-requests existed!
+To apply patches, add them to a `patches` directory within the `$DIR` of your build variant.
+As of this writing, the `chatgpt-retrieval-plugin` contains an example that you may use as a reference.
+You can create a number of different patch files, each of which will be applied in turn.
+This is useful for documenting rationale, or for separating out patches that can be removed
+once a pull-request is merged into the upstream repo.
+
+During the ChatGPT Retrieval Plugin integration, we ran into build issues on Evergreen hosts.
+In this case, the package failed to build from source.
+It required a library that wasn't available on the host and had no wheel on PyPI.
+As it turned out, the package was actually an optional requirement,
+so a one-line change to `pyproject.toml` solved our problem.
+
+We realized that we could easily get this working without changing the upstream repo
+simply by applying a git patch file.
+This is a standard practice among `conda` package maintainers,
+as they often have to build for a broader set of scenarios than the original authors intended.
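As a concrete sketch of the patch workflow the README section above describes (the patch filename and paths below are illustrative, not taken from this commit):

    # 1. in a scratch clone of the upstream project, make the change and capture it as a patch
    git diff > make-psycopg2-optional.patch
    # 2. copy the patch into this repo under the build variant's DIR
    mkdir -p chatgpt-retrieval-plugin/patches
    cp make-psycopg2-optional.patch chatgpt-retrieval-plugin/patches/
    # 3. the "fetch repo" function will then run `git apply ../patches/*` after cloning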
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+[]
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+{
+  "fields": [
+    {
+      "numDimensions": 1536,
+      "path": "embedding",
+      "similarity": "cosine",
+      "type": "vector"
+    }
+  ],
+  "name": "vector_index",
+  "type": "vectorSearch",
+  "database": "chatgpt_retrieval_plugin_test_db",
+  "collectionName": "chatgpt_retrieval_plugin_test_vectorstore"
+}
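This index definition is presumably picked up when the local Atlas deployment is provisioned. As a hedged sketch only, an equivalent vector search index could be created by hand with the Atlas CLI downloaded by `.evergreen/provision-atlas.sh` (both the binary path and the exact subcommand/flags are assumptions, not confirmed by this commit):

    # assumed invocation against the local deployment created by provision-atlas.sh
    ./atlas/bin/atlas deployments search indexes create \
        --deploymentName chatgpt-retrieval-plugin \
        --file indexConfig.json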
Lines changed: 13 additions & 0 deletions

@@ -0,0 +1,13 @@
+diff --git a/pyproject.toml b/pyproject.toml
+index e41676e..a6ca72d 100644
+--- a/pyproject.toml
++++ b/pyproject.toml
+@@ -28,7 +28,7 @@ pymilvus = "^2.2.2"
+ qdrant-client = {version = "^1.0.4", python = "<3.12"}
+ redis = "4.5.4"
+ supabase = "^1.0.2"
+-psycopg2 = "^2.9.5"
++psycopg2 = {version = "^2.9.5", optional = true}
+ llama-index = "0.5.4"
+ azure-identity = "^1.12.0"
+ azure-search-documents = "11.4.0b8"
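To check that a patch like this still applies cleanly against the branch the buildvariant clones, it can be dry-run locally (clone URL taken from the expansions above; the patch filename is illustrative since it is not shown in this view):

    git clone -b feature/mongodb-datastore --single-branch https://github.com/caseyclements/chatgpt-retrieval-plugin.git
    cd chatgpt-retrieval-plugin
    # --check reports whether the patch would apply, without modifying the working tree
    git apply --check ../patches/*.patch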

chatgpt-retrieval-plugin/run.sh

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+#!/bin/sh
+
+# chatgpt-retrieval-plugin is a poetry-run project
+
+set -x
+
+. $workdir/src/.evergreen/utils.sh
+
+PYTHON_BINARY=$(find_python3)
+$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"
+
+# Install Poetry into base python
+$PYTHON_BINARY -m pip install -U pip poetry
+# Create a package specific poetry environment
+$PYTHON_BINARY -m poetry env use $PYTHON_BINARY
+# Activate the poetry env, which itself does not include poetry
+source $($PYTHON_BINARY -m poetry env info --path)/bin/activate
+# Recreate the poetry lock file
+$PYTHON_BINARY -m poetry lock --no-update
+# Install from pyproject.toml into package specific environment
+$PYTHON_BINARY -m poetry install --with dev
+
+# Run tests. Sensitive variables in Evergreen come from the Evergreen project: ai-ml-pipeline-testing/
+OPENAI_API_KEY=$openai_api_key \
+DATASTORE="mongodb" \
+BEARER_TOKEN="staylowandkeepmoving" \
+MONGODB_URI=$chatgpt_retrieval_plugin_mongodb_uri \
+MONGODB_DATABASE="chatgpt_retrieval_plugin_test_db" \
+MONGODB_COLLECTION="chatgpt_retrieval_plugin_test_vectorstore" \
+MONGODB_INDEX="vector_index" \
+EMBEDDING_MODEL="text-embedding-3-small" \
+EMBEDDING_DIMENSION="1536" \
+$PYTHON_BINARY -m poetry run pytest -v tests/datastore/providers/mongodb_atlas/
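The run.sh above relies on variables that the Evergreen project normally injects. A minimal sketch of supplying them by hand outside Evergreen (every value and the working-directory choice below are placeholders/assumptions, since the task setup is not shown in this commit):

    export workdir="$HOME/ai-ml"                 # assumed: $workdir/src/.evergreen/utils.sh must exist
    export openai_api_key="sk-..."               # placeholder secret
    export chatgpt_retrieval_plugin_mongodb_uri="mongodb://localhost:27017/?directConnection=true"   # placeholder URI
    cd chatgpt-retrieval-plugin && sh ../run.sh  # assumed: run from inside the upstream clone, next to run.sh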

llama_index/indexConfig.json

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+{
+  "fields": [
+    {
+      "numDimensions": 1536,
+      "path": "embedding",
+      "similarity": "cosine",
+      "type": "vector"
+    }
+  ],
+  "name": "vector_index",
+  "type": "vectorSearch",
+  "database": "llama_index_test_db",
+  "collectionName": "llama_index_test_vectorstore"
+}

llama_index/run.sh

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+#!/bin/sh
+
+set -x
+
+. $workdir/src/.evergreen/utils.sh
+
+PYTHON_BINARY=$(find_python3)
+$PYTHON_BINARY -c "import sys; print(f'Python version found: {sys.version_info}')"
+
+# cd to the MongoDB integration. It has its own project
+cd llama-index-integrations/vector_stores/llama-index-vector-stores-mongodb
+
+# Install Poetry into base python
+$PYTHON_BINARY -m pip install -U pip poetry
+# Create a package specific poetry environment
+$PYTHON_BINARY -m poetry env use $PYTHON_BINARY
+# Activate the poetry env, which itself does not include poetry
+source $($PYTHON_BINARY -m poetry env info --path)/bin/activate
+# Recreate the poetry lock file
+$PYTHON_BINARY -m poetry lock --no-update
+# Install from pyproject.toml into package specific environment
+$PYTHON_BINARY -m poetry install --with dev
+
+# Run tests. Sensitive variables in Evergreen come from Evergreen project: ai-ml-pipeline-testing/
+OPENAI_API_KEY=$openai_api_key \
+MONGO_URI=$llama_index_mongodb_uri \
+MONGODB_DATABASE="llama_index_test_db" \
+MONGODB_COLLECTION="llama_index_test_vectorstore" \
+MONGODB_INDEX="vector_index" \
+$PYTHON_BINARY -m poetry run pytest -v tests

0 commit comments
