Skip to content

Commit eabadf7

Browse files
authored
feat: add milvus destination connector (#5)
* Add milvus destination connector * remove version constraints * Fix milvus dest test
1 parent 68aa219 commit eabadf7

File tree

19 files changed

+720
-8
lines changed

19 files changed

+720
-8
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.0.1-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
* **Add Milvus destination connector** Adds support storing artifacts in Milvus vector database.
8+
9+
### Fixes
10+
111
## 0.0.0
212

313
### Enhancements

requirements/connectors/milvus.in

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
-c ../common/constraints.txt
2+
-c ../common/base.txt
3+
4+
pymilvus

requirements/connectors/milvus.txt

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
#
2+
# This file is autogenerated by pip-compile with Python 3.9
3+
# by the following command:
4+
#
5+
# pip-compile milvus.in
6+
#
7+
environs==9.5.0
8+
# via pymilvus
9+
grpcio==1.63.0
10+
# via pymilvus
11+
marshmallow==3.21.3
12+
# via
13+
# -c ../common/base.txt
14+
# environs
15+
milvus-lite==2.4.8
16+
# via pymilvus
17+
numpy==1.26.4
18+
# via
19+
# -c ../common/base.txt
20+
# pandas
21+
packaging==23.2
22+
# via
23+
# -c ../common/base.txt
24+
# -c ../common/constraints.txt
25+
# marshmallow
26+
pandas==2.2.2
27+
# via
28+
# -c ../common/base.txt
29+
# pymilvus
30+
protobuf==4.23.4
31+
# via
32+
# -c ../common/constraints.txt
33+
# pymilvus
34+
pymilvus==2.4.4
35+
# via -r milvus.in
36+
python-dateutil==2.9.0.post0
37+
# via
38+
# -c ../common/base.txt
39+
# pandas
40+
python-dotenv==1.0.1
41+
# via environs
42+
pytz==2024.1
43+
# via
44+
# -c ../common/base.txt
45+
# pandas
46+
six==1.16.0
47+
# via
48+
# -c ../common/base.txt
49+
# python-dateutil
50+
tqdm==4.66.4
51+
# via
52+
# -c ../common/base.txt
53+
# milvus-lite
54+
tzdata==2024.1
55+
# via
56+
# -c ../common/base.txt
57+
# pandas
58+
ujson==5.10.0
59+
# via pymilvus
60+
61+
# The following packages are considered to be unsafe in a requirements file:
62+
# setuptools

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ def load_requirements(file: Union[str, Path]) -> List[str]:
102102
"hubspot": load_requirements("requirements/connectors/hubspot.in"),
103103
"jira": load_requirements("requirements/connectors/jira.in"),
104104
"kafka": load_requirements("requirements/connectors/kafka.in"),
105+
"milvus": load_requirements("requirements/connectors/milvus.in"),
105106
"mongodb": load_requirements("requirements/connectors/mongodb.in"),
106107
"notion": load_requirements("requirements/connectors/notion.in"),
107108
"onedrive": load_requirements("requirements/connectors/onedrive.in"),

test_e2e/dest/milvus.sh

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/usr/bin/env bash
2+
3+
set -e
4+
5+
DEST_PATH=$(dirname "$(realpath "$0")")
6+
SCRIPT_DIR=$(dirname "$DEST_PATH")
7+
cd "$SCRIPT_DIR"/.. || exit 1
8+
OUTPUT_FOLDER_NAME=milvus-dest
9+
OUTPUT_ROOT=${OUTPUT_ROOT:-$SCRIPT_DIR}
10+
OUTPUT_DIR=$OUTPUT_ROOT/structured-output/$OUTPUT_FOLDER_NAME
11+
WORK_DIR=$OUTPUT_ROOT/workdir/$OUTPUT_FOLDER_NAME
12+
CI=${CI:-"false"}
13+
max_processes=${MAX_PROCESSES:=$(python3 -c "import os; print(os.cpu_count())")}
14+
15+
# shellcheck disable=SC1091
16+
source "$SCRIPT_DIR"/cleanup.sh
17+
function cleanup {
18+
# Index cleanup
19+
echo "Stopping Milvus Docker container"
20+
docker compose -f "$SCRIPT_DIR"/env_setup/milvus/docker-compose.yml down --remove-orphans -v
21+
22+
# Local file cleanup
23+
cleanup_dir "$WORK_DIR"
24+
cleanup_dir "$OUTPUT_DIR"
25+
26+
}
27+
28+
trap cleanup EXIT
29+
30+
DB_NAME=ingest_test_db
31+
HOST="localhost"
32+
PORT=19530
33+
MILVUS_URI="http://${HOST}:${PORT}"
34+
COLLECTION_NAME="ingest_test"
35+
36+
# Create milvus instance
37+
echo "Creating milvus instance"
38+
# shellcheck source=/dev/null
39+
docker compose -f "$SCRIPT_DIR"/env_setup/milvus/docker-compose.yml up -d --wait-timeout 60
40+
"$SCRIPT_DIR"/env_setup/milvus/create_collection.py --db-name $DB_NAME
41+
42+
PYTHONPATH=. ./unstructured_ingest/main.py \
43+
local \
44+
--num-processes "$max_processes" \
45+
--output-dir "$OUTPUT_DIR" \
46+
--strategy fast \
47+
--verbose \
48+
--reprocess \
49+
--input-path example-docs/book-war-and-peace-1p.txt \
50+
--work-dir "$WORK_DIR" \
51+
--embedding-provider "langchain-huggingface" \
52+
milvus \
53+
--uri $MILVUS_URI \
54+
--db-name $DB_NAME \
55+
--collection-name $COLLECTION_NAME
56+
57+
sample_embeddings=$(cat "$WORK_DIR"/upload_stage/* | jq '.[0].embeddings')
58+
expected_count=$(cat "$WORK_DIR"/upload_stage/* | jq 'length')
59+
60+
"$SCRIPT_DIR"/env_setup/milvus/test_outputs.py \
61+
--db-name $DB_NAME \
62+
--embeddings "$sample_embeddings" \
63+
--collection-name $COLLECTION_NAME \
64+
--count "$expected_count"
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
#!/usr/bin/env python3
2+
3+
import click
4+
from pymilvus import (
5+
CollectionSchema,
6+
DataType,
7+
FieldSchema,
8+
MilvusClient,
9+
)
10+
from pymilvus.milvus_client import IndexParams
11+
12+
13+
def get_schema() -> CollectionSchema:
14+
id_field = FieldSchema(
15+
name="id", dtype=DataType.INT64, descrition="primary field", is_primary=True, auto_id=True
16+
)
17+
embeddings_field = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384)
18+
19+
schema = CollectionSchema(
20+
enable_dynamic_field=True,
21+
fields=[
22+
id_field,
23+
embeddings_field,
24+
],
25+
)
26+
27+
return schema
28+
29+
30+
def get_index_params() -> IndexParams:
31+
index_params = IndexParams()
32+
index_params.add_index(field_name="embeddings", index_type="AUTOINDEX", metric_type="COSINE")
33+
return index_params
34+
35+
36+
def create_database(client: MilvusClient, db_name: str) -> None:
37+
databases = client._get_connection().list_database()
38+
if db_name in databases:
39+
drop_database(client=client, db_name=db_name)
40+
print(f"Creating database {db_name}")
41+
client._get_connection().create_database(db_name=db_name)
42+
client.using_database(db_name=db_name)
43+
44+
45+
def drop_database(client: MilvusClient, db_name: str) -> None:
46+
print("Dropping existing database first")
47+
client.using_database(db_name=db_name)
48+
collections = client.list_collections()
49+
for collection in collections:
50+
print(f"Dropping existing collection {collection} first")
51+
client.drop_collection(collection_name=collection)
52+
client._get_connection().drop_database(db_name=db_name)
53+
54+
55+
def create_collection(client: MilvusClient, collection_name: str) -> None:
56+
collections = client.list_collections()
57+
if collection_name in collections:
58+
print("Dropping existing collection first")
59+
client.drop_collection(collection_name=collection_name)
60+
schema = get_schema()
61+
index_params = get_index_params()
62+
print(f"Creating collection {collection_name}")
63+
client.create_collection(
64+
collection_name=collection_name, schema=schema, index_params=index_params
65+
)
66+
67+
68+
@click.command("milvus-init")
69+
@click.option("--host", type=str, default="localhost")
70+
@click.option("--port", type=int, default=19530)
71+
@click.option("--db-name", type=str, default="milvus")
72+
def create(host: str, port: int, db_name: str):
73+
client = MilvusClient(uri=f"http://{host}:{port}")
74+
create_database(client=client, db_name=db_name)
75+
create_collection(client=client, collection_name="ingest_test")
76+
77+
78+
if __name__ == "__main__":
79+
create()
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
services:
2+
etcd:
3+
container_name: milvus-etcd
4+
image: quay.io/coreos/etcd:v3.5.5
5+
environment:
6+
- ETCD_AUTO_COMPACTION_MODE=revision
7+
- ETCD_AUTO_COMPACTION_RETENTION=1000
8+
- ETCD_QUOTA_BACKEND_BYTES=4294967296
9+
- ETCD_SNAPSHOT_COUNT=50000
10+
command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd
11+
healthcheck:
12+
test: ["CMD", "etcdctl", "endpoint", "health"]
13+
interval: 30s
14+
timeout: 20s
15+
retries: 3
16+
17+
minio:
18+
container_name: milvus-minio
19+
image: minio/minio:RELEASE.2023-03-20T20-16-18Z
20+
environment:
21+
MINIO_ACCESS_KEY: minioadmin
22+
MINIO_SECRET_KEY: minioadmin
23+
ports:
24+
- "9001:9001"
25+
- "9000:9000"
26+
command: minio server /minio_data --console-address ":9001"
27+
healthcheck:
28+
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
29+
interval: 30s
30+
timeout: 20s
31+
retries: 3
32+
33+
standalone:
34+
container_name: milvus-standalone
35+
image: milvusdb/milvus:v2.3.19
36+
command: ["milvus", "run", "standalone"]
37+
security_opt:
38+
- seccomp:unconfined
39+
environment:
40+
ETCD_ENDPOINTS: etcd:2379
41+
MINIO_ADDRESS: minio:9000
42+
healthcheck:
43+
test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"]
44+
interval: 30s
45+
start_period: 90s
46+
timeout: 20s
47+
retries: 3
48+
ports:
49+
- "19530:19530"
50+
- "9091:9091"
51+
depends_on:
52+
etcd:
53+
condition: service_healthy
54+
minio:
55+
condition: service_healthy
56+
57+
gui:
58+
image: zilliz/attu
59+
container_name: milvus-gui
60+
environment:
61+
MILVUS_URL: standalone:19530
62+
ports:
63+
- 3000:3000
64+
depends_on:
65+
standalone:
66+
condition: service_healthy
67+
68+
networks:
69+
default:
70+
name: milvus

0 commit comments

Comments
 (0)