Skip to content

Commit e93aae1

Browse files
authored
API: Weaviate v2 updates for local-, embedded-, and cloud-focused classes (#353)
1 parent a379a58 commit e93aae1

File tree

6 files changed

+181
-101
lines changed

6 files changed

+181
-101
lines changed

snippets/dc-shared-text/weaviate-cli-api.mdx

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
11
Batch process all your records to store structured outputs in a Weaviate database.
22

3-
<iframe
4-
width="560"
5-
height="315"
6-
src="https://www.youtube.com/embed/uqUrH8ksI0I"
7-
title="YouTube video player"
8-
frameborder="0"
9-
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
10-
allowfullscreen
11-
></iframe>
12-
133
You will need:
144

155
import SharedWeaviate from '/snippets/general-shared-text/weaviate.mdx';

snippets/destination_connectors/weaviate.sh.mdx

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,22 +3,48 @@
33

44
# Chunking and embedding are optional.
55

6+
# For Weaviate installed locally:
67
unstructured-ingest \
78
local \
89
--input-path $LOCAL_FILE_INPUT_DIR \
9-
--output-dir $LOCAL_FILE_OUTPUT_DIR \
10+
--partition-by-api \
11+
--api-key $UNSTRUCTURED_API_KEY \
12+
--partition-endpoint $UNSTRUCTURED_API_URL \
13+
--chunking-strategy by_title \
14+
--embedding-provider huggingface \
1015
--strategy hi_res \
11-
--chunk-elements \
16+
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
17+
weaviate-local \
18+
--collection $WEAVIATE_COLLECTION
19+
20+
# For Embedded Weaviate:
21+
unstructured-ingest \
22+
local \
23+
--input-path $LOCAL_FILE_INPUT_DIR \
24+
--partition-by-api \
25+
--api-key $UNSTRUCTURED_API_KEY \
26+
--partition-endpoint $UNSTRUCTURED_API_URL \
27+
--chunking-strategy by_title \
1228
--embedding-provider huggingface \
13-
--num-processes 2 \
14-
--verbose \
15-
--strategy fast \
29+
--strategy hi_res \
30+
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
31+
weaviate-embedded \
32+
--hostname $WEAVIATE_HOST \
33+
--collection $WEAVIATE_COLLECTION
34+
35+
# For Weaviate Cloud:
36+
unstructured-ingest \
37+
local \
38+
--input-path $LOCAL_FILE_INPUT_DIR \
1639
--partition-by-api \
1740
--api-key $UNSTRUCTURED_API_KEY \
1841
--partition-endpoint $UNSTRUCTURED_API_URL \
42+
--chunking-strategy by_title \
43+
--embedding-provider huggingface \
44+
--strategy hi_res \
1945
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
20-
weaviate \
21-
--host-url $WEAVIATE_URL \
46+
weaviate-cloud \
47+
--cluster-url $WEAVIATE_URL \
2248
--api-key $WEAVIATE_API_KEY \
23-
--class-name $WEAVIATE_COLLECTION_CLASS_NAME
49+
--collection $WEAVIATE_COLLECTION
2450
```

snippets/destination_connectors/weaviate.v1.py.mdx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
```python Python Ingest v1
2+
# NOTE: Python Ingest v1 does not provide separate classes for
3+
# Weaviate installed locally, Embedded Weaviate, or Weaviate Cloud.
4+
25
from unstructured_ingest.connector.local import SimpleLocalConfig
36
from unstructured_ingest.connector.weaviate import (
47
SimpleWeaviateConfig,

snippets/destination_connectors/weaviate.v2.py.mdx

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,39 @@ import os
44
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
55
from unstructured_ingest.v2.interfaces import ProcessorConfig
66

7-
from unstructured_ingest.v2.processes.connectors.weaviate import (
8-
WeaviateConnectionConfig,
9-
WeaviateAccessConfig,
10-
WeaviateUploaderConfig,
11-
WeaviateUploadStagerConfig
12-
)
137
from unstructured_ingest.v2.processes.connectors.local import (
148
LocalIndexerConfig,
159
LocalDownloaderConfig,
1610
LocalConnectionConfig
1711
)
12+
1813
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
1914
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
2015
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
2116

22-
# Chunking and embedding are optional.
17+
# For Weaviate installed locally:
18+
# from unstructured_ingest.v2.processes.connectors.weaviate.local import (
19+
# LocalWeaviateConnectionConfig,
20+
# LocalWeaviateAccessConfig,
21+
# LocalWeaviateUploadStagerConfig,
22+
# LocalWeaviateUploaderConfig
23+
# )
24+
25+
# For Embedded Weaviate:
26+
# from unstructured_ingest.v2.processes.connectors.weaviate.embedded import (
27+
# EmbeddedWeaviateConnectionConfig,
28+
# EmbeddedWeaviateAccessConfig,
29+
# EmbeddedWeaviateUploadStagerConfig,
30+
# EmbeddedWeaviateUploaderConfig
31+
# )
32+
33+
# For Weaviate Cloud:
34+
from unstructured_ingest.v2.processes.connectors.weaviate.cloud import (
35+
CloudWeaviateConnectionConfig,
36+
CloudWeaviateAccessConfig,
37+
CloudWeaviateUploaderConfig,
38+
CloudWeaviateUploadStagerConfig
39+
)
2340

2441
if __name__ == "__main__":
2542
Pipeline.from_configs(
@@ -31,23 +48,48 @@ if __name__ == "__main__":
3148
partition_by_api=True,
3249
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
3350
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
34-
strategy="hi_res",
3551
additional_partition_args={
3652
"split_pdf_page": True,
3753
"split_pdf_allow_failed": True,
3854
"split_pdf_concurrency_level": 15
3955
}
4056
),
4157
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
42-
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
43-
destination_connection_config=WeaviateConnectionConfig(
44-
access_config=WeaviateAccessConfig(
58+
embedder_config=EmbedderConfig(
59+
embedding_provider="openai",
60+
embedding_model_name=os.getenv("EMBEDDING_MODEL_NAME"),
61+
embedding_api_key=os.getenv("OPENAI_APIKEY")
62+
),
63+
64+
# For Weaviate installed locally:
65+
# destination_connection_config=LocalWeaviateConnectionConfig(
66+
# access_config=LocalWeaviateAccessConfig()
67+
# ),
68+
# stager_config=LocalWeaviateUploadStagerConfig(),
69+
# uploader_config=LocalWeaviateUploaderConfig(
70+
# collection=os.getenv("WEAVIATE_COLLECTION")
71+
# )
72+
73+
# For Embedded Weaviate:
74+
# destination_connection_config=EmbeddedWeaviateConnectionConfig(
75+
# access_config=EmbeddedWeaviateAccessConfig(),
76+
# hostname=os.getenv("WEAVIATE_HOST")
77+
# ),
78+
# stager_config=EmbeddedWeaviateUploadStagerConfig(),
79+
# uploader_config=EmbeddedWeaviateUploaderConfig(
80+
# collection=os.getenv("WEAVIATE_COLLECTION")
81+
# )
82+
83+
# For Weaviate Cloud:
84+
destination_connection_config=CloudWeaviateConnectionConfig(
85+
access_config=CloudWeaviateAccessConfig(
4586
api_key=os.getenv("WEAVIATE_API_KEY")
4687
),
47-
host_url=os.getenv("WEAVIATE_URL"),
48-
class_name=os.getenv("WEAVIATE_COLLECTION_CLASS_NAME")
88+
cluster_url=os.getenv("WEAVIATE_CLUSTER_URL")
4989
),
50-
stager_config=WeaviateUploadStagerConfig(),
51-
uploader_config=WeaviateUploaderConfig()
90+
stager_config=CloudWeaviateUploadStagerConfig(),
91+
uploader_config=CloudWeaviateUploaderConfig(
92+
collection=os.getenv("WEAVIATE_COLLECTION")
93+
)
5294
).run()
5395
```

snippets/general-shared-text/weaviate-cli-api.mdx

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,17 @@ import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-d
1010

1111
The following environment variables:
1212

13-
- `WEAVIATE_URL` - THE REST endpoint for the Weaviate database cluster, represented by `--host-url` (CLI) or `host_url` (Python).
13+
- For Weaviate installed locally, `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).
14+
- For Embedded Weaviate:
1415

15-
- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).
16+
- `WEAVIATE_HOST` - The connection URL to the instance, represented by `--hostname` (CLI) or `hostname` (Python).
17+
- `WEAVIATE_COLLECTION` - The name of the target collection in the instance, represented by `--collection` (CLI) or `collection` (Python).
1618

17-
<Note>For the CLI, the `--api-key` option here is part of the `weaviate` command. For Python, the `api_key` parameter here is part of the `WeaviateAccessConfig` object.</Note>
19+
- For Weaviate Cloud:
1820

19-
- `WEAVIATE_COLLECTION_CLASS_NAME` - The name of the collection in the database, represented by `--class-name` (CLI) or `class_name` (Python).
21+
- `WEAVIATE_CLUSTER_URL` - The REST endpoint for the Weaviate database cluster, represented by `--cluster-url` (CLI) or `cluster_url` (Python).
22+
- `WEAVIATE_API_KEY` - The API key for the database cluster, represented by `--api-key` (CLI) or `api_key` (Python).
23+
24+
<Note>For the CLI, the `--api-key` option here is part of the `weaviate-cloud` command. For Python, the `api_key` parameter here is part of the `CloudWeaviateAccessConfig` object.</Note>
25+
26+
- `WEAVIATE_COLLECTION` - The name of the target collection in the database, represented by `--collection` (CLI) or `collection` (Python).
Lines changed: 76 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -1,72 +1,84 @@
1-
The Weaviate prerequisites:
1+
The Weaviate prerequisites.
22

3-
<iframe
4-
width="560"
5-
height="315"
6-
src="https://www.youtube.com/embed/Ldb7PZU-pR4"
7-
title="YouTube video player"
8-
frameborder="0"
9-
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
10-
allowfullscreen
11-
></iframe>
3+
- For the [Unstructured Platform](/platform/overview): only [Weaviate Cloud](https://weaviate.io/developers/wcs) clusters are supported.
4+
- For [Unstructured Ingest](/ingestion/overview): Weaviate Cloud clusters,
5+
[Weaviate installed locally](https://weaviate.io/developers/weaviate/quickstart/local),
6+
and [Embedded Weaviate](https://weaviate.io/developers/weaviate/installation/embedded) are supported.
7+
- For Weaviate installed locally, you will need the name of the target collection on the local instance.
8+
- For Embedded Weaviate, you will need the instance's connection URL and the name of the target collection on the instance.
9+
- For Weaviate Cloud, you will need:
1210

13-
1. A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
14-
[Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
11+
- A Weaviate database instance. The following information assumes that you have a Weaviate Cloud (WCD) account with a Weaviate database cluster in that account.
12+
[Create a WCD account](https://weaviate.io/developers/wcs/quickstart#create-a-wcd-account). [Create a database cluster](https://weaviate.io/developers/wcs/quickstart#create-a-weaviate-cluster). For other database options, [learn more](https://weaviate.io/developers/weaviate/installation).
13+
- The name of the target collection in the database. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
14+
- The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).
1515

16-
2. The URL and API key for the database cluster. [Get the URL and API key](https://weaviate.io/developers/wcs/quickstart#explore-the-details-panel).
16+
The following video describes how to set up Weaviate Cloud for Unstructured.
1717

18-
3. A collection in the database cluster. Note the name of the collection, also known as the collection's _class name_. [Create a collection](https://weaviate.io/developers/wcs/tools/collections-tool).
18+
<iframe
19+
width="560"
20+
height="315"
21+
src="https://www.youtube.com/embed/Ldb7PZU-pR4"
22+
title="YouTube video player"
23+
frameborder="0"
24+
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture"
25+
allowfullscreen
26+
></iframe>
1927

20-
The schema of the collection that you use must match the data that Unstructured writes to it. Otherwise, you might get unexpected results or errors.
21-
Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances. This is because these schemas will vary based on
22-
your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
23-
any custom post-processing code that you run; and other factors.
28+
Weaviate requires the collection to have a data schema before you add data. However, you don't have to create a data schema manually.
29+
If you don't provide one, Weaviate generates a schema based on the incoming data.
2430

25-
You can adapt the following collection schema example for your own needs:
31+
If you have specific schema requirements, you can create the schema manually instead.
32+
Unstructured cannot provide a schema that is guaranteed to work for everyone in all circumstances.
33+
This is because these schemas will vary based on
34+
your source files' types; how you want Unstructured to partition, chunk, and generate embeddings;
35+
any custom post-processing code that you run; and other factors.
2636

27-
```json
28-
{
29-
"class": "Elements",
30-
"properties": [
31-
{
32-
"name": "element_id",
33-
"dataType": ["text"]
34-
},
35-
{
36-
"name": "text",
37-
"dataType": ["text"]
38-
},
39-
{
40-
"name": "embeddings",
41-
"dataType": ["number[]"]
42-
},
43-
{
44-
"name": "metadata",
45-
"dataType": ["object"],
46-
"nestedProperties": [
47-
{
48-
"name": "parent_id",
49-
"dataType": ["text"]
50-
},
51-
{
52-
"name": "page_number",
53-
"dataType": ["text"]
54-
},
55-
{
56-
"name": "is_continuation",
57-
"dataType": ["boolean"]
58-
},
59-
{
60-
"name": "orig_elements",
61-
"dataType": ["text"]
62-
}
63-
]
64-
}
65-
]
66-
}
67-
```
68-
69-
See also :
37+
You can adapt the following collection schema example for your own specific schema requirements:
7038

71-
- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
72-
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)
39+
```json
40+
{
41+
"class": "Elements",
42+
"properties": [
43+
{
44+
"name": "element_id",
45+
"dataType": ["text"]
46+
},
47+
{
48+
"name": "text",
49+
"dataType": ["text"]
50+
},
51+
{
52+
"name": "embeddings",
53+
"dataType": ["number[]"]
54+
},
55+
{
56+
"name": "metadata",
57+
"dataType": ["object"],
58+
"nestedProperties": [
59+
{
60+
"name": "parent_id",
61+
"dataType": ["text"]
62+
},
63+
{
64+
"name": "page_number",
65+
"dataType": ["text"]
66+
},
67+
{
68+
"name": "is_continuation",
69+
"dataType": ["boolean"]
70+
},
71+
{
72+
"name": "orig_elements",
73+
"dataType": ["text"]
74+
}
75+
]
76+
}
77+
]
78+
}
79+
```
80+
81+
See also:
82+
83+
- [Collection schema](https://weaviate.io/developers/weaviate/config-refs/schema)
84+
- [Unstructured document elements and metadata](/api-reference/api-services/document-elements)

0 commit comments

Comments
 (0)