Skip to content

Commit 0fe0f15

Browse files
authored
feat: migrate weaviate connector to new framework (#3160)
### Description Add the Weaviate output connector to those supported in the new v2 ingest framework. Some fixes were needed to the upload stager step, as this was the first connector moved over that leverages this part of the pipeline.
1 parent a883fc9 commit 0fe0f15

File tree

13 files changed

+420
-22
lines changed

13 files changed

+420
-22
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.5-dev6
1+
## 0.14.5-dev7
22

33
### Enhancements
44

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.14.5-dev6" # pragma: no cover
1+
__version__ = "0.14.5-dev7" # pragma: no cover

unstructured/ingest/v2/cli/base/cmd.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ def get_pipline(
7474
f"setting destination on pipeline {dest} with options: {destination_options}"
7575
)
7676
if uploader_stager := self.get_upload_stager(dest=dest, options=destination_options):
77-
pipeline_kwargs["upload_stager"] = uploader_stager
77+
pipeline_kwargs["stager"] = uploader_stager
7878
pipeline_kwargs["uploader"] = self.get_uploader(dest=dest, options=destination_options)
7979
else:
8080
# Default to local uploader
@@ -148,7 +148,7 @@ def get_upload_stager(dest: str, options: dict[str, Any]) -> Optional[UploadStag
148148
dest_entry = destination_registry[dest]
149149
upload_stager_kwargs: dict[str, Any] = {}
150150
if upload_stager_config_cls := dest_entry.upload_stager_config:
151-
upload_stager_kwargs["config"] = extract_config(
151+
upload_stager_kwargs["upload_stager_config"] = extract_config(
152152
flat_data=options, config=upload_stager_config_cls
153153
)
154154
if upload_stager_cls := dest_entry.upload_stager:

unstructured/ingest/v2/cli/cmds/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from .fsspec.s3 import s3_dest_cmd, s3_src_cmd
1010
from .fsspec.sftp import sftp_dest_cmd, sftp_src_cmd
1111
from .local import local_dest_cmd, local_src_cmd
12+
from .weaviate import weaviate_dest_cmd
1213

1314
src_cmds = [
1415
azure_src_cmd,
@@ -37,6 +38,7 @@
3738
local_dest_cmd,
3839
s3_dest_cmd,
3940
sftp_dest_cmd,
41+
weaviate_dest_cmd,
4042
]
4143

4244
duplicate_dest_names = [
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
from dataclasses import dataclass
2+
3+
import click
4+
5+
from unstructured.ingest.v2.cli.base import DestCmd
6+
from unstructured.ingest.v2.cli.interfaces import CliConfig
7+
from unstructured.ingest.v2.cli.utils import DelimitedString
8+
from unstructured.ingest.v2.processes.connectors.weaviate import CONNECTOR_TYPE
9+
10+
11+
@dataclass
12+
class WeaviateCliConnectionConfig(CliConfig):
13+
@staticmethod
14+
def get_cli_options() -> list[click.Option]:
15+
options = [
16+
click.Option(
17+
["--host-url"],
18+
required=True,
19+
help="Weaviate instance url",
20+
),
21+
click.Option(
22+
["--class-name"],
23+
default=None,
24+
type=str,
25+
help="Name of the class to push the records into, e.g: Pdf-elements",
26+
),
27+
click.Option(
28+
["--access-token"], default=None, type=str, help="Used to create the bearer token."
29+
),
30+
click.Option(
31+
["--refresh-token"],
32+
default=None,
33+
type=str,
34+
help="Will tie this value to the bearer token. If not provided, "
35+
"the authentication will expire once the lifetime of the access token is up.",
36+
),
37+
click.Option(
38+
["--api-key"],
39+
default=None,
40+
type=str,
41+
),
42+
click.Option(
43+
["--client-secret"],
44+
default=None,
45+
type=str,
46+
),
47+
click.Option(
48+
["--scope"],
49+
default=None,
50+
type=DelimitedString(),
51+
),
52+
click.Option(
53+
["--username"],
54+
default=None,
55+
type=str,
56+
),
57+
click.Option(
58+
["--password"],
59+
default=None,
60+
type=str,
61+
),
62+
click.Option(
63+
["--anonymous"],
64+
is_flag=True,
65+
default=False,
66+
type=bool,
67+
help="if set, all auth values will be ignored",
68+
),
69+
]
70+
return options
71+
72+
73+
@dataclass
74+
class WeaviateCliUploaderConfig(CliConfig):
75+
@staticmethod
76+
def get_cli_options() -> list[click.Option]:
77+
options = [
78+
click.Option(
79+
["--batch-size"],
80+
default=100,
81+
type=int,
82+
help="Number of records per batch",
83+
)
84+
]
85+
return options
86+
87+
88+
@dataclass
89+
class WeaviateCliUploadStagerConfig(CliConfig):
90+
@staticmethod
91+
def get_cli_options() -> list[click.Option]:
92+
return []
93+
94+
95+
weaviate_dest_cmd = DestCmd(
96+
cmd_name=CONNECTOR_TYPE,
97+
connection_config=WeaviateCliConnectionConfig,
98+
uploader_config=WeaviateCliUploaderConfig,
99+
upload_stager_config=WeaviateCliUploadStagerConfig,
100+
)

unstructured/ingest/v2/cli/interfaces.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ def add_params(cmd: click.Command, params: list[click.Parameter]):
1919
existing_opts = []
2020
for param in cmd.params:
2121
existing_opts.extend(param.opts)
22-
2322
for param in params:
2423
for opt in param.opts:
2524
if opt in existing_opts:

unstructured/ingest/v2/interfaces/connector.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from abc import ABC
22
from dataclasses import dataclass
3-
from typing import Any, Optional, TypeVar
3+
from typing import Any, TypeVar
44

5-
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin, enhanced_field
5+
from unstructured.ingest.enhanced_dataclass import EnhancedDataClassJsonMixin
66

77

88
@dataclass
@@ -16,7 +16,7 @@ class AccessConfig(EnhancedDataClassJsonMixin):
1616

1717
@dataclass
1818
class ConnectionConfig(EnhancedDataClassJsonMixin):
19-
access_config: Optional[AccessConfigT] = enhanced_field(sensitive=True, default=None)
19+
access_config: AccessConfigT
2020

2121
def get_access_config(self) -> dict[str, Any]:
2222
if not self.access_config:
@@ -29,4 +29,4 @@ def get_access_config(self) -> dict[str, Any]:
2929

3030
@dataclass
3131
class BaseConnector(ABC):
32-
connection_config: Optional[ConnectionConfigT] = None
32+
connection_config: ConnectionConfigT

unstructured/ingest/v2/interfaces/downloader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from dataclasses import dataclass, field
2+
from dataclasses import dataclass
33
from pathlib import Path
44
from typing import Any, Optional, TypeVar
55

@@ -19,7 +19,7 @@ class DownloaderConfig(EnhancedDataClassJsonMixin):
1919

2020
class Downloader(BaseProcess, BaseConnector, ABC):
2121
connector_type: str
22-
download_config: Optional[DownloaderConfigT] = field(default_factory=DownloaderConfig)
22+
download_config: DownloaderConfigT
2323

2424
@property
2525
def download_dir(self) -> Path:

unstructured/ingest/v2/interfaces/upload_stager.py

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,28 @@ class UploadStager(BaseProcess, ABC):
2121
upload_stager_config: Optional[UploadStagerConfigT] = None
2222

2323
@abstractmethod
24-
def run(self, elements_filepath: Path, file_data: FileData, **kwargs: Any) -> Path:
24+
def run(
25+
self,
26+
elements_filepath: Path,
27+
file_data: FileData,
28+
output_dir: Path,
29+
output_filename: str,
30+
**kwargs: Any
31+
) -> Path:
2532
pass
2633

27-
async def run_async(self, elements_filepath: Path, file_data: FileData, **kwargs: Any) -> Path:
28-
return self.run(elements_filepath=elements_filepath, file_data=file_data, **kwargs)
34+
async def run_async(
35+
self,
36+
elements_filepath: Path,
37+
file_data: FileData,
38+
output_dir: Path,
39+
output_filename: str,
40+
**kwargs: Any
41+
) -> Path:
42+
return self.run(
43+
elements_filepath=elements_filepath,
44+
output_dir=output_dir,
45+
output_filename=output_filename,
46+
file_data=file_data,
47+
**kwargs
48+
)

unstructured/ingest/v2/interfaces/uploader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from abc import ABC, abstractmethod
2-
from dataclasses import dataclass, field
2+
from dataclasses import dataclass
33
from pathlib import Path
44
from typing import Any, TypeVar
55

@@ -25,7 +25,7 @@ class UploadContent:
2525

2626
@dataclass
2727
class Uploader(BaseProcess, BaseConnector, ABC):
28-
upload_config: UploaderConfigT = field(default_factory=UploaderConfig)
28+
upload_config: UploaderConfigT
2929

3030
def is_async(self) -> bool:
3131
return False

0 commit comments

Comments
 (0)