Skip to content

Commit 7b61ad1

Browse files
fix: Weaviate Collection naming conventions fix (#489)
* Fix Weaviate uploader collection naming mechanism
* Integration test Weaviate collection name
* Update changelog and version; Weaviate collection names fix
1 parent 42da4fa commit 7b61ad1

File tree

5 files changed

+142
-16
lines changed

5 files changed

+142
-16
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 1.0.20
2+
3+
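* **Fix Weaviate connector lower-casing collection names while formatting them to match Weaviate's collection naming conventions; only the first character is now upper-cased and the rest of the name keeps its original case**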
4+
15
## 1.0.19
26

37
* **Fix databricks delta table name edge cases**

test/integration/connectors/weaviate/test_local.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,8 +141,8 @@ def test_weaviate_local_create_destination(weaviate_instance):
141141
upload_config=LocalWeaviateUploaderConfig(),
142142
connection_config=LocalWeaviateConnectionConfig(),
143143
)
144-
collection_name = "system_created-123"
145-
formatted_collection_name = "System_created_123"
144+
collection_name = "system_CREATED-123"
145+
formatted_collection_name = "System_CREATED_123"
146146
created = uploader.create_destination(destination_name=collection_name)
147147
assert created
148148
with uploader.connection_config.get_client() as weaviate_client:
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
from unittest.mock import MagicMock
2+
3+
import pytest
4+
from pydantic import Secret
5+
6+
from unstructured_ingest.processes.connectors.weaviate.weaviate import (
7+
WeaviateAccessConfig,
8+
WeaviateConnectionConfig,
9+
WeaviateUploader,
10+
WeaviateUploaderConfig,
11+
)
12+
13+
14+
class WeaviateConnectionConfigTest(WeaviateConnectionConfig):
15+
def get_client(self):
16+
yield MagicMock()
17+
18+
19+
@pytest.fixture
20+
def access_config():
21+
return WeaviateAccessConfig()
22+
23+
24+
@pytest.fixture
25+
def connection_config(access_config: WeaviateAccessConfig):
26+
return WeaviateConnectionConfigTest(
27+
access_config=Secret(access_config),
28+
init_timeout=10,
29+
query_timeout=10,
30+
insert_timeout=10,
31+
)
32+
33+
34+
@pytest.fixture
35+
def uploader_config():
36+
return WeaviateUploaderConfig(collection=None)
37+
38+
39+
@pytest.fixture
40+
def uploader(
41+
connection_config: WeaviateConnectionConfigTest, uploader_config: WeaviateUploaderConfig
42+
):
43+
return WeaviateUploader(
44+
connection_config=connection_config,
45+
upload_config=uploader_config,
46+
connector_type="weaviate",
47+
)
48+
49+
50+
@pytest.mark.parametrize(
51+
("destination_name", "expected"),
52+
[
53+
("t", "T"),
54+
("test123", "Test123"),
55+
("test__ __", "Test_____"),
56+
("test-Name", "Test_Name"),
57+
("teSt name", "TeSt_name"),
58+
("test@name#123", "Test_name_123"),
59+
],
60+
)
61+
def test_format_destination_name_success_logs(
62+
caplog: pytest.LogCaptureFixture,
63+
uploader: WeaviateUploader,
64+
destination_name: str,
65+
expected: str,
66+
):
67+
formatted_name = uploader.format_destination_name(destination_name)
68+
assert formatted_name == expected
69+
assert len(caplog.records) == 1
70+
assert (
71+
f"Given Collection name '{destination_name}' doesn't follow naming conventions. "
72+
f"Renaming to '{expected}'"
73+
) in caplog.text
74+
75+
76+
@pytest.mark.parametrize(
77+
("destination_name", "expected"),
78+
[
79+
("T", "T"),
80+
("Test_123", "Test_123"),
81+
("TEST_NAME", "TEST_NAME"),
82+
],
83+
)
84+
def test_format_destination_name_success_no_logs(
85+
caplog: pytest.LogCaptureFixture,
86+
uploader: WeaviateUploader,
87+
destination_name: str,
88+
expected: str,
89+
):
90+
formatted_name = uploader.format_destination_name(destination_name)
91+
assert formatted_name == expected
92+
assert len(caplog.records) == 0
93+
94+
95+
@pytest.mark.parametrize(
96+
("destination_name"),
97+
[
98+
("123name"),
99+
("@#$%^&*"),
100+
(""),
101+
],
102+
)
103+
def test_format_destination_name_error(uploader: WeaviateUploader, destination_name: str):
104+
with pytest.raises(ValueError):
105+
uploader.format_destination_name(destination_name)

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "1.0.19" # pragma: no cover
1+
__version__ = "1.0.20" # pragma: no cover

unstructured_ingest/processes/connectors/weaviate/weaviate.py

Lines changed: 30 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -234,34 +234,51 @@ def init(self, **kwargs: Any) -> None:
234234
self.create_destination(**kwargs)
235235

236236
def format_destination_name(self, destination_name: str) -> str:
237-
# Weaviate naming requirements:
238-
# must be alphanumeric and underscores only
237+
"""
238+
Weaviate Collection naming conventions:
239+
1. must begin with an uppercase letter
240+
2. must be alphanumeric and underscores only
241+
"""
242+
243+
# Check that the first character is an ASCII letter (a lowercase letter is accepted and upper-cased below)
244+
if not re.match(r"^[a-zA-Z]", destination_name):
245+
raise ValueError("Collection name must start with an uppercase letter")
246+
# Replace all non-alphanumeric characters with underscores
239247
formatted = re.sub(r"[^a-zA-Z0-9]", "_", destination_name)
240-
# must begin with capital letter
241-
return formatted.capitalize()
248+
# Make the first character uppercase and leave the rest as is
249+
if len(formatted) == 1:
250+
formatted = formatted.capitalize()
251+
else:
252+
formatted = formatted[0].capitalize() + formatted[1:]
253+
if formatted != destination_name:
254+
logger.warning(
255+
f"Given Collection name '{destination_name}' doesn't follow naming conventions. "
256+
f"Renaming to '{formatted}'"
257+
)
258+
return formatted
242259

243260
def create_destination(
244261
self,
245-
destination_name: str = "unstructuredautocreated",
262+
destination_name: str = "Unstructuredautocreated",
246263
vector_length: Optional[int] = None,
247264
**kwargs: Any,
248265
) -> bool:
249266
collection_name = self.upload_config.collection or destination_name
250267
collection_name = self.format_destination_name(collection_name)
251268
self.upload_config.collection = collection_name
252269

253-
connectors_dir = Path(__file__).parents[1]
254-
collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
255-
with collection_config_file.open() as f:
256-
collection_config = json.load(f)
257-
collection_config["class"] = collection_name
258-
259270
if not self._collection_exists():
260-
logger.info(f"creating weaviate collection '{collection_name}' with default configs")
271+
connectors_dir = Path(__file__).parents[1]
272+
collection_config_file = connectors_dir / "assets" / "weaviate_collection_config.json"
273+
with collection_config_file.open() as f:
274+
collection_config = json.load(f)
275+
collection_config["class"] = collection_name
276+
277+
logger.info(f"Creating weaviate collection '{collection_name}' with default configs")
261278
with self.connection_config.get_client() as weaviate_client:
262279
weaviate_client.collections.create_from_dict(config=collection_config)
263280
return True
264-
logger.debug(f"collection with name '{collection_name}' already exists, skipping creation")
281+
logger.debug(f"Collection with name '{collection_name}' already exists, skipping creation")
265282
return False
266283

267284
def check_for_errors(self, client: "WeaviateClient") -> None:

0 commit comments

Comments
 (0)