Skip to content

Commit b777864

Browse files
feat: Migrate over fsspec connectors (#3066)
### Description Move over all fsspec connectors to the new framework Variety of bug fixes found and fixed in this PR as well: * custom json mixin being used for the enhanced dataclass would break if typing was quoted. That was fixed. A check was also added to the enhanced dataclass to prevent `InitVar` from being used in the root dataclass since this breaks serialization. * hashing for partitioner was using the filename of the raw file being partitioned rather than the file name of the file data generated from indexing. This means that mutliple files could result in the same partition hash when recursive flag is passed in. This was updated to use the file data file name instead. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: rbiseck3 <[email protected]>
1 parent 0e16bf4 commit b777864

File tree

36 files changed

+6187
-4943
lines changed

36 files changed

+6187
-4943
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.14.5-dev0
1+
## 0.14.5-dev1
22

33
### Enhancements
44

Lines changed: 30 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/usr/bin/env python3
2-
import sys
32
import time
43

54
from opensearchpy import OpenSearch
@@ -20,43 +19,42 @@
2019

2120
initial_query = {"query": {"simple_query_string": {"fields": ["text"], "query": EXPECTED_TEXT}}}
2221

23-
for i in range(3):
24-
try:
25-
initial_result = client.search(index="ingest-test-destination", body=initial_query)
26-
initial_embeddings = initial_result["hits"]["hits"][0]["_source"]["embeddings"]
22+
initial_embeddings = None
23+
timeout_s = 9
24+
sleep_s = 1
25+
26+
start = time.time()
27+
found = False
28+
while time.time() - start < timeout_s and not found:
29+
results = client.search(index="ingest-test-destination", body=initial_query)
30+
hits = results["hits"]["hits"]
31+
if hits:
32+
print(f"found results after {time.time() - start}s")
33+
initial_embeddings = hits[0]["_source"]["embeddings"]
34+
found = True
2735
break
28-
except: # noqa: E722
29-
print("Retrying to get initial embeddings")
30-
time.sleep(3)
36+
print(f"Waiting {sleep_s}s before checking again")
37+
time.sleep(sleep_s)
38+
39+
if not found:
40+
raise TimeoutError(
41+
f"timed out after {round(timeout_s, 3)}s trying to get results from opensearch"
42+
)
3143

3244
query = {"size": 1, "query": {"knn": {"embeddings": {"vector": initial_embeddings, "k": 1}}}}
3345

3446
vector_search = client.search(index="ingest-test-destination", body=query)
3547

36-
try:
37-
assert vector_search["hits"]["hits"][0]["_source"]["text"] == EXPECTED_TEXT
38-
print("OpenSearch vector search test was successful.")
39-
except AssertionError:
40-
sys.exit(
41-
"OpenSearch dest check failed:" f"Did not find {EXPECTED_TEXT} in via vector search."
42-
)
48+
found_text = vector_search["hits"]["hits"][0]["_source"]["text"]
49+
assert found_text == EXPECTED_TEXT, (
50+
f"OpenSearch dest check failed: Did not find "
51+
f"{EXPECTED_TEXT} in via vector search, instead: {found_text}."
52+
)
53+
print("OpenSearch vector search test was successful.")
4354

44-
for i in range(3):
45-
try:
46-
count = int(client.count(index="ingest-test-destination")["count"])
47-
assert count == N_ELEMENTS
48-
break
49-
except: # noqa: E722
50-
print("Retrying to get count")
51-
time.sleep(3)
52-
53-
try:
54-
count = int(client.count(index="ingest-test-destination")["count"])
55-
assert count == N_ELEMENTS
56-
except AssertionError:
57-
sys.exit(
58-
"OpenSearch dest check failed:"
59-
f"got {count} items in index, expected {N_ELEMENTS} items in index."
60-
)
55+
count = client.count(index="ingest-test-destination")["count"]
56+
57+
assert int(count) == N_ELEMENTS, "OpenSearch dest check failed:"
58+
f"got {count} items in index, expected {N_ELEMENTS} items in index."
6159

6260
print(f"OpenSearch destination test was successful with {count} items being uploaded.")

0 commit comments

Comments
 (0)