Skip to content

Commit 955b7de

Browse files
authored
feat/autogenerate click options (#20)
* Migrate to autogenerated click options from the pydantic Base Models * Add changelog entry * tidy * Add in azure cognitive search * Fix optional bool mapping * Remove unneeded cli config files * Minor fixes
1 parent 61c5cee commit 955b7de

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

85 files changed

+1064
-3066
lines changed

.github/workflows/e2e.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ jobs:
2929
test_src:
3030
strategy:
3131
matrix:
32-
# python-version: ["3.9","3.10"]
33-
python-version: [ "3.10" ]
32+
python-version: ["3.9","3.10"]
3433
runs-on: ubuntu-latest-m
3534
needs: [ setup ]
3635
steps:

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
## 0.0.4-dev1
1+
## 0.0.4-dev2
22

33
### Enhancements
44

55
* **Add Couchbase Destination Connector** Adds support for storing artifacts in Couchbase DB for Vector Search
66
* **Leverage pydantic base models** All user-supplied configs are now derived from pydantic base models to leverage better type checking and add built in support for sensitive fields.
7+
* **Autogenerate click options from base models** Leverage the pydantic base models for all configs to autogenerate the CLI options exposed when running ingest as a CLI.
78

89
## 0.0.3
910

test_e2e/dest/azure-cognitive-search.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
8585
--chunk-new-after-n-chars 1500 \
8686
--chunk-max-characters 2500 \
8787
--chunk-multipage-sections \
88-
--chunk-no-include-orig-elements \
88+
--no-chunk-include-orig-elements \
8989
--embedding-provider "langchain-huggingface" \
9090
azure-cognitive-search \
9191
--key "$AZURE_SEARCH_API_KEY" \

test_e2e/env_setup/couchbase/common/check_cluster_health.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ class ClusterConfig:
1515

1616

1717
def check_bucket_health(cluster_config: ClusterConfig, url: str):
18+
print("checking bucket health")
1819
max_attempts = 20
1920
attempt = 0
2021

@@ -51,6 +52,7 @@ def check_bucket_health(cluster_config: ClusterConfig, url: str):
5152

5253

5354
def check_fts_service_health(cluster_config: ClusterConfig, url: str):
55+
print("Checking FTS service health")
5456
max_attempts = 20
5557
attempt = 0
5658

test_e2e/env_setup/couchbase/destination_connector/ingest_destination_setup_cluster.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,3 +152,4 @@ def setup_cluster(cluster_config: ClusterConfig):
152152
)
153153

154154
setup_cluster(config)
155+
print("Done provisioning couchbase cluster.")

test_e2e/python/test-ingest-couchbase-output.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ def get_client(username, password, connection_string) -> Cluster:
1717
auth = PasswordAuthenticator(username, password)
1818
options = ClusterOptions(auth)
1919
options.apply_profile("wan_development")
20+
print(f"Creating client to {connection_string} with options {options}")
2021
cluster = Cluster(connection_string, options)
2122
cluster.wait_until_ready(timedelta(seconds=5))
2223
return cluster
@@ -66,17 +67,23 @@ def check(ctx, expected_docs):
6667
scope_name = ctx.parent.params["scope"]
6768
collection_name = ctx.parent.params["collection"]
6869

70+
print(
71+
f"Checking that the number of docs match expected "
72+
f"at {bucket_name}.{scope_name}.{collection_name}: {expected_docs}"
73+
)
6974
# Tally up the embeddings
7075
query_result = cluster.query(f"Select * from {bucket_name}.{scope_name}.{collection_name}")
7176
docs = list(query_result)
7277
number_of_docs = len(docs)
7378

7479
# Check that the assertion is true
7580
assert number_of_docs == expected_docs, (
76-
f"Number of rows in generated table ({number_of_docs})"
81+
f"Number of rows in generated table ({number_of_docs}) "
7782
f"doesn't match expected value: {expected_docs}"
7883
)
7984

85+
print("Number of docs matched expected")
86+
8087

8188
@cli.command()
8289
@click.option("--output-json", type=click.File())
@@ -87,7 +94,7 @@ def check_vector(ctx, output_json):
8794
exact_embedding = json_content[0][key_0]["embedding"]
8895
exact_text = json_content[0][key_0]["text"]
8996

90-
print("exact embedding:", len(exact_embedding), exact_embedding)
97+
print("embedding length:", len(exact_embedding))
9198

9299
cluster: Cluster = ctx.obj["cluster"]
93100
bucket_name = ctx.parent.params["bucket"]
@@ -132,6 +139,9 @@ def check_vector(ctx, output_json):
132139
assert not math.isclose(rows[1].score, 1, abs_tol=1e-4)
133140
assert rows[1].fields["text"] != exact_text
134141

142+
print("Embeddings check passed")
143+
135144

136145
if __name__ == "__main__":
146+
print("Validating results")
137147
cli()

test_e2e/src/google-drive.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
4747
--output-dir "$OUTPUT_DIR" \
4848
--verbose \
4949
--drive-id 1OQZ66OHBE30rNsNa7dweGLfRmXvkT_jr \
50-
--service-account-key "$GCP_INGEST_SERVICE_KEY_FILE" \
50+
--service-account-key-path "$GCP_INGEST_SERVICE_KEY_FILE" \
5151
--recursive \
5252
--extensions "pdf,docx" \
5353
--work-dir "$WORK_DIR"

test_e2e/src/local-single-file-chunk-no-orig-elements.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,9 @@ RUN_SCRIPT=${RUN_SCRIPT:-./unstructured_ingest/main.py}
3838
PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
3939
local \
4040
--chunking-strategy by_title \
41-
--chunk-no-include-orig-elements \
41+
--no-chunk-include-orig-elements \
4242
--chunk-max-characters 2000 \
43-
--chunk-no-multipage-sections \
43+
--no-chunk-multipage-sections \
4444
--input-path "$ABS_INPUT_PATH" \
4545
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_created,metadata.data_source.date_modified,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
4646
--num-processes "$max_processes" \

test_e2e/src/salesforce.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ PYTHONPATH=${PYTHONPATH:-.} "$RUN_SCRIPT" \
5050
--download-dir "$DOWNLOAD_DIR" \
5151
--username "$SALESFORCE_USERNAME" \
5252
--consumer-key "$SALESFORCE_CONSUMER_KEY" \
53-
--private-key "$SALESFORCE_PRIVATE_KEY_PATH" \
53+
--private-key-path "$SALESFORCE_PRIVATE_KEY_PATH" \
5454
--metadata-exclude coordinates,filename,file_directory,metadata.data_source.date_processed,metadata.last_modified,metadata.detection_class_prob,metadata.parent_id,metadata.category_depth \
5555
--num-processes "$max_processes" \
5656
--preserve-downloads \

unstructured_ingest/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.4-dev1" # pragma: no cover
1+
__version__ = "0.0.4-dev2" # pragma: no cover

0 commit comments

Comments
 (0)