Skip to content

Commit 95585c7

Browse files
authored
Ingest v2: Astra DB connectors - add missing parameters/options (#518)
1 parent 2a86090 commit 95585c7

File tree

5 files changed

+45
-19
lines changed

5 files changed

+45
-19
lines changed

snippets/destination_connectors/astradb.sh.mdx

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,7 @@ unstructured-ingest \
1717
astradb \
1818
--api-endpoint $ASTRA_DB_API_ENDPOINT \
1919
--token $ASTRA_DB_APPLICATION_TOKEN \
20+
--collection-name $ASTRA_DB_COLLECTION \
2021
--keyspace $ASTRA_DB_KEYSPACE \
21-
--collection-name $ASTRA_DB_COLLECTION
22+
--flatten-metadata
23+

snippets/destination_connectors/astradb.v2.py.mdx

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,15 +23,15 @@ from unstructured_ingest.v2.processes.embedder import EmbedderConfig
2323

2424
if __name__ == "__main__":
2525
Pipeline.from_configs(
26-
context=ProcessorConfig(),
26+
context=ProcessorConfig(
27+
),
2728
indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
2829
downloader_config=LocalDownloaderConfig(),
2930
source_connection_config=LocalConnectionConfig(),
3031
partitioner_config=PartitionerConfig(
3132
partition_by_api=True,
3233
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
3334
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
34-
strategy="hi_res",
3535
additional_partition_args={
3636
"split_pdf_page": True,
3737
"split_pdf_allow_failed": True,
@@ -46,10 +46,14 @@ if __name__ == "__main__":
4646
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
4747
)
4848
),
49-
stager_config=AstraDBUploadStagerConfig(),
49+
stager_config=AstraDBUploadStagerConfig(
50+
flatten_metadata=True
51+
),
5052
uploader_config=AstraDBUploaderConfig(
53+
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
5154
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
52-
collection_name=os.getenv("ASTRA_DB_COLLECTION")
55+
batch_size=20,
56+
record_id_key="record_id"
5357
)
5458
).run()
5559
```

snippets/general-shared-text/astradb-cli-api.mdx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,12 @@ These environment variables:
1313
- `ASTRA_DB_API_ENDPOINT` - The API endpoint for the Astra DB database, represented by `--api-endpoint` (CLI) or `api_endpoint` (Python). To get the endpoint, see the **Database Details > API Endpoint** value on your database's **Overview** tab.
1414
- `ASTRA_DB_APPLICATION_TOKEN` - The database application token value for the database, represented by `--token` (CLI) or `token` (Python). To get the token, see the **Database Details > Application Tokens** box on your database's **Overview** tab.
1515
- `ASTRA_DB_KEYSPACE` - The name of the keyspace for the database, represented by `--keyspace` (CLI) or `keyspace` (Python).
16-
- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).
16+
- `ASTRA_DB_COLLECTION` - The name of the collection for the keyspace, represented by `--collection-name` (CLI) or `collection_name` (Python).
17+
18+
Additional settings include:
19+
20+
- For the source connector only, `--fields` (CLI) or `fields` (Python): Optionally, a comma-separated list (CLI) or an array of strings (Python) of fields
21+
to include in the output. The default is to include all fields, if not otherwise specified.
22+
- For the destination connector only, `--flatten-metadata` (CLI) or `flatten_metadata=True` (Python): Optionally, whether to "flatten" the metadata. Specifically, the metadata key values are
23+
brought to the top level of the element, and the `metadata` key itself is removed. To not flatten the metadata (the default), specify `--no-flatten-metadata` (CLI) or
24+
`flatten_metadata=False` (Python). The default is to not flatten the metadata if not otherwise specified.

snippets/source_connectors/astradb.sh.mdx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@ unstructured-ingest \
55
astradb \
66
--api-endpoint $ASTRA_DB_API_ENDPOINT \
77
--token $ASTRA_DB_APPLICATION_TOKEN \
8-
--keyspace $ASTRA_DB_KEYSPACE \
98
--collection-name $ASTRA_DB_COLLECTION \
9+
--keyspace $ASTRA_DB_KEYSPACE \
10+
--fields record_id,content \
1011
--download-dir $LOCAL_FILE_DOWNLOAD_DIR \
1112
--partition-by-api \
1213
--api-key $UNSTRUCTURED_API_KEY \
1314
--partition-endpoint $UNSTRUCTURED_API_URL \
14-
--strategy hi_res \
1515
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}"
1616
```

snippets/source_connectors/astradb.v2.py.mdx

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,18 @@ import os
33

44
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
55
from unstructured_ingest.v2.interfaces import ProcessorConfig
6+
67
from unstructured_ingest.v2.processes.connectors.astradb import (
7-
AstraDBAccessConfig,
8-
AstraDBConnectionConfig,
9-
AstraDBDownloaderConfig,
108
AstraDBIndexerConfig,
9+
AstraDBDownloaderConfig,
10+
AstraDBConnectionConfig,
11+
AstraDBAccessConfig
1112
)
13+
14+
from unstructured_ingest.v2.processes.connectors.local import LocalConnectionConfig
1215
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
13-
from unstructured_ingest.v2.processes.connectors.local import LocalUploaderConfig
16+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
17+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
1418

1519
# Chunking and embedding are optional.
1620

@@ -19,23 +23,31 @@ if __name__ == "__main__":
1923
context=ProcessorConfig(),
2024
indexer_config=AstraDBIndexerConfig(
2125
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
22-
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
26+
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
27+
batch_size=20
2328
),
2429
downloader_config=AstraDBDownloaderConfig(
25-
collection_name=os.getenv("ASTRA_DB_COLLECTION"),
26-
keyspace=os.getenv("ASTRA_DB_KEYSPACE"),
30+
download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR"),
31+
fields=["record_id", "content"]
2732
),
2833
source_connection_config=AstraDBConnectionConfig(
2934
access_config=AstraDBAccessConfig(
30-
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN"),
3135
api_endpoint=os.getenv("ASTRA_DB_API_ENDPOINT"),
32-
),
36+
token=os.getenv("ASTRA_DB_APPLICATION_TOKEN")
37+
)
3338
),
3439
partitioner_config=PartitionerConfig(
3540
partition_by_api=True,
36-
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
3741
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
42+
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
43+
additional_partition_args={
44+
"split_pdf_page": True,
45+
"split_pdf_allow_failed": True,
46+
"split_pdf_concurrency_level": 15
47+
}
3848
),
39-
uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR")),
49+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
50+
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
51+
destination_connection_config=LocalConnectionConfig()
4052
).run()
4153
```

0 commit comments

Comments
 (0)