
Commit e8b50fe

Ingest v2: Delta Tables in Databricks destination connector (#447)

1 parent f28b51a commit e8b50fe

14 files changed: +511 −14 lines
api-reference/ingest/destination-connector/databricks-delta-table.mdx

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
---
title: Delta Tables in Databricks
---

<Note>
This article covers connecting Unstructured to Delta Tables in Databricks.

For information about connecting Unstructured to Delta Tables in Amazon S3 instead, see
[Delta Tables in Amazon S3](/api-reference/ingest/destination-connector/delta-table).

For information about connecting Unstructured to Databricks Volumes instead, see
[Databricks Volumes](/api-reference/ingest/destination-connector/databricks-volumes).
</Note>

import NewDocument from '/snippets/general-shared-text/new-document.mdx';

<NewDocument />

import SharedContentDatabricksDeltaTable from '/snippets/dc-shared-text/databricks-delta-table-cli-api.mdx';
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';

<SharedContentDatabricksDeltaTable/>
<SharedAPIKeyURL/>

Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector:

import DatabricksDeltaTableSQLBasedAPISh from '/snippets/destination_connectors/databricks_delta_table_sql_based.sh.mdx';
import DatabricksDeltaTableVolumeBasedAPISh from '/snippets/destination_connectors/databricks_delta_table_volume_based.sh.mdx';
import DatabricksDeltaTableSQLBasedAPIPyV2 from '/snippets/destination_connectors/databricks_delta_table_sql_based.v2.py.mdx';
import DatabricksDeltaTableVolumeBasedAPIPyV2 from '/snippets/destination_connectors/databricks_delta_table_volume_based.v2.py.mdx';

<CodeGroup>
<DatabricksDeltaTableSQLBasedAPISh />
<DatabricksDeltaTableVolumeBasedAPISh />
<DatabricksDeltaTableSQLBasedAPIPyV2 />
<DatabricksDeltaTableVolumeBasedAPIPyV2 />
</CodeGroup>
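
The page above notes that any supported source connector can be swapped in for the local one. As a hedged illustration (not part of this commit), the sketch below shows only the source-side configuration for the Amazon S3 source connector; the class names and parameters are assumed to come from `unstructured_ingest.v2.processes.connectors.fsspec.s3`, so verify them against your installed `unstructured-ingest` version. The Databricks destination-side configuration stays the same as in the SQL-based Python example later in this commit.

```python
# Hedged sketch (not part of this commit): swapping the local source connector
# for the S3 source connector in the same Pipeline.from_configs(...) call.
# Class names are assumed from unstructured_ingest.v2.processes.connectors.fsspec.s3.
import os

from unstructured_ingest.v2.processes.connectors.fsspec.s3 import (
    S3IndexerConfig,
    S3DownloaderConfig,
    S3ConnectionConfig,
    S3AccessConfig,
)

# These three configs replace LocalIndexerConfig, LocalDownloaderConfig, and
# LocalConnectionConfig in the pipeline; the Databricks destination configs are unchanged.
indexer_config = S3IndexerConfig(remote_url=os.getenv("AWS_S3_URL"))
downloader_config = S3DownloaderConfig()
source_connection_config = S3ConnectionConfig(
    access_config=S3AccessConfig(
        key=os.getenv("AWS_ACCESS_KEY_ID"),
        secret=os.getenv("AWS_SECRET_ACCESS_KEY"),
    )
)
```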

api-reference/ingest/destination-connector/databricks-volumes.mdx

Lines changed: 6 additions & 3 deletions
@@ -2,9 +2,12 @@
 title: Databricks Volumes
 ---

-import NewDocument from '/snippets/general-shared-text/new-document.mdx';
-
-<NewDocument />
+<Note>
+This article covers connecting Unstructured to Databricks Volumes.
+
+For information about connecting Unstructured to Delta Tables in Databricks instead, see
+[Delta Tables in Databricks](/api-reference/ingest/destination-connector/databricks-delta-table).
+</Note>

 import SharedContentDatabricksVolumes from '/snippets/dc-shared-text/databricks-volumes-cli-api.mdx';
 import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';

api-reference/ingest/destination-connector/delta-table.mdx

Lines changed: 6 additions & 4 deletions
@@ -1,10 +1,12 @@
 ---
-title: Delta Table
+title: Delta Tables in Amazon S3
 ---

-import NewDocument from '/snippets/general-shared-text/new-document.mdx';
-
-<NewDocument />
+<Note>
+This article covers connecting Unstructured to Delta Tables in Amazon S3. For information about
+connecting Unstructured to Delta Tables in Databricks instead, see
+[Delta Tables in Databricks](/api-reference/ingest/destination-connector/databricks-delta-table).
+</Note>

 import SharedContentDeltaTable from '/snippets/dc-shared-text/delta-table-cli-api.mdx';
 import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';

mint.json

Lines changed: 2 additions & 0 deletions
@@ -212,6 +212,7 @@
 "open-source/ingest/destination-connectors/couchbase",
 "open-source/ingest/destination-connectors/databricks-volumes",
 "open-source/ingest/destination-connectors/delta-table",
+"open-source/ingest/destination-connectors/databricks-delta-table",
 "open-source/ingest/destination-connectors/dropbox",
 "open-source/ingest/destination-connectors/duckdb",
 "open-source/ingest/destination-connectors/elasticsearch",
@@ -374,6 +375,7 @@
 "api-reference/ingest/destination-connector/couchbase",
 "api-reference/ingest/destination-connector/databricks-volumes",
 "api-reference/ingest/destination-connector/delta-table",
+"api-reference/ingest/destination-connector/databricks-delta-table",
 "api-reference/ingest/destination-connector/dropbox",
 "api-reference/ingest/destination-connector/duckdb",
 "api-reference/ingest/destination-connector/elasticsearch",
open-source/ingest/destination-connectors/databricks-delta-table.mdx

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
---
title: Delta Tables in Databricks
---

<Note>
This article covers connecting Unstructured to Delta Tables in Databricks.

For information about connecting Unstructured to Delta Tables in Amazon S3 instead, see
[Delta Tables in Amazon S3](/open-source/ingest/destination-connectors/delta-table).

For information about connecting Unstructured to Databricks Volumes instead, see
[Databricks Volumes](/open-source/ingest/destination-connectors/databricks-volumes).
</Note>

import NewDocument from '/snippets/general-shared-text/new-document.mdx';

<NewDocument />

import SharedDatabricksDeltaTable from '/snippets/dc-shared-text/databricks-delta-table-cli-api.mdx';

<SharedDatabricksDeltaTable />

Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector.

This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.

import DatabricksDeltaTableSQLBasedAPISh from '/snippets/destination_connectors/databricks_delta_table_sql_based.sh.mdx';
import DatabricksDeltaTableVolumeBasedAPISh from '/snippets/destination_connectors/databricks_delta_table_volume_based.sh.mdx';
import DatabricksDeltaTableSQLBasedAPIPyV2 from '/snippets/destination_connectors/databricks_delta_table_sql_based.v2.py.mdx';
import DatabricksDeltaTableVolumeBasedAPIPyV2 from '/snippets/destination_connectors/databricks_delta_table_volume_based.v2.py.mdx';

<CodeGroup>
<DatabricksDeltaTableSQLBasedAPISh />
<DatabricksDeltaTableVolumeBasedAPISh />
<DatabricksDeltaTableSQLBasedAPIPyV2 />
<DatabricksDeltaTableVolumeBasedAPIPyV2 />
</CodeGroup>

import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';

<SharedPartitionByAPIOSS/>
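
For the local-processing path that this page refers to, the partition-by-api-oss.mdx snippet imported above holds the authoritative instructions (its contents are not shown in this diff). As a rough, hedged sketch, in the Python pipeline this generally comes down to configuring PartitionerConfig without the API options; the strategy value below is only an assumed example.

```python
# Hedged sketch (not part of this commit): partitioning locally instead of via
# Unstructured API services. Assumes that partition_by_api=False triggers local
# partitioning, as in other unstructured-ingest v2 examples; follow the
# partition-by-api-oss.mdx snippet for the authoritative steps.
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig

partitioner_config = PartitionerConfig(
    partition_by_api=False,  # run partitioning locally; no API key or endpoint needed
    strategy="hi_res",       # assumed example; choose the strategy that fits your documents
)
```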

open-source/ingest/destination-connectors/databricks-volumes.mdx

Lines changed: 6 additions & 3 deletions
@@ -2,9 +2,12 @@
 title: Databricks Volumes
 ---

-import NewDocument from '/snippets/general-shared-text/new-document.mdx';
-
-<NewDocument />
+<Note>
+This article covers connecting Unstructured to Databricks Volumes.
+
+For information about connecting Unstructured to Delta Tables in Databricks instead, see
+[Delta Tables in Databricks](/open-source/ingest/destination-connectors/databricks-delta-table).
+</Note>

 import SharedDatabricksVolumes from '/snippets/dc-shared-text/databricks-volumes-cli-api.mdx';

open-source/ingest/destination-connectors/delta-table.mdx

Lines changed: 6 additions & 4 deletions
@@ -1,10 +1,12 @@
 ---
-title: Delta Table
+title: Delta Tables in Amazon S3
 ---

-import NewDocument from '/snippets/general-shared-text/new-document.mdx';
-
-<NewDocument />
+<Note>
+This article covers connecting Unstructured to Delta Tables in Amazon S3. For information about
+connecting Unstructured to Delta Tables in Databricks instead, see
+[Delta Tables in Databricks](/open-source/ingest/destination-connectors/databricks-delta-table).
+</Note>

 import SharedDeltaTable from '/snippets/dc-shared-text/delta-table-cli-api.mdx';

snippets/dc-shared-text/databricks-delta-table-cli-api.mdx

Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
Batch process all your records to store structured outputs in a Delta Table in Databricks.

The Delta Tables in Databricks connector provides two implementations:

- The _SQL-based_ implementation enables you to have Unstructured write its processed data directly to a table in Unity Catalog.
  If you use this implementation, you do not need to provide a volume as described in the following requirements.
- The _volume-based_ implementation enables you to have Unstructured write its processed data as a file to a volume in Unity Catalog. Unstructured then
  uses that file in the volume to write the file's data to a table in Unity Catalog. The file remains in the volume.

The requirements are as follows.

import SharedDatabricksDeltaTable from '/snippets/general-shared-text/databricks-delta-table.mdx';
import SharedDatabricksDeltaTableCLIAPI from '/snippets/general-shared-text/databricks-delta-table-cli-api.mdx';

<SharedDatabricksDeltaTable />
<SharedDatabricksDeltaTableCLIAPI />
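
Both implementations described in this snippet end with rows in a Unity Catalog table. As a hedged aside (not part of this commit), one way to confirm that processed records actually landed is to query the target table with the separate databricks-sql-connector package, reusing the same environment variables the connector examples in this commit rely on.

```python
# Hedged sketch (not part of this commit): count rows in the target Delta Table
# using the databricks-sql-connector package (pip install databricks-sql-connector),
# with the same environment variables as the examples below.
import os

from databricks import sql

with sql.connect(
    server_hostname=os.getenv("DATABRICKS_HOST"),
    http_path=os.getenv("DATABRICKS_HTTP_PATH"),
    access_token=os.getenv("DATABRICKS_TOKEN"),
) as connection:
    with connection.cursor() as cursor:
        table = f'{os.getenv("DATABRICKS_CATALOG")}.{os.getenv("DATABRICKS_DATABASE")}.{os.getenv("DATABRICKS_TABLE")}'
        cursor.execute(f"SELECT COUNT(*) FROM {table}")
        print(cursor.fetchone()[0])
```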
snippets/destination_connectors/databricks_delta_table_sql_based.sh.mdx

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
```bash CLI (SQL-Based)
#!/usr/bin/env bash

# Chunking and embedding are optional.

# For authenticating with Databricks personal access tokens:
unstructured-ingest \
  local \
    --input-path $LOCAL_FILE_INPUT_DIR \
    --chunking-strategy by_title \
    --embedding-provider huggingface \
    --partition-by-api \
    --api-key $UNSTRUCTURED_API_KEY \
    --partition-endpoint $UNSTRUCTURED_API_URL \
    --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
  databricks-delta-tables \
    --server-hostname $DATABRICKS_HOST \
    --http-path $DATABRICKS_HTTP_PATH \
    --token $DATABRICKS_TOKEN \
    --catalog $DATABRICKS_CATALOG \
    --database $DATABRICKS_DATABASE \
    --table-name $DATABRICKS_TABLE \
    --record-id-key $DATABRICKS_RECORD_ID_KEY \
    --batch-size 50

# For authenticating with Databricks managed service principals:
unstructured-ingest \
  local \
    --input-path $LOCAL_FILE_INPUT_DIR \
    --chunking-strategy by_title \
    --embedding-provider huggingface \
    --partition-by-api \
    --api-key $UNSTRUCTURED_API_KEY \
    --partition-endpoint $UNSTRUCTURED_API_URL \
    --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
  databricks-delta-tables \
    --server-hostname $DATABRICKS_HOST \
    --http-path $DATABRICKS_HTTP_PATH \
    --client-id $DATABRICKS_CLIENT_ID \
    --client-secret $DATABRICKS_CLIENT_SECRET \
    --catalog $DATABRICKS_CATALOG \
    --database $DATABRICKS_DATABASE \
    --table-name $DATABRICKS_TABLE \
    --record-id-key $DATABRICKS_RECORD_ID_KEY \
    --batch-size 50
```
snippets/destination_connectors/databricks_delta_table_sql_based.v2.py.mdx

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
```python Python Ingest v2 (SQL-Based)
import os

from unstructured_ingest.v2.pipeline.pipeline import Pipeline
from unstructured_ingest.v2.interfaces import ProcessorConfig

from unstructured_ingest.v2.processes.connectors.sql.databricks_delta_tables import (
    DatabrickDeltaTablesConnectionConfig,
    DatabrickDeltaTablesAccessConfig,
    DatabrickDeltaTablesUploadStagerConfig,
    DatabrickDeltaTablesUploaderConfig
)

from unstructured_ingest.v2.processes.connectors.local import (
    LocalIndexerConfig,
    LocalConnectionConfig,
    LocalDownloaderConfig
)

from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
from unstructured_ingest.v2.processes.embedder import EmbedderConfig

# Chunking and embedding are optional.

if __name__ == "__main__":

    Pipeline.from_configs(
        context=ProcessorConfig(),
        indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
        downloader_config=LocalDownloaderConfig(),
        source_connection_config=LocalConnectionConfig(),
        partitioner_config=PartitionerConfig(
            partition_by_api=True,
            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
            additional_partition_args={
                "split_pdf_page": True,
                "split_pdf_allow_failed": True,
                "split_pdf_concurrency_level": 15
            }
        ),
        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
        embedder_config=EmbedderConfig(embedding_provider="huggingface"),

        # For authenticating with Databricks personal access tokens.
        destination_connection_config=DatabrickDeltaTablesConnectionConfig(
            access_config=DatabrickDeltaTablesAccessConfig(
                token=os.getenv("DATABRICKS_TOKEN"),
            ),
            server_hostname=os.getenv("DATABRICKS_HOST"),
            http_path=os.getenv("DATABRICKS_HTTP_PATH")
        ),

        # For authenticating with Databricks managed service principals.
        # destination_connection_config=DatabrickDeltaTablesConnectionConfig(
        #     access_config=DatabrickDeltaTablesAccessConfig(
        #         client_id=os.getenv("DATABRICKS_CLIENT_ID"),
        #         client_secret=os.getenv("DATABRICKS_CLIENT_SECRET")
        #     ),
        #     server_hostname=os.getenv("DATABRICKS_HOST"),
        #     http_path=os.getenv("DATABRICKS_HTTP_PATH")
        # ),

        stager_config=DatabrickDeltaTablesUploadStagerConfig(),
        uploader_config=DatabrickDeltaTablesUploaderConfig(
            catalog=os.getenv("DATABRICKS_CATALOG"),
            database=os.getenv("DATABRICKS_DATABASE"),
            table_name=os.getenv("DATABRICKS_TABLE"),
            record_id_key=os.getenv("DATABRICKS_RECORD_ID_KEY"),
            batch_size=50
        )
    ).run()
```
