Skip to content

Commit 5aef696

Browse files
authored
Ingest v2: DuckDB and MotherDuck destination connectors (#418)
1 parent 6cc88a7 commit 5aef696

File tree

16 files changed

+463
-0
lines changed

16 files changed

+463
-0
lines changed
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
title: DuckDB
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentDuckDB from '/snippets/dc-shared-text/duckdb-cli-api.mdx';
10+
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
11+
12+
<SharedContentDuckDB/>
13+
<SharedAPIKeyURL/>
14+
15+
Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector:
16+
17+
import DuckDBAPISh from '/snippets/destination_connectors/duckdb.sh.mdx';
18+
import DuckDBAPIPyV2 from '/snippets/destination_connectors/duckdb.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<DuckDBAPISh />
22+
<DuckDBAPIPyV2 />
23+
</CodeGroup>
24+
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
---
2+
title: MotherDuck
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentMotherDuck from '/snippets/dc-shared-text/motherduck-cli-api.mdx';
10+
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
11+
12+
<SharedContentMotherDuck/>
13+
<SharedAPIKeyURL/>
14+
15+
Now call the Unstructured CLI or Python SDK. The source connector can be any of the ones supported. This example uses the local source connector:
16+
17+
import MotherDuckAPISh from '/snippets/destination_connectors/motherduck.sh.mdx';
18+
import MotherDuckAPIPyV2 from '/snippets/destination_connectors/motherduck.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<MotherDuckAPISh />
22+
<MotherDuckAPIPyV2 />
23+
</CodeGroup>
24+

api-reference/ingest/ingest-dependencies.mdx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ To add support for additional connectors, run the following:
6060
| `pip install "unstructured-ingest[delta-table]"` | Delta Tables |
6161
| `pip install "unstructured-ingest[discord]"` | Discord |
6262
| `pip install "unstructured-ingest[dropbox]"` | Dropbox |
63+
| `pip install "unstructured-ingest[duckdb]"` | DuckDB, MotherDuck |
6364
| `pip install "unstructured-ingest[elasticsearch]"` | Elasticsearch |
6465
| `pip install "unstructured-ingest[gcs]"` | Google Cloud Storage |
6566
| `pip install "unstructured-ingest[github]"` | GitHub |

mint.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@
213213
"open-source/ingest/destination-connectors/databricks-volumes",
214214
"open-source/ingest/destination-connectors/delta-table",
215215
"open-source/ingest/destination-connectors/dropbox",
216+
"open-source/ingest/destination-connectors/duckdb",
216217
"open-source/ingest/destination-connectors/elasticsearch",
217218
"open-source/ingest/destination-connectors/google-cloud-service",
218219
"open-source/ingest/destination-connectors/kafka",
@@ -221,6 +222,7 @@
221222
"open-source/ingest/destination-connectors/local",
222223
"open-source/ingest/destination-connectors/milvus",
223224
"open-source/ingest/destination-connectors/mongodb",
225+
"open-source/ingest/destination-connectors/motherduck",
224226
"open-source/ingest/destination-connectors/onedrive",
225227
"open-source/ingest/destination-connectors/opensearch",
226228
"open-source/ingest/destination-connectors/pinecone",
@@ -372,6 +374,7 @@
372374
"api-reference/ingest/destination-connector/databricks-volumes",
373375
"api-reference/ingest/destination-connector/delta-table",
374376
"api-reference/ingest/destination-connector/dropbox",
377+
"api-reference/ingest/destination-connector/duckdb",
375378
"api-reference/ingest/destination-connector/elasticsearch",
376379
"api-reference/ingest/destination-connector/google-cloud-service",
377380
"api-reference/ingest/destination-connector/kafka",
@@ -380,6 +383,7 @@
380383
"api-reference/ingest/destination-connector/local",
381384
"api-reference/ingest/destination-connector/milvus",
382385
"api-reference/ingest/destination-connector/mongodb",
386+
"api-reference/ingest/destination-connector/motherduck",
383387
"api-reference/ingest/destination-connector/onedrive",
384388
"api-reference/ingest/destination-connector/opensearch",
385389
"api-reference/ingest/destination-connector/pinecone",
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
title: DuckDB
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedDuckDB from '/snippets/dc-shared-text/duckdb-cli-api.mdx';
10+
11+
<SharedDuckDB />
12+
13+
Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector.
14+
15+
This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.
16+
17+
import DuckDBAPISh from '/snippets/destination_connectors/duckdb.sh.mdx';
18+
import DuckDBAPIPyV2 from '/snippets/destination_connectors/duckdb.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<DuckDBAPISh />
22+
<DuckDBAPIPyV2 />
23+
</CodeGroup>
24+
25+
import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
26+
27+
<SharedPartitionByAPIOSS/>
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
title: MotherDuck
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedMotherDuck from '/snippets/dc-shared-text/motherduck-cli-api.mdx';
10+
11+
<SharedMotherDuck />
12+
13+
Now call the Unstructured CLI or Python. The source connector can be any of the ones supported. This example uses the local source connector.
14+
15+
This example sends files to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.
16+
17+
import MotherDuckAPISh from '/snippets/destination_connectors/motherduck.sh.mdx';
18+
import MotherDuckAPIPyV2 from '/snippets/destination_connectors/motherduck.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<MotherDuckAPISh />
22+
<MotherDuckAPIPyV2 />
23+
</CodeGroup>
24+
25+
import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
26+
27+
<SharedPartitionByAPIOSS/>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Batch process all your records to store structured outputs in a DuckDB installation.
2+
3+
The requirements are as follows.
4+
5+
import SharedDuckDB from '/snippets/general-shared-text/duckdb.mdx';
6+
import SharedDuckDBCLIAPI from '/snippets/general-shared-text/duckdb-cli-api.mdx';
7+
8+
<SharedDuckDB />
9+
<SharedDuckDBCLIAPI />
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Batch process all your records to store structured outputs in a MotherDuck account.
2+
3+
The requirements are as follows.
4+
5+
import SharedMotherDuck from '/snippets/general-shared-text/motherduck.mdx';
6+
import SharedMotherDuckCLIAPI from '/snippets/general-shared-text/motherduck-cli-api.mdx';
7+
8+
<SharedMotherDuck />
9+
<SharedMotherDuckCLIAPI />
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
```bash CLI
2+
#!/usr/bin/env bash
3+
4+
# Chunking and embedding are optional.
5+
6+
unstructured-ingest \
7+
local \
8+
--input-path $LOCAL_FILE_INPUT_DIR \
9+
--chunking-strategy by_title \
10+
--embedding-provider huggingface \
11+
--partition-by-api \
12+
--api-key $UNSTRUCTURED_API_KEY \
13+
--partition-endpoint $UNSTRUCTURED_API_URL \
14+
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
15+
duckdb \
16+
--database $DUCKDB_DATABASE \
17+
--db-schema $DUCKDB_DB_SCHEMA \
18+
--table $DUCKDB_TABLE
19+
```
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
```python Python Ingest v2
2+
import os
3+
4+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
5+
from unstructured_ingest.v2.interfaces import ProcessorConfig
6+
7+
from unstructured_ingest.v2.processes.connectors.duckdb.duckdb import (
8+
DuckDBAccessConfig,
9+
DuckDBConnectionConfig,
10+
DuckDBUploadStagerConfig,
11+
DuckDBUploaderConfig
12+
)
13+
from unstructured_ingest.v2.processes.connectors.local import (
14+
LocalIndexerConfig,
15+
LocalConnectionConfig,
16+
LocalDownloaderConfig
17+
)
18+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
19+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
20+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
21+
22+
# Chunking and embedding are optional.
23+
24+
if __name__ == "__main__":
25+
Pipeline.from_configs(
26+
context=ProcessorConfig(),
27+
indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
28+
downloader_config=LocalDownloaderConfig(),
29+
source_connection_config=LocalConnectionConfig(),
30+
partitioner_config=PartitionerConfig(
31+
partition_by_api=True,
32+
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
33+
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
34+
additional_partition_args={
35+
"split_pdf_page": True,
36+
"split_pdf_allow_failed": True,
37+
"split_pdf_concurrency_level": 15
38+
}
39+
),
40+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
41+
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
42+
destination_connection_config=DuckDBConnectionConfig(
43+
access_config=DuckDBAccessConfig(),
44+
database=os.getenv("DUCKDB_DATABASE"),
45+
db_schema=os.getenv("DUCKDB_DB_SCHEMA"),
46+
table=os.getenv("DUCKDB_TABLE")
47+
),
48+
stager_config=DuckDBUploadStagerConfig(),
49+
uploader_config=DuckDBUploaderConfig(batch_size=50)
50+
).run()
51+
```

0 commit comments

Comments
 (0)