Skip to content

Commit b2d0338

Browse files
authored
PostgreSQL v2 API source connector (#299)
1 parent 4d8f02c commit b2d0338

File tree

6 files changed

+140
-0
lines changed

6 files changed

+140
-0
lines changed
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
---
2+
title: PostgreSQL
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx';
10+
import SharedAPIKeyURL from '/snippets/general-shared-text/api-key-url.mdx';
11+
12+
<SharedContentPostgreSQL/>
13+
<SharedAPIKeyURL/>
14+
15+
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector:
16+
17+
import PostgreSQLAPISh from '/snippets/source_connectors/postgresql.sh.mdx';
18+
import PostgreSQLAPIPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<PostgreSQLAPISh />
22+
<PostgreSQLAPIPyV2 />
23+
</CodeGroup>

mint.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@
168168
"open-source/ingest/source-connectors/one-drive",
169169
"open-source/ingest/source-connectors/opensearch",
170170
"open-source/ingest/source-connectors/outlook",
171+
"open-source/ingest/source-connectors/postgresql",
171172
"open-source/ingest/source-connectors/reddit",
172173
"open-source/ingest/source-connectors/s3",
173174
"open-source/ingest/source-connectors/salesforce",
@@ -321,9 +322,11 @@
321322
"api-reference/ingest/source-connectors/one-drive",
322323
"api-reference/ingest/source-connectors/opensearch",
323324
"api-reference/ingest/source-connectors/outlook",
325+
"api-reference/ingest/source-connectors/postgresql",
324326
"api-reference/ingest/source-connectors/reddit",
325327
"api-reference/ingest/source-connectors/s3",
326328
"api-reference/ingest/source-connectors/salesforce",
329+
"api-reference/ingest/source-connectors/slack",
327330
"api-reference/ingest/source-connectors/sftp",
328331
"api-reference/ingest/source-connectors/sharepoint",
329332
"api-reference/ingest/source-connectors/slack",
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
---
2+
title: PostgreSQL
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedContentPostgreSQL from '/snippets/sc-shared-text/postgresql-cli-api.mdx';
10+
11+
<SharedContentPostgreSQL/>
12+
13+
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The destination connector can be any of the ones supported. This example uses the local destination connector:
14+
15+
This example sends data to Unstructured API services for processing by default. To process files locally instead, see the instructions at the end of this page.
16+
17+
import PostgreSQLSh from '/snippets/source_connectors/postgresql.sh.mdx';
18+
import PostgreSQLPyV2 from '/snippets/source_connectors/postgresql.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<PostgreSQLSh />
22+
<PostgreSQLPyV2 />
23+
</CodeGroup>
24+
25+
import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
26+
27+
<SharedPartitionByAPIOSS/>
28+
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Connect PostgreSQL to your preprocessing pipeline, and use the Unstructured Ingest CLI or the Unstructured Ingest Python library to batch process all your documents and store structured outputs locally on your filesystem.
2+
3+
You will need:
4+
5+
import SharedPostgreSQL from '/snippets/general-shared-text/postgresql.mdx';
6+
import SharedPostgreSQLCLIAPI from '/snippets/general-shared-text/postgresql-cli-api.mdx';
7+
8+
<SharedPostgreSQL />
9+
<SharedPostgreSQLCLIAPI />
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
```bash CLI
2+
#!/usr/bin/env bash
3+
4+
# Chunking and embedding are optional.
5+
6+
unstructured-ingest \
7+
postgres \
8+
--host $PGHOST \
9+
--port $PGPORT \
10+
--database $PGDATABASE \
11+
--table-name elements \
12+
--id-column id \
13+
--username $PGUSER \
14+
--password $PGPASSWORD \
15+
--download-dir $LOCAL_FILE_DOWNLOAD_DIR \
16+
--chunking-strategy by_title \
17+
--embedding-provider huggingface \
18+
--output-dir $LOCAL_FILE_OUTPUT_DIR \
19+
--partition-by-api \
20+
--api-key $UNSTRUCTURED_API_KEY \
21+
--partition-endpoint $UNSTRUCTURED_API_URL
22+
```
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
```python Python Ingest v2
2+
import os
3+
4+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
5+
from unstructured_ingest.v2.interfaces import ProcessorConfig
6+
7+
from unstructured_ingest.v2.processes.connectors.sql.postgres import (
8+
PostgresIndexerConfig,
9+
PostgresDownloaderConfig,
10+
PostgresConnectionConfig,
11+
PostgresAccessConfig
12+
)
13+
14+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
15+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
16+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
17+
18+
from unstructured_ingest.v2.processes.connectors.local import (
19+
LocalConnectionConfig,
20+
LocalUploaderConfig
21+
)
22+
23+
# Chunking and embedding are optional.
24+
25+
if __name__ == "__main__":
26+
Pipeline.from_configs(
27+
context=ProcessorConfig(),
28+
indexer_config=PostgresIndexerConfig(
29+
table_name="elements",
30+
id_column="id"
31+
),
32+
downloader_config=PostgresDownloaderConfig(download_dir=os.getenv("LOCAL_FILE_DOWNLOAD_DIR")),
33+
source_connection_config=PostgresConnectionConfig(
34+
access_config=PostgresAccessConfig(password=os.getenv("PGPASSWORD")),
35+
host=os.getenv("PGHOST"),
36+
port=os.getenv("PGPORT"),
37+
username=os.getenv("PGUSER"),
38+
database=os.getenv("PGDATABASE")
39+
),
40+
partitioner_config=PartitionerConfig(
41+
partition_by_api=True,
42+
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
43+
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
44+
additional_partition_args={
45+
"split_pdf_page": True,
46+
"split_pdf_allow_failed": True,
47+
"split_pdf_concurrency_level": 15
48+
}
49+
),
50+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
51+
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
52+
destination_connection_config=LocalConnectionConfig(),
53+
uploader_config=LocalUploaderConfig(output_dir=os.getenv("LOCAL_FILE_OUTPUT_DIR"))
54+
).run()
55+
```

0 commit comments

Comments
 (0)