Skip to content

Commit ad44446

Browse files
authored
Ingest: IBM watsonx.data destination connector (#563)
1 parent 6d59212 commit ad44446

File tree

7 files changed

+339
-0
lines changed

7 files changed

+339
-0
lines changed

docs.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -356,6 +356,7 @@
356356
"ingestion/destination-connectors/duckdb",
357357
"ingestion/destination-connectors/elasticsearch",
358358
"ingestion/destination-connectors/google-cloud-service",
359+
"ingestion/destination-connectors/ibm-watsonxdata",
359360
"ingestion/destination-connectors/kafka",
360361
"ingestion/destination-connectors/kdbai",
361362
"ingestion/destination-connectors/lancedb",
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
---
2+
title: IBM watsonx.data
3+
---
4+
5+
import NewDocument from '/snippets/general-shared-text/new-document.mdx';
6+
7+
<NewDocument />
8+
9+
import SharedIBMWatsonxdata from '/snippets/dc-shared-text/ibm-watsonxdata-cli-api.mdx';
10+
11+
<SharedIBMWatsonxdata />
12+
13+
Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the ones supported. This example uses the local source connector.
14+
15+
This example sends files to Unstructured for processing by default. To process files locally instead, see the instructions at the end of this page.
16+
17+
import IBMWatsonxdataSh from '/snippets/destination_connectors/ibm_watsonxdata.sh.mdx';
18+
import IBMWatsonxdataPyV2 from '/snippets/destination_connectors/ibm_watsonxdata.v2.py.mdx';
19+
20+
<CodeGroup>
21+
<IBMWatsonxdataSh />
22+
<IBMWatsonxdataPyV2 />
23+
</CodeGroup>
24+
25+
import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
26+
27+
<SharedPartitionByAPIOSS/>
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
Batch process all your records to store structured outputs in IBM watsonx.data.
2+
3+
The requirements are as follows.
4+
5+
import SharedIBMWatsonxdata from '/snippets/general-shared-text/ibm-watsonxdata.mdx';
6+
import SharedIBMWatsonxdataCLIAPI from '/snippets/general-shared-text/ibm-watsonxdata-cli-api.mdx';
7+
8+
<SharedIBMWatsonxdata />
9+
<SharedIBMWatsonxdataCLIAPI />
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
```bash CLI
2+
#!/usr/bin/env bash
3+
4+
# Chunking and embedding are optional.
5+
6+
unstructured-ingest \
7+
local \
8+
--input-path $LOCAL_FILE_INPUT_DIR \
9+
--chunk-elements \
10+
--embedding-provider huggingface \
11+
--partition-by-api \
12+
--api-key $UNSTRUCTURED_API_KEY\
13+
--partition-endpoint $UNSTRUCTURED_API_URL \
14+
--strategy hi_res \
15+
--additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
16+
ibm-watsonx-s3 \
17+
--iam-api-key $IBM_IAM_API_KEY \
18+
--access-key-id $IBM_COS_ACCESS_KEY \
19+
--secret-access-key $IBM_COS_SECRET_ACCESS_KEY \
20+
--iceberg-endpoint $IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT \
21+
--object-storage-endpoint $IBM_COS_BUCKET_PUBLIC_ENDPOINT \
22+
--object-storage-region $IBM_COS_BUCKET_REGION \
23+
--catalog $IBM_ICEBERG_CATALOG \
24+
--namespace $IBM_ICEBERG_SCHEMA \
25+
--table $IBM_ICEBERG_TABLE \
26+
--max-retries 5 \
27+
--record-id-key $IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN
28+
```
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
```python Python Ingest v2
2+
import os
3+
4+
from unstructured_ingest.v2.pipeline.pipeline import Pipeline
5+
from unstructured_ingest.v2.interfaces import ProcessorConfig
6+
7+
from unstructured_ingest.v2.processes.connectors.local import (
8+
LocalIndexerConfig,
9+
LocalDownloaderConfig,
10+
LocalConnectionConfig
11+
)
12+
13+
from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
14+
IbmWatsonxConnectionConfig,
15+
IbmWatsonxAccessConfig,
16+
IbmWatsonxUploadStagerConfig,
17+
IbmWatsonxUploaderConfig
18+
)
19+
20+
from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
21+
from unstructured_ingest.v2.processes.chunker import ChunkerConfig
22+
from unstructured_ingest.v2.processes.embedder import EmbedderConfig
23+
24+
# Chunking and embedding are optional.
25+
26+
if __name__ == "__main__":
27+
Pipeline.from_configs(
28+
context=ProcessorConfig(reprocess=True, clear_cache=True),
29+
indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
30+
downloader_config=LocalDownloaderConfig(),
31+
source_connection_config=LocalConnectionConfig(),
32+
partitioner_config=PartitionerConfig(
33+
partition_by_api=True,
34+
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
35+
partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
36+
additional_partition_args={
37+
"split_pdf_page": True,
38+
"split_pdf_allow_failed": True,
39+
"split_pdf_concurrency_level": 15
40+
}
41+
),
42+
chunker_config=ChunkerConfig(chunking_strategy="by_title"),
43+
embedder_config=EmbedderConfig(embedding_provider="huggingface"),
44+
destination_connection_config=IbmWatsonxConnectionConfig(
45+
access_config=IbmWatsonxAccessConfig(
46+
iam_api_key=os.getenv("IBM_IAM_API_KEY"),
47+
access_key_id=os.getenv("IBM_COS_ACCESS_KEY"),
48+
secret_access_key=os.getenv("IBM_COS_SECRET_ACCESS_KEY")
49+
),
50+
iceberg_endpoint=os.getenv("IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT"),
51+
object_storage_endpoint=os.getenv("IBM_COS_BUCKET_PUBLIC_ENDPOINT"),
52+
object_storage_region=os.getenv("IBM_COS_BUCKET_REGION"),
53+
catalog=os.getenv("IBM_ICEBERG_CATALOG")
54+
),
55+
stager_config=IbmWatsonxUploadStagerConfig(),
56+
uploader_config=IbmWatsonxUploaderConfig(
57+
namespace=os.getenv("IBM_ICEBERG_SCHEMA"),
58+
table=os.getenv("IBM_ICEBERG_TABLE"),
59+
max_retries=5,
60+
record_id_key=os.getenv("IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN")
61+
)
62+
).run()
63+
```
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
The IBM watsonx.data connector dependencies:
2+
3+
```bash CLI, Python
4+
pip install "unstructured-ingest[ibm-watsonx-s3]"
5+
```
6+
7+
import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx';
8+
9+
<AdditionalIngestDependencies />
10+
11+
The following environment variables:
12+
13+
- `IBM_IAM_API_KEY` - An API key for the target IBM Cloud account, represented by `--iam-api-key` (CLI) or `iam_api_key` (Python).
14+
- `IBM_COS_ACCESS_KEY` - An HMAC access key ID for the target IBM Cloud Object Storage (COS) instance, represented by `--access-key-id` (CLI) or `access_key_id` (Python).
15+
- `IBM_COS_SECRET_ACCESS_KEY` - The HMAC secret access key that is associated with the target HMAC access key ID, represented by `--secret-access-key` (CLI) or `secret_access_key` (Python).
16+
- `IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT` - The metastore REST endpoint value for the target Apache Iceberg catalog in the target IBM watsonx.data data store instance, represented by `--iceberg-endpoint` (CLI) or `iceberg_endpoint` (Python). Do not include `https://` in this value.
17+
- `IBM_COS_BUCKET_PUBLIC_ENDPOINT` - The target COS instance's endpoint value, represented by `--object-storage-endpoint` (CLI) or `object_storage_endpoint` (Python).
18+
- `IBM_COS_BUCKET_REGION` - The target COS instance's region short ID, represented by `--object-storage-region` (CLI) or `object_storage_region` (Python).
19+
- `IBM_ICEBERG_CATALOG` - The name of the target Iceberg catalog, represented by `--catalog` (CLI) or `catalog` (Python).
20+
- `IBM_ICEBERG_SCHEMA` - The name of the target namespace (also known as a schema) in the target catalog, represented by `--namespace` (CLI) or `namespace` (Python).
21+
- `IBM_ICEBERG_TABLE` - The name of the target table in the target schema, represented by `--table` (CLI) or `table` (Python).
22+
- `IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN` - The name of the column that uniquely identifies each record in the target table, represented by `--record-id-key` (CLI) or `record_id_key` (Python). The default is `record_id`.
23+
24+
Additionally, `--max-retries` (CLI) or `max_retries` (Python) is an optional parameter that specifies the number of times to retry uploading data. The default is `5`. If specified, it must be a number between `2` and `10`, inclusive.
Lines changed: 187 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,187 @@
1+
- An [IBM Cloud account](https://cloud.ibm.com/login). [Create an IBM Cloud account](https://cloud.ibm.com/registration) if you do not already have one.
2+
- An API key for the IBM Cloud account. If you do not have one already, create one as follows:
3+
4+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
5+
2. In the top navigation bar, click **Manage** and then, under **Security and access**, click **Access (IAM)**.
6+
3. On the sidebar, under **Manage identities**, click **API keys**.
7+
4. With the **View** list showing **My IBM Cloud API keys**, click **Create**.
8+
5. Enter some **Name** and an optional **Description** for the API key.
9+
6. Leave **Leaked action** set to **Disable the leaked key** and **Session creation** set to **No**.
10+
7. Click **Create**.
11+
8. Click **Copy** or **Download** to copy or save the API key to a secure location. You won't be able to access this API key from this screen again.
12+
13+
- An IBM Cloud Object Storage (COS) instance in the account, and a bucket within that instance. If you do not have them already,
14+
create them as follows:
15+
16+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
17+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
18+
top navigation bar.
19+
3. Click **Create resource**.
20+
4. With **IBM Cloud catalog** selected, search for and select **Object Storage**.
21+
5. Complete the on-screen instructions to finish creating the COS instance.
22+
6. With the COS instance's settings page shown, on the **Buckets** tab, click **Create bucket**.
23+
7. Complete the on-screen instructions to finish creating the bucket.
24+
25+
- The name, region, and public endpoint for the target bucket within the target Cloud Object Storage (COS) instance. To get these:
26+
27+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
28+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
29+
top navigation bar.
30+
3. In the list of resources, expand **Storage**, and then click the target COS instance.
31+
4. On the **Buckets** tab, click the target bucket.
32+
5. On the **Configuration** tab, note the following:
33+
34+
- Under **Bucket details**, note the **Bucket name**. This is the bucket's name.
35+
- Under **Bucket details**, note the value inside the parentheses in **Location**, for example `us-east`. This is the bucket's region.
36+
- Under **Endpoints**, note the value of **Public**, for example `s3.us-east.cloud-object-storage.appdomain.cloud`. (Ignore the values of
37+
**Private** and **Direct**.) This is the bucket's public endpoint.
38+
39+
- An HMAC access key ID and secret access key for the target Cloud Object Storage (COS) instance. If you do not have them already,
40+
get or create them as follows:
41+
42+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
43+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
44+
top navigation bar.
45+
3. In the list of resources, expand **Storage**, and then click the target COS instance.
46+
4. On the **Service credentials** tab, if there is a credential that you want to use in the list, expand the credential, and copy the following values to a secure location:
47+
48+
- `access_key_id` under `cos_hmac_keys`, which represents the HMAC access key ID.
49+
- `secret_access_key` under `cos_hmac_keys`, which represents the HMAC secret access key.
50+
51+
After you have copied the preceding values, you have completed this procedure.
52+
53+
5. If there is not a credential that you want to use, or there are no credentials at all, click **New Credential**.
54+
6. Enter some **Name** for the credential.
55+
7. For **Role**, select at least **Writer**, leave **Select Service ID** set to **Auto Generated**,
56+
switch on **Include HMAC Credential**, and then click **Add**.
57+
8. In the list of credentials, expand the credential, and copy the following values to a secure location:
58+
59+
- `access_key_id` under `cos_hmac_keys`, which represents the HMAC access key ID.
60+
- `secret_access_key` under `cos_hmac_keys`, which represents the HMAC secret access key.
61+
62+
- An IBM watsonx.data data store instance in the IBM Cloud account. If you do not have one already, create one as follows:
63+
64+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
65+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
66+
top navigation bar.
67+
3. Click **Create resource**.
68+
4. With **IBM Cloud catalog** selected, search for and select **watsonx.data**.
69+
5. Complete the on-screen instructions to finish creating the watsonx.data data store instance.
70+
71+
- An Apache Iceberg-based catalog within the watsonx.data data store instance. If you do not have one already, create one as follows:
72+
73+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
74+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
75+
top navigation bar.
76+
3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
77+
4. Click **Open web console**.
78+
5. If prompted, log in to the web console.
79+
6. On the sidebar, click **Infrastructure manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the
80+
top navigation bar.
81+
7. Click **Add component**.
82+
8. Under **Storage**, click **IBM Cloud Object Storage**, and then click **Next**.
83+
9. Complete the on-screen instructions to finish creating the Iceberg catalog. This includes providing the following settings:
84+
85+
- Some display name for the component.
86+
- The name of the target bucket within the target Cloud Object Storage (COS) instance that you noted earlier.
87+
- The region for the target bucket, which you noted earlier.
88+
- The public endpoint for the target bucket, which you noted earlier. For this screen only, be sure to prefix the public endpoint with `https://`.
89+
- The HMAC access key ID for the target COS instance, which you noted earlier.
90+
- The HMAC secret access key for the target COS instance, which you noted earlier.
91+
92+
10. Next to **Connection status**, click **Test connection** to test the connection. Do not proceed until **Successful** is shown. If the connection is
93+
not successful, check the values you entered for the target bucket name, region, endpoint, access key, and secret access key, and try again.
94+
11. Check the box labelled **Associate Catalog**.
95+
12. Check the box labelled **Activate now**.
96+
13. Under **Associated catalog**, for **Catalog type**, select **Apache Iceberg**.
97+
14. Enter some **Catalog name**.
98+
15. Click **Associate**.
99+
16. On the sidebar, click **Infrastructure manager**. Make sure the catalog is associated with the appropriate engines. If it is not, rest your mouse
100+
on an unassociated target engine, click the **Manage associations** icon, check the box next to the target catalog's name, and then
101+
click **Save and restart engine**.
102+
103+
To create an engine if one is not already shown, click **Add component**, and follow the on-screen instructions to add an appropriate engine from the list of available **Engines**
104+
(for example, an **IBM Presto** engine).
105+
106+
- The catalog name and metastore REST endpoint for the target Iceberg catalog. To get these:
107+
108+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
109+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
110+
top navigation bar.
111+
3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
112+
4. Click **Open web console**.
113+
5. If prompted, log in to the web console.
114+
6. On the sidebar, click **Infrastructure manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the
115+
top navigation bar.
116+
7. In the **Catalogs** section, click the target Iceberg catalog.
117+
8. On the **Details** tab, note the value of **Name** representing the catalog name, and **Metastore REST endpoint** representing the metastore REST endpoint. (Ignore the **Metastore Thrift endpoint** value.)
118+
119+
- A namespace (also known as a schema) and a table in the target catalog. If you do not have these already, create them as follows:
120+
121+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
122+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
123+
top navigation bar.
124+
3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
125+
4. Click **Open web console**.
126+
5. If prompted, log in to the web console.
127+
6. On the sidebar, click **Data manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the
128+
top navigation bar.
129+
7. On the **Browse data** tab, under **Catalogs associated**, click the target catalog.
130+
8. Click the ellipses, and then click **Create schema**.
131+
9. Enter some **Name** for the schema, and then click **Create**.
132+
10. On the sidebar, click **Query workspace**.
133+
11. In the SQL editor, enter and run a table creation statement such as the following, replacing `<catalog-name>` with the name of the target
134+
catalog and `<schema-name>` with the name of the target schema:
135+
136+
```sql
137+
CREATE TABLE <catalog-name>.<schema-name>.elements (
138+
"type" varchar,
139+
"element_id" varchar,
140+
"text" varchar,
141+
"file_directory" varchar,
142+
"filename" varchar,
143+
"languages" array(varchar),
144+
"last_modified" double,
145+
"page_number" varchar,
146+
"filetype" varchar,
147+
"url" varchar,
148+
"version" varchar,
149+
"record_locator" varchar,
150+
"date_created" double,
151+
"date_modified" double,
152+
"date_processed" double,
153+
"filesize_bytes" bigint,
154+
"points" varchar,
155+
"system" varchar,
156+
"layout_width" bigint,
157+
"layout_height" bigint,
158+
"id" varchar,
159+
"record_id" varchar,
160+
"parent_id" varchar
161+
)
162+
WITH (
163+
delete_mode = 'copy-on-write',
164+
format = 'PARQUET',
165+
format_version = '2'
166+
)
167+
```
168+
169+
Note that incoming elements that do not have matching column
170+
names will be dropped upon record insertion. For example, if the incoming data has an element named `sent_from` and there is no
171+
column named `sent_from` in the table, the `sent_from` element will be dropped upon record insertion. You should modify the preceding
172+
sample table creation statement to add columns for any additional elements that you want to be included upon record
173+
insertion.
174+
175+
- The name of the target namespace (also known as a schema) within the target catalog, and the name of the target table within that schema. To get these:
176+
177+
1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
178+
2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the
179+
top navigation bar.
180+
3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
181+
4. Click **Open web console**.
182+
5. If prompted, log in to the web console.
183+
6. On the sidebar, click **Data manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the
184+
top navigation bar.
185+
7. On the **Browse data** tab, expand the name of the target catalog, and note the names of the target schema and target table.
186+
187+
- The name of the column in the target table that uniquely identifies each of the records in the table.

0 commit comments

Comments
 (0)