Ingest: IBM watsonx.data destination connector (#563)

Paul-Cornell · web-flow · commit ad444466608d · 2025-03-26T13:45:35.000-07:00
diff --git a/docs.json b/docs.json
@@ -356,6 +356,7 @@
                   "ingestion/destination-connectors/duckdb",
                   "ingestion/destination-connectors/elasticsearch",
                   "ingestion/destination-connectors/google-cloud-service",
+                  "ingestion/destination-connectors/ibm-watsonxdata",
                   "ingestion/destination-connectors/kafka",
                   "ingestion/destination-connectors/kdbai",
                   "ingestion/destination-connectors/lancedb",
diff --git a/ingestion/destination-connectors/ibm-watsonxdata.mdx b/ingestion/destination-connectors/ibm-watsonxdata.mdx
@@ -0,0 +1,27 @@
+---
+title: IBM watsonx.data
+---
+
+import NewDocument from '/snippets/general-shared-text/new-document.mdx';
+
+<NewDocument />
+
+import SharedIBMWatsonxdata from '/snippets/dc-shared-text/ibm-watsonxdata-cli-api.mdx';
+
+<SharedIBMWatsonxdata />
+
+Now call the Unstructured Ingest CLI or the Unstructured Ingest Python library. The source connector can be any of the supported ones. This example uses the local source connector.
+
+This example sends files to Unstructured for processing by default. To process files locally instead, see the instructions at the end of this page.
+
+import IBMWatsonxdataSh from '/snippets/destination_connectors/ibm_watsonxdata.sh.mdx';
+import IBMWatsonxdataPyV2 from '/snippets/destination_connectors/ibm_watsonxdata.v2.py.mdx';
+
+<CodeGroup>
+  <IBMWatsonxdataSh />
+  <IBMWatsonxdataPyV2 />
+</CodeGroup>
+
+import SharedPartitionByAPIOSS from '/snippets/ingest-configuration-shared/partition-by-api-oss.mdx';
+
+<SharedPartitionByAPIOSS/>
diff --git a/snippets/dc-shared-text/ibm-watsonxdata-cli-api.mdx b/snippets/dc-shared-text/ibm-watsonxdata-cli-api.mdx
@@ -0,0 +1,9 @@
+Batch process all your records to store structured outputs in IBM watsonx.data.
+
+The requirements are as follows.
+
+import SharedIBMWatsonxdata from '/snippets/general-shared-text/ibm-watsonxdata.mdx';
+import SharedIBMWatsonxdataCLIAPI from '/snippets/general-shared-text/ibm-watsonxdata-cli-api.mdx';
+
+<SharedIBMWatsonxdata />
+<SharedIBMWatsonxdataCLIAPI />
diff --git a/snippets/destination_connectors/ibm_watsonxdata.sh.mdx b/snippets/destination_connectors/ibm_watsonxdata.sh.mdx
@@ -0,0 +1,28 @@
+```bash CLI
+#!/usr/bin/env bash
+
+# Chunking and embedding are optional.
+
+unstructured-ingest \
+  local \
+    --input-path $LOCAL_FILE_INPUT_DIR \
+    --chunk-elements \
+    --embedding-provider huggingface \
+    --partition-by-api \
+    --api-key $UNSTRUCTURED_API_KEY \
+    --partition-endpoint $UNSTRUCTURED_API_URL \
+    --strategy hi_res \
+    --additional-partition-args="{\"split_pdf_page\":\"true\", \"split_pdf_allow_failed\":\"true\", \"split_pdf_concurrency_level\": 15}" \
+  ibm-watsonx-s3 \
+    --iam-api-key $IBM_IAM_API_KEY \
+    --access-key-id $IBM_COS_ACCESS_KEY \
+    --secret-access-key $IBM_COS_SECRET_ACCESS_KEY \
+    --iceberg-endpoint $IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT \
+    --object-storage-endpoint $IBM_COS_BUCKET_PUBLIC_ENDPOINT \
+    --object-storage-region $IBM_COS_BUCKET_REGION \
+    --catalog $IBM_ICEBERG_CATALOG \
+    --namespace $IBM_ICEBERG_SCHEMA \
+    --table $IBM_ICEBERG_TABLE \
+    --max-retries 5 \
+    --record-id-key $IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN
+```
diff --git a/snippets/destination_connectors/ibm_watsonxdata.v2.py.mdx b/snippets/destination_connectors/ibm_watsonxdata.v2.py.mdx
@@ -0,0 +1,63 @@
+```python Python Ingest v2
+import os
+
+from unstructured_ingest.v2.pipeline.pipeline import Pipeline
+from unstructured_ingest.v2.interfaces import ProcessorConfig
+
+from unstructured_ingest.v2.processes.connectors.local import (
+    LocalIndexerConfig,
+    LocalDownloaderConfig,
+    LocalConnectionConfig
+)
+
+from unstructured_ingest.v2.processes.connectors.ibm_watsonx.ibm_watsonx_s3 import (
+    IbmWatsonxConnectionConfig,
+    IbmWatsonxAccessConfig,
+    IbmWatsonxUploadStagerConfig,
+    IbmWatsonxUploaderConfig
+)
+
+from unstructured_ingest.v2.processes.partitioner import PartitionerConfig
+from unstructured_ingest.v2.processes.chunker import ChunkerConfig
+from unstructured_ingest.v2.processes.embedder import EmbedderConfig
+
+# Chunking and embedding are optional.
+
+if __name__ == "__main__":
+    Pipeline.from_configs(
+        context=ProcessorConfig(reprocess=True, clear_cache=True),
+        indexer_config=LocalIndexerConfig(input_path=os.getenv("LOCAL_FILE_INPUT_DIR")),
+        downloader_config=LocalDownloaderConfig(),
+        source_connection_config=LocalConnectionConfig(),
+        partitioner_config=PartitionerConfig(
+            partition_by_api=True,
+            api_key=os.getenv("UNSTRUCTURED_API_KEY"),
+            partition_endpoint=os.getenv("UNSTRUCTURED_API_URL"),
+            additional_partition_args={
+                "split_pdf_page": True,
+                "split_pdf_allow_failed": True,
+                "split_pdf_concurrency_level": 15
+            }
+        ),
+        chunker_config=ChunkerConfig(chunking_strategy="by_title"),
+        embedder_config=EmbedderConfig(embedding_provider="huggingface"),
+        destination_connection_config=IbmWatsonxConnectionConfig(
+            access_config=IbmWatsonxAccessConfig(
+                iam_api_key=os.getenv("IBM_IAM_API_KEY"),
+                access_key_id=os.getenv("IBM_COS_ACCESS_KEY"),
+                secret_access_key=os.getenv("IBM_COS_SECRET_ACCESS_KEY")
+            ),
+            iceberg_endpoint=os.getenv("IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT"),
+            object_storage_endpoint=os.getenv("IBM_COS_BUCKET_PUBLIC_ENDPOINT"),
+            object_storage_region=os.getenv("IBM_COS_BUCKET_REGION"),
+            catalog=os.getenv("IBM_ICEBERG_CATALOG")
+        ),
+        stager_config=IbmWatsonxUploadStagerConfig(),
+        uploader_config=IbmWatsonxUploaderConfig(
+            namespace=os.getenv("IBM_ICEBERG_SCHEMA"),
+            table=os.getenv("IBM_ICEBERG_TABLE"),
+            max_retries=5,
+            record_id_key=os.getenv("IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN")
+        )
+    ).run()
+```
diff --git a/snippets/general-shared-text/ibm-watsonxdata-cli-api.mdx b/snippets/general-shared-text/ibm-watsonxdata-cli-api.mdx
@@ -0,0 +1,24 @@
+The IBM watsonx.data connector dependencies:
+
+  ```bash CLI, Python
+  pip install "unstructured-ingest[ibm-watsonx-s3]"
+  ```
+
+import AdditionalIngestDependencies from '/snippets/general-shared-text/ingest-dependencies.mdx';
+
+<AdditionalIngestDependencies />
+
+The following environment variables:
+
+- `IBM_IAM_API_KEY` - An API key for the target IBM Cloud account, represented by `--iam-api-key` (CLI) or `iam_api_key` (Python).
+- `IBM_COS_ACCESS_KEY` - An HMAC access key ID for the target IBM Cloud Object Storage (COS) instance, represented by `--access-key-id` (CLI) or `access_key_id` (Python).
+- `IBM_COS_SECRET_ACCESS_KEY` - The HMAC secret access key that is associated with the target HMAC access key ID, represented by `--secret-access-key` (CLI) or `secret_access_key` (Python).
+- `IBM_ICEBERG_CATALOG_METASTORE_REST_ENDPOINT` - The metastore REST endpoint value for the target Apache Iceberg catalog in the target IBM watsonx.data data store instance, represented by `--iceberg-endpoint` (CLI) or `iceberg_endpoint` (Python). Do not include `https://` in this value.
+- `IBM_COS_BUCKET_PUBLIC_ENDPOINT` - The target COS instance's endpoint value, represented by `--object-storage-endpoint` (CLI) or `object_storage_endpoint` (Python).
+- `IBM_COS_BUCKET_REGION` - The target COS instance's region short ID, represented by `--object-storage-region` (CLI) or `object_storage_region` (Python).
+- `IBM_ICEBERG_CATALOG` - The name of the target Iceberg catalog, represented by `--catalog` (CLI) or `catalog` (Python).
+- `IBM_ICEBERG_SCHEMA` - The name of the target namespace (also known as a schema) in the target catalog, represented by `--namespace` (CLI) or `namespace` (Python).
+- `IBM_ICEBERG_TABLE` - The name of the target table in the target schema, represented by `--table` (CLI) or `table` (Python).
+- `IBM_ICEBERG_TABLE_UNIQUE_RECORD_COLUMN` - The name of the column that uniquely identifies each record in the target table, represented by `--record-id-key` (CLI) or `record_id_key` (Python). The default is `record_id`.
+
+Additionally, `--max-retries` (CLI) or `max_retries` (Python) is an optional parameter that specifies the number of times to retry uploading data. The default is `5`. If specified, it must be a number between `2` and `10`, inclusive.
diff --git a/snippets/general-shared-text/ibm-watsonxdata.mdx b/snippets/general-shared-text/ibm-watsonxdata.mdx
@@ -0,0 +1,187 @@
+- An [IBM Cloud account](https://cloud.ibm.com/login). [Create an IBM Cloud account](https://cloud.ibm.com/registration) if you do not already have one.
+- An API key for the IBM Cloud account. If you do not have one already, create one as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. In the top navigation bar, click **Manage** and then, under **Security and access**, click **Access (IAM)**.
+  3. On the sidebar, under **Manage identities**, click **API keys**.
+  4. With the **View** list showing **My IBM Cloud API keys**, click **Create**. 
+  5. Enter some **Name** and an optional **Description** for the API key. 
+  6. Leave **Leaked action** set to **Disable the leaked key** and **Session creation** set to **No**.
+  7. Click **Create**. 
+  8. Click **Copy** or **Download** to copy or save the API key to a secure location. You won't be able to access this API key from this screen again.
+
+- An IBM Cloud Object Storage (COS) instance in the account, and a bucket within that instance. If you do not have them already, 
+  create them as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. Click **Create resource**.
+  4. With **IBM Cloud catalog** selected, search for and select **Object Storage**.
+  5. Complete the on-screen instructions to finish creating the COS instance.
+  6. With the COS instance's settings page shown, on the **Buckets** tab, click **Create bucket**.
+  7. Complete the on-screen instructions to finish creating the bucket.
+
+- The name, region, and public endpoint for the target bucket within the target Cloud Object Storage (COS) instance. To get these:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Storage**, and then click the target COS instance.
+  4. On the **Buckets** tab, click the target bucket.
+  5. On the **Configuration** tab, note the following:
+  
+     - Under **Bucket details**, note the **Bucket name**. This is the bucket's name.
+     - Under **Bucket details**, note the value inside of the parentheses inside **Location**, for example `us-east`. This is the bucket's region.
+     - Under **Endpoints**, note the value of **Public**, for example `s3.us-east.cloud-object-storage.appdomain.cloud`. (Ignore the values of 
+       **Private** and **Direct**). This is the bucket's public endpoint.
+
+- An HMAC access key ID and secret access key for the target Cloud Object Storage (COS) instance. If you do not have them already, 
+  get or create them as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Storage**, and then click the target COS instance.
+  4. On the **Service credentials** tab, if there is a credential that you want to use in the list, expand the credential, and copy the following values to a secure location:
+
+     - `access_key_id` under `cos_hmac_keys`, which represents the HMAC access key ID.
+     - `secret_access_key` under `cos_hmac_keys`, which represents the HMAC secret access key.
+
+     After you have copied the preceding values, you have completed this procedure.
+
+  5. If there is not a credential that you want to use, or there are no credentials at all, click **New Credential**.
+  6. Enter some **Name** for the credential.
+  7. For **Role**, select at least **Writer**, leave **Select Service ID** set to **Auto Generated**, 
+     switch on **Include HMAC Credential**, and then click **Add**.
+  8. In the list of credentials, expand the credential, and copy the following values to a secure location:
+
+     - `access_key_id` under `cos_hmac_keys`, which represents the HMAC access key ID.
+     - `secret_access_key` under `cos_hmac_keys`, which represents the HMAC secret access key.
+
+- An IBM watsonx.data data store instance in the IBM Cloud account. If you do not have one already, create one as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. Click **Create resource**.
+  4. With **IBM Cloud catalog** selected, search for and select **watsonx.data**.
+  5. Complete the on-screen instructions to finish creating the watsonx.data data store instance.
+
+- An Apache Iceberg-based catalog within the watsonx.data data store instance. If you do not have one already, create one as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
+  4. Click **Open web console**.
+  5. If prompted, log in to the web console.
+  6. On the sidebar, click **Infrastructure manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the 
+     top navigation bar.
+  7. Click **Add component**.
+  8. Under **Storage**, click **IBM Cloud Object Storage**, and then click **Next**.
+  9. Complete the on-screen instructions to finish creating the Iceberg catalog. This includes providing the following settings:
+  
+     - Some display name for the component.
+     - The name of the target bucket within the target Cloud Object Storage (COS) instance that you noted earlier.
+     - The region for the target bucket, which you noted earlier. 
+     - The public endpoint for the target bucket, which you noted earlier. For this screen only, be sure to prefix the public endpoint with `https://`.
+     - The HMAC access key ID for the target COS instance, which you noted earlier.
+     - The HMAC secret access key for the target COS instance, which you noted earlier.
+
+  10. Next to **Connection status**, click **Test connection** to test the connection. Do not proceed until **Successful** is shown. If the connection is 
+      not successful, check the values you entered for the target bucket name, region, endpoint, access key, and secret access key, and try again.
+  11. Check the box labelled **Associate Catalog**.
+  12. Check the box labelled **Activate now**.
+  13. Under **Associated catalog**, for **Catalog type**, select **Apache Iceberg**.
+  14. Enter some **Catalog name**.
+  15. Click **Associate**.
+  16. On the sidebar, click **Infrastructure manager**. Make sure the catalog is associated with the appropriate engines. If it is not, rest your mouse 
+      on an unassociated target engine, click the **Manage associations** icon, check the box next to the target catalog's name, and then 
+      click **Save and restart engine**. 
+      
+      To create an engine if one is not already shown, click **Add component**, and follow the on-screen instructions to add an appropriate engine from the list of available **Engines** 
+      (for example, an **IBM Presto** engine). 
+
+- The catalog name and metastore REST endpoint for the target Iceberg catalog. To get these:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
+  4. Click **Open web console**.
+  5. If prompted, log in to the web console.
+  6. On the sidebar, click **Infrastructure manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the 
+     top navigation bar.
+  7. In the **Catalogs** section, click the target Iceberg catalog.
+  8. On the **Details** tab, note the value of **Name** representing the catalog name, and **Metastore REST endpoint** representing the metastore REST endpoint. (Ignore the **Metastore Thrift endpoint** value.)
+
+- A namespace (also known as a schema) and a table in the target catalog. If you do not have these already, create them as follows:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
+  4. Click **Open web console**.
+  5. If prompted, log in to the web console.
+  6. On the sidebar, click **Data manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the 
+     top navigation bar.
+  7. On the **Browse data** tab, under **Catalogs associated**, click the target catalog.
+  8. Click the ellipses, and then click **Create schema**.
+  9. Enter some **Name** for the schema, and then click **Create**.
+  10. On the sidebar, click **Query workspace**.
+  11. In the SQL editor, enter and run a table creation statement such as the following, replacing `<catalog-name>` with the name of the target 
+      catalog and `<schema-name>` with the name of the target schema:
+
+      ```sql      
+      CREATE TABLE <catalog-name>.<schema-name>.elements (
+         "type" varchar,
+         "element_id" varchar,
+         "text" varchar,
+         "file_directory" varchar,
+         "filename" varchar,
+         "languages" array(varchar),
+         "last_modified" double,
+         "page_number" varchar,
+         "filetype" varchar,
+         "url" varchar,
+         "version" varchar,
+         "record_locator" varchar,
+         "date_created" double,
+         "date_modified" double,
+         "date_processed" double,
+         "filesize_bytes" bigint,
+         "points" varchar,
+         "system" varchar,
+         "layout_width" bigint,
+         "layout_height" bigint,
+         "id" varchar,
+         "record_id" varchar,
+         "parent_id" varchar
+      )
+      WITH (
+         delete_mode = 'copy-on-write',
+         format = 'PARQUET',
+         format_version = '2'
+      )
+      ```
+
+      Note that fields in incoming elements that do not have matching column 
+      names will be dropped upon record insertion. For example, if the incoming data has a field named `sent_from` and there is no 
+      column named `sent_from` in the table, the `sent_from` field will be dropped upon record insertion. You should modify the preceding 
+      sample table creation statement to add columns for any additional fields that you want to be included upon record 
+      insertion.
+
+- The name of the target namespace (also known as a schema) within the target catalog, and the name of the target table within that schema. To get these:
+
+  1. [Log in to your IBM Cloud account](https://cloud.ibm.com/login).
+  2. On the sidebar, click the **Resource list** icon. If the sidebar is not visible, click the **Navigation Menu** icon to the far left of the 
+     top navigation bar.
+  3. In the list of resources, expand **Databases**, and then click the target watsonx.data data store instance.
+  4. Click **Open web console**.
+  5. If prompted, log in to the web console.
+  6. On the sidebar, click **Data manager**. If the sidebar is not visible, click the **Global navigation** icon to the far left of the 
+     top navigation bar.
+  7. On the **Browse data** tab, expand the name of the target catalog, and note the names of the target schema and target table.
+
+- The name of the column in the target table that uniquely identifies each of the records in the table.