Commit ebced37

documentation: Added Colab notebook for the Ingesting data from DB into Database example (#3059)
Added a Colab notebook for the Ingesting data from DB into Database example (#3059), to make it easier for users to work with the library in a Google Colab notebook. Approved-pr-by: @duyguHsnHsn Signed-off-by: Mahesh Kumar Kadireddy <[email protected]>
1 parent 3d97f64 commit ebced37

16 files changed, +1578 -7 lines
examples/ingest-from-db-example/ingest-from-db-example-notebook/Ingesting_data_from_DB_into_Database.ipynb

Lines changed: 1124 additions & 0 deletions
Large diffs are not rendered by default.
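Since the notebook diff is not rendered, the general pattern the example demonstrates (read rows from a source database, then send them to the configured ingestion destination) can be sketched as follows. This is a hedged, minimal sketch rather than the notebook's exact code: the table and column names are hypothetical, and it assumes the standard VDK job-input query and ingestion APIs.

# A minimal sketch of the DB-to-database ingestion pattern; all names are illustrative.
from vdk.api.job_input import IJobInput


def run(job_input: IJobInput):
    # Read rows from the source database configured for the job
    # ("source_orders" is a hypothetical table name).
    rows = job_input.execute_query("SELECT id, name, amount FROM source_orders")

    # Send the rows to the configured ingestion target;
    # "orders" is a hypothetical destination table.
    job_input.send_tabular_data_for_ingestion(
        rows=rows,
        column_names=["id", "name", "amount"],
        destination_table="orders",
    )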

projects/control-service/projects/job-builder-secure/Dockerfile.python.vdk

Lines changed: 2 additions & 2 deletions
@@ -52,14 +52,14 @@ RUN : \
    && pip install --no-cache-dir --disable-pip-version-check -q -r "$job_name/$requirements_file" \
    || ( echo ">requirements_failed<" && exit 1 ) \
    && echo "Removing native dependencies ..." \
-   && yum autoremove build-essential gcc glibc-devel git unzip -y \
+   && yum remove build-essential gcc glibc-devel git unzip -y \
    && yum remove freetype2-devel libpng-devel -y; fi \
    && echo "Installing native dependencies ..." \
    && yum install libstdc++ findutils openssl-c_rehash -y \
    && echo "Refreshing CA certificates ..." \
    && /usr/bin/rehash_ca_certificates.sh \
    && echo "Deleting system packages ..." \
-   && yum autoremove shadow toybox openssl-c_rehash -y \
+   && yum remove shadow toybox openssl-c_rehash -y \
    && echo "Deleting system directories ..." \
    && rm -rf /boot /home /media /mnt /root /srv /usr/lib/ldscripts /usr/lib/rpm /usr/lib/sysimage \
       /usr/lib/tdnf /usr/lib/perl5 /usr/lib/gcc /usr/share/locale /tmp/* /usr/include /usr/libexec /usr/libexec \
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
- 1.3.11
+ 1.3.12
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Copyright 2023-2024 Broadcom
# SPDX-License-Identifier: Apache-2.0
import logging

from vdk.api.job_input import IJobInput
from vdk.plugin.trino.trino_utils import TrinoTemplateQueries

log = logging.getLogger(__name__)


def run(job_input: IJobInput):
    """
    In this step we try to recover a potentially missing target table from its backup.
    In some cases the template might fail during the step where new data is written to the target table
    (the last step, where the tmp_target_table contents are moved to target_table). If this happens, the job
    fails and the target table is no longer present. Fortunately, it has a backup.
    So when the job is retried, this first step should recover the target (if the reason for the previous
    failure is no longer present).
    """

    args = job_input.get_arguments()
    target_schema = args.get("target_schema")
    target_table = args.get("target_table")
    trino_queries = TrinoTemplateQueries(job_input)

    trino_queries.ensure_target_exists_step(db=target_schema, target_name=target_table)
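For orientation, a template whose first step is the recovery logic above is normally invoked from a data job through the job input's template API. The sketch below is a hedged illustration: the template name ("scd1_upsert") and all schema, table, and column values are assumptions, not taken from this commit.

def run(job_input):
    # Invoke a Trino template; the name and arguments below are hypothetical examples.
    job_input.execute_template(
        template_name="scd1_upsert",
        template_args={
            "source_schema": "history",    # hypothetical source schema
            "source_view": "vw_orders",    # hypothetical source view
            "target_schema": "warehouse",  # hypothetical target schema
            "target_table": "dim_orders",  # hypothetical target table
            "id_column": "order_id",       # column used to match source and target rows
        },
    )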
@@ -0,0 +1,3 @@
(SELECT * FROM "{source_schema}"."{source_view}" LIMIT 0)
UNION ALL
(SELECT * FROM "{target_schema}"."{target_table}" LIMIT 0)
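This LIMIT 0 / UNION ALL query returns no rows; it appears to serve as a schema-compatibility probe, since the UNION ALL fails if the source view and the target table do not have matching column counts and compatible types.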
@@ -0,0 +1,12 @@
CREATE TABLE "{target_schema_staging}"."{target_table_staging}" AS
(
    SELECT t.*
    FROM "{target_schema}"."{target_table}" AS t
    LEFT JOIN "{source_schema}"."{source_view}" AS s ON s."{id_column}" = t."{id_column}"
    WHERE s."{id_column}" IS NULL
)
UNION ALL
(
    SELECT *
    FROM "{source_schema}"."{source_view}"
)
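The LEFT JOIN with the WHERE s."{id_column}" IS NULL filter keeps only the target rows that have no counterpart in the source view, and the UNION ALL then appends every source row, so rows present in both are effectively replaced by the source version in the staging table.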
@@ -0,0 +1 @@
DROP TABLE IF EXISTS "{target_schema}"."{target_table}"
@@ -0,0 +1,2 @@
INSERT INTO "{target_schema}"."{target_table}"
SELECT * FROM "{source_schema}"."{source_table}"
@@ -0,0 +1 @@
SHOW CREATE TABLE "{target_schema}"."{target_table}"
@@ -0,0 +1,146 @@
# Copyright 2023-2024 Broadcom
# SPDX-License-Identifier: Apache-2.0
import logging
import os
import re

from vdk.api.job_input import IJobInput
from vdk.plugin.trino.templates.data_quality_exception import DataQualityException
from vdk.plugin.trino.trino_utils import CommonUtilities

log = logging.getLogger(__name__)

SQL_FILES_FOLDER = (
    os.path.dirname(os.path.abspath(__file__)) + "/02-requisite-sql-scripts"
)


"""
This step is intended to handle quality checks, if such are provided,
and stop the data from being populated into the target table if the check has a negative outcome.
Otherwise the data will be directly processed according to the used template type.
"""


def run(job_input: IJobInput):
    """
    0. Drop the staging table.
    1. Insert the target table data, upserted with the source view data, into the staging table.
    2. If a check is provided:
       - send the temp/staging table for check validation
       - if validated, copy the data from the staging table to the target table
       - else raise an error
       else:
       - copy the data from the staging table to the target table
    3. Copying the data:
       - truncate the target table and insert the data from the staging table
    """

    job_arguments = job_input.get_arguments()

    check = job_arguments.get("check")
    source_schema = job_arguments.get("source_schema")
    source_view = job_arguments.get("source_view")
    target_schema = job_arguments.get("target_schema")
    target_table = job_arguments.get("target_table")
    id_column = job_arguments.get("id_column")

    staging_schema = job_arguments.get("staging_schema", target_schema)
    staging_table = CommonUtilities.get_staging_table_name(target_schema, target_table)

    # Drop the staging table
    drop_table_query = CommonUtilities.get_file_content(
        SQL_FILES_FOLDER, "02-drop-table.sql"
    )
    drop_table = drop_table_query.format(
        target_schema=staging_schema, target_table=staging_table
    )
    job_input.execute_query(drop_table)

    # Create the staging table and insert data
    create_table_and_insert_data_query = CommonUtilities.get_file_content(
        SQL_FILES_FOLDER, "02-create-table-and-insert-data.sql"
    )
    create_staging_table_and_insert_data = create_table_and_insert_data_query.format(
        target_schema=target_schema,
        target_table=target_table,
        source_schema=source_schema,
        source_view=source_view,
        target_schema_staging=staging_schema,
        target_table_staging=staging_table,
        id_column=id_column,
    )
    job_input.execute_query(create_staging_table_and_insert_data)

    staging_table_full_name = f"{staging_schema}.{staging_table}"

    # Copy the data if there is no quality check configured or if it passes
    if not check or check(staging_table_full_name):
        copy_staging_table_to_target_table(
            job_input, target_schema, target_table, staging_schema, staging_table
        )
    else:
        target_table_full_name = f"{target_schema}.{target_table}"
        raise DataQualityException(
            checked_object=staging_table_full_name,
            source_view=f"{source_schema}.{source_view}",
            target_table=target_table_full_name,
        )


def copy_staging_table_to_target_table(
    job_input: IJobInput,
    target_schema,
    target_table,
    source_schema,
    source_table,
):
    # Non-partitioned tables:
    # - Since TRUNCATE and DELETE do not work for non-partitioned tables, get the create statement,
    #   drop the table and then re-create it; this preserves metadata such as user comments.
    # - Insert the contents of the staging table into the target table.
    # - Delete the staging table.
    show_create_query = CommonUtilities.get_file_content(
        SQL_FILES_FOLDER, "02-show-create-table.sql"
    )
    show_create_target_table = show_create_query.format(
        target_schema=target_schema, target_table=target_table
    )

    table_create_statement = job_input.execute_query(show_create_target_table)
    # Remove the "external_location" clause from the create statement, as it might lead to
    # data not being cleaned up properly in Hive.
    table_create_statement = remove_external_location(table_create_statement[0][0])

    # Drop the table
    drop_table_query = CommonUtilities.get_file_content(
        SQL_FILES_FOLDER, "02-drop-table.sql"
    )
    drop_table = drop_table_query.format(
        target_schema=target_schema, target_table=target_table
    )
    job_input.execute_query(drop_table)

    # Re-create the table
    job_input.execute_query(table_create_statement)

    # Insert the data
    insert_into_table_query = CommonUtilities.get_file_content(
        SQL_FILES_FOLDER, "02-insert-into-table.sql"
    )
    insert_into_table = insert_into_table_query.format(
        target_schema=target_schema,
        target_table=target_table,
        source_schema=source_schema,
        source_table=source_table,
    )
    job_input.execute_query(insert_into_table)


def remove_external_location(sql_statement):
    # Regular expression pattern to match the external_location clause
    pattern = r"external_location\s*=\s*'[^']*',?\s*"

    # Remove the external_location clause from the SQL statement
    cleaned_sql = re.sub(pattern, "", sql_statement, flags=re.IGNORECASE)

    return cleaned_sql
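The "check" argument consumed above is a callable that receives the fully qualified staging table name and returns a truthy value when the data may be promoted to the target table. A hedged sketch of supplying such a check from a data job follows; the row-count rule, the template name, and all argument values are illustrative assumptions.

def run(job_input):
    def staging_has_rows(staging_table: str) -> bool:
        # Illustrative rule: only promote the staging data if it is non-empty.
        result = job_input.execute_query(f"SELECT COUNT(*) FROM {staging_table}")
        return result[0][0] > 0

    job_input.execute_template(
        template_name="scd1_upsert",  # assumed template name, as in the earlier sketch
        template_args={
            "source_schema": "history",
            "source_view": "vw_orders",
            "target_schema": "warehouse",
            "target_table": "dim_orders",
            "id_column": "order_id",
            "check": staging_has_rows,  # called with the staging table's full name
        },
    )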
