Commits (43)
- 54d0407: Create readme (shyamraodb, Feb 26, 2025)
- ed4caf6: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-etl directory (shyamraodb, Feb 26, 2025)
- a185326: Create readme (shyamraodb, Feb 26, 2025)
- 4d991f6: Create _util (shyamraodb, Feb 26, 2025)
- 6dbf4bb: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_util (shyamraodb, Feb 26, 2025)
- 18e47c1: Add files via upload (shyamraodb, Feb 26, 2025)
- b15202a: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/readme (shyamraodb, Feb 26, 2025)
- a07de86: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/02-Popul… (shyamraodb, Feb 26, 2025)
- eb6a371: Add files via upload (shyamraodb, Feb 26, 2025)
- 2bd85c7: Update initialize-staging.py (shyamraodb, Feb 26, 2025)
- 58de132: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/04-Utili… (shyamraodb, Feb 26, 2025)
- abd76d5: Add files via upload (shyamraodb, Feb 26, 2025)
- 3d768a5: Add files via upload (shyamraodb, Feb 26, 2025)
- c270770: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images/… (shyamraodb, Feb 26, 2025)
- a614192: Add files via upload (shyamraodb, Feb 26, 2025)
- 43e9b37: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images/… (shyamraodb, Feb 26, 2025)
- 5d316c8: Add files via upload (shyamraodb, Feb 26, 2025)
- ec83c94: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl directory (shyamraodb, Feb 26, 2025)
- 0cfe126: Add files via upload (shyamraodb, Feb 26, 2025)
- 6849d91: Add files via upload (shyamraodb, Feb 26, 2025)
- 8acac76: Added comment to COPY INTO (about streaming tables) (shyamraodb, Feb 27, 2025)
- b92b62e: enable serverless sqlw (shyamraodb, Feb 28, 2025)
- da21621: Updates following Quentin's review of PR (Mar 4, 2025)
- fe79363: Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images … (shyamraodb, Mar 11, 2025)
- 3136e7d: Refinements, bundle_config (Mar 12, 2025)
- 72c513a: changed folder/dir name (Mar 12, 2025)
- 08552b4: name change - etl_run_log (Mar 14, 2025)
- 3016d3d: bundle config (Mar 15, 2025)
- 994609c: Updates, Case, Removed Logging (Mar 28, 2025)
- 64cb555: Some changes (Mar 28, 2025)
- 048a288: ETL Log insert in main notebook (Mar 28, 2025)
- 819d8a8: With sql scripting sample snippet (Apr 10, 2025)
- 3e7e2fe: New Intro images (shyamraodb, Apr 24, 2025)
- 0e28324: Changes to commentary (including what next) (May 1, 2025)
- e6eff1c: backtick comment (shyamraodb, May 7, 2025)
- e29f696: Comment changes in 00 (shyamraodb, May 7, 2025)
- 87370a2: Latest comments for 00; moved scripting example to separate notebook (shyamraodb, May 7, 2025)
- ee76745: Changed start schema to star schema in Cmd 1 (shyamraodb, May 30, 2025)
- 89fa05a: environement to environment (shyamraodb, May 30, 2025)
- 6619da5: parameterize (shyamraodb, May 31, 2025)
- 2b75b48: Not creating catalog / schema. Has to pre-exist (shyamraodb, Jun 20, 2025)
- 490c3ab: task rename (shyamraodb, Jun 20, 2025)
- b153f55: more changes to comments (shyamraodb, Jun 21, 2025)
@@ -0,0 +1,50 @@
-- Databricks notebook source
-- MAGIC %md-sandbox
-- MAGIC **Configure settings** <br>
-- MAGIC
-- MAGIC 1. Specify Catalog to create demo schemas <br>
-- MAGIC
-- MAGIC 2. Specify Schema to create data warehouse tables, staging volume <br>
-- MAGIC
-- MAGIC 3. Specify whether to enable Predictive Optimization for DW schema
-- MAGIC
-- MAGIC <u>NOTE:</u>
-- MAGIC The catalog and schema can be created beforehand. If not, ensure that the user running the workflow has permissions to create catalogs and schemas.
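
For instance, a minimal sketch of the required grants (the principal name is hypothetical; adjust for your workspace):

```sql
-- Hypothetical principal; run as a metastore admin.
GRANT CREATE CATALOG ON METASTORE TO `etl_user@example.com`;
-- Or, if the catalog already exists:
GRANT CREATE SCHEMA ON CATALOG dbsqldemos TO `etl_user@example.com`;
```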

-- COMMAND ----------

-- DBTITLE 1,dimension schema
/*
Manually update the following to use a different catalog / schema:
*/

declare or replace variable catalog_nm string = 'dbsqldemos';
declare or replace variable schema_nm string = 'clinical_star';

-- COMMAND ----------

-- Enable Predictive Optimization (PO) at the schema level? If false, inherit from the account setting
declare or replace variable enable_po_for_schema boolean = true;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Additional settings**

-- COMMAND ----------

declare or replace variable run_log_table string; -- fully qualified name of the ETL run log table
> **Collaborator:** We should add comments to explain what these vars are doing.

declare or replace variable code_table string; -- fully qualified name of the code master table

-- COMMAND ----------

set variable (run_log_table, code_table) =
  (select catalog_nm || '.' || schema_nm || '.' || 'etl_run_log',
          catalog_nm || '.' || schema_nm || '.' || 'code_m');

-- COMMAND ----------

declare or replace variable volume_name string = 'staging';

-- COMMAND ----------

declare or replace variable staging_path string;
set variable staging_path = '/Volumes/' || catalog_nm || '/' || schema_nm || '/' || volume_name;
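
Once Initialize has run, the variable resolves to a volume path; a quick way to confirm it (assuming the default names from the Configure notebook):

```sql
-- Expected output: /Volumes/dbsqldemos/clinical_star/staging
select staging_path;
```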
@@ -0,0 +1,39 @@
-- Databricks notebook source
-- MAGIC %run "./Initialize"

-- COMMAND ----------

declare or replace variable sqlstr string; -- variable to hold any sql statement for EXECUTE IMMEDIATE

-- COMMAND ----------

-- MAGIC %md
-- MAGIC Create Catalog and Schema(s) if required

-- COMMAND ----------

set variable sqlstr = "create catalog if not exists " || catalog_nm;
execute immediate sqlstr;

-- COMMAND ----------

set variable sqlstr = "create schema if not exists " || catalog_nm || "." || schema_nm;

-- COMMAND ----------

execute immediate sqlstr;

-- COMMAND ----------

set variable sqlstr = "alter schema " || catalog_nm || "." || schema_nm || if(enable_po_for_schema, ' enable', ' inherit') || ' predictive optimization';
execute immediate sqlstr;
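
To verify the result, the schema metadata can be inspected; a hedged sketch assuming the default catalog and schema names (Predictive Optimization status is also visible in Catalog Explorer):

```sql
-- Show schema properties, including optimization settings where surfaced.
describe schema extended dbsqldemos.clinical_star;
```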

-- COMMAND ----------

-- MAGIC %md
-- MAGIC Create Volume for staging source data files

-- COMMAND ----------

set variable sqlstr = "create volume if not exists " || catalog_nm || "." || schema_nm || "." || volume_name;
execute immediate sqlstr;
@@ -0,0 +1,38 @@
-- Databricks notebook source
-- MAGIC %run "../00-Setup/Initialize"

-- COMMAND ----------

-- MAGIC %md
-- MAGIC # Create Table

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Master Data
-- MAGIC Standardized codes used for coded attributes

-- COMMAND ----------

drop table if exists identifier(code_table);

-- COMMAND ----------

-- Liquid clustering (LC) candidate columns: m_code, m_type

create table identifier(code_table) (
m_code string comment 'code',
m_desc string comment 'name or description for the code',
m_type string comment 'attribute type utilizing code'
)
comment 'master table for coded attributes'

-- COMMAND ----------

insert into identifier(code_table)
values
('M', 'Male', 'GENDER'),
('F', 'Female', 'GENDER'),
('hispanic', 'Hispanic', 'ETHNICITY'),
('nonhispanic', 'Not Hispanic', 'ETHNICITY')
;
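
Downstream ETL can then resolve coded attributes with a lookup join; an illustrative sketch (the staging table name and its gender column are assumptions for this example):

```sql
-- Map a staged gender code to its description via the code master.
select s.patient_src_id,
       s.gender  as gender_cd,
       c.m_desc  as gender_nm
from dbsqldemos.clinical_star.patient_stg s
left join identifier(code_table) c
  on c.m_code = s.gender
 and c.m_type = 'GENDER';
```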
@@ -0,0 +1,22 @@
-- Databricks notebook source
-- MAGIC %run "../00-Setup/Initialize"

-- COMMAND ----------

-- MAGIC %md
-- MAGIC # Create Tables

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Config/Log Table for ETL
-- MAGIC This table captures run metadata for each table load: data source, table name, load start and end times, insert/update counts, lock status, and process ID.

-- COMMAND ----------

drop table if exists identifier(run_log_table);

-- COMMAND ----------

create table identifier(run_log_table) (
  data_source string,
  table_name string,
  load_start_time timestamp,
  locked boolean,
  load_end_time timestamp,
  num_inserts int,
  num_updates int,
  process_id string
);
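
A load can bracket its work with entries in this table; a minimal sketch (the values are illustrative, not the demo's actual logging logic):

```sql
-- Mark the load as started and locked.
insert into identifier(run_log_table)
  (data_source, table_name, load_start_time, locked, process_id)
values ('EHR', 'patient_int', current_timestamp(), true, 'run_001');

-- ... perform the load ...

-- Record completion and row counts, then release the lock.
update identifier(run_log_table)
set load_end_time = current_timestamp(),
    locked = false,
    num_inserts = 100,
    num_updates = 5
where table_name = 'patient_int' and process_id = 'run_001';
```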
@@ -0,0 +1,155 @@
-- Databricks notebook source
-- MAGIC %run "../00-Setup/Initialize"

-- COMMAND ----------

declare or replace variable br_table string; -- staging/bronze table identifier
declare or replace variable si_table string; -- integration/silver table identifier
declare or replace variable gd_table string; -- dimension table identifier

-- COMMAND ----------

declare or replace variable sqlstr string;

-- COMMAND ----------

set variable (br_table, si_table, gd_table) =
  (select catalog_nm || '.' || schema_nm || '.' || 'patient_stg',
          catalog_nm || '.' || schema_nm || '.' || 'patient_int',
          catalog_nm || '.' || schema_nm || '.' || 'g_patient_d');

-- COMMAND ----------

-- MAGIC %md
-- MAGIC
-- MAGIC # Create Tables
-- MAGIC Create the staging, integration, and dimension tables for patient.<br>
-- MAGIC The patient dimension is part of the clinical data warehouse (star schema).
-- MAGIC
-- MAGIC <u>NOTE:</u> By default, the tables are created in the **dbsqldemos** catalog. To change this, or to use an existing catalog / schema, see the [Configure notebook]($../00-Setup/Configure).

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Create Staging Table
-- MAGIC The schema for the staging table will be derived from the source data file(s)

-- COMMAND ----------

drop table if exists identifier(br_table);

-- COMMAND ----------

create table if not exists identifier(br_table)
comment 'Patient staging table ingesting initial and incremental master data from csv files'
;
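
The ingestion step can then load staged CSV files with schema inference and evolution; an illustrative COPY INTO (the path and options are assumptions; the demo's actual ingestion notebook may differ):

```sql
copy into dbsqldemos.clinical_star.patient_stg
from '/Volumes/dbsqldemos/clinical_star/staging/patient/'
fileformat = CSV
format_options ('header' = 'true', 'inferSchema' = 'true')
copy_options ('mergeSchema' = 'true');
```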

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Create Integration Table

-- COMMAND ----------

drop table if exists identifier(si_table);

-- COMMAND ----------

-- MAGIC %md
-- MAGIC Potential clustering columns: (data_source, patient_src_id).<br>
-- MAGIC Column src_changed_on_dt will be naturally ordered (ingestion-time clustering), and data_source will typically be the same for all records in a source file.
-- MAGIC
-- MAGIC **Note:** Predictive Optimization intelligently optimizes your table data layouts for faster queries and reduced storage costs.

-- COMMAND ----------

create table if not exists identifier(si_table) (
patient_src_id string not null comment 'ID of the record in the source',
date_of_birth date comment 'date of birth',
ssn string comment 'social security number',
drivers_license string comment 'driver\'s license',
name_prefix string comment 'name prefix',
first_name string comment 'first name of patient',
last_name string not null comment 'last name of patient',
name_suffix string comment 'name suffix',
maiden_name string comment 'maiden name',
gender_cd string comment 'code for patient\'s gender',
gender_nm string comment 'description of patient\'s gender',
marital_status string comment 'marital status',
ethnicity_cd string comment 'code for patient\'s ethnicity',
ethnicity_nm string comment 'description of patient\'s ethnicity',
src_changed_on_dt timestamp comment 'date of last change to record in source',
data_source string not null comment 'code for source system',
insert_dt timestamp comment 'date record inserted',
update_dt timestamp comment 'date record updated',
process_id string comment 'Process ID for run',
constraint c_int_pk primary key (patient_src_id, data_source) RELY
)
comment 'curated integration table for patient data'
tblproperties (delta.enableChangeDataFeed = true)
;
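
With change data feed enabled, incremental consumers (such as the dimension load) can read row-level changes from this table; a hedged sketch (the starting version is illustrative):

```sql
-- Read inserts, updates, and deletes since table version 1.
select * from table_changes('dbsqldemos.clinical_star.patient_int', 1);
```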

-- COMMAND ----------

-- MAGIC %md
-- MAGIC ## Create Dimension

-- COMMAND ----------

drop table if exists identifier(gd_table);

-- COMMAND ----------

-- MAGIC %md
-- MAGIC Potential clustering columns: attributes used for filtering in end-user queries, e.g., last name or gender code.
-- MAGIC
-- MAGIC Additionally, for large dimensions, using the source ID (patient_src_id) as a clustering key may help ETL performance.
-- MAGIC
-- MAGIC **Note:** <br>
-- MAGIC For the dimension table, take advantage of Predictive Optimization and Auto Clustering.
-- MAGIC
-- MAGIC Auto Clustering automatically clusters your tables based on your evolving workload, and is enabled via the **CLUSTER BY AUTO** clause.

-- COMMAND ----------

create table if not exists identifier(gd_table) (
patient_sk bigint generated always as identity comment 'Primary Key (ID)',
last_name string NOT NULL comment 'Last name of the person',
first_name string NOT NULL comment 'First name of the person',
name_prefix string comment 'Prefix of person name',
name_suffix string comment 'Suffix of person name',
maiden_name string comment 'Maiden name',
gender_code string comment 'Gender code',
gender string comment 'gender description',
date_of_birth timestamp comment 'Birth date and time',
marital_status string comment 'Marital status',
ethnicity_code string comment 'Ethnicity code',
ethnicity string comment 'Ethnicity description',
ssn string comment 'Patient SSN',
other_identifiers map <string, string> comment 'Identifier type (passport number, license number except mrn, ssn) and value',
uda map <string, string> comment 'User Defined Attributes',
patient_src_id string comment 'Unique reference to the source record',
effective_start_date timestamp comment 'SCD2 effective start date for version',
effective_end_date timestamp comment 'SCD2 effective end date for version',
checksum string comment 'Checksum for the record',
data_source string comment 'Code for source system',
insert_dt timestamp comment 'record inserted time',
update_dt timestamp comment 'record updated time',
process_id string comment 'Process ID for run',
constraint c_d_pk primary key (patient_sk) RELY
)
cluster by auto
comment 'Patient dimension'
tblproperties (
delta.deletedFileRetentionDuration = 'interval 30 days'
)
;
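
A typical SCD2 lookup against this dimension returns only current versions; an illustrative query (assuming open versions carry a null effective_end_date):

```sql
select patient_sk, first_name, last_name, gender, effective_start_date
from dbsqldemos.clinical_star.g_patient_d
where effective_end_date is null;
```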


-- COMMAND ----------

-- FK to integration table
set variable sqlstr = 'alter table ' || gd_table || ' add constraint c_d_int_source_fk foreign key (patient_src_id, data_source) references ' || si_table || '(patient_src_id, data_source) not enforced rely';
execute immediate sqlstr;