43 commits
54d0407
Create readme
shyamraodb Feb 26, 2025
ed4caf6
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-etl directory
shyamraodb Feb 26, 2025
a185326
Create readme
shyamraodb Feb 26, 2025
4d991f6
Create _util
shyamraodb Feb 26, 2025
6dbf4bb
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_util
shyamraodb Feb 26, 2025
18e47c1
Add files via upload
shyamraodb Feb 26, 2025
b15202a
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/readme
shyamraodb Feb 26, 2025
a07de86
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/02-Popul…
shyamraodb Feb 26, 2025
eb6a371
Add files via upload
shyamraodb Feb 26, 2025
2bd85c7
Update initialize-staging.py
shyamraodb Feb 26, 2025
58de132
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/04-Utili…
shyamraodb Feb 26, 2025
abd76d5
Add files via upload
shyamraodb Feb 26, 2025
3d768a5
Add files via upload
shyamraodb Feb 26, 2025
c270770
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images/…
shyamraodb Feb 26, 2025
a614192
Add files via upload
shyamraodb Feb 26, 2025
43e9b37
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images/…
shyamraodb Feb 26, 2025
5d316c8
Add files via upload
shyamraodb Feb 26, 2025
ec83c94
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl directory
shyamraodb Feb 26, 2025
0cfe126
Add files via upload
shyamraodb Feb 26, 2025
6849d91
Add files via upload
shyamraodb Feb 26, 2025
8acac76
Added comment to COPY INTO (about streaming tables)
shyamraodb Feb 27, 2025
b92b62e
enable serverless sqlw
shyamraodb Feb 28, 2025
da21621
Updates following Quentin's review of PR
Mar 4, 2025
fe79363
Delete product_demos/DBSQL-Datawarehousing/dbsql-for-dim-etl/_images …
shyamraodb Mar 11, 2025
3136e7d
Refinements, bundle_config
Mar 12, 2025
72c513a
changed folder/dir name
Mar 12, 2025
08552b4
name change - etl_run_log
Mar 14, 2025
3016d3d
bundle config
Mar 15, 2025
994609c
Updates, Case, Removed Logging
Mar 28, 2025
64cb555
Some changes
Mar 28, 2025
048a288
ETL Log insert in main notebook
Mar 28, 2025
819d8a8
With sql scripting sample snippet
Apr 10, 2025
3e7e2fe
New Intro images
shyamraodb Apr 24, 2025
0e28324
Changes to commentary (including what next)
May 1, 2025
e6eff1c
backtick comment
shyamraodb May 7, 2025
e29f696
Comment changes in 00
shyamraodb May 7, 2025
87370a2
Latest comments for 00; moved scripting example to separate notebook
shyamraodb May 7, 2025
ee76745
Changed start schema to star schema in Cmd 1
shyamraodb May 30, 2025
89fa05a
environement to environment
shyamraodb May 30, 2025
6619da5
parameterize
shyamraodb May 31, 2025
2b75b48
Not creating catalog / schema. Has to pre-exist
shyamraodb Jun 20, 2025
490c3ab
task rename
shyamraodb Jun 20, 2025
b153f55
more changes to comments
shyamraodb Jun 21, 2025
@@ -0,0 +1,211 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "a555ed11-89b5-4e6f-ac55-9c19e0a6cbe7",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# Databricks SQL\n",
"<br>\n",
"\n",
"<div style=\"float: right; width: 100%;\">\n",
" <img \n",
" src=\"https://raw.githubusercontent.com/databricks-demos/dbdemos-resources/refs/heads/main/images/dbsql/sql-etl-hls-patient/Databricks%20SQL%20Intro.png?raw=true\" \n",
" width=\"100%\"\n",
" >\n",
"</div>\n",
"\n",
"<br>"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "368abc59-295a-46ff-a327-49b356f0fc47",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# Migrate a Healthcare Data Warehouse and Build a Star Schema with Databricks\n",
"\n",
"## **🎯 Scenario**\n",
"\n",
"A hospital is migrating its legacy data warehouse to the **Databricks Lakehouse Platform** to modernize analytics and reduce operational complexity.\n",
"\n",
"Two personas lead the effort:\n",
"\n",
"- 🏗️ **Data Architect**:\n",
" - Design data models (star or snowflake schema), considering performance and reporting.\n",
" - Map source data, defining types and transformations.\n",
" - Collaborate with stakeholders to translate business needs (KPIs, reporting) into logical and physical models.\n",
" - Establish data governance and quality rules.\n",
" - Ensure scalability.\n",
"- 🔧 **Data Engineer**:\n",
" - Build and maintain data pipelines to ingest, transform, and load data into the data warehouse.\n",
" - Design and develop ETL/ELT processes for efficient data flow.\n",
" - Monitor and troubleshoot data pipelines for performance and reliability.\n",
" - Implement data quality checks and validation processes.\n",
" - Manage and optimize data warehouse infrastructure.\n",
" - Automate data-related tasks and workflows.\n",
" - Collaborate with data architects and analysts to understand data requirements.\n",
" - Deploy and manage data pipelines in production environments.\n",
"\n",
"This demo covers **Step 1**: creating and populating the patient\\_dim dimension table. **Step 2** will involve building the full star schema and powering BI reports."
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "6008ccf2-cb0b-43fe-b747-5065cce504c7",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# End-to-End Data Warehousing Solution\n",
"<br>\n",
"\n",
"<div style=\"float: right; width: 100%;\">\n",
"\n",
"<img src=\"https://raw.githubusercontent.com/databricks-demos/dbdemos-resources/refs/heads/main/images/dbsql/sql-etl-hls-patient/Databricks%20SQL%20Marketecture.png?raw=true\" style=\"float: right\" width=\"100%\">\n",
"\n",
"</div>"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "28f47341-2eeb-4db9-a67f-efe73c3f965f",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# 🛠 What We’ll Build\n",
"\n",
"- Model the patient\\_dim table\n",
"- Ingest raw patient data\n",
"- Clean and standardize the data\n",
"- Populate the SCD 2 Patient dimension\n",
"- Build idempotent (i.e. recoverable) pipelines\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "ef81883a-4ab9-43a7-85d2-f7301eaf28f8",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## 🔄 Workflow Overview\n",
"\n",
"High level flow:\n",
"\n",
"`Raw → Clean → patient_dim → Unity Catalog → Ready for fact joins`\n",
"\n",
"**Note:** We will be relying on several SQL Centric Engine Capabilities. Check out the examples in [SQL Centric Capabilities Examples]($./sql-centric-capabilities-examples)."
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {},
"inputWidgets": {},
"nuid": "31f098b8-445f-4e45-bebe-33cee11148ed",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## ✅ Outcome\n",
"\n",
"- patient\\_dim is clean, queryable, and governed\n",
"- Analysts and BI users can join it with future fact tables (e.g. Patient Visit Fact Table)\n",
"- Foundation for the full star schema is in place"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ccf8b02c-a593-4251-b533-e660d2a9759b",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"# Ok you're now ready, let's get started with the demo \n",
"\n",
"This example will create and populate a SCD Type 2 dimension using Databricks SQL.\n",
"\n",
"Start with: [Patient Dimension ETL Introduction]($./01-patient-dimension-ETL-introduction)"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"computePreferences": null,
"dashboards": [],
"environmentMetadata": {
"base_environment": "",
"environment_version": "2"
},
"inputWidgetPreferences": null,
"language": "sql",
"notebookMetadata": {
"pythonIndentUnit": 2
},
"notebookName": "00-get-started-with-SQL",
"widgets": {}
},
"language_info": {
"name": "sql"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
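The notebook above promises an SCD Type 2 patient dimension. For orientation, here is a minimal sketch of the core pattern in Databricks SQL; the staging table patient_stg and the tracking columns __start_at/__end_at are illustrative assumptions, not the demo's actual schema:

-- Hedged sketch of one SCD Type 2 step (illustrative names, not the demo's code).
-- Expire the current row when a tracked attribute changes, and insert new patients.
MERGE INTO patient_dim AS dim
USING patient_stg AS stg
  ON dim.patient_id = stg.patient_id AND dim.__end_at IS NULL
WHEN MATCHED AND dim.address <> stg.address THEN
  UPDATE SET dim.__end_at = current_timestamp()   -- close out the old version
WHEN NOT MATCHED THEN
  INSERT (patient_id, address, __start_at, __end_at)
  VALUES (stg.patient_id, stg.address, current_timestamp(), NULL);

A single MERGE like this expires changed rows and inserts brand-new patients; writing the new version of a changed row typically takes a second INSERT (or a union-style source), which is the kind of sequencing the demo's pipeline handles.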
@@ -0,0 +1,85 @@
-- Databricks notebook source
-- MAGIC %md-sandbox
-- MAGIC # Parameterize your SQL Script
-- MAGIC
-- MAGIC In this initial notebook, we define our catalog, schema, and table names as global variables.
-- MAGIC This makes it easy to run your ETL pipeline on different catalogs (e.g., dev/test).

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Configure:**
-- MAGIC 1. Catalog name (to contain demo schema and objects)
-- MAGIC 2. Schema name (to create data warehouse tables, staging volume)
-- MAGIC <br>
-- MAGIC
-- MAGIC <b>NOTE</b>
-- MAGIC - Ensure that the Catalog and Schema exist.<br>
-- MAGIC - Ensure that the user running the demo has <b>CREATE TABLE</b> and <b>CREATE VOLUME</b> privileges in the above schema.

-- COMMAND ----------

-- Name of catalog under which to create the demo schema
DECLARE OR REPLACE VARIABLE catalog_name STRING = 'main';

-- Name of the demo schema under which to create tables, volume
DECLARE OR REPLACE VARIABLE schema_name STRING = 'dbdemos_sql_etl';

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Configure Predictive Optimization (PO)**
-- MAGIC <br>
-- MAGIC Specify (true/false) whether to enable Predictive Optimization for the data warehouse schema.

-- COMMAND ----------

-- Enable Predictive Optimization (PO) at the schema level; otherwise it is inherited from the account setting
-- The user needs the ALTER SCHEMA privilege
DECLARE OR REPLACE VARIABLE enable_po_for_schema BOOLEAN = false;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Configure Volume**
-- MAGIC <br>
-- MAGIC A folder named "patient" will be created in this volume and used to stage the demo's source data files.
-- MAGIC <br>
-- MAGIC Note that the code removes any existing folder named "patient" from this volume.

-- COMMAND ----------

-- Name of the UC volume where patient source data will be staged
-- Created in the demo schema
DECLARE OR REPLACE VARIABLE volume_name STRING = 'staging';

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Additional global variables**

-- COMMAND ----------

-- Path of the UC volume where patient source data will be staged
DECLARE OR REPLACE VARIABLE staging_path STRING
= '/Volumes/' || catalog_name || '/' || schema_name || '/' || volume_name;

-- COMMAND ----------

SELECT staging_path;

-- COMMAND ----------

-- Two-level schema name
DECLARE OR REPLACE VARIABLE full_schema_name STRING
= catalog_name || '.' || schema_name;

-- COMMAND ----------

-- Three-level name of ETL Log Table
DECLARE OR REPLACE VARIABLE run_log_table STRING
= full_schema_name || '.' || 'etl_run_log';

-- Three-level name of Code Master Table
DECLARE OR REPLACE VARIABLE code_table STRING
= full_schema_name || '.' || 'code_m';
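Downstream notebooks can consume these session variables either through the IDENTIFIER() clause or by composing statements for EXECUTE IMMEDIATE, both of which appear in the next notebook. A minimal sketch, assuming the etl_run_log and code_m tables have already been created by later setup steps:

-- Resolve a string variable to a table name with IDENTIFIER():
SELECT COUNT(*) AS log_rows FROM IDENTIFIER(run_log_table);

-- Or bind the name as a parameter of a dynamically executed statement:
EXECUTE IMMEDIATE 'SELECT COUNT(*) FROM IDENTIFIER(?)' USING code_table;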
@@ -0,0 +1,49 @@
-- Databricks notebook source
-- MAGIC %run ./01.1-initialize

-- COMMAND ----------

DECLARE OR REPLACE VARIABLE sqlstr STRING; -- Variable to hold any SQL statement for EXECUTE IMMEDIATE

-- COMMAND ----------

-- MAGIC %md
-- MAGIC -- To CREATE Catalog<br>
-- MAGIC -- This option is disabled<br>
-- MAGIC
-- MAGIC SET VARIABLE sqlstr = 'CREATE CATALOG IF NOT EXISTS ' || catalog_name;
-- MAGIC EXECUTE IMMEDIATE sqlstr;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC -- To CREATE Schema<br>
-- MAGIC -- This option is disabled<br>
-- MAGIC
-- MAGIC EXECUTE IMMEDIATE 'CREATE SCHEMA IF NOT EXISTS IDENTIFIER(?)' USING full_schema_name;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Enable/disable Predictive Optimization for schema**

-- COMMAND ----------

BEGIN
  DECLARE sqlstr STRING;

  IF enable_po_for_schema THEN
    SET sqlstr = 'ALTER SCHEMA ' || full_schema_name || ' ENABLE PREDICTIVE OPTIMIZATION';
    EXECUTE IMMEDIATE sqlstr;
  END IF;
END;

-- COMMAND ----------

-- MAGIC %md
-- MAGIC **Create Volume for staging source data files**

-- COMMAND ----------

DECLARE OR REPLACE VARIABLE full_volume_name STRING = full_schema_name || '.' || volume_name;
EXECUTE IMMEDIATE 'CREATE VOLUME IF NOT EXISTS IDENTIFIER(?)' USING full_volume_name;