
Commit f86b32c

Data engineering template (#45)
# Summary

This adds a data engineering template.

* ETL pipelines have their own folder structure as seen under https://github.com/databricks/bundle-examples/tree/data-engineering/contrib/templates/data-engineering/assets/etl-pipeline/template/assets/%7B%7B.pipeline_name%7D%7D
* uv is used to manage packages and scripts
* `uv run add-asset` can be used to add a pipeline to a project
* `uv run test` can be used to run all tests on serverless compute
* `uv run pytest` can be used to run all tests on any type of compute as configured in the current IDE or .databrickscfg settings

# How to try this

You can give the template a try using

```
databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering
```

Note that each pipeline has a separate template. New pipelines can be added using the shorthand `uv run add-asset` or by manually instantiating the pipeline template with

```
databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering/assets/etl-pipeline
```

---------

Co-authored-by: Pieter Noordhuis <[email protected]>
1 parent c9d4b1b commit f86b32c

26 files changed: +584 −0 lines changed
Lines changed: 13 additions & 0 deletions
````markdown
# data-engineering template

This template introduces a new structure for organizing data-engineering
assets in DABs.

Install it using

```
databricks bundle init https://github.com/databricks/bundle-examples --template-dir contrib/templates/data-engineering
```

Note that by default this template doesn't come with any assets such as jobs or pipelines.
Follow the instructions in the template setup and README to add them.
````
Lines changed: 46 additions & 0 deletions
```json
{
  "welcome_message": "\nWelcome to the data-engineering pipeline template!",
  "properties": {
    "pipeline_name": {
      "type": "string",
      "description": "\nPlease provide the name of the pipeline to generate.\npipeline_name",
      "default": "etl_pipeline",
      "order": 1
    },
    "format": {
      "type": "string",
      "description": "\nPlease select the format to use to define this pipeline.\nformat",
      "order": 2,
      "enum": [
        "python files",
        "sql files",
        "notebooks"
      ],
      "default": "python files"
    },
    "only_python_files_supported": {
      "skip_prompt_if": {
        "properties": {
          "format": {
            "pattern": "python files"
          }
        }
      },
      "default": "ignored",
      "type": "string",
      "description": "{{fail \"Only Python files are supported in this template at this time.\"}}",
      "order": 3
    },
    "include_job": {
      "type": "string",
      "description": "\nWould you like to include a job that automatically triggers this pipeline?\nThis trigger will only be enabled for production deployments.\ninclude_job",
      "order": 4,
      "enum": [
        "yes",
        "no"
      ],
      "default": "yes"
    }
  },
  "success_message": "\n\n🪠 New pipeline definition generated under 'assets/{{.pipeline_name}}'!"
}
```
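The prompts above can also be answered non-interactively: `databricks bundle init` accepts a `--config-file` of pre-filled values. A minimal sketch that writes such a file for this schema (the file name and chosen values are just examples):

```python
# Sketch: write an answers file for non-interactive template init, e.g.
#   databricks bundle init <repo> --template-dir contrib/templates/data-engineering/assets/etl-pipeline --config-file answers.json
# The keys mirror the "properties" in the schema above.
import json

answers = {
    "pipeline_name": "etl_pipeline",
    "format": "python files",  # anything else trips the {{fail ...}} guard
    "include_job": "yes",
}

with open("answers.json", "w") as f:
    json.dump(answers, f, indent=2)
```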
Lines changed: 5 additions & 0 deletions
```python
# This is the entry point for the {{.pipeline_name}} pipeline.
# It makes sure all transformations in the transformations directory are included.
import transformations

__all__ = ["transformations"]
```
Lines changed: 4 additions & 0 deletions
```markdown
# explorations

This folder is reserved for personal, exploratory notebooks.
By default these are not committed to Git, as 'explorations' is listed in .gitignore.
```
Lines changed: 52 additions & 0 deletions
```json
{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "application/vnd.databricks.v1+cell": {
          "cellMetadata": {
            "byteLimit": 2048000,
            "rowLimit": 10000
          },
          "inputWidgets": {},
          "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae",
          "showTitle": false,
          "title": ""
        }
      },
      "outputs": [],
      "source": [
        "import sys\n",
        "sys.path.append('..')\n",
        "\n",
        "from pyspark.sql import SparkSession\n",
        "from {{.pipeline_name}}.transformations import taxi_stats\n",
        "\n",
        "spark = SparkSession.builder.getOrCreate()\n",
        "spark.sql('SELECT * FROM taxi_stats').show()"
      ]
    }
  ],
  "metadata": {
    "application/vnd.databricks.v1+notebook": {
      "dashboards": [],
      "language": "python",
      "notebookMetadata": {
        "pythonIndentUnit": 2
      },
      "notebookName": "ipynb-notebook",
      "widgets": {}
    },
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "name": "python",
      "version": "3.11.4"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
```
Lines changed: 8 additions & 0 deletions
```python
import dlt
from pyspark.sql import DataFrame
from databricks.sdk.runtime import spark


@dlt.view(comment="Small set of taxis for development (uses LIMIT 10)")
def taxis() -> DataFrame:
    return spark.sql("SELECT * FROM samples.nyctaxi.trips LIMIT 10")
```
Lines changed: 8 additions & 0 deletions
```python
import dlt
from pyspark.sql import DataFrame
from databricks.sdk.runtime import spark


# Reads the full samples.nyctaxi.trips dataset (the dev source above
# limits the same view to 10 rows for quick iteration).
@dlt.view
def taxis() -> DataFrame:
    return spark.sql("SELECT * FROM samples.nyctaxi.trips")
```
Lines changed: 7 additions & 0 deletions
```python
from ..sources.dev.taxis import taxis
from ..transformations import taxi_stats


def test_taxi_stats():
    result = taxi_stats.filter_taxis(taxis())
    assert len(result.collect()) > 5
```
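Both sources call spark.sql, so this test needs a live Spark session; per the commit message, `uv run test` runs the suite on serverless compute, while `uv run pytest` uses whatever compute the IDE or .databrickscfg is configured for. The session plumbing is not part of this diff; a hedged sketch of the usual Databricks Connect ingredient:

```python
# Sketch only (not from this diff): obtain a remote Spark session via
# Databricks Connect, authenticating from the environment or
# ~/.databrickscfg. Requires the databricks-connect package.
from databricks.connect import DatabricksSession

spark = DatabricksSession.builder.getOrCreate()
print(spark.sql("SELECT 1 AS ok").collect())  # [Row(ok=1)]
```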
Lines changed: 9 additions & 0 deletions
```python
# __init__.py defines the 'transformations' Python package
import importlib
import pkgutil


# Import all modules in the package except those starting with '_', like '__init__.py'
for _, module_name, _ in pkgutil.iter_modules(__path__):
    if not module_name.startswith("_"):
        importlib.import_module(f"{__name__}.{module_name}")
```
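The eager imports matter because @dlt.table and @dlt.view register datasets as a side effect of each module import. The loop itself is a plain standard-library idiom; as an illustration, here it is run against the stdlib json package (a stand-in for any package directory):

```python
import json  # stand-in package; any package with submodules works
import pkgutil

# iter_modules yields (finder, module_name, is_pkg) for every module
# file sitting next to the package's __init__.py.
for _, module_name, _ in pkgutil.iter_modules(json.__path__):
    print(module_name)  # prints: decoder, encoder, scanner, tool
```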
Lines changed: 20 additions & 0 deletions
```python
import dlt
from pyspark.sql.functions import to_date, count
from pyspark.sql import DataFrame


@dlt.table(comment="Daily statistics of NYC Taxi trips")
def taxi_stats() -> DataFrame:
    """Read from the 'taxis' view from etl_pipeline/sources."""
    taxis = dlt.read("taxis")

    return filter_taxis(taxis)


def filter_taxis(taxis: DataFrame) -> DataFrame:
    """Group by date and calculate the number of trips."""
    return (
        taxis.withColumn("pickup_date", to_date("tpep_pickup_datetime"))
        .groupBy("pickup_date")
        .agg(count("*").alias("number_of_trips"))
    )
```
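Because filter_taxis is a pure DataFrame-to-DataFrame function, it is easy to exercise outside DLT as well. A minimal local sketch of the same grouping, assuming only a local PySpark installation (the column name mirrors samples.nyctaxi.trips):

```python
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, to_date

spark = SparkSession.builder.getOrCreate()
trips = spark.createDataFrame(
    [(datetime(2016, 2, 14, 16, 52),), (datetime(2016, 2, 14, 17, 5),)],
    ["tpep_pickup_datetime"],
)

# Same grouping as filter_taxis above: one row per pickup date.
(
    trips.withColumn("pickup_date", to_date("tpep_pickup_datetime"))
    .groupBy("pickup_date")
    .agg(count("*").alias("number_of_trips"))
    .show()
)
# +-----------+---------------+
# |pickup_date|number_of_trips|
# +-----------+---------------+
# | 2016-02-14|              2|
# +-----------+---------------+
```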
