10 changes: 10 additions & 0 deletions contrib/templates/file-push/README.md
@@ -0,0 +1,10 @@
# Zerobus - File Mode

This is an (experimental) template for creating a file push pipeline with Databricks Asset Bundles.

Install it using
```
databricks bundle init --template-dir contrib/templates/file-push https://github.com/databricks/bundle-examples
```

and follow the generated README.md to get started.
22 changes: 22 additions & 0 deletions contrib/templates/file-push/databricks_template_schema.json
@@ -0,0 +1,22 @@
{
  "welcome_message": "\nWelcome to the file-push template for Databricks Asset Bundles!\n\nA workspace was selected based on your current profile. For information about how to change this, see https://docs.databricks.com/dev-tools/cli/profiles.html.\nworkspace_host: {{workspace_host}}",
  "properties": {
    "catalog_name": {
      "type": "string",
      "description": "\nPlease provide the name of an EXISTING UC catalog with default storage enabled.\nCatalog Name",
      "order": 1,
      "default": "main",
      "pattern": "^[a-z_][a-z0-9_]{0,254}$",
      "pattern_match_failure_message": "Name must only consist of lowercase letters, numbers, and underscores."
    },
    "schema_name": {
      "type": "string",
      "description": "\nPlease provide a NEW schema name where the pipelines and tables will land.\nSchema Name",
      "order": 2,
      "default": "filepushschema",
      "pattern": "^[a-z_][a-z0-9_]{0,254}$",
      "pattern_match_failure_message": "Name must only consist of lowercase letters, numbers, and underscores."
    }
  },
  "success_message": "\nBundle folder '{{.catalog_name}}.{{.schema_name}}' has been created. Please refer to the README.md for next steps."
}
@@ -0,0 +1,164 @@
# Zerobus - File Mode

A lightweight, no-code file ingestion workflow. Configure a set of tables, get a volume path for each, and drop files into those paths; your data lands in Unity Catalog tables via Auto Loader and a Lakeflow pipeline.

## Table of Contents
- [Quick Start](#quick-start)
- [Step 1. Configure tables](#step-1-configure-tables)
- [Step 2. Deploy & set up](#step-2-deploy--set-up)
- [Step 3. Retrieve endpoint & push files](#step-3-retrieve-endpoint--push-files)
- [Debug Table Issues](#debug-table-issues)
- [Step 1. Configure tables to debug](#step-1-configure-tables-to-debug)
- [Step 2. Deploy & set up in dev mode](#step-2-deploy--set-up-in-dev-mode)
- [Step 3. Retrieve endpoint & push files to debug](#step-3-retrieve-endpoint--push-files-to-debug)
- [Step 4. Debug table configs](#step-4-debug-table-configs)
- [Step 5. Fix the table configs in production](#step-5-fix-the-table-configs-in-production)

---

## Quick Start

### Step 1. Configure tables
Edit table configs in `./src/configs/tables.json`. Only `name` and `format` are required.

Currently supported formats are `csv`, `json`, `avro` and `parquet`.

For supported `format_options`, see the [Auto Loader options](https://docs.databricks.com/aws/en/ingestion/cloud-object-storage/auto-loader/options). Not all options are supported here. If unsure, specify only `name` and `format`, or follow [Debug Table Issues](#debug-table-issues) to discover the correct options.

```json
[
  {
    "name": "table1",
    "format": "csv",
    "format_options": {
      "escape": "\""
    },
    "schema_hints": "id int, name string"
  },
  {
    "name": "table2",
    "format": "json"
  }
]
```

> **Tip:** Keep `schema_hints` minimal; Auto Loader can evolve the schema as new columns appear.

### Step 2. Deploy & set up

```bash
databricks bundle deploy
databricks bundle run configuration_job
```

Wait for the configuration job to finish before moving on.
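
If you want to watch the run in the workspace UI, you can open the job directly from the bundle (the resource key matches the `configuration_job` defined under `resources/`):

```bash
databricks bundle open configuration_job
```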

### Step 3. Retrieve endpoint & push files
First, grant write permissions on the volume so the client can push files. Open the volume in the workspace to manage its permissions:

```bash
databricks bundle open filepush_volume
```

Fetch the volume path for uploading files to a specific table (example: `table1`):

```bash
databricks tables get {{.catalog_name}}.{{.schema_name}}.table1 --output json \
| jq -r '.properties["filepush.table_volume_path_data"]'
```

Example output:

```text
/Volumes/{{.catalog_name}}/{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1
```

Upload files to the path above using any of the [Volumes file APIs](https://docs.databricks.com/aws/en/volumes/volume-files#methods-for-managing-files-in-volumes).

**Databricks CLI example** (destination uses the `dbfs:` scheme):

```bash
databricks fs cp /local/file/path/datafile1.csv \
dbfs:/Volumes/{{.catalog_name}}/{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1
```
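
If you have several files for the same table, the CLI can also copy a whole local directory recursively (this sketch assumes the directory contains only files intended for `table1`):

```bash
databricks fs cp -r /local/file/path/ \
  dbfs:/Volumes/{{.catalog_name}}/{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1
```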

**REST API example**:

```bash
# prerequisites: export DATABRICKS_HOST and DATABRICKS_TOKEN (a personal access token)
curl -X PUT "$DATABRICKS_HOST/api/2.0/fs/files/Volumes/{{.catalog_name}}/{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1/datafile1.csv" \
-H "Authorization: Bearer $DATABRICKS_TOKEN" \
-H "Content-Type: application/octet-stream" \
--data-binary @"/local/file/path/datafile1.csv"
```

Within about a minute, the data should appear in the table `{{.catalog_name}}.{{.schema_name}}.table1`.
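
If you'd rather verify from the command line than open the workspace, one option (a sketch, not part of the template) is a quick row count through the SQL Statement Execution API; `$WAREHOUSE_ID` is a placeholder for any SQL warehouse you can access:

```bash
# Sketch only: count rows in the target table via the Statement Execution API.
# WAREHOUSE_ID is a placeholder; set it to a SQL warehouse you have access to.
databricks api post /api/2.0/sql/statements --json '{
  "warehouse_id": "'"$WAREHOUSE_ID"'",
  "statement": "SELECT COUNT(*) FROM {{.catalog_name}}.{{.schema_name}}.table1",
  "wait_timeout": "30s"
}'
```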

---

## Debug Table Issues
If data isn’t parsed as expected, use **dev mode** to iterate on table options safely.

### Step 1. Configure tables to debug
Configure tables as in [Step 1 of Quick Start](#step-1-configure-tables).

### Step 2. Deploy & set up in **dev mode**

```bash
databricks bundle deploy -t dev
databricks bundle run configuration_job -t dev
```

Wait for the configuration job to finish. Example output:

```text
2025-09-23 22:03:04,938 [INFO] initialization - ==========
catalog_name: {{.catalog_name}}
schema_name: dev_first_last_{{.schema_name}}
volume_path_root: /Volumes/{{.catalog_name}}/dev_first_last_{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume
volume_path_data: /Volumes/{{.catalog_name}}/dev_first_last_{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data
volume_path_archive: /Volumes/{{.catalog_name}}/dev_first_last_{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/archive
==========
```

> **Note:** In **dev mode**, the schema name is **prefixed**. Use the printed schema name for the remaining steps.

### Step 3. Retrieve endpoint & push files to debug

Get the dev volume path (note the **prefixed schema**):

```bash
databricks tables get {{.catalog_name}}.dev_first_last_{{.schema_name}}.table1 --output json \
| jq -r '.properties["filepush.table_volume_path_data"]'
```

Example output:

```text
/Volumes/{{.catalog_name}}/dev_first_last_{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1
```

Then follow the upload instructions from [Quick Start → Step 3](#step-3-retrieve-endpoint--push-files) to send test files.
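
For example, the CLI upload from Quick Start, pointed at the dev path above (substitute the prefixed schema name printed by your configuration job for `dev_first_last_{{.schema_name}}`):

```bash
databricks fs cp /local/file/path/datafile1.csv \
  dbfs:/Volumes/{{.catalog_name}}/dev_first_last_{{.schema_name}}/{{.catalog_name}}_{{.schema_name}}_filepush_volume/data/table1
```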

### Step 4. Debug table configs
Open the pipeline in the workspace:

```bash
databricks bundle open refresh_pipeline -t dev
```

Click **Edit pipeline** to launch the development UI. Open the `debug_table_config` notebook and follow its guidance to refine the table options. When satisfied, copy the final config back to `./src/configs/tables.json`.

### Step 5. Fix the table configs in production
Redeploy the updated config and run a full refresh to correct existing data for an affected table:

```bash
databricks bundle deploy
databricks bundle run refresh_pipeline --full-refresh table1
```
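
If more than one table was affected, `--full-refresh` should also accept a comma-separated list of tables (worth confirming against your CLI version); `table2` here is only an illustration:

```bash
databricks bundle run refresh_pipeline --full-refresh table1,table2
```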

---

**That’s it!** You now have a managed, push-based file ingestion workflow with debuggable table configs and repeatable deployments.

@@ -0,0 +1,37 @@
# databricks.yml
# This is the configuration for the file push DAB (Databricks Asset Bundle).

bundle:
  name: {{.schema_name}}
  uuid: {{bundle_uuid}}

include:
  - resources/*.yml

targets:
  # The deployment targets. See https://docs.databricks.com/en/dev-tools/bundles/deployment-modes.html
  dev:
    mode: development
    workspace:
      host: {{workspace_host}}

  prod:
    mode: production
    default: true
**Contributor:** Why is prod the default?

**@chi-yang-db (Contributor, Author), Oct 21, 2025:** In dev, the schema name is renamed with a prefix and the file trigger is paused by default. We would like to reduce such onboarding hiccups for customers. I have verified that if a customer tries to create a schema with an existing schema name, the deployment is blocked, so an existing schema will not be accidentally destroyed by the DAB cleanup.

**@pietern (Contributor), Oct 22, 2025:** That's true. You can address this by removing "mode: development" entirely and using a target without a mode.

    workspace:
      host: {{workspace_host}}
      root_path: /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}
    permissions:
      - user_name: ${workspace.current_user.userName}
        level: CAN_MANAGE

variables:
  catalog_name:
    description: The existing catalog where the NEW schema will be created.
    default: {{.catalog_name}}
  schema_name:
    description: The name of the NEW schema where the tables will be created.
    default: {{.schema_name}}
  resource_name_prefix:
    description: The prefix for the resource names.
    default: ${var.catalog_name}_${var.schema_name}_
@@ -0,0 +1,46 @@
# The jobs for the schema DAB
# The file trigger job runs the refresh pipeline when new files arrive in the volume

resources:
  jobs:
    filetrigger_job:
      name: ${var.resource_name_prefix}filetrigger_job
      tasks:
        - task_key: pipeline_refresh
          pipeline_task:
            pipeline_id: ${resources.pipelines.refresh_pipeline.id}
      trigger:
        file_arrival:
          url: ${resources.volumes.filepush_volume.volume_path}/data/
    configuration_job:
      name: ${var.resource_name_prefix}configuration_job
      tasks:
        - task_key: initialization
          spark_python_task:
            python_file: ../src/utils/initialization.py
            parameters:
              - "--catalog_name"
              - "{{job.parameters.catalog_name}}"
              - "--schema_name"
              - "{{job.parameters.schema_name}}"
              - "--volume_path_root"
              - "{{job.parameters.volume_path_root}}"
              - "--logging_level"
              - "${bundle.target}"
          environment_key: serverless
        - task_key: trigger_refresh
          run_job_task:
            job_id: ${resources.jobs.filetrigger_job.id}
          depends_on:
            - task_key: initialization
      environments:
        - environment_key: serverless
          spec:
            client: "3"
      parameters:
        - name: catalog_name
          default: ${var.catalog_name}
        - name: schema_name
          default: ${resources.schemas.main_schema.name}
        - name: volume_path_root
          default: ${resources.volumes.filepush_volume.volume_path}
@@ -0,0 +1,16 @@
# The table refresh pipeline for the schema DAB

resources:
  pipelines:
    refresh_pipeline:
      name: ${var.resource_name_prefix}refresh_pipeline
      catalog: ${var.catalog_name}
      schema: ${resources.schemas.main_schema.name}
      serverless: true
      libraries:
        - file:
            path: ../src/ingestion.py
      root_path: ../src
      configuration:
        lakeflow.experimantal.filepush.version: 0.1
        filepush.volume_path_root: ${resources.volumes.filepush_volume.volume_path}
@@ -0,0 +1,7 @@
# The schema resource for the DAB

resources:
  schemas:
    main_schema:
      name: ${var.schema_name}
      catalog_name: ${var.catalog_name}
@@ -0,0 +1,8 @@
# The file staging volume for the schema DAB

resources:
  volumes:
    filepush_volume:
      name: ${var.resource_name_prefix}filepush_volume
      catalog_name: ${var.catalog_name}
      schema_name: ${var.schema_name}
@@ -0,0 +1,18 @@
[
  {
    "name": "example_table_csv",
    "format": "csv"
  },
  {
    "name": "example_table_json",
    "format": "json"
  },
  {
    "name": "example_table_avro",
    "format": "avro"
  },
  {
    "name": "example_table_parquet",
    "format": "parquet"
  }
]