From 5176b569d3ecba6d94f2c7ebbfc8c3b732011c1e Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:50:18 +0530
Subject: [PATCH 01/13] Add example for a job writing to a UC Volume

---
 .../save_job_result_to_volume/.gitignore      |  8 ++
 .../save_job_result_to_volume/README.md       | 46 +++++++++++
 .../save_job_result_to_volume/databricks.yml  | 14 ++++
 .../resources/my_volume.volume.yml            |  9 +++
 .../resources/top_ten_trips.job.yml           | 26 +++++++
 .../resources/trips.schema.yml                |  5 ++
 .../save_job_result_to_volume/src/query.ipynb | 77 +++++++++++++++++++
 7 files changed, 185 insertions(+)
 create mode 100644 knowledge_base/save_job_result_to_volume/.gitignore
 create mode 100644 knowledge_base/save_job_result_to_volume/README.md
 create mode 100644 knowledge_base/save_job_result_to_volume/databricks.yml
 create mode 100644 knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml
 create mode 100644 knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
 create mode 100644 knowledge_base/save_job_result_to_volume/resources/trips.schema.yml
 create mode 100644 knowledge_base/save_job_result_to_volume/src/query.ipynb

diff --git a/knowledge_base/save_job_result_to_volume/.gitignore b/knowledge_base/save_job_result_to_volume/.gitignore
new file mode 100644
index 0000000..0dab7f4
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/.gitignore
+.databricks/
+build/
+dist/
+__pycache__/
+*.egg-info
+.venv/
+scratch/**
+!scratch/README.md
diff --git a/knowledge_base/save_job_result_to_volume/README.md b/knowledge_base/save_job_result_to_volume/README.md
new file mode 100644
index 0000000..dfe517d
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/README.md
+# my_project
+
+The 'my_project' project was generated by using the default-python template.
+
+## Getting started
+
+1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+2. Authenticate to your Databricks workspace, if you have not done so already:
+   ```
+   $ databricks configure
+   ```
+
+3. To deploy a development copy of this project, type:
+   ```
+   $ databricks bundle deploy --target dev
+   ```
+   (Note that "dev" is the default target, so the `--target` parameter
+   is optional here.)
+
+   This deploys everything that's defined for this project.
+   For example, the default template would deploy a job called
+   `[dev yourname] my_project_job` to your workspace.
+   You can find that job by opening your workspace and clicking on **Workflows**.
+
+4. Similarly, to deploy a production copy, type:
+   ```
+   $ databricks bundle deploy --target prod
+   ```
+
+   Note that the default job from the template has a schedule that runs every day
+   (defined in resources/my_project.job.yml). The schedule
+   is paused when deploying in development mode (see
+   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
+
+5. To run a job or pipeline, use the "run" command:
+   ```
+   $ databricks bundle run
+   ```
+
+6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
+   https://docs.databricks.com/dev-tools/vscode-ext.html.
+
+7. For documentation on the Databricks asset bundles format used
+   for this project, and for CI/CD configuration, see
+   https://docs.databricks.com/dev-tools/bundles/index.html.
diff --git a/knowledge_base/save_job_result_to_volume/databricks.yml b/knowledge_base/save_job_result_to_volume/databricks.yml
new file mode 100644
index 0000000..92220e4
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/databricks.yml
+# This is a Databricks asset bundle definition for my_project.
+# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
+bundle:
+  name: nyc_top_trips_project
+
+include:
+  - resources/*.yml
+
+workspace:
+  host: https://e2-dogfood.staging.cloud.databricks.com
+
+targets:
+  dev:
+    default: true
diff --git a/knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml b/knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml
new file mode 100644
index 0000000..4431a6b
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml
+resources:
+  volumes:
+    my_volume:
+      catalog_name: main
+      name: my_volume
+      # We use the ${resources.schemas...} interpolation syntax to force the creation
+      # of the schema before the volume. Usage of the ${resources.schemas...} syntax
+      # allows Databricks Asset Bundles to form a dependency graph between resources.
+      schema_name: ${resources.schemas.trips.name}
\ No newline at end of file
diff --git a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
new file mode 100644
index 0000000..20836cb
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
+resources:
+  jobs:
+    top_ten_trips:
+      name: top_ten_trips
+
+      trigger:
+        # Run this job every day to update the all-time top ten trips report.
+        periodic:
+          interval: 1
+          unit: DAYS
+
+      # No job cluster is configured. The job will run on serverless compute.
+      # You can explictly configure job compute here if your workspace does
+      # not have serverless compute enabled.
+      tasks:
+        - task_key: top_ten_trips_task
+          notebook_task:
+            notebook_path: ../src/query.ipynb
+
+      parameters:
+        - name: schema_name
+          default: ${resources.schemas.trips.name}
+
+
+
+
diff --git a/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml b/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml
new file mode 100644
index 0000000..6710daf
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml
+resources:
+  schemas:
+    trips:
+      catalog_name: main
+      name: ${workspace.current_user.short_name}_trips
diff --git a/knowledge_base/save_job_result_to_volume/src/query.ipynb b/knowledge_base/save_job_result_to_volume/src/query.ipynb
new file mode 100644
index 0000000..91a11ea
--- /dev/null
+++ b/knowledge_base/save_job_result_to_volume/src/query.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Default notebook\n",
+    "\n",
+    "This default notebook is executed using Databricks Workflows as defined in resources/top_ten_trips.job.yml."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "schema_name = dbutils.widgets.get(\"schema_name\")\n", + "best_trips_df = spark.sql(\"SELECT * FROM samples.nyctaxi.trips order by fare_amount desc LIMIT 10\")\n", + "best_trips_df.write.format(\"csv\").option(\"header\", \"true\").save(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Load the saved data from the UC volume and display it.\n", + "df = spark.read.format(\"csv\").load(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")\n", + "\n", + "display(df)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "notebook", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c4bc362f67462bf1d96f0b310dfe9eb2f023352a Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 6 Dec 2024 02:52:16 +0530 Subject: [PATCH 02/13] - --- .../save_job_result_to_volume/.gitignore | 9 +--- .../save_job_result_to_volume/README.md | 48 +++++-------------- 2 files changed, 12 insertions(+), 45 deletions(-) diff --git a/knowledge_base/save_job_result_to_volume/.gitignore b/knowledge_base/save_job_result_to_volume/.gitignore index 0dab7f4..a4561ff 100644 --- a/knowledge_base/save_job_result_to_volume/.gitignore +++ b/knowledge_base/save_job_result_to_volume/.gitignore @@ -1,8 +1 @@ -.databricks/ -build/ -dist/ -__pycache__/ -*.egg-info -.venv/ -scratch/** -!scratch/README.md +.databricks/ \ No newline at end of file diff --git a/knowledge_base/save_job_result_to_volume/README.md b/knowledge_base/save_job_result_to_volume/README.md index dfe517d..00d3e97 100644 --- a/knowledge_base/save_job_result_to_volume/README.md +++ b/knowledge_base/save_job_result_to_volume/README.md @@ -1,46 +1,20 @@ -# my_project +# Save job result to volume -The 'my_project' project was generated by using the default-python template. +This example demonstrates how to define and use a Unity Catalog Volume in a Databricks Asset Bundle. -## Getting started +Specifically we'll define a `top_ten_trips` job which computes the top ten NYC trips with the highest +fares and stores the result in a Unity Catalog Volume. -1. Install the Databricks CLI from https://docs.databricks.com/dev-tools/cli/databricks-cli.html +The bundle also defines a Volume and the associated schema to which we'll store the results to. -2. Authenticate to your Databricks workspace, if you have not done so already: - ``` - $ databricks configure - ``` +## Prerequisites -3. To deploy a development copy of this project, type: - ``` - $ databricks bundle deploy --target dev - ``` - (Note that "dev" is the default target, so the `--target` parameter - is optional here.) +* Databricks CLI v0.236.0 or above - This deploys everything that's defined for this project. - For example, the default template would deploy a job called - `[dev yourname] my_project_job` to your workspace. 
-   You can find that job by opening your workspace and clicking on **Workflows**.
+## Usage

-4. Similarly, to deploy a production copy, type:
-   ```
-   $ databricks bundle deploy --target prod
-   ```
+Update the `host` field under `workspace` in `databricks.yml` to the Databricks workspace you wish to deploy to.

-   Note that the default job from the template has a schedule that runs every day
-   (defined in resources/my_project.job.yml). The schedule
-   is paused when deploying in development mode (see
-   https://docs.databricks.com/dev-tools/bundles/deployment-modes.html).
+Run `databricks bundle deploy` to deploy the job.

-5. To run a job or pipeline, use the "run" command:
-   ```
-   $ databricks bundle run
-   ```
-
-6. Optionally, install developer tools such as the Databricks extension for Visual Studio Code from
-   https://docs.databricks.com/dev-tools/vscode-ext.html.
-
-7. For documentation on the Databricks asset bundles format used
-   for this project, and for CI/CD configuration, see
-   https://docs.databricks.com/dev-tools/bundles/index.html.
+Run `databricks bundle run top_ten_trips` to run the job and store the results in UC volume.

From b35c8035c3c1fa8719c58715afba4a3f5b28fff7 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:53:00 +0530
Subject: [PATCH 03/13] -

---
 knowledge_base/save_job_result_to_volume/databricks.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/knowledge_base/save_job_result_to_volume/databricks.yml b/knowledge_base/save_job_result_to_volume/databricks.yml
index 92220e4..ed789c1 100644
--- a/knowledge_base/save_job_result_to_volume/databricks.yml
+++ b/knowledge_base/save_job_result_to_volume/databricks.yml
 # This is a Databricks asset bundle definition for my_project.
 # See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
 bundle:
-  name: nyc_top_trips_project
+  name: save_job_result_to_volume

From 84c6a473b7b8a75691167a2538f1317f5a020abb Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:53:44 +0530
Subject: [PATCH 04/13] -

---
 knowledge_base/save_job_result_to_volume/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/knowledge_base/save_job_result_to_volume/README.md b/knowledge_base/save_job_result_to_volume/README.md
index 00d3e97..f156d72 100644
--- a/knowledge_base/save_job_result_to_volume/README.md
+++ b/knowledge_base/save_job_result_to_volume/README.md
 Specifically we'll define a `top_ten_trips` job which computes the top ten NYC trips with the highest
 fares and stores the result in a Unity Catalog Volume.

-The bundle also defines a Volume and the associated schema to which we'll store the results to.
+The bundle also defines a Volume and the associated Schema in which we store the results.

From 3f6edb5bfd8c4605851afde2dc1aa9c77b2328b9 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:54:25 +0530
Subject: [PATCH 05/13] -

---
 knowledge_base/save_job_result_to_volume/databricks.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/knowledge_base/save_job_result_to_volume/databricks.yml b/knowledge_base/save_job_result_to_volume/databricks.yml
index ed789c1..2cb900c 100644
--- a/knowledge_base/save_job_result_to_volume/databricks.yml
+++ b/knowledge_base/save_job_result_to_volume/databricks.yml
-# This is a Databricks asset bundle definition for my_project.
-# See https://docs.databricks.com/dev-tools/bundles/index.html for documentation.
 bundle:
   name: save_job_result_to_volume

From 0715a0268df054668a1cb65957bd5895a501213f Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:55:36 +0530
Subject: [PATCH 06/13] -

---
 .../save_job_result_to_volume/resources/top_ten_trips.job.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
index 20836cb..6d0ed30 100644
--- a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
+++ b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
       trigger:
-        # Run this job every day to update the all-time top ten trips report.
+        # Run this job every day to update the all-time top ten trips report stored in the UC Volume.
         periodic:
           interval: 1
           unit: DAYS

From ff31515c0dda2b0b7fc0ffc465dd0160768d5885 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:55:46 +0530
Subject: [PATCH 07/13] -

---
 .../save_job_result_to_volume/resources/top_ten_trips.job.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
index 6d0ed30..1338828 100644
--- a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
+++ b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml
       # No job cluster is configured. The job will run on serverless compute.
-      # You can explictly configure job compute here if your workspace does
+      # You can explicitly configure job compute here if your workspace does
       # not have serverless compute enabled.

From 3e83f88b6a569c24067d57d2bfe4c14dd41c4f76 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 02:56:47 +0530
Subject: [PATCH 08/13] -

---
 .../save_job_result_to_volume/src/query.ipynb | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/knowledge_base/save_job_result_to_volume/src/query.ipynb b/knowledge_base/save_job_result_to_volume/src/query.ipynb
index 91a11ea..c489ba1 100644
--- a/knowledge_base/save_job_result_to_volume/src/query.ipynb
+++ b/knowledge_base/save_job_result_to_volume/src/query.ipynb
 {
  "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "ee353e42-ff58-4955-9608-12865bd0950e",
-     "showTitle": false,
-     "title": ""
-    }
-   },
-   "source": [
-    "# Default notebook\n",
-    "\n",
-    "This default notebook is executed using Databricks Workflows as defined in resources/top_ten_trips.job.yml."
- ] - }, { "cell_type": "code", "execution_count": null, From 82624d42190126fa7c8af11d6b99063e13b3e7e2 Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 6 Dec 2024 03:00:32 +0530 Subject: [PATCH 09/13] - --- knowledge_base/save_job_result_to_volume/src/query.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/knowledge_base/save_job_result_to_volume/src/query.ipynb b/knowledge_base/save_job_result_to_volume/src/query.ipynb index c489ba1..f8b407c 100644 --- a/knowledge_base/save_job_result_to_volume/src/query.ipynb +++ b/knowledge_base/save_job_result_to_volume/src/query.ipynb @@ -6,6 +6,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Run the SQL query to get the top 10 trips by fare amount and store the result in CSV format in a Volume.\n", "schema_name = dbutils.widgets.get(\"schema_name\")\n", "best_trips_df = spark.sql(\"SELECT * FROM samples.nyctaxi.trips order by fare_amount desc LIMIT 10\")\n", "best_trips_df.write.format(\"csv\").option(\"header\", \"true\").save(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")" @@ -28,7 +29,7 @@ }, "outputs": [], "source": [ - "# Load the saved data from the UC volume and display it.\n", + "# Load the saved data from the Volume and display it.\n", "df = spark.read.format(\"csv\").load(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")\n", "\n", "display(df)" From a09967811ee5ceeccf1a8564d17e291e26b36e0c Mon Sep 17 00:00:00 2001 From: Shreyas Goenka Date: Fri, 6 Dec 2024 16:29:21 +0530 Subject: [PATCH 10/13] address comments --- .../resources/top_ten_trips.job.yml | 26 -------- .../resources/trips.schema.yml | 5 -- .../save_job_result_to_volume/src/query.ipynb | 61 ------------------- .../.gitignore | 0 .../README.md | 8 +-- .../databricks.yml | 2 +- .../resources/hello_world.job.yml | 20 ++++++ .../resources/hello_world.schema.yml | 5 ++ .../resources/my_volume.volume.yml | 4 +- .../write_from_job_to_volume/src/hello.ipynb | 21 +++++++ 10 files changed, 53 insertions(+), 99 deletions(-) delete mode 100644 knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml delete mode 100644 knowledge_base/save_job_result_to_volume/resources/trips.schema.yml delete mode 100644 knowledge_base/save_job_result_to_volume/src/query.ipynb rename knowledge_base/{save_job_result_to_volume => write_from_job_to_volume}/.gitignore (100%) rename knowledge_base/{save_job_result_to_volume => write_from_job_to_volume}/README.md (60%) rename knowledge_base/{save_job_result_to_volume => write_from_job_to_volume}/databricks.yml (80%) create mode 100644 knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml create mode 100644 knowledge_base/write_from_job_to_volume/resources/hello_world.schema.yml rename knowledge_base/{save_job_result_to_volume => write_from_job_to_volume}/resources/my_volume.volume.yml (84%) create mode 100644 knowledge_base/write_from_job_to_volume/src/hello.ipynb diff --git a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml b/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml deleted file mode 100644 index 1338828..0000000 --- a/knowledge_base/save_job_result_to_volume/resources/top_ten_trips.job.yml +++ /dev/null @@ -1,26 +0,0 @@ -resources: - jobs: - top_ten_trips: - name: top_ten_trips - - trigger: - # Run this job every day to update all time the top ten trips recorded top - # 10 trips in the UC Volume. - interval: 1 - unit: DAYS - - # No job cluster is configured. The job will run on serverless compute. 
- # You can explicitly configure job compute here if your workspace does - # not have serverless compute enabled. - tasks: - - task_key: top_ten_trips_task - notebook_task: - notebook_path: ../src/query.ipynb - - parameters: - - name: schema_name - default: ${resources.schemas.trips.name} - - - - diff --git a/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml b/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml deleted file mode 100644 index 6710daf..0000000 --- a/knowledge_base/save_job_result_to_volume/resources/trips.schema.yml +++ /dev/null @@ -1,5 +0,0 @@ -resources: - schemas: - trips: - catalog_name: main - name: ${workspace.current_user.short_name}_trips diff --git a/knowledge_base/save_job_result_to_volume/src/query.ipynb b/knowledge_base/save_job_result_to_volume/src/query.ipynb deleted file mode 100644 index f8b407c..0000000 --- a/knowledge_base/save_job_result_to_volume/src/query.ipynb +++ /dev/null @@ -1,61 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Run the SQL query to get the top 10 trips by fare amount and store the result in CSV format in a Volume.\n", - "schema_name = dbutils.widgets.get(\"schema_name\")\n", - "best_trips_df = spark.sql(\"SELECT * FROM samples.nyctaxi.trips order by fare_amount desc LIMIT 10\")\n", - "best_trips_df.write.format(\"csv\").option(\"header\", \"true\").save(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "6bca260b-13d1-448f-8082-30b60a85c9ae", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Load the saved data from the Volume and display it.\n", - "df = spark.read.format(\"csv\").load(f\"/Volumes/main/{schema_name}/my_volume/top_ten/trip_fares.csv\")\n", - "\n", - "display(df)" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "notebook", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/knowledge_base/save_job_result_to_volume/.gitignore b/knowledge_base/write_from_job_to_volume/.gitignore similarity index 100% rename from knowledge_base/save_job_result_to_volume/.gitignore rename to knowledge_base/write_from_job_to_volume/.gitignore diff --git a/knowledge_base/save_job_result_to_volume/README.md b/knowledge_base/write_from_job_to_volume/README.md similarity index 60% rename from knowledge_base/save_job_result_to_volume/README.md rename to knowledge_base/write_from_job_to_volume/README.md index f156d72..5194ca5 100644 --- a/knowledge_base/save_job_result_to_volume/README.md +++ b/knowledge_base/write_from_job_to_volume/README.md @@ -2,10 +2,10 @@ This example demonstrates how to define and use a Unity Catalog Volume in a Databricks Asset Bundle. -Specifically we'll define a `top_ten_trips` job which computes the top ten NYC trips with the highest -fares and stores the result in a Unity Catalog Volume. +Specifically we'll define a `hello_world_job` job which writes "Hello, World!" +to a file in a Unity Catalog Volume. 
-The bundle also defines a Volume and the associated Schema in which we store the results.
+The bundle also defines a Volume and the associated Schema that the job writes text to.

 ## Prerequisites

 * Databricks CLI v0.236.0 or above

 ## Usage

 Update the `host` field under `workspace` in `databricks.yml` to the Databricks workspace you wish to deploy to.

 Run `databricks bundle deploy` to deploy the job.

-Run `databricks bundle run top_ten_trips` to run the job and store the results in UC volume.
+Run `databricks bundle run hello_world_job` to run the job and store the result in the UC Volume.
diff --git a/knowledge_base/save_job_result_to_volume/databricks.yml b/knowledge_base/write_from_job_to_volume/databricks.yml
similarity index 80%
rename from knowledge_base/save_job_result_to_volume/databricks.yml
rename to knowledge_base/write_from_job_to_volume/databricks.yml
index 2cb900c..7ae0888 100644
--- a/knowledge_base/save_job_result_to_volume/databricks.yml
+++ b/knowledge_base/write_from_job_to_volume/databricks.yml
 bundle:
-  name: save_job_result_to_volume
+  name: write_from_job_to_volume

 include:
   - resources/*.yml
diff --git a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
new file mode 100644
index 0000000..d1d8b03
--- /dev/null
+++ b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
+resources:
+  jobs:
+    hello_world_job:
+      name: hello_world_job
+
+      # No job cluster is configured. The job will run on serverless compute.
+      # You can explicitly configure job compute here if your workspace does
+      # not have serverless compute enabled.
+      tasks:
+        - task_key: hello_world_job_task
+          notebook_task:
+            notebook_path: ../src/hello.ipynb
+
+      parameters:
+        - name: file_path
+          default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
+
+
+
+
diff --git a/knowledge_base/write_from_job_to_volume/resources/hello_world.schema.yml b/knowledge_base/write_from_job_to_volume/resources/hello_world.schema.yml
new file mode 100644
index 0000000..e1c0ed6
--- /dev/null
+++ b/knowledge_base/write_from_job_to_volume/resources/hello_world.schema.yml
+resources:
+  schemas:
+    hello_world_schema:
+      catalog_name: main
+      name: ${workspace.current_user.short_name}_hello_world
diff --git a/knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml b/knowledge_base/write_from_job_to_volume/resources/my_volume.volume.yml
similarity index 84%
rename from knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml
rename to knowledge_base/write_from_job_to_volume/resources/my_volume.volume.yml
index 4431a6b..7e47321 100644
--- a/knowledge_base/save_job_result_to_volume/resources/my_volume.volume.yml
+++ b/knowledge_base/write_from_job_to_volume/resources/my_volume.volume.yml
 resources:
   volumes:
     my_volume:
       catalog_name: main
-      name: my_volume
       # We use the ${resources.schemas...} interpolation syntax to force the creation
       # of the schema before the volume. Usage of the ${resources.schemas...} syntax
       # allows Databricks Asset Bundles to form a dependency graph between resources.
-      schema_name: ${resources.schemas.trips.name}
\ No newline at end of file
+      schema_name: ${resources.schemas.hello_world_schema.name}
+      name: my_volume
diff --git a/knowledge_base/write_from_job_to_volume/src/hello.ipynb b/knowledge_base/write_from_job_to_volume/src/hello.ipynb
new file mode 100644
index 0000000..093e5d4
--- /dev/null
+++ b/knowledge_base/write_from_job_to_volume/src/hello.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Write the greeting to the path passed in through the job's file_path parameter.\n",
+    "file_path = dbutils.widgets.get(\"file_path\")\n",
+    "dbutils.fs.put(file_path, \"Hello, World!\", overwrite=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

From 3f93da66b8ba51afdb554e9b16dcac85e1b91727 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 6 Dec 2024 16:31:50 +0530
Subject: [PATCH 11/13] address comments

---
 .../write_from_job_to_volume/resources/hello_world.job.yml | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
index d1d8b03..def04aa 100644
--- a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
+++ b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
 
       parameters:
         - name: file_path
-          default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
-
-
-
-
+          default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
\ No newline at end of file

From e4d957666e9582bbfef567efe630729d844c9e1f Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 20 Dec 2024 11:47:52 +0530
Subject: [PATCH 12/13] Add trailing newline

---
 .../write_from_job_to_volume/resources/hello_world.job.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
index def04aa..80a35d5 100644
--- a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
+++ b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
 
       parameters:
         - name: file_path
-          default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
\ No newline at end of file
+          default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
+ 
\ No newline at end of file

From f98a0c3d74997dcde1e21f467f677cb6fe553494 Mon Sep 17 00:00:00 2001
From: Shreyas Goenka
Date: Fri, 20 Dec 2024 11:48:32 +0530
Subject: [PATCH 13/13] -

---
 .../write_from_job_to_volume/resources/hello_world.job.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
index 80a35d5..36b13ab 100644
--- a/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
+++ b/knowledge_base/write_from_job_to_volume/resources/hello_world.job.yml
 
       parameters:
         - name: file_path
           default: /Volumes/main/${resources.schemas.hello_world_schema.name}/${resources.volumes.my_volume.name}/hello_world.txt
- 
\ No newline at end of file
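For anyone applying this series end to end, a quick way to verify the finished example is to deploy the bundle, run the job, and read the file back out of the Volume. The commands below are a sketch rather than part of the patch series: they assume the default `main` catalog, the schema name generated by `hello_world.schema.yml` (your workspace user short name plus `_hello_world`, written here as the placeholder `<short_name>`), and a Databricks CLI version whose `fs` commands can read Unity Catalog volume paths.

```
# Deploy the bundle and run the job defined in resources/hello_world.job.yml.
databricks bundle deploy
databricks bundle run hello_world_job

# Read back the file the job wrote to the UC Volume.
databricks fs cat dbfs:/Volumes/main/<short_name>_hello_world/my_volume/hello_world.txt
```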