diff --git a/README.md b/README.md index fd222a63..64d02f85 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ https://github.com/databricks/mlops-stacks/assets/87999496/0d220d55-465e-4a69-bd ### Prerequisites - Python 3.8+ - - [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) >= v0.236.0 + - [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) >= v0.278.0 [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/databricks-cli.html) contains [Databricks asset bundle templates](https://docs.databricks.com/en/dev-tools/bundles/templates.html) for the purpose of project creation. diff --git a/databricks_template_schema.json b/databricks_template_schema.json index 69fc4797..2e560952 100644 --- a/databricks_template_schema.json +++ b/databricks_template_schema.json @@ -1,18 +1,25 @@ { - "welcome_message": "Welcome to MLOps Stacks. For detailed information on project generation, see the README at https://github.com/databricks/mlops-stacks/blob/main/README.md.", - "min_databricks_cli_version": "v0.236.0", + "welcome_message":"Welcome to MLOps Stacks. For detailed information on project generation, see the README at https://github.com/databricks/mlops-stacks/blob/main/README.md.", + "min_databricks_cli_version": "v0.266.0", "properties": { - "input_setup_cicd_and_project": { + "input_project_type": { "order": 1, "type": "string", - "description": "{{if false}}\n\nERROR: This template is not supported by your current Databricks CLI version.\nPlease hit control-C and go to https://docs.databricks.com/en/dev-tools/cli/install.html for instructions on upgrading the CLI to the minimum version supported by MLOps Stacks.\n\n\n{{end}}\nSelect if both CI/CD and the Project should be set up, or only one of them.\nYou can always set up the other later by running initialization again", + "description": "{{if false}}\n\nERROR: This template is not supported by your current Databricks CLI version.\nPlease hit control-C and go to https://docs.databricks.com/en/dev-tools/cli/install.html for instructions on upgrading the CLI to the minimum version supported by MLOps Stacks.\n\n\n{{end}}\nSelect project type", + "default": "mlops", + "enum": ["mlops", "agentops"] + }, + "input_setup_cicd_and_project": { + "order": 2, + "type": "string", + "description": "Select if both CI/CD and the Project should be set up, or only one of them.\nYou can always set up the other later by running initialization again", "default": "CICD_and_Project", "enum": ["CICD_and_Project", "Project_Only", "CICD_Only"] }, "input_project_name": { - "order": 2, + "order": 3, "type": "string", - "default": "my_mlops_project", + "default": "my_{{ .input_project_type }}_project", "description": "\nProject Name. 
Default", "pattern": "^[^ .\\\\/]{3,}$", "pattern_match_failure_message": "Project name must be at least 3 characters long and cannot contain the following characters: \"\\\", \"/\", \" \" and \".\".", @@ -24,8 +31,21 @@ } } }, + "input_doc_link": { + "order": 4, + "type": "string", + "description": "URL of the documentation link", + "default": "https://docs.databricks.com/en/doc-sitemap.xml", + "skip_prompt_if": { + "properties": { + "input_project_type": { + "enum": ["mlops", "agentops"] + } + } + } + }, "input_root_dir": { - "order": 3, + "order": 5, "type": "string", "default": "{{ .input_project_name }}", "description": "\nRoot directory name.\nFor monorepos, name of the root directory that contains all the projects.\nDefault", @@ -38,14 +58,14 @@ } }, "input_cloud": { - "order": 4, + "order": 6, "type": "string", "description": "\nSelect cloud", "default": "azure", "enum": ["azure", "aws", "gcp"] }, "input_cicd_platform": { - "order": 5, + "order": 7, "type": "string", "description": "\nSelect CICD platform", "default": "github_actions", @@ -59,7 +79,7 @@ } }, "input_databricks_staging_workspace_host": { - "order": 6, + "order": 8, "type": "string", "default": "{{if eq .input_cloud `azure`}}https://adb-xxxx.xx.azuredatabricks.net{{else if eq .input_cloud `aws`}}https://your-staging-workspace.cloud.databricks.com{{else if eq .input_cloud `gcp`}}https://your-staging-workspace.gcp.databricks.com{{end}}", "description": "\nURL of staging Databricks workspace,\nIt will run PR CI and preview changes before they're deployed to production.\nDefault", @@ -74,7 +94,7 @@ } }, "input_databricks_prod_workspace_host": { - "order": 7, + "order": 9, "type": "string", "default": "{{if eq .input_cloud `azure`}}https://adb-xxxx.xx.azuredatabricks.net{{else if eq .input_cloud `aws`}}https://your-prod-workspace.cloud.databricks.com{{else if eq .input_cloud `gcp`}}https://your-prod-workspace.gcp.databricks.com{{end}}", "description": "\nURL of production Databricks workspace.\nDefault", @@ -89,7 +109,7 @@ } }, "input_default_branch": { - "order": 8, + "order": 10, "type": "string", "default": "main", "description": "\nName of the default branch,\nStaging resources are deployed from this branch and stages the latest ML code.\nDefault", @@ -102,7 +122,7 @@ } }, "input_release_branch": { - "order": 9, + "order": 11, "type": "string", "default": "release", "description": "\nName of the release branch.\nThe training and other production jobs pull ML code from this branch.\nDefault", @@ -115,7 +135,7 @@ } }, "input_read_user_group": { - "order": 10, + "order": 12, "type": "string", "default": "users", "description": "\nUser group name to give READ permissions to for project resources\n(ML jobs, integration test job runs, and machine learning resources).\nA group with this name must exist in both the staging and prod workspaces.\nDefault", @@ -128,14 +148,21 @@ } }, "input_include_models_in_unity_catalog": { - "order": 11, + "order": 13, "type": "string", "description": "\nWhether to use the Model Registry with Unity Catalog", "default": "no", - "enum": ["yes", "no"] + "enum": ["yes", "no"], + "skip_prompt_if": { + "properties": { + "input_project_type": { + "const": "agentops" + } + } + } }, "input_staging_catalog_name": { - "order": 12, + "order": 14, "type": "string", "description": "\nName of the catalog in Unity Catalog that will host the staging UC resources. 
\nThis catalog must already exist and service principals must have access to it.\nDefault", "default": "staging", @@ -159,7 +186,7 @@ } }, "input_prod_catalog_name": { - "order": 13, + "order": 15, "type": "string", "description": "\nName of the catalog in Unity Catalog that will host the production UC resources.\nThis catalog must already exist and service principals must have access to it.\nDefault", "default": "prod", @@ -183,7 +210,7 @@ } }, "input_test_catalog_name": { - "order": 14, + "order": 16, "type": "string", "description": "\nName of the catalog in Unity Catalog that will be used for integration tests.\nThis catalog must already exist and service principals must have access to it.\nDefault", "default": "test", @@ -207,7 +234,7 @@ } }, "input_schema_name": { - "order": 15, + "order": 17, "type": "string", "description": "\nName of schema to use when registering a model in Unity Catalog.\nThis schema must already exist and service principals must have access.\nWe recommend using the project name.\nDefault", "default": "{{if (eq .input_include_models_in_unity_catalog `no`)}}schema{{else}}{{ .input_project_name }}{{end}}", @@ -233,7 +260,7 @@ } }, "input_unity_catalog_read_user_group": { - "order": 16, + "order": 18, "type": "string", "default": "account users", "description": "\nUser group name to give EXECUTE privileges to models in Unity Catalog (UC).\nIt must exist in UC with access granted to the staging and prod workspaces.\nDefault", @@ -257,36 +284,58 @@ } }, "input_inference_table_name": { - "order": 17, + "order": 19, "type": "string", "description": "\nFully qualified name of inference table to attach monitoring to.\nThis table must already exist and service principals must have access.", "default": "dev.{{ .input_project_name }}.predictions", "pattern": "^[^ .\\-\\/]+(\\.[^ .\\-\\/]+){2}$", "pattern_match_failure_message": "Fully qualified Unity Catalog table names must have catalog, schema, and table separated by \".\" and each cannot contain any of the following characters: \" \", \".\", \"-\", \"\\\", \"/\"", "skip_prompt_if": { + "anyOf":[ + { "properties": { "input_setup_cicd_and_project": { "const": "CICD_Only" } } + }, + { + "properties": { + "input_project_type": { + "const": "agentops" + } + } + } + ] } }, "input_include_feature_store": { - "order": 18, + "order": 20, "type": "string", "description": "\nWhether to include Feature Store", "default": "no", "enum": ["no", "yes"], "skip_prompt_if": { - "properties": { - "input_setup_cicd_and_project": { - "const": "CICD_Only" + "anyOf":[ + { + "properties": { + "input_setup_cicd_and_project": { + "const": "CICD_Only" + } + } + }, + { + "properties": { + "input_project_type": { + "const": "agentops" + } + } } - } + ] } }, "input_include_mlflow_recipes": { - "order": 19, + "order": 21, "type": "string", "description": "\nWhether to include MLflow Recipes", "default": "no", @@ -313,12 +362,19 @@ "const": "CICD_Only" } } + }, + { + "properties": { + "input_project_type": { + "const": "agentops" + } + } } ] } }, "input_docker_image": { - "order": 20, + "order": 22, "type": "string", "description": "\nDocker image for the execution of Gitlab pipelines", "default": "databricksfieldeng/mlopsstacks:latest", @@ -357,4 +413,4 @@ } }, "success_message" : "\n*** Your MLOps Stack has been created in the '{{.input_root_dir}}{{if not (eq .input_setup_cicd_and_project `CICD_Only`) }}/{{.input_project_name}}{{end}}' directory! ***\n\nPlease refer to the README.md for further instructions on getting started." 
-} +} \ No newline at end of file diff --git a/library/template_variables.tmpl b/library/template_variables.tmpl index 805f267d..cacb8754 100644 --- a/library/template_variables.tmpl +++ b/library/template_variables.tmpl @@ -56,7 +56,7 @@ {{- end }} {{ define `cli_version` -}} - v0.236.0 + v0.278.0 {{- end }} {{ define `stacks_version` -}} diff --git a/template/update_layout.tmpl b/template/update_layout.tmpl index a666e23c..7fe38366 100644 --- a/template/update_layout.tmpl +++ b/template/update_layout.tmpl @@ -2,6 +2,28 @@ {{ $project_name_alphanumeric_underscore := (regexp `-`).ReplaceAllString ((regexp `[^A-Za-z0-9_-]`).ReplaceAllString .input_project_name ``) `_` -}} {{ $root_dir := .input_root_dir}} +{{ if (eq .input_project_type `mlops`) }} + # Skip agentops-specific directories and resources + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `data_preparation`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/integration/model_serving_test.py`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/data-preparation-resource.yml`) }} +{{ else if (eq .input_project_type `agentops`) }} + # Skip mlops-specific components + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `training`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/training`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `feature_engineering`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `tests/feature_engineering`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `validation`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `monitoring`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `deployment`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/model-workflow-resource.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/batch-inference-workflow-resource.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/feature-engineering-workflow-resource.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/monitoring-resource.yml`) }} + {{ skip (printf `%s/%s/%s` $root_dir $project_name_alphanumeric_underscore `resources/ml-artifacts-resource.yml`) }} +{{ end }} + {{ if (eq .input_setup_cicd_and_project `Project_Only`) }} {{ skip (printf `%s/%s` $root_dir `.azure`) }} {{ skip (printf `%s/%s` $root_dir `.github`) }} @@ -88,4 +110,4 @@ # Remove template files {{ skip (printf `%s/%s` $root_dir `cicd`) }} {{ skip `update_layout` }} -{{ skip `run_validations` }} +{{ skip `run_validations` }} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl index d9574c8f..4c546179 100644 --- a/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl +++ b/template/{{.input_root_dir}}/.azure/devops-pipelines/{{.input_project_name}}-tests-ci.yml.tmpl @@ -124,6 +124,7 @@ jobs: {{- end }} {{ end }} + {{ if (eq .input_project_type `mlops`) }} # 
Run model_training_job defined in bundle in the staging workspace - script: | databricks bundle run model_training_job -t test @@ -137,3 +138,31 @@ jobs: {{ else -}} DATABRICKS_TOKEN: $(STAGING_WORKSPACE_TOKEN) {{- end }} + {{ else -}} + # Run data_preprocessing_job defined in bundle in the staging workspace + - script: | + databricks bundle run data_preprocessing_job -t test + workingDirectory: $(workingDirectory) + displayName: Run data preprocessing workflow for test deployment target in staging workspace + env: + {{ if (eq .input_cloud `azure`) -}} + ARM_TENANT_ID: $(STAGING_AZURE_SP_TENANT_ID) + ARM_CLIENT_ID: $(STAGING_AZURE_SP_APPLICATION_ID) + ARM_CLIENT_SECRET: $(STAGING_AZURE_SP_CLIENT_SECRET) + {{ else -}} + DATABRICKS_TOKEN: $(STAGING_WORKSPACE_TOKEN) + {{- end }} + # Run agent_development_job defined in bundle in the staging workspace + - script: | + databricks bundle run agent_development_job -t test + workingDirectory: $(workingDirectory) + displayName: Run agent development workflow for test deployment target in staging workspace + env: + {{ if (eq .input_cloud `azure`) -}} + ARM_TENANT_ID: $(STAGING_AZURE_SP_TENANT_ID) + ARM_CLIENT_ID: $(STAGING_AZURE_SP_APPLICATION_ID) + ARM_CLIENT_SECRET: $(STAGING_AZURE_SP_CLIENT_SECRET) + {{ else -}} + DATABRICKS_TOKEN: $(STAGING_WORKSPACE_TOKEN) + {{- end }} + {{ end }} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-run-tests.yml.tmpl b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-run-tests.yml.tmpl index 77a59a6f..bde102f7 100644 --- a/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-run-tests.yml.tmpl +++ b/template/{{.input_root_dir}}/.github/workflows/{{.input_project_name}}-run-tests.yml.tmpl @@ -66,7 +66,18 @@ jobs: run: | databricks bundle run write_feature_table_job -t test {{- end }} + {{- if (eq .input_project_type `mlops`) }} - name: Run Training Workflow for Test Deployment Target in Staging Workspace id: training run: | databricks bundle run model_training_job -t test + {{ else -}} + - name: Run Data Preprocessing Workflow for Test Deployment Target in Staging Workspace + id: data_preprocessing + run: | + databricks bundle run data_preprocessing_job -t test + - name: Run Agent Development Workflow for Test Deployment Target in Staging Workspace + id: agent_development + run: | + databricks bundle run agent_development_job -t test + {{- end }} diff --git a/template/{{.input_root_dir}}/.gitlab/pipelines/{{.input_project_name}}-bundle-ci.yml.tmpl b/template/{{.input_root_dir}}/.gitlab/pipelines/{{.input_project_name}}-bundle-ci.yml.tmpl index 81859eb2..b623accb 100644 --- a/template/{{.input_root_dir}}/.gitlab/pipelines/{{.input_project_name}}-bundle-ci.yml.tmpl +++ b/template/{{.input_root_dir}}/.gitlab/pipelines/{{.input_project_name}}-bundle-ci.yml.tmpl @@ -22,7 +22,15 @@ integration-test: - cd {{template `project_name_alphanumeric_underscore` .}} - databricks bundle validate -t test - databricks bundle deploy -t test + {{ if (eq .input_include_feature_store `yes`) }} - databricks bundle run write_feature_table_job -t test + {{ end }} + {{ if (eq .input_project_type `mlops`) }} - databricks bundle run model_training_job -t test + {{ else -}} + - databricks bundle run data_preprocessing_job -t test + - databricks bundle run agent_development_job -t test + {{ end }} + rules: - when: on_success # mandatory to ensure this job can be called by parent cicd on merge request diff --git 
a/template/{{.input_root_dir}}/README.md.tmpl b/template/{{.input_root_dir}}/README.md.tmpl index 9894e69f..a9ca863f 100644 --- a/template/{{.input_root_dir}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/README.md.tmpl @@ -1,6 +1,6 @@ # {{ .input_root_dir }} -This directory contains an ML project based on the default +{{ if (eq .input_project_type `mlops`) }}This directory contains an ML project based on the default [Databricks MLOps Stacks](https://github.com/databricks/mlops-stacks), defining a production-grade ML pipeline for automated retraining and batch inference of an ML model on tabular data. The "Getting Started" docs can be found at {{ template `generate_doc_link` (map (pair "cloud" .input_cloud) (pair "path" "dev-tools/bundles/mlops-stacks.html")) }}. @@ -97,7 +97,7 @@ contained in the following files: │ ├── training <- Folder for model development via MLflow recipes. │ │ │ │ │ ├── steps <- MLflow recipe steps (Python modules) implementing ML pipeline logic, e.g. model training and evaluation. Most -│ │ │ development work happens here. See https://mlflow.org/docs/latest/recipes.html for details +│ │ │ development work happens here. │ │ │ │ │ ├── notebooks <- Databricks notebook that runs the MLflow recipe, i.e. run the logic in `steps`. Used to │ │ │ drive code execution on Databricks for CI/CD. In most cases, you do not need to modify @@ -196,3 +196,84 @@ run the above mentioned workflow, specifying the `project_name` as a parameter. NOTE: This project has already been initialized with an instantiation of the above workflow, so there's no need to run it again for project `{{.input_project_name}}`. {{ end -}} + +{{ else if (eq .input_project_type `agentops`) }}This directory contains an Agent project defining a production-grade Agent pipeline for automated data preparation, agent development, and deployment. + +## Code structure +This project contains the following components: + +| Component | Description | +|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Agent Code | Example Agent project code and notebooks | +| Agent Resources as Code | Agent artifacts (model and experiment) and agent pipeline resources (data preparation and development jobs with schedules, etc) configured and deployed through databricks CLI bundles. | +| CI/CD | [GitHub Actions](https://github.com/actions) workflows to test and deploy code and resources. + + +contained in the following files: + +``` +my_agent_project <- Root directory. Both monorepo and polyrepo are supported. +│ +├── my_agent_project <- Contains python code, notebooks and resources related to one project. +│ │ +│ ├── databricks.yml <- root bundle file for the project that can be loaded by databricks CLI bundles. It defines the bundle name, workspace URL and resource config component to be included. +│ │ +│ ├── data_preparation <- Retrieves, stores, cleans, and vectorizes source data that is then ingested into a Vector Search index. +│ │ │ +│ │ ├── data_ingestion <- Databricks Documentation scraping retrieval and storage. +│ │ │ +│ │ ├── data_preprocessing <- Documentation cleansing and vectorization. +│ │ │ +│ │ ├── vector_search <- Vector Search and index creation and ingestion. 
+│ │ +│ │ +│ ├── agent_development <- Creates, registers, and evaluates the agent. +│ │ │ +│ │ ├── agent_requirements.txt <- Specifies Python dependencies for agent development, evaluation, and model deployment workflow. +│ │ │ +│ │ ├── agent <- LangGraph Agent creation. +│ │ +│ │ +│ ├── agent_deployment <- Deploys agent on Model Serving endpoint +│ │ │ +│ │ ├── model_serving <- Model serving endpoint for the Agent. +│ │ +│ ├── requirements.txt <- Specifies Python dependencies for data preparation workflow. +│ │ +│ ├── agent_requirements.txt <- Specifies Python dependencies for agent workflow. +│ │ +│ ├── tests <- Tests for the Agent project. +│ │ +│ ├── resources <- Agent resource (Agent jobs, MLflow models) config definitions expressed as code, across dev/staging/prod/test. +│ │ +│ ├── data-preparation-resource.yml <- Agent resource config definition for data preparation and vectorization. +│ │ +│ ├── agent-resource-workflow-resource.yml <- Agent resource config definition for agent development and deployment. +│ │ +│ ├── agents-artifacts-resource.yml <- Agent resource config definition for model and experiment. +│ +├── .github <- Configuration folder for CI/CD using GitHub Actions. The CI/CD workflows deploy resources defined in the `./resources/*` folder with databricks CLI bundles. +│ +├── docs <- Contains documentation for the repo. +│ +├── cicd.tar.gz <- Contains CI/CD bundle that should be deployed by deploy-cicd.yml to set up CI/CD for projects. +``` + +## Using this repo + + +If you're a data scientist just getting started with this repo for a brand new Agent, we recommend adapting the provided example code to your Agent. Then making and testing the agent workflow code changes on Databricks or your local machine. Follow the instructions from the [project README](./my_agent_project/README.md). + +When you're ready to deploy production training/inference +pipelines, ask your ops team to follow the [setup guide](docs/agentops-setup.md) to configure CI/CD and deploy production pipelines. + +After that, follow the [pull request guide](docs/pull-request.md) + and [agent resource config guide](my_agent_project/resources/README.md) to propose, test, and deploy changes to production code (e.g. updating a prompt) or pipeline resources (e.g. using a different size for Model Serving) via pull request. + +| Role | Goal | Docs | +|-------------------------------|------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| Data Scientist | Get started writing Agent code for a brand new project | [project README](./my_agent_project/README.md) | +| Ops | Set up CI/CD for the current Agent | [AgentOps setup guide](docs/agentops-setup.md) | +| Data Scientist | Update production Agent for an existing project | [pull request guide](docs/pull-request.md) | +| Data Scientist/MLE | Modify production Agent resources, e.g. 
data preparation or agent development jobs | [Agent resource config guide](my_agent_project/resources/README.md) | +{{ end }} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl b/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl index 465f32a1..ae1f11d2 100644 --- a/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl +++ b/template/{{.input_root_dir}}/_params_testing_only.txt.tmpl @@ -1,5 +1,6 @@ input_root_dir={{.input_root_dir}} input_project_name={{.input_project_name}} +input_project_type={{.input_project_type}} input_cloud={{.input_cloud}} input_cicd_platform={{.input_cicd_platform}} input_databricks_staging_workspace_host={{.input_databricks_staging_workspace_host}} diff --git a/template/{{.input_root_dir}}/cicd.tar.gz b/template/{{.input_root_dir}}/cicd.tar.gz index affa3dc8..7d79a335 100644 Binary files a/template/{{.input_root_dir}}/cicd.tar.gz and b/template/{{.input_root_dir}}/cicd.tar.gz differ diff --git a/template/{{.input_root_dir}}/cicd/databricks_template_schema.json b/template/{{.input_root_dir}}/cicd/databricks_template_schema.json index 13b5ea58..2f457628 100644 --- a/template/{{.input_root_dir}}/cicd/databricks_template_schema.json +++ b/template/{{.input_root_dir}}/cicd/databricks_template_schema.json @@ -1,66 +1,72 @@ { "welcome_message": "Please use the provided deploy CICD workflow to initialize this bundle.", "properties": { - "input_root_dir": { + "input_project_type": { "order": 1, + "type": "string", + "description": "Project Type", + "default": "mlops" + }, + "input_root_dir": { + "order": 2, "description": "Root Directory", "type": "string" }, "input_cloud": { - "order": 2, + "order": 3, "description": "Cloud", "type": "string" }, "input_cicd_platform": { - "order": 3, + "order": 4, "description": "CICD Platform", "type": "string" }, "input_default_branch": { - "order": 4, + "order": 5, "description": "Default Branch", "type": "string" }, "input_release_branch": { - "order": 5, + "order": 6, "description": "Release Branch", "type": "string" }, "cloud_specific_node_type_id": { - "order": 6, + "order": 7, "description": "Cloud Specific Node Type ID", "type": "string" }, "input_cli_version": { - "order": 7, + "order": 8, "description": "CLI Version", "type": "string" }, "input_test_catalog_name": { - "order": 8, + "order": 9, "type": "string", "description": "\nName of the Test Unity Catalog", "default": "test" }, "input_staging_catalog_name": { - "order": 9, + "order": 10, "type": "string", "description": "\nName of the Staging Unity Catalog", "default": "staging" }, "input_prod_catalog_name": { - "order": 10, + "order": 11, "type": "string", "description": "\nName of the Prod Unity Catalog", "default": "prod" }, "input_project_name": { - "order": 11, + "order": 12, "description": "Project Name", "type": "string" }, "input_include_feature_store": { - "order": 12, + "order": 13, "description": "Use Feature Store (yes) or not (no)", "type": "string" } diff --git a/template/{{.input_root_dir}}/cicd_params.json.tmpl b/template/{{.input_root_dir}}/cicd_params.json.tmpl index eba55d04..8ee5a6bf 100644 --- a/template/{{.input_root_dir}}/cicd_params.json.tmpl +++ b/template/{{.input_root_dir}}/cicd_params.json.tmpl @@ -1,5 +1,6 @@ { "input_root_dir": "{{ .input_root_dir }}", + "input_project_type": "{{ .input_project_type }}", "input_cloud": "{{ .input_cloud }}", "input_cicd_platform": "{{ .input_cicd_platform }}", "input_default_branch": "{{ .input_default_branch }}", diff --git 
a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl index cdb7f98e..6d978a8a 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/README.md.tmpl @@ -97,7 +97,7 @@ contained in the following files: │ ├── training <- Folder for model development via MLflow recipes. │ │ │ │ │ ├── steps <- MLflow recipe steps (Python modules) implementing ML pipeline logic, e.g. model training and evaluation. Most -│ │ │ development work happens here. See https://mlflow.org/docs/latest/recipes.html for details +│ │ │ development work happens here. │ │ │ │ │ ├── notebooks <- Databricks notebook that runs the MLflow recipe, i.e. run the logic in `steps`. Used to │ │ │ drive code execution on Databricks for CI/CD. In most cases, you do not need to modify diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl new file mode 100644 index 00000000..425c2d1e --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/notebooks/DataIngestion.py.tmpl @@ -0,0 +1,123 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 + +# COMMAND ---------- + +# DBTITLE 1,Data Ingestion Pipeline - Overview +################################################################################### +# Data Ingestion Pipeline +# +# This pipeline is designed to process raw documentation data from a specified data source URL. +# The data is stored in a Unity Catalog within a specified database for later processing. +# +# Parameters: +# * catalog_name (required) - Name of the Unity Catalog containing the input data +# * schema (required) - Name of the schema inside the Unity Catalog +# * raw_data_table (required) - Name of the raw data table inside the database of the Unity Catalog +# * data_source_url (required) - URL of the data source +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Raw data table: Text widget to input the name of the raw data table inside the database of the Unity Catalog +# * Data Source URL: Text widget to input the URL of the data source +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to collect and store the raw documentation data. 
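For readers skimming the diff, here is a minimal, hypothetical sketch of how the ingestion notebook's parameters could be supplied when driving it from another notebook during development. The relative path, catalog, and schema values are placeholders; in the generated project the same parameters are normally passed as `base_parameters` by the data preparation job resource shown later in this diff.

```python
# Hypothetical example only: invoke the ingestion notebook with explicit arguments.
# Parameter names mirror the widgets the notebook defines; all values are placeholders.
params = {
    "catalog_name": "dev",
    "schema": "my_agent_project",
    "raw_data_table": "raw_documentation",
    "data_source_url": "https://docs.databricks.com/en/doc-sitemap.xml",
}

dbutils.notebook.run(
    "../data_preparation/data_ingestion/notebooks/DataIngestion",  # placeholder relative path
    timeout_seconds=600,
    arguments=params,
)
```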
+# +################################################################################## + +# COMMAND ---------- + +# DBTITLE 1,Install Prerequisites +# Install prerequisite packages +%pip install -r ../../../requirements.txt + +# COMMAND ---------- + +# DBTITLE 1,Widget creation +# List of input args needed to run this notebook as a job +# Provide them via DB widgets or notebook arguments in your DAB resources + +# A Unity Catalog containing the input data +dbutils.widgets.text( + "catalog_name", + "", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "", + label="Schema", +) +# Name of raw data table +dbutils.widgets.text( + "raw_data_table", + "raw_documentation", + label="Raw data table", +) + +# Data source url +dbutils.widgets.text( + "data_source_url", + "", + label="Data Source URL", +) + +# COMMAND ---------- + +# DBTITLE 1,Define input and output variables +catalog_name = dbutils.widgets.get("catalog_name") +schema = dbutils.widgets.get("schema") +raw_data_table = dbutils.widgets.get("raw_data_table") +data_source_url = dbutils.widgets.get("data_source_url") + +assert catalog_name != "", "catalog_name notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert raw_data_table != "", "raw_data_table notebook parameter must be specified" +assert data_source_url != "", "data_source_url notebook parameter must be specified" + +# COMMAND ---------- + +# DBTITLE 1,Set up path to import utilities +# Set up path to import utility and other helper functions +import sys +import os + +# Get notebook's directory using dbutils +notebook_path = '/Workspace/' + os.path.dirname( + dbutils.notebook.entry_point.getDbutils().notebook() + .getContext().notebookPath().get() +) +# Navigate up from notebooks/ to component level (data_ingestion/) +utils_dir = os.path.dirname(notebook_path) +sys.path.insert(0, utils_dir) + +# COMMAND ---------- + +# DBTITLE 1,Use the catalog and database specified in the notebook parameters +spark.sql(f"""CREATE SCHEMA IF NOT EXISTS `{catalog_name}`.`{schema}`""") + +spark.sql(f"""USE `{catalog_name}`.`{schema}`""") + +# COMMAND ---------- + +# DBTITLE 1,Download and store data to UC +from utils.fetch_data import fetch_data_from_url + +if not spark.catalog.tableExists(f"{raw_data_table}") or spark.table(f"{raw_data_table}").isEmpty(): + # Download the data to a DataFrame + doc_articles = fetch_data_from_url(spark, data_source_url) + + #Save them as to unity catalog + doc_articles.write.mode('overwrite').saveAsTable(f"{raw_data_table}") + + doc_articles.display() + + +# COMMAND ---------- + +dbutils.notebook.exit(0) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/utils/fetch_data.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/utils/fetch_data.py.tmpl new file mode 100644 index 00000000..fa9bad86 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_ingestion/utils/fetch_data.py.tmpl @@ -0,0 +1,85 @@ +""" +This sample module contains data ingestion logic for ingesting data from a URL and parsing the HTML content. +You should adapt the code based ont the HTML structure of your own data. The function returns a DataFrame with the parsed content. 
+""" + + +from pyspark.sql.types import StringType +from pyspark.sql.functions import col, udf, length, pandas_udf + + +from bs4 import BeautifulSoup +import xml.etree.ElementTree as ET +from concurrent.futures import ThreadPoolExecutor +import requests +from requests.adapters import HTTPAdapter +from urllib3.util.retry import Retry +import pandas as pd + + +retries = Retry( + total=3, + backoff_factor=3, + status_forcelist=[429], +) + + +def fetch_data_from_url(spark, data_source_url, max_documents=None): + # Fetch the XML content from sitemap + response = requests.get(data_source_url) + root = ET.fromstring(response.content) + + # Find all 'loc' elements (URLs) in the XML + urls = [loc.text for loc in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc")] + if max_documents: + urls = urls[:max_documents] + + # Create DataFrame from URLs + df_urls = spark.createDataFrame(urls, StringType()).toDF("url").repartition(10) + + # Pandas UDF to fetch HTML content for a batch of URLs + @pandas_udf("string") + def fetch_html_udf(urls: pd.Series) -> pd.Series: + adapter = HTTPAdapter(max_retries=retries) + http = requests.Session() + http.mount("http://", adapter) + http.mount("https://", adapter) + def fetch_html(url): + try: + response = http.get(url) + if response.status_code == 200: + return response.content + except requests.RequestException: + return None + return None + + with ThreadPoolExecutor(max_workers=200) as executor: + results = list(executor.map(fetch_html, urls)) + return pd.Series(results) + + # Pandas UDF to process HTML content and extract text + @pandas_udf("string") + def download_web_page_udf(html_contents: pd.Series) -> pd.Series: + def extract_text(html_content): + if html_content: + soup = BeautifulSoup(html_content, "html.parser") + article_div = soup.find("div", class_="theme-doc-markdown markdown") + if article_div: + return str(article_div).strip() + return None + + return html_contents.apply(extract_text) + + # Apply UDFs to DataFrame + df_with_html = df_urls.withColumn("html_content", fetch_html_udf("url")) + final_df = df_with_html.withColumn("text", download_web_page_udf("html_content")) + + # Select and filter non-null results + final_df = final_df.select("url", "text").filter("text IS NOT NULL") + if final_df.isEmpty(): + raise Exception("""Dataframe is empty, couldn't download Databricks documentation. + This is most likely caused by article_div = soup.find("div", class_="theme-doc-markdown markdown") in download_web_page_udf. + Please check the html of the documentation page you are trying to download and chance the filter accordingly. 
+ """) + + return final_df \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl new file mode 100644 index 00000000..ca0d0502 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/notebooks/DataPreprocessing.py.tmpl @@ -0,0 +1,197 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 + +# COMMAND ---------- + +################################################################################### +# Data Preprocessing Pipeline +# +# This notebook shows an example of a Data Preprocessing pipeline using Unity Catalog. +# It is configured and can be executed as the tasks in the PreprocessRawData workflow defined under +# ``{{template `project_name_alphanumeric_underscore` .}}/resources/data-preprocessing-workflow-resource.yml`` +# +# Parameters: +# * catalog_name (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * raw_data_table (required) - Name of the raw data table inside UC database +# * preprocessed_data_table (required) - Name of the preprocessed data table inside UC database +# * hf_tokenizer_model (optional) - Name of the HuggingFace tokenizer model name +# * max_chunk_size (optional) - Maximum chunk size +# * min_chunk_size (optional) - Minimum chunk size +# * chunk_overlap (optional) - Overlap between chunks +# +# Widgets: +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * schema: Text widget to input the name of the database inside the Unity Catalog +# * Raw data table: Text widget to input the name of the raw data table inside the database of the Unity Catalog +# * Preprocessed data table: Text widget to input the name of the preprocessed data table inside the database of the Unity Catalog +# * HuggingFace tokenizer model: Text widget to input the name of the hugging face tokenizer model to import +# * Maximum chunk size: Maximum characters chunks will be split into +# * Minimum chunk size: minimum characters chunks will be split into +# * Chunk overlap: Overlap between chunks +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to chunk the raw data and store in Unity Catalog. +# +################################################################################## + +# COMMAND ---------- + +# DBTITLE 1,Install Prerequisites +# Install prerequisite packages +%pip install -r ../../../requirements.txt + +# COMMAND ---------- + +# List of input args needed to run this notebook as a job. +# Provide them via DB widgets or notebook arguments. 
+ +# A Unity Catalog containing the input data +dbutils.widgets.text( + "catalog_name", + "", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "", + label="Schema", +) +# Name of input table +dbutils.widgets.text( + "raw_data_table", + "raw_documentation", + label="Raw data table", +) +# Name of output table +dbutils.widgets.text( + "preprocessed_data_table", + "databricks_documentation", + label="Preprocessed data table", +) +# Name of huggingface tokenizer model +dbutils.widgets.text( + "hf_tokenizer_model", + "openai-community/openai-gpt", + label="HuggingFace tokenizer model", +) +# Maximum chunk size +dbutils.widgets.text("max_chunk_size", "500", label="Maximum chunk size") +# Minimum chunk size +dbutils.widgets.text("min_chunk_size", "20", label="Minimum chunk size") +# Chunk overlap +dbutils.widgets.text("chunk_overlap", "50", label="Chunk overlap") + +# COMMAND ---------- + +# DBTITLE 1,Define input and output variables +catalog_name = dbutils.widgets.get("catalog_name") +schema = dbutils.widgets.get("schema") +raw_data_table = dbutils.widgets.get("raw_data_table") +preprocessed_data_table = dbutils.widgets.get("preprocessed_data_table") +hf_tokenizer_model = dbutils.widgets.get("hf_tokenizer_model") +max_chunk_size = int(dbutils.widgets.get("max_chunk_size")) +min_chunk_size = int(dbutils.widgets.get("min_chunk_size")) +chunk_overlap = int(dbutils.widgets.get("chunk_overlap")) + +assert catalog_name != "", "catalog_name notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert raw_data_table != "", "raw_data_table notebook parameter must be specified" +assert preprocessed_data_table != "", "preprocessed_data_table notebook parameter must be specified" +assert hf_tokenizer_model != "", "hf_tokenizer_model notebook parameter must be specified" +assert max_chunk_size != "", "max_chunk_size notebook parameter must be specified" +assert min_chunk_size != "", "min_chunk_size notebook parameter must be specified" +assert chunk_overlap != "", "chunk_overlap notebook parameter must be specified" + +# COMMAND ---------- + +# DBTITLE 1,Set up path to import utilities +# Set up path to import utility and other helper functions +import sys +import os + +# Get notebook's directory using dbutils +notebook_path = '/Workspace/' + os.path.dirname( + dbutils.notebook.entry_point.getDbutils().notebook() + .getContext().notebookPath().get() +) +# Navigate up from notebooks/ to component level (data_preprocessing/) +utils_dir = os.path.dirname(notebook_path) +sys.path.insert(0, utils_dir) + +# COMMAND ---------- + +# DBTITLE 1,Initialize tokenizer +# Download tokenizer model to UC volume +from transformers import AutoTokenizer + +volume_folder = f"/Volumes/{catalog_name}/{schema}/volume_databricks_documentation" + +spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema}.volume_databricks_documentation") + +# Initialize tokenizer once +tokenizer = AutoTokenizer.from_pretrained(hf_tokenizer_model, cache_dir=f'{volume_folder}/hg_cache') + + +# COMMAND ---------- + +# DBTITLE 1, Use the catalog and database specified in the notebook parameters +spark.sql(f"""USE `{catalog_name}`.`{schema}`""") + +# COMMAND ---------- + +# DBTITLE 1, Create output preprocessed data table +if not spark.catalog.tableExists(f"{preprocessed_data_table}") or spark.table(f"{preprocessed_data_table}").isEmpty(): + spark.sql(f""" + CREATE TABLE IF NOT EXISTS {preprocessed_data_table} ( + id BIGINT GENERATED ALWAYS AS IDENTITY, + url 
STRING, + content STRING + ) + TBLPROPERTIES ('delta.enableChangeDataFeed' = 'true') + """) + + +# COMMAND ---------- + +# DBTITLE 1,Create a user-defined function (UDF) to chunk all our documents with spark. +from functools import partial +import pandas as pd +from pyspark.sql.functions import pandas_udf +from utils.create_chunk import split_html_on_p + +@pandas_udf("array") +def parse_and_split( + docs: pd.Series +) -> pd.Series: + """Parse and split html content into chunks. + + :param docs: Input documents + :return: List of chunked text for each input document + """ + + return docs.apply(lambda html: split_html_on_p( + html, + tokenizer=tokenizer, + chunk_overlap=chunk_overlap, + min_chunk_size=min_chunk_size, + max_chunk_size=max_chunk_size + )) + +# COMMAND ---------- + +# DBTITLE 1,Perform data preprocessing. +from pyspark.sql import functions as F + +(spark.table(raw_data_table) + .filter('text is not null') + .withColumn('content', F.explode(parse_and_split('text'))) + .drop("text") + .write.mode('overwrite').saveAsTable(preprocessed_data_table)) + +# COMMAND ---------- + +dbutils.notebook.exit(0) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/config.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/config.py.tmpl new file mode 100644 index 00000000..0851169a --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/config.py.tmpl @@ -0,0 +1,14 @@ +""" +Configuration constants for data preprocessing. + +This module contains default values for chunking and tokenization +used in the data preprocessing pipeline. +""" + +# Chunking configuration +MIN_CHUNK_SIZE = 20 +MAX_CHUNK_SIZE = 500 +CHUNK_OVERLAP = 50 + +# Tokenizer configuration +HF_TOKENIZER_MODEL = "openai-community/openai-gpt" \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/create_chunk.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/create_chunk.py.tmpl new file mode 100644 index 00000000..c75e37f7 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/data_preprocessing/utils/create_chunk.py.tmpl @@ -0,0 +1,84 @@ +""" +This sample module contains data preprocessing logic to chunk HTML text. +You should plug in your own data chunking logic in the split_html_on_p method below. +""" + +from langchain.text_splitter import ( + HTMLHeaderTextSplitter, + RecursiveCharacterTextSplitter, +) +from lxml import etree + + +def get_splitters(tokenizer, max_chunk_size: int, chunk_overlap: int): + """Initialize splitters with the shared tokenizer. + + :param max_chunk_size: The maximum size of a chunk. + :param chunk_overlap: Target overlap between chunks. + Overlapping chunks helps to mitigate loss of information when context is divided between chunks. 
+ :return: A tuple of text splitter and html text splitter + """ + text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer, chunk_size=max_chunk_size, chunk_overlap=chunk_overlap + ) + html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=[("p", "paragraph")]) + return text_splitter, html_splitter + + +def split_html_on_p( + html: str, + tokenizer, + chunk_overlap: int = 50, + min_chunk_size: int = 20, + max_chunk_size: int = 500, +): + try: + """Parse and split HTML content into chunks. + + Split on

<p>, but merge small paragraph chunks together to avoid chunks that are too small. It uses HTMLHeaderTextSplitter to parse the HTML content and + RecursiveCharacterTextSplitter to split the text into chunks. + + TODO: Update and adapt the sample code for your use case + + :param html: HTML content + :param chunk_overlap: Target overlap between chunks. + Overlapping chunks helps to mitigate loss of information when context is divided between chunks. + :param min_chunk_size: The minimum size of a chunk. + :param max_chunk_size: The maximum size of a chunk. + :return: List of chunked text for input HTML content + """ + if not html: + return [] + + # Get splitters + text_splitter, html_splitter = get_splitters( + tokenizer, max_chunk_size, chunk_overlap + ) + + p_chunks = html_splitter.split_text(html) + chunks = [] + previous_chunk = "" + + # Merge chunks together to add text before <p>

and avoid too small docs. + for c in p_chunks: + # Concat the paragraph + content = c.page_content + if len(tokenizer.encode(previous_chunk + content)) <= max_chunk_size / 2: + previous_chunk += content + "\n" + else: + chunks.extend(text_splitter.split_text(previous_chunk.strip())) + previous_chunk = content + "\n" + + if previous_chunk: + chunks.extend(text_splitter.split_text(previous_chunk.strip())) + + # Discard chunks smaller than min_chunk_size + return [c for c in chunks if len(tokenizer.encode(c)) > min_chunk_size] + + except etree.XSLTApplyError as e: + print(f"XSLTApplyError: {e}") + return None + except Exception as e: + print(f"An error occurred: {e}") + return None \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl new file mode 100644 index 00000000..4ef3434f --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/README.md.tmpl @@ -0,0 +1,9 @@ +# Vector Search + +To enable vector search as part of a scheduled Databricks workflow, please: +- Update all the TODOs in the [vector search resource file](../resources/vector-search-resource.yml). +- Uncomment the vector search workflow from the main Databricks Asset Bundles file [databricks.yml](../databricks.yml). + +For more details, refer to [{{template `project_name_alphanumeric_underscore` .}}/resources/README.md](../resources/README.md). + +This workflow supports the building of a vector index given a source table. \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl new file mode 100644 index 00000000..e800e80b --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/notebooks/VectorSearch.py.tmpl @@ -0,0 +1,154 @@ +# Databricks notebook source +# MAGIC %load_ext autoreload +# MAGIC %autoreload 2 + +# COMMAND ---------- + +# DBTITLE 1,Vector Search Pipeline - Overview +################################################################################### +# Vector Search +# +# This notebook creates a Vector Search index from a table containing chunked documents. +# +# Parameters: +# * catalog_name (required) - Name of the Unity Catalog +# * schema (required) - Name of the schema inside Unity Catalog +# * preprocessed_data_table (required) - Name of the preprocessed data table inside database of Unity Catalog +# * vector_search_endpoint (required) - Name of the Vector Search endpoint +# +# Widgets: +# * Vector Search endpoint: Text widget to input the name of the Vector Search endpoint +# * Unity Catalog: Text widget to input the name of the Unity Catalog +# * Schema: Text widget to input the name of the database inside the Unity Catalog +# * Preprocessed data table: Text widget to input the name of the preprocessed data table inside the database of Unity Catalog +# +# Usage: +# 1. Set the appropriate values for the widgets. +# 2. Run the pipeline to set up the vector search endpoint. +# 3. Create index. 
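As a complement to the index-creation steps above, here is a small, hypothetical sketch of querying the resulting Delta Sync index once it is online. The endpoint name matches the `ai_agent_endpoint` default used in this notebook; the fully qualified index name and the query text are placeholders.

```python
# Hypothetical example only: query the index created by the VectorSearch notebook.
from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient(disable_notice=True)
index = vsc.get_index(
    endpoint_name="ai_agent_endpoint",
    index_name="dev.my_agent_project.databricks_documentation_vs_index",  # placeholder
)

# Managed embeddings (embedding_source_column) allow plain-text queries.
results = index.similarity_search(
    query_text="How do I create a Unity Catalog volume?",  # placeholder question
    columns=["url", "content"],
    num_results=3,
)
print(results["result"]["data_array"])
```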
+# +################################################################################## + +# COMMAND ---------- + +# DBTITLE 1,Install Prerequisites +# Install prerequisite packages +%pip install -r ../../../requirements.txt + +# COMMAND ---------- + +# DBTITLE 1,Widget Creation +# List of input args needed to run this notebook as a job. +# Provide them via DB widgets or notebook arguments in your DAB resources. + +# A Unity Catalog location containing the input data +dbutils.widgets.text( + "catalog_name", + "", + label="Unity Catalog", +) +# Name of schema +dbutils.widgets.text( + "schema", + "", + label="Schema", +) +# Name of preprocessed data table +dbutils.widgets.text( + "preprocessed_data_table", + "databricks_documentation", + label="Preprocessed data table", +) +# A Vector Search Endpoint for retrieving processed data +dbutils.widgets.text( + "vector_search_endpoint", + "ai_agent_endpoint", + label="Vector Search endpoint", +) + + +# COMMAND ---------- + +# DBTITLE 1,Define variables +vector_search_endpoint = dbutils.widgets.get("vector_search_endpoint") +catalog_name = dbutils.widgets.get("catalog_name") +schema = dbutils.widgets.get("schema") +preprocessed_data_table = dbutils.widgets.get("preprocessed_data_table") + +assert vector_search_endpoint != "", "vector_search_endpoint notebook parameter must be specified" +assert catalog_name != "", "catalog_name notebook parameter must be specified" +assert schema != "", "schema notebook parameter must be specified" +assert preprocessed_data_table != "", "preprocessed_data_table notebook parameter must be specified" + +# COMMAND ---------- + +# DBTITLE 1,Set up path to import utility functions +import sys +import os + +# Get notebook's directory using dbutils +notebook_path = '/Workspace/' + os.path.dirname( + dbutils.notebook.entry_point.getDbutils().notebook() + .getContext().notebookPath().get() +) +# Navigate up from notebooks/ to component level +utils_dir = os.path.dirname(notebook_path) +sys.path.insert(0, utils_dir) + +# COMMAND ---------- + +# DBTITLE 1,Initialize endpoint +from databricks.vector_search.client import VectorSearchClient +from utils.vector_search_utils import vs_endpoint_exists, wait_for_vs_endpoint_to_be_ready + +vsc = VectorSearchClient(disable_notice=True) + +if not vs_endpoint_exists(vsc, vector_search_endpoint): + vsc.create_endpoint(name=vector_search_endpoint, endpoint_type="STANDARD") + +# this may throw an error on the first pass, once the endpoint is created we'd see correct messages +wait_for_vs_endpoint_to_be_ready(vsc, vector_search_endpoint) +print(f"Endpoint named {vector_search_endpoint} is ready.") + +# COMMAND ---------- + +# DBTITLE 1,Create Index +from utils.vector_search_utils import index_exists, wait_for_index_to_be_ready +from databricks.sdk import WorkspaceClient +import databricks.sdk.service.catalog as c + +# The table we'd like to index +source_table_fullname = f"{catalog_name}.{schema}.{preprocessed_data_table}" + +# Where we want to store our index +vs_index_fullname = f"{catalog_name}.{schema}.{preprocessed_data_table}_vs_index" + +if not index_exists(vsc, vector_search_endpoint, vs_index_fullname): + print(f"Creating index {vs_index_fullname} on endpoint {vector_search_endpoint}...") + vsc.create_delta_sync_index( + endpoint_name=vector_search_endpoint, + index_name=vs_index_fullname, + source_table_name=source_table_fullname, + pipeline_type="TRIGGERED", + primary_key="id", + embedding_source_column="content", # The column containing our text + 
embedding_model_endpoint_name="databricks-gte-large-en" # The embedding endpoint used to create the embeddings + ) + #Let's wait for the index to be ready and all our embeddings to be created and indexed + vsc.get_index(vector_search_endpoint, vs_index_fullname).wait_until_ready() +else: + #Trigger a sync to update our vs content with the new data saved in the table + vsc.get_index(vector_search_endpoint, vs_index_fullname).sync() + +print(f"Index {vs_index_fullname} on table {source_table_fullname} is ready") + +# COMMAND ---------- + +# DBTITLE 1,Test if Index Online +import databricks +import time +from utils.vector_search_utils import check_index_online + +vector_index=vsc.get_index(endpoint_name=vector_search_endpoint, index_name=vs_index_fullname) + +check_index_online(vs_index_fullname, vector_index) \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/__init__.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/__init__.py.tmpl new file mode 100644 index 00000000..1c6646f3 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/__init__.py.tmpl @@ -0,0 +1,17 @@ +"""Vector Search utilities for data preparation.""" + +from .vector_search_utils import ( + vs_endpoint_exists, + wait_for_vs_endpoint_to_be_ready, + index_exists, + wait_for_index_to_be_ready, + check_index_online, +) + +__all__ = [ + "vs_endpoint_exists", + "wait_for_vs_endpoint_to_be_ready", + "index_exists", + "wait_for_index_to_be_ready", + "check_index_online", +] \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/vector_search_utils.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/vector_search_utils.py.tmpl new file mode 100644 index 00000000..73459b74 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/data_preparation/vector_search/utils/vector_search_utils.py.tmpl @@ -0,0 +1,65 @@ +import databricks +import time + +def vs_endpoint_exists(vsc, endpoint_name): + try: + vsc.get_endpoint(endpoint_name) + return True + except Exception as e: + if 'Not Found' in str(e): + print(f'Unexpected error describing the endpoint. Try deleting it? vsc.delete_endpoint({endpoint_name}) and rerun the previous cell') + raise e + return False + +def wait_for_vs_endpoint_to_be_ready(vsc, vs_endpoint_name): + for i in range(180): + endpoint = vsc.get_endpoint(vs_endpoint_name) + status = endpoint.get("endpoint_status", endpoint.get("status"))["state"].upper() + if "ONLINE" in status: + return endpoint + elif "PROVISIONING" in status or i <6: + if i % 20 == 0: + print(f"Waiting for endpoint to be ready, this can take a few min... {endpoint}") + time.sleep(10) + else: + raise Exception(f'''Error with the endpoint {vs_endpoint_name}. 
- this shouldn't happen: {endpoint}.\n Please delete it and re-run the previous cell: vsc.delete_endpoint("{vs_endpoint_name}")''') + raise Exception(f"Timeout, your endpoint isn't ready yet: {vsc.get_endpoint(vs_endpoint_name)}") + + +def index_exists(vsc, endpoint_name, index_full_name): + try: + vsc.get_index(endpoint_name, index_full_name).describe() + return True + except Exception as e: + if 'RESOURCE_DOES_NOT_EXIST' not in str(e): + print(f'Unexpected error describing the index. This could be a permission issue. Try deleting it? vsc.delete_index({index_full_name})') + raise e + return False + +def wait_for_index_to_be_ready(vsc, vs_endpoint_name, index_name): + for i in range(180): + idx = vsc.get_index(vs_endpoint_name, index_name).describe() + index_status = idx.get('status', idx.get('index_status', {})) + status = index_status.get('status', 'UNKNOWN').upper() + url = index_status.get('index_url', index_status.get('url', 'UNKNOWN')) + if "ONLINE" in status: + return idx + if "UNKNOWN" in status: + print(f"Can't get the status - will assume index is ready {idx} - url: {url}") + return idx + elif "PROVISIONING" in status: + if i % 20 == 0: print(f"Waiting for index to be ready, this can take a few min... {index_status} - pipeline url: {url}") + time.sleep(10) + else: + raise Exception(f'''Error with the index - this shouldn't happen. DLT pipeline might have been killed.\n Please delete it and re-run the previous cell: vsc.delete_index("{index_name}", "{vs_endpoint_name}") \nIndex details: {idx}''') + raise Exception(f"Timeout, your index isn't ready yet: {vsc.get_index(vs_endpoint_name, index_name)}") + +def check_index_online(vs_index_fullname: str, vector_index: databricks.vector_search.index.VectorSearchIndex): + for i in range(180): + status = vector_index.describe()['status']['detailed_state'] + if (status != "ONLINE" and status != "ONLINE_NO_PENDING_UPDATE"): + print(f"Syncing {vs_index_fullname}") + time.sleep(10) + else: + print(f"{vs_index_fullname} is now synced") + return \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl index de207874..31d2501e 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/databricks.yml.tmpl @@ -2,10 +2,9 @@ bundle: # Do not modify the below line, this autogenerated field is used by the Databricks backend. uuid: {{ bundle_uuid }} - name: {{ .input_project_name }} -variables: +{{ if (eq .input_project_type `mlops`) }}variables: experiment_name: description: Experiment name for the model training. default: /Users/${workspace.current_user.userName}/${bundle.target}-{{template `experiment_base_name` .}} @@ -67,4 +66,67 @@ targets: {{- end}} workspace: host: {{template `databricks_staging_workspace_host` .}} +{{ end }} +{{ else if (eq .input_project_type `agentops`) }}variables: + catalog_name: + default: main + schema: + default: default + # Data tables + raw_data_table: + description: "Table in Unity Catalog to store raw data." + default: raw_documentation + preprocessed_data_table: + description: "Table in Unity Catalog to store preprocessed data." + default: databricks_documentation + eval_table: + description: "Table in Unity Catalog to store evaluation data." 
+ default: "databricks_documentation_eval" + + # Vector search infrastructure + vector_search_endpoint: + description: "Vector Search endpoint to create." + default: ai_agent_endpoint + vector_search_index: + description: "Vector Search index to populate." + default: databricks_documentation_vs_index + +include: + # Resources folder contains Agent artifact resources for the Agent project that defines the agent + # and workflows resources for data preparation + - ./resources/data-preparation-resource.yml + +# Deployment Target specific values for workspace +targets: + dev: + default: true + workspace: + # TODO: add dev workspace URL + host: + variables: + catalog_name: dev + schema: {{ .input_schema_name }} + + staging: + workspace: + host: {{template `databricks_staging_workspace_host` .}} + + variables: + catalog_name: {{ .input_staging_catalog_name }} + schema: {{ .input_schema_name }} + + prod: + workspace: + host: {{template `databricks_prod_workspace_host` .}} + variables: + catalog_name: {{ .input_prod_catalog_name }} + schema: {{ .input_schema_name }} + + test: + workspace: + host: {{template `databricks_staging_workspace_host` .}} + + variables: + catalog_name: {{ .input_test_catalog_name }} + schema: {{ .input_schema_name }} {{ end }} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements.txt.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements.txt.tmpl index 1f6f9926..dddda769 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements.txt.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/requirements.txt.tmpl @@ -1,4 +1,4 @@ -mlflow==2.11.3 +{{ if (eq .input_project_type `mlops`) }}mlflow==2.11.3 numpy>=1.23.0 pandas==1.5.3 scikit-learn>=1.1.1 @@ -8,3 +8,17 @@ Jinja2==3.0.3 pyspark~=3.3.0 pytz~=2022.2.1 pytest>=7.1.2 +{{ else if (eq .input_project_type `agentops`) }}mlflow==3.1.1 +numpy>=1.23.0 +scikit-learn>=1.1.1 +matplotlib>=3.5.2 +pillow>=10.0.1 +Jinja2==3.0.3 +pytz~=2022.2.1 +databricks-sdk==0.58.0 +beautifulsoup4==4.13.3 +transformers==4.41.1 +langchain==0.2.1 +databricks-vectorsearch==0.56 +lxml==5.4.0 +{{ end }} \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl new file mode 100644 index 00000000..6d14e302 --- /dev/null +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/resources/data-preparation-resource.yml.tmpl @@ -0,0 +1,72 @@ +common_permissions: &permissions + permissions: + - level: CAN_VIEW + group_name: users + + +resources: + jobs: + data_preprocessing_job: + name: ${bundle.target}-{{ .input_project_name }}-data-preprocessing-job + tasks: + - task_key: RawDataIngest + notebook_task: + notebook_path: ../data_preparation/data_ingestion/notebooks/DataIngestion.py + base_parameters: + # TODO modify these arguments to reflect your setup. + catalog_name: ${var.catalog_name} + schema: ${var.schema} + raw_data_table: ${var.raw_data_table} + data_source_url: "{{ .input_doc_link }}" + # git source information of current ML resource deployment. 
It will be persisted as part of the workflow run + git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} + environment_key: vs_requirements + + - task_key: PreprocessRawData + depends_on: + - task_key: RawDataIngest + notebook_task: + notebook_path: ../data_preparation/data_preprocessing/notebooks/DataPreprocessing.py + base_parameters: + # TODO modify these arguments to reflect your setup. + catalog_name: ${var.catalog_name} + schema: ${var.schema} + raw_data_table: ${var.raw_data_table} + preprocessed_data_table_name: ${var.preprocessed_data_table} + # git source information of current ML resource deployment. It will be persisted as part of the workflow run + git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} + environment_key: vs_requirements + + - task_key: VectorSearchIndex + depends_on: + - task_key: PreprocessRawData + notebook_task: + notebook_path: ../data_preparation/vector_search/notebooks/VectorSearch.py + base_parameters: + # TODO modify these arguments to reflect your setup. + catalog_name: ${var.catalog_name} + schema: ${var.schema} + preprocessed_data_table: ${var.preprocessed_data_table} + vector_search_endpoint: ${var.vector_search_endpoint} + # git source information of current ML resource deployment. It will be persisted as part of the workflow run + git_source_info: url:${bundle.git.origin_url}; branch:${bundle.git.branch}; commit:${bundle.git.commit} + environment_key: vs_requirements + + environments: + - environment_key: vs_requirements + spec: + environment_version: "3" + dependencies: + - "-r /Workspace/Users/${workspace.current_user.userName}/.bundle/${bundle.name}/${bundle.target}/files/requirements.txt" + + schedule: + quartz_cron_expression: "0 0 5 * * ?" # daily at 5am + timezone_id: UTC + <<: *permissions + # If you want to turn on notifications for this job, please uncomment the below code, + # and provide a list of emails to the on_failure argument. + # + # email_notifications: + # on_failure: + # - first@company.com + # - second@company.com \ No newline at end of file diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl index efc2e449..c5eaa23a 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/README.md.tmpl @@ -9,7 +9,7 @@ ## Initial setup This folder contains example ML code to train a regression model to predict NYC taxi fares using -[MLflow recipes](https://mlflow.org/docs/latest/recipes.html). +[MLflow recipes](https://www.mlflow.org/docs/2.5.0/recipes.html). **Note**: MLflow Recipes currently supports regression and classification problems. Usage of MLflow Recipes is encouraged but not required: you can still use the provided CI/CD and ML resource configs to build production ML pipelines, as long as you provide ML notebooks under `notebooks` @@ -32,8 +32,8 @@ locally or on Databricks. For details on the meaning of recipe configurations, see the comments in [this example recipe.yaml](https://github.com/mlflow/recipes-regression-template/blob/main/recipe.yaml). 
The purpose and behavior of the individual recipe steps (`ingest`, `train`, etc) being configured are also described in detail in -the [Recipe overview](https://mlflow.org/docs/latest/recipes.html) -and [API documentation](https://mlflow.org/docs/latest/python_api/mlflow.recipes.html). +the [Recipe overview](https://www.mlflow.org/docs/2.5.0/recipes.html) +and [API documentation](https://www.mlflow.org/docs/2.5.0/python_api/mlflow.recipes.html). After configuring your recipe, you can iterate on and test ML code under ``{{template `project_name_alphanumeric_underscore` .}}/training/steps``. We expect most development to take place in the abovementioned YAML config files and @@ -89,7 +89,7 @@ You can also iterate on ML code locally. #### Trigger model training Run `mlp run --profile local` to trigger training locally. See the -[MLflow recipes CLI docs](https://mlflow.org/docs/latest/recipes.html#key-concepts) for details. +[MLflow recipes CLI docs](https://www.mlflow.org/docs/2.5.0/recipes.html#key-concepts) for details. #### Inspect results in the UI To facilitate saving and sharing results from local iteration with collaborators, we recommend configuring your diff --git a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl index c06cc6c4..e54bf99a 100644 --- a/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl +++ b/template/{{.input_root_dir}}/{{template `project_name_alphanumeric_underscore` .}}/training/notebooks/TrainWithMLflowRecipes.py.tmpl @@ -32,8 +32,8 @@ # dbutils.jobs.taskValues(...), for use by downstream notebooks. 
# # For details on MLflow Recipes and the individual split, transform, train etc steps below, including usage examples, -# see the Regression Recipe overview documentation: https://mlflow.org/docs/latest/recipes.html -# and Regression Recipes API documentation: https://mlflow.org/docs/latest/python_api/mlflow.recipes.html +# see the Regression Recipe overview documentation: https://www.mlflow.org/docs/2.5.0/recipes.html +# and Regression Recipes API documentation: https://www.mlflow.org/docs/2.5.0/python_api/mlflow.recipes.html ################################################################################## # COMMAND ---------- diff --git a/tests/example-project-configs/aws/aws-github.json b/tests/example-project-configs/aws/aws-github.json index 30b6e596..918d5a6b 100644 --- a/tests/example-project-configs/aws/aws-github.json +++ b/tests/example-project-configs/aws/aws-github.json @@ -1,4 +1,5 @@ { + "input_project_type": "mlops", "input_setup_cicd_and_project": "CICD_and_Project", "input_root_dir": "example-aws-github-project", "input_project_name": "example-aws-github-project", diff --git a/tests/example-project-configs/azure/azure-devops.json b/tests/example-project-configs/azure/azure-devops.json index c38784b6..aa1b8477 100644 --- a/tests/example-project-configs/azure/azure-devops.json +++ b/tests/example-project-configs/azure/azure-devops.json @@ -1,4 +1,5 @@ { + "input_project_type": "mlops", "input_setup_cicd_and_project": "CICD_and_Project", "input_root_dir": "test-azure-devops-project", "input_project_name": "test-azure-devops-project", diff --git a/tests/example-project-configs/azure/azure-github.json b/tests/example-project-configs/azure/azure-github.json index 1e405b3a..0741ee24 100644 --- a/tests/example-project-configs/azure/azure-github.json +++ b/tests/example-project-configs/azure/azure-github.json @@ -1,4 +1,5 @@ { + "input_project_type": "mlops", "input_setup_cicd_and_project": "CICD_and_Project", "input_root_dir": "example-azure-github-project", "input_project_name": "example-azure-github-project", diff --git a/tests/example-project-configs/azure/azure-gitlab.json b/tests/example-project-configs/azure/azure-gitlab.json index 547ca50e..99303a2c 100644 --- a/tests/example-project-configs/azure/azure-gitlab.json +++ b/tests/example-project-configs/azure/azure-gitlab.json @@ -1,4 +1,5 @@ { + "input_project_type": "mlops", "input_setup_cicd_and_project": "CICD_and_Project", "input_root_dir": "example-azure-gitlab-project", "input_project_name": "example-azure-gitlab-project", diff --git a/tests/example-project-configs/gcp/gcp-github.json b/tests/example-project-configs/gcp/gcp-github.json index 45e39adb..6888dddf 100644 --- a/tests/example-project-configs/gcp/gcp-github.json +++ b/tests/example-project-configs/gcp/gcp-github.json @@ -1,4 +1,5 @@ { + "input_project_type": "mlops", "input_root_dir": "example-gcp-github-project", "input_project_name": "example-gcp-github-project", "input_cloud": "gcp", diff --git a/tests/install.sh b/tests/install.sh index c91e5088..e163040a 100755 --- a/tests/install.sh +++ b/tests/install.sh @@ -4,7 +4,7 @@ # Usage in the wild uses the "curl | sh" approach and we need that to continue working. set -e -VERSION="0.236.0" +VERSION="0.266.0" FILE="databricks_cli_$VERSION" # Include operating system in file name. 
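The example-project configs above only exercise the existing `mlops` flow. As an illustration of how the new `input_project_type` parameter could drive an `agentops` project, here is a minimal sketch of an equivalent config, written as a Python dict. The exact set of parameters an `agentops` project requires, the output file name, and every value shown are assumptions and placeholders, not configs shipped in this patch.

```python
import json

# Hypothetical agentops example-project config, mirroring the mlops configs under
# tests/example-project-configs/. The required key set and the output path are
# assumptions; all values are placeholders.
agentops_example_config = {
    "input_project_type": "agentops",
    "input_setup_cicd_and_project": "CICD_and_Project",
    "input_root_dir": "example-aws-github-agentops-project",
    "input_project_name": "example-aws-github-agentops-project",
    "input_cloud": "aws",
    "input_doc_link": "https://example.com/doc-sitemap.xml",  # documentation URL to ingest (placeholder)
    "input_schema_name": "default",
    "input_staging_catalog_name": "staging",
    "input_prod_catalog_name": "prod",
    "input_test_catalog_name": "test",
}

if __name__ == "__main__":
    # Write the config alongside the existing examples (illustrative path only).
    with open("tests/example-project-configs/aws/aws-github-agentops.json", "w") as f:
        json.dump(agentops_example_config, f, indent=2)
```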
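The data-preparation pieces added earlier in this patch (the VectorSearch notebook and `vector_search_utils.py`) reduce to: create the endpoint if it is missing, wait for it, create or re-sync a Delta Sync index, then poll until the index is online. Below is a condensed sketch of that flow, assuming it runs from the `vector_search` component directory so that `utils.vector_search_utils` is importable (the notebook arranges this via `sys.path`); the endpoint, catalog, schema, and table names are placeholders.

```python
from databricks.vector_search.client import VectorSearchClient
from utils.vector_search_utils import (
    vs_endpoint_exists,
    wait_for_vs_endpoint_to_be_ready,
    index_exists,
    check_index_online,
)

vsc = VectorSearchClient(disable_notice=True)

endpoint = "ai_agent_endpoint"                            # placeholder endpoint name
source_table = "main.default.databricks_documentation"    # placeholder source table
index_name = f"{source_table}_vs_index"

# Create the endpoint only if it does not exist yet, then block until it is ONLINE.
if not vs_endpoint_exists(vsc, endpoint):
    vsc.create_endpoint(name=endpoint, endpoint_type="STANDARD")
wait_for_vs_endpoint_to_be_ready(vsc, endpoint)

# Create a triggered Delta Sync index on the first run; otherwise just re-sync it.
if not index_exists(vsc, endpoint, index_name):
    vsc.create_delta_sync_index(
        endpoint_name=endpoint,
        index_name=index_name,
        source_table_name=source_table,
        pipeline_type="TRIGGERED",
        primary_key="id",
        embedding_source_column="content",
        embedding_model_endpoint_name="databricks-gte-large-en",
    )
    vsc.get_index(endpoint, index_name).wait_until_ready()
else:
    vsc.get_index(endpoint, index_name).sync()

# Poll until the index reports an ONLINE state.
check_index_online(index_name, vsc.get_index(endpoint, index_name))
```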
diff --git a/tests/test_create_project.py b/tests/test_create_project.py index deb981dd..3eb75118 100644 --- a/tests/test_create_project.py +++ b/tests/test_create_project.py @@ -14,12 +14,14 @@ ) from unittest import mock +DEFAULT_PROJECT_TYPE = "mlops" DEFAULT_PROJECT_NAME = "my-mlops-project" DEFAULT_PROJECT_DIRECTORY = "my_mlops_project" # UUID that when set as project name, prevents the removal of files needed in testing TEST_PROJECT_NAME = "27896cf3-bb3e-476e-8129-96df0406d5c7" TEST_PROJECT_DIRECTORY = "27896cf3_bb3e_476e_8129_96df0406d5c7" DEFAULT_PARAM_VALUES = { + "input_project_type": DEFAULT_PROJECT_TYPE, "input_default_branch": "main", "input_release_branch": "release", "input_read_user_group": "users", @@ -191,6 +193,7 @@ def test_generate_project_with_default_values( # Skip test for GCP with Unity Catalog return context = { + "input_project_type": DEFAULT_PROJECT_TYPE, "input_project_name": TEST_PROJECT_NAME, "input_root_dir": TEST_PROJECT_NAME, "input_cloud": cloud, @@ -211,6 +214,7 @@ def test_generate_project_with_default_values( params = {**DEFAULT_PARAM_VALUES, **DEFAULT_PARAMS_GCP} for param, value in params.items(): assert f"{param}={value}" in test_file_contents + assert f"input_project_type={DEFAULT_PROJECT_TYPE}" in test_file_contents def prepareContext( @@ -222,6 +226,7 @@ def prepareContext( include_models_in_unity_catalog, ): context = { + "input_project_type": DEFAULT_PROJECT_TYPE, "input_setup_cicd_and_project": setup_cicd_and_project, "input_project_name": TEST_PROJECT_NAME, "input_root_dir": TEST_PROJECT_NAME, @@ -383,6 +388,7 @@ def test_workspace_dir_strip_query_params( }[cloud] workspace_url = f"{workspace_host}{workspace_url_suffix}" context = { + "input_project_type": DEFAULT_PROJECT_TYPE, "input_project_name": TEST_PROJECT_NAME, "input_root_dir": TEST_PROJECT_NAME, "input_databricks_staging_workspace_host": workspace_url, diff --git a/tests/utils.py b/tests/utils.py index 42eaa8ac..18127d28 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -8,6 +8,7 @@ RESOURCE_TEMPLATE_ROOT_DIRECTORY = str(pathlib.Path(__file__).parent.parent) AZURE_DEFAULT_PARAMS = { + "input_project_type": "mlops", "input_setup_cicd_and_project": "CICD_and_Project", "input_root_dir": "my-mlops-project", "input_project_name": "my-mlops-project", @@ -95,6 +96,7 @@ def generated_project_dir( include_models_in_unity_catalog, ): params = { + "input_project_type": "mlops", "input_setup_cicd_and_project": setup_cicd_and_project, "input_root_dir": "my-mlops-project", "input_cloud": cloud,