From 6a7d31fb27bd346ed54a7ddb0a2e64e2abc93384 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 20 Oct 2024 08:37:23 +0200 Subject: [PATCH 01/27] Add proof of concept for catalog/schema presets --- bundle/config/mutator/apply_presets.go | 95 ++++++++++++++++--- bundle/config/presets.go | 6 ++ bundle/phases/initialize.go | 1 + .../{{.project_name}}/databricks.yml.tmpl | 6 ++ .../resources/{{.project_name}}.job.yml.tmpl | 14 ++- .../{{.project_name}}/src/notebook.ipynb.tmpl | 15 +++ .../src/{{.project_name}}/main.py.tmpl | 10 +- 7 files changed, 130 insertions(+), 17 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 1fd49206fd..8e3a0baf37 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -2,6 +2,7 @@ package mutator import ( "context" + "fmt" "path" "slices" "sort" @@ -9,6 +10,7 @@ import ( "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/resources" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" "github.com/databricks/cli/libs/textutil" @@ -37,6 +39,9 @@ func (m *applyPresets) Name() string { func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { var diags diag.Diagnostics + if d := validateCatalogAndSchema(b); d != nil { + return d // fast fail since code below would fail + } if d := validatePauseStatus(b); d != nil { diags = diags.Extend(d) } @@ -46,7 +51,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos prefix := t.NamePrefix tags := toTagArray(t.Tags) - // Jobs presets: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus + // Jobs presets: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus, Catalog, Schema for key, j := range r.Jobs { if j.JobSettings == nil { diags = diags.Extend(diag.Errorf("job %s is not defined", key)) @@ -80,9 +85,12 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos j.Trigger.PauseStatus = paused } } + if t.Catalog != "" || t.Schema != "" { + diags = diags.Extend(validateJobUsesCatalogAndSchema(b, key, j)) + } } - // Pipelines presets: Prefix, PipelinesDevelopment + // Pipelines presets: Prefix, PipelinesDevelopment, Catalog, Schema for key, p := range r.Pipelines { if p.PipelineSpec == nil { diags = diags.Extend(diag.Errorf("pipeline %s is not defined", key)) @@ -95,7 +103,13 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.TriggerPauseStatus == config.Paused { p.Continuous = false } - // As of 2024-06, pipelines don't yet support tags + if t.Catalog != "" && p.Catalog == "" { + p.Catalog = t.Catalog + } + if t.Schema != "" && p.Target == "" { + p.Target = t.Schema + } + // As of 2024-10, pipelines don't yet support tags } // Models presets: Prefix, Tags @@ -155,18 +169,23 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos // As of 2024-06, model serving endpoints don't yet support tags } - // Registered models presets: Prefix + // Registered models presets: Prefix, Catalog, Schema for key, m := range r.RegisteredModels { if m.CreateRegisteredModelRequest == nil { diags = diags.Extend(diag.Errorf("registered model %s is not defined", key)) continue } m.Name = normalizePrefix(prefix) + m.Name - + if t.Catalog != "" && m.CatalogName == "" { + m.CatalogName = t.Catalog + } + if t.Schema != "" && m.SchemaName == "" { + m.SchemaName = t.Schema + } // As of 2024-06, registered 
models don't yet support tags } - // Quality monitors presets: Schedule + // Quality monitors presets: Schedule, Catalog, Schema if t.TriggerPauseStatus == config.Paused { for key, q := range r.QualityMonitors { if q.CreateMonitor == nil { @@ -179,16 +198,30 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if q.Schedule != nil && q.Schedule.PauseStatus != catalog.MonitorCronSchedulePauseStatusUnpaused { q.Schedule = nil } + if t.Catalog != "" && t.Schema != "" { + parts := strings.Split(q.TableName, ".") + if len(parts) != 3 { + q.TableName = fmt.Sprintf("%s.%s.%s", t.Catalog, t.Schema, q.TableName) + } + } } } - // Schemas: Prefix + // Schemas: Prefix, Catalog, Schema for key, s := range r.Schemas { if s.CreateSchema == nil { diags = diags.Extend(diag.Errorf("schema %s is not defined", key)) continue } s.Name = normalizePrefix(prefix) + s.Name + if t.Catalog != "" && s.CatalogName == "" { + s.CatalogName = t.Catalog + } + if t.Schema != "" && s.Name == "" { + // If there is a schema preset such as 'dev', we directly + // use that name and don't add any prefix (which might result in dev_dev). + s.Name = t.Schema + } // HTTP API for schemas doesn't yet support tags. It's only supported in // the Databricks UI and via the SQL API. } @@ -204,10 +237,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos c.CustomTags = make(map[string]string) } for _, tag := range tags { - normalisedKey := b.Tagging.NormalizeKey(tag.Key) - normalisedValue := b.Tagging.NormalizeValue(tag.Value) - if _, ok := c.CustomTags[normalisedKey]; !ok { - c.CustomTags[normalisedKey] = normalisedValue + normalizedKey := b.Tagging.NormalizeKey(tag.Key) + normalizedValue := b.Tagging.NormalizeValue(tag.Value) + if _, ok := c.CustomTags[normalizedKey]; !ok { + c.CustomTags[normalizedKey] = normalizedValue } } } @@ -227,6 +260,46 @@ func validatePauseStatus(b *bundle.Bundle) diag.Diagnostics { }} } +func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { + p := b.Config.Presets + if (p.Catalog != "" && p.Schema == "") || (p.Catalog == "" && p.Schema != "") { + return diag.Diagnostics{{ + Summary: "presets.catalog and presets.schema must always be set together", + Severity: diag.Error, + Locations: []dyn.Location{b.Config.GetLocation("presets")}, + }} + } + return nil +} + +func validateJobUsesCatalogAndSchema(b *bundle.Bundle, key string, job *resources.Job) diag.Diagnostics { + if !hasParameter(job.Parameters, "catalog") || !hasParameter(job.Parameters, "schema") { + return diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s must pass catalog and schema presets as parameters as follows:\n"+ + " parameters:\n"+ + " - name: catalog:\n"+ + " default: ${presets.catalog}\n"+ + " - name: schema\n"+ + " default: ${presets.schema}\n", key), + Severity: diag.Error, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + }} + } + return nil +} + +func hasParameter(parameters []jobs.JobParameterDefinition, name string) bool { + if parameters == nil { + return false + } + for _, p := range parameters { + if p.Name == name { + return true + } + } + return false +} + // toTagArray converts a map of tags to an array of tags. // We sort tags so ensure stable ordering. 
func toTagArray(tags map[string]string) []Tag { diff --git a/bundle/config/presets.go b/bundle/config/presets.go index 61009a2521..948c9043a2 100644 --- a/bundle/config/presets.go +++ b/bundle/config/presets.go @@ -19,6 +19,12 @@ type Presets struct { // Tags to add to all resources. Tags map[string]string `json:"tags,omitempty"` + + // Catalog is the default catalog for all resources. + Catalog string `json:"catalog,omitempty"` + + // Schema is the default schema for all resources. + Schema string `json:"schema,omitempty"` } // IsExplicitlyEnabled tests whether this feature is explicitly enabled. diff --git a/bundle/phases/initialize.go b/bundle/phases/initialize.go index 5582016fdb..2e6d7dce9d 100644 --- a/bundle/phases/initialize.go +++ b/bundle/phases/initialize.go @@ -61,6 +61,7 @@ func Initialize() bundle.Mutator { "bundle", "workspace", "variables", + "presets", ), // Provide permission config errors & warnings after initializing all variables permissions.PermissionDiagnostics(), diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index c42b822a8d..13e2e6c9c9 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -16,6 +16,9 @@ targets: default: true workspace: host: {{workspace_host}} + presets: + catalog: {{default_catalog}} + schema: default prod: mode: production @@ -23,6 +26,9 @@ targets: host: {{workspace_host}} # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy. root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} + presets: + catalog: {{default_catalog}} + schema: default permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl index 5211e3894b..17b209a11e 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl @@ -10,22 +10,26 @@ resources: {{.project_name}}_job: name: {{.project_name}}_job + {{if or (eq .include_notebook "yes") (eq .include_python "yes") -}} + parameters: + - name: catalog + default: ${presets.catalog} + - name: schema + default: ${presets.schema} + + {{end -}} trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger periodic: interval: 1 unit: DAYS - {{- if not is_service_principal}} - + {{if not is_service_principal -}} email_notifications: on_failure: - {{user_name}} - {{else}} - {{end -}} - tasks: {{- if eq .include_notebook "yes" }} - task_key: notebook_task diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 6782a053ba..4163389b3a 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ 
-23,10 +23,25 @@ "metadata": {}, "outputs": [], "source": [ + "# Automatically reload this notebook when it is edited\n", "%load_ext autoreload\n", "%autoreload 2" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Set the catalog and schema for the current session\n", + "dbutils.widgets.text('catalog', '{{default_catalog}}')\n", + "dbutils.widgets.text('schema', 'default')\n", + "catalog = dbutils.widgets.get('catalog')\n", + "schema = dbutils.widgets.get('schema')\n", + "spark.sql(f'USE {catalog}.{schema}')" + ] + }, { "cell_type": "code", "execution_count": 0, diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index c514c6dc5d..1ac627a0c4 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -15,7 +15,15 @@ def get_spark() -> SparkSession: return SparkSession.builder.getOrCreate() def main(): - get_taxis(get_spark()).show(5) + # Set the catalog and schema for the current session + parser = argparse.ArgumentParser() + parser.add_argument('--catalog', '-c', required=True) + parser.add_argument('--schema', '-s', required=True) + args, unknown = parser.parse_known_args() + spark = get_spark() + spark.sql(f"USE {args.catalog}.{args.schema}") + + get_taxis(spark).show(5) if __name__ == '__main__': main() From 7fb646a5388ffe4e61ba16f38f145939c6820442 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 22 Oct 2024 09:39:57 +0200 Subject: [PATCH 02/27] WIP --- bundle/config/mutator/apply_presets.go | 80 ++++++++++++++++++++------ 1 file changed, 64 insertions(+), 16 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 8e3a0baf37..9d265cc99f 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -51,7 +51,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos prefix := t.NamePrefix tags := toTagArray(t.Tags) - // Jobs presets: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus, Catalog, Schema + // Jobs presets. + // Supported: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus, Catalog, Schema for key, j := range r.Jobs { if j.JobSettings == nil { diags = diags.Extend(diag.Errorf("job %s is not defined", key)) @@ -86,11 +87,23 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } if t.Catalog != "" || t.Schema != "" { + for _, task := range j.Tasks { + if task.DbtTask != nil { + if task.DbtTask.Catalog == "" { + task.DbtTask.Catalog = t.Catalog + } + if task.DbtTask.Schema == "" { + task.DbtTask.Schema = t.Catalog + } + } + } diags = diags.Extend(validateJobUsesCatalogAndSchema(b, key, j)) } } - // Pipelines presets: Prefix, PipelinesDevelopment, Catalog, Schema + // Pipelines presets. 
+ // Supported: Prefix, PipelinesDevelopment, Catalog, Schema + // Not supported: Tags (as of 2024-10 not in pipelines API) for key, p := range r.Pipelines { if p.PipelineSpec == nil { diags = diags.Extend(diag.Errorf("pipeline %s is not defined", key)) @@ -109,10 +122,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.Schema != "" && p.Target == "" { p.Target = t.Schema } - // As of 2024-10, pipelines don't yet support tags } - // Models presets: Prefix, Tags + // Models presets + // Supported: Prefix, Tags for key, m := range r.Models { if m.Model == nil { diags = diags.Extend(diag.Errorf("model %s is not defined", key)) @@ -130,7 +143,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Experiments presets: Prefix, Tags + // Experiments presets + // Supported: Prefix, Tags for key, e := range r.Experiments { if e.Experiment == nil { diags = diags.Extend(diag.Errorf("experiment %s is not defined", key)) @@ -158,7 +172,9 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Model serving endpoint presets: Prefix + // Model serving endpoint presets + // Supported: Prefix, Catalog, Schema + // Not supported: Tags (not in API as of 2024-10) for key, e := range r.ModelServingEndpoints { if e.CreateServingEndpoint == nil { diags = diags.Extend(diag.Errorf("model serving endpoint %s is not defined", key)) @@ -166,10 +182,19 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } e.Name = normalizePrefix(prefix) + e.Name - // As of 2024-06, model serving endpoints don't yet support tags + TODO: + - e.AiGateway.InferenceTableConfig.CatalogName + - e.AiGateway.InferenceTableConfig.SchemaName + - e.Config.AutoCaptureConfig.SchemaName + - e.Config.AutoCaptureConfig.CatalogName + - e.Config.ServedEntities[0].EntityName (__catalog_name__.__schema_name__.__model_name__.) + - e.Config.ServedModels[0].ModelName (__catalog_name__.__schema_name__.__model_name__.) + } - // Registered models presets: Prefix, Catalog, Schema + // Registered models presets + // Supported: Prefix, Catalog, Schema + // Not supported: Tags (not in API as of 2024-10) for key, m := range r.RegisteredModels { if m.CreateRegisteredModelRequest == nil { diags = diags.Extend(diag.Errorf("registered model %s is not defined", key)) @@ -182,10 +207,11 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.Schema != "" && m.SchemaName == "" { m.SchemaName = t.Schema } - // As of 2024-06, registered models don't yet support tags } - // Quality monitors presets: Schedule, Catalog, Schema + // Quality monitors presets + // Supported: Schedule, Catalog, Schema + // Not supported: Tags (not in API as of 2024-10) if t.TriggerPauseStatus == config.Paused { for key, q := range r.QualityMonitors { if q.CreateMonitor == nil { @@ -208,6 +234,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } // Schemas: Prefix, Catalog, Schema + // Not supported: Tags (as of 2024-10, only supported in Databricks UI / SQL API) for key, s := range r.Schemas { if s.CreateSchema == nil { diags = diags.Extend(diag.Errorf("schema %s is not defined", key)) @@ -222,11 +249,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos // use that name and don't add any prefix (which might result in dev_dev). s.Name = t.Schema } - // HTTP API for schemas doesn't yet support tags. 
It's only supported in - // the Databricks UI and via the SQL API. } // Clusters: Prefix, Tags + // Not supported: Catalog / Schema (not applicable) for key, c := range r.Clusters { if c.ClusterSpec == nil { diags = diags.Extend(diag.Errorf("cluster %s is not defined", key)) @@ -273,7 +299,10 @@ func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { } func validateJobUsesCatalogAndSchema(b *bundle.Bundle, key string, job *resources.Job) diag.Diagnostics { - if !hasParameter(job.Parameters, "catalog") || !hasParameter(job.Parameters, "schema") { + if !hasTasksRequiringParameters(job) { + return nil + } + if !hasParameter(job, "catalog") || !hasParameter(job, "schema") { return diag.Diagnostics{{ Summary: fmt.Sprintf("job %s must pass catalog and schema presets as parameters as follows:\n"+ " parameters:\n"+ @@ -288,11 +317,30 @@ func validateJobUsesCatalogAndSchema(b *bundle.Bundle, key string, job *resource return nil } -func hasParameter(parameters []jobs.JobParameterDefinition, name string) bool { - if parameters == nil { +// hasTasksRequiringParameters determines if there is a task in this job that +// requires the 'catalog' and 'schema' parameters when they are enabled in presets. +func hasTasksRequiringParameters(job *resources.Job) bool { + for _, task := range job.Tasks { + // Allowlisted task types: these don't require catalog / schema to be passed as a paramater + if task.DbtTask != nil || task.ConditionTask != nil || task.RunJobTask != nil || task.ForEachTask != nil || task.PipelineTask != nil { + continue + } + // Alert tasks, query object tasks, etc. don't require a parameter; + // the catalog / schema is set inside those objects instead. + if task.SqlTask != nil && task.SqlTask.File == nil { + continue + } + return true + } + return false +} + +// hasParameter determines if a job has a parameter with the given name. 
+func hasParameter(job *resources.Job, name string) bool { + if job.Parameters == nil { return false } - for _, p := range parameters { + for _, p := range job.Parameters { if p.Name == name { return true } From c1f5b3c8e738f73af74d2b12eb8ce246e41c7612 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sat, 2 Nov 2024 09:52:09 +0100 Subject: [PATCH 03/27] Use a helper notebook to set catalog/schema --- bundle/config/mutator/apply_presets.go | 145 +++++++++++------- .../databricks_template_schema.json | 34 +++- .../{{.project_name}}/databricks.yml.tmpl | 15 +- .../resources/{{.project_name}}.job.yml.tmpl | 8 - .../{{.project_name}}.pipeline.yml.tmpl | 7 - .../src/apply_defaults.ipynb.tmpl | 83 ++++++++++ .../{{.project_name}}/src/notebook.ipynb.tmpl | 23 +-- .../src/{{.project_name}}/main.py.tmpl | 27 ++-- 8 files changed, 238 insertions(+), 104 deletions(-) create mode 100644 libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 860d080d9e..a9d475cfb2 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -93,11 +93,13 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos task.DbtTask.Catalog = t.Catalog } if task.DbtTask.Schema == "" { - task.DbtTask.Schema = t.Catalog + task.DbtTask.Schema = t.Schema } } } - diags = diags.Extend(validateJobUsesCatalogAndSchema(b, key, j)) + + diags = diags.Extend(addCatalogSchemaParameters(b, key, j, t)) + diags = diags.Extend(validateCatalogSchemaUsage(b, key, j)) } } @@ -116,7 +118,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.TriggerPauseStatus == config.Paused { p.Continuous = false } - if t.Catalog != "" && p.Catalog == "" { + if t.Catalog != "" && p.Catalog == "" && p.Catalog != "hive_metastore" { p.Catalog = t.Catalog } if t.Schema != "" && p.Target == "" { @@ -182,13 +184,16 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } e.Name = normalizePrefix(prefix) + e.Name - TODO: - - e.AiGateway.InferenceTableConfig.CatalogName - - e.AiGateway.InferenceTableConfig.SchemaName - - e.Config.AutoCaptureConfig.SchemaName - - e.Config.AutoCaptureConfig.CatalogName - - e.Config.ServedEntities[0].EntityName (__catalog_name__.__schema_name__.__model_name__.) - - e.Config.ServedModels[0].ModelName (__catalog_name__.__schema_name__.__model_name__.) + if t.Catalog != "" || t.Schema != "" { + // TODO: + // - e.AiGateway.InferenceTableConfig.CatalogName + // - e.AiGateway.InferenceTableConfig.SchemaName + // - e.Config.AutoCaptureConfig.SchemaName + // - e.Config.AutoCaptureConfig.CatalogName + // - e.Config.ServedEntities[0].EntityName (__catalog_name__.__schema_name__.__model_name__.) + // - e.Config.ServedModels[0].ModelName (__catalog_name__.__schema_name__.__model_name__.) 
+ diags = diags.Extend(diag.Errorf("model serving endpoints are not supported with catalog/schema presets")) + } } @@ -306,55 +311,30 @@ func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { } return nil } - -func validateJobUsesCatalogAndSchema(b *bundle.Bundle, key string, job *resources.Job) diag.Diagnostics { - if !hasTasksRequiringParameters(job) { - return nil - } - if !hasParameter(job, "catalog") || !hasParameter(job, "schema") { - return diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s must pass catalog and schema presets as parameters as follows:\n"+ - " parameters:\n"+ - " - name: catalog:\n"+ - " default: ${presets.catalog}\n"+ - " - name: schema\n"+ - " default: ${presets.schema}\n", key), - Severity: diag.Error, - Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, - }} - } - return nil -} - -// hasTasksRequiringParameters determines if there is a task in this job that -// requires the 'catalog' and 'schema' parameters when they are enabled in presets. -func hasTasksRequiringParameters(job *resources.Job) bool { - for _, task := range job.Tasks { - // Allowlisted task types: these don't require catalog / schema to be passed as a paramater - if task.DbtTask != nil || task.ConditionTask != nil || task.RunJobTask != nil || task.ForEachTask != nil || task.PipelineTask != nil { - continue - } - // Alert tasks, query object tasks, etc. don't require a parameter; - // the catalog / schema is set inside those objects instead. - if task.SqlTask != nil && task.SqlTask.File == nil { - continue +func validateCatalogSchemaUsage(b *bundle.Bundle, key string, job *resources.Job) diag.Diagnostics { + for _, t := range job.Tasks { + if t.NotebookTask != nil { + // TODO: proper validation that warns + return diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s must read catalog and schema from parameters for notebook %s", + key, t.NotebookTask.NotebookPath), + Severity: diag.Recommendation, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + }} } - return true } - return false -} -// hasParameter determines if a job has a parameter with the given name. -func hasParameter(job *resources.Job, name string) bool { - if job.Parameters == nil { - return false - } - for _, p := range job.Parameters { - if p.Name == name { - return true - } - } - return false + // TODO: validate that any notebooks actually read the catalog/schema arguments + // if ... { + // return diag.Diagnostics{{ + // Summary: fmt.Sprintf("job %s must pass catalog and schema presets as parameters as follows:\n"+ + // " ...", key), + // Severity: diag.Error, + // Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + // }} + // } + + return nil } // toTagArray converts a map of tags to an array of tags. @@ -388,3 +368,56 @@ func normalizePrefix(prefix string) string { return textutil.NormalizeString(prefix) + suffix } + +// addCatalogSchemaParameters adds catalog and schema parameters to a job if they don't already exist. +// Returns any warning diagnostics for existing parameters. 
+func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job, t config.Presets) diag.Diagnostics { + var diags diag.Diagnostics + + // Check for existing catalog/schema parameters + hasCatalog := false + hasSchema := false + if job.Parameters != nil { + for _, param := range job.Parameters { + if param.Name == "catalog" { + hasCatalog = true + diags = diags.Extend(diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s already has 'catalog' parameter defined; ignoring preset value", key), + Severity: diag.Warning, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + }}) + } + if param.Name == "schema" { + hasSchema = true + diags = diags.Extend(diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s already has 'schema' parameter defined; ignoring preset value", key), + Severity: diag.Warning, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + }}) + } + } + } + + // Initialize parameters if nil + if job.Parameters == nil { + job.Parameters = []jobs.JobParameterDefinition{} + } + + // Add catalog parameter if not already present + if !hasCatalog && t.Catalog != "" { + job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ + Name: "catalog", + Default: t.Catalog, + }) + } + + // Add schema parameter if not already present + if !hasSchema && t.Schema != "" { + job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ + Name: "schema", + Default: t.Schema, + }) + } + + return diags +} diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index d53bad91ab..7115762478 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -29,7 +29,39 @@ "enum": ["yes", "no"], "description": "Include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'", "order": 4 + }, + "default_catalog": { + "type": "string", + "default": "{{default_catalog}}", + "pattern": "^\\w*$", + "pattern_match_failure_message": "Invalid catalog name.", + "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog", + "order": 5 + }, + "personal_schemas": { + "type": "string", + "description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas", + "enum": [ + "yes, use a schema based on the current user name during development", + "no, use a shared schema during development" + ], + "order": 6 + }, + "shared_schema": { + "skip_prompt_if": { + "properties": { + "personal_schemas": { + "const": "yes, use a schema based on the current user name during development" + } + } + }, + "type": "string", + "default": "default", + "pattern": "^\\w+$", + "pattern_match_failure_message": "Invalid schema name.", + "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "order": 7 } }, - "success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." 
+ "success_message": "\nWorkspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml').\nworkspace_host: {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index 13e2e6c9c9..2a0bdee098 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -6,6 +6,13 @@ bundle: include: - resources/*.yml +{{- $dev_schema := .shared_schema }} +{{- $prod_schema := .shared_schema }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "${workspace.current_user.short_name}"}} + {{- $prod_schema = "default"}} +{{- end}} + targets: dev: # The default target uses 'mode: development' to create a development copy. @@ -17,8 +24,8 @@ targets: workspace: host: {{workspace_host}} presets: - catalog: {{default_catalog}} - schema: default + catalog: {{.default_catalog}} + schema: {{$dev_schema}} prod: mode: production @@ -27,8 +34,8 @@ targets: # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy. root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} presets: - catalog: {{default_catalog}} - schema: default + catalog: {{.default_catalog}} + schema: {{$prod_schema}} permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl index 17b209a11e..0ea69a75ae 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl @@ -10,14 +10,6 @@ resources: {{.project_name}}_job: name: {{.project_name}}_job - {{if or (eq .include_notebook "yes") (eq .include_python "yes") -}} - parameters: - - name: catalog - default: ${presets.catalog} - - name: schema - default: ${presets.schema} - - {{end -}} trigger: # Run this job every day, exactly one day from the last run; see https://docs.databricks.com/api/workspace/jobs/create#trigger periodic: diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl index 1c6b8607e2..c3f94cb1c8 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl @@ -3,13 +3,6 @@ resources: pipelines: {{.project_name}}_pipeline: name: {{.project_name}}_pipeline - {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} - ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: - # catalog: catalog_name - {{- else}} - catalog: 
{{default_catalog}} - {{- end}} - target: {{.project_name}}_${bundle.environment} libraries: - notebook: path: ../src/dlt_pipeline.ipynb diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl new file mode 100644 index 0000000000..a68e7d822d --- /dev/null +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl @@ -0,0 +1,83 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# helper notebook: apply_defaults\n", + "\n", + "This helper notebook is used to create widgets that configure the default catalog\n", + "and schema.\n", + "\n", + "Usage:\n", + "\n", + "```\n", + "% run ../relative/path/to/apply_defaults\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", + "showTitle": false, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", + "dbutils.widgets.text('catalog', '{{.default_catalog}}')\n", + "dbutils.widgets.text('schema', 'default')\n", + "catalog = dbutils.widgets.get('catalog')\n", + "schema = dbutils.widgets.get('schema')\n", + "spark.sql(f'USE {catalog}.{schema}')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Automatically reload imported modules when they change\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "dlt_pipeline", + "widgets": {} + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 4163389b3a..95f6b16a3f 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -23,23 +23,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Automatically reload this notebook when it is edited\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Set the catalog and schema for the current session\n", - "dbutils.widgets.text('catalog', '{{default_catalog}}')\n", - "dbutils.widgets.text('schema', 'default')\n", - "catalog = dbutils.widgets.get('catalog')\n", - "schema = dbutils.widgets.get('schema')\n", - "spark.sql(f'USE {catalog}.{schema}')" + "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", + "%run ./apply_defaults" ] }, { @@ -62,9 +47,9 @@ {{- if (eq .include_python "yes") }} "from 
{{.project_name}} import main\n", "\n", - "main.get_taxis(spark).show(10)" + "main.create_example_table()" {{else}} - "spark.range(10)" + "spark.sql("CREATE OR REPLACE TABLE example AS SELECT 'example table' AS text_column")" {{end -}} ] } diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index 1ac627a0c4..80d447f6c1 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -1,21 +1,30 @@ from pyspark.sql import SparkSession, DataFrame -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - - -# Create a new Databricks Connect session. If this fails, -# check that you have configured Databricks Connect correctly. -# See https://docs.databricks.com/dev-tools/databricks-connect.html. def get_spark() -> SparkSession: + """ + Create a new Databricks Connect session. If this fails, + check that you have configured Databricks Connect correctly. + See https://docs.databricks.com/dev-tools/databricks-connect.html. + """ try: from databricks.connect import DatabricksSession return DatabricksSession.builder.getOrCreate() except ImportError: return SparkSession.builder.getOrCreate() +def get_taxis(spark: SparkSession) -> DataFrame: + return spark.read.table("samples.nyctaxi.trips") + +def create_example_table(): + """ + Create a table called 'example' in the default catalog and schema. + """ + get_spark().sql("CREATE OR REPLACE TABLE example AS SELECT 'example table' AS text_column") + def main(): - # Set the catalog and schema for the current session + # Set the catalog and schema for the current session. + # In the default template, these parameters are set + # using the 'catalog' and 'schema' presets in databricks.yml. 
parser = argparse.ArgumentParser() parser.add_argument('--catalog', '-c', required=True) parser.add_argument('--schema', '-s', required=True) @@ -23,7 +32,7 @@ def main(): spark = get_spark() spark.sql(f"USE {args.catalog}.{args.schema}") - get_taxis(spark).show(5) + create_example_table() if __name__ == '__main__': main() From 487884b6a5fc4a692e7a3e925bdb1522446fc19f Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 2 Dec 2024 08:43:51 +0100 Subject: [PATCH 04/27] WIP --- .../databricks_template_schema.json | 4 +- .../src/apply_defaults.ipynb.tmpl | 83 ------------------- .../{{.project_name}}/src/notebook.ipynb.tmpl | 63 +++++++++++++- .../src/{{.project_name}}/main.py.tmpl | 1 + 4 files changed, 64 insertions(+), 87 deletions(-) delete mode 100644 libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index 7115762478..6f457a6ae2 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -4,7 +4,7 @@ "project_name": { "type": "string", "default": "my_project", - "description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project", + "description": "\nPlease provide a unique name for this project.\nproject_name", "order": 1, "pattern": "^[A-Za-z0-9_]+$", "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." @@ -13,7 +13,7 @@ "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'", + "description": "\n\nInclude a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'", "order": 2 }, "include_dlt": { diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl deleted file mode 100644 index a68e7d822d..0000000000 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/apply_defaults.ipynb.tmpl +++ /dev/null @@ -1,83 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9a626959-61c8-4bba-84d2-2a4ecab1f7ec", - "showTitle": false, - "title": "" - } - }, - "source": [ - "# helper notebook: apply_defaults\n", - "\n", - "This helper notebook is used to create widgets that configure the default catalog\n", - "and schema.\n", - "\n", - "Usage:\n", - "\n", - "```\n", - "% run ../relative/path/to/apply_defaults\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "9198e987-5606-403d-9f6d-8f14e6a4017f", - "showTitle": false, - "title": "" - } - }, - "outputs": [], - "source": [ - "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", - "dbutils.widgets.text('catalog', '{{.default_catalog}}')\n", - "dbutils.widgets.text('schema', 'default')\n", - "catalog = dbutils.widgets.get('catalog')\n", - "schema = dbutils.widgets.get('schema')\n", - "spark.sql(f'USE {catalog}.{schema}')" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - 
"metadata": {}, - "outputs": [], - "source": [ - "# Automatically reload imported modules when they change\n", - "%load_ext autoreload\n", - "%autoreload 2" - ] - } - ], - "metadata": { - "application/vnd.databricks.v1+notebook": { - "dashboards": [], - "language": "python", - "notebookMetadata": { - "pythonIndentUnit": 2 - }, - "notebookName": "dlt_pipeline", - "widgets": {} - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 95f6b16a3f..4d3c398c91 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -24,7 +24,9 @@ "outputs": [], "source": [ "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", - "%run ./apply_defaults" + "catalog = dbutils.widgets.get('catalog')\n", + "schema = dbutils.widgets.get('schema')\n", + "spark.sql(f'USE {catalog}.{schema}')" ] }, { @@ -62,7 +64,64 @@ "pythonIndentUnit": 2 }, "notebookName": "notebook", - "widgets": {} + "widgets": { + "catalog": { + "currentValue": "{{.default_catalog}}", + "nuid": "3965fc9c-8080-45b1-bee3-f75cef7685b4", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "{{.default_catalog}}", + "label": null, + "name": "catalog", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "{{.default_catalog}}", + "label": null, + "name": "catalog", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + }, +{{- $dev_schema := .shared_schema }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "{{short_name}}"}} +{{- end}} + "schema": { + "currentValue": "{{$dev_schema}}", + "nuid": "6ec0d70f-39bf-4859-a510-02c3e3d59bff", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "{{$dev_schema}}", + "label": null, + "name": "schema", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "{{$dev_schema}}", + "label": null, + "name": "schema", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + } + } }, "kernelspec": { "display_name": "Python 3", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index 80d447f6c1..45af6fa86e 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -1,4 +1,5 @@ from pyspark.sql import SparkSession, DataFrame +import argparse def get_spark() -> SparkSession: """ From 963f5e6636c928609bd64f2e9ba046b00a092c8b Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 6 Dec 2024 22:12:24 +0100 Subject: [PATCH 05/27] Add translate_paths.GetLocalPath --- 
bundle/config/mutator/translate_paths.go | 83 +++++++++++------ .../mutator/translate_paths_artifacts.go | 5 +- .../mutator/translate_paths_dashboards.go | 5 +- bundle/config/mutator/translate_paths_jobs.go | 5 +- .../mutator/translate_paths_pipelines.go | 5 +- bundle/config/mutator/translate_paths_test.go | 90 +++++++++++++++++++ 6 files changed, 156 insertions(+), 37 deletions(-) diff --git a/bundle/config/mutator/translate_paths.go b/bundle/config/mutator/translate_paths.go index 5e016d8a1e..f7b6547e84 100644 --- a/bundle/config/mutator/translate_paths.go +++ b/bundle/config/mutator/translate_paths.go @@ -14,6 +14,7 @@ import ( "github.com/databricks/cli/bundle/config" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/notebook" ) @@ -58,6 +59,47 @@ type translateContext struct { seen map[string]string } +// GetLocalPath returns the local file system paths for a path referenced from a resource.. +// If it's an absolute path, we treat it as a workspace path and return "". +// +// Arguments: +// +// sourceDir - the source directory for the resource definition. +// p - the path referenced from the resource definition. +// +// Returns: +// +// localPath - the full local file system path. +// localRelPath - the relative path from the base directory. +func GetLocalPath(ctx context.Context, b *bundle.Bundle, sourceDir string, p string) (string, string, error) { + if p == "" { + return "", "", fmt.Errorf("path cannot be empty") + } + if path.IsAbs(p) { + return "", "", nil + } + + url, err := url.Parse(p) + if err != nil { + // Apparently this path is not a URL; this can happen for paths that + // have non-URL characters like './profit-%.csv'. + log.Warnf(ctx, "Failed to parse path as a URL '%s': %v", p, err) + } else if url.Scheme != "" { + return "", "", nil + } + + localPath := filepath.Join(sourceDir, filepath.FromSlash(p)) + localRelPath, err := filepath.Rel(b.SyncRootPath, localPath) + if err != nil { + return "", "", err + } + if strings.HasPrefix(localRelPath, "..") { + return "", "", fmt.Errorf("path '%s' is not contained in sync root path", p) + } + + return localPath, localRelPath, nil +} + // rewritePath converts a given relative path from the loaded config to a new path based on the passed rewriting function // // It takes these arguments: @@ -68,42 +110,25 @@ type translateContext struct { // // The function returns an error if it is impossible to rewrite the given relative path. func (t *translateContext) rewritePath( + ctx context.Context, dir string, p *string, fn rewriteFunc, ) error { - // We assume absolute paths point to a location in the workspace - if path.IsAbs(*p) { - return nil - } - - url, err := url.Parse(*p) + localPath, localRelPath, err := GetLocalPath(ctx, t.b, dir, *p) if err != nil { return err } - - // If the file path has scheme, it's a full path and we don't need to transform it - if url.Scheme != "" { + if localPath == "" { + // Skip absolute paths return nil } - // Local path is relative to the directory the resource was defined in. - localPath := filepath.Join(dir, filepath.FromSlash(*p)) if interp, ok := t.seen[localPath]; ok { *p = interp return nil } - // Local path must be contained in the sync root. - // If it isn't, it won't be synchronized into the workspace. 
- localRelPath, err := filepath.Rel(t.b.SyncRootPath, localPath) - if err != nil { - return err - } - if strings.HasPrefix(localRelPath, "..") { - return fmt.Errorf("path %s is not contained in sync root path", localPath) - } - var workspacePath string if config.IsExplicitlyEnabled(t.b.Config.Presets.SourceLinkedDeployment) { workspacePath = t.b.SyncRootPath @@ -215,9 +240,9 @@ func (t *translateContext) translateNoOpWithPrefix(literal, localFullPath, local return localRelPath, nil } -func (t *translateContext) rewriteValue(p dyn.Path, v dyn.Value, fn rewriteFunc, dir string) (dyn.Value, error) { +func (t *translateContext) rewriteValue(ctx context.Context, p dyn.Path, v dyn.Value, fn rewriteFunc, dir string) (dyn.Value, error) { out := v.MustString() - err := t.rewritePath(dir, &out, fn) + err := t.rewritePath(ctx, dir, &out, fn) if err != nil { if target := (&ErrIsNotebook{}); errors.As(err, target) { return dyn.InvalidValue, fmt.Errorf(`expected a file for "%s" but got a notebook: %w`, p, target) @@ -231,15 +256,15 @@ func (t *translateContext) rewriteValue(p dyn.Path, v dyn.Value, fn rewriteFunc, return dyn.NewValue(out, v.Locations()), nil } -func (t *translateContext) rewriteRelativeTo(p dyn.Path, v dyn.Value, fn rewriteFunc, dir, fallback string) (dyn.Value, error) { - nv, err := t.rewriteValue(p, v, fn, dir) +func (t *translateContext) rewriteRelativeTo(ctx context.Context, p dyn.Path, v dyn.Value, fn rewriteFunc, dir, fallback string) (dyn.Value, error) { + nv, err := t.rewriteValue(ctx, p, v, fn, dir) if err == nil { return nv, nil } // If we failed to rewrite the path, try to rewrite it relative to the fallback directory. if fallback != "" { - nv, nerr := t.rewriteValue(p, v, fn, fallback) + nv, nerr := t.rewriteValue(ctx, p, v, fn, fallback) if nerr == nil { // TODO: Emit a warning that this path should be rewritten. 
return nv, nil @@ -249,7 +274,7 @@ func (t *translateContext) rewriteRelativeTo(p dyn.Path, v dyn.Value, fn rewrite return dyn.InvalidValue, err } -func (m *translatePaths) Apply(_ context.Context, b *bundle.Bundle) diag.Diagnostics { +func (m *translatePaths) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { t := &translateContext{ b: b, seen: make(map[string]string), @@ -257,13 +282,13 @@ func (m *translatePaths) Apply(_ context.Context, b *bundle.Bundle) diag.Diagnos err := b.Config.Mutate(func(v dyn.Value) (dyn.Value, error) { var err error - for _, fn := range []func(dyn.Value) (dyn.Value, error){ + for _, fn := range []func(ctx context.Context, v dyn.Value) (dyn.Value, error){ t.applyJobTranslations, t.applyPipelineTranslations, t.applyArtifactTranslations, t.applyDashboardTranslations, } { - v, err = fn(v) + v, err = fn(ctx, v) if err != nil { return dyn.InvalidValue, err } diff --git a/bundle/config/mutator/translate_paths_artifacts.go b/bundle/config/mutator/translate_paths_artifacts.go index 921c00c734..44e034204e 100644 --- a/bundle/config/mutator/translate_paths_artifacts.go +++ b/bundle/config/mutator/translate_paths_artifacts.go @@ -1,6 +1,7 @@ package mutator import ( + "context" "fmt" "github.com/databricks/cli/libs/dyn" @@ -27,7 +28,7 @@ func (t *translateContext) artifactRewritePatterns() []artifactRewritePattern { } } -func (t *translateContext) applyArtifactTranslations(v dyn.Value) (dyn.Value, error) { +func (t *translateContext) applyArtifactTranslations(ctx context.Context, v dyn.Value) (dyn.Value, error) { var err error for _, rewritePattern := range t.artifactRewritePatterns() { @@ -38,7 +39,7 @@ func (t *translateContext) applyArtifactTranslations(v dyn.Value) (dyn.Value, er return dyn.InvalidValue, fmt.Errorf("unable to determine directory for artifact %s: %w", key, err) } - return t.rewriteRelativeTo(p, v, rewritePattern.fn, dir, "") + return t.rewriteRelativeTo(ctx, p, v, rewritePattern.fn, dir, "") }) if err != nil { return dyn.InvalidValue, err diff --git a/bundle/config/mutator/translate_paths_dashboards.go b/bundle/config/mutator/translate_paths_dashboards.go index 93822a5991..a6aac82e95 100644 --- a/bundle/config/mutator/translate_paths_dashboards.go +++ b/bundle/config/mutator/translate_paths_dashboards.go @@ -1,12 +1,13 @@ package mutator import ( + "context" "fmt" "github.com/databricks/cli/libs/dyn" ) -func (t *translateContext) applyDashboardTranslations(v dyn.Value) (dyn.Value, error) { +func (t *translateContext) applyDashboardTranslations(ctx context.Context, v dyn.Value) (dyn.Value, error) { // Convert the `file_path` field to a local absolute path. // We load the file at this path and use its contents for the dashboard contents. 
pattern := dyn.NewPattern( @@ -23,6 +24,6 @@ func (t *translateContext) applyDashboardTranslations(v dyn.Value) (dyn.Value, e return dyn.InvalidValue, fmt.Errorf("unable to determine directory for dashboard %s: %w", key, err) } - return t.rewriteRelativeTo(p, v, t.retainLocalAbsoluteFilePath, dir, "") + return t.rewriteRelativeTo(ctx, p, v, t.retainLocalAbsoluteFilePath, dir, "") }) } diff --git a/bundle/config/mutator/translate_paths_jobs.go b/bundle/config/mutator/translate_paths_jobs.go index c29ff0ea95..4165c0c070 100644 --- a/bundle/config/mutator/translate_paths_jobs.go +++ b/bundle/config/mutator/translate_paths_jobs.go @@ -1,6 +1,7 @@ package mutator import ( + "context" "fmt" "slices" @@ -9,7 +10,7 @@ import ( "github.com/databricks/cli/libs/dyn" ) -func (t *translateContext) applyJobTranslations(v dyn.Value) (dyn.Value, error) { +func (t *translateContext) applyJobTranslations(ctx context.Context, v dyn.Value) (dyn.Value, error) { var err error fallback, err := gatherFallbackPaths(v, "jobs") @@ -43,7 +44,7 @@ func (t *translateContext) applyJobTranslations(v dyn.Value) (dyn.Value, error) return dyn.InvalidValue, err } - return t.rewriteRelativeTo(p, v, rewritePatternFn, dir, fallback[key]) + return t.rewriteRelativeTo(ctx, p, v, rewritePatternFn, dir, fallback[key]) }) } diff --git a/bundle/config/mutator/translate_paths_pipelines.go b/bundle/config/mutator/translate_paths_pipelines.go index 71a65e8462..584da825a4 100644 --- a/bundle/config/mutator/translate_paths_pipelines.go +++ b/bundle/config/mutator/translate_paths_pipelines.go @@ -1,6 +1,7 @@ package mutator import ( + "context" "fmt" "github.com/databricks/cli/libs/dyn" @@ -34,7 +35,7 @@ func (t *translateContext) pipelineRewritePatterns() []pipelineRewritePattern { } } -func (t *translateContext) applyPipelineTranslations(v dyn.Value) (dyn.Value, error) { +func (t *translateContext) applyPipelineTranslations(ctx context.Context, v dyn.Value) (dyn.Value, error) { var err error fallback, err := gatherFallbackPaths(v, "pipelines") @@ -50,7 +51,7 @@ func (t *translateContext) applyPipelineTranslations(v dyn.Value) (dyn.Value, er return dyn.InvalidValue, fmt.Errorf("unable to determine directory for pipeline %s: %w", key, err) } - return t.rewriteRelativeTo(p, v, rewritePattern.fn, dir, fallback[key]) + return t.rewriteRelativeTo(ctx, p, v, rewritePattern.fn, dir, fallback[key]) }) if err != nil { return dyn.InvalidValue, err diff --git a/bundle/config/mutator/translate_paths_test.go b/bundle/config/mutator/translate_paths_test.go index bf6ba15d8b..c93470e5b9 100644 --- a/bundle/config/mutator/translate_paths_test.go +++ b/bundle/config/mutator/translate_paths_test.go @@ -1002,3 +1002,93 @@ func TestTranslatePathsWithSourceLinkedDeployment(t *testing.T) { b.Config.Resources.Pipelines["pipeline"].Libraries[1].Notebook.Path, ) } + +// TestGetLocalPath contains test cases for the GetLocalPath function. 
+func TestGetLocalPath(t *testing.T) { + testCases := []struct { + name string + input string + expected string + errMsg string + }{ + { + name: "EmptyPath", + input: "", + expected: "", + errMsg: "path cannot be empty", + }, + { + name: "AbsolutePathUnix", + input: "/usr/local/bin", + expected: "", + errMsg: "", + }, + { + name: "AbsolutePathWindows", + input: `C:\Program Files\`, + expected: ``, + errMsg: "", + }, + { + name: "RelativePath", + input: "./local/path", + expected: "root/src/local/path", + errMsg: "", + }, + { + name: "NestedRelativePath", + input: "../relative/path", + expected: "root/relative/path", + errMsg: "", + }, + { + name: "PathWithSpaces", + input: "path/with spaces and slash/", + expected: "root/src/path/with spaces and slash", + errMsg: "", + }, + { + name: "PathWithSpecialChars", + input: "path/with/@#$%^&*()!", + expected: "root/src/path/with/@#$%^&*()!", + errMsg: "", + }, + { + name: "DBFS path", + input: "dbfs:/some/path", + expected: "", + errMsg: "", + }, + { + name: "PathTraversal", + input: "path/with/../../../traversal", + expected: "root/traversal", + errMsg: "", + }, + { + name: "RelativeOutOfBundle", + input: "../../outside", + expected: "", + errMsg: "path '../../outside' is not contained in sync root path", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + b := &bundle.Bundle{ + SyncRootPath: "root", + } + ctx := context.Background() + localPath, _, err := mutator.GetLocalPath(ctx, b, "root/src", tc.input) + if tc.errMsg != "" { + require.Error(t, err) + assert.Contains(t, err.Error(), tc.errMsg) + } else { + require.NoError(t, err) + // On Windows, filepath.Join may return backslashes. Normalize to forward slashes for comparison. + normalizedResult := filepath.ToSlash(localPath) + assert.Equal(t, tc.expected, normalizedResult, "For test case: %s", tc.name) + } + }) + } +} From 2addd0c506f61b8c1eed98269931b4c9212d97b4 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 6 Dec 2024 22:12:42 +0100 Subject: [PATCH 06/27] Extend catalog/schema preset support --- bundle/config/mutator/apply_presets.go | 113 ++++++++++++++---- bundle/config/mutator/apply_presets_test.go | 50 ++++++++ .../databricks_template_schema.json | 6 +- .../scratch/exploration.ipynb.tmpl | 78 ++++++++++-- .../src/{{.project_name}}/main.py.tmpl | 4 +- 5 files changed, 213 insertions(+), 38 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index a68ebfc1c0..39d979992a 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -3,7 +3,9 @@ package mutator import ( "context" "fmt" + "os" "path" + "regexp" "slices" "sort" "strings" @@ -14,6 +16,7 @@ import ( "github.com/databricks/cli/libs/dbr" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/textutil" "github.com/databricks/databricks-sdk-go/service/catalog" "github.com/databricks/databricks-sdk-go/service/jobs" @@ -100,7 +103,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } diags = diags.Extend(addCatalogSchemaParameters(b, key, j, t)) - diags = diags.Extend(validateCatalogSchemaUsage(b, key, j)) + diags = diags.Extend(recommendCatalogSchemaUsage(b, ctx, key, j)) } } @@ -333,31 +336,6 @@ func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { } return nil } -func validateCatalogSchemaUsage(b *bundle.Bundle, key string, job *resources.Job) diag.Diagnostics { - 
for _, t := range job.Tasks { - if t.NotebookTask != nil { - // TODO: proper validation that warns - return diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s must read catalog and schema from parameters for notebook %s", - key, t.NotebookTask.NotebookPath), - Severity: diag.Recommendation, - Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, - }} - } - } - - // TODO: validate that any notebooks actually read the catalog/schema arguments - // if ... { - // return diag.Diagnostics{{ - // Summary: fmt.Sprintf("job %s must pass catalog and schema presets as parameters as follows:\n"+ - // " ...", key), - // Severity: diag.Error, - // Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, - // }} - // } - - return nil -} // toTagArray converts a map of tags to an array of tags. // We sort tags so ensure stable ordering. @@ -443,3 +421,86 @@ func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job return diags } + +func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key string, job *resources.Job) diag.Diagnostics { + var diags diag.Diagnostics + for _, t := range job.Tasks { + var relPath string + var expected string + var fix string + if t.NotebookTask != nil { + relPath = t.NotebookTask.NotebookPath + expected = `" dbutils.widgets.text(['"]schema|` + + `USE[^)]+schema` + fix = " dbutils.widgets.text('catalog')\n" + + " dbutils.widgets.text('schema')\n" + + " catalog = dbutils.widgets.get('catalog')\n" + + " schema = dbutils.widgets.get('schema')\n" + + " spark.sql(f'USE {catalog}.{schema}')\n" + } else if t.SparkPythonTask != nil { + relPath = t.SparkPythonTask.PythonFile + expected = `add_argument\(['"]--catalog'|` + + `USE[^)]+catalog` + fix = " def main():\n" + + " parser = argparse.ArgumentParser()\n" + + " parser.add_argument('--catalog', required=True)\n" + + " parser.add_argument('--schema', '-s', required=True)\n" + + " args, unknown = parser.parse_known_args()\n" + + " spark.sql(f\"USE {args.catalog}.{args.schema}\")\n" + } else if t.SqlTask != nil && t.SqlTask.File != nil { + relPath = t.SqlTask.File.Path + expected = `:schema|\{\{schema\}\}` + fix = " USE CATALOG {{catalog}};\n" + + " USE IDENTIFIER({schema});\n" + } else { + continue + } + + sourceDir, err := b.Config.GetLocation("resources.jobs." 
+ key).Directory() + if err != nil { + continue + } + + localPath, _, err := GetLocalPath(ctx, b, sourceDir, relPath) + if err != nil { + // Any path errors are reported by another mutator + continue + } + if localPath == "" { + // If there is no local copy we don't want to download it and skip this check + continue + } + + log.Warnf(ctx, "LocalPath: %s, relPath: %s, sourceDir: %s", localPath, relPath, sourceDir) + + if !fileIncludesPattern(ctx, localPath, expected) { + diags = diags.Extend(diag.Diagnostics{{ + Summary: fmt.Sprintf("Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + + fix), + Severity: diag.Recommendation, + Locations: []dyn.Location{{ + File: localPath, + Line: 1, + Column: 1, + }}, + }}) + } + } + + return diags +} + +func fileIncludesPattern(ctx context.Context, filePath string, expected string) bool { + content, err := os.ReadFile(filePath) + if err != nil { + log.Warnf(ctx, "failed to check file %s: %v", filePath, err) + return true + } + + matched, err := regexp.MatchString(expected, string(content)) + if err != nil { + log.Warnf(ctx, "failed to check pattern in %s: %v", filePath, err) + return true + } + return matched +} diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index 497ef051ad..eb0eea5de3 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -453,3 +453,53 @@ func TestApplyPresetsSourceLinkedDeployment(t *testing.T) { } } + +// func TestApplyPresetsRecommendsCatalogSchemaUsage(t *testing.T) { +// dir := t.TempDir() + +// ... + +// b := &bundle.Bundle{ +// Config: config.Root{ +// Resources: config.Resources{ +// Jobs: map[string]*resources.Job{ +// "job1": { +// JobSettings: &jobs.JobSettings{ +// Tasks: []jobs.Task{ +// { +// NotebookTask: &jobs.NotebookTask{ +// NotebookPath: notebookPath, +// }, +// }, +// { +// SparkPythonTask: &jobs.SparkPythonTask{ +// PythonFile: pythonPath, +// }, +// }, +// { +// NotebookTask: &jobs.NotebookTask{ +// NotebookPath: "/Workspace/absolute/path/notebook", +// }, +// }, +// }, +// }, +// }, +// }, +// }, +// }, +// } + +// ctx := context.Background() +// diags := bundle.Apply(ctx, b, ApplyPresets()) +// require.Len(t, diags, 2) + +// // Check notebook diagnostic +// assert.Equal(t, notebookPath, diags[0].Locations[0].File) +// assert.Equal(t, 1, diags[0].Locations[0].Line) +// assert.Equal(t, 1, diags[0].Locations[0].Column) + +// // Check Python script diagnostic +// assert.Equal(t, pythonPath, diags[1].Locations[0].File) +// assert.Equal(t, 1, diags[1].Locations[0].Line) +// assert.Equal(t, 1, diags[1].Locations[0].Column) +// } diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index 6f457a6ae2..92e8316df5 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -13,21 +13,21 @@ "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "\n\nInclude a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'", + "description": "\nWould you like to include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'?", "order": 2 }, "include_dlt": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Include a stub (sample) Delta Live Tables pipeline in 
'{{.project_name}}{{path_separator}}src'", + "description": "Would you like to include a stub (sample) Delta Live Tables pipeline in '{{.project_name}}{{path_separator}}src'?", "order": 3 }, "include_python": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'", + "description": "Would you like to include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'?", "order": 4 }, "default_catalog": { diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl index 42164dff07..8c0298fec3 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -27,15 +27,24 @@ }, "outputs": [], "source": [ - {{- if (eq .include_python "yes") }} + {{- if (eq .include_python "yes") }} "import sys\n", "sys.path.append('../src')\n", "from {{.project_name}} import main\n", "\n", - "main.get_taxis(spark).show(10)" - {{else}} - "spark.range(10)" - {{end -}} + "catalog = dbutils.widgets.get('catalog')\n", + "schema = dbutils.widgets.get('schema')\n", + "spark.sql(f'USE {catalog}.{schema}')\n", + "\n", + "spark.sql('SELECT * FROM example').show(10)" + {{- else}} + "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", + "catalog = dbutils.widgets.get('catalog')\n", + "schema = dbutils.widgets.get('schema')\n", + "spark.sql(f'USE {catalog}.{schema}')\n", + "\n", + "spark.sql('SELECT * FROM example').show(10)" + {{- end}} ] } ], @@ -46,8 +55,63 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "ipynb-notebook", - "widgets": {} + "notebookName": "exploration", + "widgets": { + "catalog": { + "currentValue": "{{.default_catalog}}", + "nuid": "c47e96d8-5751-4c8a-9d6b-5c6c7c3f1234", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "{{.default_catalog}}", + "label": null, + "name": "catalog", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "{{.default_catalog}}", + "label": null, + "name": "catalog", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } + }, +{{- $dev_schema := .shared_schema }} +{{- if (regexp "^yes").MatchString .personal_schemas}} + {{- $dev_schema = "{{short_name}}"}} +{{- end}} + "schema": { + "currentValue": "{{$dev_schema}}", + "nuid": "c47e96d8-5751-4c8a-9d6b-5c6c7c3f5678", + "typedWidgetInfo": { + "autoCreated": false, + "defaultValue": "{{$dev_schema}}", + "label": null, + "name": "schema", + "options": { + "widgetDisplayType": "Text", + "validationRegex": null + }, + "parameterDataType": "String" + }, + "widgetInfo": { + "widgetType": "text", + "defaultValue": "{{$dev_schema}}", + "label": null, + "name": "schema", + "options": { + "widgetType": "text", + "autoCreated": null, + "validationRegex": null + } + } }, "kernelspec": { "display_name": "Python 3", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index 45af6fa86e..e79920a9e2 100644 --- 
a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -27,8 +27,8 @@ def main(): # In the default template, these parameters are set # using the 'catalog' and 'schema' presets in databricks.yml. parser = argparse.ArgumentParser() - parser.add_argument('--catalog', '-c', required=True) - parser.add_argument('--schema', '-s', required=True) + parser.add_argument('--catalog', required=True) + parser.add_argument('--schema', required=True) args, unknown = parser.parse_known_args() spark = get_spark() spark.sql(f"USE {args.catalog}.{args.schema}") From 2ab497dc5c93d81a8c8bb52ed6e0e6a09da98f8a Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sun, 8 Dec 2024 20:44:33 +0100 Subject: [PATCH 07/27] Tweak templates --- .../templates/dbt-sql/databricks_template_schema.json | 2 +- .../default-python/databricks_template_schema.json | 2 +- .../template/{{.project_name}}/databricks.yml.tmpl | 8 +++----- .../templates/default-sql/databricks_template_schema.json | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/libs/template/templates/dbt-sql/databricks_template_schema.json b/libs/template/templates/dbt-sql/databricks_template_schema.json index cccf145dc5..bb512153f4 100644 --- a/libs/template/templates/dbt-sql/databricks_template_schema.json +++ b/libs/template/templates/dbt-sql/databricks_template_schema.json @@ -45,7 +45,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "description": "\nPlease provide a default schema during development.\ndefault_schema", "order": 5 } }, diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index 92e8316df5..461aaa0c46 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -59,7 +59,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "description": "\nPlease provide default schema during development.\ndefault_schema", "order": 7 } }, diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index 2a0bdee098..421fe5014f 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -33,11 +33,9 @@ targets: host: {{workspace_host}} # We explicitly specify /Workspace/Users/{{user_name}} to make sure we only have a single copy. 
root_path: /Workspace/Users/{{user_name}}/.bundle/${bundle.name}/${bundle.target} - presets: - catalog: {{.default_catalog}} - schema: {{$prod_schema}} permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE - run_as: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} + presets: + catalog: {{.default_catalog}} + schema: {{$prod_schema}} diff --git a/libs/template/templates/default-sql/databricks_template_schema.json b/libs/template/templates/default-sql/databricks_template_schema.json index 113cbef642..81fa7e2d22 100644 --- a/libs/template/templates/default-sql/databricks_template_schema.json +++ b/libs/template/templates/default-sql/databricks_template_schema.json @@ -45,7 +45,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide an initial schema during development.\ndefault_schema", + "description": "\nPlease provide a default schema during development.\ndefault_schema", "order": 5 } }, From b6fcc1d1dbe97574e0b585346ced42dde076f980 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Mon, 9 Dec 2024 09:41:30 +0100 Subject: [PATCH 08/27] Cleanup --- bundle/config/mutator/apply_presets.go | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 39d979992a..86d61837e8 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -430,7 +430,7 @@ func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key stri var fix string if t.NotebookTask != nil { relPath = t.NotebookTask.NotebookPath - expected = `" dbutils.widgets.text(['"]schema|` + + expected = `dbutils.widgets.text\(['"]schema|` + `USE[^)]+schema` fix = " dbutils.widgets.text('catalog')\n" + " dbutils.widgets.text('schema')\n" + @@ -462,17 +462,12 @@ func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key stri } localPath, _, err := GetLocalPath(ctx, b, sourceDir, relPath) - if err != nil { - // Any path errors are reported by another mutator - continue - } - if localPath == "" { - // If there is no local copy we don't want to download it and skip this check + if err != nil || localPath == "" { + // We ignore errors (they're reported by another mutator) + // and ignore empty local paths (which means we'd have to download the file) continue } - log.Warnf(ctx, "LocalPath: %s, relPath: %s, sourceDir: %s", localPath, relPath, sourceDir) - if !fileIncludesPattern(ctx, localPath, expected) { diags = diags.Extend(diag.Diagnostics{{ Summary: fmt.Sprintf("Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + From d4977c69198fbfbcb30b0771f86160a64ad5eaf5 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 10 Dec 2024 10:59:29 +0100 Subject: [PATCH 09/27] WIP test --- bundle/config/mutator/apply_presets_test.go | 137 +++++++++++++------- 1 file changed, 88 insertions(+), 49 deletions(-) diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index eb0eea5de3..265638d249 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -2,7 +2,9 @@ package mutator_test import ( "context" + "fmt" "runtime" + "strings" "testing" "github.com/databricks/cli/bundle" @@ -454,52 +456,89 @@ func TestApplyPresetsSourceLinkedDeployment(t *testing.T) { } 
-// func TestApplyPresetsRecommendsCatalogSchemaUsage(t *testing.T) { -// dir := t.TempDir() - -// ... - -// b := &bundle.Bundle{ -// Config: config.Root{ -// Resources: config.Resources{ -// Jobs: map[string]*resources.Job{ -// "job1": { -// JobSettings: &jobs.JobSettings{ -// Tasks: []jobs.Task{ -// { -// NotebookTask: &jobs.NotebookTask{ -// NotebookPath: notebookPath, -// }, -// }, -// { -// SparkPythonTask: &jobs.SparkPythonTask{ -// PythonFile: pythonPath, -// }, -// }, -// { -// NotebookTask: &jobs.NotebookTask{ -// NotebookPath: "/Workspace/absolute/path/notebook", -// }, -// }, -// }, -// }, -// }, -// }, -// }, -// }, -// } - -// ctx := context.Background() -// diags := bundle.Apply(ctx, b, ApplyPresets()) -// require.Len(t, diags, 2) - -// // Check notebook diagnostic -// assert.Equal(t, notebookPath, diags[0].Locations[0].File) -// assert.Equal(t, 1, diags[0].Locations[0].Line) -// assert.Equal(t, 1, diags[0].Locations[0].Column) - -// // Check Python script diagnostic -// assert.Equal(t, pythonPath, diags[1].Locations[0].File) -// assert.Equal(t, 1, diags[1].Locations[0].Line) -// assert.Equal(t, 1, diags[1].Locations[0].Column) -// } +func TestApplyPresetsCatalogSchema(t *testing.T) { + // Create a bundle in a known mode, e.g. development or production doesn't matter much here. + b := mockBundle(config.Development) + // Set the catalog and schema in presets. + b.Config.Presets.Catalog = "my_catalog" + b.Config.Presets.Schema = "my_schema" + + ctx := context.Background() + diags := bundle.Apply(ctx, b, mutator.ApplyPresets()) + require.NoError(t, diags.Error()) + + // Verify that jobs got catalog/schema if they support it. + // For DBT tasks in jobs: + for _, job := range b.Config.Resources.Jobs { + if job.JobSettings != nil && job.Tasks != nil { + for _, task := range job.Tasks { + if task.DbtTask != nil { + require.Equal(t, "my_catalog", task.DbtTask.Catalog, "dbt catalog should be set") + require.Equal(t, "my_schema", task.DbtTask.Schema, "dbt schema should be set") + } + } + } + } + + // Pipelines: Catalog/Schema + for _, p := range b.Config.Resources.Pipelines { + if p.PipelineSpec != nil { + // pipeline catalog and schema + if p.Catalog == "" || p.Catalog == "hive_metastore" { + require.Equal(t, "my_catalog", p.Catalog, "pipeline catalog should be set") + } + require.Equal(t, "my_schema", p.Target, "pipeline schema (target) should be set") + } + } + + // Registered models: Catalog/Schema + for _, rm := range b.Config.Resources.RegisteredModels { + if rm.CreateRegisteredModelRequest != nil { + require.Equal(t, "my_catalog", rm.CatalogName, "registered model catalog should be set") + require.Equal(t, "my_schema", rm.SchemaName, "registered model schema should be set") + } + } + + // Quality monitors: If paused, we rewrite tableName to include catalog.schema. + // In our code, if paused, we prepend catalog/schema if tableName wasn't already fully qualified. + // Let's verify that: + for _, qm := range b.Config.Resources.QualityMonitors { + // If not fully qualified (3 parts), it should have been rewritten. + parts := strings.Split(qm.TableName, ".") + if len(parts) != 3 { + require.Equal(t, fmt.Sprintf("my_catalog.my_schema.%s", parts[0]), qm.TableName, "quality monitor tableName should include catalog and schema") + } + } + + // Schemas: If there's a schema preset, we might replace the schema name or catalog name. 
+ for _, s := range b.Config.Resources.Schemas { + if s.CreateSchema != nil { + // If catalog was empty before, now should be set: + require.Equal(t, "my_catalog", s.CatalogName, "schema catalog should be set") + // If schema was empty before, it should be set, but we did have "schema1", + // so let's verify that if schema had a name, prefix logic may apply: + // The code attempts to handle schema naming carefully. If t.Schema != "" and s.Name == "", + // s.Name is set to t.Schema. Since s.Name was originally "schema1", it should remain "schema1" with prefix applied. + // If you want to verify behavior, do so explicitly if changed code logic. + } + } + + // Model serving endpoints currently return a warning that they don't support catalog/schema presets. + // We can just verify that the warning is generated or that no fields were set since they are not supported. + // The ApplyPresets code emits a diag error if we attempt to use catalog/schema with model serving endpoints. + // Let's check that we got an error diagnostic: + // The code currently returns a diag error if model serving endpoints are present and catalog/schema are set. + // So we verify diags here: + foundEndpointError := false + for _, d := range diags { + if strings.Contains(d.Summary, "model serving endpoints are not supported with catalog/schema presets") { + foundEndpointError = true + break + } + } + require.True(t, foundEndpointError, "should have diag error for model serving endpoints") + + // Add assertions for any other resources that support catalog/schema if needed. + // This list is maintained manually. If you add new resource types that support catalog/schema, + // add them here as well. +} From a08b59d4dd21ab5b7303078a4510c6bdefa6a062 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Tue, 10 Dec 2024 11:06:56 +0100 Subject: [PATCH 10/27] Add comments --- .../template/{{.project_name}}/scratch/exploration.ipynb.tmpl | 1 + .../template/{{.project_name}}/src/notebook.ipynb.tmpl | 1 + 2 files changed, 2 insertions(+) diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl index 8c0298fec3..adb353c58c 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -32,6 +32,7 @@ "sys.path.append('../src')\n", "from {{.project_name}} import main\n", "\n", + {{- /* We can use the short form here without 'dbutils.text()' since the widgets are defined in the metadata below. */}} "catalog = dbutils.widgets.get('catalog')\n", "schema = dbutils.widgets.get('schema')\n", "spark.sql(f'USE {catalog}.{schema}')\n", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 4d3c398c91..0924e60f35 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -24,6 +24,7 @@ "outputs": [], "source": [ "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", + {{- /* We can use the short form here without 'dbutils.text()' since the widgets are defined in the metadata below. 
*/}} "catalog = dbutils.widgets.get('catalog')\n", "schema = dbutils.widgets.get('schema')\n", "spark.sql(f'USE {catalog}.{schema}')" From 7d1f8ad19b595fb3337d8c46fe12d92805fa3800 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Wed, 11 Dec 2024 11:35:11 +0100 Subject: [PATCH 11/27] Add TODO --- bundle/config/mutator/apply_presets.go | 1 + 1 file changed, 1 insertion(+) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 86d61837e8..c8d3ae26ac 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -122,6 +122,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.TriggerPauseStatus == config.Paused { p.Continuous = false } + // TODO: add recommendation when catalog is already set? if t.Catalog != "" && p.Catalog == "" && p.Catalog != "hive_metastore" { p.Catalog = t.Catalog } From 6b5948cef9ce59c97da4135aa2545453c6b96de3 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sat, 14 Dec 2024 20:39:10 +0100 Subject: [PATCH 12/27] Add serving end point support --- bundle/config/mutator/apply_presets.go | 58 ++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index c8d3ae26ac..3866ae57f2 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -190,16 +190,38 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos e.Name = normalizePrefix(prefix) + e.Name if t.Catalog != "" || t.Schema != "" { - // TODO: - // - e.AiGateway.InferenceTableConfig.CatalogName - // - e.AiGateway.InferenceTableConfig.SchemaName - // - e.Config.AutoCaptureConfig.SchemaName - // - e.Config.AutoCaptureConfig.CatalogName - // - e.Config.ServedEntities[0].EntityName (__catalog_name__.__schema_name__.__model_name__.) - // - e.Config.ServedModels[0].ModelName (__catalog_name__.__schema_name__.__model_name__.) 
- diags = diags.Extend(diag.Errorf("model serving endpoints are not supported with catalog/schema presets")) - } + // Apply catalog & schema to inference table config if not set + if e.CreateServingEndpoint.AiGateway != nil && e.CreateServingEndpoint.AiGateway.InferenceTableConfig != nil { + if t.Catalog != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName = t.Catalog + } + if t.Schema != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName = t.Schema + } + } + + // Apply catalog & schema to auto capture config if not set + if e.CreateServingEndpoint.Config.AutoCaptureConfig != nil { + if t.Catalog != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName = t.Catalog + } + if t.Schema != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName = t.Schema + } + } + // Fully qualify served entities and models if they are not already qualified + for i := range e.CreateServingEndpoint.Config.ServedEntities { + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, t.Catalog, t.Schema, + ) + } + for i := range e.CreateServingEndpoint.Config.ServedModels { + e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedModels[i].ModelName, t.Catalog, t.Schema, + ) + } + } } // Registered models presets @@ -484,6 +506,24 @@ func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key stri } return diags + +} + +// fullyQualifyName checks if the given name is already qualified with a catalog and schema. +// If not, and both catalog and schema are available, it prefixes the name with catalog.schema. +// If name is empty, returns name as-is. +func fullyQualifyName(name, catalog, schema string) string { + if name == "" || catalog == "" || schema == "" { + return name + } + // If it's already qualified (contains at least two '.'), we assume it's fully qualified. 
+ parts := strings.Split(name, ".") + if len(parts) >= 3 { + // Already fully qualified + return name + } + // Otherwise, fully qualify it + return fmt.Sprintf("%s.%s.%s", catalog, schema, name) } func fileIncludesPattern(ctx context.Context, filePath string, expected string) bool { From 35bed3f549529494ed451c6351b9f863bef9a4bf Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sat, 14 Dec 2024 20:39:22 +0100 Subject: [PATCH 13/27] Add experimental test --- bundle/config/mutator/apply_presets_test.go | 411 +++++++++++++++++--- 1 file changed, 348 insertions(+), 63 deletions(-) diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index 265638d249..fd99dc212d 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -3,7 +3,9 @@ package mutator_test import ( "context" "fmt" + "reflect" "runtime" + "strconv" "strings" "testing" @@ -15,10 +17,20 @@ import ( "github.com/databricks/cli/libs/dbr" "github.com/databricks/cli/libs/dyn" "github.com/databricks/databricks-sdk-go/service/catalog" + "github.com/databricks/databricks-sdk-go/service/compute" + "github.com/databricks/databricks-sdk-go/service/dashboards" "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/databricks/databricks-sdk-go/service/ml" + "github.com/databricks/databricks-sdk-go/service/pipelines" + "github.com/databricks/databricks-sdk-go/service/serving" "github.com/stretchr/testify/require" ) +type RecordedField struct { + Path string + Value string +} + func TestApplyPresetsPrefix(t *testing.T) { tests := []struct { name string @@ -457,88 +469,361 @@ func TestApplyPresetsSourceLinkedDeployment(t *testing.T) { } func TestApplyPresetsCatalogSchema(t *testing.T) { - // Create a bundle in a known mode, e.g. development or production doesn't matter much here. - b := mockBundle(config.Development) - // Set the catalog and schema in presets. 
-	b.Config.Presets.Catalog = "my_catalog"
-	b.Config.Presets.Schema = "my_schema"
+	b := &bundle.Bundle{
+		Config: config.Root{
+			Resources: config.Resources{
+				Jobs: map[string]*resources.Job{
+					"object": {
+						JobSettings: &jobs.JobSettings{
+							Name: "job",
+							Parameters: []jobs.JobParameterDefinition{
+								{Name: "catalog", Default: "<catalog>"},
+								{Name: "schema", Default: "<schema>"},
+							},
+						},
+					},
+				},
+				Pipelines: map[string]*resources.Pipeline{
+					"object": {
+						PipelineSpec: &pipelines.PipelineSpec{
+							Name: "pipeline",
+							Catalog: "<catalog>",
+							Target: "<schema>",
+						},
+					},
+				},
+				ModelServingEndpoints: map[string]*resources.ModelServingEndpoint{
+					"object": {
+						CreateServingEndpoint: &serving.CreateServingEndpoint{
+							Name: "serving",
+							AiGateway: &serving.AiGatewayConfig{
+								InferenceTableConfig: &serving.AiGatewayInferenceTableConfig{
+									CatalogName: "<catalog>",
+									SchemaName: "<schema>",
+								},
+							},
+							Config: serving.EndpointCoreConfigInput{
+								AutoCaptureConfig: &serving.AutoCaptureConfigInput{
+									CatalogName: "<catalog>",
+									SchemaName: "<schema>",
+								},
+								ServedEntities: []serving.ServedEntityInput{
+									{EntityName: "<catalog>.<schema>.entity"},
+								},
+								ServedModels: []serving.ServedModelInput{
+									{ModelName: "<catalog>.<schema>.model"},
+								},
+							},
+						},
+					},
+				},
+				RegisteredModels: map[string]*resources.RegisteredModel{
+					"object": {
+						CreateRegisteredModelRequest: &catalog.CreateRegisteredModelRequest{
+							Name: "registered_model",
+							CatalogName: "<catalog>",
+							SchemaName: "<schema>",
+						},
+					},
+				},
+				QualityMonitors: map[string]*resources.QualityMonitor{
+					"object": {
+						TableName: "table",
+						CreateMonitor: &catalog.CreateMonitor{
+							OutputSchemaName: "<catalog>.<schema>",
+						},
+					},
+				},
+				Schemas: map[string]*resources.Schema{
+					"object": {
+						CreateSchema: &catalog.CreateSchema{
+							Name: "<schema>",
+							CatalogName: "<catalog>",
+						},
+					},
+				},
+				Models: map[string]*resources.MlflowModel{
+					"object": {
+						Model: &ml.Model{
+							Name: "<catalog>.<schema>.model",
+						},
+					},
+				},
+				Experiments: map[string]*resources.MlflowExperiment{
+					"object": {
+						Experiment: &ml.Experiment{
+							Name: "<catalog>.<schema>.experiment",
+						},
+					},
+				},
+				Clusters: map[string]*resources.Cluster{
+					"object": {
+						ClusterSpec: &compute.ClusterSpec{
+							ClusterName: "cluster",
+						},
+					},
+				},
+				Dashboards: map[string]*resources.Dashboard{
+					"object": {
+						Dashboard: &dashboards.Dashboard{
+							DisplayName: "dashboard",
+						},
+					},
+				},
+			},
+		},
+	}
+	b.Config.Presets = config.Presets{
+		Catalog: "my_catalog",
+		Schema: "my_schema",
+	}
+
+	// Stage 1: Apply presets BEFORE cleanup.
+	// Because all fields are already set to placeholders, Apply should NOT overwrite them (no-op).
 	ctx := context.Background()
 	diags := bundle.Apply(ctx, b, mutator.ApplyPresets())
-	require.NoError(t, diags.Error())
-
-	// Verify that jobs got catalog/schema if they support it.
-	// For DBT tasks in jobs:
-	for _, job := range b.Config.Resources.Jobs {
-		if job.JobSettings != nil && job.Tasks != nil {
-			for _, task := range job.Tasks {
-				if task.DbtTask != nil {
-					require.Equal(t, "my_catalog", task.DbtTask.Catalog, "dbt catalog should be set")
-					require.Equal(t, "my_schema", task.DbtTask.Schema, "dbt schema should be set")
+	require.False(t, diags.HasError(), "unexpected error before cleanup: %v", diags.Error())
+	verifyNoChangesBeforeCleanup(t, b.Config)
+
+	// Stage 2: Cleanup all "<catalog>" and "<schema>" placeholders
+	// and record where they were.
+	b.Config.MarkMutatorEntry(ctx)
+	resources := reflect.ValueOf(&b.Config.Resources).Elem()
+	recordedFields := recordAndCleanupFields(resources, "Resources")
+	b.Config.Resources.Jobs["object"].Parameters = nil
+	b.Config.MarkMutatorExit(ctx)
+
+	// Stage 3: Apply presets after cleanup.
+ diags = bundle.Apply(ctx, b, mutator.ApplyPresets()) + require.False(t, diags.HasError(), "unexpected error after cleanup: %v", diags.Error()) + verifyAllFields(t, b.Config, recordedFields) + + // Stage 4: Verify that all known fields in config.Resources have been processed. + checkCompleteness(t, &b.Config.Resources, "Resources", recordedFields) +} + +func verifyNoChangesBeforeCleanup(t *testing.T, cfg config.Root) { + t.Helper() + + // Just check a few representative fields to ensure they are still placeholders. + // For example: Job parameter defaults should still have "" and "" + jobParams := cfg.Resources.Jobs["object"].Parameters + require.Len(t, jobParams, 2, "job parameters count mismatch") + require.Equal(t, "", jobParams[0].Default, "expected no changes before cleanup") + require.Equal(t, "", jobParams[1].Default, "expected no changes before cleanup") + + pipeline := cfg.Resources.Pipelines["object"] + require.Equal(t, "", pipeline.Catalog, "expected no changes before cleanup") + require.Equal(t, "", pipeline.Target, "expected no changes before cleanup") +} + +// recordAndCleanupFields recursively finds all Catalog/CatalogName/Schema/SchemaName fields, +// records their original values, and replaces them with empty strings. +func recordAndCleanupFields(rv reflect.Value, path string) []RecordedField { + var recordedFields []RecordedField + + switch rv.Kind() { + case reflect.Ptr, reflect.Interface: + if !rv.IsNil() { + recordedFields = append(recordedFields, recordAndCleanupFields(rv.Elem(), path)...) + } + + case reflect.Struct: + tp := rv.Type() + for i := 0; i < rv.NumField(); i++ { + ft := tp.Field(i) + fv := rv.Field(i) + fPath := path + "." + ft.Name + + if fv.Kind() == reflect.String { + original := fv.String() + newVal := cleanedValue(original) + if newVal != original { + fv.SetString(newVal) + recordedFields = append(recordedFields, RecordedField{fPath, original}) } } + + recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(fv, fPath)...) + } + + case reflect.Map: + for _, mk := range rv.MapKeys() { + mVal := rv.MapIndex(mk) + recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(mVal, path+"."+mk.String())...) + } + + case reflect.Slice, reflect.Array: + for i := 0; i < rv.Len(); i++ { + recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(rv.Index(i), fmt.Sprintf("%s[%d]", path, i))...) } } - // Pipelines: Catalog/Schema - for _, p := range b.Config.Resources.Pipelines { - if p.PipelineSpec != nil { - // pipeline catalog and schema - if p.Catalog == "" || p.Catalog == "hive_metastore" { - require.Equal(t, "my_catalog", p.Catalog, "pipeline catalog should be set") + return recordedFields +} + +// verifyAllFields checks if all collected fields are now properly replaced after ApplyPresets. +func verifyAllFields(t *testing.T, cfg config.Root, recordedFields []RecordedField) { + t.Helper() + for _, f := range recordedFields { + expected := replaceCatalogSchemaPlaceholders(f.Value) + got := getStringValueAtPath(t, reflect.ValueOf(cfg), f.Path) + require.Equal(t, expected, got, "expected catalog/schema to be replaced by preset values at %s", f.Path) + } +} + +// checkCompleteness ensures that all catalog/schema fields have been processed. 
+func checkCompleteness(t *testing.T, root interface{}, rootPath string, recordedFields []RecordedField) { + t.Helper() + recordedSet := make(map[string]bool) + for _, f := range recordedFields { + recordedSet[f.Path] = true + } + + var check func(rv reflect.Value, path string) + check = func(rv reflect.Value, path string) { + switch rv.Kind() { + case reflect.Ptr, reflect.Interface: + if !rv.IsNil() { + check(rv.Elem(), path) + } + case reflect.Struct: + tp := rv.Type() + for i := 0; i < rv.NumField(); i++ { + ft := tp.Field(i) + fv := rv.Field(i) + fPath := path + "." + ft.Name + if isCatalogOrSchemaField(ft.Name) { + require.Truef(t, recordedSet[fPath], + "Field %s was not recorded in recordedFields (completeness check failed)", fPath) + } + check(fv, fPath) + } + case reflect.Map: + for _, mk := range rv.MapKeys() { + mVal := rv.MapIndex(mk) + check(mVal, path+"."+mk.String()) + } + case reflect.Slice, reflect.Array: + for i := 0; i < rv.Len(); i++ { + check(rv.Index(i), fmt.Sprintf("%s[%d]", path, i)) } - require.Equal(t, "my_schema", p.Target, "pipeline schema (target) should be set") } } - // Registered models: Catalog/Schema - for _, rm := range b.Config.Resources.RegisteredModels { - if rm.CreateRegisteredModelRequest != nil { - require.Equal(t, "my_catalog", rm.CatalogName, "registered model catalog should be set") - require.Equal(t, "my_schema", rm.SchemaName, "registered model schema should be set") - } + rv := reflect.ValueOf(root) + if rv.Kind() == reflect.Ptr { + rv = rv.Elem() } + check(rv, rootPath) +} - // Quality monitors: If paused, we rewrite tableName to include catalog.schema. - // In our code, if paused, we prepend catalog/schema if tableName wasn't already fully qualified. - // Let's verify that: - for _, qm := range b.Config.Resources.QualityMonitors { - // If not fully qualified (3 parts), it should have been rewritten. - parts := strings.Split(qm.TableName, ".") - if len(parts) != 3 { - require.Equal(t, fmt.Sprintf("my_catalog.my_schema.%s", parts[0]), qm.TableName, "quality monitor tableName should include catalog and schema") - } +// getStringValueAtPath navigates the given path and returns the string value at that path. +func getStringValueAtPath(t *testing.T, root reflect.Value, path string) string { + t.Helper() + parts := strings.Split(path, ".") + return navigatePath(t, root, parts) +} + +func navigatePath(t *testing.T, rv reflect.Value, parts []string) string { + t.Helper() + + // Trim empty parts if any + for len(parts) > 0 && parts[0] == "" { + parts = parts[1:] } - // Schemas: If there's a schema preset, we might replace the schema name or catalog name. - for _, s := range b.Config.Resources.Schemas { - if s.CreateSchema != nil { - // If catalog was empty before, now should be set: - require.Equal(t, "my_catalog", s.CatalogName, "schema catalog should be set") - // If schema was empty before, it should be set, but we did have "schema1", - // so let's verify that if schema had a name, prefix logic may apply: - // The code attempts to handle schema naming carefully. If t.Schema != "" and s.Name == "", - // s.Name is set to t.Schema. Since s.Name was originally "schema1", it should remain "schema1" with prefix applied. - // If you want to verify behavior, do so explicitly if changed code logic. 
+ for len(parts) > 0 { + part := parts[0] + parts = parts[1:] + + // Dereference pointers/interfaces before proceeding + for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { + require.Falsef(t, rv.IsNil(), "nil pointer or interface encountered at part '%s'", part) + rv = rv.Elem() } - } - // Model serving endpoints currently return a warning that they don't support catalog/schema presets. - // We can just verify that the warning is generated or that no fields were set since they are not supported. - // The ApplyPresets code emits a diag error if we attempt to use catalog/schema with model serving endpoints. - // Let's check that we got an error diagnostic: - // The code currently returns a diag error if model serving endpoints are present and catalog/schema are set. - // So we verify diags here: - foundEndpointError := false - for _, d := range diags { - if strings.Contains(d.Summary, "model serving endpoints are not supported with catalog/schema presets") { - foundEndpointError = true - break + // If the part has indexing like "Parameters[0]", split it into "Parameters" and "[0]" + var indexPart string + fieldName := part + if idx := strings.IndexRune(part, '['); idx != -1 { + // e.g. part = "Parameters[0]" + fieldName = part[:idx] // "Parameters" + indexPart = part[idx:] // "[0]" + require.Truef(t, strings.HasPrefix(indexPart, "["), "expected '[' in indexing") + require.Truef(t, strings.HasSuffix(indexPart, "]"), "expected ']' at end of indexing") } + + // Navigate down structures/maps + switch rv.Kind() { + case reflect.Struct: + // Find the struct field by name + ft, ok := rv.Type().FieldByName(fieldName) + if !ok { + t.Fatalf("Could not find field '%s' in struct at path", fieldName) + } + rv = rv.FieldByIndex(ft.Index) + + case reflect.Map: + // Use fieldName as map key + mapVal := rv.MapIndex(reflect.ValueOf(fieldName)) + require.Truef(t, mapVal.IsValid(), "no map entry '%s' found in path", fieldName) + rv = mapVal + + default: + // If we're here, maybe we expected a struct or map but got something else + t.Fatalf("Unexpected kind '%s' when looking for '%s'", rv.Kind(), fieldName) + } + + // If there's an index part, apply it now + if indexPart != "" { + // Dereference again if needed + for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { + require.False(t, rv.IsNil(), "nil pointer or interface when indexing") + rv = rv.Elem() + } + + require.Truef(t, rv.Kind() == reflect.Slice || rv.Kind() == reflect.Array, "expected slice/array for indexing but got %s", rv.Kind()) + + idxStr := indexPart[1 : len(indexPart)-1] // remove [ and ] + idx, err := strconv.Atoi(idxStr) + require.NoError(t, err, "invalid slice index %s", indexPart) + + require.Truef(t, idx < rv.Len(), "index %d out of range in slice/array of length %d", idx, rv.Len()) + rv = rv.Index(idx) + } + } + + // Dereference if needed at the leaf + for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { + require.False(t, rv.IsNil(), "nil pointer or interface at leaf") + rv = rv.Elem() + } + + require.Equal(t, reflect.String, rv.Kind(), "expected a string at the final path") + return rv.String() +} + +func isCatalogOrSchemaField(name string) bool { + switch name { + case "Catalog", "CatalogName", "Schema", "SchemaName", "Target": + return true + default: + return false } - require.True(t, foundEndpointError, "should have diag error for model serving endpoints") +} + +func cleanedValue(value string) string { + value = strings.ReplaceAll(value, ".", "") + value = strings.ReplaceAll(value, ".", "") + 
value = strings.ReplaceAll(value, "<catalog>", "")
+	value = strings.ReplaceAll(value, "<schema>", "")
+	return value
+}
-
-	// Add assertions for any other resources that support catalog/schema if needed.
-	// This list is maintained manually. If you add new resource types that support catalog/schema,
-	// add them here as well.
+// replaceCatalogSchemaPlaceholders replaces placeholders with the final expected values.
+func replaceCatalogSchemaPlaceholders(value string) string {
+	value = strings.ReplaceAll(value, "<catalog>", "my_catalog")
+	value = strings.ReplaceAll(value, "<schema>", "my_schema")
+	return value
 }
From 3e1e169225da28299e7c1deae4f818511bc80823 Mon Sep 17 00:00:00 2001
From: Lennart Kats
Date: Mon, 16 Dec 2024 10:33:48 +0100
Subject: [PATCH 14/27] Add reflection/dyn-based tests

---
 bundle/config/mutator/apply_presets.go      |  28 +-
 bundle/config/mutator/apply_presets_test.go | 401 ++++++++------------
 2 files changed, 174 insertions(+), 255 deletions(-)

diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go
index 3866ae57f2..7cfbfd8513 100644
--- a/bundle/config/mutator/apply_presets.go
+++ b/bundle/config/mutator/apply_presets.go
@@ -244,23 +244,23 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos
 	// Quality monitors presets
 	// Supported: Schedule, Catalog, Schema
 	// Not supported: Tags (not in API as of 2024-10)
-	if t.TriggerPauseStatus == config.Paused {
-		for key, q := range r.QualityMonitors {
-			if q.CreateMonitor == nil {
-				diags = diags.Extend(diag.Errorf("quality monitor %s is not defined", key))
-				continue
-			}
-			// Remove all schedules from monitors, since they don't support pausing/unpausing.
-			// Quality monitors might support the "pause" property in the future, so at the
-			// CLI level we do respect that property if it is set to "unpaused."
+	for key, q := range r.QualityMonitors {
+		if q.CreateMonitor == nil {
+			diags = diags.Extend(diag.Errorf("quality monitor %s is not defined", key))
+			continue
+		}
+		// Remove all schedules from monitors, since they don't support pausing/unpausing.
+		// Quality monitors might support the "pause" property in the future, so at the
+		// CLI level we do respect that property if it is set to "unpaused."
+		if t.TriggerPauseStatus == config.Paused {
 			if q.Schedule != nil && q.Schedule.PauseStatus != catalog.MonitorCronSchedulePauseStatusUnpaused {
 				q.Schedule = nil
 			}
+		}
+		if t.Catalog != "" && t.Schema != "" {
+			q.TableName = fullyQualifyName(q.TableName, t.Catalog, t.Schema)
+			if q.OutputSchemaName == "" {
+				q.OutputSchemaName = t.Catalog + "."
+ t.Schema } } } diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index fd99dc212d..5a05115c4b 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -2,10 +2,8 @@ package mutator_test import ( "context" - "fmt" "reflect" "runtime" - "strconv" "strings" "testing" @@ -17,18 +15,17 @@ import ( "github.com/databricks/cli/libs/dbr" "github.com/databricks/cli/libs/dyn" "github.com/databricks/databricks-sdk-go/service/catalog" - "github.com/databricks/databricks-sdk-go/service/compute" - "github.com/databricks/databricks-sdk-go/service/dashboards" "github.com/databricks/databricks-sdk-go/service/jobs" - "github.com/databricks/databricks-sdk-go/service/ml" "github.com/databricks/databricks-sdk-go/service/pipelines" "github.com/databricks/databricks-sdk-go/service/serving" "github.com/stretchr/testify/require" ) -type RecordedField struct { - Path string - Value string +type recordedField struct { + Path dyn.Path + PathString string + Placeholder string + Expected string } func TestApplyPresetsPrefix(t *testing.T) { @@ -473,7 +470,7 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { Config: config.Root{ Resources: config.Resources{ Jobs: map[string]*resources.Job{ - "object": { + "key": { JobSettings: &jobs.JobSettings{ Name: "job", Parameters: []jobs.JobParameterDefinition{ @@ -484,7 +481,7 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { }, }, Pipelines: map[string]*resources.Pipeline{ - "object": { + "key": { PipelineSpec: &pipelines.PipelineSpec{ Name: "pipeline", Catalog: "", @@ -493,7 +490,7 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { }, }, ModelServingEndpoints: map[string]*resources.ModelServingEndpoint{ - "object": { + "key": { CreateServingEndpoint: &serving.CreateServingEndpoint{ Name: "serving", AiGateway: &serving.AiGatewayConfig{ @@ -518,7 +515,7 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { }, }, RegisteredModels: map[string]*resources.RegisteredModel{ - "object": { + "key": { CreateRegisteredModelRequest: &catalog.CreateRegisteredModelRequest{ Name: "registered_model", CatalogName: "", @@ -527,49 +524,21 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { }, }, QualityMonitors: map[string]*resources.QualityMonitor{ - "object": { - TableName: "table", + "key": { + TableName: "..table", CreateMonitor: &catalog.CreateMonitor{ OutputSchemaName: ".", }, }, }, Schemas: map[string]*resources.Schema{ - "object": { + "key": { CreateSchema: &catalog.CreateSchema{ Name: "", CatalogName: "", }, }, }, - Models: map[string]*resources.MlflowModel{ - "object": { - Model: &ml.Model{ - Name: "..model", - }, - }, - }, - Experiments: map[string]*resources.MlflowExperiment{ - "object": { - Experiment: &ml.Experiment{ - Name: "..experiment", - }, - }, - }, - Clusters: map[string]*resources.Cluster{ - "object": { - ClusterSpec: &compute.ClusterSpec{ - ClusterName: "cluster", - }, - }, - }, - Dashboards: map[string]*resources.Dashboard{ - "object": { - Dashboard: &dashboards.Dashboard{ - DisplayName: "dashboard", - }, - }, - }, }, }, } @@ -577,243 +546,194 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { Catalog: "my_catalog", Schema: "my_schema", } - - // Stage 1: Apply presets BEFORE cleanup. - // Because all fields are already set to placeholders, Apply should NOT overwrite them (no-op). 
ctx := context.Background() - diags := bundle.Apply(ctx, b, mutator.ApplyPresets()) - require.False(t, diags.HasError(), "unexpected error before cleanup: %v", diags.Error()) - verifyNoChangesBeforeCleanup(t, b.Config) - - // Stage 2: Cleanup all "" and "" placeholders - // and record where they were. - b.Config.MarkMutatorEntry(ctx) - resources := reflect.ValueOf(&b.Config.Resources).Elem() - recordedFields := recordAndCleanupFields(resources, "Resources") - b.Config.Resources.Jobs["object"].Parameters = nil - b.Config.MarkMutatorExit(ctx) - - // Stage 3: Apply presets after cleanup. - diags = bundle.Apply(ctx, b, mutator.ApplyPresets()) - require.False(t, diags.HasError(), "unexpected error after cleanup: %v", diags.Error()) - verifyAllFields(t, b.Config, recordedFields) - - // Stage 4: Verify that all known fields in config.Resources have been processed. - checkCompleteness(t, &b.Config.Resources, "Resources", recordedFields) -} - -func verifyNoChangesBeforeCleanup(t *testing.T, cfg config.Root) { - t.Helper() - - // Just check a few representative fields to ensure they are still placeholders. - // For example: Job parameter defaults should still have "" and "" - jobParams := cfg.Resources.Jobs["object"].Parameters - require.Len(t, jobParams, 2, "job parameters count mismatch") - require.Equal(t, "", jobParams[0].Default, "expected no changes before cleanup") - require.Equal(t, "", jobParams[1].Default, "expected no changes before cleanup") - pipeline := cfg.Resources.Pipelines["object"] - require.Equal(t, "", pipeline.Catalog, "expected no changes before cleanup") - require.Equal(t, "", pipeline.Target, "expected no changes before cleanup") -} + // Initial scan: record all fields that contain placeholders. + // We do this before the first apply so we can verify no changes occur. + var recordedFields []recordedField + require.NoError(t, b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { + _, err := dyn.Walk(b.Config.Value(), func(p dyn.Path, v dyn.Value) (dyn.Value, error) { + if v.Kind() == dyn.KindString { + val := v.MustString() + if strings.Contains(val, "") || strings.Contains(val, "") { + pathCopy := make(dyn.Path, len(p)) + copy(pathCopy, p) + recordedFields = append(recordedFields, recordedField{ + Path: pathCopy, + PathString: pathCopy.String(), + Placeholder: val, + Expected: replacePlaceholders(val, "my_catalog", "my_schema"), + }) + } + } + return v, nil + }) + return root, err + })) -// recordAndCleanupFields recursively finds all Catalog/CatalogName/Schema/SchemaName fields, -// records their original values, and replaces them with empty strings. -func recordAndCleanupFields(rv reflect.Value, path string) []RecordedField { - var recordedFields []RecordedField + // Stage 1: Apply presets before cleanup, should be no-op. + diags := bundle.Apply(ctx, b, mutator.ApplyPresets()) + require.False(t, diags.HasError(), "unexpected error before cleanup: %v", diags.Error()) - switch rv.Kind() { - case reflect.Ptr, reflect.Interface: - if !rv.IsNil() { - recordedFields = append(recordedFields, recordAndCleanupFields(rv.Elem(), path)...) - } + // Verify that no recorded fields changed + verifyNoChangesBeforeCleanup(t, b.Config.Value(), recordedFields) - case reflect.Struct: - tp := rv.Type() - for i := 0; i < rv.NumField(); i++ { - ft := tp.Field(i) - fv := rv.Field(i) - fPath := path + "." 
+ ft.Name - - if fv.Kind() == reflect.String { - original := fv.String() - newVal := cleanedValue(original) - if newVal != original { - fv.SetString(newVal) - recordedFields = append(recordedFields, RecordedField{fPath, original}) - } - } + // Stage 2: Cleanup: Walk over rootVal and remove placeholders, adjusting recordedFields Expected values. + require.NoError(t, b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { + for _, f := range recordedFields { + value, err := dyn.GetByPath(root, f.Path) + require.NoError(t, err) - recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(fv, fPath)...) + val := value.MustString() + cleanedVal := removePlaceholders(val) + root, err = dyn.SetByPath(root, f.Path, dyn.V(cleanedVal)) + require.NoError(t, err) } + root, err := dyn.Set(root, "resources.jobs.key.parameters", dyn.NilValue) + require.NoError(t, err) + return root, nil + })) - case reflect.Map: - for _, mk := range rv.MapKeys() { - mVal := rv.MapIndex(mk) - recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(mVal, path+"."+mk.String())...) - } + // Stage 3: Apply presets after cleanup. + diags = bundle.Apply(ctx, b, mutator.ApplyPresets()) + require.False(t, diags.HasError(), "unexpected error after cleanup: %v", diags.Error()) - case reflect.Slice, reflect.Array: - for i := 0; i < rv.Len(); i++ { - recordedFields = append(recordedFields, recordAndCleanupFieldsRecursive(rv.Index(i), fmt.Sprintf("%s[%d]", path, i))...) - } + // Verify that fields have the expected replacements + config := b.Config.Value() + for _, f := range recordedFields { + val, err := dyn.GetByPath(config, f.Path) + require.NoError(t, err, "failed to get path %s", f.Path) + require.Equal(t, f.Expected, val.MustString(), "preset value expected for %s based on placeholder %s", f.Path, f.Placeholder) } - return recordedFields + // Stage 4: Check completeness + ignoredFields := map[string]string{ + // Any fields that should be ignored in the completeness check + // Example: + // "resources.jobs.object.schema_something": "this property doesn't relate to the catalog/schema", + } + checkCompleteness(t, recordedFields, ignoredFields) } -// verifyAllFields checks if all collected fields are now properly replaced after ApplyPresets. -func verifyAllFields(t *testing.T, cfg config.Root, recordedFields []RecordedField) { +func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedFields []recordedField) { t.Helper() + for _, f := range recordedFields { - expected := replaceCatalogSchemaPlaceholders(f.Value) - got := getStringValueAtPath(t, reflect.ValueOf(cfg), f.Path) - require.Equal(t, expected, got, "expected catalog/schema to be replaced by preset values at %s", f.Path) + val, err := dyn.GetByPath(rootVal, f.Path) + require.NoError(t, err, "failed to get path %s", f.Path) + require.Equal(t, f.Placeholder, val.MustString(), + "expected placeholder '%s' at %s to remain unchanged before cleanup", f.Placeholder, f.Path) } } -// checkCompleteness ensures that all catalog/schema fields have been processed. 
-func checkCompleteness(t *testing.T, root interface{}, rootPath string, recordedFields []RecordedField) { +func checkCompleteness(t *testing.T, recordedFields []recordedField, ignoredFields map[string]string) { t.Helper() - recordedSet := make(map[string]bool) - for _, f := range recordedFields { - recordedSet[f.Path] = true - } - - var check func(rv reflect.Value, path string) - check = func(rv reflect.Value, path string) { - switch rv.Kind() { - case reflect.Ptr, reflect.Interface: - if !rv.IsNil() { - check(rv.Elem(), path) - } - case reflect.Struct: - tp := rv.Type() - for i := 0; i < rv.NumField(); i++ { - ft := tp.Field(i) - fv := rv.Field(i) - fPath := path + "." + ft.Name - if isCatalogOrSchemaField(ft.Name) { - require.Truef(t, recordedSet[fPath], - "Field %s was not recorded in recordedFields (completeness check failed)", fPath) - } - check(fv, fPath) - } - case reflect.Map: - for _, mk := range rv.MapKeys() { - mVal := rv.MapIndex(mk) - check(mVal, path+"."+mk.String()) - } - case reflect.Slice, reflect.Array: - for i := 0; i < rv.Len(); i++ { - check(rv.Index(i), fmt.Sprintf("%s[%d]", path, i)) - } - } - } - rv := reflect.ValueOf(root) - if rv.Kind() == reflect.Ptr { - rv = rv.Elem() + // Build a set for recorded fields + recordedSet := make(map[string]struct{}) + for _, field := range recordedFields { + recordedSet[field.PathString] = struct{}{} } - check(rv, rootPath) -} -// getStringValueAtPath navigates the given path and returns the string value at that path. -func getStringValueAtPath(t *testing.T, root reflect.Value, path string) string { - t.Helper() - parts := strings.Split(path, ".") - return navigatePath(t, root, parts) -} + // Obtain the type of config.Resources + var r config.Resources + resourcesType := reflect.TypeOf(r) -func navigatePath(t *testing.T, rv reflect.Value, parts []string) string { - t.Helper() + // Track missing fields + var missingFields []string - // Trim empty parts if any - for len(parts) > 0 && parts[0] == "" { - parts = parts[1:] - } + // Keep track of visited types to prevent infinite loops (cycles) + visited := make(map[reflect.Type]struct{}) - for len(parts) > 0 { - part := parts[0] - parts = parts[1:] - - // Dereference pointers/interfaces before proceeding - for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { - require.Falsef(t, rv.IsNil(), "nil pointer or interface encountered at part '%s'", part) - rv = rv.Elem() + // Helper function to handle maps, slices, arrays, and nested pointers/interfaces + verifyFieldType := func(fieldType reflect.Type, path string, fn func(reflect.Type, string)) { + switch fieldType.Kind() { + case reflect.Slice, reflect.Array: + // For arrays/slices, inspect the element type + fn(fieldType.Elem(), path+"[0]") + case reflect.Map: + // For maps, inspect the value type + fn(fieldType.Elem(), path+".key") + case reflect.Ptr, reflect.Interface: + // For pointers/interfaces, inspect the element if it's a pointer + if fieldType.Kind() == reflect.Ptr { + fn(fieldType.Elem(), path) + } + case reflect.Struct: + // For structs, directly recurse into their fields + fn(fieldType, path) + default: + // For basic or unknown kinds, do nothing } + } - // If the part has indexing like "Parameters[0]", split it into "Parameters" and "[0]" - var indexPart string - fieldName := part - if idx := strings.IndexRune(part, '['); idx != -1 { - // e.g. 
part = "Parameters[0]" - fieldName = part[:idx] // "Parameters" - indexPart = part[idx:] // "[0]" - require.Truef(t, strings.HasPrefix(indexPart, "["), "expected '[' in indexing") - require.Truef(t, strings.HasSuffix(indexPart, "]"), "expected ']' at end of indexing") + // Recursive function to verify the fields of a given type. + var verifyTypeFields func(rt reflect.Type, path string) + verifyTypeFields = func(rt reflect.Type, path string) { + // Avoid cycles by skipping already visited types + if _, seen := visited[rt]; seen { + return } + visited[rt] = struct{}{} - // Navigate down structures/maps - switch rv.Kind() { - case reflect.Struct: - // Find the struct field by name - ft, ok := rv.Type().FieldByName(fieldName) - if !ok { - t.Fatalf("Could not find field '%s' in struct at path", fieldName) + switch rt.Kind() { + case reflect.Ptr, reflect.Interface: + // For pointers/interfaces, inspect the element type if available + if rt.Kind() == reflect.Ptr { + verifyTypeFields(rt.Elem(), path) } - rv = rv.FieldByIndex(ft.Index) + case reflect.Struct: + for i := 0; i < rt.NumField(); i++ { + ft := rt.Field(i) + jsonTag := ft.Tag.Get("json") + if jsonTag == "" || jsonTag == "-" { + // Ignore field names when there's no JSON tag, + // e.g. for Jobs.JobSettings + verifyFieldType(ft.Type, path, verifyTypeFields) + continue + } - case reflect.Map: - // Use fieldName as map key - mapVal := rv.MapIndex(reflect.ValueOf(fieldName)) - require.Truef(t, mapVal.IsValid(), "no map entry '%s' found in path", fieldName) - rv = mapVal + fieldName := strings.Split(jsonTag, ",")[0] + fieldPath := path + "." + fieldName + + if isCatalogOrSchemaField(fieldName) { + // Only check if the field is a string + if ft.Type.Kind() == reflect.String { + if _, recorded := recordedSet[fieldPath]; !recorded { + if _, ignored := ignoredFields[fieldPath]; !ignored { + missingFields = append(missingFields, fieldPath) + } + } + } + } + verifyFieldType(ft.Type, fieldPath, verifyTypeFields) + } default: - // If we're here, maybe we expected a struct or map but got something else - t.Fatalf("Unexpected kind '%s' when looking for '%s'", rv.Kind(), fieldName) + // For other kinds at this level, do nothing } + } - // If there's an index part, apply it now - if indexPart != "" { - // Dereference again if needed - for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { - require.False(t, rv.IsNil(), "nil pointer or interface when indexing") - rv = rv.Elem() - } - - require.Truef(t, rv.Kind() == reflect.Slice || rv.Kind() == reflect.Array, "expected slice/array for indexing but got %s", rv.Kind()) - - idxStr := indexPart[1 : len(indexPart)-1] // remove [ and ] - idx, err := strconv.Atoi(idxStr) - require.NoError(t, err, "invalid slice index %s", indexPart) + // Start from "resources" + verifyTypeFields(resourcesType, "resources") - require.Truef(t, idx < rv.Len(), "index %d out of range in slice/array of length %d", idx, rv.Len()) - rv = rv.Index(idx) - } + // Report all missing fields + for _, field := range missingFields { + t.Errorf("Field %s was not included in the test (should be covered in 'recordedFields' or 'ignoredFields')", field) } - // Dereference if needed at the leaf - for rv.Kind() == reflect.Ptr || rv.Kind() == reflect.Interface { - require.False(t, rv.IsNil(), "nil pointer or interface at leaf") - rv = rv.Elem() + // Fail the test if there were any missing fields + if len(missingFields) > 0 { + t.FailNow() } - - require.Equal(t, reflect.String, rv.Kind(), "expected a string at the final path") - return 
rv.String()
 }
 
+// isCatalogOrSchemaField returns true for field names in config.Resources that we suspect could contain a catalog or schema name
 func isCatalogOrSchemaField(name string) bool {
-	switch name {
-	case "Catalog", "CatalogName", "Schema", "SchemaName", "Target":
-		return true
-	default:
-		return false
-	}
+	return strings.Contains(name, "catalog") || strings.Contains(name, "schema")
 }
 
-func cleanedValue(value string) string {
+func removePlaceholders(value string) string {
 	value = strings.ReplaceAll(value, "<catalog>.", "")
 	value = strings.ReplaceAll(value, "<schema>.", "")
 	value = strings.ReplaceAll(value, "<catalog>", "")
@@ -821,9 +741,8 @@ func cleanedValue(value string) string {
 	return value
 }
 
-// replaceCatalogSchemaPlaceholders replaces placeholders with the final expected values.
-func replaceCatalogSchemaPlaceholders(value string) string {
-	value = strings.ReplaceAll(value, "<catalog>", "my_catalog")
-	value = strings.ReplaceAll(value, "<schema>", "my_schema")
-	return value
+func replacePlaceholders(placeholder, catalog, schema string) string {
+	expected := strings.ReplaceAll(placeholder, "<catalog>", catalog)
+	expected = strings.ReplaceAll(expected, "<schema>", schema)
+	return expected
 }

From 70f54dca12a0cef5f525e7f6fe244cf6bbafe6bc Mon Sep 17 00:00:00 2001
From: Lennart Kats
Date: Mon, 16 Dec 2024 14:11:01 +0100
Subject: [PATCH 15/27] Add missing fields

---
 bundle/config/mutator/apply_presets.go      | 64 +++++++++++++++---
 bundle/config/mutator/apply_presets_test.go | 75 +++++++++++++++++----
 2 files changed, 119 insertions(+), 20 deletions(-)

diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go
index 7cfbfd8513..8c6415b355 100644
--- a/bundle/config/mutator/apply_presets.go
+++ b/bundle/config/mutator/apply_presets.go
@@ -122,12 +122,61 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos
 		if t.TriggerPauseStatus == config.Paused {
 			p.Continuous = false
 		}
-		// TODO: add recommendation when catalog is already set?
- if t.Catalog != "" && p.Catalog == "" && p.Catalog != "hive_metastore" { - p.Catalog = t.Catalog - } - if t.Schema != "" && p.Target == "" { - p.Target = t.Schema + if t.Catalog != "" && t.Schema != "" { + if p.Catalog == "" { + p.Catalog = t.Catalog + } + if p.Target == "" { + p.Target = t.Schema + } + if p.GatewayDefinition != nil { + if p.GatewayDefinition.GatewayStorageCatalog == "" { + p.GatewayDefinition.GatewayStorageCatalog = t.Catalog + } + if p.GatewayDefinition.GatewayStorageSchema == "" { + p.GatewayDefinition.GatewayStorageSchema = t.Schema + } + } + if p.IngestionDefinition != nil { + for _, obj := range p.IngestionDefinition.Objects { + if obj.Report != nil { + if obj.Report.DestinationCatalog == "" { + obj.Report.DestinationCatalog = t.Catalog + } + if obj.Report.DestinationSchema == "" { + obj.Report.DestinationSchema = t.Schema + } + } + if obj.Schema != nil { + if obj.Schema.SourceCatalog == "" { + obj.Schema.SourceCatalog = t.Catalog + } + if obj.Schema.SourceSchema == "" { + obj.Schema.SourceSchema = t.Schema + } + if obj.Schema.DestinationCatalog == "" { + obj.Schema.DestinationCatalog = t.Catalog + } + if obj.Schema.DestinationSchema == "" { + obj.Schema.DestinationSchema = t.Schema + } + } + if obj.Table != nil { + if obj.Table.SourceCatalog == "" { + obj.Table.SourceCatalog = t.Catalog + } + if obj.Table.SourceSchema == "" { + obj.Table.SourceSchema = t.Schema + } + if obj.Table.DestinationCatalog == "" { + obj.Table.DestinationCatalog = t.Catalog + } + if obj.Table.DestinationSchema == "" { + obj.Table.DestinationSchema = t.Schema + } + } + } + } } } @@ -493,8 +542,7 @@ func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key stri if !fileIncludesPattern(ctx, localPath, expected) { diags = diags.Extend(diag.Diagnostics{{ - Summary: fmt.Sprintf("Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + - fix), + Summary: "Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + fix, Severity: diag.Recommendation, Locations: []dyn.Location{{ File: localPath, diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index 5a05115c4b..c94f510ddb 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -18,6 +18,7 @@ import ( "github.com/databricks/databricks-sdk-go/service/jobs" "github.com/databricks/databricks-sdk-go/service/pipelines" "github.com/databricks/databricks-sdk-go/service/serving" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -465,8 +466,8 @@ func TestApplyPresetsSourceLinkedDeployment(t *testing.T) { } -func TestApplyPresetsCatalogSchema(t *testing.T) { - b := &bundle.Bundle{ +func PresetsMock() *bundle.Bundle { + return &bundle.Bundle{ Config: config.Root{ Resources: config.Resources{ Jobs: map[string]*resources.Job{ @@ -477,6 +478,24 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { {Name: "catalog", Default: ""}, {Name: "schema", Default: ""}, }, + Tasks: []jobs.Task{ + { + DbtTask: &jobs.DbtTask{ + Catalog: "", + Schema: "", + }, + }, + { + SparkPythonTask: &jobs.SparkPythonTask{ + PythonFile: "/file", + }, + }, + { + NotebookTask: &jobs.NotebookTask{ + NotebookPath: "/notebook", + }, + }, + }, }, }, }, @@ -486,6 +505,32 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { Name: "pipeline", Catalog: "", Target: "", + GatewayDefinition: &pipelines.IngestionGatewayPipelineDefinition{ + 
GatewayStorageCatalog: "", + GatewayStorageSchema: "", + }, + IngestionDefinition: &pipelines.IngestionPipelineDefinition{ + Objects: []pipelines.IngestionConfig{ + { + Report: &pipelines.ReportSpec{ + DestinationCatalog: "", + DestinationSchema: "", + }, + Schema: &pipelines.SchemaSpec{ + SourceCatalog: "", + SourceSchema: "", + DestinationCatalog: "", + DestinationSchema: "", + }, + Table: &pipelines.TableSpec{ + SourceCatalog: "", + SourceSchema: "", + DestinationCatalog: "", + DestinationSchema: "", + }, + }, + }, + }, }, }, }, @@ -542,6 +587,17 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { }, }, } +} + +var PresetsIgnoredFields = map[string]string{ + // Any fields that should be ignored in the completeness check + // Example: + // "resources.jobs.object.schema_something": "this property doesn't relate to the catalog/schema", + "resources.pipelines.key.schema": "schema is still in private preview", +} + +func TestApplyPresetsCatalogSchema(t *testing.T) { + b := PresetsMock() b.Config.Presets = config.Presets{ Catalog: "my_catalog", Schema: "my_schema", @@ -603,16 +659,11 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { for _, f := range recordedFields { val, err := dyn.GetByPath(config, f.Path) require.NoError(t, err, "failed to get path %s", f.Path) - require.Equal(t, f.Expected, val.MustString(), "preset value expected for %s based on placeholder %s", f.Path, f.Placeholder) + assert.Equal(t, f.Expected, val.MustString(), "preset value expected for %s based on placeholder %s", f.Path, f.Placeholder) } // Stage 4: Check completeness - ignoredFields := map[string]string{ - // Any fields that should be ignored in the completeness check - // Example: - // "resources.jobs.object.schema_something": "this property doesn't relate to the catalog/schema", - } - checkCompleteness(t, recordedFields, ignoredFields) + checkCompleteness(t, recordedFields) } func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedFields []recordedField) { @@ -626,7 +677,7 @@ func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedField } } -func checkCompleteness(t *testing.T, recordedFields []recordedField, ignoredFields map[string]string) { +func checkCompleteness(t *testing.T, recordedFields []recordedField) { t.Helper() // Build a set for recorded fields @@ -700,7 +751,7 @@ func checkCompleteness(t *testing.T, recordedFields []recordedField, ignoredFiel // Only check if the field is a string if ft.Type.Kind() == reflect.String { if _, recorded := recordedSet[fieldPath]; !recorded { - if _, ignored := ignoredFields[fieldPath]; !ignored { + if _, ignored := PresetsIgnoredFields[fieldPath]; !ignored { missingFields = append(missingFields, fieldPath) } } @@ -719,7 +770,7 @@ func checkCompleteness(t *testing.T, recordedFields []recordedField, ignoredFiel // Report all missing fields for _, field := range missingFields { - t.Errorf("Field %s was not included in the test (should be covered in 'recordedFields' or 'ignoredFields')", field) + t.Errorf("Field %s was not included in the catalog/schema presets test. 
If this is a new field, please add it to PresetsMock or PresetsIgnoredFields and add support for it as appropriate.", field) } // Fail the test if there were any missing fields From 6d63b319d2e4b644873d9fd7e976fa54c1e80612 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 19 Dec 2024 21:20:17 +0100 Subject: [PATCH 16/27] Refine test --- bundle/config/mutator/apply_presets_test.go | 142 +++++++++----------- 1 file changed, 62 insertions(+), 80 deletions(-) diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index c94f510ddb..dfcb411719 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -589,11 +589,26 @@ func PresetsMock() *bundle.Bundle { } } +// Any fields that should be ignored in the completeness check var PresetsIgnoredFields = map[string]string{ - // Any fields that should be ignored in the completeness check - // Example: - // "resources.jobs.object.schema_something": "this property doesn't relate to the catalog/schema", - "resources.pipelines.key.schema": "schema is still in private preview", + "resources.pipelines.key.schema": "schema is still in private preview", + "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.named_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.job_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_jar_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_python_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_submit_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].sql_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.jar_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.notebook_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.pipeline_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_named_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.spark_submit_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.sql_params": "catalog/schema are passed via job parameters", + "resources.pipelines.key.ingestion_definition.objects[0].schema": "schema name is under schema.source_schema/destination_schema", + "resources.schemas": "schema name of schemas is under resources.schemas.key.Name", } func TestApplyPresetsCatalogSchema(t *testing.T) { @@ -627,6 +642,16 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { return root, err })) + // Convert the recordedFields to a set for easier lookup + recordedSet := make(map[string]struct{}) + for _, field := range recordedFields { + recordedSet[field.PathString] = struct{}{} + if i := strings.Index(field.PathString, "["); i >= 0 { + // For entries like resources.jobs.key.parameters[1].default, just add resources.jobs.key.parameters 
+ recordedSet[field.PathString[:i]] = struct{}{} + } + } + // Stage 1: Apply presets before cleanup, should be no-op. diags := bundle.Apply(ctx, b, mutator.ApplyPresets()) require.False(t, diags.HasError(), "unexpected error before cleanup: %v", diags.Error()) @@ -663,7 +688,15 @@ func TestApplyPresetsCatalogSchema(t *testing.T) { } // Stage 4: Check completeness - checkCompleteness(t, recordedFields) + expectedFields := findCatalogSchemaFields() + assert.GreaterOrEqual(t, len(expectedFields), 42, "expected at least 42 catalog/schema fields, but got %d", len(expectedFields)) + for _, field := range expectedFields { + if _, recorded := recordedSet[field]; !recorded { + if _, ignored := PresetsIgnoredFields[field]; !ignored { + t.Errorf("Field %s was not included in the catalog/schema presets test. If this is a new field, please add it to PresetsMock or PresetsIgnoredFields and add support for it as appropriate.", field) + } + } + } } func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedFields []recordedField) { @@ -677,70 +710,34 @@ func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedField } } -func checkCompleteness(t *testing.T, recordedFields []recordedField) { - t.Helper() - - // Build a set for recorded fields - recordedSet := make(map[string]struct{}) - for _, field := range recordedFields { - recordedSet[field.PathString] = struct{}{} - } - - // Obtain the type of config.Resources - var r config.Resources - resourcesType := reflect.TypeOf(r) - - // Track missing fields - var missingFields []string - - // Keep track of visited types to prevent infinite loops (cycles) +// findCatalogSchemaFields finds all fields in config.Resources that might refer +// to a catalog or schema. Returns a slice of field paths. +func findCatalogSchemaFields() []string { visited := make(map[reflect.Type]struct{}) + var results []string - // Helper function to handle maps, slices, arrays, and nested pointers/interfaces - verifyFieldType := func(fieldType reflect.Type, path string, fn func(reflect.Type, string)) { - switch fieldType.Kind() { - case reflect.Slice, reflect.Array: - // For arrays/slices, inspect the element type - fn(fieldType.Elem(), path+"[0]") - case reflect.Map: - // For maps, inspect the value type - fn(fieldType.Elem(), path+".key") - case reflect.Ptr, reflect.Interface: - // For pointers/interfaces, inspect the element if it's a pointer - if fieldType.Kind() == reflect.Ptr { - fn(fieldType.Elem(), path) - } - case reflect.Struct: - // For structs, directly recurse into their fields - fn(fieldType, path) - default: - // For basic or unknown kinds, do nothing - } - } - - // Recursive function to verify the fields of a given type. 
- var verifyTypeFields func(rt reflect.Type, path string) - verifyTypeFields = func(rt reflect.Type, path string) { - // Avoid cycles by skipping already visited types + // verifyTypeFields is a recursive function to verify the fields of a given type + var walkTypeFields func(rt reflect.Type, path string) + walkTypeFields = func(rt reflect.Type, path string) { if _, seen := visited[rt]; seen { return } visited[rt] = struct{}{} switch rt.Kind() { - case reflect.Ptr, reflect.Interface: - // For pointers/interfaces, inspect the element type if available - if rt.Kind() == reflect.Ptr { - verifyTypeFields(rt.Elem(), path) - } + case reflect.Slice, reflect.Array: + walkTypeFields(rt.Elem(), path+"[0]") + case reflect.Map: + walkTypeFields(rt.Elem(), path+".key") + case reflect.Ptr: + walkTypeFields(rt.Elem(), path) case reflect.Struct: for i := 0; i < rt.NumField(); i++ { ft := rt.Field(i) jsonTag := ft.Tag.Get("json") if jsonTag == "" || jsonTag == "-" { - // Ignore field names when there's no JSON tag, - // e.g. for Jobs.JobSettings - verifyFieldType(ft.Type, path, verifyTypeFields) + // Ignore field names when there's no JSON tag, e.g. for Jobs.JobSettings + walkTypeFields(ft.Type, path) continue } @@ -748,40 +745,25 @@ func checkCompleteness(t *testing.T, recordedFields []recordedField) { fieldPath := path + "." + fieldName if isCatalogOrSchemaField(fieldName) { - // Only check if the field is a string - if ft.Type.Kind() == reflect.String { - if _, recorded := recordedSet[fieldPath]; !recorded { - if _, ignored := PresetsIgnoredFields[fieldPath]; !ignored { - missingFields = append(missingFields, fieldPath) - } - } - } + results = append(results, fieldPath) } - verifyFieldType(ft.Type, fieldPath, verifyTypeFields) + walkTypeFields(ft.Type, fieldPath) } - default: - // For other kinds at this level, do nothing } } - // Start from "resources" - verifyTypeFields(resourcesType, "resources") - - // Report all missing fields - for _, field := range missingFields { - t.Errorf("Field %s was not included in the catalog/schema presets test. 
If this is a new field, please add it to PresetsMock or PresetsIgnoredFields and add support for it as appropriate.", field) - } - - // Fail the test if there were any missing fields - if len(missingFields) > 0 { - t.FailNow() - } + var r config.Resources + walkTypeFields(reflect.TypeOf(r), "resources") + return results } // isCatalogOrSchemaField returns true for a field names in config.Resources that we suspect could contain a catalog or schema name func isCatalogOrSchemaField(name string) bool { - return strings.Contains(name, "catalog") || strings.Contains(name, "schema") + return strings.Contains(name, "catalog") || + strings.Contains(name, "schema") || + strings.Contains(name, "parameters") || + strings.Contains(name, "params") } func removePlaceholders(value string) string { From feb23ddefb3bcd32c8faafafa9ca15f55634c379 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Thu, 19 Dec 2024 22:57:38 +0100 Subject: [PATCH 17/27] Move catalog/schema preset logic to a separate module --- bundle/config/mutator/apply_presets.go | 310 +-------------- .../mutator/apply_presets_catalog_schema.go | 364 ++++++++++++++++++ .../apply_presets_catalog_schema_test.go | 348 +++++++++++++++++ bundle/config/mutator/apply_presets_test.go | 326 ---------------- bundle/phases/initialize.go | 1 + 5 files changed, 722 insertions(+), 627 deletions(-) create mode 100644 bundle/config/mutator/apply_presets_catalog_schema.go create mode 100644 bundle/config/mutator/apply_presets_catalog_schema_test.go diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 8c6415b355..14a99e41dc 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -2,21 +2,16 @@ package mutator import ( "context" - "fmt" - "os" "path" - "regexp" "slices" "sort" "strings" "github.com/databricks/cli/bundle" "github.com/databricks/cli/bundle/config" - "github.com/databricks/cli/bundle/config/resources" "github.com/databricks/cli/libs/dbr" "github.com/databricks/cli/libs/diag" "github.com/databricks/cli/libs/dyn" - "github.com/databricks/cli/libs/log" "github.com/databricks/cli/libs/textutil" "github.com/databricks/databricks-sdk-go/service/catalog" "github.com/databricks/databricks-sdk-go/service/jobs" @@ -27,6 +22,7 @@ type applyPresets struct{} // Apply all presets, e.g. the prefix presets that // adds a prefix to all names of all resources. +// Note the catalog/schema presets are applied in ApplyPresetsCatalogSchema. func ApplyPresets() *applyPresets { return &applyPresets{} } @@ -43,9 +39,6 @@ func (m *applyPresets) Name() string { func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { var diags diag.Diagnostics - if d := validateCatalogAndSchema(b); d != nil { - return d // fast fail since code below would fail - } if d := validatePauseStatus(b); d != nil { diags = diags.Extend(d) } @@ -56,7 +49,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos tags := toTagArray(t.Tags) // Jobs presets. 
- // Supported: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus, Catalog, Schema + // Supported: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus for key, j := range r.Jobs { if j.JobSettings == nil { diags = diags.Extend(diag.Errorf("job %s is not defined", key)) @@ -90,25 +83,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos j.Trigger.PauseStatus = paused } } - if t.Catalog != "" || t.Schema != "" { - for _, task := range j.Tasks { - if task.DbtTask != nil { - if task.DbtTask.Catalog == "" { - task.DbtTask.Catalog = t.Catalog - } - if task.DbtTask.Schema == "" { - task.DbtTask.Schema = t.Schema - } - } - } - - diags = diags.Extend(addCatalogSchemaParameters(b, key, j, t)) - diags = diags.Extend(recommendCatalogSchemaUsage(b, ctx, key, j)) - } } // Pipelines presets. - // Supported: Prefix, PipelinesDevelopment, Catalog, Schema + // Supported: Prefix, PipelinesDevelopment // Not supported: Tags (as of 2024-10 not in pipelines API) for key, p := range r.Pipelines { if p.PipelineSpec == nil { @@ -122,62 +100,6 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos if t.TriggerPauseStatus == config.Paused { p.Continuous = false } - if t.Catalog != "" && t.Schema != "" { - if p.Catalog == "" { - p.Catalog = t.Catalog - } - if p.Target == "" { - p.Target = t.Schema - } - if p.GatewayDefinition != nil { - if p.GatewayDefinition.GatewayStorageCatalog == "" { - p.GatewayDefinition.GatewayStorageCatalog = t.Catalog - } - if p.GatewayDefinition.GatewayStorageSchema == "" { - p.GatewayDefinition.GatewayStorageSchema = t.Schema - } - } - if p.IngestionDefinition != nil { - for _, obj := range p.IngestionDefinition.Objects { - if obj.Report != nil { - if obj.Report.DestinationCatalog == "" { - obj.Report.DestinationCatalog = t.Catalog - } - if obj.Report.DestinationSchema == "" { - obj.Report.DestinationSchema = t.Schema - } - } - if obj.Schema != nil { - if obj.Schema.SourceCatalog == "" { - obj.Schema.SourceCatalog = t.Catalog - } - if obj.Schema.SourceSchema == "" { - obj.Schema.SourceSchema = t.Schema - } - if obj.Schema.DestinationCatalog == "" { - obj.Schema.DestinationCatalog = t.Catalog - } - if obj.Schema.DestinationSchema == "" { - obj.Schema.DestinationSchema = t.Schema - } - } - if obj.Table != nil { - if obj.Table.SourceCatalog == "" { - obj.Table.SourceCatalog = t.Catalog - } - if obj.Table.SourceSchema == "" { - obj.Table.SourceSchema = t.Schema - } - if obj.Table.DestinationCatalog == "" { - obj.Table.DestinationCatalog = t.Catalog - } - if obj.Table.DestinationSchema == "" { - obj.Table.DestinationSchema = t.Schema - } - } - } - } - } } // Models presets @@ -229,7 +151,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } // Model serving endpoint presets - // Supported: Prefix, Catalog, Schema + // Supported: Prefix // Not supported: Tags (not in API as of 2024-10) for key, e := range r.ModelServingEndpoints { if e.CreateServingEndpoint == nil { @@ -237,44 +159,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos continue } e.Name = normalizePrefix(prefix) + e.Name - - if t.Catalog != "" || t.Schema != "" { - // Apply catalog & schema to inference table config if not set - if e.CreateServingEndpoint.AiGateway != nil && e.CreateServingEndpoint.AiGateway.InferenceTableConfig != nil { - if t.Catalog != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName == "" { - 
e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName = t.Catalog - } - if t.Schema != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName == "" { - e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName = t.Schema - } - } - - // Apply catalog & schema to auto capture config if not set - if e.CreateServingEndpoint.Config.AutoCaptureConfig != nil { - if t.Catalog != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName == "" { - e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName = t.Catalog - } - if t.Schema != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName == "" { - e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName = t.Schema - } - } - - // Fully qualify served entities and models if they are not already qualified - for i := range e.CreateServingEndpoint.Config.ServedEntities { - e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, t.Catalog, t.Schema, - ) - } - for i := range e.CreateServingEndpoint.Config.ServedModels { - e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedModels[i].ModelName, t.Catalog, t.Schema, - ) - } - } } // Registered models presets - // Supported: Prefix, Catalog, Schema + // Supported: Prefix // Not supported: Tags (not in API as of 2024-10) for key, m := range r.RegisteredModels { if m.CreateRegisteredModelRequest == nil { @@ -282,16 +170,10 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos continue } m.Name = normalizePrefix(prefix) + m.Name - if t.Catalog != "" && m.CatalogName == "" { - m.CatalogName = t.Catalog - } - if t.Schema != "" && m.SchemaName == "" { - m.SchemaName = t.Schema - } } // Quality monitors presets - // Supported: Schedule, Catalog, Schema + // Supported: Schedule // Not supported: Tags (not in API as of 2024-10) for key, q := range r.QualityMonitors { if q.CreateMonitor == nil { @@ -306,15 +188,9 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos q.Schedule = nil } } - if t.Catalog != "" && t.Schema != "" { - q.TableName = fullyQualifyName(q.TableName, t.Catalog, t.Schema) - if q.OutputSchemaName == "" { - q.OutputSchemaName = t.Catalog + "." + t.Schema - } - } } - // Schemas: Prefix, Catalog, Schema + // Schemas: Prefix only // Not supported: Tags (as of 2024-10, only supported in Databricks UI / SQL API) for key, s := range r.Schemas { if s.CreateSchema == nil { @@ -322,18 +198,9 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos continue } s.Name = normalizePrefix(prefix) + s.Name - if t.Catalog != "" && s.CatalogName == "" { - s.CatalogName = t.Catalog - } - if t.Schema != "" && s.Name == "" { - // If there is a schema preset such as 'dev', we directly - // use that name and don't add any prefix (which might result in dev_dev). - s.Name = t.Schema - } } // Clusters: Prefix, Tags - // Not supported: Catalog / Schema (not applicable) for key, c := range r.Clusters { if c.ClusterSpec == nil { diags = diags.Extend(diag.Errorf("cluster %s is not defined", key)) @@ -385,6 +252,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos return diags } +// validatePauseStatus checks the user-provided pause status is valid. 
func validatePauseStatus(b *bundle.Bundle) diag.Diagnostics { p := b.Config.Presets.TriggerPauseStatus if p == "" || p == config.Paused || p == config.Unpaused { @@ -397,20 +265,8 @@ func validatePauseStatus(b *bundle.Bundle) diag.Diagnostics { }} } -func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { - p := b.Config.Presets - if (p.Catalog != "" && p.Schema == "") || (p.Catalog == "" && p.Schema != "") { - return diag.Diagnostics{{ - Summary: "presets.catalog and presets.schema must always be set together", - Severity: diag.Error, - Locations: []dyn.Location{b.Config.GetLocation("presets")}, - }} - } - return nil -} - // toTagArray converts a map of tags to an array of tags. -// We sort tags so ensure stable ordering. +// We sort tags to ensure stable ordering. func toTagArray(tags map[string]string) []Tag { var tagArray []Tag if tags == nil { @@ -440,151 +296,3 @@ func normalizePrefix(prefix string) string { return textutil.NormalizeString(prefix) + suffix } - -// addCatalogSchemaParameters adds catalog and schema parameters to a job if they don't already exist. -// Returns any warning diagnostics for existing parameters. -func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job, t config.Presets) diag.Diagnostics { - var diags diag.Diagnostics - - // Check for existing catalog/schema parameters - hasCatalog := false - hasSchema := false - if job.Parameters != nil { - for _, param := range job.Parameters { - if param.Name == "catalog" { - hasCatalog = true - diags = diags.Extend(diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s already has 'catalog' parameter defined; ignoring preset value", key), - Severity: diag.Warning, - Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, - }}) - } - if param.Name == "schema" { - hasSchema = true - diags = diags.Extend(diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s already has 'schema' parameter defined; ignoring preset value", key), - Severity: diag.Warning, - Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." 
+ key)}, - }}) - } - } - } - - // Initialize parameters if nil - if job.Parameters == nil { - job.Parameters = []jobs.JobParameterDefinition{} - } - - // Add catalog parameter if not already present - if !hasCatalog && t.Catalog != "" { - job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ - Name: "catalog", - Default: t.Catalog, - }) - } - - // Add schema parameter if not already present - if !hasSchema && t.Schema != "" { - job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ - Name: "schema", - Default: t.Schema, - }) - } - - return diags -} - -func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key string, job *resources.Job) diag.Diagnostics { - var diags diag.Diagnostics - for _, t := range job.Tasks { - var relPath string - var expected string - var fix string - if t.NotebookTask != nil { - relPath = t.NotebookTask.NotebookPath - expected = `dbutils.widgets.text\(['"]schema|` + - `USE[^)]+schema` - fix = " dbutils.widgets.text('catalog')\n" + - " dbutils.widgets.text('schema')\n" + - " catalog = dbutils.widgets.get('catalog')\n" + - " schema = dbutils.widgets.get('schema')\n" + - " spark.sql(f'USE {catalog}.{schema}')\n" - } else if t.SparkPythonTask != nil { - relPath = t.SparkPythonTask.PythonFile - expected = `add_argument\(['"]--catalog'|` + - `USE[^)]+catalog` - fix = " def main():\n" + - " parser = argparse.ArgumentParser()\n" + - " parser.add_argument('--catalog', required=True)\n" + - " parser.add_argument('--schema', '-s', required=True)\n" + - " args, unknown = parser.parse_known_args()\n" + - " spark.sql(f\"USE {args.catalog}.{args.schema}\")\n" - } else if t.SqlTask != nil && t.SqlTask.File != nil { - relPath = t.SqlTask.File.Path - expected = `:schema|\{\{schema\}\}` - fix = " USE CATALOG {{catalog}};\n" + - " USE IDENTIFIER({schema});\n" - } else { - continue - } - - sourceDir, err := b.Config.GetLocation("resources.jobs." + key).Directory() - if err != nil { - continue - } - - localPath, _, err := GetLocalPath(ctx, b, sourceDir, relPath) - if err != nil || localPath == "" { - // We ignore errors (they're reported by another mutator) - // and ignore empty local paths (which means we'd have to download the file) - continue - } - - if !fileIncludesPattern(ctx, localPath, expected) { - diags = diags.Extend(diag.Diagnostics{{ - Summary: "Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + fix, - Severity: diag.Recommendation, - Locations: []dyn.Location{{ - File: localPath, - Line: 1, - Column: 1, - }}, - }}) - } - } - - return diags - -} - -// fullyQualifyName checks if the given name is already qualified with a catalog and schema. -// If not, and both catalog and schema are available, it prefixes the name with catalog.schema. -// If name is empty, returns name as-is. -func fullyQualifyName(name, catalog, schema string) string { - if name == "" || catalog == "" || schema == "" { - return name - } - // If it's already qualified (contains at least two '.'), we assume it's fully qualified. 
- parts := strings.Split(name, ".") - if len(parts) >= 3 { - // Already fully qualified - return name - } - // Otherwise, fully qualify it - return fmt.Sprintf("%s.%s.%s", catalog, schema, name) -} - -func fileIncludesPattern(ctx context.Context, filePath string, expected string) bool { - content, err := os.ReadFile(filePath) - if err != nil { - log.Warnf(ctx, "failed to check file %s: %v", filePath, err) - return true - } - - matched, err := regexp.MatchString(expected, string(content)) - if err != nil { - log.Warnf(ctx, "failed to check pattern in %s: %v", filePath, err) - return true - } - return matched -} diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go new file mode 100644 index 0000000000..04b6dc73f1 --- /dev/null +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -0,0 +1,364 @@ +package mutator + +import ( + "context" + "fmt" + "os" + "regexp" + "strings" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/libs/diag" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/cli/libs/log" + "github.com/databricks/databricks-sdk-go/service/jobs" +) + +type applyPresetsCatalogSchema struct{} + +// ApplyPresetsCatalogSchema applies catalog and schema presets to bundle resources. +func ApplyPresetsCatalogSchema() *applyPresetsCatalogSchema { + return &applyPresetsCatalogSchema{} +} + +func (m *applyPresetsCatalogSchema) Name() string { + return "ApplyPresetsCatalogSchema" +} + +func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { + diags := validateCatalogAndSchema(b) + if diags.HasError() { + return diags + } + + r := b.Config.Resources + p := b.Config.Presets + + // Jobs + for key, j := range r.Jobs { + if j.JobSettings == nil { + continue + } + if p.Catalog != "" || p.Schema != "" { + for _, task := range j.Tasks { + if task.DbtTask != nil { + if task.DbtTask.Catalog == "" { + task.DbtTask.Catalog = p.Catalog + } + if task.DbtTask.Schema == "" { + task.DbtTask.Schema = p.Schema + } + } + } + + diags = diags.Extend(addCatalogSchemaParameters(b, key, j, p)) + diags = diags.Extend(recommendCatalogSchemaUsage(b, ctx, key, j)) + } + } + + // Pipelines + for _, pl := range r.Pipelines { + if pl.PipelineSpec == nil { + continue + } + if p.Catalog != "" && p.Schema != "" { + if pl.Catalog == "" { + pl.Catalog = p.Catalog + } + if pl.Target == "" { + pl.Target = p.Schema + } + if pl.GatewayDefinition != nil { + if pl.GatewayDefinition.GatewayStorageCatalog == "" { + pl.GatewayDefinition.GatewayStorageCatalog = p.Catalog + } + if pl.GatewayDefinition.GatewayStorageSchema == "" { + pl.GatewayDefinition.GatewayStorageSchema = p.Schema + } + } + if pl.IngestionDefinition != nil { + for _, obj := range pl.IngestionDefinition.Objects { + if obj.Report != nil { + if obj.Report.DestinationCatalog == "" { + obj.Report.DestinationCatalog = p.Catalog + } + if obj.Report.DestinationSchema == "" { + obj.Report.DestinationSchema = p.Schema + } + } + if obj.Schema != nil { + if obj.Schema.SourceCatalog == "" { + obj.Schema.SourceCatalog = p.Catalog + } + if obj.Schema.SourceSchema == "" { + obj.Schema.SourceSchema = p.Schema + } + if obj.Schema.DestinationCatalog == "" { + obj.Schema.DestinationCatalog = p.Catalog + } + if obj.Schema.DestinationSchema == "" { + obj.Schema.DestinationSchema = p.Schema + } + } + if obj.Table != nil { + if 
obj.Table.SourceCatalog == "" { + obj.Table.SourceCatalog = p.Catalog + } + if obj.Table.SourceSchema == "" { + obj.Table.SourceSchema = p.Schema + } + if obj.Table.DestinationCatalog == "" { + obj.Table.DestinationCatalog = p.Catalog + } + if obj.Table.DestinationSchema == "" { + obj.Table.DestinationSchema = p.Schema + } + } + } + } + } + } + + // Model serving endpoints + for _, e := range r.ModelServingEndpoints { + if e.CreateServingEndpoint == nil { + continue + } + + if p.Catalog != "" || p.Schema != "" { + if e.CreateServingEndpoint.AiGateway != nil && e.CreateServingEndpoint.AiGateway.InferenceTableConfig != nil { + if p.Catalog != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName = p.Catalog + } + if p.Schema != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName = p.Schema + } + } + + if e.CreateServingEndpoint.Config.AutoCaptureConfig != nil { + if p.Catalog != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName = p.Catalog + } + if p.Schema != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName = p.Schema + } + } + + for i := range e.CreateServingEndpoint.Config.ServedEntities { + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, p.Catalog, p.Schema, + ) + } + for i := range e.CreateServingEndpoint.Config.ServedModels { + e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedModels[i].ModelName, p.Catalog, p.Schema, + ) + } + } + } + + // Registered models + for _, m := range r.RegisteredModels { + if m.CreateRegisteredModelRequest == nil { + continue + } + if p.Catalog != "" && m.CatalogName == "" { + m.CatalogName = p.Catalog + } + if p.Schema != "" && m.SchemaName == "" { + m.SchemaName = p.Schema + } + } + + // Quality monitors + for _, q := range r.QualityMonitors { + if q.CreateMonitor == nil { + continue + } + if p.Catalog != "" && p.Schema != "" { + q.TableName = fullyQualifyName(q.TableName, p.Catalog, p.Schema) + if q.OutputSchemaName == "" { + q.OutputSchemaName = p.Catalog + "." + p.Schema + } + } + } + + // Schemas + for _, s := range r.Schemas { + if s.CreateSchema == nil { + continue + } + if p.Catalog != "" && s.CatalogName == "" { + s.CatalogName = p.Catalog + } + if p.Schema != "" && s.Name == "" { + // If there is a schema preset such as 'dev', we directly + // use that name and don't add any prefix (which might result in dev_dev). + s.Name = p.Schema + } + } + + return diags +} + +func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { + p := b.Config.Presets + if (p.Catalog != "" && p.Schema == "") || (p.Catalog == "" && p.Schema != "") { + return diag.Diagnostics{{ + Summary: "presets.catalog and presets.schema must always be set together", + Severity: diag.Error, + Locations: []dyn.Location{b.Config.GetLocation("presets")}, + }} + } + return diag.Diagnostics{} +} + +// addCatalogSchemaParameters adds catalog and schema parameters to a job if they don't already exist. +// Returns any warning diagnostics for existing parameters. 
+func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job, t config.Presets) diag.Diagnostics { + var diags diag.Diagnostics + + // Check for existing catalog/schema parameters + hasCatalog := false + hasSchema := false + if job.Parameters != nil { + for _, param := range job.Parameters { + if param.Name == "catalog" { + hasCatalog = true + diags = diags.Extend(diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s already has 'catalog' parameter defined; ignoring preset value", key), + Severity: diag.Warning, + Locations: b.Config.GetLocations("resources.jobs." + key + ".parameters"), + }}) + } + if param.Name == "schema" { + hasSchema = true + diags = diags.Extend(diag.Diagnostics{{ + Summary: fmt.Sprintf("job %s already has 'schema' parameter defined; ignoring preset value", key), + Severity: diag.Warning, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + }}) + } + } + } + + // Initialize parameters if nil + if job.Parameters == nil { + job.Parameters = []jobs.JobParameterDefinition{} + } + + // Add catalog parameter if not already present + if !hasCatalog && t.Catalog != "" { + job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ + Name: "catalog", + Default: t.Catalog, + }) + } + + // Add schema parameter if not already present + if !hasSchema && t.Schema != "" { + job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ + Name: "schema", + Default: t.Schema, + }) + } + + return diags +} + +func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key string, job *resources.Job) diag.Diagnostics { + var diags diag.Diagnostics + for _, t := range job.Tasks { + var relPath string + var expected string + var fix string + if t.NotebookTask != nil { + relPath = t.NotebookTask.NotebookPath + expected = `dbutils.widgets.text\(['"]schema|` + + `USE[^)]+schema` + fix = " dbutils.widgets.text('catalog')\n" + + " dbutils.widgets.text('schema')\n" + + " catalog = dbutils.widgets.get('catalog')\n" + + " schema = dbutils.widgets.get('schema')\n" + + " spark.sql(f'USE {catalog}.{schema}')\n" + } else if t.SparkPythonTask != nil { + relPath = t.SparkPythonTask.PythonFile + expected = `add_argument\(['"]--catalog'|` + + `USE[^)]+catalog` + fix = " def main():\n" + + " parser = argparse.ArgumentParser()\n" + + " parser.add_argument('--catalog', required=True)\n" + + " parser.add_argument('--schema', '-s', required=True)\n" + + " args, unknown = parser.parse_known_args()\n" + + " spark.sql(f\"USE {args.catalog}.{args.schema}\")\n" + } else if t.SqlTask != nil && t.SqlTask.File != nil { + relPath = t.SqlTask.File.Path + expected = `:schema|\{\{schema\}\}` + fix = " USE CATALOG {{catalog}};\n" + + " USE IDENTIFIER({schema});\n" + } else { + continue + } + + sourceDir, err := b.Config.GetLocation("resources.jobs." 
+ key).Directory() + if err != nil { + continue + } + + localPath, _, err := GetLocalPath(ctx, b, sourceDir, relPath) + if err != nil || localPath == "" { + // We ignore errors (they're reported by another mutator) + // and ignore empty local paths (which means we'd have to download the file) + continue + } + + if !fileIncludesPattern(ctx, localPath, expected) { + diags = diags.Extend(diag.Diagnostics{{ + Summary: "Use the 'catalog' and 'schema' parameters provided via 'presets.catalog' and 'presets.schema' using\n\n" + fix, + Severity: diag.Recommendation, + Locations: []dyn.Location{{ + File: localPath, + Line: 1, + Column: 1, + }}, + }}) + } + } + + return diags + +} + +// fullyQualifyName checks if the given name is already qualified with a catalog and schema. +// If not, and both catalog and schema are available, it prefixes the name with catalog.schema. +// If name is empty, returns name as-is. +func fullyQualifyName(name, catalog, schema string) string { + if name == "" || catalog == "" || schema == "" { + return name + } + // If it's already qualified (contains at least two '.'), we assume it's fully qualified. + parts := strings.Split(name, ".") + if len(parts) >= 3 { + // Already fully qualified + return name + } + // Otherwise, fully qualify it + return fmt.Sprintf("%s.%s.%s", catalog, schema, name) +} + +func fileIncludesPattern(ctx context.Context, filePath string, expected string) bool { + content, err := os.ReadFile(filePath) + if err != nil { + log.Warnf(ctx, "failed to check file %s: %v", filePath, err) + return true + } + + matched, err := regexp.MatchString(expected, string(content)) + if err != nil { + log.Warnf(ctx, "failed to check pattern in %s: %v", filePath, err) + return true + } + return matched +} diff --git a/bundle/config/mutator/apply_presets_catalog_schema_test.go b/bundle/config/mutator/apply_presets_catalog_schema_test.go new file mode 100644 index 0000000000..df4efeb8a4 --- /dev/null +++ b/bundle/config/mutator/apply_presets_catalog_schema_test.go @@ -0,0 +1,348 @@ +package mutator_test + +import ( + "context" + "reflect" + "strings" + "testing" + + "github.com/databricks/cli/bundle" + "github.com/databricks/cli/bundle/config" + "github.com/databricks/cli/bundle/config/mutator" + "github.com/databricks/cli/bundle/config/resources" + "github.com/databricks/cli/libs/dyn" + "github.com/databricks/databricks-sdk-go/service/catalog" + "github.com/databricks/databricks-sdk-go/service/jobs" + "github.com/databricks/databricks-sdk-go/service/pipelines" + "github.com/databricks/databricks-sdk-go/service/serving" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +type recordedField struct { + Path dyn.Path + PathString string + Placeholder string + Expected string +} + +func mockPresetsCatalogSchema() *bundle.Bundle { + return &bundle.Bundle{ + Config: config.Root{ + Resources: config.Resources{ + Jobs: map[string]*resources.Job{ + "key": { + JobSettings: &jobs.JobSettings{ + Name: "job", + Parameters: []jobs.JobParameterDefinition{ + {Name: "catalog", Default: ""}, + {Name: "schema", Default: ""}, + }, + Tasks: []jobs.Task{ + { + DbtTask: &jobs.DbtTask{ + Catalog: "", + Schema: "", + }, + }, + { + SparkPythonTask: &jobs.SparkPythonTask{ + PythonFile: "/file", + }, + }, + { + NotebookTask: &jobs.NotebookTask{ + NotebookPath: "/notebook", + }, + }, + }, + }, + }, + }, + Pipelines: map[string]*resources.Pipeline{ + "key": { + PipelineSpec: &pipelines.PipelineSpec{ + Name: "pipeline", + Catalog: "", + Target: "", + GatewayDefinition: 
&pipelines.IngestionGatewayPipelineDefinition{ + GatewayStorageCatalog: "", + GatewayStorageSchema: "", + }, + IngestionDefinition: &pipelines.IngestionPipelineDefinition{ + Objects: []pipelines.IngestionConfig{ + { + Report: &pipelines.ReportSpec{ + DestinationCatalog: "", + DestinationSchema: "", + }, + Schema: &pipelines.SchemaSpec{ + SourceCatalog: "", + SourceSchema: "", + DestinationCatalog: "", + DestinationSchema: "", + }, + Table: &pipelines.TableSpec{ + SourceCatalog: "", + SourceSchema: "", + DestinationCatalog: "", + DestinationSchema: "", + }, + }, + }, + }, + }, + }, + }, + ModelServingEndpoints: map[string]*resources.ModelServingEndpoint{ + "key": { + CreateServingEndpoint: &serving.CreateServingEndpoint{ + Name: "serving", + AiGateway: &serving.AiGatewayConfig{ + InferenceTableConfig: &serving.AiGatewayInferenceTableConfig{ + CatalogName: "", + SchemaName: "", + }, + }, + Config: serving.EndpointCoreConfigInput{ + AutoCaptureConfig: &serving.AutoCaptureConfigInput{ + CatalogName: "", + SchemaName: "", + }, + ServedEntities: []serving.ServedEntityInput{ + {EntityName: "..entity"}, + }, + ServedModels: []serving.ServedModelInput{ + {ModelName: "..model"}, + }, + }, + }, + }, + }, + RegisteredModels: map[string]*resources.RegisteredModel{ + "key": { + CreateRegisteredModelRequest: &catalog.CreateRegisteredModelRequest{ + Name: "registered_model", + CatalogName: "", + SchemaName: "", + }, + }, + }, + QualityMonitors: map[string]*resources.QualityMonitor{ + "key": { + TableName: "..table", + CreateMonitor: &catalog.CreateMonitor{ + OutputSchemaName: ".", + }, + }, + }, + Schemas: map[string]*resources.Schema{ + "key": { + CreateSchema: &catalog.CreateSchema{ + Name: "", + CatalogName: "", + }, + }, + }, + }, + Presets: config.Presets{ + Catalog: "my_catalog", + Schema: "my_schema", + }, + }, + } +} + +// ignoredFields are fields that should be ignored in the completeness check +var ignoredFields = map[string]string{ + "resources.pipelines.key.schema": "schema is still in private preview", + "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.named_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.job_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_jar_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_python_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_submit_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].sql_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.jar_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.notebook_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.pipeline_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_named_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.spark_submit_params": "catalog/schema are passed via 
job parameters", + "resources.jobs.key.tasks[0].run_job_task.sql_params": "catalog/schema are passed via job parameters", + "resources.pipelines.key.ingestion_definition.objects[0].schema": "schema name is under schema.source_schema/destination_schema", + "resources.schemas": "schema name of schemas is under resources.schemas.key.Name", +} + +func TestApplyPresetsCatalogSchemaWhenAlreadySet(t *testing.T) { + b := mockPresetsCatalogSchema() + recordedFields := recordPlaceholderFields(t, b) + + diags := bundle.Apply(context.Background(), b, mutator.ApplyPresets()) + require.NoError(t, diags.Error()) + + for _, f := range recordedFields { + val, err := dyn.GetByPath(b.Config.Value(), f.Path) + require.NoError(t, err, "failed to get path %s", f.Path) + require.Equal(t, f.Placeholder, val.MustString(), + "expected placeholder '%s' at %s to remain unchanged before cleanup", f.Placeholder, f.Path) + } +} + +func TestApplyPresetsCatalogSchemaWhenNotSet(t *testing.T) { + b := mockPresetsCatalogSchema() + recordedFields := recordPlaceholderFields(t, b) + + // Set all catalog/schema fields to empty strings / nil + require.NoError(t, b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { + for _, f := range recordedFields { + value, err := dyn.GetByPath(root, f.Path) + require.NoError(t, err) + + val := value.MustString() + cleanedVal := removePlaceholders(val) + root, err = dyn.SetByPath(root, f.Path, dyn.V(cleanedVal)) + require.NoError(t, err) + } + return dyn.Set(root, "resources.jobs.key.parameters", dyn.NilValue) + })) + + // Apply catalog/schema presets + diags := bundle.Apply(context.Background(), b, mutator.ApplyPresetsCatalogSchema()) + require.NoError(t, diags.Error()) + + // Verify that all catalog/schema fields have been set to the presets + for _, f := range recordedFields { + val, err := dyn.GetByPath(b.Config.Value(), f.Path) + require.NoError(t, err, "could not find expected field(s) at %s", f.Path) + assert.Equal(t, f.Expected, val.MustString(), "preset value expected for %s based on placeholder %s", f.Path, f.Placeholder) + } +} + +func TestApplyPresetsCatalogSchemaCompleteness(t *testing.T) { + b := mockPresetsCatalogSchema() + recordedFields := recordPlaceholderFields(t, b) + + // Convert the recordedFields to a set for easier lookup + recordedPaths := make(map[string]struct{}) + for _, field := range recordedFields { + recordedPaths[field.PathString] = struct{}{} + if i := strings.Index(field.PathString, "["); i >= 0 { + // For entries like resources.jobs.key.parameters[1].default, just add resources.jobs.key.parameters + recordedPaths[field.PathString[:i]] = struct{}{} + } + } + + // Find all catalog/schema fields that we think should be covered based + // on all properties in config.Resources. + expectedFields := findCatalogSchemaFields() + assert.GreaterOrEqual(t, len(expectedFields), 42, "expected at least 42 catalog/schema fields, but got %d", len(expectedFields)) + + // Verify that all expected fields are there + for _, field := range expectedFields { + if _, recorded := recordedPaths[field]; !recorded { + if _, ignored := ignoredFields[field]; !ignored { + t.Errorf("Field %s was not included in the catalog/schema presets test. 
If this is a new field, please add it to mockPresetsCatalogSchema or ignoredFields and add support for it as appropriate.", field)
+			}
+		}
+	}
+}
+
+// recordPlaceholderFields scans the config and records all fields containing catalog/schema placeholders
+func recordPlaceholderFields(t *testing.T, b *bundle.Bundle) []recordedField {
+	t.Helper()
+
+	var recordedFields []recordedField
+	err := b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) {
+		_, err := dyn.Walk(b.Config.Value(), func(p dyn.Path, v dyn.Value) (dyn.Value, error) {
+			if v.Kind() == dyn.KindString {
+				val := v.MustString()
+				if strings.Contains(val, "<catalog>") || strings.Contains(val, "<schema>") {
+					pathCopy := make(dyn.Path, len(p))
+					copy(pathCopy, p)
+					recordedFields = append(recordedFields, recordedField{
+						Path:        pathCopy,
+						PathString:  pathCopy.String(),
+						Placeholder: val,
+						Expected:    replacePlaceholders(val, "my_catalog", "my_schema"),
+					})
+				}
+			}
+			return v, nil
+		})
+		return root, err
+	})
+	require.NoError(t, err)
+	return recordedFields
+}
+
+// findCatalogSchemaFields finds all fields in config.Resources that might refer
+// to a catalog or schema. Returns a slice of field paths.
+func findCatalogSchemaFields() []string {
+	visited := make(map[reflect.Type]struct{})
+	var results []string
+
+	// walkTypeFields is a recursive function that walks the fields of a given type
+	var walkTypeFields func(rt reflect.Type, path string)
+	walkTypeFields = func(rt reflect.Type, path string) {
+		if _, seen := visited[rt]; seen {
+			return
+		}
+		visited[rt] = struct{}{}
+
+		switch rt.Kind() {
+		case reflect.Slice, reflect.Array:
+			walkTypeFields(rt.Elem(), path+"[0]")
+		case reflect.Map:
+			walkTypeFields(rt.Elem(), path+".key")
+		case reflect.Ptr:
+			walkTypeFields(rt.Elem(), path)
+		case reflect.Struct:
+			for i := 0; i < rt.NumField(); i++ {
+				ft := rt.Field(i)
+				jsonTag := ft.Tag.Get("json")
+				if jsonTag == "" || jsonTag == "-" {
+					// Ignore field names when there's no JSON tag, e.g. for Jobs.JobSettings
+					walkTypeFields(ft.Type, path)
+					continue
+				}
+
+				fieldName := strings.Split(jsonTag, ",")[0]
+				fieldPath := path + "." 
+ fieldName + + if isCatalogOrSchemaField(fieldName) { + results = append(results, fieldPath) + } + + walkTypeFields(ft.Type, fieldPath) + } + } + } + + var r config.Resources + walkTypeFields(reflect.TypeOf(r), "resources") + return results +} + +// isCatalogOrSchemaField returns true for a field names in config.Resources that we suspect could contain a catalog or schema name +func isCatalogOrSchemaField(name string) bool { + return strings.Contains(name, "catalog") || + strings.Contains(name, "schema") || + strings.Contains(name, "parameters") || + strings.Contains(name, "params") +} + +func removePlaceholders(value string) string { + value = strings.ReplaceAll(value, ".", "") + value = strings.ReplaceAll(value, ".", "") + value = strings.ReplaceAll(value, "", "") + value = strings.ReplaceAll(value, "", "") + return value +} + +func replacePlaceholders(placeholder, catalog, schema string) string { + expected := strings.ReplaceAll(placeholder, "", catalog) + expected = strings.ReplaceAll(expected, "", schema) + return expected +} diff --git a/bundle/config/mutator/apply_presets_test.go b/bundle/config/mutator/apply_presets_test.go index dfcb411719..497ef051ad 100644 --- a/bundle/config/mutator/apply_presets_test.go +++ b/bundle/config/mutator/apply_presets_test.go @@ -2,9 +2,7 @@ package mutator_test import ( "context" - "reflect" "runtime" - "strings" "testing" "github.com/databricks/cli/bundle" @@ -16,19 +14,9 @@ import ( "github.com/databricks/cli/libs/dyn" "github.com/databricks/databricks-sdk-go/service/catalog" "github.com/databricks/databricks-sdk-go/service/jobs" - "github.com/databricks/databricks-sdk-go/service/pipelines" - "github.com/databricks/databricks-sdk-go/service/serving" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) -type recordedField struct { - Path dyn.Path - PathString string - Placeholder string - Expected string -} - func TestApplyPresetsPrefix(t *testing.T) { tests := []struct { name string @@ -465,317 +453,3 @@ func TestApplyPresetsSourceLinkedDeployment(t *testing.T) { } } - -func PresetsMock() *bundle.Bundle { - return &bundle.Bundle{ - Config: config.Root{ - Resources: config.Resources{ - Jobs: map[string]*resources.Job{ - "key": { - JobSettings: &jobs.JobSettings{ - Name: "job", - Parameters: []jobs.JobParameterDefinition{ - {Name: "catalog", Default: ""}, - {Name: "schema", Default: ""}, - }, - Tasks: []jobs.Task{ - { - DbtTask: &jobs.DbtTask{ - Catalog: "", - Schema: "", - }, - }, - { - SparkPythonTask: &jobs.SparkPythonTask{ - PythonFile: "/file", - }, - }, - { - NotebookTask: &jobs.NotebookTask{ - NotebookPath: "/notebook", - }, - }, - }, - }, - }, - }, - Pipelines: map[string]*resources.Pipeline{ - "key": { - PipelineSpec: &pipelines.PipelineSpec{ - Name: "pipeline", - Catalog: "", - Target: "", - GatewayDefinition: &pipelines.IngestionGatewayPipelineDefinition{ - GatewayStorageCatalog: "", - GatewayStorageSchema: "", - }, - IngestionDefinition: &pipelines.IngestionPipelineDefinition{ - Objects: []pipelines.IngestionConfig{ - { - Report: &pipelines.ReportSpec{ - DestinationCatalog: "", - DestinationSchema: "", - }, - Schema: &pipelines.SchemaSpec{ - SourceCatalog: "", - SourceSchema: "", - DestinationCatalog: "", - DestinationSchema: "", - }, - Table: &pipelines.TableSpec{ - SourceCatalog: "", - SourceSchema: "", - DestinationCatalog: "", - DestinationSchema: "", - }, - }, - }, - }, - }, - }, - }, - ModelServingEndpoints: map[string]*resources.ModelServingEndpoint{ - "key": { - CreateServingEndpoint: 
&serving.CreateServingEndpoint{ - Name: "serving", - AiGateway: &serving.AiGatewayConfig{ - InferenceTableConfig: &serving.AiGatewayInferenceTableConfig{ - CatalogName: "", - SchemaName: "", - }, - }, - Config: serving.EndpointCoreConfigInput{ - AutoCaptureConfig: &serving.AutoCaptureConfigInput{ - CatalogName: "", - SchemaName: "", - }, - ServedEntities: []serving.ServedEntityInput{ - {EntityName: "..entity"}, - }, - ServedModels: []serving.ServedModelInput{ - {ModelName: "..model"}, - }, - }, - }, - }, - }, - RegisteredModels: map[string]*resources.RegisteredModel{ - "key": { - CreateRegisteredModelRequest: &catalog.CreateRegisteredModelRequest{ - Name: "registered_model", - CatalogName: "", - SchemaName: "", - }, - }, - }, - QualityMonitors: map[string]*resources.QualityMonitor{ - "key": { - TableName: "..table", - CreateMonitor: &catalog.CreateMonitor{ - OutputSchemaName: ".", - }, - }, - }, - Schemas: map[string]*resources.Schema{ - "key": { - CreateSchema: &catalog.CreateSchema{ - Name: "", - CatalogName: "", - }, - }, - }, - }, - }, - } -} - -// Any fields that should be ignored in the completeness check -var PresetsIgnoredFields = map[string]string{ - "resources.pipelines.key.schema": "schema is still in private preview", - "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].python_wheel_task.named_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].python_wheel_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.job_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].spark_jar_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].spark_python_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].spark_submit_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].sql_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.jar_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.notebook_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.pipeline_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.python_named_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.python_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.spark_submit_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.sql_params": "catalog/schema are passed via job parameters", - "resources.pipelines.key.ingestion_definition.objects[0].schema": "schema name is under schema.source_schema/destination_schema", - "resources.schemas": "schema name of schemas is under resources.schemas.key.Name", -} - -func TestApplyPresetsCatalogSchema(t *testing.T) { - b := PresetsMock() - b.Config.Presets = config.Presets{ - Catalog: "my_catalog", - Schema: "my_schema", - } - ctx := context.Background() - - // Initial scan: record all fields that contain placeholders. - // We do this before the first apply so we can verify no changes occur. 
- var recordedFields []recordedField - require.NoError(t, b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { - _, err := dyn.Walk(b.Config.Value(), func(p dyn.Path, v dyn.Value) (dyn.Value, error) { - if v.Kind() == dyn.KindString { - val := v.MustString() - if strings.Contains(val, "") || strings.Contains(val, "") { - pathCopy := make(dyn.Path, len(p)) - copy(pathCopy, p) - recordedFields = append(recordedFields, recordedField{ - Path: pathCopy, - PathString: pathCopy.String(), - Placeholder: val, - Expected: replacePlaceholders(val, "my_catalog", "my_schema"), - }) - } - } - return v, nil - }) - return root, err - })) - - // Convert the recordedFields to a set for easier lookup - recordedSet := make(map[string]struct{}) - for _, field := range recordedFields { - recordedSet[field.PathString] = struct{}{} - if i := strings.Index(field.PathString, "["); i >= 0 { - // For entries like resources.jobs.key.parameters[1].default, just add resources.jobs.key.parameters - recordedSet[field.PathString[:i]] = struct{}{} - } - } - - // Stage 1: Apply presets before cleanup, should be no-op. - diags := bundle.Apply(ctx, b, mutator.ApplyPresets()) - require.False(t, diags.HasError(), "unexpected error before cleanup: %v", diags.Error()) - - // Verify that no recorded fields changed - verifyNoChangesBeforeCleanup(t, b.Config.Value(), recordedFields) - - // Stage 2: Cleanup: Walk over rootVal and remove placeholders, adjusting recordedFields Expected values. - require.NoError(t, b.Config.Mutate(func(root dyn.Value) (dyn.Value, error) { - for _, f := range recordedFields { - value, err := dyn.GetByPath(root, f.Path) - require.NoError(t, err) - - val := value.MustString() - cleanedVal := removePlaceholders(val) - root, err = dyn.SetByPath(root, f.Path, dyn.V(cleanedVal)) - require.NoError(t, err) - } - root, err := dyn.Set(root, "resources.jobs.key.parameters", dyn.NilValue) - require.NoError(t, err) - return root, nil - })) - - // Stage 3: Apply presets after cleanup. - diags = bundle.Apply(ctx, b, mutator.ApplyPresets()) - require.False(t, diags.HasError(), "unexpected error after cleanup: %v", diags.Error()) - - // Verify that fields have the expected replacements - config := b.Config.Value() - for _, f := range recordedFields { - val, err := dyn.GetByPath(config, f.Path) - require.NoError(t, err, "failed to get path %s", f.Path) - assert.Equal(t, f.Expected, val.MustString(), "preset value expected for %s based on placeholder %s", f.Path, f.Placeholder) - } - - // Stage 4: Check completeness - expectedFields := findCatalogSchemaFields() - assert.GreaterOrEqual(t, len(expectedFields), 42, "expected at least 42 catalog/schema fields, but got %d", len(expectedFields)) - for _, field := range expectedFields { - if _, recorded := recordedSet[field]; !recorded { - if _, ignored := PresetsIgnoredFields[field]; !ignored { - t.Errorf("Field %s was not included in the catalog/schema presets test. 
If this is a new field, please add it to PresetsMock or PresetsIgnoredFields and add support for it as appropriate.", field) - } - } - } -} - -func verifyNoChangesBeforeCleanup(t *testing.T, rootVal dyn.Value, recordedFields []recordedField) { - t.Helper() - - for _, f := range recordedFields { - val, err := dyn.GetByPath(rootVal, f.Path) - require.NoError(t, err, "failed to get path %s", f.Path) - require.Equal(t, f.Placeholder, val.MustString(), - "expected placeholder '%s' at %s to remain unchanged before cleanup", f.Placeholder, f.Path) - } -} - -// findCatalogSchemaFields finds all fields in config.Resources that might refer -// to a catalog or schema. Returns a slice of field paths. -func findCatalogSchemaFields() []string { - visited := make(map[reflect.Type]struct{}) - var results []string - - // verifyTypeFields is a recursive function to verify the fields of a given type - var walkTypeFields func(rt reflect.Type, path string) - walkTypeFields = func(rt reflect.Type, path string) { - if _, seen := visited[rt]; seen { - return - } - visited[rt] = struct{}{} - - switch rt.Kind() { - case reflect.Slice, reflect.Array: - walkTypeFields(rt.Elem(), path+"[0]") - case reflect.Map: - walkTypeFields(rt.Elem(), path+".key") - case reflect.Ptr: - walkTypeFields(rt.Elem(), path) - case reflect.Struct: - for i := 0; i < rt.NumField(); i++ { - ft := rt.Field(i) - jsonTag := ft.Tag.Get("json") - if jsonTag == "" || jsonTag == "-" { - // Ignore field names when there's no JSON tag, e.g. for Jobs.JobSettings - walkTypeFields(ft.Type, path) - continue - } - - fieldName := strings.Split(jsonTag, ",")[0] - fieldPath := path + "." + fieldName - - if isCatalogOrSchemaField(fieldName) { - results = append(results, fieldPath) - } - - walkTypeFields(ft.Type, fieldPath) - } - } - } - - var r config.Resources - walkTypeFields(reflect.TypeOf(r), "resources") - return results -} - -// isCatalogOrSchemaField returns true for a field names in config.Resources that we suspect could contain a catalog or schema name -func isCatalogOrSchemaField(name string) bool { - return strings.Contains(name, "catalog") || - strings.Contains(name, "schema") || - strings.Contains(name, "parameters") || - strings.Contains(name, "params") -} - -func removePlaceholders(value string) string { - value = strings.ReplaceAll(value, ".", "") - value = strings.ReplaceAll(value, ".", "") - value = strings.ReplaceAll(value, "", "") - value = strings.ReplaceAll(value, "", "") - return value -} - -func replacePlaceholders(placeholder, catalog, schema string) string { - expected := strings.ReplaceAll(placeholder, "", catalog) - expected = strings.ReplaceAll(expected, "", schema) - return expected -} diff --git a/bundle/phases/initialize.go b/bundle/phases/initialize.go index b62fa0b143..d3b6f81f92 100644 --- a/bundle/phases/initialize.go +++ b/bundle/phases/initialize.go @@ -70,6 +70,7 @@ func Initialize() bundle.Mutator { mutator.ConfigureDashboardDefaults(), mutator.ProcessTargetMode(), mutator.ApplyPresets(), + mutator.ApplyPresetsCatalogSchema(), mutator.DefaultQueueing(), mutator.ExpandPipelineGlobPaths(), From 65f10d187dafd4d1456681f680f20f04bfea04c1 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 09:28:38 +0100 Subject: [PATCH 18/27] Cleanup --- .../mutator/apply_presets_catalog_schema.go | 211 ++++++++---------- 1 file changed, 98 insertions(+), 113 deletions(-) diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index 04b6dc73f1..6276d7c525 
100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -28,34 +28,40 @@ func (m *applyPresetsCatalogSchema) Name() string { } func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnostics { - diags := validateCatalogAndSchema(b) - if diags.HasError() { + diags := diag.Diagnostics{} + p := b.Config.Presets + r := b.Config.Resources + + if p.Catalog == "" && p.Schema == "" { return diags } - - r := b.Config.Resources - p := b.Config.Presets + if (p.Schema == "") || (p.Catalog == "" && p.Schema != "") { + return diag.Diagnostics{{ + Summary: "presets.catalog and presets.schema must always be set together", + Severity: diag.Error, + Locations: []dyn.Location{b.Config.GetLocation("presets")}, + }} + } // Jobs for key, j := range r.Jobs { if j.JobSettings == nil { continue } - if p.Catalog != "" || p.Schema != "" { - for _, task := range j.Tasks { - if task.DbtTask != nil { - if task.DbtTask.Catalog == "" { - task.DbtTask.Catalog = p.Catalog - } - if task.DbtTask.Schema == "" { - task.DbtTask.Schema = p.Schema - } + + for _, task := range j.Tasks { + if task.DbtTask != nil { + if task.DbtTask.Catalog == "" { + task.DbtTask.Catalog = p.Catalog + } + if task.DbtTask.Schema == "" { + task.DbtTask.Schema = p.Schema } } - - diags = diags.Extend(addCatalogSchemaParameters(b, key, j, p)) - diags = diags.Extend(recommendCatalogSchemaUsage(b, ctx, key, j)) } + + diags = diags.Extend(addCatalogSchemaParameters(b, key, j, p)) + diags = diags.Extend(recommendCatalogSchemaUsage(b, ctx, key, j)) } // Pipelines @@ -63,58 +69,53 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if pl.PipelineSpec == nil { continue } - if p.Catalog != "" && p.Schema != "" { - if pl.Catalog == "" { - pl.Catalog = p.Catalog + if pl.Catalog == "" { + pl.Catalog = p.Catalog + } + if pl.GatewayDefinition != nil { + if pl.GatewayDefinition.GatewayStorageCatalog == "" { + pl.GatewayDefinition.GatewayStorageCatalog = p.Catalog } - if pl.Target == "" { - pl.Target = p.Schema + if pl.GatewayDefinition.GatewayStorageSchema == "" { + pl.GatewayDefinition.GatewayStorageSchema = p.Schema } - if pl.GatewayDefinition != nil { - if pl.GatewayDefinition.GatewayStorageCatalog == "" { - pl.GatewayDefinition.GatewayStorageCatalog = p.Catalog + } + if pl.IngestionDefinition != nil { + for _, obj := range pl.IngestionDefinition.Objects { + if obj.Report != nil { + if obj.Report.DestinationCatalog == "" { + obj.Report.DestinationCatalog = p.Catalog + } + if obj.Report.DestinationSchema == "" { + obj.Report.DestinationSchema = p.Schema + } } - if pl.GatewayDefinition.GatewayStorageSchema == "" { - pl.GatewayDefinition.GatewayStorageSchema = p.Schema + if obj.Schema != nil { + if obj.Schema.SourceCatalog == "" { + obj.Schema.SourceCatalog = p.Catalog + } + if obj.Schema.SourceSchema == "" { + obj.Schema.SourceSchema = p.Schema + } + if obj.Schema.DestinationCatalog == "" { + obj.Schema.DestinationCatalog = p.Catalog + } + if obj.Schema.DestinationSchema == "" { + obj.Schema.DestinationSchema = p.Schema + } } - } - if pl.IngestionDefinition != nil { - for _, obj := range pl.IngestionDefinition.Objects { - if obj.Report != nil { - if obj.Report.DestinationCatalog == "" { - obj.Report.DestinationCatalog = p.Catalog - } - if obj.Report.DestinationSchema == "" { - obj.Report.DestinationSchema = p.Schema - } + if obj.Table != nil { + if obj.Table.SourceCatalog == "" { + obj.Table.SourceCatalog = p.Catalog } - if 
obj.Schema != nil { - if obj.Schema.SourceCatalog == "" { - obj.Schema.SourceCatalog = p.Catalog - } - if obj.Schema.SourceSchema == "" { - obj.Schema.SourceSchema = p.Schema - } - if obj.Schema.DestinationCatalog == "" { - obj.Schema.DestinationCatalog = p.Catalog - } - if obj.Schema.DestinationSchema == "" { - obj.Schema.DestinationSchema = p.Schema - } + if obj.Table.SourceSchema == "" { + obj.Table.SourceSchema = p.Schema } - if obj.Table != nil { - if obj.Table.SourceCatalog == "" { - obj.Table.SourceCatalog = p.Catalog - } - if obj.Table.SourceSchema == "" { - obj.Table.SourceSchema = p.Schema - } - if obj.Table.DestinationCatalog == "" { - obj.Table.DestinationCatalog = p.Catalog - } - if obj.Table.DestinationSchema == "" { - obj.Table.DestinationSchema = p.Schema - } + if obj.Table.DestinationCatalog == "" { + obj.Table.DestinationCatalog = p.Catalog + } + if obj.Table.DestinationSchema == "" { + obj.Table.DestinationSchema = p.Schema } } } @@ -127,36 +128,34 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) continue } - if p.Catalog != "" || p.Schema != "" { - if e.CreateServingEndpoint.AiGateway != nil && e.CreateServingEndpoint.AiGateway.InferenceTableConfig != nil { - if p.Catalog != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName == "" { - e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName = p.Catalog - } - if p.Schema != "" && e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName == "" { - e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName = p.Schema - } + if e.CreateServingEndpoint.AiGateway != nil && e.CreateServingEndpoint.AiGateway.InferenceTableConfig != nil { + if e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.CatalogName = p.Catalog } - - if e.CreateServingEndpoint.Config.AutoCaptureConfig != nil { - if p.Catalog != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName == "" { - e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName = p.Catalog - } - if p.Schema != "" && e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName == "" { - e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName = p.Schema - } + if e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName == "" { + e.CreateServingEndpoint.AiGateway.InferenceTableConfig.SchemaName = p.Schema } + } - for i := range e.CreateServingEndpoint.Config.ServedEntities { - e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, p.Catalog, p.Schema, - ) + if e.CreateServingEndpoint.Config.AutoCaptureConfig != nil { + if e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.CatalogName = p.Catalog } - for i := range e.CreateServingEndpoint.Config.ServedModels { - e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedModels[i].ModelName, p.Catalog, p.Schema, - ) + if e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName == "" { + e.CreateServingEndpoint.Config.AutoCaptureConfig.SchemaName = p.Schema } } + + for i := range e.CreateServingEndpoint.Config.ServedEntities { + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, p.Catalog, p.Schema, + ) + } + for i := range 
e.CreateServingEndpoint.Config.ServedModels { + e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( + e.CreateServingEndpoint.Config.ServedModels[i].ModelName, p.Catalog, p.Schema, + ) + } } // Registered models @@ -164,10 +163,10 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if m.CreateRegisteredModelRequest == nil { continue } - if p.Catalog != "" && m.CatalogName == "" { + if m.CatalogName == "" { m.CatalogName = p.Catalog } - if p.Schema != "" && m.SchemaName == "" { + if m.SchemaName == "" { m.SchemaName = p.Schema } } @@ -177,11 +176,9 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if q.CreateMonitor == nil { continue } - if p.Catalog != "" && p.Schema != "" { - q.TableName = fullyQualifyName(q.TableName, p.Catalog, p.Schema) - if q.OutputSchemaName == "" { - q.OutputSchemaName = p.Catalog + "." + p.Schema - } + q.TableName = fullyQualifyName(q.TableName, p.Catalog, p.Schema) + if q.OutputSchemaName == "" { + q.OutputSchemaName = p.Catalog + "." + p.Schema } } @@ -190,10 +187,10 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if s.CreateSchema == nil { continue } - if p.Catalog != "" && s.CatalogName == "" { + if s.CatalogName == "" { s.CatalogName = p.Catalog } - if p.Schema != "" && s.Name == "" { + if s.Name == "" { // If there is a schema preset such as 'dev', we directly // use that name and don't add any prefix (which might result in dev_dev). s.Name = p.Schema @@ -203,21 +200,9 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) return diags } -func validateCatalogAndSchema(b *bundle.Bundle) diag.Diagnostics { - p := b.Config.Presets - if (p.Catalog != "" && p.Schema == "") || (p.Catalog == "" && p.Schema != "") { - return diag.Diagnostics{{ - Summary: "presets.catalog and presets.schema must always be set together", - Severity: diag.Error, - Locations: []dyn.Location{b.Config.GetLocation("presets")}, - }} - } - return diag.Diagnostics{} -} - // addCatalogSchemaParameters adds catalog and schema parameters to a job if they don't already exist. // Returns any warning diagnostics for existing parameters. 
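// For illustration, a sketch of the effect with assumed preset values (presets.catalog: "main",
// presets.schema: "dev"): a job that declares neither parameter ends up with the equivalent of
//
//	parameters:
//	  - name: catalog
//	    default: main
//	  - name: schema
//	    default: dev
//
// while a job that already declares either parameter keeps its own value and a warning is returned.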
-func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job, t config.Presets) diag.Diagnostics { +func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job, p config.Presets) diag.Diagnostics { var diags diag.Diagnostics // Check for existing catalog/schema parameters @@ -250,18 +235,18 @@ func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job } // Add catalog parameter if not already present - if !hasCatalog && t.Catalog != "" { + if !hasCatalog { job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ Name: "catalog", - Default: t.Catalog, + Default: p.Catalog, }) } // Add schema parameter if not already present - if !hasSchema && t.Schema != "" { + if !hasSchema { job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ Name: "schema", - Default: t.Schema, + Default: p.Schema, }) } From e0b6faddbe0012b7a3e130bc863fabcfd4dfc385 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 09:39:30 +0100 Subject: [PATCH 19/27] Add a recommendation for pipeline catalogs --- .../mutator/apply_presets_catalog_schema.go | 29 ++++++++++++++++++- .../apply_presets_catalog_schema_test.go | 12 +++++++- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index 6276d7c525..269cfd47b4 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -65,13 +65,24 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) } // Pipelines - for _, pl := range r.Pipelines { + allSameCatalog := allPipelinesSameCatalog(&r) + for key, pl := range r.Pipelines { if pl.PipelineSpec == nil { continue } if pl.Catalog == "" { pl.Catalog = p.Catalog } + if allSameCatalog && pl.Catalog == p.Catalog { + // Just for the common case where all pipelines have the same catalog, + // we show a recommendation to leave it out and rely on presets. + // This can happen when using the original default template. + diags = diags.Extend(diag.Diagnostics{{ + Summary: "Omit the catalog field since it will be automatically populated from presets.catalog", + Severity: diag.Recommendation, + Locations: b.Config.GetLocations("resources.pipelines." 
+ key + ".catalog"), + }}) + } if pl.GatewayDefinition != nil { if pl.GatewayDefinition.GatewayStorageCatalog == "" { pl.GatewayDefinition.GatewayStorageCatalog = p.Catalog @@ -347,3 +358,19 @@ func fileIncludesPattern(ctx context.Context, filePath string, expected string) } return matched } + +func allPipelinesSameCatalog(r *config.Resources) bool { + var firstCatalog string + + for _, pl := range r.Pipelines { + if pl.PipelineSpec == nil || pl.PipelineSpec.Catalog == "" { + return false + } + if firstCatalog == "" { + firstCatalog = pl.PipelineSpec.Catalog + } else if pl.PipelineSpec.Catalog != firstCatalog { + return false + } + } + return firstCatalog != "" +} diff --git a/bundle/config/mutator/apply_presets_catalog_schema_test.go b/bundle/config/mutator/apply_presets_catalog_schema_test.go index df4efeb8a4..d0f9975fc0 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema_test.go +++ b/bundle/config/mutator/apply_presets_catalog_schema_test.go @@ -179,7 +179,7 @@ func TestApplyPresetsCatalogSchemaWhenAlreadySet(t *testing.T) { b := mockPresetsCatalogSchema() recordedFields := recordPlaceholderFields(t, b) - diags := bundle.Apply(context.Background(), b, mutator.ApplyPresets()) + diags := bundle.Apply(context.Background(), b, mutator.ApplyPresetsCatalogSchema()) require.NoError(t, diags.Error()) for _, f := range recordedFields { @@ -190,6 +190,16 @@ func TestApplyPresetsCatalogSchemaWhenAlreadySet(t *testing.T) { } } +func TestApplyPresetsCatalogSchemaRecommmendRemovingCatalog(t *testing.T) { + b := mockPresetsCatalogSchema() + b.Config.Resources.Jobs["key"].Parameters = nil // avoid warnings about the job parameters + b.Config.Resources.Pipelines["key"].Catalog = "my_catalog" + + diags := bundle.Apply(context.Background(), b, mutator.ApplyPresetsCatalogSchema()) + require.Equal(t, 1, len(diags)) + require.Equal(t, "Omit the catalog field since it will be automatically populated from presets.catalog", diags[0].Summary) +} + func TestApplyPresetsCatalogSchemaWhenNotSet(t *testing.T) { b := mockPresetsCatalogSchema() recordedFields := recordPlaceholderFields(t, b) From 8eb96ccb7d642667d596c6c7d5f5015a5432e296 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 09:44:01 +0100 Subject: [PATCH 20/27] Cleanup --- bundle/config/mutator/apply_presets.go | 33 ++++++++++---------------- 1 file changed, 13 insertions(+), 20 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 14a99e41dc..366c24a68e 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -48,8 +48,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos prefix := t.NamePrefix tags := toTagArray(t.Tags) - // Jobs presets. - // Supported: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus + // Jobs presets: Prefix, Tags, JobsMaxConcurrentRuns, TriggerPauseStatus for key, j := range r.Jobs { if j.JobSettings == nil { diags = diags.Extend(diag.Errorf("job %s is not defined", key)) @@ -85,9 +84,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Pipelines presets. 
- // Supported: Prefix, PipelinesDevelopment - // Not supported: Tags (as of 2024-10 not in pipelines API) + // Pipelines presets: Prefix, PipelinesDevelopment + // Not supported: Tags (not in API as of 2024-12) for key, p := range r.Pipelines { if p.PipelineSpec == nil { diags = diags.Extend(diag.Errorf("pipeline %s is not defined", key)) @@ -102,8 +100,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Models presets - // Supported: Prefix, Tags + // Models presets: Prefix, Tags for key, m := range r.Models { if m.Model == nil { diags = diags.Extend(diag.Errorf("model %s is not defined", key)) @@ -121,8 +118,7 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Experiments presets - // Supported: Prefix, Tags + // Experiments presets: Prefix, Tags for key, e := range r.Experiments { if e.Experiment == nil { diags = diags.Extend(diag.Errorf("experiment %s is not defined", key)) @@ -150,9 +146,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Model serving endpoint presets - // Supported: Prefix - // Not supported: Tags (not in API as of 2024-10) + // Model serving endpoint presets: Prefix + // Not supported: Tags (not in API as of 2024-12) for key, e := range r.ModelServingEndpoints { if e.CreateServingEndpoint == nil { diags = diags.Extend(diag.Errorf("model serving endpoint %s is not defined", key)) @@ -161,9 +156,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos e.Name = normalizePrefix(prefix) + e.Name } - // Registered models presets - // Supported: Prefix - // Not supported: Tags (not in API as of 2024-10) + // Registered models presets: Prefix + // Not supported: Tags (not in API as of 2024-12) for key, m := range r.RegisteredModels { if m.CreateRegisteredModelRequest == nil { diags = diags.Extend(diag.Errorf("registered model %s is not defined", key)) @@ -172,9 +166,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos m.Name = normalizePrefix(prefix) + m.Name } - // Quality monitors presets - // Supported: Schedule - // Not supported: Tags (not in API as of 2024-10) + // Quality monitors presets: Schedule + // Not supported: Tags (not in API as of 2024-12) for key, q := range r.QualityMonitors { if q.CreateMonitor == nil { diags = diags.Extend(diag.Errorf("quality monitor %s is not defined", key)) @@ -190,8 +183,8 @@ func (m *applyPresets) Apply(ctx context.Context, b *bundle.Bundle) diag.Diagnos } } - // Schemas: Prefix only - // Not supported: Tags (as of 2024-10, only supported in Databricks UI / SQL API) + // Schemas: Prefix + // Not supported: Tags (only supported in Databricks UI / SQL API as of 2024-12) for key, s := range r.Schemas { if s.CreateSchema == nil { diags = diags.Extend(diag.Errorf("schema %s is not defined", key)) From 86cf7e0732ca8c446b476f8435318e1fb3c64e1a Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 16:49:46 +0100 Subject: [PATCH 21/27] Restore Target logic --- bundle/config/mutator/apply_presets_catalog_schema.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index 269cfd47b4..692b0792d6 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -73,6 +73,11 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if pl.Catalog 
== "" { pl.Catalog = p.Catalog } + if pl.Schema == "" && pl.Target == "" { + // As of 2024-12, the Schema field isn't broadly supported yet in the pipelines API. + // Until it is, we set the Target field. + pl.Target = p.Schema + } if allSameCatalog && pl.Catalog == p.Catalog { // Just for the common case where all pipelines have the same catalog, // we show a recommendation to leave it out and rely on presets. From be08585fb76454bfcab97612576889e54497a64a Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 16:59:18 +0100 Subject: [PATCH 22/27] presets-catalog-schema-as-params # Your branch is ahead of 'origin/presets-catalog-schema-as-params' by 67 commits. # (use "git push" to publish your local commits) # # Changes to be committed: # modified: dbt-sql/databricks_template_schema.json # modified: default-python/databricks_template_schema.json # modified: default-python/template/{{.project_name}}/databricks.yml.tmpl # modified: default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl # modified: default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl # modified: default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl # modified: default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl # modified: default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl # modified: default-sql/databricks_template_schema.json # # Untracked files: # ../../../.cursorrules # ../../../bundle/config/resources/:tmp:tmp.py # ../../../delme.py # ../../../pr-cache-current-user-me # ../../../pr-cleanup-warnings.md # ../../../pr-contrib-templates.md # ../../../pr-cp-diag-ids-for-all.md # ../../../pr-cp-serverless-templates.md # ../../../pr-presets-catalog-schema-using-params.md # ../../../pr-update-sync-command-help.md # Revert template changes for now --- .../dbt-sql/databricks_template_schema.json | 2 +- .../databricks_template_schema.json | 42 ++-------- .../{{.project_name}}/databricks.yml.tmpl | 15 +--- .../resources/{{.project_name}}.job.yml.tmpl | 6 +- .../{{.project_name}}.pipeline.yml.tmpl | 7 ++ .../scratch/exploration.ipynb.tmpl | 79 ++----------------- .../{{.project_name}}/src/notebook.ipynb.tmpl | 70 ++-------------- .../src/{{.project_name}}/main.py.tmpl | 34 ++------ .../databricks_template_schema.json | 2 +- 9 files changed, 41 insertions(+), 216 deletions(-) diff --git a/libs/template/templates/dbt-sql/databricks_template_schema.json b/libs/template/templates/dbt-sql/databricks_template_schema.json index bb512153f4..cccf145dc5 100644 --- a/libs/template/templates/dbt-sql/databricks_template_schema.json +++ b/libs/template/templates/dbt-sql/databricks_template_schema.json @@ -45,7 +45,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide a default schema during development.\ndefault_schema", + "description": "\nPlease provide an initial schema during development.\ndefault_schema", "order": 5 } }, diff --git a/libs/template/templates/default-python/databricks_template_schema.json b/libs/template/templates/default-python/databricks_template_schema.json index 461aaa0c46..d53bad91ab 100644 --- a/libs/template/templates/default-python/databricks_template_schema.json +++ b/libs/template/templates/default-python/databricks_template_schema.json @@ -4,7 +4,7 @@ "project_name": { "type": "string", "default": "my_project", - "description": "\nPlease provide a unique name for this project.\nproject_name", + 
"description": "Please provide the following details to tailor the template to your preferences.\n\nUnique name for this project", "order": 1, "pattern": "^[A-Za-z0-9_]+$", "pattern_match_failure_message": "Name must consist of letters, numbers, and underscores." @@ -13,55 +13,23 @@ "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "\nWould you like to include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'?", + "description": "Include a stub (sample) notebook in '{{.project_name}}{{path_separator}}src'", "order": 2 }, "include_dlt": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Would you like to include a stub (sample) Delta Live Tables pipeline in '{{.project_name}}{{path_separator}}src'?", + "description": "Include a stub (sample) Delta Live Tables pipeline in '{{.project_name}}{{path_separator}}src'", "order": 3 }, "include_python": { "type": "string", "default": "yes", "enum": ["yes", "no"], - "description": "Would you like to include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'?", + "description": "Include a stub (sample) Python package in '{{.project_name}}{{path_separator}}src'", "order": 4 - }, - "default_catalog": { - "type": "string", - "default": "{{default_catalog}}", - "pattern": "^\\w*$", - "pattern_match_failure_message": "Invalid catalog name.", - "description": "\nPlease provide an initial catalog{{if eq (default_catalog) \"\"}} (leave blank when not using Unity Catalog){{end}}.\ndefault_catalog", - "order": 5 - }, - "personal_schemas": { - "type": "string", - "description": "\nWould you like to use a personal schema for each user working on this project? (e.g., 'catalog.{{short_name}}')\npersonal_schemas", - "enum": [ - "yes, use a schema based on the current user name during development", - "no, use a shared schema during development" - ], - "order": 6 - }, - "shared_schema": { - "skip_prompt_if": { - "properties": { - "personal_schemas": { - "const": "yes, use a schema based on the current user name during development" - } - } - }, - "type": "string", - "default": "default", - "pattern": "^\\w+$", - "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide default schema during development.\ndefault_schema", - "order": 7 } }, - "success_message": "\nWorkspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml').\nworkspace_host: {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." + "success_message": "Workspace to use (auto-detected, edit in '{{.project_name}}/databricks.yml'): {{workspace_host}}\n\n✨ Your new project has been created in the '{{.project_name}}' directory!\n\nPlease refer to the README.md file for \"getting started\" instructions.\nSee also the documentation at https://docs.databricks.com/dev-tools/bundles/index.html." 
} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl index 421fe5014f..c42b822a8d 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/databricks.yml.tmpl @@ -6,13 +6,6 @@ bundle: include: - resources/*.yml -{{- $dev_schema := .shared_schema }} -{{- $prod_schema := .shared_schema }} -{{- if (regexp "^yes").MatchString .personal_schemas}} - {{- $dev_schema = "${workspace.current_user.short_name}"}} - {{- $prod_schema = "default"}} -{{- end}} - targets: dev: # The default target uses 'mode: development' to create a development copy. @@ -23,9 +16,6 @@ targets: default: true workspace: host: {{workspace_host}} - presets: - catalog: {{.default_catalog}} - schema: {{$dev_schema}} prod: mode: production @@ -36,6 +26,5 @@ targets: permissions: - {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} level: CAN_MANAGE - presets: - catalog: {{.default_catalog}} - schema: {{$prod_schema}} + run_as: + {{if is_service_principal}}service_principal{{else}}user{{end}}_name: {{user_name}} diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl index 0ea69a75ae..5211e3894b 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.job.yml.tmpl @@ -16,12 +16,16 @@ resources: interval: 1 unit: DAYS - {{if not is_service_principal -}} + {{- if not is_service_principal}} + email_notifications: on_failure: - {{user_name}} + {{else}} + {{end -}} + tasks: {{- if eq .include_notebook "yes" }} - task_key: notebook_task diff --git a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl index c3f94cb1c8..50f11fe2cc 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/resources/{{.project_name}}.pipeline.yml.tmpl @@ -3,6 +3,13 @@ resources: pipelines: {{.project_name}}_pipeline: name: {{.project_name}}_pipeline + {{- if or (eq default_catalog "") (eq default_catalog "hive_metastore")}} + ## Specify the 'catalog' field to configure this pipeline to make use of Unity Catalog: + # catalog: catalog_name + {{- else}} + catalog: {{default_catalog}} + {{- end}} + target: {{.project_name}}_${bundle.target} libraries: - notebook: path: ../src/dlt_pipeline.ipynb diff --git a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl index adb353c58c..42164dff07 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/scratch/exploration.ipynb.tmpl @@ -27,25 +27,15 @@ }, "outputs": [], "source": [ - {{- if (eq .include_python "yes") }} + {{- if (eq 
.include_python "yes") }} "import sys\n", "sys.path.append('../src')\n", "from {{.project_name}} import main\n", "\n", - {{- /* We can use the short form here without 'dbutils.text()' since the widgets are defined in the metadata below. */}} - "catalog = dbutils.widgets.get('catalog')\n", - "schema = dbutils.widgets.get('schema')\n", - "spark.sql(f'USE {catalog}.{schema}')\n", - "\n", - "spark.sql('SELECT * FROM example').show(10)" - {{- else}} - "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", - "catalog = dbutils.widgets.get('catalog')\n", - "schema = dbutils.widgets.get('schema')\n", - "spark.sql(f'USE {catalog}.{schema}')\n", - "\n", - "spark.sql('SELECT * FROM example').show(10)" - {{- end}} + "main.get_taxis(spark).show(10)" + {{else}} + "spark.range(10)" + {{end -}} ] } ], @@ -56,63 +46,8 @@ "notebookMetadata": { "pythonIndentUnit": 2 }, - "notebookName": "exploration", - "widgets": { - "catalog": { - "currentValue": "{{.default_catalog}}", - "nuid": "c47e96d8-5751-4c8a-9d6b-5c6c7c3f1234", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "{{.default_catalog}}", - "label": null, - "name": "catalog", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "{{.default_catalog}}", - "label": null, - "name": "catalog", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } - }, -{{- $dev_schema := .shared_schema }} -{{- if (regexp "^yes").MatchString .personal_schemas}} - {{- $dev_schema = "{{short_name}}"}} -{{- end}} - "schema": { - "currentValue": "{{$dev_schema}}", - "nuid": "c47e96d8-5751-4c8a-9d6b-5c6c7c3f5678", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "{{$dev_schema}}", - "label": null, - "name": "schema", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "{{$dev_schema}}", - "label": null, - "name": "schema", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } + "notebookName": "ipynb-notebook", + "widgets": {} }, "kernelspec": { "display_name": "Python 3", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl index 0924e60f35..6782a053ba 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/notebook.ipynb.tmpl @@ -23,11 +23,8 @@ "metadata": {}, "outputs": [], "source": [ - "# Load default catalog and schema as widget and set their values as the default catalog / schema\n", - {{- /* We can use the short form here without 'dbutils.text()' since the widgets are defined in the metadata below. 
*/}} - "catalog = dbutils.widgets.get('catalog')\n", - "schema = dbutils.widgets.get('schema')\n", - "spark.sql(f'USE {catalog}.{schema}')" + "%load_ext autoreload\n", + "%autoreload 2" ] }, { @@ -50,9 +47,9 @@ {{- if (eq .include_python "yes") }} "from {{.project_name}} import main\n", "\n", - "main.create_example_table()" + "main.get_taxis(spark).show(10)" {{else}} - "spark.sql("CREATE OR REPLACE TABLE example AS SELECT 'example table' AS text_column")" + "spark.range(10)" {{end -}} ] } @@ -65,64 +62,7 @@ "pythonIndentUnit": 2 }, "notebookName": "notebook", - "widgets": { - "catalog": { - "currentValue": "{{.default_catalog}}", - "nuid": "3965fc9c-8080-45b1-bee3-f75cef7685b4", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "{{.default_catalog}}", - "label": null, - "name": "catalog", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "{{.default_catalog}}", - "label": null, - "name": "catalog", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } - }, -{{- $dev_schema := .shared_schema }} -{{- if (regexp "^yes").MatchString .personal_schemas}} - {{- $dev_schema = "{{short_name}}"}} -{{- end}} - "schema": { - "currentValue": "{{$dev_schema}}", - "nuid": "6ec0d70f-39bf-4859-a510-02c3e3d59bff", - "typedWidgetInfo": { - "autoCreated": false, - "defaultValue": "{{$dev_schema}}", - "label": null, - "name": "schema", - "options": { - "widgetDisplayType": "Text", - "validationRegex": null - }, - "parameterDataType": "String" - }, - "widgetInfo": { - "widgetType": "text", - "defaultValue": "{{$dev_schema}}", - "label": null, - "name": "schema", - "options": { - "widgetType": "text", - "autoCreated": null, - "validationRegex": null - } - } - } - } + "widgets": {} }, "kernelspec": { "display_name": "Python 3", diff --git a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl index e79920a9e2..c514c6dc5d 100644 --- a/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl +++ b/libs/template/templates/default-python/template/{{.project_name}}/src/{{.project_name}}/main.py.tmpl @@ -1,39 +1,21 @@ from pyspark.sql import SparkSession, DataFrame -import argparse +def get_taxis(spark: SparkSession) -> DataFrame: + return spark.read.table("samples.nyctaxi.trips") + + +# Create a new Databricks Connect session. If this fails, +# check that you have configured Databricks Connect correctly. +# See https://docs.databricks.com/dev-tools/databricks-connect.html. def get_spark() -> SparkSession: - """ - Create a new Databricks Connect session. If this fails, - check that you have configured Databricks Connect correctly. - See https://docs.databricks.com/dev-tools/databricks-connect.html. - """ try: from databricks.connect import DatabricksSession return DatabricksSession.builder.getOrCreate() except ImportError: return SparkSession.builder.getOrCreate() -def get_taxis(spark: SparkSession) -> DataFrame: - return spark.read.table("samples.nyctaxi.trips") - -def create_example_table(): - """ - Create a table called 'example' in the default catalog and schema. - """ - get_spark().sql("CREATE OR REPLACE TABLE example AS SELECT 'example table' AS text_column") - def main(): - # Set the catalog and schema for the current session. 
- # In the default template, these parameters are set - # using the 'catalog' and 'schema' presets in databricks.yml. - parser = argparse.ArgumentParser() - parser.add_argument('--catalog', required=True) - parser.add_argument('--schema', required=True) - args, unknown = parser.parse_known_args() - spark = get_spark() - spark.sql(f"USE {args.catalog}.{args.schema}") - - create_example_table() + get_taxis(get_spark()).show(5) if __name__ == '__main__': main() diff --git a/libs/template/templates/default-sql/databricks_template_schema.json b/libs/template/templates/default-sql/databricks_template_schema.json index 81fa7e2d22..113cbef642 100644 --- a/libs/template/templates/default-sql/databricks_template_schema.json +++ b/libs/template/templates/default-sql/databricks_template_schema.json @@ -45,7 +45,7 @@ "default": "default", "pattern": "^\\w+$", "pattern_match_failure_message": "Invalid schema name.", - "description": "\nPlease provide a default schema during development.\ndefault_schema", + "description": "\nPlease provide an initial schema during development.\ndefault_schema", "order": 5 } }, From fbd476093fdf2d77aa6aee12f92d9ca219b6ed48 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 17:09:04 +0100 Subject: [PATCH 23/27] Fix test --- bundle/internal/schema/annotations.yml | 6 ++++++ bundle/schema/jsonschema.json | 8 ++++++++ 2 files changed, 14 insertions(+) diff --git a/bundle/internal/schema/annotations.yml b/bundle/internal/schema/annotations.yml index e52189daa1..556f9729bd 100644 --- a/bundle/internal/schema/annotations.yml +++ b/bundle/internal/schema/annotations.yml @@ -97,6 +97,12 @@ github.com/databricks/cli/bundle/config.Lock: "description": |- Whether to force this lock if it is enabled. github.com/databricks/cli/bundle/config.Presets: + "catalog": + "description": |- + The default catalog to use for all resources. + "schema": + "description": |- + The default schema to use for all resources. "jobs_max_concurrent_runs": "description": |- The maximum concurrent runs for a job. diff --git a/bundle/schema/jsonschema.json b/bundle/schema/jsonschema.json index 8e8efa7fc1..93dc179f03 100644 --- a/bundle/schema/jsonschema.json +++ b/bundle/schema/jsonschema.json @@ -1175,6 +1175,10 @@ { "type": "object", "properties": { + "catalog": { + "description": "The default catalog to use for all resources.", + "$ref": "#/$defs/string" + }, "jobs_max_concurrent_runs": { "description": "The maximum concurrent runs for a job.", "$ref": "#/$defs/int" @@ -1187,6 +1191,10 @@ "description": "Whether pipeline deployments should be locked in development mode.", "$ref": "#/$defs/bool" }, + "schema": { + "description": "The default schema to use for all resources.", + "$ref": "#/$defs/string" + }, "source_linked_deployment": { "description": "Whether to link the deployment to the bundle source.", "$ref": "#/$defs/bool" From f3923da6a2ecc2a0d6686a9e59e3f8b0c38d598b Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 20:38:55 +0100 Subject: [PATCH 24/27] Cleanup --- bundle/config/mutator/apply_presets.go | 3 +- .../mutator/apply_presets_catalog_schema.go | 30 +++++++------------ 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/bundle/config/mutator/apply_presets.go b/bundle/config/mutator/apply_presets.go index 366c24a68e..587ae1c5f0 100644 --- a/bundle/config/mutator/apply_presets.go +++ b/bundle/config/mutator/apply_presets.go @@ -22,7 +22,8 @@ type applyPresets struct{} // Apply all presets, e.g. 
the prefix presets that // adds a prefix to all names of all resources. -// Note the catalog/schema presets are applied in ApplyPresetsCatalogSchema. +// +// Note that the catalog/schema presets are applied in ApplyPresetsCatalogSchema. func ApplyPresets() *applyPresets { return &applyPresets{} } diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index 692b0792d6..7326c9aea7 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -35,7 +35,7 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if p.Catalog == "" && p.Schema == "" { return diags } - if (p.Schema == "") || (p.Catalog == "" && p.Schema != "") { + if (p.Schema == "" && p.Catalog != "") || (p.Catalog == "" && p.Schema != "") { return diag.Diagnostics{{ Summary: "presets.catalog and presets.schema must always be set together", Severity: diag.Error, @@ -164,12 +164,12 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) for i := range e.CreateServingEndpoint.Config.ServedEntities { e.CreateServingEndpoint.Config.ServedEntities[i].EntityName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, p.Catalog, p.Schema, + e.CreateServingEndpoint.Config.ServedEntities[i].EntityName, p, ) } for i := range e.CreateServingEndpoint.Config.ServedModels { e.CreateServingEndpoint.Config.ServedModels[i].ModelName = fullyQualifyName( - e.CreateServingEndpoint.Config.ServedModels[i].ModelName, p.Catalog, p.Schema, + e.CreateServingEndpoint.Config.ServedModels[i].ModelName, p, ) } } @@ -192,7 +192,7 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) if q.CreateMonitor == nil { continue } - q.TableName = fullyQualifyName(q.TableName, p.Catalog, p.Schema) + q.TableName = fullyQualifyName(q.TableName, p) if q.OutputSchemaName == "" { q.OutputSchemaName = p.Catalog + "." + p.Schema } @@ -207,8 +207,6 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) s.CatalogName = p.Catalog } if s.Name == "" { - // If there is a schema preset such as 'dev', we directly - // use that name and don't add any prefix (which might result in dev_dev). s.Name = p.Schema } } @@ -239,26 +237,19 @@ func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job diags = diags.Extend(diag.Diagnostics{{ Summary: fmt.Sprintf("job %s already has 'schema' parameter defined; ignoring preset value", key), Severity: diag.Warning, - Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key)}, + Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." + key + ".parameters")}, }}) } } } - // Initialize parameters if nil - if job.Parameters == nil { - job.Parameters = []jobs.JobParameterDefinition{} - } - - // Add catalog parameter if not already present + // Add catalog/schema parameters if !hasCatalog { job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ Name: "catalog", Default: p.Catalog, }) } - - // Add schema parameter if not already present if !hasSchema { job.Parameters = append(job.Parameters, jobs.JobParameterDefinition{ Name: "schema", @@ -329,14 +320,13 @@ func recommendCatalogSchemaUsage(b *bundle.Bundle, ctx context.Context, key stri } return diags - } // fullyQualifyName checks if the given name is already qualified with a catalog and schema. 
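// A name that already contains at least two '.' separators (for example "main.dev.my_model",
// an illustrative value) is treated as fully qualified and returned unchanged.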
// If not, and both catalog and schema are available, it prefixes the name with catalog.schema. // If name is empty, returns name as-is. -func fullyQualifyName(name, catalog, schema string) string { - if name == "" || catalog == "" || schema == "" { +func fullyQualifyName(name string, p config.Presets) string { + if name == "" || p.Catalog == "" || p.Schema == "" { return name } // If it's already qualified (contains at least two '.'), we assume it's fully qualified. @@ -346,10 +336,10 @@ func fullyQualifyName(name, catalog, schema string) string { return name } // Otherwise, fully qualify it - return fmt.Sprintf("%s.%s.%s", catalog, schema, name) + return fmt.Sprintf("%s.%s.%s", p.Catalog, p.Schema, name) } -func fileIncludesPattern(ctx context.Context, filePath string, expected string) bool { +func fileIncludesPattern(ctx context.Context, filePath, expected string) bool { content, err := os.ReadFile(filePath) if err != nil { log.Warnf(ctx, "failed to check file %s: %v", filePath, err) From 6d5cb1d07554511e4f82d09f35f0332d95783ebf Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 20:43:18 +0100 Subject: [PATCH 25/27] Add volumes and clean room notebooks --- .../mutator/apply_presets_catalog_schema.go | 13 ++++ .../apply_presets_catalog_schema_test.go | 60 ++++++++++++------- 2 files changed, 53 insertions(+), 20 deletions(-) diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index 7326c9aea7..ad28344447 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -211,6 +211,19 @@ func (m *applyPresetsCatalogSchema) Apply(ctx context.Context, b *bundle.Bundle) } } + // Volumes + for _, v := range r.Volumes { + if v.CreateVolumeRequestContent == nil { + continue + } + if v.CatalogName == "" { + v.CatalogName = p.Catalog + } + if v.SchemaName == "" { + v.SchemaName = p.Schema + } + } + return diags } diff --git a/bundle/config/mutator/apply_presets_catalog_schema_test.go b/bundle/config/mutator/apply_presets_catalog_schema_test.go index d0f9975fc0..5230b205d0 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema_test.go +++ b/bundle/config/mutator/apply_presets_catalog_schema_test.go @@ -3,6 +3,7 @@ package mutator_test import ( "context" "reflect" + "regexp" "strings" "testing" @@ -144,6 +145,14 @@ func mockPresetsCatalogSchema() *bundle.Bundle { }, }, }, + Volumes: map[string]*resources.Volume{ + "key": { + CreateVolumeRequestContent: &catalog.CreateVolumeRequestContent{ + CatalogName: "", + SchemaName: "", + }, + }, + }, }, Presets: config.Presets{ Catalog: "my_catalog", @@ -155,23 +164,24 @@ func mockPresetsCatalogSchema() *bundle.Bundle { // ignoredFields are fields that should be ignored in the completeness check var ignoredFields = map[string]string{ - "resources.pipelines.key.schema": "schema is still in private preview", - "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].python_wheel_task.named_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].python_wheel_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.job_parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].spark_jar_task.parameters": "catalog/schema are passed via job parameters", - 
"resources.jobs.key.tasks[0].spark_python_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].spark_submit_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].sql_task.parameters": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.jar_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.notebook_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.pipeline_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.python_named_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.python_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.spark_submit_params": "catalog/schema are passed via job parameters", - "resources.jobs.key.tasks[0].run_job_task.sql_params": "catalog/schema are passed via job parameters", - "resources.pipelines.key.ingestion_definition.objects[0].schema": "schema name is under schema.source_schema/destination_schema", + "resources.pipelines.key.schema": "schema is still in private preview", + "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.named_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].python_wheel_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.job_parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_jar_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_python_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].spark_submit_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].sql_task.parameters": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.jar_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.notebook_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.pipeline_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_named_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.python_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.spark_submit_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].run_job_task.sql_params": "catalog/schema are passed via job parameters", + "resources.jobs.key.tasks[0].clean_rooms_notebook_task.notebook_base_parameters": "catalog/schema are properties inside this struct", + "resources.pipelines.key.ingestion_definition.objects[0].schema": "schema name is under schema.source_schema/destination_schema", "resources.schemas": "schema name of schemas is under resources.schemas.key.Name", } @@ -236,11 +246,21 @@ func TestApplyPresetsCatalogSchemaCompleteness(t *testing.T) { // Convert the recordedFields to a set for easier lookup recordedPaths := make(map[string]struct{}) + arrayIndexPattern := regexp.MustCompile(`\[\d+\]`) for _, field := range 
recordedFields { recordedPaths[field.PathString] = struct{}{} - if i := strings.Index(field.PathString, "["); i >= 0 { - // For entries like resources.jobs.key.parameters[1].default, just add resources.jobs.key.parameters - recordedPaths[field.PathString[:i]] = struct{}{} + + // Add base paths for any array indices in the path. + // For example, for resources.jobs.key.parameters[0].default we add "resources.jobs.key.parameters + path := field.PathString + path = arrayIndexPattern.ReplaceAllString(path, "[0]") + for { + i := strings.Index(path, "[") + if i < 0 { + break + } + recordedPaths[path[:i]] = struct{}{} + path = path[i+1:] } } From 0acb2223bcabb41f176e1620cd8b5658818d32b9 Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Fri, 20 Dec 2024 21:05:37 +0100 Subject: [PATCH 26/27] Fix silly lint error --- bundle/config/mutator/translate_paths.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bundle/config/mutator/translate_paths.go b/bundle/config/mutator/translate_paths.go index b6396766e2..306efc29dd 100644 --- a/bundle/config/mutator/translate_paths.go +++ b/bundle/config/mutator/translate_paths.go @@ -71,7 +71,7 @@ type translateContext struct { // // localPath - the full local file system path. // localRelPath - the relative path from the base directory. -func GetLocalPath(ctx context.Context, b *bundle.Bundle, sourceDir string, p string) (string, string, error) { +func GetLocalPath(ctx context.Context, b *bundle.Bundle, sourceDir, p string) (string, string, error) { if p == "" { return "", "", fmt.Errorf("path cannot be empty") } From 204d3b08d13b90b71f3236313412f72b5333908e Mon Sep 17 00:00:00 2001 From: Lennart Kats Date: Sat, 21 Dec 2024 09:49:23 +0100 Subject: [PATCH 27/27] Cleanup --- .../config/mutator/apply_presets_catalog_schema.go | 4 ++-- .../mutator/apply_presets_catalog_schema_test.go | 14 +++++++++----- bundle/config/mutator/translate_paths.go | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/bundle/config/mutator/apply_presets_catalog_schema.go b/bundle/config/mutator/apply_presets_catalog_schema.go index ad28344447..227a25843b 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema.go +++ b/bundle/config/mutator/apply_presets_catalog_schema.go @@ -240,7 +240,7 @@ func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job if param.Name == "catalog" { hasCatalog = true diags = diags.Extend(diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s already has 'catalog' parameter defined; ignoring preset value", key), + Summary: fmt.Sprintf("job '%s' already has 'catalog' parameter defined; ignoring preset value", key), Severity: diag.Warning, Locations: b.Config.GetLocations("resources.jobs." + key + ".parameters"), }}) @@ -248,7 +248,7 @@ func addCatalogSchemaParameters(b *bundle.Bundle, key string, job *resources.Job if param.Name == "schema" { hasSchema = true diags = diags.Extend(diag.Diagnostics{{ - Summary: fmt.Sprintf("job %s already has 'schema' parameter defined; ignoring preset value", key), + Summary: fmt.Sprintf("job '%s' already has 'schema' parameter defined; ignoring preset value", key), Severity: diag.Warning, Locations: []dyn.Location{b.Config.GetLocation("resources.jobs." 
+ key + ".parameters")}, }}) diff --git a/bundle/config/mutator/apply_presets_catalog_schema_test.go b/bundle/config/mutator/apply_presets_catalog_schema_test.go index 5230b205d0..ffcbb1526a 100644 --- a/bundle/config/mutator/apply_presets_catalog_schema_test.go +++ b/bundle/config/mutator/apply_presets_catalog_schema_test.go @@ -27,6 +27,8 @@ type recordedField struct { Expected string } +// mockPresetsCatalogSchema returns a mock bundle with all known resources +// that have catalog/schema fields, with those fields filled in as placeholders. func mockPresetsCatalogSchema() *bundle.Bundle { return &bundle.Bundle{ Config: config.Root{ @@ -162,7 +164,8 @@ func mockPresetsCatalogSchema() *bundle.Bundle { } } -// ignoredFields are fields that should be ignored in the completeness check +// ignoredFields are all paths to fields in resources where we don't want to +// apply the catalog/schema presets. var ignoredFields = map[string]string{ "resources.pipelines.key.schema": "schema is still in private preview", "resources.jobs.key.tasks[0].notebook_task.base_parameters": "catalog/schema are passed via job parameters", @@ -266,7 +269,7 @@ func TestApplyPresetsCatalogSchemaCompleteness(t *testing.T) { // Find all catalog/schema fields that we think should be covered based // on all properties in config.Resources. - expectedFields := findCatalogSchemaFields() + expectedFields := findAllPossibleCatalogSchemaFields() assert.GreaterOrEqual(t, len(expectedFields), 42, "expected at least 42 catalog/schema fields, but got %d", len(expectedFields)) // Verify that all expected fields are there @@ -307,9 +310,10 @@ func recordPlaceholderFields(t *testing.T, b *bundle.Bundle) []recordedField { return recordedFields } -// findCatalogSchemaFields finds all fields in config.Resources that might refer -// to a catalog or schema. Returns a slice of field paths. -func findCatalogSchemaFields() []string { +// findAllPossibleCatalogSchemaFields finds all fields in config.Resources that might refer +// to a catalog or schema. Returns a slice of field paths like +// "resources.pipelines.key.catalog". +func findAllPossibleCatalogSchemaFields() []string { visited := make(map[reflect.Type]struct{}) var results []string diff --git a/bundle/config/mutator/translate_paths.go b/bundle/config/mutator/translate_paths.go index 306efc29dd..be07c77e08 100644 --- a/bundle/config/mutator/translate_paths.go +++ b/bundle/config/mutator/translate_paths.go @@ -59,7 +59,7 @@ type translateContext struct { seen map[string]string } -// GetLocalPath returns the local file system paths for a path referenced from a resource.. +// GetLocalPath returns the local file system paths for a path referenced from a resource. // If it's an absolute path, we treat it as a workspace path and return "". // // Arguments:
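Editor's note (illustrative, not part of the patch series): the guards at the top of ApplyPresetsCatalogSchema in patch 24 distinguish three cases: both presets empty (nothing to apply), exactly one set (rejected), and both set (presets are applied). A minimal runnable sketch of that decision, using the hypothetical preset values "main" and "dev":

package main

import "fmt"

// classify mirrors the guards at the top of ApplyPresetsCatalogSchema: both presets
// empty means there is nothing to apply, exactly one set is rejected, and only the
// both-set case goes on to fill in catalog/schema defaults.
func classify(catalog, schema string) string {
	if catalog == "" && schema == "" {
		return "no-op"
	}
	if (schema == "" && catalog != "") || (catalog == "" && schema != "") {
		return "error: presets.catalog and presets.schema must always be set together"
	}
	return "apply catalog/schema presets"
}

func main() {
	fmt.Println(classify("", ""))        // no-op
	fmt.Println(classify("main", ""))    // error: ... must always be set together
	fmt.Println(classify("", "dev"))     // error: ... must always be set together
	fmt.Println(classify("main", "dev")) // apply catalog/schema presets
}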
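Editor's note (illustrative, not part of the patch series): the completeness test touched in patch 25 collapses array indices before recording which fields are covered. A minimal, self-contained sketch of that normalization, using the same \[\d+\] pattern as apply_presets_catalog_schema_test.go and a hypothetical recorded path:

package main

import (
	"fmt"
	"regexp"
	"strings"
)

func main() {
	arrayIndexPattern := regexp.MustCompile(`\[\d+\]`)
	recordedPaths := make(map[string]struct{})

	// A path as the test would record it, with a non-zero array index (hypothetical value).
	field := "resources.jobs.key.parameters[1].default"
	recordedPaths[field] = struct{}{}

	// Normalize every index to [0] so parameters[1] and parameters[0] count as the same field.
	path := arrayIndexPattern.ReplaceAllString(field, "[0]")

	// Also record the base path before each '[' so "resources.jobs.key.parameters" itself
	// counts as covered by the completeness check.
	for {
		i := strings.Index(path, "[")
		if i < 0 {
			break
		}
		recordedPaths[path[:i]] = struct{}{}
		path = path[i+1:]
	}

	for p := range recordedPaths {
		fmt.Println(p)
	}
	// Prints, in map order:
	//   resources.jobs.key.parameters[1].default
	//   resources.jobs.key.parameters
}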