Skip to content

Commit 1153bba

Browse files
authored
[Feature] Add support for filters in databricks_clusters data source (#4014)
## Changes Version 1.50 of the Terraform Provider featured an upgrade to the Go SDK affecting cluster listing. The new List Clusters API returns all terminated clusters in the last 30 days without a limit. This results in the list operation taking considerably longer for some workspaces, especially workspaces with many jobs where clusters are frequently created. This impacts the `databricks_clusters` data source, which can be slow. This PR partially addresses this by adding support for `filter_by` to the `databricks_clusters` API. Filters expressed here are pushed to the server and result in fewer clusters being returned by the API. Users of this data source can specify a particular cluster state, cluster source, pinned status, or cluster policy ID to limit the number of clusters returned by the API, drastically speeding up performance. ## Tests Integration tests for `databricks_cluster` data source test setting the `filter_by` parameter's attributes. - [ ] `make test` run locally - [ ] relevant change in `docs/` folder - [ ] covered with integration tests in `internal/acceptance` - [ ] relevant acceptance tests are passing - [ ] using Go SDK
1 parent 1e62576 commit 1153bba

File tree

3 files changed

+87
-4
lines changed

3 files changed

+87
-4
lines changed

clusters/data_clusters.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@ import (
1111

1212
func DataSourceClusters() common.Resource {
1313
return common.WorkspaceData(func(ctx context.Context, data *struct {
14-
Id string `json:"id,omitempty" tf:"computed"`
15-
Ids []string `json:"ids,omitempty" tf:"computed,slice_set"`
16-
ClusterNameContains string `json:"cluster_name_contains,omitempty"`
14+
Id string `json:"id,omitempty" tf:"computed"`
15+
Ids []string `json:"ids,omitempty" tf:"computed,slice_set"`
16+
ClusterNameContains string `json:"cluster_name_contains,omitempty"`
17+
FilterBy *compute.ListClustersFilterBy `json:"filter_by,omitempty"`
1718
}, w *databricks.WorkspaceClient) error {
18-
clusters, err := w.Clusters.ListAll(ctx, compute.ListClustersRequest{})
19+
clusters, err := w.Clusters.ListAll(ctx, compute.ListClustersRequest{
20+
FilterBy: data.FilterBy,
21+
})
1922
if err != nil {
2023
return err
2124
}

docs/data-sources/clusters.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,16 @@ data "databricks_clusters" "all_shared" {
2727
## Argument Reference
2828

2929
* `cluster_name_contains` - (Optional) Only return [databricks_cluster](../resources/cluster.md#cluster_id) ids that match the given name string.
30+
* `filter_by` - (Optional) Filters to apply to the listed clusters. See [filter_by Configuration Block](#filter_by-configuration-block) below for details.
31+
32+
### filter_by Configuration Block
33+
34+
The `filter_by` block controls the filtering of the listed clusters. It supports the following arguments:
35+
36+
* `cluster_sources` - (Optional) List of cluster sources to filter by. Possible values are `API`, `JOB`, `MODELS`, `PIPELINE`, `PIPELINE_MAINTENANCE`, `SQL`, and `UI`.
37+
* `cluster_states` - (Optional) List of cluster states to filter by. Possible values are `RUNNING`, `PENDING`, `RESIZING`, `RESTARTING`, `TERMINATING`, `TERMINATED`, `ERROR`, and `UNKNOWN`.
38+
* `is_pinned` - (Optional) Filter clusters by their pinned status: `true` returns only pinned clusters, `false` returns only unpinned clusters.
39+
* `policy_id` - (Optional) Filter by [databricks_cluster_policy](../resources/cluster_policy.md) id.
3040

3141
## Attribute Reference
3242

internal/acceptance/data_clusters_test.go

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
package acceptance
22

33
import (
	"context"
	"fmt"
	"testing"

	"github.com/databricks/databricks-sdk-go"
	"github.com/databricks/databricks-sdk-go/service/compute"
	"github.com/hashicorp/terraform-plugin-testing/terraform"
	"github.com/stretchr/testify/assert"
)
612

713
func TestAccDataSourceClustersNoFilter(t *testing.T) {
@@ -20,3 +26,67 @@ func TestAccDataSourceClustersWithFilter(t *testing.T) {
2026
}`,
2127
})
2228
}
29+
30+
func checkFirstCluster(t *testing.T, f func(*compute.ClusterDetails)) func(*terraform.State) error {
31+
return func(s *terraform.State) error {
32+
w := databricks.Must(databricks.NewWorkspaceClient())
33+
firstClusterId, ok := s.RootModule().Resources["data.databricks_clusters.this"].Primary.Attributes["ids.0"]
34+
if ok {
35+
firstCluster, err := w.Clusters.GetByClusterId(context.Background(), firstClusterId)
36+
assert.NoError(t, err)
37+
f(firstCluster)
38+
}
39+
return nil
40+
}
41+
}
42+
43+
func TestAccDataSourceClusters_FilterBy(t *testing.T) {
44+
WorkspaceLevel(t, Step{
45+
Template: `
46+
data "databricks_clusters" "this" {
47+
filter_by {
48+
cluster_sources = ["UI", "API"]
49+
}
50+
}`,
51+
Check: checkFirstCluster(t, func(c *compute.ClusterDetails) {
52+
assert.Contains(t, []compute.ClusterSource{"UI", "API"}, c.ClusterSource)
53+
}),
54+
}, Step{
55+
Template: `
56+
data "databricks_clusters" "this" {
57+
filter_by {
58+
cluster_states = ["RUNNING", "RESIZING"]
59+
}
60+
}`,
61+
Check: checkFirstCluster(t, func(c *compute.ClusterDetails) {
62+
assert.Contains(t, []compute.State{"RUNNING", "RESIZING"}, c.State)
63+
}),
64+
}, Step{
65+
Template: `
66+
data "databricks_clusters" "this" {
67+
filter_by {
68+
is_pinned = true
69+
}
70+
}`,
71+
// Not possible to get whether a cluster is pinned or not
72+
}, Step{
73+
Template: `
74+
resource "databricks_cluster_policy" "this" {
75+
name = "test"
76+
definition = jsonencode({
77+
"spark_conf.spark.hadoop.javax.jdo.option.ConnectionURL": {
78+
"type": "fixed",
79+
"value": "jdbc:sqlserver://<jdbc-url>"
80+
}
81+
})
82+
}
83+
data "databricks_clusters" "this" {
84+
filter_by {
85+
policy_id = databricks_cluster_policy.this.id
86+
}
87+
}`,
88+
Check: checkFirstCluster(t, func(c *compute.ClusterDetails) {
89+
assert.Equal(t, "abc-123", c.PolicyId)
90+
}),
91+
})
92+
}

0 commit comments

Comments
 (0)