|
| 1 | +--- |
| 2 | +page_title: "Enable Backend AWS PrivateLink for Databricks Workspace" |
| 3 | +--- |
| 4 | + |
| 5 | +# Deploying pre-requisite resources and enabling PrivateLink connections (AWS Preview) |
| 6 | + |
| 7 | +-> **Public Preview** This feature is in [Public Preview](https://docs.databricks.com/release-notes/release-types.html). Contact your Databricks representative to request access. |
| 8 | + |
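| 9 | +Databricks PrivateLink support enables private connectivity between users and their Databricks workspaces, and between clusters on the data plane and core services on the control plane within the Databricks workspace infrastructure. You can use Terraform to deploy the underlying cloud resources and the private access settings resources automatically, using a programmatic approach. This guide assumes you are deploying into an existing VPC and that you have already set up the credentials and storage configurations as shown in prior examples, notably the [databricks_mws_credentials](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_credentials) and [databricks_mws_storage_configurations](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_storage_configurations) resource documentation. |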
| 10 | + |
| 11 | + |
| 12 | + |
| 13 | +This guide uses the following variables in configurations: |
| 14 | + |
| 15 | +- `databricks_account_username`: The username an account-level admin uses to log in to [https://accounts.cloud.databricks.com](https://accounts.cloud.databricks.com). |
| 16 | +- `databricks_account_password`: The password for `databricks_account_username`. |
| 17 | +- `databricks_account_id`: The numeric ID for your Databricks account. When you are logged in, it appears in the bottom left corner of the page. |
| 18 | +- `vpc_id` - The ID for the AWS VPC |
| 19 | +- `region` - AWS region |
| 20 | +- `security_group_id` - Security groups set up for the existing VPC |
| 21 | +- `subnet_ids` - Existing subnets being used for the customer managed VPC |
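| 22 | +- `workspace_vpce_service` - The region-specific VPC endpoint service name for the workspace (REST API) endpoint. Choose it from the regional endpoint reference table in the Databricks AWS PrivateLink documentation. |
| 23 | +- `relay_vpce_service` - The region-specific VPC endpoint service name for the relay endpoint. Choose it from the same regional endpoint reference table. |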
| 24 | +- `vpce_subnet_cidr` - CIDR range for the subnet chosen for the VPC endpoint |
| 25 | +- `tags` - tags for the Private Link backend setup |
| 26 | +- `root_bucket_name` - AWS bucket name required for [storage mws resource](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_storage_configurations) reference |
| 27 | +- `cross_account_arn` - ARN of the cross-account AWS IAM role required for [credentials mws resource](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_credentials) |
| 28 | + |
| 29 | +This guide is provided as-is and you can use this guide as the basis for your custom Terraform module. |
| 30 | + |
| 31 | +To get started with AWS PrivateLink integration, this guide takes you through the following high-level steps: |
| 32 | +- Initialize the required providers |
| 33 | +- Configure AWS objects |
| 34 | + - A subnet dedicated to your VPC relay and workspace endpoints |
| 35 | + - A security group dedicated to your VPC endpoints |
| 36 | + - Two AWS VPC endpoints |
| 37 | +- Workspace Creation |
| 38 | + |
| 39 | +## Provider initialization |
| 40 | + |
| 41 | +Initialize [provider with `mws` alias](https://www.terraform.io/language/providers/configuration#alias-multiple-provider-configurations) to set up account-level resources. See [provider authentication](../index.md#authenticating-with-hostname,-username,-and-password) for more details. |
| 42 | + |
| 43 | +```hcl |
| 44 | +terraform { |
| 45 | + required_providers { |
| 46 | + databricks = { |
| 47 | + source = "databrickslabs/databricks" |
| 48 | + version = "0.5.0" |
| 49 | + } |
| 50 | + aws = { |
| 51 | + source = "hashicorp/aws" |
| 52 | + version = "3.49.0" |
| 53 | + } |
| 54 | + } |
| 55 | +} |
| 56 | +
|
| 57 | +provider "aws" { |
| 58 | + region = var.region |
| 59 | +} |
| 60 | +
|
| 61 | +// initialize provider in "MWS" mode for provisioning workspace with AWS PrivateLink |
| 62 | +provider "databricks" { |
| 63 | + alias = "mws" |
| 64 | + host = "https://accounts.cloud.databricks.com" |
| 65 | + username = var.databricks_account_username |
| 66 | + password = var.databricks_account_password |
| 67 | +} |
| 68 | +
|
| 69 | +
|
| 70 | +``` |
| 71 | + |
| 72 | +Define the required variables |
| 73 | + |
| 74 | +```hcl |
| 75 | +variable "databricks_account_id" {} |
| 76 | +variable "databricks_account_username" {} |
| 77 | +variable "databricks_account_password" {} |
| 78 | +variable "vpc_id" {} |
| 79 | +variable "region" {} |
| 80 | +variable "security_group_id" {} |
| 81 | +
|
| 82 | +// this input variable is of array type |
| 83 | +variable "subnet_ids" { |
| 84 | + type = list(string) |
| 85 | +} |
| 86 | +
|
| 87 | +variable "workspace_vpce_service" {} |
| 88 | +variable "relay_vpce_service" {} |
| 89 | +variable "vpce_subnet_cidr" {} |
| 90 | +
|
| 91 | +variable "private_dns_enabled" { default = false} |
| 92 | +variable "tags" { default = {}} |
| 93 | +
|
| 94 | +// these resources (bucket and IAM role) are assumed created using your AWS provider and the examples here https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_storage_configurations and https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_credentials, respectively. |
| 95 | +variable "root_bucket_name" {} |
| 96 | +variable "cross_account_arn" {} |
| 97 | +
|
| 98 | +locals { |
| 99 | + prefix = "private-link-ws" |
| 100 | +} |
| 101 | +``` |
| 102 | + |
| 103 | +## Existing Storage Objects |
| 104 | +The following object is used in order to reference the storage configuration ID. |
| 105 | +```hcl |
| 106 | +resource "databricks_mws_storage_configurations" "this" { |
| 107 | + provider = databricks.mws |
| 108 | + account_id = var.databricks_account_id |
| 109 | + bucket_name = var.root_bucket_name |
| 110 | + storage_configuration_name = "${local.prefix}-storage" |
| 111 | +} |
| 112 | +``` |
| 113 | + |
| 114 | +## Existing IAM Role |
| 115 | +The following object is used in order to reference the credential configuration ID. |
| 116 | +```hcl |
| 117 | +resource "databricks_mws_credentials" "this" { |
| 118 | + provider = databricks.mws |
| 119 | + account_id = var.databricks_account_id |
| 120 | + role_arn = var.cross_account_arn |
| 121 | + credentials_name = "${local.prefix}-credentials" |
| 122 | +} |
| 123 | +
|
| 124 | +``` |
| 125 | + |
| 126 | + |
| 127 | +## Configure AWS objects |
| 128 | +The first step is to create the required AWS objects: |
| 129 | +- A subnet dedicated to your VPC endpoints |
| 130 | +- A security group dedicated to your VPC endpoints that allows the required inbound and outbound TCP traffic on ports 443 (HTTPS) and 6666. |
| 131 | +- Lastly, creation of the private access settings and workspace. |
| 132 | + |
| 133 | +```hcl |
| 134 | +// this subnet houses the data plane VPC endpoints |
| 135 | +resource "aws_subnet" "dataplane_vpce" { |
| 136 | + vpc_id = var.vpc_id |
| 137 | + cidr_block = var.vpce_subnet_cidr |
| 138 | +
|
| 139 | + tags = merge( |
| 140 | + data.aws_vpc.prod.tags, |
| 141 | + { |
| 142 | + Name = "${local.prefix}-${data.aws_vpc.prod.id}-pl-vpce" |
| 143 | + }, |
| 144 | + ) |
| 145 | +} |
| 146 | +
|
| 147 | +resource "aws_route_table" "this" { |
| 148 | + vpc_id = var.vpc_id |
| 149 | +
|
| 150 | + tags = merge( |
| 151 | + data.aws_vpc.prod.tags, |
| 152 | + { |
| 153 | + Name = "${local.prefix}-${data.aws_vpc.prod.id}-pl-local-route-tbl" |
| 154 | + }, |
| 155 | + ) |
| 156 | +} |
| 157 | +
|
| 158 | +resource "aws_route_table_association" "dataplane_vpce_rtb" { |
| 159 | + subnet_id = aws_subnet.dataplane_vpce.id |
| 160 | + route_table_id = aws_route_table.this.id |
| 161 | +} |
| 162 | +``` |
| 163 | + |
| 164 | +```hcl |
| 165 | +data "aws_subnet" "ws_vpc_subnets" { |
| 166 | + for_each = toset(var.subnet_ids) |
| 167 | + id = each.value |
| 168 | +} |
| 169 | +
|
| 170 | +locals { |
| 171 | + vpc_cidr_blocks = [ |
| 172 | + for subnet in data.aws_subnet.ws_vpc_subnets : |
| 173 | + subnet.cidr_block |
| 174 | + ] |
| 175 | +} |
| 176 | +
|
| 177 | +// security group for data plane VPC endpoints for backend/relay connections |
| 178 | +resource "aws_security_group" "dataplane_vpce" { |
| 179 | + name = "Data Plane VPC endpoint security group" |
| 180 | + description = "Security group shared with relay and workspace endpoints" |
| 181 | + vpc_id = var.vpc_id |
| 182 | +
|
| 183 | + ingress { |
| 184 | + description = "Inbound rules" |
| 185 | + from_port = 443 |
| 186 | + to_port = 443 |
| 187 | + protocol = "tcp" |
| 188 | + cidr_blocks = concat([var.vpce_subnet_cidr], local.vpc_cidr_blocks) |
| 189 | + } |
| 190 | +
|
| 191 | + ingress { |
| 192 | + description = "Inbound rules" |
| 193 | + from_port = 6666 |
| 194 | + to_port = 6666 |
| 195 | + protocol = "tcp" |
| 196 | + cidr_blocks = concat([var.vpce_subnet_cidr], local.vpc_cidr_blocks) |
| 197 | + } |
| 198 | +
|
| 199 | + egress { |
| 200 | + description = "Outbound rules" |
| 201 | + from_port = 443 |
| 202 | + to_port = 443 |
| 203 | + protocol = "tcp" |
| 204 | + cidr_blocks = concat([var.vpce_subnet_cidr], local.vpc_cidr_blocks) |
| 205 | + } |
| 206 | +
|
| 207 | + egress { |
| 208 | + description = "Outbound rules" |
| 209 | + from_port = 6666 |
| 210 | + to_port = 6666 |
| 211 | + protocol = "tcp" |
| 212 | + cidr_blocks = concat([var.vpce_subnet_cidr], local.vpc_cidr_blocks) |
| 213 | + } |
| 214 | +
|
| 215 | + tags = merge( |
| 216 | + data.aws_vpc.prod.tags, |
| 217 | + { |
| 218 | + Name = "${local.prefix}-${data.aws_vpc.prod.id}-pl-vpce-sg-rules" |
| 219 | + }, |
| 220 | + ) |
| 221 | +} |
| 222 | +``` |
| 223 | + |
| 224 | +```hcl |
| 225 | +data "aws_vpc" "prod" { |
| 226 | + id = var.vpc_id |
| 227 | +} |
| 228 | +
|
| 229 | +resource "aws_vpc_endpoint" "backend_rest" { |
| 230 | + vpc_id = var.vpc_id |
| 231 | + service_name = var.workspace_vpce_service |
| 232 | + vpc_endpoint_type = "Interface" |
| 233 | + security_group_ids = [aws_security_group.dataplane_vpce.id] |
| 234 | + subnet_ids = [aws_subnet.dataplane_vpce.id] |
| 235 | + // run terraform apply twice when configuring PrivateLink - see this outstanding issue for understanding why this is required - https://github.com/hashicorp/terraform-provider-aws/issues/7148 |
| 236 | + // Run 1 - comment the `private_dns_enabled` line |
| 237 | + // Run 2 - uncomment the `private_dns_enabled` line |
| 238 | + // private_dns_enabled = var.private_dns_enabled |
| 239 | + depends_on = [aws_subnet.dataplane_vpce] |
| 240 | +} |
| 241 | +
|
| 242 | +resource "aws_vpc_endpoint" "relay" { |
| 243 | + vpc_id = var.vpc_id |
| 244 | + service_name = var.relay_vpce_service |
| 245 | + vpc_endpoint_type = "Interface" |
| 246 | + security_group_ids = [aws_security_group.dataplane_vpce.id] |
| 247 | + subnet_ids = [aws_subnet.dataplane_vpce.id] |
| 248 | + // run terraform apply twice when configuring PrivateLink - see this outstanding issue for understanding why this is required - https://github.com/hashicorp/terraform-provider-aws/issues/7148 |
| 249 | + // Run 1 - comment the `private_dns_enabled` line |
| 250 | + // Run 2 - uncomment the `private_dns_enabled` line |
| 251 | + // private_dns_enabled = var.private_dns_enabled |
| 252 | + depends_on = [aws_subnet.dataplane_vpce] |
| 253 | +} |
| 254 | +
|
| 255 | +
|
| 256 | +resource "databricks_mws_vpc_endpoint" "backend_rest_vpce" { |
| 257 | + provider = databricks.mws |
| 258 | + account_id = var.databricks_account_id |
| 259 | + aws_vpc_endpoint_id = aws_vpc_endpoint.backend_rest.id |
| 260 | + vpc_endpoint_name = "${local.prefix}-vpc-backend-${var.vpc_id}" |
| 261 | + region = var.region |
| 262 | + depends_on = [aws_vpc_endpoint.backend_rest] |
| 263 | +} |
| 264 | +
|
| 265 | +resource "databricks_mws_vpc_endpoint" "relay" { |
| 266 | + provider = databricks.mws |
| 267 | + account_id = var.databricks_account_id |
| 268 | + aws_vpc_endpoint_id = aws_vpc_endpoint.relay.id |
| 269 | + vpc_endpoint_name = "${local.prefix}-vpc-relay-${var.vpc_id}" |
| 270 | + region = var.region |
| 271 | + depends_on = [aws_vpc_endpoint.relay] |
| 272 | +} |
| 273 | +
|
| 274 | +``` |
| 275 | + |
| 276 | +## Workspace creation |
| 277 | + |
| 278 | +Once the VPC endpoints are created, they can be supplied in the `databricks_mws_networks` resource for workspace creation with AWS PrivateLink. After `terraform apply` has been run once (see the comment in the `aws_vpc_endpoint` resources above), run `terraform apply` a second time with the `private_dns_enabled` line uncommented and the `private_dns_enabled` variable set to `true`, so that the proper DNS settings for PrivateLink are applied. To understand why the configuration must be applied twice, see this existing [issue](https://github.com/hashicorp/terraform-provider-aws/issues/7148) in the underlying AWS provider. |
| 279 | + |
| 280 | +The credentials ID which is referenced below is one of the attributes which is created as a result of configuring the cross-account IAM role, which Databricks uses to orchestrate EC2 resources. The credentials are created via [databricks_mws_credentials](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_credentials). Similarly, the storage configuration ID is obtained from the [databricks_mws_storage_configurations](https://registry.terraform.io/providers/databrickslabs/databricks/latest/docs/resources/mws_storage_configurations) resource. |
| 281 | + |
| 282 | +```hcl |
| 283 | +// Inputs are 2 subnets and one security group from existing VPC that will be used for your Databricks workspace |
| 284 | +resource "databricks_mws_networks" "this" { |
| 285 | + provider = databricks.mws |
| 286 | + account_id = var.databricks_account_id |
| 287 | + network_name = "${local.prefix}-network" |
| 288 | + security_group_ids = [var.security_group_id] |
| 289 | + subnet_ids = var.subnet_ids |
| 290 | + vpc_id = var.vpc_id |
| 291 | + vpc_endpoints { |
| 292 | + dataplane_relay = [databricks_mws_vpc_endpoint.relay.vpc_endpoint_id] |
| 293 | + rest_api = [databricks_mws_vpc_endpoint.backend_rest_vpce.vpc_endpoint_id] |
| 294 | + } |
| 295 | +} |
| 296 | +
|
| 297 | +resource "databricks_mws_private_access_settings" "pas" { |
| 298 | + provider = databricks.mws |
| 299 | + account_id = var.databricks_account_id |
| 300 | + private_access_settings_name = "Private Access Settings for ${local.prefix}" |
| 301 | + region = var.region |
| 302 | + public_access_enabled = true |
| 303 | +} |
| 304 | +
|
| 305 | +resource "databricks_mws_workspaces" "this" { |
| 306 | + provider = databricks.mws |
| 307 | + account_id = var.databricks_account_id |
| 308 | + aws_region = var.region |
| 309 | + workspace_name = local.prefix |
| 310 | + deployment_name = local.prefix |
| 311 | + credentials_id = databricks_mws_credentials.this.credentials_id |
| 312 | + storage_configuration_id = databricks_mws_storage_configurations.this.storage_configuration_id |
| 313 | + network_id = databricks_mws_networks.this.network_id |
| 314 | + private_access_settings_id = databricks_mws_private_access_settings.pas.private_access_settings_id |
| 315 | + pricing_tier = "ENTERPRISE" |
| 316 | + depends_on = [databricks_mws_networks.this] |
| 317 | +} |
| 318 | +``` |
| 319 | + |
0 commit comments