Website: https://www.cast.ai
- Terraform 0.13+
A module to create an Azure role and a service principal that can be used to connect to CAST AI.
Requires castai/castai, hashicorp/azurerm, hashicorp/azuread, hashicorp/helm providers to be configured.
The required parameters can be provided manually or alternatively can be easily acquired from your AKS cluster resource or Azure RM subscription data source.
module "castai-aks-cluster" {
source = "castai/aks/castai"
aks_cluster_name = var.aks_cluster_name
aks_cluster_region = var.aks_cluster_region
node_resource_group = azurerm_kubernetes_cluster.example.node_resource_group
resource_group = azurerm_kubernetes_cluster.example.resource_group_name
delete_nodes_on_disconnect = true
subscription_id = data.azurerm_subscription.current.subscription_id
tenant_id = data.azurerm_subscription.current.tenant_id
default_node_configuration = module.castai-aks-cluster.castai_node_configurations["default"]
node_configurations = {
default = {
disk_cpu_ratio = 25
subnets = [azurerm_subnet.internal.id]
tags = {
"node-config" : "default"
}
}
}
node_templates = {
spot_tmpl = {
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"]
should_taint = true
custom_labels = {
custom-label-key-1 = "custom-label-value-1"
custom-label-key-2 = "custom-label-value-2"
}
custom_taints = [
{
key = "custom-taint-key-1"
value = "custom-taint-value-1"
},
{
key = "custom-taint-key-2"
value = "custom-taint-value-2"
}
]
constraints = {
fallback_restore_rate_seconds = 1800
spot = true
use_spot_fallbacks = true
min_cpu = 4
max_cpu = 100
instance_families = {
exclude = ["standard_DPLSv5"]
}
compute_optimized_state = "disabled"
storage_optimized_state = "disabled"
}
}
}
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
workload_scaling_policies = {
default = {
apply_type = "IMMEDIATE"
management_option = "MANAGED"
cpu = {
function = "QUANTILE"
args = ["0.9"]
overhead = 0.15
look_back_period_seconds = 172800
min = 0.1
max = 2.0
}
memory = {
function = "MAX"
overhead = 0.35
look_back_period_seconds = 172800
limit = {
type = "NO_LIMIT"
}
}
assignment_rules = {
rules = [
{
namespace = {
names = ["default", "kube-system"]
}
},
{
workload = {
gvk = ["Deployment", "StatefulSet"]
labels_expressions = [
{
key = "region"
operator = "NotIn"
values = ["eu-west-1", "eu-west-2"]
},
{
key = "helm.sh/chart"
operator = "Exists"
}
]
}
}
]
}
startup = {
period_seconds = 300
}
predictive_scaling = {
cpu = {
enabled = true
}
}
}
}
}

Version 3.x.x changes:
- Removed `custom_label` attribute in `castai_node_template` resource. Use `custom_labels` instead.
Old configuration:
module "castai-aks-cluster" {
node_templates = {
spot_tmpl = {
custom_label = {
key = "custom-label-key-1"
value = "custom-label-value-1"
}
}
}
}

New configuration:
module "castai-aks-cluster" {
node_templates = {
spot_tmpl = {
custom_labels = {
custom-label-key-1 = "custom-label-value-1"
}
}
}
}

Version 4.x.x changed:
- Removed `compute_optimized` and `storage_optimized` attributes in `castai_node_template` resource, `constraints` object. Use `compute_optimized_state` and `storage_optimized_state` instead.
Old configuration:
module "castai-aks-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized = false
storage_optimized = true
}
}
}
}

New configuration:
module "castai-aks-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized_state = "disabled"
storage_optimized_state = "enabled"
}
}
}
}

Version 5.2.x changed:
- Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.
Old configuration:
module "castai-aks-cluster" {
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"nodeTemplatesPartialMatchingEnabled": false,
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT
}

New configuration:
module "castai-aks-cluster" {
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}

Version 10.x.x removes deprecated autoscaler fields that have been moved to node templates.
Removed:
- `autoscaler_policies_json` - Use `autoscaler_settings` instead
- `autoscaler_settings.unschedulable_pods.custom_instances_enabled` - No longer needed
- `autoscaler_settings.unschedulable_pods.headroom` - Use low-priority placeholder workloads instead
- `autoscaler_settings.unschedulable_pods.headroom_spot` - Use low-priority placeholder workloads instead
- `autoscaler_settings.unschedulable_pods.node_constraints` - Use `node_templates` constraints instead
- `autoscaler_settings.spot_instances` - Use `node_templates` constraints instead
Migration guide:
Old configuration:
module "castai-aks-cluster" {
source = "castai/aks/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
custom_instances_enabled = true
headroom = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
node_constraints = {
min_cpu_cores = 4
max_cpu_cores = 32
}
}
spot_instances = {
enabled = true
spot_backups = {
enabled = true
}
}
}
}New configuration:
module "castai-aks-cluster" {
source = "castai/aks/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
}
}
node_templates = {
default_by_castai = {
configuration_id = module.castai-aks-cluster.castai_node_configurations["default"]
is_default = true
constraints = {
min_cpu = 4
max_cpu = 32
spot = true
use_spot_fallbacks = true
}
}
}
}
# For headroom: Deploy low-priority placeholder workloads
# See: https://docs.cast.ai/docs/autoscaler-faq#how-can-i-maintain-cluster-headroom

Key changes:
- Node constraints (`min_cpu_cores`, `max_cpu_cores`, `min_ram_mib`, `max_ram_mib`) are now configured in `node_templates.constraints` as `min_cpu`, `max_cpu`, `min_memory`, `max_memory`
- Spot instance settings (`spot`, `use_spot_fallbacks`) are now configured in `node_templates.constraints`
- The default node template is named `default_by_castai`
- For headroom functionality, deploy low-priority placeholder workloads as described in the CAST AI documentation
Usage examples are located in the [Terraform provider repo](https://github.com/castai/terraform-provider-castai/tree/master/examples/aks).
| Name | Version |
|---|---|
| terraform | >= 0.13 |
| azuread | ~> 3 |
| azurerm | >= 3.7.0 |
| castai | >= 8.7 |
| helm | >= 3.0.0 |
| null | ~> 3 |
| Name | Version |
|---|---|
| azuread | ~> 3 |
| azurerm | >= 3.7.0 |
| castai | >= 8.7 |
| helm | >= 3.0.0 |
| null | ~> 3 |
| Name | Source | Version |
|---|---|---|
| castai_omni_cluster | castai/omni-cluster/castai | ~> 2.0 |
| Name | Description | Type | Default | Required |
|---|---|---|---|---|
| additional_resource_groups | n/a | list(string) |
[] |
no |
| agent_values | List of YAML formatted string values for agent helm chart | list(string) |
[] |
no |
| agent_version | Version of castai-agent helm chart. If not provided, latest version will be used. | string |
null |
no |
| ai_optimizer_values | List of YAML formatted string with ai-optimizer values | list(string) |
[] |
no |
| ai_optimizer_version | Version of castai-ai-optimizer helm chart. Default latest | string |
null |
no |
| aks_cluster_name | Name of the cluster to be connected to CAST AI. | string |
n/a | yes |
| aks_cluster_region | Region of the AKS cluster | string |
n/a | yes |
| api_url | URL of alternative CAST AI API to be used during development or testing | string |
"https://api.cast.ai" |
no |
| authentication_method | Authentication method for CAST AI. Use 'client_secret' for service principal with password, or 'workload_identity' for federated identity credential | string |
"client_secret" |
no |
| autoscaler_settings | Optional Autoscaler policy definitions to override current autoscaler settings | any |
null |
no |
| azuread_owners | A set of object IDs of principals that will be granted ownership of the Azure AD service principal and application. Defaults to current user. | list(string) |
null |
no |
| castai_api_token | Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when `wait_for_cluster_ready` is set to true | string | "" | no |
| castai_components_labels | Optional additional Kubernetes labels for CAST AI pods | map(any) |
{} |
no |
| castai_components_sets | Optional additional 'set' configurations for every CAST AI Helm release. | map(string) |
{} |
no |
| cluster_controller_values | List of YAML formatted string values for cluster-controller helm chart | list(string) |
[] |
no |
| cluster_controller_version | Version of castai-cluster-controller helm chart. If not provided, latest version will be used. | string |
null |
no |
| default_node_configuration | ID of the default node configuration | string |
"" |
no |
| default_node_configuration_name | Name of the default node configuration | string |
"" |
no |
| delete_nodes_on_disconnect | Optionally delete Cast AI created nodes when the cluster is destroyed | bool |
false |
no |
| evictor_ext_values | List of YAML formatted string with evictor-ext values | list(string) |
[] |
no |
| evictor_ext_version | Version of castai-evictor-ext chart. Default latest | string |
null |
no |
| evictor_values | List of YAML formatted string values for evictor helm chart | list(string) |
[] |
no |
| evictor_version | Version of castai-evictor chart. If not provided, latest version will be used. | string |
null |
no |
| grpc_url | gRPC endpoint used by pod-pinner | string |
"grpc.cast.ai:443" |
no |
| http_proxy | Address to use for proxying http requests from CAST AI components running directly on nodes. | string |
null |
no |
| https_proxy | Address to use for proxying https requests from CAST AI components running directly on nodes. | string |
null |
no |
| install_ai_optimizer | Optional flag for installation of AI Optimizer (https://docs.cast.ai/docs/getting-started-ai) | bool |
false |
no |
| install_omni | Optional flag for installing Omni capability | bool |
false |
no |
| install_pod_mutator | Optional flag for installation of pod mutator | bool |
false |
no |
| install_security_agent | Optional flag for installation of security agent (Kvisor - https://docs.cast.ai/docs/kvisor) | bool |
false |
no |
| install_workload_autoscaler | Optional flag for installation of workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) | bool |
false |
no |
| install_workload_autoscaler_exporter | Optional flag for installation of workload autoscaler exporter (custom metrics exporter) | bool |
false |
no |
| kvisor_controller_extra_args | map(string) |
{ |
no | |
| kvisor_grpc_addr | CAST AI Kvisor optimized GRPC API address | string |
"kvisor.prod-master.cast.ai:443" |
no |
| kvisor_values | List of YAML formatted string values for kvisor helm chart, see example: https://github.com/castai/terraform-provider-castai/tree/master/examples/aks/aks_cluster_with_security/castai.tf | list(string) |
[] |
no |
| kvisor_version | Version of kvisor chart. If not provided, latest version will be used. | string |
null |
no |
| kvisor_wait | Wait for kvisor chart to finish release | bool |
true |
no |
| no_proxy | List of addresses to skip proxying requests from CAST AI components running directly on nodes. Used with http_proxy and https_proxy. | list(string) |
[] |
no |
| node_configurations | Map of AKS node configurations to create | any |
{} |
no |
| node_resource_group | n/a | string |
n/a | yes |
| node_templates | Map of node templates to create | any |
{} |
no |
| organization_id | DEPRECATED (required only for pod mutator v0.0.25 and older): CAST AI Organization ID | string |
"" |
no |
| pod_mutator_values | List of YAML formatted string values for pod-mutator helm chart | list(string) |
[] |
no |
| pod_mutator_version | Version of castai-pod-mutator helm chart. Default latest | string |
null |
no |
| pod_pinner_values | List of YAML formatted string values for pod-pinner helm chart | list(string) |
[] |
no |
| pod_pinner_version | Version of pod-pinner helm chart. Default latest | string |
null |
no |
| resource_group | n/a | string |
n/a | yes |
| self_managed | Whether CAST AI components' upgrades are managed by a customer; by default upgrades are managed by the CAST AI central system. WARNING: changing this after the module was created is not supported. | bool |
false |
no |
| spot_handler_values | List of YAML formatted string values for spot-handler helm chart | list(string) |
[] |
no |
| spot_handler_version | Version of castai-spot-handler helm chart. If not provided, latest version will be used. | string |
null |
no |
| subscription_id | Azure subscription ID | string |
n/a | yes |
| tenant_id | n/a | string |
n/a | yes |
| wait_for_cluster_ready | Wait for cluster to be ready before finishing the module execution; this option requires `castai_api_token` to be set | bool | false | no |
| workload_autoscaler_exporter_values | List of YAML formatted string with castai-workload-autoscaler-exporter values | list(string) |
[] |
no |
| workload_autoscaler_exporter_version | Version of castai-workload-autoscaler-exporter helm chart. Default latest | string |
null |
no |
| workload_autoscaler_values | List of YAML formatted string with cluster-workload-autoscaler values | list(string) |
[] |
no |
| workload_autoscaler_version | Version of castai-workload-autoscaler helm chart. Default latest | string |
null |
no |
| workload_scaling_policies | Map of workload scaling policies to create | any |
{} |
no |
| Name | Description |
|---|---|
| castai_node_configurations | Map of node configuration IDs by name |
| castai_node_templates | Map of node templates by name |
| cluster_id | CAST.AI cluster id, which can be used for accessing cluster data using API |
| organization_id | CAST.AI organization id of the cluster |