Website: https://www.cast.ai
- Terraform 0.13+
A module to connect a GKE cluster to CAST AI.
Requires castai/castai and hashicorp/google providers to be configured.
For Phase 2 onboarding, credentials from the terraform-gke-iam module are required.
module "castai_gke_cluster" {
source = "castai/gke-cluster/castai"
project_id = var.project_id
gke_cluster_name = var.cluster_name
gke_cluster_location = module.gke.location # cluster region or zone
gke_credentials = module.castai_gke_iam.private_key
delete_nodes_on_disconnect = var.delete_nodes_on_disconnect
default_node_configuration = module.castai_gke_cluster.castai_node_configurations["default"]
node_configurations = {
default = {
disk_cpu_ratio = 25
subnets = [module.vpc.subnets_ids[0]]
tags = {
"node-config" : "default"
}
max_pods_per_node = 110
network_tags = ["dev"]
disk_type = "pd-balanced"
}
}
node_templates = {
spot_tmpl = {
configuration_id = module.castai_gke_cluster.castai_node_configurations["default"]
should_taint = true
custom_labels = {
custom-label-key-1 = "custom-label-value-1"
custom-label-key-2 = "custom-label-value-2"
}
custom_taints = [
{
key = "custom-taint-key-1"
value = "custom-taint-value-1"
},
{
key = "custom-taint-key-2"
value = "custom-taint-value-2"
}
]
constraints = {
fallback_restore_rate_seconds = 1800
spot = true
use_spot_fallbacks = true
min_cpu = 4
max_cpu = 100
instance_families = {
exclude = ["e2"]
}
compute_optimized_state = "disabled"
storage_optimized_state = "disabled"
is_gpu_only = false
architectures = ["amd64"]
}
gpu = {
default_shared_clients_per_gpu = 9
sharing_strategy = "time-slicing"
user_managed_gpu_drivers = false
sharing_configuration = [
{
gpu_name = "nvidia-a100-80gb"
shared_clients_per_gpu = 11
},
{
gpu_name = "nvidia-l4"
shared_clients_per_gpu = 5
},
{
gpu_name = "nvidia-tesla-t4"
shared_clients_per_gpu = 3
}
]
}
custom_instances_enabled = true
custom_instances_with_extended_memory_enabled = true
}
}
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
workload_scaling_policies = {
default = {
apply_type = "IMMEDIATE"
management_option = "MANAGED"
cpu = {
function = "QUANTILE"
args = ["0.9"]
overhead = 0.15
look_back_period_seconds = 172800
min = 0.1
max = 2.0
}
memory = {
function = "MAX"
overhead = 0.35
look_back_period_seconds = 172800
limit = {
type = "NO_LIMIT"
}
}
assignment_rules = {
rules = [
{
namespace = {
names = ["default", "kube-system"]
}
},
{
workload = {
gvk = ["Deployment", "StatefulSet"]
labels_expressions = [
{
key = "region"
operator = "NotIn"
values = ["eu-west-1", "eu-west-2"]
},
{
key = "helm.sh/chart"
operator = "Exists"
}
]
}
}
]
}
startup = {
period_seconds = 300
}
predictive_scaling = {
cpu = {
enabled = true
}
}
}
}
}

Version 4.x.x changes:
- Removed `custom_label` attribute in `castai_node_template` resource. Use `custom_labels` instead.

Old configuration:
module "castai-gke-cluster" {
node_templates = {
spot_tmpl = {
custom_label = {
key = "custom-label-key-1"
value = "custom-label-value-1"
}
}
}
}

New configuration:
module "castai-gke-cluster" {
node_templates = {
spot_tmpl = {
custom_labels = {
custom-label-key-1 = "custom-label-value-1"
}
}
}
}

Version 5.x.x changed:
- Removed `compute_optimized` and `storage_optimized` attributes in `castai_node_template` resource, `constraints` object. Use `compute_optimized_state` and `storage_optimized_state` instead.

Old configuration:
module "castai-gke-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized = false
storage_optimized = true
}
}
}
}

New configuration:
module "castai-gke-cluster" {
node_templates = {
spot_tmpl = {
constraints = {
compute_optimized_state = "disabled"
storage_optimized_state = "enabled"
}
}
}
}

Version 6.3.x changed:
- Deprecated `autoscaler_policies_json` attribute. Use `autoscaler_settings` instead.

Old configuration:
module "castai-gke-cluster" {
autoscaler_policies_json = <<-EOT
{
"enabled": true,
"unschedulablePods": {
"enabled": true
},
"nodeDownscaler": {
"enabled": true,
"emptyNodes": {
"enabled": true
},
"evictor": {
"aggressiveMode": false,
"cycleInterval": "5m10s",
"dryRun": false,
"enabled": true,
"nodeGracePeriodMinutes": 10,
"scopedMode": false
}
},
"nodeTemplatesPartialMatchingEnabled": false,
"clusterLimits": {
"cpu": {
"maxCores": 20,
"minCores": 1
},
"enabled": true
}
}
EOT
}

New configuration:
module "castai-gke-cluster" {
autoscaler_settings = {
enabled = true
node_templates_partial_matching_enabled = false
unschedulable_pods = {
enabled = true
}
node_downscaler = {
enabled = true
empty_nodes = {
enabled = true
}
evictor = {
aggressive_mode = false
cycle_interval = "5m10s"
dry_run = false
enabled = true
node_grace_period_minutes = 10
scoped_mode = false
}
}
cluster_limits = {
enabled = true
cpu = {
max_cores = 20
min_cores = 1
}
}
}
}

Version 10.x.x removes deprecated fields. These settings should now be configured via node_templates constraints.
The autoscaler_policies_json variable has been removed. Use autoscaler_settings instead.
The following fields have been removed from autoscaler_settings:
| Removed Field | Migration Path |
|---|---|
| `unschedulable_pods.custom_instances_enabled` | Use `node_templates.<name>.custom_instances_enabled` |
| `unschedulable_pods.headroom` | Deploy low-priority placeholder workloads (docs) |
| `unschedulable_pods.headroom_spot` | Deploy low-priority placeholder workloads (docs) |
| `unschedulable_pods.node_constraints` | Use `node_templates.<name>.constraints` (`min_cpu`, `max_cpu`, `min_memory`, `max_memory`) |
| `spot_instances.enabled` | Use `node_templates.<name>.constraints.spot` |
| `spot_instances.spot_backups` | Use `node_templates.<name>.constraints.use_spot_fallbacks` and `fallback_restore_rate_seconds` |
| `spot_instances.spot_diversity_enabled` | Use `node_templates.<name>.constraints.enable_spot_diversity` |
| `spot_instances.spot_diversity_price_increase_limit` | Use `node_templates.<name>.constraints.spot_diversity_price_increase_limit_percent` |
| `spot_instances.spot_interruption_predictions` | Use `node_templates.<name>.constraints.spot_interruption_predictions_enabled` and `spot_interruption_predictions_type` |

Old configuration:
module "castai-gke-cluster" {
source = "castai/gke-cluster/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
custom_instances_enabled = true
headroom = {
enabled = true
cpu_percentage = 10
memory_percentage = 10
}
node_constraints = {
min_cpu_cores = 4
max_cpu_cores = 32
}
}
spot_instances = {
enabled = true
spot_backups = {
enabled = true
}
}
}
}

New configuration:
module "castai-gke-cluster" {
source = "castai/gke-cluster/castai"
autoscaler_settings = {
enabled = true
unschedulable_pods = {
enabled = true
}
}
node_templates = {
default_by_castai = {
configuration_id = module.castai-gke-cluster.castai_node_configurations["default"]
is_default = true
custom_instances_enabled = true
constraints = {
min_cpu = 4
max_cpu = 32
spot = true
use_spot_fallbacks = true
}
}
}
}

Headroom functionality has been replaced with the recommended approach of deploying low-priority placeholder workloads. This provides more flexibility and follows Kubernetes native patterns.
See the CAST AI documentation on maintaining cluster headroom for detailed instructions.
Example placeholder deployment:
apiVersion: apps/v1
kind: Deployment
metadata:
  name: headroom-placeholder
spec:
  replicas: 1
  selector:
    matchLabels:
      app: headroom-placeholder
  template:
    metadata:
      labels:
        app: headroom-placeholder
    spec:
      priorityClassName: low-priority # Create a PriorityClass with low priority
      containers:
        - name: pause
          image: registry.k8s.io/pause:3.9
          resources:
            requests:
              cpu: "2" # Adjust based on desired headroom
              memory: "4Gi"

Usage examples are located in terraform provider repo

| Name | Version |
|---|---|
| terraform | >= 0.13 |
| castai | >= 8.3 |
| google | >= 2.49 |
| helm | >= 3.0.0 |
| null | >= 3.0 |
| Name | Version |
|---|---|
| castai | 7.61.0 |
| google | 6.46.0 |
| helm | 3.0.2 |
| null | >= 3.0 |
| Name | Source | Version |
|---|---|---|
| castai_omni_cluster | castai/omni-cluster/castai | ~> 2.0 |
| Name | Description | Type | Default | Required |
|---|---|---|---|---|
| agent_values | List of YAML formatted string values for agent helm chart | `list(string)` | `[]` | no |
| agent_version | Version of castai-agent helm chart. Default latest | `string` | `null` | no |
| ai_optimizer_values | List of YAML formatted string with ai-optimizer values | `list(string)` | `[]` | no |
| ai_optimizer_version | Version of castai-ai-optimizer helm chart. Default latest | `string` | `null` | no |
| api_url | URL of alternative CAST AI API to be used during development or testing | `string` | `"https://api.cast.ai"` | no |
| autoscaler_settings | Optional Autoscaler policy definitions to override current autoscaler settings | `any` | `null` | no |
| castai_api_token | Optional CAST AI API token created in console.cast.ai API Access keys section. Used only when wait_for_cluster_ready is set to true | `string` | `""` | no |
| castai_components_labels | Optional additional Kubernetes labels for CAST AI pods | `map(any)` | `{}` | no |
| castai_components_sets | Optional additional 'set' configurations for every CAST AI Helm release. | `map(string)` | `{}` | no |
| castware_api_url | URL of CAST AI API to be used from within the cluster by Cast AI applications (Castware). If left empty, api_url will be used within the cluster. | `string` | `""` | no |
| cluster_controller_values | List of YAML formatted string values for cluster-controller helm chart | `list(string)` | `[]` | no |
| cluster_controller_version | Version of castai-cluster-controller helm chart. Default latest | `string` | `null` | no |
| default_node_configuration | ID of the default node configuration | `string` | `""` | no |
| default_node_configuration_name | Name of the default node configuration | `string` | `""` | no |
| delete_nodes_on_disconnect | Optionally delete Cast AI created nodes when the cluster is destroyed | `bool` | `false` | no |
| evictor_ext_values | List of YAML formatted string with evictor-ext values | `list(string)` | `[]` | no |
| evictor_ext_version | Version of castai-evictor-ext chart. Default latest | `string` | `null` | no |
| evictor_values | List of YAML formatted string values for evictor helm chart | `list(string)` | `[]` | no |
| evictor_version | Version of castai-evictor chart. Default latest | `string` | `null` | no |
| gke_cluster_location | Location of the cluster to be connected to CAST AI. Can be region or zone for zonal clusters | `string` | n/a | yes |
| gke_cluster_name | Name of the cluster to be connected to CAST AI. | `string` | n/a | yes |
| gke_credentials | Optional GCP Service account credentials.json | `string` | n/a | yes |
| grpc_url | gRPC endpoint used by pod-pinner | `string` | `"grpc.cast.ai:443"` | no |
| install_ai_optimizer | Optional flag for installation of AI Optimizer (https://docs.cast.ai/docs/getting-started-ai) | `bool` | `false` | no |
| install_omni | Optional flag for installation of Omni product | `bool` | `false` | no |
| install_pod_mutator | Optional flag for installation of pod mutator | `bool` | `false` | no |
| install_security_agent | Optional flag for installation of security agent (Kvisor - https://docs.cast.ai/docs/kvisor) | `bool` | `false` | no |
| install_workload_autoscaler | Optional flag for installation of workload autoscaler (https://docs.cast.ai/docs/workload-autoscaling-configuration) | `bool` | `false` | no |
| install_workload_autoscaler_exporter | Optional flag for installation of workload autoscaler exporter (custom metrics exporter) | `bool` | `false` | no |
| kvisor_controller_extra_args | map(string) |
{ |
no | |
| kvisor_grpc_addr | CAST AI Kvisor optimized GRPC API address | `string` | `"kvisor.prod-master.cast.ai:443"` | no |
| kvisor_values | List of YAML formatted string values for kvisor helm chart, see example: https://github.com/castai/terraform-provider-castai/tree/master/examples/gke/gke_cluster_with_security/castai.tf | `list(string)` | `[]` | no |
| kvisor_version | Version of kvisor chart. If not provided, latest version will be used. | `string` | `null` | no |
| kvisor_wait | Wait for kvisor chart to finish release | `bool` | `true` | no |
| node_configurations | Map of GKE node configurations to create | `any` | `{}` | no |
| node_templates | Map of node templates to create | `any` | `{}` | no |
| organization_id | DEPRECATED (required only for pod mutator v0.0.25 and older): CAST AI Organization ID | `string` | `""` | no |
| pod_mutator_values | List of YAML formatted string values for pod-mutator helm chart | `list(string)` | `[]` | no |
| pod_mutator_version | Version of castai-pod-mutator helm chart. Default latest | `string` | `null` | no |
| pod_pinner_values | List of YAML formatted string values for pod-pinner helm chart | `list(string)` | `[]` | no |
| pod_pinner_version | Version of pod-pinner helm chart. Default latest | `string` | `null` | no |
| project_id | The project id from GCP | `string` | n/a | yes |
| self_managed | Whether CAST AI components' upgrades are managed by a customer; by default upgrades are managed by CAST AI central system. WARNING: changing this after the module was created is not supported. | `bool` | `false` | no |
| spot_handler_values | List of YAML formatted string values for spot-handler helm chart | `list(string)` | `[]` | no |
| spot_handler_version | Version of castai-spot-handler helm chart. Default latest | `string` | `null` | no |
| wait_for_cluster_ready | Wait for cluster to be ready before finishing the module execution, this option requires castai_api_token to be set | `bool` | `false` | no |
| workload_autoscaler_values | List of YAML formatted string with cluster-workload-autoscaler values | `list(string)` | `[]` | no |
| workload_autoscaler_version | Version of castai-workload-autoscaler helm chart. Default latest | `string` | `null` | no |
| workload_autoscaler_exporter_values | List of YAML formatted string with workload-autoscaler-exporter values | `list(string)` | `[]` | no |
| workload_autoscaler_exporter_version | Version of castai-workload-autoscaler-exporter helm chart. Default latest | `string` | `null` | no |
| workload_scaling_policies | Map of workload scaling policies to create | `any` | `{}` | no |

| Name | Description |
|---|---|
| castai_node_configurations | Map of node configurations ids by name |
| castai_node_templates | Map of node template by name |
| cluster_id | CAST.AI cluster id, which can be used for accessing cluster data using API |
| organization_id | CAST.AI organization id of the cluster |