diff --git a/modules/aws_ecs/ecs.tf b/modules/aws_ecs/ecs.tf
index 4e16958..80da59f 100644
--- a/modules/aws_ecs/ecs.tf
+++ b/modules/aws_ecs/ecs.tf
@@ -37,28 +37,29 @@ data "aws_ami" "this" {
   ]
 }
 
-resource "aws_launch_configuration" "this" {
+resource "aws_launch_template" "this" {
   count = var.launch_type == "EC2" ? 1 : 0
 
-  name_prefix   = "${var.deployment_name}-ecs-launch-configuration-"
+  name_prefix   = "${var.deployment_name}-ecs-launch-template-"
   image_id      = data.aws_ami.this.id
   instance_type = var.instance_type # e.g. t2.medium
 
-  enable_monitoring           = true
-  associate_public_ip_address = true
+  monitoring {
+    enabled = true
+  }
+
+  network_interfaces {
+    associate_public_ip_address = false
+    security_groups             = [aws_security_group.containers.id]
+  }
 
   # This user data represents a collection of “scripts” that will be executed the first time the machine starts.
   # This specific example makes sure the EC2 instance is automatically attached to the ECS cluster that we create earlier
   # and marks the instance as purchased through the Spot pricing
-  user_data = <<-EOF
-    #!/bin/bash
-    echo ECS_CLUSTER=${var.deployment_name}-ecs >> /etc/ecs/ecs.config
-  EOF
-
-  # We’ll see security groups later
-  security_groups = [
-    aws_security_group.containers.id
-  ]
-
+  user_data = base64encode(<<-EOF
+    #!/bin/bash
+    echo ECS_CLUSTER=${var.deployment_name}-ecs >> /etc/ecs/ecs.config
+  EOF
+  )
 
   # If you want to SSH into the instance and manage it directly:
   # 1. Make sure this key exists in the AWS EC2 dashboard
   # 2. Make sure your local SSH agent has it loaded
@@ -66,7 +67,9 @@ resource "aws_launch_configuration" "this" {
   key_name = var.ssh_key_name
 
   # Allow the EC2 instances to access AWS resources on your behalf, using this instance profile and the permissions defined there
-  iam_instance_profile = aws_iam_instance_profile.ec2[0].arn
+  iam_instance_profile {
+    name = aws_iam_instance_profile.ec2[0].name
+  }
 
   lifecycle {
     create_before_destroy = true
@@ -80,10 +83,15 @@ resource "aws_autoscaling_group" "this" {
   min_size             = var.min_instance_count
   desired_capacity     = var.min_instance_count
   vpc_zone_identifier  = var.private_subnet_ids
-  launch_configuration = aws_launch_configuration.this[0].name
+
+  launch_template {
+    id      = aws_launch_template.this[0].id
+    version = "$Latest"
+  }
 
   default_cooldown          = 30
   health_check_grace_period = 30
+  health_check_type         = "EC2"
 
   termination_policies = [
     "OldestInstance"
@@ -112,33 +120,224 @@ resource "aws_autoscaling_group" "this" {
   }
 }
 
-# Attach an autoscaling policy to the spot cluster to target 70% MemoryReservation on the ECS cluster.
-resource "aws_autoscaling_policy" "this" {
-  count                  = var.launch_type == "EC2" ? 1 : 0
-  name                   = "${var.deployment_name}-ecs-scale-policy"
-  policy_type            = "TargetTrackingScaling"
-  adjustment_type        = "ChangeInCapacity"
-  autoscaling_group_name = aws_autoscaling_group.this[0].name
-
-  target_tracking_configuration {
-    customized_metric_specification {
-      metric_dimension {
-        name  = "ClusterName"
-        value = "${var.deployment_name}-ecs"
-      }
-      metric_name = "MemoryReservation"
-      namespace   = "AWS/ECS"
-      statistic   = "Average"
-    }
-    target_value = var.autoscaling_memory_reservation_target
-  }
-}
-
 resource "aws_ecs_capacity_provider" "this" {
   count = var.launch_type == "EC2" ? 1 : 0
   name  = "${var.deployment_name}-ecs-capacity-provider"
 
   auto_scaling_group_provider {
     auto_scaling_group_arn = aws_autoscaling_group.this[0].arn
+    managed_scaling {
+      status                    = "ENABLED"
+      target_capacity           = 80
+      minimum_scaling_step_size = 1
+      maximum_scaling_step_size = 2
+      instance_warmup_period    = 300
+    }
+  }
+}
+
+resource "aws_appautoscaling_target" "retool" {
+  count              = 1
+  service_namespace  = "ecs"
+  resource_id        = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-main-service"
+  scalable_dimension = "ecs:service:DesiredCount"
+  min_capacity       = 1
+  max_capacity       = 3
+  depends_on         = [aws_ecs_service.retool]
+}
+
+resource "aws_appautoscaling_target" "workflows_worker" {
+  count              = var.workflows_enabled ? 1 : 0
+  service_namespace  = "ecs"
+  resource_id        = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-worker-service"
+  scalable_dimension = "ecs:service:DesiredCount"
+  min_capacity       = 1
+  max_capacity       = 3
+  depends_on         = [aws_ecs_service.workflows_worker]
+}
+
+resource "aws_appautoscaling_target" "workflows_backend" {
+  count              = var.workflows_enabled ? 1 : 0
+  service_namespace  = "ecs"
+  resource_id        = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-workflows-backend-service"
+  scalable_dimension = "ecs:service:DesiredCount"
+  min_capacity       = 1
+  max_capacity       = 3
+  depends_on         = [aws_ecs_service.workflows_backend]
+}
+
+resource "aws_appautoscaling_target" "code_executor" {
+  count              = var.code_executor_enabled ? 1 : 0
+  service_namespace  = "ecs"
+  resource_id        = "service/${aws_ecs_cluster.this.name}/${var.deployment_name}-code-executor-service"
+  scalable_dimension = "ecs:service:DesiredCount"
+  min_capacity       = 1
+  max_capacity       = 3
+  depends_on         = [aws_ecs_service.code_executor]
+}
+
+resource "aws_appautoscaling_policy" "retool_cpu" {
+  count              = 1
+  name               = "retool-cpu-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.retool[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.retool[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 60.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "workflows_worker_cpu" {
+  count              = var.workflows_enabled ? 1 : 0
+  name               = "workflows-worker-cpu-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.workflows_worker[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.workflows_worker[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 60.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 30
   }
 }
+
+resource "aws_appautoscaling_policy" "workflows_backend_cpu" {
+  count              = var.workflows_enabled ? 1 : 0
+  name               = "workflows_backend-cpu-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.workflows_backend[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.workflows_backend[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 60.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "code_executor_cpu" {
+  count              = var.code_executor_enabled ? 1 : 0
+  name               = "code-executor-cpu-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.code_executor[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.code_executor[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 60.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageCPUUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "retool_memory" {
+  count              = 1
+  name               = "retool-memory-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.retool[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.retool[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 70.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "workflows_worker_memory" {
+  count              = var.workflows_enabled ? 1 : 0
+  name               = "workflows-worker-memory-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.workflows_worker[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.workflows_worker[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 70.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "workflows_backend_memory" {
+  count              = var.workflows_enabled ? 1 : 0
+  name               = "workflows_backend-memory-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.workflows_backend[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.workflows_backend[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 70.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+resource "aws_appautoscaling_policy" "code_executor_memory" {
+  count              = var.code_executor_enabled ? 1 : 0
+  name               = "code-executor-memory-policy"
+  service_namespace  = "ecs"
+  resource_id        = aws_appautoscaling_target.code_executor[0].resource_id
+  scalable_dimension = aws_appautoscaling_target.code_executor[0].scalable_dimension
+  policy_type        = "TargetTrackingScaling"
+
+  target_tracking_scaling_policy_configuration {
+    target_value = 70.0
+    predefined_metric_specification {
+      predefined_metric_type = "ECSServiceAverageMemoryUtilization"
+    }
+    scale_in_cooldown  = 60
+    scale_out_cooldown = 60
+  }
+}
+
+# Attach an autoscaling policy to the spot cluster to target 70% MemoryReservation on the ECS cluster.
+# resource "aws_autoscaling_policy" "this" {
+#   count                  = var.launch_type == "EC2" ? 1 : 0
+#   name                   = "${var.deployment_name}-ecs-scale-policy"
+#   policy_type            = "TargetTrackingScaling"
+#   adjustment_type        = "ChangeInCapacity"
+#   autoscaling_group_name = aws_autoscaling_group.this[0].name
+#
+#   target_tracking_configuration {
+#     customized_metric_specification {
+#       metric_dimension {
+#         name  = "ClusterName"
+#         value = "${var.deployment_name}-ecs"
+#       }
+#       metric_name = "MemoryReservation"
+#       namespace   = "AWS/ECS"
+#       statistic   = "Average"
+#     }
+#     target_value = var.autoscaling_memory_reservation_target
+#   }
+# }
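Review note: the services in main.tf select this capacity provider by name in their capacity_provider_strategy blocks, which only resolves once the provider is associated with the ECS cluster. If that association is not already made elsewhere in the module, the missing glue would look roughly like the sketch below (resource name and strategy values are assumptions, not taken from this diff):

# Sketch only; skip if the module already associates capacity providers with the cluster.
resource "aws_ecs_cluster_capacity_providers" "this" {
  cluster_name       = aws_ecs_cluster.this.name
  capacity_providers = var.launch_type == "EC2" ? [aws_ecs_capacity_provider.this[0].name] : ["FARGATE"]

  default_capacity_provider_strategy {
    weight            = 100
    capacity_provider = var.launch_type == "EC2" ? aws_ecs_capacity_provider.this[0].name : "FARGATE"
  }
}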
diff --git a/modules/aws_ecs/loadbalancers.tf b/modules/aws_ecs/loadbalancers.tf
index 1d0d2ca..3f3e336 100644
--- a/modules/aws_ecs/loadbalancers.tf
+++ b/modules/aws_ecs/loadbalancers.tf
@@ -58,7 +58,7 @@ resource "aws_lb_target_group" "this" {
   deregistration_delay = 30
   port                 = 3000
   protocol             = "HTTP"
-  target_type          = var.launch_type == "FARGATE" ? "ip" : "instance"
+  target_type          = "ip"
 
   health_check {
     interval = 61
diff --git a/modules/aws_ecs/locals.tf b/modules/aws_ecs/locals.tf
index a58c2bb..dd65328 100644
--- a/modules/aws_ecs/locals.tf
+++ b/modules/aws_ecs/locals.tf
@@ -26,13 +26,6 @@ locals {
   environment_variables = concat(
     var.additional_env_vars, # add additional environment variables
     local.base_environment_variables,
-    local.temporal_mtls_config,
-    var.code_executor_enabled ? [
-      {
-        name  = "CODE_EXECUTOR_INGRESS_DOMAIN"
-        value = format("http://code-executor.%s:3004", local.service_discovery_namespace)
-      }
-    ] : [],
     var.telemetry_enabled ? [
       {
        name  = "RTEL_ENABLED"
@@ -72,27 +65,26 @@ locals {
     {
       "name" = "POSTGRES_USER",
       "value" = var.rds_username
     },
-    {
-      "name" = "POSTGRES_PASSWORD",
-      "value" = random_string.rds_password.result
-    },
-    {
-      "name" : "JWT_SECRET",
-      "value" : random_string.jwt_secret.result
-    },
-    {
-      "name" : "ENCRYPTION_KEY",
-      "value" : random_string.encryption_key.result
-    },
     {
       "name" : "LICENSE_KEY",
       "value" : var.retool_license_key
     },
-    # Workflows-specific
+    # WORKFLOW_BACKEND_HOST and CODE_EXECUTOR_INGRESS_DOMAIN point at workflows-specific services
     {
       "name" : "WORKFLOW_BACKEND_HOST",
       "value" : format("http://workflow-backend.%s:3000", local.service_discovery_namespace)
-    },
+    }
+  ],
+  var.code_executor_enabled ? [
+    {
+      name  = "CODE_EXECUTOR_INGRESS_DOMAIN"
+      value = format("http://code-executor.%s:3004", local.service_discovery_namespace)
+    }
+  ] : [],
+  # The section below is only needed if deploying Temporal locally from this template.
+  # Retool strongly recommends using the Retool Managed Temporal option instead.
+  local.temporal_mtls_config,
+  var.use_existing_temporal_cluster ? [] : [
     {
       "name" : "WORKFLOW_TEMPORAL_CLUSTER_NAMESPACE",
       "value" : var.temporal_cluster_config.namespace
@@ -112,6 +104,24 @@ locals {
   ]
   )
 
+  secrets = concat(
+    var.additional_secrets,
+    [
+      {
+        name      = "POSTGRES_PASSWORD",
+        valueFrom = aws_secretsmanager_secret.rds_password.arn
+      },
+      {
+        name      = "JWT_SECRET",
+        valueFrom = aws_secretsmanager_secret.jwt_secret.arn
+      },
+      {
+        name      = "ENCRYPTION_KEY",
+        valueFrom = aws_secretsmanager_secret.encryption_key.arn
+      }
+    ]
+  )
+
   task_log_configuration = (
     var.telemetry_enabled ? {
       # Send logs to CloudWatch in addition to telemetry service:
@@ -136,11 +146,12 @@ locals {
   common_containers = (
     var.telemetry_enabled ? [
       {
-        name      = "retool-fluentbit"
-        essential = true
-        image     = var.ecs_telemetry_fluentbit_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["fluentbit"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["fluentbit"]["memory"] : null
+        name              = "retool-fluentbit"
+        essential         = true
+        image             = var.ecs_telemetry_fluentbit_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["fluentbit"]["memory"] : null
 
         firelensConfiguration = {
           type = "fluentbit"
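Usage note: the secrets plumbing added in locals.tf above is driven entirely by module inputs. A hypothetical root-module call (the module path, profile name, and secret ARN below are placeholders, not values from this change) could look like:

module "retool_ecs" {
  source = "./modules/aws_ecs" # path assumed; adjust to your layout

  aws_region      = "us-east-1"
  profile         = "retool-deploy" # hypothetical AWS CLI profile
  deployment_name = "retool"
  launch_type     = "FARGATE"
  # ...remaining required inputs (VPC, subnets, license key, etc.) omitted

  # Injected into the containers alongside the built-in POSTGRES_PASSWORD, JWT_SECRET, and ENCRYPTION_KEY entries
  additional_secrets = [
    {
      name      = "GITHUB_APP_PRIVATE_KEY" # hypothetical secret
      valueFrom = "arn:aws:secretsmanager:us-east-1:123456789012:secret:retool/github-key-AbCdEf"
    }
  ]
}

Keep in mind that the execution-role policy added in roles.tf only grants secretsmanager:GetSecretValue on the three module-managed secrets, so any ARNs passed through additional_secrets must also be readable by the execution role.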
diff --git a/modules/aws_ecs/main.tf b/modules/aws_ecs/main.tf
index b89b2a7..352684e 100644
--- a/modules/aws_ecs/main.tf
+++ b/modules/aws_ecs/main.tf
@@ -7,6 +7,11 @@ terraform {
   }
 }
 
+provider "aws" {
+  profile = var.profile
+  region  = var.aws_region
+}
+
 data "aws_vpc" "selected" {
   id = var.vpc_id
 }
@@ -51,10 +56,8 @@ resource "aws_ecs_service" "retool" {
   name            = "${var.deployment_name}-main-service"
   cluster         = aws_ecs_cluster.this.id
   task_definition = aws_ecs_task_definition.retool.arn
-  desired_count   = var.min_instance_count - 1
   deployment_maximum_percent         = var.maximum_percent
   deployment_minimum_healthy_percent = var.minimum_healthy_percent
-  iam_role        = var.launch_type == "EC2" ? aws_iam_role.service_role.arn : null
   propagate_tags         = var.task_propagate_tags
   enable_execute_command = var.enable_execute_command
 
@@ -71,26 +74,23 @@ resource "aws_ecs_service" "retool" {
     capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
 resource "aws_ecs_service" "jobs_runner" {
-  name                   = "${var.deployment_name}-jobs-runner-service"
-  cluster                = aws_ecs_cluster.this.id
-  desired_count          = 1
-  task_definition        = aws_ecs_task_definition.retool_jobs_runner.arn
-  propagate_tags         = var.task_propagate_tags
-  enable_execute_command = var.enable_execute_command
+  name    = "${var.deployment_name}-jobs-runner-service"
+  cluster = aws_ecs_cluster.this.id
+  # desired_count is set to 1 since the Jobs Runner must be run as a singleton.
+  desired_count                      = 1
+  task_definition                    = aws_ecs_task_definition.retool_jobs_runner.arn
+  propagate_tags                     = var.task_propagate_tags
+  enable_execute_command             = var.enable_execute_command
+  deployment_minimum_healthy_percent = 0
+  deployment_maximum_percent         = 100
 
   # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823
   capacity_provider_strategy {
@@ -99,27 +99,22 @@ resource "aws_ecs_service" "jobs_runner" {
     capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
 resource "aws_ecs_service" "workflows_backend" {
-  count                  = var.workflows_enabled ? 1 : 0
-  name                   = "${var.deployment_name}-workflows-backend-service"
-  cluster                = aws_ecs_cluster.this.id
-  desired_count          = 1
-  task_definition        = aws_ecs_task_definition.retool_workflows_backend[0].arn
-  propagate_tags         = var.task_propagate_tags
-  enable_execute_command = var.enable_execute_command
+  count                              = var.workflows_enabled ? 1 : 0
+  name                               = "${var.deployment_name}-workflows-backend-service"
+  cluster                            = aws_ecs_cluster.this.id
+  task_definition                    = aws_ecs_task_definition.retool_workflows_backend[0].arn
+  propagate_tags                     = var.task_propagate_tags
+  enable_execute_command             = var.enable_execute_command
+  deployment_maximum_percent         = var.maximum_percent
+  deployment_minimum_healthy_percent = var.minimum_healthy_percent
 
   # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823
   capacity_provider_strategy {
@@ -132,27 +127,22 @@ resource "aws_ecs_service" "workflows_backend" {
     registry_arn = aws_service_discovery_service.retool_workflow_backend_service[0].arn
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
 resource "aws_ecs_service" "workflows_worker" {
-  count                  = var.workflows_enabled ? 1 : 0
-  name                   = "${var.deployment_name}-workflows-worker-service"
-  cluster                = aws_ecs_cluster.this.id
-  desired_count          = 1
-  task_definition        = aws_ecs_task_definition.retool_workflows_worker[0].arn
-  propagate_tags         = var.task_propagate_tags
-  enable_execute_command = var.enable_execute_command
+  count                              = var.workflows_enabled ? 1 : 0
+  name                               = "${var.deployment_name}-workflows-worker-service"
+  cluster                            = aws_ecs_cluster.this.id
+  task_definition                    = aws_ecs_task_definition.retool_workflows_worker[0].arn
+  deployment_maximum_percent         = var.maximum_percent
+  deployment_minimum_healthy_percent = var.minimum_healthy_percent
+  propagate_tags                     = var.task_propagate_tags
+  enable_execute_command             = var.enable_execute_command
 
   # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823
   capacity_provider_strategy {
@@ -161,26 +151,21 @@ resource "aws_ecs_service" "workflows_worker" {
     capacity_provider = var.launch_type == "FARGATE" ? "FARGATE" : aws_ecs_capacity_provider.this[0].name
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
 resource "aws_ecs_service" "code_executor" {
-  count                  = var.code_executor_enabled ? 1 : 0
-  name                   = "${var.deployment_name}-code-executor-service"
-  cluster                = aws_ecs_cluster.this.id
-  desired_count          = 1
-  task_definition        = aws_ecs_task_definition.retool_code_executor[0].arn
-  enable_execute_command = var.enable_execute_command
+  count                              = var.code_executor_enabled ? 1 : 0
+  name                               = "${var.deployment_name}-code-executor-service"
+  cluster                            = aws_ecs_cluster.this.id
+  task_definition                    = aws_ecs_task_definition.retool_code_executor[0].arn
+  enable_execute_command             = var.enable_execute_command
+  deployment_maximum_percent         = var.maximum_percent
+  deployment_minimum_healthy_percent = var.minimum_healthy_percent
 
   # Need to explictly set this in aws_ecs_service to avoid destructive behavior: https://github.com/hashicorp/terraform-provider-aws/issues/22823
   capacity_provider_strategy {
@@ -193,16 +178,10 @@ resource "aws_ecs_service" "code_executor" {
     registry_arn = aws_service_discovery_service.retool_code_executor_service[0].arn
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
@@ -226,37 +205,32 @@ resource "aws_ecs_service" "telemetry" {
     registry_arn = aws_service_discovery_service.retool_telemetry_service[0].arn
   }
 
-  dynamic "network_configuration" {
-    for_each = var.launch_type == "FARGATE" ? toset([1]) : toset([])
-
-    content {
-      subnets = var.private_subnet_ids
-      security_groups = [
-        aws_security_group.containers.id
-      ]
-      assign_public_ip = true
-    }
+  network_configuration {
+    subnets          = var.private_subnet_ids
+    security_groups  = [aws_security_group.containers.id]
+    assign_public_ip = false
   }
 }
 
 resource "aws_ecs_task_definition" "retool_jobs_runner" {
   family        = "retool-jobs-runner"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["jobs_runner"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["jobs_runner"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["jobs_runner"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["jobs_runner"]["memory"] : null
   container_definitions = jsonencode(concat(
     local.common_containers,
     [
       {
-        name      = "retool-jobs-runner"
-        essential = true
-        image     = var.ecs_retool_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["jobs_runner"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["jobs_runner"]["memory"] : null
-        command = [
+        name              = "retool-jobs-runner"
+        essential         = true
+        image             = var.ecs_retool_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["jobs_runner"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["jobs_runner"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["jobs_runner"]["memoryReservation"] : null
+        command = [
           "./docker_scripts/start_api.sh"
         ]
@@ -279,6 +253,8 @@ resource "aws_ecs_task_definition" "retool_jobs_runner" {
           }
         ]
         )
+
+        secrets = local.secrets
       }
     ]
   ))
@@ -287,22 +263,23 @@ resource "aws_ecs_task_definition" "retool" {
   family        = "retool"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["main"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["main"]["memory"] : null
   container_definitions = jsonencode(concat(
     local.common_containers,
     [
      {
-        name      = "retool"
-        essential = true
-        image     = var.ecs_retool_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["main"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["main"]["memory"] : null
-        command = [
+        name              = "retool"
+        essential         = true
+        image             = var.ecs_retool_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["main"]["memoryReservation"] : null
+        command = [
          "./docker_scripts/start_api.sh"
        ]
@@ -329,6 +306,8 @@
           }
         ]
         )
+
+        secrets = local.secrets
       }
     ]
   ))
@@ -338,22 +317,23 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" {
   count         = var.workflows_enabled ? 1 : 0
   family        = "retool-workflows-backend"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_backend"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_backend"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_backend"]["memory"] : null
   container_definitions = jsonencode(concat(
     local.common_containers,
     [
      {
-        name      = "retool-workflows-backend"
-        essential = true
-        image     = var.ecs_retool_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_backend"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_backend"]["memory"] : null
-        command = [
+        name              = "retool-workflows-backend"
+        essential         = true
+        image             = var.ecs_retool_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_backend"]["memoryReservation"] : null
+        command = [
          "./docker_scripts/start_api.sh"
        ]
@@ -380,6 +360,8 @@ resource "aws_ecs_task_definition" "retool_workflows_backend" {
           }
         ]
         )
+
+        secrets = local.secrets
      }
    ]
  ))
@@ -389,22 +371,23 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" {
   count         = var.workflows_enabled ? 1 : 0
   family        = "retool-workflows-worker"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["workflows_worker"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_worker"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["workflows_worker"]["memory"] : null
   container_definitions = jsonencode(concat(
     local.common_containers,
     [
      {
-        name      = "retool-workflows-worker"
-        essential = true
-        image     = var.ecs_retool_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_worker"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["workflows_worker"]["memory"] : null
-        command = [
+        name              = "retool-workflows-worker"
+        essential         = true
+        image             = var.ecs_retool_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["workflows_worker"]["memoryReservation"] : null
+        command = [
          "./docker_scripts/start_api.sh"
        ]
@@ -435,6 +418,8 @@ resource "aws_ecs_task_definition" "retool_workflows_worker" {
           }
         ]
         )
+
+        secrets = local.secrets
       }
     ]
   ))
@@ -444,22 +429,23 @@ resource "aws_ecs_task_definition" "retool_code_executor" {
   count         = var.code_executor_enabled ? 1 : 0
   family        = "retool-code-executor"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["code_executor"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["code_executor"]["memory"] : null
   container_definitions = jsonencode(concat(
     local.common_containers,
     [
      {
-        name      = "retool-code-executor"
-        essential = true
-        image     = local.ecs_code_executor_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["code_executor"]["memory"] : null
-        user      = var.launch_type == "EC2" ? null : "1001:1001"
+        name              = "retool-code-executor"
+        essential         = true
+        image             = local.ecs_code_executor_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["code_executor"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["code_executor"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["code_executor"]["memoryReservation"] : null
+        user              = var.launch_type == "EC2" ? null : "1001:1001"
         # required to use nsjail sandboxing, which is required for custom libraries for JS and Python
         # Learn more here: https://docs.retool.com/self-hosted/concepts/architecture#code-executor
         # If not using nsjail sandboxing, update this to be false and use user = "1001:1001"
@@ -498,6 +484,8 @@
           }
         ] : []
         )
+
+        secrets = local.secrets
       }
     ]
   ))
@@ -507,20 +495,21 @@ resource "aws_ecs_task_definition" "retool_telemetry" {
   count         = var.telemetry_enabled ? 1 : 0
   family        = "retool-telemetry"
   task_role_arn = aws_iam_role.task_role.arn
-  execution_role_arn       = var.launch_type == "FARGATE" ? aws_iam_role.execution_role[0].arn : null
-  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : null
-  network_mode             = var.launch_type == "FARGATE" ? "awsvpc" : "bridge"
-  cpu                      = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["telemetry"]["cpu"] : null
-  memory                   = var.launch_type == "FARGATE" ? var.ecs_task_resource_map["telemetry"]["memory"] : null
+  execution_role_arn       = aws_iam_role.execution_role.arn
+  requires_compatibilities = var.launch_type == "FARGATE" ? ["FARGATE"] : ["EC2"]
+  network_mode             = "awsvpc"
+  cpu                      = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["cpu"] : null
+  memory                   = var.launch_type == "FARGATE" ? var.fargate_task_resource_map["telemetry"]["memory"] : null
   container_definitions = jsonencode(
     [
      {
-        name      = "retool-telemetry"
-        essential = true
-        image     = local.ecs_telemetry_image
-        cpu       = var.launch_type == "EC2" ? var.ecs_task_resource_map["telemetry"]["cpu"] : null
-        memory    = var.launch_type == "EC2" ? var.ecs_task_resource_map["telemetry"]["memory"] : null
+        name              = "retool-telemetry"
+        essential         = true
+        image             = local.ecs_telemetry_image
+        cpu               = var.launch_type == "EC2" ? var.ec2_task_resource_map["telemetry"]["cpu"] : null
+        memory            = var.launch_type == "EC2" ? var.ec2_task_resource_map["telemetry"]["memory"] : null
+        memoryReservation = var.launch_type == "EC2" ? var.ec2_task_resource_map["telemetry"]["memoryReservation"] : null
         command = [
           "retool-telemetry"
         ]
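Networking note: every service now runs with network_mode = "awsvpc", private subnets, and assign_public_ip = false, so tasks need a private path to ECR, Secrets Manager, and CloudWatch — a NAT gateway in the VPC is the simplest option and makes the rest of this note unnecessary. If the VPC has no NAT, interface endpoints along the lines of the sketch below would be needed (resource names are assumptions; an ecr.api endpoint and an S3 gateway endpoint are also required for image-layer pulls):

resource "aws_vpc_endpoint" "ecr_dkr" {
  vpc_id              = var.vpc_id
  service_name        = "com.amazonaws.${var.aws_region}.ecr.dkr"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = var.private_subnet_ids
  security_group_ids  = [aws_security_group.containers.id]
  private_dns_enabled = true
}

resource "aws_vpc_endpoint" "secretsmanager" {
  vpc_id              = var.vpc_id
  service_name        = "com.amazonaws.${var.aws_region}.secretsmanager"
  vpc_endpoint_type   = "Interface"
  subnet_ids          = var.private_subnet_ids
  security_group_ids  = [aws_security_group.containers.id]
  private_dns_enabled = true
}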
diff --git a/modules/aws_ecs/roles.tf b/modules/aws_ecs/roles.tf
index b242bd2..f28387b 100644
--- a/modules/aws_ecs/roles.tf
+++ b/modules/aws_ecs/roles.tf
@@ -69,7 +69,6 @@ resource "aws_iam_role" "service_role" {
   }
 }
 
-# Execution Role for Fargate
 data "aws_iam_policy_document" "execution_role_assume_policy" {
   statement {
     actions = ["sts:AssumeRole"]
@@ -82,17 +81,42 @@ data "aws_iam_policy_document" "execution_role_assume_policy" {
 }
 
 resource "aws_iam_role" "execution_role" {
-  count              = var.launch_type == "FARGATE" ? 1 : 0
   name               = "${var.deployment_name}-execution-role"
   assume_role_policy = data.aws_iam_policy_document.execution_role_assume_policy.json
 }
 
 resource "aws_iam_role_policy_attachment" "execution_role" {
-  count      = var.launch_type == "FARGATE" ? 1 : 0
-  role       = aws_iam_role.execution_role[0].name
+  role       = aws_iam_role.execution_role.name
   policy_arn = "arn:${var.iam_partition}:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
 }
 
+data "aws_iam_policy_document" "execution_role_read_secrets" {
+  statement {
+    effect = "Allow"
+
+    actions = [
+      "secretsmanager:GetSecretValue",
+    ]
+
+    resources = [
+      aws_secretsmanager_secret.rds_password.arn,
+      aws_secretsmanager_secret.encryption_key.arn,
+      aws_secretsmanager_secret.jwt_secret.arn
+    ]
+  }
+}
+
+resource "aws_iam_policy" "execution_role_read_secrets" {
+  name        = "ExecutionRoleReadSecrets"
+  description = "Allows the ECS task execution role to read the deployment's secret values from AWS Secrets Manager"
+  policy      = data.aws_iam_policy_document.execution_role_read_secrets.json
+}
+
+resource "aws_iam_role_policy_attachment" "execution_role_read_secrets" {
+  role       = aws_iam_role.execution_role.name
+  policy_arn = aws_iam_policy.execution_role_read_secrets.arn
+}
+
 # IAM Role for EC2 instances
 resource "aws_iam_instance_profile" "ec2" {
   count = var.launch_type == "EC2" ? 1 : 0
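The policy above and the secrets local in locals.tf reference aws_secretsmanager_secret.rds_password, jwt_secret, and encryption_key, which are defined in a file not shown in this diff. For context, they are assumed to follow the shape below (the secret names and the reuse of the module's existing random_string resources are assumptions):

resource "aws_secretsmanager_secret" "rds_password" {
  name = "${var.deployment_name}-rds-password"
}

resource "aws_secretsmanager_secret_version" "rds_password" {
  secret_id     = aws_secretsmanager_secret.rds_password.id
  secret_string = random_string.rds_password.result
}

# jwt_secret and encryption_key are assumed to follow the same secret + secret_version pattern.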
diff --git a/modules/aws_ecs/variables.tf b/modules/aws_ecs/variables.tf
index 81651d9..6ac0be3 100644
--- a/modules/aws_ecs/variables.tf
+++ b/modules/aws_ecs/variables.tf
@@ -4,6 +4,11 @@ variable "aws_region" {
   description = "AWS region. Defaults to `us-east-1`"
 }
 
+variable "profile" {
+  type        = string
+  description = "Optional AWS CLI Profile."
+}
+
 variable "node_env" {
   type    = string
   default = "production"
@@ -45,7 +50,7 @@ variable "max_instance_count" {
 
 variable "min_instance_count" {
   type        = number
-  description = "Min/desired number of EC2 instances. Defaults to 4."
+  description = "Min/desired number of EC2 instances. Defaults to 3."
   default     = 3
 }
 
@@ -97,7 +102,58 @@ variable "ecs_telemetry_fluentbit_image" {
   default     = "tryretool/retool-aws-for-fluent-bit:3.120.0-edge"
 }
 
-variable "ecs_task_resource_map" {
+# ECS treats CPU and Memory differently between the EC2 and Fargate launch types.
+# Retool provides separate sane defaults for both, and the template uses the resource map that matches the configured launch type.
+# With Fargate, ECS treats CPU and Memory as exact task-size requests; with EC2, ECS treats CPU as a soft limit and
+# memory as a hard limit, and additionally supports memoryReservation as a soft limit.
+
+variable "ec2_task_resource_map" {
+  type = map(object({
+    cpu               = number
+    memory            = number
+    memoryReservation = number
+  }))
+  default = {
+    main = {
+      cpu               = 2048
+      memory            = 4096
+      memoryReservation = 4096
+    },
+    jobs_runner = {
+      cpu               = 1024
+      memory            = 4096
+      memoryReservation = 2048
+    },
+    workflows_backend = {
+      cpu               = 2048
+      memory            = 4096
+      memoryReservation = 4096
+    }
+    workflows_worker = {
+      cpu               = 1024
+      memory            = 4096
+      memoryReservation = 2048
+    }
+    code_executor = {
+      cpu               = 1024
+      memory            = 4096
+      memoryReservation = 2048
+    }
+    telemetry = {
+      cpu               = 1024
+      memory            = 4096
+      memoryReservation = 2048
+    }
+    fluentbit = {
+      cpu               = 512
+      memory            = 2048
+      memoryReservation = 1024
+    }
+  }
+  description = "Amount of CPU and Memory provisioned for each task with the EC2 launch type."
+}
+
+variable "fargate_task_resource_map" {
   type = map(object({
     cpu    = number
     memory = number
@@ -108,8 +164,8 @@ variable "ecs_task_resource_map" {
       memory = 4096
     },
     jobs_runner = {
-      cpu    = 1024
-      memory = 2048
+      cpu    = 2048
+      memory = 4096
     },
     workflows_backend = {
       cpu    = 2048
@@ -132,7 +188,7 @@ variable "ecs_task_resource_map" {
       memory = 1024
     }
   }
-  description = "Amount of CPU and Memory provisioned for each task."
+  description = "Amount of CPU and Memory provisioned for each task with the Fargate launch type."
 }
 
 variable "temporal_ecs_task_resource_map" {
@@ -451,12 +507,24 @@ variable "autoscaling_memory_reservation_target" {
   description = "Memory reservation target for the Autoscaling Group. Defaults to 70.0."
 }
 
+variable "autoscaling_cpu_reservation_target" {
+  type        = number
+  default     = 60.0
+  description = "CPU reservation target for the Autoscaling Group. Defaults to 60.0."
+}
+
 variable "additional_env_vars" {
   type        = list(map(string))
   default     = []
   description = "Additional environment variables (e.g. BASE_DOMAIN)"
 }
 
+variable "additional_secrets" {
+  type        = list(map(string))
+  default     = []
+  description = "Optional additional environment variables sourced from pre-existing AWS Secrets Manager secrets."
+}
+
 variable "additional_temporal_env_vars" {
   type        = list(map(string))
   default     = []