From 4bf1c4987a1a389686e4b1ade85c5d7cbfa963e8 Mon Sep 17 00:00:00 2001 From: Misael Ramirez Date: Wed, 11 Dec 2024 12:51:37 +0100 Subject: [PATCH 1/3] Enabling variables to control job batch limits --- main.tf | 4 ++++ modules/computation/variables.tf | 15 +++++++++++++++ variables.tf | 15 +++++++++++++++ 3 files changed, 34 insertions(+) diff --git a/main.tf b/main.tf index 9b2aaee..abe44a2 100644 --- a/main.tf +++ b/main.tf @@ -97,6 +97,10 @@ module "metaflow-computation" { launch_template_http_endpoint = var.launch_template_http_endpoint launch_template_http_tokens = var.launch_template_http_tokens launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit + job_state_time_limit_action = var.job_state_time_limit_action + job_state_time_limit_timeout = var.job_state_time_limit_timeout + job_state_time_limit_reason = var.job_state_time_limit_reason + standard_tags = var.tags } diff --git a/modules/computation/variables.tf b/modules/computation/variables.tf index 92d11ba..4ae5df5 100644 --- a/modules/computation/variables.tf +++ b/modules/computation/variables.tf @@ -102,3 +102,18 @@ variable "launch_template_image_id" { nullable = true default = null } + +variable "job_state_time_limit_action" { + type = string + description = "The action to take when the job times out" +} + +variable "job_state_time_limit_timeout" { + type = number + description = "The time limit in seconds for the job to run before the action is taken" +} + +variable "job_state_time_limit_reason" { + type = number + description = "The reason for the job state time limit action" +} diff --git a/variables.tf b/variables.tf index 1738c0b..0b0758e 100644 --- a/variables.tf +++ b/variables.tf @@ -199,3 +199,18 @@ variable "enable_key_rotation" { description = "Enable key rotation for KMS keys" default = false } + +variable "job_state_time_limit_action" { + type = string + description = "The action to take when the job times out" +} + +variable 
"job_state_time_limit_timeout" { + type = number + description = "The time limit in seconds for the job to run before the action is taken" +} + +variable "job_state_time_limit_reason" { + type = number + description = "The reason for the job state time limit action" +} From 1ef6a3ac4dbc507120446a104adf6136c78eeb09 Mon Sep 17 00:00:00 2001 From: Misael Ramirez Date: Mon, 3 Feb 2025 12:39:01 +0100 Subject: [PATCH 2/3] Deleting action variable and change data type for time limit reason --- main.tf | 2 -- modules/computation/batch.tf | 6 ++++++ modules/computation/variables.tf | 7 +------ variables.tf | 13 ++++++------- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/main.tf b/main.tf index abe44a2..ee1d495 100644 --- a/main.tf +++ b/main.tf @@ -97,11 +97,9 @@ module "metaflow-computation" { launch_template_http_endpoint = var.launch_template_http_endpoint launch_template_http_tokens = var.launch_template_http_tokens launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit - job_state_time_limit_action = var.job_state_time_limit_action job_state_time_limit_timeout = var.job_state_time_limit_timeout job_state_time_limit_reason = var.job_state_time_limit_reason - standard_tags = var.tags } diff --git a/modules/computation/batch.tf b/modules/computation/batch.tf index 655e4a6..e0c5f77 100644 --- a/modules/computation/batch.tf +++ b/modules/computation/batch.tf @@ -83,5 +83,11 @@ resource "aws_batch_job_queue" "this" { aws_batch_compute_environment.this.arn ] + job_state_time_limit_action { + action= "CANCEL" + max_time_seconds = var.job_state_time_limit_timeout + reason=var.job_state_time_limit_reason + state="RUNNABLE" + } tags = var.standard_tags } diff --git a/modules/computation/variables.tf b/modules/computation/variables.tf index 4ae5df5..bca601f 100644 --- a/modules/computation/variables.tf +++ b/modules/computation/variables.tf @@ -103,17 +103,12 @@ variable "launch_template_image_id" { default = null } 
-variable "job_state_time_limit_action" { - type = string - description = "The action to take when the job times out" -} - variable "job_state_time_limit_timeout" { type = number description = "The time limit in seconds for the job to run before the action is taken" } variable "job_state_time_limit_reason" { - type = number + type = string description = "The reason for the job state time limit action" } diff --git a/variables.tf b/variables.tf index 0b0758e..9c454a3 100644 --- a/variables.tf +++ b/variables.tf @@ -200,17 +200,16 @@ variable "enable_key_rotation" { default = false } -variable "job_state_time_limit_action" { - type = string - description = "The action to take when the job times out" -} - variable "job_state_time_limit_timeout" { type = number description = "The time limit in seconds for the job to run before the action is taken" + validation { + condition = var.job_state_time_limit_timeout == null ? true : var.job_state_time_limit_timeout >= 600 + error_message = "The value for 'job_state_time_limit_timeout' must be greater than or equal to 600." + } } variable "job_state_time_limit_reason" { - type = number - description = "The reason for the job state time limit action" + type = string + description = "The reason to log for the action being taken." 
+ } From 7b222fc35ca8108539accd7b01b3469b4c169b2d Mon Sep 17 00:00:00 2001 From: Misael Ramirez Date: Mon, 3 Feb 2025 15:38:18 +0100 Subject: [PATCH 3/3] Dynamic job state_time_limit_action --- modules/computation/batch.tf | 17 ++++++++++++----- modules/computation/locals.tf | 2 ++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/modules/computation/batch.tf b/modules/computation/batch.tf index e0c5f77..d6010d3 100644 --- a/modules/computation/batch.tf +++ b/modules/computation/batch.tf @@ -75,6 +75,8 @@ resource "aws_batch_compute_environment" "this" { } } + + resource "aws_batch_job_queue" "this" { name = local.batch_queue_name state = "ENABLED" @@ -83,11 +85,16 @@ resource "aws_batch_job_queue" "this" { aws_batch_compute_environment.this.arn ] - job_state_time_limit_action { - action= "CANCEL" - max_time_seconds = var.job_state_time_limit_timeout - reason=var.job_state_time_limit_reason - state="RUNNABLE" + dynamic "job_state_time_limit_action" { + for_each = local.job_state_valid ? [1] : [] + + content { + action= "CANCEL" + max_time_seconds = var.job_state_time_limit_timeout + /* For valid reasons go to docs --> https://docs.aws.amazon.com/batch/latest/userguide/job_stuck_in_runnable.html */ + reason=var.job_state_time_limit_reason + state="RUNNABLE" + } } tags = var.standard_tags } diff --git a/modules/computation/locals.tf b/modules/computation/locals.tf index bea84cb..9c40220 100644 --- a/modules/computation/locals.tf +++ b/modules/computation/locals.tf @@ -19,4 +19,6 @@ locals { ecs_instance_role_name = "${var.resource_prefix}ecs-iam-role${var.resource_suffix}" enable_fargate_on_batch = var.batch_type == "fargate" + + job_state_valid = ( var.job_state_time_limit_timeout != null && var.job_state_time_limit_reason != null ) }