
Commit d2481f6

initial implementation of standalone metaflow module (#3)
* initial metaflow module
* remove comments
* make final snapshot unique to prevent issues when recreating db instances
* add comments for replace
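The datastore changes mentioned in the third bullet live in files hidden from this view. As a hypothetical sketch of the general pattern that bullet describes (none of these names come from the actual commit), making the final snapshot name unique avoids a collision with the snapshot left behind the last time the instance was destroyed:

```hcl
# Hypothetical sketch; not the commit's actual datastore code.
resource "random_pet" "final_snapshot_suffix" {}

resource "aws_db_instance" "example" {
  identifier        = "metaflow-example-db"
  engine            = "postgres"
  instance_class    = "db.t3.micro"
  allocated_storage = 20
  username          = "example"
  password          = "example-only-change-me"

  # A unique suffix keeps the final snapshot name from colliding with one left by
  # a previously destroyed instance when the database is recreated.
  final_snapshot_identifier = "metaflow-db-final-snapshot-${random_pet.final_snapshot_suffix.id}"
}
```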


45 files changed: 2137 additions, 0 deletions
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# README

This project is composed of modules that break up the responsibility into logical parts. See each module's
corresponding `README.md` for more details.

Provides the core functionality for Metaflow, which includes:

- on demand processing (`computation`)
- blob and tabular storage (`datastore`)
- an API to record and query past executions (`metadata-service`)
- orchestrated processing (`step-functions`)

Depends on the output of the project `infra`.

## ECR

Sets up an AWS ECR repository to hold the Docker image we want to use with Metaflow.
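The module is meant to sit on top of the `infra` project's outputs. As a non-authoritative sketch of what a consumer might write (the `infra` output names and the module path are assumptions; only the input names appear in this commit's main.tf):

```hcl
# Sketch only; `module.infra` outputs and the source path are assumptions.
module "metaflow" {
  source = "./aws/terraform/modules/metaflow"

  vpc_id                  = module.infra.vpc_id
  vpc_cidr_block          = module.infra.vpc_cidr_block
  subnet_private_1_id     = module.infra.subnet_private_1_id
  subnet_private_2_id     = module.infra.subnet_private_2_id
  access_list_cidr_blocks = []
  enable_step_functions   = true

  # Remaining inputs (metaflow_policy_arn, resource prefix/suffix, vCPU sizing) omitted for brevity.

  tags = {
    project = "metaflow"
  }
}
```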

aws/terraform/modules/metaflow/ecr.tf

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
resource "aws_ecr_repository" "metaflow_batch_image" {
  name = local.metaflow_batch_image_name

  tags = var.tags
}
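The module's outputs file is not visible in this view. A hedged sketch of how the repository URL could be exposed so the Docker image can be pushed and referenced by Batch job definitions (the output name is an assumption):

```hcl
# Assumed output; not part of the visible diff.
output "metaflow_batch_container_image" {
  value       = aws_ecr_repository.metaflow_batch_image.repository_url
  description = "URL of the ECR repository holding the Docker image used for Metaflow Batch jobs"
}
```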
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
locals {
  resource_prefix = length(var.resource_prefix) > 0 ? "${var.resource_prefix}-" : ""
  resource_suffix = length(var.resource_suffix) > 0 ? "-${var.resource_suffix}" : ""

  metaflow_batch_image_name = "${local.resource_prefix}batch${local.resource_suffix}"
}
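To make the ternaries concrete, here is how these locals evaluate for a couple of hypothetical inputs (illustration only):

```hcl
# Illustration only, with hypothetical inputs:
#   resource_prefix = "metaflow", resource_suffix = "prod"
#     local.resource_prefix           = "metaflow-"
#     local.resource_suffix           = "-prod"
#     local.metaflow_batch_image_name = "metaflow-batch-prod"
#
#   resource_prefix = "", resource_suffix = ""
#     local.metaflow_batch_image_name = "batch"
```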
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
module "metaflow-datastore" {
  source = "./modules/datastore"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id                    = var.vpc_id
  ecs_instance_role_arn              = module.metaflow-computation.ecs_instance_role_arn
  ecs_execution_role_arn             = module.metaflow-computation.ecs_execution_role_arn
  aws_batch_service_role_arn         = module.metaflow-computation.batch_service_role_arn
  subnet_private_1_id                = var.subnet_private_1_id
  subnet_private_2_id                = var.subnet_private_2_id
  metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id

  standard_tags = var.tags
}

module "metaflow-metadata-service" {
  source = "./modules/metadata-service"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id              = var.vpc_id
  vpc_cidr_block               = var.vpc_cidr_block
  subnet_private_1_id          = var.subnet_private_1_id
  subnet_private_2_id          = var.subnet_private_2_id
  rds_master_instance_endpoint = module.metaflow-datastore.rds_master_instance_endpoint
  database_username            = module.metaflow-datastore.database_username
  database_password            = module.metaflow-datastore.database_password
  fargate_task_role_arn        = module.metaflow-datastore.iam_s3_access_role_arn
  fargate_execution_role_arn   = module.metaflow-computation.ecs_execution_role_arn
  access_list_cidr_blocks      = var.access_list_cidr_blocks

  standard_tags = var.tags
}

module "metaflow-computation" {
  source = "./modules/computation"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id                                   = var.vpc_id
  subnet_private_1_id                               = var.subnet_private_1_id
  subnet_private_2_id                               = var.subnet_private_2_id
  s3_kms_policy_arn                                 = module.metaflow-datastore.metaflow_kms_s3_policy_arn
  metaflow_policy_arn                               = var.metaflow_policy_arn
  metaflow_step_functions_dynamodb_policy           = module.metaflow-step-functions.metaflow_step_functions_dynamodb_policy
  batch_compute_environment_cpu_max_vcpus           = var.cpu_max_compute_vcpus
  batch_compute_environment_cpu_desired_vcpus       = var.cpu_desired_compute_vcpus
  batch_compute_environment_cpu_min_vcpus           = var.cpu_min_compute_vcpus
  batch_compute_environment_large_cpu_max_vcpus     = var.large_cpu_max_compute_vcpus
  batch_compute_environment_large_cpu_desired_vcpus = var.large_cpu_desired_compute_vcpus
  batch_compute_environment_large_cpu_min_vcpus     = var.large_cpu_min_compute_vcpus
  batch_compute_environment_gpu_max_vcpus           = var.gpu_max_compute_vcpus
  batch_compute_environment_gpu_desired_vcpus       = var.gpu_desired_compute_vcpus
  batch_compute_environment_gpu_min_vcpus           = var.gpu_min_compute_vcpus
  enable_step_functions                             = var.enable_step_functions

  standard_tags = var.tags
}

module "metaflow-step-functions" {
  source = "./modules/step-functions"

  active          = var.enable_step_functions
  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  batch_job_queue_arn = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE
  s3_bucket_arn       = module.metaflow-datastore.s3_bucket_arn
  s3_bucket_kms_arn   = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn

  standard_tags = var.tags
}
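The root module's variables.tf is part of this commit but hidden in this view. A minimal sketch of declarations implied by the references above (types, descriptions, and defaults are assumptions):

```hcl
# Assumed declarations; the actual variables.tf in this commit is not shown above.
variable "vpc_id" {
  type        = string
  description = "VPC to deploy Metaflow into, provided by the `infra` project"
}

variable "vpc_cidr_block" {
  type = string
}

variable "subnet_private_1_id" {
  type = string
}

variable "subnet_private_2_id" {
  type = string
}

variable "enable_step_functions" {
  type        = bool
  description = "Toggles the step-functions module on or off"
}

variable "tags" {
  type    = map(string)
  default = {}
}
```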
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Modules

Our Metaflow Terraform code has been split into separate modules based on the service architecture.

## Computation

Sets up remote computation resources so flows can be run on EC2 instances. These resources do not perform
orchestration and rely on the data scientist's computer to perform this coordination.

## Datastore

Sets up blob and tabular data storage. Records all flows, the steps they took, their conda environments, artifacts
and results.

Should exist for the lifetime of the stack.

## Metadata Service

Sets up an API entrypoint to interact with all other services, both for running flows and for interacting with the
Datastore to explore historic runs.

## Step Functions

Sets up remote computation resources that come with orchestration. This allows data scientists to schedule flows
using crons, as well as to kick off flows and then shut down their machines, since the remote resources handle all
coordination.
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Computation

This module sets up the resources required to perform remote AWS Batch executions. One can modify how many resources
we want to have available at a given moment, as well as which resources we want to allow to scale up.

One can use Metaflow without this module by running purely locally and leveraging the Datastore.

To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#compute).
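As a rough illustration (values are arbitrary; the variable names come from the root main.tf shown earlier), the amount of Batch capacity this module manages could be tuned from a tfvars file along these lines:

```hcl
# terraform.tfvars (illustrative values only)
cpu_min_compute_vcpus     = 0
cpu_desired_compute_vcpus = 4
cpu_max_compute_vcpus     = 64

large_cpu_min_compute_vcpus     = 0
large_cpu_desired_compute_vcpus = 0
large_cpu_max_compute_vcpus     = 128

gpu_min_compute_vcpus     = 0
gpu_desired_compute_vcpus = 0
gpu_max_compute_vcpus     = 8
```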
Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
resource "aws_batch_compute_environment" "cpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.cpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_cpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_cpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_cpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_cpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_compute_environment" "large-cpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.large_cpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_large_cpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_large_cpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_large_cpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_large_cpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_compute_environment" "gpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.gpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_gpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_gpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_gpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_gpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_job_queue" "this" {
  name     = local.batch_queue_name
  state    = "ENABLED"
  priority = 1
  compute_environments = [
    aws_batch_compute_environment.cpu.arn,
    aws_batch_compute_environment.large-cpu.arn,
    aws_batch_compute_environment.gpu.arn
  ]

  tags = var.standard_tags
}
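The aws_launch_template.this referenced by all three compute environments is defined in a file that is hidden in this commit view. As a non-authoritative sketch, assuming its purpose is the larger-than-8 GB root volume mentioned in the comments above (device name, volume size, and naming are hypothetical), it might look roughly like this:

```hcl
# Hypothetical sketch only; the real launch template in this commit is not visible here.
resource "aws_launch_template" "this" {
  name_prefix = "metaflow-batch-launch-tmpl-"

  block_device_mappings {
    # Device name and size are assumptions; the intent is simply a root volume
    # larger than the default 8 GB so training data fits on disk.
    device_name = "/dev/xvda"

    ebs {
      volume_size = 100
      volume_type = "gp2"
    }
  }

  tags = var.standard_tags
}
```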
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data "aws_region" "current" {}
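Presumably this data source is referenced elsewhere in the module (not visible in this view), for example as `data.aws_region.current.name` when constructing region-specific ARNs or endpoints; that usage is an assumption rather than something shown in this diff.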
