diff --git a/.gitignore b/.gitignore index 5def054..a551b80 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,31 @@ -.terraform +# Local .terraform directories +**/.terraform/* + +# .tfstate files +*.tfstate +*.tfstate.* + +# Crash log files +crash.log + +# Ignore override files as they are usually used to override resources locally and should not be committed +override.tf +override.tf.json +*_override.tf +*_override.tf.json + +# Ignore CLI configuration files +.terraformrc +terraform.rc + +# Ignore sensitive variable files +*.tfvars +*.tfvars.json + +# Ignore plan output files +*.tfplan + +# Ignore lock files .terraform.lock.hcl -terraform.tfstate -terraform.tfstate.backup + +*.metaflow* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index fd46fa5..431d7a7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,15 +1,15 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v2.3.0 + rev: v5.0.0 hooks: - id: end-of-file-fixer - id: trailing-whitespace - repo: https://github.com/antonbabenko/pre-commit-terraform - rev: v1.62.0 # Get the latest from: https://github.com/antonbabenko/pre-commit-terraform/releases + rev: v1.96.3 hooks: - id: terraform_fmt - repo: https://github.com/terraform-docs/terraform-docs - rev: "v0.15.0" + rev: "v0.18.0" hooks: - id: terraform-docs-go name: "Main terraform module docs" diff --git a/README.md b/README.md index f4cedd3..e1695c8 100644 --- a/README.md +++ b/README.md @@ -90,33 +90,54 @@ resource "local_file" "metaflow_config" { | Name | Source | Version | |------|--------|---------| +| [eks](#module\_eks) | terraform-aws-modules/eks/aws | 20.31.6 | | [metaflow-common](#module\_metaflow-common) | ./modules/common | n/a | | [metaflow-computation](#module\_metaflow-computation) | ./modules/computation | n/a | | [metaflow-datastore](#module\_metaflow-datastore) | ./modules/datastore | n/a | | [metaflow-metadata-service](#module\_metaflow-metadata-service) | ./modules/metadata-service | n/a | | [metaflow-step-functions](#module\_metaflow-step-functions) | ./modules/step-functions | n/a | | [metaflow-ui](#module\_metaflow-ui) | ./modules/ui | n/a | +| [metaflow\_helm](#module\_metaflow\_helm) | ./modules/services | n/a | +| [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 | ## Inputs | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [access\_list\_cidr\_blocks](#input\_access\_list\_cidr\_blocks) | List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks. | `list(string)` | `[]` | no | +| [access\_list\_cidr\_blocks](#input\_access\_list\_cidr\_blocks) | List of CIDRs we want to grant access to the Metaflow Metadata Service. Usually this should be your VPN's CIDR blocks. | `list(string)` | `[]` | no | +| [azs](#input\_azs) | A list of availability zone names in the region | `list(string)` | `[]` | no | | [batch\_type](#input\_batch\_type) | AWS Batch Compute Type ('ec2', 'fargate') | `string` | `"ec2"` | no | | [compute\_environment\_desired\_vcpus](#input\_compute\_environment\_desired\_vcpus) | Desired Starting VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | `8` | no | | [compute\_environment\_egress\_cidr\_blocks](#input\_compute\_environment\_egress\_cidr\_blocks) | CIDR blocks to which egress is allowed from the Batch Compute environment's security group | `list(string)` |
[
"0.0.0.0/0"
]
| no | -| [compute\_environment\_instance\_types](#input\_compute\_environment\_instance\_types) | The instance types for the compute environment | `list(string)` |
[
"c4.large",
"c4.xlarge",
"c4.2xlarge",
"c4.4xlarge",
"c4.8xlarge"
]
| no | +| [compute\_environment\_instance\_types](#input\_compute\_environment\_instance\_types) | The instance types for the compute environment | `list(string)` |
[
"c5.large",
"c5.xlarge",
"c5.2xlarge",
"c5.4xlarge",
"c5.9xlarge"
]
| no | | [compute\_environment\_max\_vcpus](#input\_compute\_environment\_max\_vcpus) | Maximum VCPUs for Batch Compute Environment [16-96] | `number` | `64` | no | | [compute\_environment\_min\_vcpus](#input\_compute\_environment\_min\_vcpus) | Minimum VCPUs for Batch Compute Environment [0-16] for EC2 Batch Compute Environment (ignored for Fargate) | `number` | `8` | no | -| [db\_engine\_version](#input\_db\_engine\_version) | n/a | `string` | `"11"` | no | -| [db\_instance\_type](#input\_db\_instance\_type) | RDS instance type to launch for PostgresQL database. | `string` | `"db.t2.small"` | no | +| [create\_datastore](#input\_create\_datastore) | Set to create the datastore components for Metaflow, such as the S3 bucket, Postgres database, etc. This value should be set to true in most cases, except when these components are deployed in Kubernetes or created through other means. | `bool` | `true` | no | +| [create\_eks\_cluster](#input\_create\_eks\_cluster) | Set to create an EKS cluster | `bool` | `false` | no | +| [create\_managed\_compute](#input\_create\_managed\_compute) | Set to create Metaflow compute resources in AWS Batch. This value should be set to false if the compute resources are deployed within a Kubernetes cluster | `bool` | `true` | no | +| [create\_managed\_metaflow\_metadata\_service](#input\_create\_managed\_metaflow\_metadata\_service) | Set to create the Metaflow metadata service as a managed AWS ECS service. This value should be set to false if the metadata service is deployed within a Kubernetes cluster | `bool` | `true` | no | +| [create\_managed\_metaflow\_ui](#input\_create\_managed\_metaflow\_ui) | Set to create the Metaflow UI as a managed AWS ECS service. This value should be set to false if the UI is deployed within a Kubernetes cluster | `bool` | `false` | no | +| [create\_public\_subnets\_only](#input\_create\_public\_subnets\_only) | Set to create a VPC with only public subnets. Using only public subnets helps reduce AWS costs by removing the need to create a NAT gateway. However, it also increases the security risk to your infrastructure, since a misconfigured security group can expose your infrastructure on the public internet. Hence we only recommend setting this for experimental deployments. | `bool` | `false` | no | +| [create\_step\_functions](#input\_create\_step\_functions) | Provisions infrastructure for step functions if enabled | `bool` | `false` | no | +| [create\_vpc](#input\_create\_vpc) | Controls if VPC should be created (it affects almost all resources) | `bool` | `false` | no | +| [database\_endpoint](#input\_database\_endpoint) | Endpoint for the database when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [database\_name](#input\_database\_name) | Name of the database to be used when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [database\_password](#input\_database\_password) | Password for the database when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [database\_username](#input\_database\_username) | Username for the database when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [db\_engine\_version](#input\_db\_engine\_version) | The database engine version for the RDS instances. 
This value is also used to determine whether to create an Aurora RDS cluster or a classic RDS instance. | `string` | `"14"` | no | +| [db\_instance\_type](#input\_db\_instance\_type) | RDS instance type to launch for PostgreSQL database. | `string` | `"db.t3.small"` | no | | [db\_migrate\_lambda\_zip\_file](#input\_db\_migrate\_lambda\_zip\_file) | Output path for the zip file containing the DB migrate lambda | `string` | `null` | no | +| [deploy\_cluster\_autoscaler](#input\_deploy\_cluster\_autoscaler) | Set to deploy the cluster autoscaler | `bool` | `false` | no | +| [deploy\_metaflow\_services\_in\_eks](#input\_deploy\_metaflow\_services\_in\_eks) | Set to deploy the Metaflow metadata service and Metaflow UI via the Helm chart. | `bool` | `false` | no | | [enable\_custom\_batch\_container\_registry](#input\_enable\_custom\_batch\_container\_registry) | Provisions infrastructure for custom Amazon ECR container registry if enabled | `bool` | `false` | no | | [enable\_key\_rotation](#input\_enable\_key\_rotation) | Enable key rotation for KMS keys | `bool` | `false` | no | -| [enable\_step\_functions](#input\_enable\_step\_functions) | Provisions infrastructure for step functions if enabled | `bool` | n/a | yes | +| [existing\_private\_subnet\_ids](#input\_existing\_private\_subnet\_ids) | List of private subnet ids that will be used to create Metaflow components in. If create\_vpc is set to false, either existing\_private\_subnet\_ids, existing\_public\_subnet\_ids, or both need to be set. Setting existing\_private\_subnet\_ids will result in a more secure deployment, since components are not exposed on the public internet. | `list(string)` | `[]` | no | +| [existing\_public\_subnet\_ids](#input\_existing\_public\_subnet\_ids) | List of public subnet ids that will be used to create Metaflow components that you want to expose on the public internet. This may need to be set if create\_vpc is set to false | `list(string)` | `[]` | no | +| [existing\_vpc\_cidr\_blocks](#input\_existing\_vpc\_cidr\_blocks) | The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications. Needs to be set if create\_vpc is set to false | `list(string)` | `[]` | no | +| [existing\_vpc\_id](#input\_existing\_vpc\_id) | The id of the single VPC we stood up for all Metaflow resources to exist in. Needs to be set if create\_vpc is set to false | `string` | `""` | no | | [extra\_ui\_backend\_env\_vars](#input\_extra\_ui\_backend\_env\_vars) | Additional environment variables for UI backend container | `map(string)` | `{}` | no | | [extra\_ui\_static\_env\_vars](#input\_extra\_ui\_static\_env\_vars) | Additional environment variables for UI static app | `map(string)` | `{}` | no | -| [force\_destroy\_s3\_bucket](#input\_force\_destroy\_s3\_bucket) | Empty S3 bucket before destroying via terraform destroy | `bool` | `false` | no | +| [force\_destroy\_s3\_bucket](#input\_force\_destroy\_s3\_bucket) | Empty S3 bucket before destroying via terraform destroy | `bool` | `true` | no | | [iam\_partition](#input\_iam\_partition) | IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is) | `string` | `"aws"` | no | | [launch\_template\_http\_endpoint](#input\_launch\_template\_http\_endpoint) | Whether the metadata service is available. Can be 'enabled' or 'disabled' | `string` | `"enabled"` | no | | [launch\_template\_http\_put\_response\_hop\_limit](#input\_launch\_template\_http\_put\_response\_hop\_limit) | The desired HTTP PUT response hop limit for instance metadata requests. 
Can be an integer from 1 to 64 | `number` | `2` | no | @@ -124,18 +145,23 @@ resource "local_file" "metaflow_config" { | [metadata\_service\_container\_image](#input\_metadata\_service\_container\_image) | Container image for metadata service | `string` | `""` | no | | [metadata\_service\_enable\_api\_basic\_auth](#input\_metadata\_service\_enable\_api\_basic\_auth) | Enable basic auth for API Gateway? (requires key export) | `bool` | `true` | no | | [metadata\_service\_enable\_api\_gateway](#input\_metadata\_service\_enable\_api\_gateway) | Enable API Gateway for public metadata service endpoint | `bool` | `true` | no | -| [resource\_prefix](#input\_resource\_prefix) | string prefix for all resources | `string` | `"metaflow"` | no | +| [metaflow\_helm\_values](#input\_metaflow\_helm\_values) | These are used to override the default values of the Metaflow Helm chart | `any` | `{}` | no | +| [metaflow\_s3\_bucket\_arn](#input\_metaflow\_s3\_bucket\_arn) | ARN of the S3 bucket to be used when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [metaflow\_s3\_bucket\_kms\_key\_arn](#input\_metaflow\_s3\_bucket\_kms\_key\_arn) | ARN of the KMS key used to encrypt the S3 bucket when create\_datastore is set to false. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [metaflow\_s3\_sys\_root](#input\_metaflow\_s3\_sys\_root) | The S3 root prefix in the Metaflow S3 bucket to use. This variable must be set if create\_datastore is set to false. | `string` | `""` | no | +| [metaflow\_ui\_is\_public](#input\_metaflow\_ui\_is\_public) | Set to true if you would like to make the Metaflow UI load balancer publicly accessible | `bool` | `false` | no | +| [node\_group\_defaults](#input\_node\_group\_defaults) | A key value map of EKS node group default configurations that will directly override the inputs to the upstream EKS terraform module. | `any` | `{}` | no | +| [node\_group\_iam\_role\_additional\_policies](#input\_node\_group\_iam\_role\_additional\_policies) | A map of additional IAM policies to attach to the EKS worker nodes. This value directly overrides the input to the upstream EKS terraform module | `map(string)` | `{}` | no | +| [node\_groups](#input\_node\_groups) | A key value map of EKS node group definitions that will directly override the inputs to the upstream EKS terraform module. | `any` | `{}` | no | +| [private\_subnet\_tags](#input\_private\_subnet\_tags) | Additional tags for the private subnets | `map(string)` | `{}` | no | +| [public\_subnet\_tags](#input\_public\_subnet\_tags) | Additional tags for the public subnets | `map(string)` | `{}` | no | +| [resource\_prefix](#input\_resource\_prefix) | string prefix for all resources | `string` | `""` | no | | [resource\_suffix](#input\_resource\_suffix) | string suffix for all resources | `string` | `""` | no | -| [subnet1\_id](#input\_subnet1\_id) | First subnet used for availability zone redundancy | `string` | n/a | yes | -| [subnet2\_id](#input\_subnet2\_id) | Second subnet used for availability zone redundancy | `string` | n/a | yes | | [tags](#input\_tags) | aws tags | `map(string)` | n/a | yes | -| [ui\_alb\_internal](#input\_ui\_alb\_internal) | Defines whether the ALB for the UI is internal | `bool` | `false` | no | | [ui\_allow\_list](#input\_ui\_allow\_list) | List of CIDRs we want to grant access to our Metaflow UI Service. Usually this is our VPN's CIDR blocks. 
| `list(string)` | `[]` | no | -| [ui\_certificate\_arn](#input\_ui\_certificate\_arn) | SSL certificate for UI. If set to empty string, UI is disabled. | `string` | `""` | no | +| [ui\_certificate\_arn](#input\_ui\_certificate\_arn) | SSL certificate for UI. This value must be set if create\_managed\_metaflow\_ui is set to true. | `string` | `""` | no | | [ui\_static\_container\_image](#input\_ui\_static\_container\_image) | Container image for the UI frontend app | `string` | `""` | no | -| [vpc\_cidr\_blocks](#input\_vpc\_cidr\_blocks) | The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications | `list(string)` | n/a | yes | -| [vpc\_id](#input\_vpc\_id) | The id of the single VPC we stood up for all Metaflow resources to exist in. | `string` | n/a | yes | -| [with\_public\_ip](#input\_with\_public\_ip) | Enable public IP assignment for the Metadata Service. If the subnets specified for subnet1\_id and subnet2\_id are public subnets, you will NEED to set this to true to allow pulling container images from public registries. Otherwise this should be set to false. | `bool` | n/a | yes | +| [vpc\_cidr](#input\_vpc\_cidr) | The CIDR block for the VPC | `string` | `"10.0.0.0/16"` | no | ## Outputs @@ -155,8 +181,8 @@ resource "local_file" "metaflow_config" { | [datastore\_s3\_bucket\_kms\_key\_arn](#output\_datastore\_s3\_bucket\_kms\_key\_arn) | The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket | | [metadata\_svc\_ecs\_task\_role\_arn](#output\_metadata\_svc\_ecs\_task\_role\_arn) | n/a | | [metaflow\_api\_gateway\_rest\_api\_id](#output\_metaflow\_api\_gateway\_rest\_api\_id) | The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance | +| [metaflow\_aws\_managed\_profile\_json](#output\_metaflow\_aws\_managed\_profile\_json) | Metaflow profile JSON object that can be used to communicate with this Metaflow Stack. Store this in `~/.metaflow/config_[stack-name]` and select with `$ export METAFLOW_PROFILE=[stack-name]`. | | [metaflow\_batch\_container\_image](#output\_metaflow\_batch\_container\_image) | The ECR repo containing the metaflow batch image | -| [metaflow\_profile\_json](#output\_metaflow\_profile\_json) | Metaflow profile JSON object that can be used to communicate with this Metaflow Stack. Store this in `~/.metaflow/config_[stack-name]` and select with `$ export METAFLOW_PROFILE=[stack-name]`. | | [metaflow\_s3\_bucket\_arn](#output\_metaflow\_s3\_bucket\_arn) | The ARN of the bucket we'll be using as blob storage | | [metaflow\_s3\_bucket\_name](#output\_metaflow\_s3\_bucket\_name) | The name of the bucket we'll be using as blob storage | | [migration\_function\_arn](#output\_migration\_function\_arn) | ARN of DB Migration Function | diff --git a/aws_managed.tf b/aws_managed.tf new file mode 100644 index 0000000..52b6610 --- /dev/null +++ b/aws_managed.tf @@ -0,0 +1,112 @@ +moved { + from = module.metaflow-metadata-service + to = module.metaflow-metadata-service[0] +} + +module "metaflow-metadata-service" { + source = "./modules/metadata-service" + + count = var.create_managed_metaflow_metadata_service ? 
1 : 0 + + resource_prefix = local.resource_prefix + resource_suffix = local.resource_suffix + + access_list_cidr_blocks = var.access_list_cidr_blocks + database_name = local.database_name + database_password = local.database_password + database_username = local.database_username + db_migrate_lambda_zip_file = var.db_migrate_lambda_zip_file + datastore_s3_bucket_kms_key_arn = local.datastore_s3_bucket_kms_key_arn + enable_api_basic_auth = var.metadata_service_enable_api_basic_auth + enable_api_gateway = var.metadata_service_enable_api_gateway + fargate_execution_role_arn = module.metaflow-computation[0].ecs_execution_role_arn + iam_partition = var.iam_partition + metadata_service_container_image = local.metadata_service_container_image + metaflow_vpc_id = local.vpc_id + rds_master_instance_endpoint = local.rds_master_instance_endpoint + s3_bucket_arn = local.s3_bucket_arn + subnet_ids = local.subnet_ids + vpc_cidr_blocks = local.vpc_cidr_block + with_public_ip = local.with_public_ip + + standard_tags = var.tags +} + +module "metaflow-ui" { + source = "./modules/ui" + count = var.create_managed_metaflow_ui ? 1 : 0 + + resource_prefix = local.resource_prefix + resource_suffix = local.resource_suffix + + database_name = local.database_name + database_password = local.database_password + database_username = local.database_username + datastore_s3_bucket_kms_key_arn = local.datastore_s3_bucket_kms_key_arn + fargate_execution_role_arn = module.metaflow-computation[0].ecs_execution_role_arn + iam_partition = var.iam_partition + metaflow_vpc_id = local.vpc_id + rds_master_instance_endpoint = local.rds_master_instance_endpoint + s3_bucket_arn = local.s3_bucket_arn + subnet_ids = local.subnet_ids + alb_subnet_ids = local.alb_subnet_ids + ui_backend_container_image = local.metadata_service_container_image + ui_static_container_image = var.ui_static_container_image + alb_internal = !var.metaflow_ui_is_public + ui_allow_list = var.ui_allow_list + + METAFLOW_DATASTORE_SYSROOT_S3 = local.METAFLOW_DATASTORE_SYSROOT_S3 + certificate_arn = var.ui_certificate_arn + metadata_service_security_group_id = module.metaflow-metadata-service[0].metadata_service_security_group_id + + extra_ui_static_env_vars = var.extra_ui_static_env_vars + extra_ui_backend_env_vars = var.extra_ui_backend_env_vars + standard_tags = var.tags +} + +moved { + from = module.metaflow-computation + to = module.metaflow-computation[0] +} + +module "metaflow-computation" { + source = "./modules/computation" + count = var.create_managed_compute ? 
1 : 0 + + resource_prefix = local.resource_prefix + resource_suffix = local.resource_suffix + + batch_type = var.batch_type + compute_environment_desired_vcpus = var.compute_environment_desired_vcpus + compute_environment_instance_types = var.compute_environment_instance_types + compute_environment_max_vcpus = var.compute_environment_max_vcpus + compute_environment_min_vcpus = var.compute_environment_min_vcpus + compute_environment_egress_cidr_blocks = var.compute_environment_egress_cidr_blocks + iam_partition = var.iam_partition + metaflow_vpc_id = local.vpc_id + subnet_ids = local.subnet_ids + launch_template_http_endpoint = var.launch_template_http_endpoint + launch_template_http_tokens = var.launch_template_http_tokens + launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit + + standard_tags = var.tags +} + +moved { + from = module.metaflow-step-functions + to = module.metaflow-step-functions[0] +} + +module "metaflow-step-functions" { + source = "./modules/step-functions" + count = var.create_step_functions ? 1 : 0 + + resource_prefix = local.resource_prefix + resource_suffix = local.resource_suffix + batch_job_queue_arn = module.metaflow-computation[0].METAFLOW_BATCH_JOB_QUEUE + iam_partition = var.iam_partition + s3_bucket_arn = module.metaflow-datastore[0].s3_bucket_arn + s3_bucket_kms_arn = module.metaflow-datastore[0].datastore_s3_bucket_kms_key_arn + + standard_tags = var.tags +} diff --git a/data.tf b/data.tf deleted file mode 100644 index eb58f21..0000000 --- a/data.tf +++ /dev/null @@ -1,3 +0,0 @@ -data "aws_region" "current" {} - -data "aws_caller_identity" "current" {} diff --git a/ecr.tf b/ecr.tf deleted file mode 100644 index 6b9415c..0000000 --- a/ecr.tf +++ /dev/null @@ -1,7 +0,0 @@ -resource "aws_ecr_repository" "metaflow_batch_image" { - count = var.enable_custom_batch_container_registry ? 1 : 0 - - name = local.metaflow_batch_image_name - - tags = var.tags -} diff --git a/eks.tf b/eks.tf new file mode 100644 index 0000000..1022332 --- /dev/null +++ b/eks.tf @@ -0,0 +1,168 @@ +module "eks" { + source = "terraform-aws-modules/eks/aws" + version = "20.31.6" + count = var.create_eks_cluster ? 1 : 0 + + cluster_version = "1.31" # Specify the desired EKS version + cluster_name = local.eks_name + vpc_id = local.vpc_id + subnet_ids = local.subnet_ids + enable_irsa = true + eks_managed_node_group_defaults = merge({ + ami_type = "AL2023_x86_64_STANDARD" + disk_size = 50 + }, var.node_group_defaults) + + eks_managed_node_groups = merge({ + metaflow_default = { + desired_size = 2 + max_size = 2 + min_size = 1 + instance_types = ["m5.large"] + } }, var.node_groups) + + + cluster_endpoint_public_access = true + cluster_endpoint_private_access = true + + iam_role_additional_policies = length(var.node_group_iam_role_additional_policies) > 0 ? var.node_group_iam_role_additional_policies : { + "default_node" = aws_iam_policy.default_node[0].arn, + "autoscaler" = aws_iam_policy.cluster_autoscaler[0].arn, + # Allow SSM access to the machines in case direct access is needed + "ssm" = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore", + } + + tags = var.tags +} + +resource "aws_iam_policy" "default_node" { + count = var.create_eks_cluster && length(var.node_group_iam_role_additional_policies) == 0 ? 
1 : 0 + + name_prefix = "${local.resource_prefix}-default-node-policy${local.resource_suffix}" + description = "Default policy for cluster ${local.resource_prefix}-eks${local.resource_suffix}" + policy = data.aws_iam_policy_document.default_node.json +} + +data "aws_iam_policy_document" "default_node" { + statement { + sid = "S3" + effect = "Allow" + + actions = [ + "s3:*", + "kms:*", + ] + + resources = ["*"] + } +} + +data "aws_iam_role" "current_role" { + name = element(split("/", data.aws_caller_identity.current.arn), 1) +} + +resource "aws_eks_access_entry" "provider_cluster_admin" { + count = var.create_eks_cluster ? 1 : 0 + + cluster_name = module.eks[0].cluster_name + principal_arn = data.aws_iam_role.current_role.arn + type = "STANDARD" +} + +resource "aws_eks_access_policy_association" "provider_cluster_admin" { + count = var.create_eks_cluster ? 1 : 0 + + depends_on = [aws_eks_access_entry.provider_cluster_admin] + cluster_name = module.eks[0].cluster_name + policy_arn = "arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy" + principal_arn = data.aws_iam_role.current_role.arn + + access_scope { + type = "cluster" + } +} + +resource "aws_iam_policy" "cluster_autoscaler" { + count = var.create_eks_cluster && length(var.node_group_iam_role_additional_policies) == 0 ? 1 : 0 + + name_prefix = "${local.resource_prefix}-cluster-autoscaler${local.resource_suffix}" + description = "EKS cluster-autoscaler policy for cluster ${local.eks_name}" + policy = data.aws_iam_policy_document.cluster_autoscaler[0].json +} + +data "aws_iam_policy_document" "cluster_autoscaler" { + count = var.create_eks_cluster ? 1 : 0 + statement { + sid = "clusterAutoscalerAll" + effect = "Allow" + + actions = [ + "autoscaling:DescribeAutoScalingGroups", + "autoscaling:DescribeAutoScalingInstances", + "autoscaling:DescribeLaunchConfigurations", + "autoscaling:DescribeTags", + "ec2:DescribeLaunchTemplateVersions", + ] + + resources = ["*"] + } + + statement { + sid = "clusterAutoscalerOwn" + effect = "Allow" + + actions = [ + "autoscaling:SetDesiredCapacity", + "autoscaling:TerminateInstanceInAutoScalingGroup", + "autoscaling:UpdateAutoScalingGroup", + ] + + resources = ["*"] + + condition { + test = "StringEquals" + variable = "autoscaling:ResourceTag/kubernetes.io/cluster/${local.eks_name}" + values = ["owned"] + } + + condition { + test = "StringEquals" + variable = "autoscaling:ResourceTag/k8s.io/cluster-autoscaler/enabled" + values = ["true"] + } + } +} + +data "aws_eks_cluster" "cluster" { + count = var.create_eks_cluster ? 1 : 0 + name = module.eks[0].cluster_name +} + +data "aws_eks_cluster_auth" "cluster" { + count = var.create_eks_cluster ? 1 : 0 + name = module.eks[0].cluster_name +} + +module "metaflow_helm" { + source = "./modules/services" + + kubernetes_cluster_host = var.create_eks_cluster ? data.aws_eks_cluster.cluster[0].endpoint : "" + kubernetes_cluster_ca_certificate = var.create_eks_cluster ? data.aws_eks_cluster.cluster[0].certificate_authority.0.data : "" + kubernetes_token = var.create_eks_cluster ? data.aws_eks_cluster_auth.cluster[0].token : "" + + resource_name_prefix = local.resource_prefix + deploy_metaflow_service = var.deploy_metaflow_services_in_eks + metaflow_helm_values = var.metaflow_helm_values + cluster_name = var.create_eks_cluster ? module.eks[0].cluster_name : "" + region = data.aws_region.current.name + deploy_cluster_autoscaler = var.deploy_cluster_autoscaler + cluster_oidc_provider = var.create_eks_cluster ? 
module.eks[0].oidc_provider : "" + account_id = data.aws_caller_identity.current.account_id + + metaflow_database = { + database_name = local.database_name + host = element(split(":", local.rds_master_instance_endpoint), 0) + user = local.database_username + password = local.database_password + } +} diff --git a/examples/basic-aws-managed/main.tf b/examples/basic-aws-managed/main.tf new file mode 100644 index 0000000..3ea35a1 --- /dev/null +++ b/examples/basic-aws-managed/main.tf @@ -0,0 +1,34 @@ +############################################################################### +# An example using this module to set up a minimal deployment of Metaflow +# with AWS Batch support, without the UI. +############################################################################### + +terraform { + required_version = ">= 1.10" + + required_providers { + aws = ">= 5.82" + random = ">= 3.6" + } +} + +provider "aws" { + region = "us-west-2" # make sure to set the region to the one you want to deploy to +} + + +module "metaflow" { + source = "../../" + + create_vpc = true + + tags = { + "managedBy" = "terraform" + } +} + +# The module will generate a Metaflow config in JSON format and write it to a file +resource "local_file" "metaflow_config" { + content = module.metaflow.metaflow_aws_managed_profile_json + filename = "./metaflow_profile.json" +} diff --git a/examples/basic-eks/main.tf b/examples/basic-eks/main.tf new file mode 100644 index 0000000..f799ff9 --- /dev/null +++ b/examples/basic-eks/main.tf @@ -0,0 +1,40 @@ +############################################################################### +# An example using this module to set up a minimal deployment of Metaflow +# on an EKS cluster using Helm charts +############################################################################### + +terraform { + required_version = ">= 1.10" + + required_providers { + aws = ">= 5.82" + random = ">= 3.6" + } +} + +provider "aws" { + region = "us-west-2" # make sure to set the region to the one you want to deploy to +} + + +module "metaflow" { + source = "../../" + + create_vpc = true + create_eks_cluster = true + deploy_cluster_autoscaler = true + deploy_metaflow_services_in_eks = true + + create_managed_compute = false + create_managed_metaflow_metadata_service = false + + tags = { + "managedBy" = "terraform" + } +} + +# The module will generate a Metaflow config in JSON format and write it to a file
resource "local_file" "metaflow_config" { + content = module.metaflow.metaflow_aws_managed_profile_json + filename = "./metaflow_profile.json" +} diff --git a/examples/minimal/minimal_example.tf b/examples/minimal/minimal_example.tf deleted file mode 100644 index 22ef4f6..0000000 --- a/examples/minimal/minimal_example.tf +++ /dev/null @@ -1,67 +0,0 @@ -############################################################################### -# An example using this module to set up a minimal deployment Metaflow -# with AWS Batch support, without the UI. 
-############################################################################### - -# Random suffix for this deployment -resource "random_string" "suffix" { - length = 8 - special = false - upper = false -} - -locals { - resource_prefix = "metaflow" - resource_suffix = random_string.suffix.result -} - -data "aws_availability_zones" "available" { -} - -# VPC infra using https://github.com/terraform-aws-modules/terraform-aws-vpc -module "vpc" { - source = "terraform-aws-modules/vpc/aws" - version = "3.13.0" - - name = "${local.resource_prefix}-${local.resource_suffix}" - cidr = "10.10.0.0/16" - - azs = data.aws_availability_zones.available.names - private_subnets = ["10.10.8.0/21", "10.10.16.0/21", "10.10.24.0/21"] - public_subnets = ["10.10.128.0/21", "10.10.136.0/21", "10.10.144.0/21"] - - enable_nat_gateway = true - single_nat_gateway = true - enable_dns_hostnames = true -} - - -module "metaflow" { - source = "outerbounds/metaflow/aws" - version = "0.9.0" - - resource_prefix = local.resource_prefix - resource_suffix = local.resource_suffix - - enable_step_functions = false - subnet1_id = module.vpc.public_subnets[0] - subnet2_id = module.vpc.public_subnets[1] - vpc_cidr_blocks = [module.vpc.vpc_cidr_block] - vpc_id = module.vpc.vpc_id - with_public_ip = true - - tags = { - "managedBy" = "terraform" - } -} - -# export all outputs from metaflow modules -output "metaflow" { - value = module.metaflow -} - -# The module will generate a Metaflow config in JSON format, write it to a file -resource "local_file" "metaflow_config" { - content = module.metaflow.metaflow_profile_json - filename = "./metaflow_profile.json" -} diff --git a/examples/minimal/versions.tf b/examples/minimal/versions.tf deleted file mode 100644 index d1c6f88..0000000 --- a/examples/minimal/versions.tf +++ /dev/null @@ -1,8 +0,0 @@ -terraform { - required_version = ">= 0.13.1" - - required_providers { - aws = ">= 3.54.0" - random = ">= 2.1" - } -} diff --git a/iam.tf b/iam.tf index 9ee5deb..d93d3a8 100644 --- a/iam.tf +++ b/iam.tf @@ -35,7 +35,7 @@ data "aws_iam_policy_document" "custom_s3_list_batch" { effect = "Allow" resources = [ - module.metaflow-datastore.s3_bucket_arn + local.s3_bucket_arn ] } } @@ -52,7 +52,7 @@ data "aws_iam_policy_document" "custom_s3_batch" { effect = "Allow" resources = [ - "${module.metaflow-datastore.s3_bucket_arn}/*" + "${local.s3_bucket_arn}/*" ] } } @@ -68,7 +68,7 @@ data "aws_iam_policy_document" "s3_kms" { ] resources = [ - module.metaflow-datastore.datastore_s3_bucket_kms_key_arn + local.datastore_s3_bucket_kms_key_arn ] } } @@ -168,6 +168,7 @@ data "aws_iam_policy_document" "iam_pass_role" { } data "aws_iam_policy_document" "dynamodb" { + count = var.create_step_functions ? 1 : 0 statement { sid = "Items" actions = [ @@ -179,7 +180,7 @@ data "aws_iam_policy_document" "dynamodb" { effect = "Allow" resources = [ - module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_arn + module.metaflow-step-functions[0].metaflow_step_functions_dynamodb_table_arn ] } } @@ -237,10 +238,10 @@ resource "aws_iam_role_policy" "grant_iam_pass_role" { } resource "aws_iam_role_policy" "grant_dynamodb" { - count = var.enable_step_functions ? 1 : 0 + count = var.create_step_functions ? 
1 : 0 name = "dynamodb" role = aws_iam_role.batch_s3_task_role.name - policy = data.aws_iam_policy_document.dynamodb.json + policy = data.aws_iam_policy_document.dynamodb[0].json } resource "aws_iam_role_policy" "grant_cloudwatch" { diff --git a/locals.tf b/locals.tf index 34268e3..a72eea7 100644 --- a/locals.tf +++ b/locals.tf @@ -2,23 +2,48 @@ module "metaflow-common" { source = "./modules/common" } +resource "random_string" "alphanumeric" { + count = var.resource_prefix == "" ? 1 : 0 + length = 5 + special = false + upper = false +} + locals { - resource_prefix = length(var.resource_prefix) > 0 ? "${var.resource_prefix}-" : "" - resource_suffix = length(var.resource_suffix) > 0 ? "-${var.resource_suffix}" : "" + resource_prefix = var.resource_prefix == "" ? "metaflow-${random_string.alphanumeric[0].result}" : var.resource_prefix + resource_suffix = var.resource_suffix != "" ? "-${var.resource_suffix}" : "" - aws_region = data.aws_region.current.name - aws_account_id = data.aws_caller_identity.current.account_id + # VPC related locals + vpc_id = var.create_vpc ? module.vpc[0].vpc_id : var.existing_vpc_id + azs = length(var.azs) > 0 ? var.azs : slice(data.aws_availability_zones.available.names, 0, 3) + private_subnet_ids = var.create_vpc && !var.create_public_subnets_only ? module.vpc[0].private_subnets : var.existing_private_subnet_ids + public_subnet_ids = var.create_vpc ? module.vpc[0].public_subnets : var.existing_public_subnet_ids + vpc_cidr_block = var.create_vpc ? [module.vpc[0].vpc_cidr_block] : var.existing_vpc_cidr_blocks + subnet_ids = length(local.private_subnet_ids) > 0 ? local.private_subnet_ids : local.public_subnet_ids + with_public_ip = length(local.private_subnet_ids) == 0 + alb_subnet_ids = var.metaflow_ui_is_public ? local.public_subnet_ids : local.subnet_ids - batch_s3_task_role_name = "${local.resource_prefix}batch_s3_task_role${local.resource_suffix}" - metaflow_batch_image_name = "${local.resource_prefix}batch${local.resource_suffix}" metadata_service_container_image = ( var.metadata_service_container_image == "" ? module.metaflow-common.default_metadata_service_container_image : var.metadata_service_container_image ) - ui_static_container_image = ( - var.ui_static_container_image == "" ? - module.metaflow-common.default_ui_static_container_image : - var.ui_static_container_image - ) + + aws_region = data.aws_region.current.name + aws_account_id = data.aws_caller_identity.current.account_id + + batch_s3_task_role_name = "${local.resource_prefix}batch_s3_task_role${local.resource_suffix}" + metaflow_batch_image_name = "${local.resource_prefix}batch${local.resource_suffix}" + eks_name = "${local.resource_prefix}-eks${local.resource_suffix}" + + database_name = var.create_datastore ? module.metaflow-datastore[0].database_name : var.database_name + database_password = var.create_datastore ? module.metaflow-datastore[0].database_password : var.database_password + database_username = var.create_datastore ? module.metaflow-datastore[0].database_username : var.database_username + rds_master_instance_endpoint = var.create_datastore ? module.metaflow-datastore[0].rds_master_instance_endpoint : var.database_endpoint + datastore_s3_bucket_kms_key_arn = var.create_datastore ? module.metaflow-datastore[0].datastore_s3_bucket_kms_key_arn : var.metaflow_s3_bucket_kms_key_arn + s3_bucket_arn = var.create_datastore ? module.metaflow-datastore[0].s3_bucket_arn : var.metaflow_s3_bucket_arn + METAFLOW_DATASTORE_SYSROOT_S3 = var.create_datastore ? 
module.metaflow-datastore[0].METAFLOW_DATASTORE_SYSROOT_S3 : var.metaflow_s3_sys_root + + + sgs_access_to_rds = var.create_managed_metaflow_metadata_service ? [module.metaflow-metadata-service[0].metadata_service_security_group_id] : [] } diff --git a/main.tf b/main.tf index 9b2aaee..a02797a 100644 --- a/main.tf +++ b/main.tf @@ -1,5 +1,50 @@ +data "aws_availability_zones" "available" {} +data "aws_region" "current" {} +data "aws_caller_identity" "current" {} + +module "vpc" { + source = "terraform-aws-modules/vpc/aws" + version = "~> 5.0" + count = var.create_vpc ? 1 : 0 + + name = "${local.resource_prefix}-vpc${local.resource_suffix}" + cidr = var.vpc_cidr + + azs = local.azs + private_subnets = var.create_public_subnets_only ? [] : [for i, az in local.azs : cidrsubnet(var.vpc_cidr, 4, i)] + public_subnets = [for i, az in local.azs : cidrsubnet(var.vpc_cidr, 4, i + 3)] + + enable_nat_gateway = !var.create_public_subnets_only + single_nat_gateway = !var.create_public_subnets_only + enable_dns_hostnames = true + enable_dns_support = true + + # Use custom tags for subnets + private_subnet_tags = merge( + var.private_subnet_tags, + { + "kubernetes.io/role/internal-elb" = "1" + }, + ) + + public_subnet_tags = merge( + var.public_subnet_tags, + { + "kubernetes.io/role/elb" = "1" + }, + ) + + tags = var.tags +} + +moved { + from = module.metaflow-datastore + to = module.metaflow-datastore[0] +} + module "metaflow-datastore" { source = "./modules/datastore" + count = var.create_datastore ? 1 : 0 force_destroy_s3_bucket = var.force_destroy_s3_bucket enable_key_rotation = var.enable_key_rotation @@ -7,10 +52,10 @@ module "metaflow-datastore" { resource_prefix = local.resource_prefix resource_suffix = local.resource_suffix - metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id - metaflow_vpc_id = var.vpc_id - subnet1_id = var.subnet1_id - subnet2_id = var.subnet2_id + allowed_security_group_ids = local.sgs_access_to_rds + metaflow_vpc_id = local.vpc_id + subnet_ids = local.private_subnet_ids + vpc_cidr_blocks = local.vpc_cidr_block db_instance_type = var.db_instance_type db_engine_version = var.db_engine_version @@ -18,100 +63,10 @@ module "metaflow-datastore" { standard_tags = var.tags } -module "metaflow-metadata-service" { - source = "./modules/metadata-service" - - resource_prefix = local.resource_prefix - resource_suffix = local.resource_suffix - - access_list_cidr_blocks = var.access_list_cidr_blocks - database_name = module.metaflow-datastore.database_name - database_password = module.metaflow-datastore.database_password - database_username = module.metaflow-datastore.database_username - db_migrate_lambda_zip_file = var.db_migrate_lambda_zip_file - datastore_s3_bucket_kms_key_arn = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn - enable_api_basic_auth = var.metadata_service_enable_api_basic_auth - enable_api_gateway = var.metadata_service_enable_api_gateway - fargate_execution_role_arn = module.metaflow-computation.ecs_execution_role_arn - iam_partition = var.iam_partition - metadata_service_container_image = local.metadata_service_container_image - metaflow_vpc_id = var.vpc_id - rds_master_instance_endpoint = module.metaflow-datastore.rds_master_instance_endpoint - s3_bucket_arn = module.metaflow-datastore.s3_bucket_arn - subnet1_id = var.subnet1_id - subnet2_id = var.subnet2_id - vpc_cidr_blocks = var.vpc_cidr_blocks - with_public_ip = var.with_public_ip - - standard_tags = var.tags -} - -module "metaflow-ui" { - source = 
"./modules/ui" - count = var.ui_certificate_arn == "" ? 0 : 1 - - resource_prefix = local.resource_prefix - resource_suffix = local.resource_suffix - - database_name = module.metaflow-datastore.database_name - database_password = module.metaflow-datastore.database_password - database_username = module.metaflow-datastore.database_username - datastore_s3_bucket_kms_key_arn = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn - fargate_execution_role_arn = module.metaflow-computation.ecs_execution_role_arn - iam_partition = var.iam_partition - metaflow_vpc_id = var.vpc_id - rds_master_instance_endpoint = module.metaflow-datastore.rds_master_instance_endpoint - s3_bucket_arn = module.metaflow-datastore.s3_bucket_arn - subnet1_id = var.subnet1_id - subnet2_id = var.subnet2_id - ui_backend_container_image = local.metadata_service_container_image - ui_static_container_image = local.ui_static_container_image - alb_internal = var.ui_alb_internal - ui_allow_list = var.ui_allow_list - - METAFLOW_DATASTORE_SYSROOT_S3 = module.metaflow-datastore.METAFLOW_DATASTORE_SYSROOT_S3 - certificate_arn = var.ui_certificate_arn - metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id - - extra_ui_static_env_vars = var.extra_ui_static_env_vars - extra_ui_backend_env_vars = var.extra_ui_backend_env_vars - standard_tags = var.tags -} - -module "metaflow-computation" { - source = "./modules/computation" - - resource_prefix = local.resource_prefix - resource_suffix = local.resource_suffix - - batch_type = var.batch_type - compute_environment_desired_vcpus = var.compute_environment_desired_vcpus - compute_environment_instance_types = var.compute_environment_instance_types - compute_environment_max_vcpus = var.compute_environment_max_vcpus - compute_environment_min_vcpus = var.compute_environment_min_vcpus - compute_environment_egress_cidr_blocks = var.compute_environment_egress_cidr_blocks - iam_partition = var.iam_partition - metaflow_vpc_id = var.vpc_id - subnet1_id = var.subnet1_id - subnet2_id = var.subnet2_id - launch_template_http_endpoint = var.launch_template_http_endpoint - launch_template_http_tokens = var.launch_template_http_tokens - launch_template_http_put_response_hop_limit = var.launch_template_http_put_response_hop_limit - - standard_tags = var.tags -} - -module "metaflow-step-functions" { - source = "./modules/step-functions" +resource "aws_ecr_repository" "metaflow_batch_image" { + count = var.enable_custom_batch_container_registry ? 
1 : 0 - resource_prefix = local.resource_prefix - resource_suffix = local.resource_suffix - - active = var.enable_step_functions - batch_job_queue_arn = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE - iam_partition = var.iam_partition - s3_bucket_arn = module.metaflow-datastore.s3_bucket_arn - s3_bucket_kms_arn = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn + name = local.metaflow_batch_image_name - standard_tags = var.tags + tags = var.tags } diff --git a/modules/common/locals.tf b/modules/common/locals.tf index 142475e..50544c8 100644 --- a/modules/common/locals.tf +++ b/modules/common/locals.tf @@ -1,4 +1,4 @@ locals { - default_metadata_service_container_image = "netflixoss/metaflow_metadata_service:v2.3.0" - default_ui_static_container_image = "public.ecr.aws/outerbounds/metaflow_ui:v1.1.2" + default_metadata_service_container_image = "netflixoss/metaflow_metadata_service:v2.4.13" + default_ui_static_container_image = "public.ecr.aws/outerbounds/metaflow_ui:v1.13.13" } diff --git a/modules/computation/README.md b/modules/computation/README.md index 719b878..9925889 100644 --- a/modules/computation/README.md +++ b/modules/computation/README.md @@ -29,8 +29,7 @@ To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/ | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. | `map(string)` | n/a | yes | -| [subnet1\_id](#input\_subnet1\_id) | The first private subnet used for redundancy | `string` | n/a | yes | -| [subnet2\_id](#input\_subnet2\_id) | The second private subnet used for redundancy | `string` | n/a | yes | +| [subnet\_ids](#input\_subnet\_ids) | A list of private subnets that will be used to create compute instances | `list(string)` | n/a | yes | ## Outputs diff --git a/modules/computation/batch.tf b/modules/computation/batch.tf index 655e4a6..6735adf 100644 --- a/modules/computation/batch.tf +++ b/modules/computation/batch.tf @@ -53,10 +53,7 @@ resource "aws_batch_compute_environment" "this" { ], var.compute_environment_additional_security_group_ids) # Which subnet to launch the instances into. - subnets = [ - var.subnet1_id, - var.subnet2_id - ] + subnets = var.subnet_ids # Type of instance Amazon EC2 for on-demand. Can use "SPOT" to use unused instances at discount if available type = local.enable_fargate_on_batch ? "FARGATE" : "EC2" @@ -79,9 +76,10 @@ resource "aws_batch_job_queue" "this" { name = local.batch_queue_name state = "ENABLED" priority = 1 - compute_environments = [ - aws_batch_compute_environment.this.arn - ] + compute_environment_order { + compute_environment = aws_batch_compute_environment.this.arn + order = 1 + } tags = var.standard_tags } diff --git a/modules/computation/variables.tf b/modules/computation/variables.tf index 92d11ba..2090f9e 100644 --- a/modules/computation/variables.tf +++ b/modules/computation/variables.tf @@ -68,14 +68,9 @@ variable "standard_tags" { description = "The standard tags to apply to every AWS resource." 
} -variable "subnet1_id" { - type = string - description = "The first private subnet used for redundancy" -} - -variable "subnet2_id" { - type = string - description = "The second private subnet used for redundancy" +variable "subnet_ids" { + type = list(string) + description = "A list of private subnets that will be used to create compute instances" } variable "launch_template_http_endpoint" { diff --git a/modules/computation/versions.tf b/modules/computation/versions.tf index bcfbadf..d4b11d8 100644 --- a/modules/computation/versions.tf +++ b/modules/computation/versions.tf @@ -2,8 +2,8 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 3.38.0" + version = ">= 5.82" } } - required_version = ">= 0.13" + required_version = ">= 1.10" } diff --git a/modules/datastore/README.md b/modules/datastore/README.md index 6e30f23..59b15ca 100644 --- a/modules/datastore/README.md +++ b/modules/datastore/README.md @@ -19,6 +19,7 @@ To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/ | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| +| [allowed\_security\_group\_ids](#input\_allowed\_security\_group\_ids) | A list of security group ids that have access to the RDS instance | `list(string)` | `[]` | no | | [db\_engine](#input\_db\_engine) | n/a | `string` | `"postgres"` | no | | [db\_engine\_version](#input\_db\_engine\_version) | n/a | `string` | `"11"` | no | | [db\_instance\_type](#input\_db\_instance\_type) | RDS instance type to launch for PostgresQL database. | `string` | `"db.t3.small"` | no | @@ -26,13 +27,13 @@ To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/ | [db\_username](#input\_db\_username) | PostgresQL username; defaults to 'metaflow' | `string` | `"metaflow"` | no | | [enable\_key\_rotation](#input\_enable\_key\_rotation) | Enable key rotation for KMS keys | `bool` | `false` | no | | [force\_destroy\_s3\_bucket](#input\_force\_destroy\_s3\_bucket) | Empty S3 bucket before destroying via terraform destroy | `bool` | `false` | no | -| [metadata\_service\_security\_group\_id](#input\_metadata\_service\_security\_group\_id) | The security group ID used by the MetaData service. We'll grant this access to our DB. | `string` | n/a | yes | +| [metadata\_service\_security\_group\_id](#input\_metadata\_service\_security\_group\_id) | DEPRECATED: The security group ID used by the MetaData service. We'll grant this access to our DB. | `string` | `""` | no | | [metaflow\_vpc\_id](#input\_metaflow\_vpc\_id) | ID of the Metaflow VPC this SageMaker notebook instance is to be deployed in | `string` | n/a | yes | | [resource\_prefix](#input\_resource\_prefix) | Prefix given to all AWS resources to differentiate between applications | `string` | n/a | yes | | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. 
| `map(string)` | n/a | yes | -| [subnet1\_id](#input\_subnet1\_id) | First subnet used for availability zone redundancy | `string` | n/a | yes | -| [subnet2\_id](#input\_subnet2\_id) | Second subnet used for availability zone redundancy | `string` | n/a | yes | +| [subnet\_ids](#input\_subnet\_ids) | A list of subnets to use for creating database instances | `list(string)` | n/a | yes | +| [vpc\_cidr\_blocks](#input\_vpc\_cidr\_blocks) | Current CIDR blocks for the VPC | `list(string)` | n/a | yes | ## Outputs diff --git a/modules/datastore/locals.tf b/modules/datastore/locals.tf index 0e45b83..2329a46 100644 --- a/modules/datastore/locals.tf +++ b/modules/datastore/locals.tf @@ -7,4 +7,7 @@ locals { # Name of S3 bucket s3_bucket_name = "${var.resource_prefix}s3${var.resource_suffix}" + + # Access to RDS instance + allowed_security_group_ids = var.metadata_service_security_group_id != "" ? concat([var.metadata_service_security_group_id], var.allowed_security_group_ids) : var.allowed_security_group_ids } diff --git a/modules/datastore/rds.tf b/modules/datastore/rds.tf index cddfa76..0d5b54f 100644 --- a/modules/datastore/rds.tf +++ b/modules/datastore/rds.tf @@ -5,7 +5,7 @@ */ resource "aws_db_subnet_group" "this" { name = local.pg_subnet_group_name - subnet_ids = [var.subnet1_id, var.subnet2_id] + subnet_ids = var.subnet_ids tags = merge( var.standard_tags, @@ -24,11 +24,21 @@ resource "aws_security_group" "rds_security_group" { vpc_id = var.metaflow_vpc_id # ingress only from port 5432 + dynamic "ingress" { + for_each = length(local.allowed_security_group_ids) > 0 ? { enabled = true } : {} + content { + from_port = 5432 + to_port = 5432 + protocol = "tcp" + security_groups = local.allowed_security_group_ids + } + } + ingress { - from_port = 5432 - to_port = 5432 - protocol = "tcp" - security_groups = [var.metadata_service_security_group_id] + from_port = 5432 + to_port = 5432 + protocol = "tcp" + cidr_blocks = var.vpc_cidr_blocks } # egress to anywhere diff --git a/modules/datastore/s3.tf b/modules/datastore/s3.tf index b5ba180..e94fc23 100644 --- a/modules/datastore/s3.tf +++ b/modules/datastore/s3.tf @@ -1,15 +1,6 @@ resource "aws_s3_bucket" "this" { bucket = local.s3_bucket_name - acl = "private" force_destroy = var.force_destroy_s3_bucket - server_side_encryption_configuration { - rule { - apply_server_side_encryption_by_default { - kms_master_key_id = aws_kms_key.s3.arn - sse_algorithm = "aws:kms" - } - } - } tags = merge( var.standard_tags, @@ -19,6 +10,24 @@ ) } +resource "aws_s3_bucket_server_side_encryption_configuration" "this" { + bucket = aws_s3_bucket.this.id + + rule { + apply_server_side_encryption_by_default { + kms_master_key_id = aws_kms_key.s3.arn + sse_algorithm = "aws:kms" + } + } +} + +resource "aws_s3_bucket_ownership_controls" "metaflow-datastore" { + bucket = aws_s3_bucket.this.id + rule { + object_ownership = "BucketOwnerEnforced" + } +} + resource "aws_s3_bucket_public_access_block" "this" { bucket = aws_s3_bucket.this.id diff --git a/modules/datastore/variables.tf b/modules/datastore/variables.tf index e294391..8449442 100644 --- a/modules/datastore/variables.tf +++ b/modules/datastore/variables.tf @@ -33,7 +33,19 @@ variable "db_username" { variable "metadata_service_security_group_id" { type = string - description = "The security group ID used by the MetaData service. We'll grant this access to our DB." + description = "DEPRECATED: The security group ID used by the MetaData service. We'll grant this access to our DB." 
+ default = "" +} + +variable "allowed_security_group_ids" { + description = "A list of security group ids that have access to the RDS instance" + type = list(string) + default = [] +} + +variable "vpc_cidr_blocks" { + type = list(string) + description = "Current CIDR blocks for the VPC" } variable "metaflow_vpc_id" { @@ -56,14 +68,9 @@ variable "standard_tags" { description = "The standard tags to apply to every AWS resource." } -variable "subnet1_id" { - type = string - description = "First subnet used for availability zone redundancy" -} - -variable "subnet2_id" { - type = string - description = "Second subnet used for availability zone redundancy" +variable "subnet_ids" { + type = list(string) + description = "A list of subnets to use for creating database instances" } variable "enable_key_rotation" { diff --git a/modules/datastore/versions.tf b/modules/datastore/versions.tf index 5c8ca44..4440216 100644 --- a/modules/datastore/versions.tf +++ b/modules/datastore/versions.tf @@ -2,11 +2,11 @@ terraform { required_providers { aws = { source = "hashicorp/aws" - version = ">= 3.38.0, != 4.8.0, != 4.7.0, != 4.6.0, != 4.5.0, != 4.4.0, != 4.3.0, != 4.2.0, != 4.1.0, != 4.0.0" + version = ">= 5.82" } random = { source = "hashicorp/random" } } - required_version = ">= 0.13" + required_version = ">= 1.10" } diff --git a/modules/metadata-service/README.md b/modules/metadata-service/README.md index cbed1ef..92b5e99 100644 --- a/modules/metadata-service/README.md +++ b/modules/metadata-service/README.md @@ -35,10 +35,9 @@ If the `access_list_cidr_blocks` variable is set, only traffic originating from | [resource\_suffix](#input\_resource\_suffix) | Suffix given to all AWS resources to differentiate between environment and workspace | `string` | n/a | yes | | [s3\_bucket\_arn](#input\_s3\_bucket\_arn) | The ARN of the bucket we'll be using as blob storage | `string` | n/a | yes | | [standard\_tags](#input\_standard\_tags) | The standard tags to apply to every AWS resource. | `map(string)` | n/a | yes | -| [subnet1\_id](#input\_subnet1\_id) | First private subnet used for availability zone redundancy | `string` | n/a | yes | -| [subnet2\_id](#input\_subnet2\_id) | Second private subnet used for availability zone redundancy | `string` | n/a | yes | +| [subnet\_ids](#input\_subnet\_ids) | A list of private subnets used for creating the metadata service | `list(string)` | n/a | yes | | [vpc\_cidr\_blocks](#input\_vpc\_cidr\_blocks) | The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications | `list(string)` | n/a | yes | -| [with\_public\_ip](#input\_with\_public\_ip) | Enable public IP assignment for the Metadata Service. Typically you want this to be set to true if using public subnets as subnet1\_id and subnet2\_id, and false otherwise | `bool` | n/a | yes | +| [with\_public\_ip](#input\_with\_public\_ip) | Enable public IP assignment for the Metadata Service. 
Typically you want this to be set to true if the subnets in subnet\_ids are public subnets, and false otherwise | `bool` | n/a | yes | ## Outputs diff --git a/modules/metadata-service/ec2.tf b/modules/metadata-service/ec2.tf index 64ec728..3a8d0f3 100644 --- a/modules/metadata-service/ec2.tf +++ b/modules/metadata-service/ec2.tf @@ -48,7 +48,7 @@ resource "aws_lb" "this" { name = "${var.resource_prefix}nlb${var.resource_suffix}" internal = true load_balancer_type = "network" - subnets = [var.subnet1_id, var.subnet2_id] + subnets = var.subnet_ids tags = var.standard_tags } diff --git a/modules/metadata-service/ecs.tf b/modules/metadata-service/ecs.tf index 1abd89d..4467ecc 100644 --- a/modules/metadata-service/ecs.tf +++ b/modules/metadata-service/ecs.tf @@ -75,7 +75,7 @@ resource "aws_ecs_service" "this" { network_configuration { security_groups = [aws_security_group.metadata_service_security_group.id] assign_public_ip = var.with_public_ip - subnets = [var.subnet1_id, var.subnet2_id] + subnets = var.subnet_ids } load_balancer { diff --git a/modules/metadata-service/lambda.tf b/modules/metadata-service/lambda.tf index af5fce8..502e3ca 100644 --- a/modules/metadata-service/lambda.tf +++ b/modules/metadata-service/lambda.tf @@ -129,7 +129,7 @@ resource "aws_lambda_function" "db_migrate_lambda" { } vpc_config { - subnet_ids = [var.subnet1_id, var.subnet2_id] + subnet_ids = var.subnet_ids security_group_ids = [aws_security_group.metadata_service_security_group.id] } } diff --git a/modules/metadata-service/variables.tf b/modules/metadata-service/variables.tf index b38f99c..831a944 100644 --- a/modules/metadata-service/variables.tf +++ b/modules/metadata-service/variables.tf @@ -108,15 +108,11 @@ variable "standard_tags" { description = "The standard tags to apply to every AWS resource." } -variable "subnet1_id" { - type = string - description = "First private subnet used for availability zone redundancy" +variable "subnet_ids" { + type = list(string) + description = "A list of private subnets used for creating the metadata service" } -variable "subnet2_id" { - type = string - description = "Second private subnet used for availability zone redundancy" -} variable "vpc_cidr_blocks" { type = list(string) description = "The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications" } @@ -124,5 +120,5 @@ variable "with_public_ip" { type = bool - description = "Enable public IP assignment for the Metadata Service. Typically you want this to be set to true if using public subnets as subnet1_id and subnet2_id, and false otherwise" + description = "Enable public IP assignment for the Metadata Service. 
diff --git a/modules/metadata-service/versions.tf b/modules/metadata-service/versions.tf index 1d23dfe..d4b11d8 100644 --- a/modules/metadata-service/versions.tf +++ b/modules/metadata-service/versions.tf @@ -1,8 +1,9 @@ terraform { required_providers { aws = { - source = "hashicorp/aws" + source = "hashicorp/aws" + version = ">= 5.82" } } - required_version = ">= 0.13" + required_version = ">= 1.10" } diff --git a/modules/services/README.md b/modules/services/README.md new file mode 100644 index 0000000..e7350c4 --- /dev/null +++ b/modules/services/README.md @@ -0,0 +1,16 @@ + +## Inputs + +| Name | Description | Type | Default | Required | +|------|-------------|------|---------|:--------:| +| [deploy\_metaflow\_service](#input\_deploy\_metaflow\_service) | Deploy the Metaflow service | `bool` | `true` | no | +| [google\_access\_token](#input\_google\_access\_token) | The Google access token | `string` | `""` | no | +| [kubernetes\_cluster\_ca\_certificate](#input\_kubernetes\_cluster\_ca\_certificate) | The Kubernetes cluster CA certificate | `string` | `""` | no | +| [kubernetes\_cluster\_host](#input\_kubernetes\_cluster\_host) | The Kubernetes cluster host | `string` | `""` | no | +| [metaflow\_database](#input\_metaflow\_database) | Properties of the database that will be used to store metadata about Metaflow runs |
object({
database_name = string
host = string
user = string
password = string
})
| `null` | no | +| [resource\_name\_prefix](#input\_resource\_name\_prefix) | The prefix to use for all resources | `string` | `""` | no | + +## Outputs + +No outputs. +
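Since the new `modules/services` README above documents the Helm-based deployment path, a hypothetical invocation may help. The module path, database values, and the `module.eks` / `aws_eks_cluster_auth` wiring below are assumptions about the caller's setup, not part of this change:

```hcl
data "aws_eks_cluster_auth" "this" {
  name = module.eks.cluster_name
}

module "metaflow_helm" {
  source = "./modules/services"

  # Cluster connection details, taken here from the upstream EKS module outputs.
  kubernetes_cluster_host           = module.eks.cluster_endpoint
  kubernetes_cluster_ca_certificate = module.eks.cluster_certificate_authority_data
  kubernetes_token                  = data.aws_eks_cluster_auth.this.token

  # Database the metadata service and UI backend will connect to.
  metaflow_database = {
    database_name = "metaflow"
    host          = "metaflow-db.example.internal" # placeholder
    user          = "metaflow"
    password      = var.database_password # assumed to be supplied securely by the caller
  }
}
```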
"aws_iam_role_policy_attachment" "cluster_autoscaler_eks" { + role = aws_iam_role.cluster_autoscaler.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy" +} + +resource "aws_iam_role_policy_attachment" "cluster_autoscaler_eks_worker" { + role = aws_iam_role.cluster_autoscaler.name + policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy" +} diff --git a/modules/services/variables.tf b/modules/services/variables.tf new file mode 100644 index 0000000..cfbf757 --- /dev/null +++ b/modules/services/variables.tf @@ -0,0 +1,76 @@ +variable "kubernetes_cluster_host" { + description = "The Kubernetes cluster host" + type = string + default = "" +} + +variable "kubernetes_token" { + description = "The kube config token for the eks cluster" + type = string + default = "" +} + +variable "resource_name_prefix" { + description = "The prefix to use for all resources" + type = string + default = "" +} + +variable "kubernetes_cluster_ca_certificate" { + description = "The Kubernetes cluster CA certificate" + type = string + default = "" +} + +variable "deploy_metaflow_service" { + description = "Deploy the Metaflow service" + type = bool + default = true +} + + +variable "metaflow_database" { + type = object({ + database_name = string + host = string + user = string + password = string + }) + description = "Properties of the database that will be used to store metadata about Metaflow runs" + default = null +} + +variable "metaflow_helm_values" { + description = "Values set to the metaflow helm chart" + type = any + default = {} +} + +variable "cluster_name" { + description = "the name of the EKS cluster" + type = string + default = "" +} + +variable "cluster_oidc_provider" { + description = "The issuer to use for the cluster" + type = string + default = "" +} + +variable "account_id" { + description = "The AWS account ID" + type = string + default = "" +} + +variable "deploy_cluster_autoscaler" { + description = "Deploy the cluster autoscaler" + type = bool + default = true +} + +variable "region" { + description = "The region to deploy the cluster autoscaler" + type = string +} diff --git a/modules/services/versions.tf b/modules/services/versions.tf new file mode 100644 index 0000000..fef8628 --- /dev/null +++ b/modules/services/versions.tf @@ -0,0 +1,21 @@ +# helm_provider.tf +provider "helm" { + kubernetes { + host = var.kubernetes_cluster_host + cluster_ca_certificate = base64decode(var.kubernetes_cluster_ca_certificate) + token = var.kubernetes_token + } +} + +terraform { + required_providers { + aws = { + source = "hashicorp/aws" + version = ">= 5.82" + } + helm = { + source = "hashicorp/helm" + version = ">= 2.17" + } + } +} diff --git a/modules/step-functions/dynamodb.tf b/modules/step-functions/dynamodb.tf index 9977b2f..a90f24c 100644 --- a/modules/step-functions/dynamodb.tf +++ b/modules/step-functions/dynamodb.tf @@ -1,5 +1,4 @@ resource "aws_dynamodb_table" "step_functions_state_table" { - count = var.active ? 1 : 0 name = local.dynamodb_step_functions_state_db_name billing_mode = "PAY_PER_REQUEST" hash_key = "pathspec" diff --git a/modules/step-functions/iam-eventbridge.tf b/modules/step-functions/iam-eventbridge.tf index 46e31e0..3801a8c 100644 --- a/modules/step-functions/iam-eventbridge.tf +++ b/modules/step-functions/iam-eventbridge.tf @@ -28,7 +28,6 @@ data "aws_iam_policy_document" "eventbridge_step_functions_policy" { } resource "aws_iam_role" "eventbridge_role" { - count = var.active ? 
1 : 0 name = "${var.resource_prefix}eventbridge_role${var.resource_suffix}" description = "IAM role for Amazon EventBridge to access AWS Step Functions." assume_role_policy = data.aws_iam_policy_document.eventbridge_assume_role_policy.json @@ -37,8 +36,7 @@ resource "aws_iam_role" "eventbridge_role" { } resource "aws_iam_role_policy" "eventbridge_step_functions_policy" { - count = var.active ? 1 : 0 name = "step_functions" - role = aws_iam_role.eventbridge_role[0].id + role = aws_iam_role.eventbridge_role.id policy = data.aws_iam_policy_document.eventbridge_step_functions_policy.json } diff --git a/modules/step-functions/iam-step-functions.tf b/modules/step-functions/iam-step-functions.tf index e617785..8cc55ec 100644 --- a/modules/step-functions/iam-step-functions.tf +++ b/modules/step-functions/iam-step-functions.tf @@ -138,7 +138,6 @@ data "aws_iam_policy_document" "step_functions_dynamodb" { } resource "aws_iam_role" "step_functions_role" { - count = var.active ? 1 : 0 name = "${var.resource_prefix}step_functions_role${var.resource_suffix}" description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)." assume_role_policy = data.aws_iam_policy_document.step_functions_assume_role_policy.json @@ -147,36 +146,31 @@ resource "aws_iam_role" "step_functions_role" { } resource "aws_iam_role_policy" "step_functions_batch" { - count = var.active ? 1 : 0 name = "aws_batch" - role = aws_iam_role.step_functions_role[0].id + role = aws_iam_role.step_functions_role.id policy = data.aws_iam_policy_document.step_functions_batch_policy.json } resource "aws_iam_role_policy" "step_functions_s3" { - count = var.active ? 1 : 0 name = "s3" - role = aws_iam_role.step_functions_role[0].id + role = aws_iam_role.step_functions_role.id policy = data.aws_iam_policy_document.step_functions_s3.json } resource "aws_iam_role_policy" "step_functions_cloudwatch" { - count = var.active ? 1 : 0 name = "cloudwatch" - role = aws_iam_role.step_functions_role[0].id + role = aws_iam_role.step_functions_role.id policy = data.aws_iam_policy_document.step_functions_cloudwatch.json } resource "aws_iam_role_policy" "step_functions_eventbridge" { - count = var.active ? 1 : 0 name = "event_bridge" - role = aws_iam_role.step_functions_role[0].id + role = aws_iam_role.step_functions_role.id policy = data.aws_iam_policy_document.step_functions_eventbridge.json } resource "aws_iam_role_policy" "step_functions_dynamodb" { - count = var.active ? 
1 : 0 name = "dynamodb" - role = aws_iam_role.step_functions_role[0].id + role = aws_iam_role.step_functions_role.id policy = data.aws_iam_policy_document.step_functions_dynamodb.json } diff --git a/modules/step-functions/moved.tf b/modules/step-functions/moved.tf new file mode 100644 index 0000000..a9b7936 --- /dev/null +++ b/modules/step-functions/moved.tf @@ -0,0 +1,47 @@ +# Move IAM role resources +moved { + from = aws_iam_role.step_functions_role[0] + to = aws_iam_role.step_functions_role +} + +moved { + from = aws_iam_role.eventbridge_role[0] + to = aws_iam_role.eventbridge_role +} + +# Move IAM policy resources +moved { + from = aws_iam_role_policy.step_functions_batch[0] + to = aws_iam_role_policy.step_functions_batch +} + +moved { + from = aws_iam_role_policy.step_functions_s3[0] + to = aws_iam_role_policy.step_functions_s3 +} + +moved { + from = aws_iam_role_policy.step_functions_cloudwatch[0] + to = aws_iam_role_policy.step_functions_cloudwatch +} + +moved { + from = aws_iam_role_policy.step_functions_eventbridge[0] + to = aws_iam_role_policy.step_functions_eventbridge +} + +moved { + from = aws_iam_role_policy.step_functions_dynamodb[0] + to = aws_iam_role_policy.step_functions_dynamodb +} + +moved { + from = aws_iam_role_policy.eventbridge_step_functions_policy[0] + to = aws_iam_role_policy.eventbridge_step_functions_policy +} + +# Move DynamoDB resources +moved { + from = aws_dynamodb_table.step_functions_state_table[0] + to = aws_dynamodb_table.step_functions_state_table +} diff --git a/modules/step-functions/versions.tf b/modules/step-functions/versions.tf index 1d23dfe..d4b11d8 100644 --- a/modules/step-functions/versions.tf +++ b/modules/step-functions/versions.tf @@ -1,8 +1,9 @@ terraform { required_providers { aws = { - source = "hashicorp/aws" + source = "hashicorp/aws" + version = ">= 5.82" } } - required_version = ">= 0.13" + required_version = ">= 1.10" } diff --git a/modules/ui/README.md b/modules/ui/README.md index 338eb7b..36fda7a 100644 --- a/modules/ui/README.md +++ b/modules/ui/README.md @@ -11,6 +11,7 @@ The services are deployed behind an AWS ALB, and the module will output the ALB |------|-------------|------|---------|:--------:| | [METAFLOW\_DATASTORE\_SYSROOT\_S3](#input\_METAFLOW\_DATASTORE\_SYSROOT\_S3) | METAFLOW\_DATASTORE\_SYSROOT\_S3 value | `string` | n/a | yes | | [alb\_internal](#input\_alb\_internal) | Defines whether the ALB is internal | `bool` | `false` | no | +| [alb\_subnet\_ids](#input\_alb\_subnet\_ids) | A list of private or public subnet ids to be used for hosting the UI ALB. This is configured separately from other instances to allow users to specify a public subnet for the ALB while using private subnets for the rest of the components | `list(string)` | n/a | yes | | [certificate\_arn](#input\_certificate\_arn) | SSL certificate ARN. The certificate will be used by the UI load balancer. 
diff --git a/modules/ui/ec2.tf b/modules/ui/ec2.tf index 1802fe5..0563173 100644 --- a/modules/ui/ec2.tf +++ b/modules/ui/ec2.tf @@ -77,7 +77,7 @@ resource "aws_lb" "this" { name = "${var.resource_prefix}alb${var.resource_suffix}" internal = var.alb_internal load_balancer_type = "application" - subnets = [var.subnet1_id, var.subnet2_id] + subnets = var.alb_subnet_ids security_groups = [ aws_security_group.ui_lb_security_group.id ] diff --git a/modules/ui/ecs_ui_backend.tf b/modules/ui/ecs_ui_backend.tf index 61b855a..8f329d5 100644 --- a/modules/ui/ecs_ui_backend.tf +++ b/modules/ui/ecs_ui_backend.tf @@ -61,7 +61,7 @@ resource "aws_ecs_service" "ui_backend" { network_configuration { security_groups = [aws_security_group.fargate_security_group.id, var.metadata_service_security_group_id] assign_public_ip = true - subnets = [var.subnet1_id, var.subnet2_id] + subnets = var.subnet_ids } load_balancer { diff --git a/modules/ui/ecs_ui_static.tf b/modules/ui/ecs_ui_static.tf index dfa99f3..d90ade3 100644 --- a/modules/ui/ecs_ui_static.tf +++ b/modules/ui/ecs_ui_static.tf @@ -53,7 +53,7 @@ resource "aws_ecs_service" "ui_static" { network_configuration { security_groups = [aws_security_group.fargate_security_group.id] assign_public_ip = true - subnets = [var.subnet1_id, var.subnet2_id] + subnets = var.subnet_ids } load_balancer { diff --git a/modules/ui/variables.tf b/modules/ui/variables.tf index db1ce14..662d382 100644 --- a/modules/ui/variables.tf +++ b/modules/ui/variables.tf @@ -71,14 +71,14 @@ variable "standard_tags" { description = "The standard tags to apply to every AWS resource."
} -variable "subnet1_id" { - type = string - description = "First private subnet used for availability zone redundancy" +variable "subnet_ids" { + type = list(string) + description = "A list of private or public subnet ids used for creating the Metaflow UI deployment" } -variable "subnet2_id" { - type = string - description = "Second private subnet used for availability zone redundancy" +variable "alb_subnet_ids" { + type = list(string) + description = "A list of private or public subnet ids to be used for hosting the UI ALB. This is configured separately from the other components to allow users to specify a public subnet for the ALB while using private subnets for the rest of the components" } variable "certificate_arn" { diff --git a/modules/ui/versions.tf b/modules/ui/versions.tf index 1d23dfe..d4b11d8 100644 --- a/modules/ui/versions.tf +++ b/modules/ui/versions.tf @@ -1,8 +1,9 @@ terraform { required_providers { aws = { - source = "hashicorp/aws" + source = "hashicorp/aws" + version = ">= 5.82" } } - required_version = ">= 0.13" + required_version = ">= 1.10" } diff --git a/outputs.tf b/outputs.tf index 9b3b339..5e12fb2 100644 --- a/outputs.tf +++ b/outputs.tf @@ -1,15 +1,15 @@ output "METAFLOW_BATCH_JOB_QUEUE" { - value = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE + value = var.create_managed_compute ? module.metaflow-computation[0].METAFLOW_BATCH_JOB_QUEUE : "" description = "AWS Batch Job Queue ARN for Metaflow" } output "METAFLOW_DATASTORE_SYSROOT_S3" { - value = module.metaflow-datastore.METAFLOW_DATASTORE_SYSROOT_S3 + value = local.METAFLOW_DATASTORE_SYSROOT_S3 description = "Amazon S3 URL for Metaflow DataStore" } output "METAFLOW_DATATOOLS_S3ROOT" { - value = module.metaflow-datastore.METAFLOW_DATATOOLS_S3ROOT + value = var.create_datastore ? module.metaflow-datastore[0].METAFLOW_DATATOOLS_S3ROOT : "" description = "Amazon S3 URL for Metaflow DataTools" } @@ -19,46 +19,46 @@ output "METAFLOW_ECS_S3_ACCESS_IAM_ROLE" { } output "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" { - value = module.metaflow-step-functions.metaflow_eventbridge_role_arn + value = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_eventbridge_role_arn : "" description = "IAM role for Amazon EventBridge to access AWS Step Functions." } output "METAFLOW_SERVICE_INTERNAL_URL" { - value = module.metaflow-metadata-service.METAFLOW_SERVICE_INTERNAL_URL + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].METAFLOW_SERVICE_INTERNAL_URL : "" description = "URL for Metadata Service (Accessible in VPC)" } output "METAFLOW_SERVICE_URL" { - value = module.metaflow-metadata-service.METAFLOW_SERVICE_URL + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].METAFLOW_SERVICE_URL : "" description = "URL for Metadata Service (Accessible in VPC)" } output "METAFLOW_SFN_DYNAMO_DB_TABLE" { - value = module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_name + value = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_step_functions_dynamodb_table_name : "" description = "AWS DynamoDB table name for tracking AWS Step Functions execution metadata." } output "METAFLOW_SFN_IAM_ROLE" { - value = module.metaflow-step-functions.metaflow_step_functions_role_arn + value = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_step_functions_role_arn : "" description = "IAM role for AWS Step Functions to access AWS resources (AWS Batch, AWS DynamoDB)."
} output "api_gateway_rest_api_id_key_id" { - value = module.metaflow-metadata-service.api_gateway_rest_api_id_key_id + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].api_gateway_rest_api_id_key_id : "" description = "API Gateway Key ID for Metadata Service. Fetch Key from AWS Console [METAFLOW_SERVICE_AUTH_KEY]" } output "datastore_s3_bucket_kms_key_arn" { - value = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn + value = module.metaflow-datastore[0].datastore_s3_bucket_kms_key_arn description = "The ARN of the KMS key used to encrypt the Metaflow datastore S3 bucket" } output "metadata_svc_ecs_task_role_arn" { - value = module.metaflow-metadata-service.metadata_svc_ecs_task_role_arn + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].metadata_svc_ecs_task_role_arn : "" } output "metaflow_api_gateway_rest_api_id" { - value = module.metaflow-metadata-service.api_gateway_rest_api_id + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].api_gateway_rest_api_id : "" description = "The ID of the API Gateway REST API we'll use to accept MetaData service requests to forward to the Fargate API instance" } @@ -67,30 +67,30 @@ output "metaflow_batch_container_image" { description = "The ECR repo containing the metaflow batch image" } -output "metaflow_profile_json" { +output "metaflow_aws_managed_profile_json" { value = jsonencode( merge( var.enable_custom_batch_container_registry ? { "METAFLOW_BATCH_CONTAINER_REGISTRY" = element(split("/", aws_ecr_repository.metaflow_batch_image[0].repository_url), 0), "METAFLOW_BATCH_CONTAINER_IMAGE" = element(split("/", aws_ecr_repository.metaflow_batch_image[0].repository_url), 1) } : {}, - var.metadata_service_enable_api_basic_auth ? { - "METAFLOW_SERVICE_AUTH_KEY" = "## Replace with output from 'aws apigateway get-api-key --api-key ${module.metaflow-metadata-service.api_gateway_rest_api_id_key_id} --include-value | grep value' ##" + var.metadata_service_enable_api_basic_auth && var.create_managed_metaflow_metadata_service ? { + "METAFLOW_SERVICE_AUTH_KEY" = "## Replace with output from 'aws apigateway get-api-key --api-key ${module.metaflow-metadata-service[0].api_gateway_rest_api_id_key_id} --include-value | grep value' ##" } : {}, var.batch_type == "fargate" ? { - "METAFLOW_ECS_FARGATE_EXECUTION_ROLE" = module.metaflow-computation.ecs_execution_role_arn + "METAFLOW_ECS_FARGATE_EXECUTION_ROLE" = module.metaflow-computation[0].ecs_execution_role_arn } : {}, { - "METAFLOW_DATASTORE_SYSROOT_S3" = module.metaflow-datastore.METAFLOW_DATASTORE_SYSROOT_S3, - "METAFLOW_DATATOOLS_S3ROOT" = module.metaflow-datastore.METAFLOW_DATATOOLS_S3ROOT, - "METAFLOW_BATCH_JOB_QUEUE" = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE, + "METAFLOW_DATASTORE_SYSROOT_S3" = local.METAFLOW_DATASTORE_SYSROOT_S3, + "METAFLOW_DATATOOLS_S3ROOT" = var.create_datastore ? module.metaflow-datastore[0].METAFLOW_DATATOOLS_S3ROOT : "", + "METAFLOW_BATCH_JOB_QUEUE" = var.create_managed_compute ? 
module.metaflow-computation[0].METAFLOW_BATCH_JOB_QUEUE : "", "METAFLOW_ECS_S3_ACCESS_IAM_ROLE" = aws_iam_role.batch_s3_task_role.arn - "METAFLOW_SERVICE_URL" = module.metaflow-metadata-service.METAFLOW_SERVICE_URL, - "METAFLOW_SERVICE_INTERNAL_URL" = module.metaflow-metadata-service.METAFLOW_SERVICE_INTERNAL_URL, - "METAFLOW_SFN_IAM_ROLE" = module.metaflow-step-functions.metaflow_step_functions_role_arn, - "METAFLOW_SFN_STATE_MACHINE_PREFIX" = replace("${local.resource_prefix}${local.resource_suffix}", "--", "-"), - "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" = module.metaflow-step-functions.metaflow_eventbridge_role_arn, - "METAFLOW_SFN_DYNAMO_DB_TABLE" = module.metaflow-step-functions.metaflow_step_functions_dynamodb_table_name, + "METAFLOW_SERVICE_URL" = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].METAFLOW_SERVICE_URL : "", + "METAFLOW_SERVICE_INTERNAL_URL" = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].METAFLOW_SERVICE_INTERNAL_URL : "", + "METAFLOW_SFN_IAM_ROLE" = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_step_functions_role_arn : "", + "METAFLOW_SFN_STATE_MACHINE_PREFIX" = var.create_step_functions ? replace("${local.resource_prefix}${local.resource_suffix}", "--", "-") : "", + "METAFLOW_EVENTS_SFN_ACCESS_IAM_ROLE" = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_eventbridge_role_arn : "", + "METAFLOW_SFN_DYNAMO_DB_TABLE" = var.create_step_functions ? module.metaflow-step-functions[0].metaflow_step_functions_dynamodb_table_name : "", "METAFLOW_DEFAULT_DATASTORE" = "s3", "METAFLOW_DEFAULT_METADATA" = "service" } @@ -100,17 +100,17 @@ output "metaflow_profile_json" { } output "metaflow_s3_bucket_name" { - value = module.metaflow-datastore.s3_bucket_name + value = module.metaflow-datastore[0].s3_bucket_name description = "The name of the bucket we'll be using as blob storage" } output "metaflow_s3_bucket_arn" { - value = module.metaflow-datastore.s3_bucket_arn + value = local.s3_bucket_arn description = "The ARN of the bucket we'll be using as blob storage" } output "migration_function_arn" { - value = module.metaflow-metadata-service.migration_function_arn + value = var.create_managed_metaflow_metadata_service ? module.metaflow-metadata-service[0].migration_function_arn : "" description = "ARN of DB Migration Function" } @@ -125,6 +125,6 @@ output "ui_alb_arn" { } output "batch_compute_environment_security_group_id" { - value = module.metaflow-computation.batch_compute_environment_security_group_id + value = var.create_managed_metaflow_metadata_service ? module.metaflow-computation[0].batch_compute_environment_security_group_id : "" description = "The ID of the security group attached to the Batch Compute environment." } diff --git a/variables.tf b/variables.tf index 1738c0b..28ada39 100644 --- a/variables.tf +++ b/variables.tf @@ -1,13 +1,123 @@ -variable "access_list_cidr_blocks" { - type = list(string) - description = "List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks." 
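One way to consume the renamed `metaflow_aws_managed_profile_json` output is to write it to disk so it can be imported with `metaflow configure import`; the root module name `metaflow` and the file path below are illustrative:

```hcl
resource "local_file" "metaflow_profile" {
  content  = module.metaflow.metaflow_aws_managed_profile_json
  filename = "${path.root}/metaflow_profile.json"
}
```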
diff --git a/variables.tf b/variables.tf index 1738c0b..28ada39 100644 --- a/variables.tf +++ b/variables.tf @@ -1,13 +1,123 @@ -variable "access_list_cidr_blocks" { - type = list(string) - description = "List of CIDRs we want to grant access to our Metaflow Metadata Service. Usually this is our VPN's CIDR blocks." - default = [] +################################################################################# +# Common +################################################################################# + +variable "iam_partition" { + type = string + default = "aws" + description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" } -variable "batch_type" { +variable "tags" { + description = "aws tags" + type = map(string) +} + +variable "resource_prefix" { + default = "" type = string - description = "AWS Batch Compute Type ('ec2', 'fargate')" - default = "ec2" + description = "string prefix for all resources" +} + +variable "resource_suffix" { + default = "" + type = string + description = "string suffix for all resources" +} + +################################################################################# +# Datastore +################################################################################# + +variable "create_datastore" { + description = "Set to create the datastore components for metaflow, such as the S3 bucket and Postgres database. This value should be set to true in most cases, except when these components are deployed in kubernetes or are created through another means." + type = bool + default = true +} + +variable "force_destroy_s3_bucket" { + type = bool + description = "Empty S3 bucket before destroying via terraform destroy" + default = true +} + +variable "enable_key_rotation" { + type = bool + description = "Enable key rotation for KMS keys" + default = false +} + +variable "db_instance_type" { + type = string + description = "RDS instance type to launch for PostgreSQL database." + default = "db.t3.small" +} + +variable "db_engine_version" { + description = "The database engine version for the RDS instances. This value is also used to determine whether to create an Aurora RDS cluster or a classic RDS instance." + type = string + default = "14" +} + +// -------- If create_datastore is set to false then the following values must be set ----------- +variable "database_name" { + description = "Name of the database to be used when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" +} + +variable "database_username" { + description = "Username for the database when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" +} + +variable "database_password" { + description = "Password for the database when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" + sensitive = true +} + +variable "database_endpoint" { + description = "Endpoint for the database when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" +} + +variable "metaflow_s3_bucket_arn" { + description = "ARN of the S3 bucket to be used when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" +} + +variable "metaflow_s3_bucket_kms_key_arn" { + description = "ARN of the KMS key used to encrypt the S3 bucket when create_datastore is set to false. This variable must be set if create_datastore is set to false." + type = string + default = "" +} + +variable "metaflow_s3_sys_root" { + description = "The S3 root prefix in the metaflow s3 bucket to use. This variable must be set if create_datastore is set to false." + type = string + default = "" +} +// ------------------------------------------------------------------------------------------------------------------------------------------------------------------
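For the "bring your own datastore" path this group of inputs describes, a hypothetical root-module configuration might look like the following; every value is a placeholder, and the module source is an assumption:

```hcl
module "metaflow" {
  source = "outerbounds/metaflow/aws" # assumed module source

  create_datastore               = false
  database_name                  = "metaflow"
  database_username              = "metaflow"
  database_password              = var.external_db_password # assumed caller-defined
  database_endpoint              = "metaflow-db.example.us-west-2.rds.amazonaws.com:5432"
  metaflow_s3_bucket_arn         = "arn:aws:s3:::example-metaflow-datastore"
  metaflow_s3_bucket_kms_key_arn = "arn:aws:kms:us-west-2:111122223333:key/example-key-id"
  metaflow_s3_sys_root           = "s3://example-metaflow-datastore/metaflow" # placeholder

  # ... remaining inputs ...
}
```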
+ +################################################################################# +# AWS Managed: Metadata Service +################################################################################# + +variable "create_managed_metaflow_metadata_service" { + description = "Set to create metaflow metadata-service in managed AWS ECS service. This value should be set to false if the metadata service is deployed within a kubernetes cluster" + type = bool + default = true +} + +variable "access_list_cidr_blocks" { + type = list(string) + description = "List of CIDRs we want to grant access to the Metaflow Metadata Service. Usually these should be your VPN's CIDR blocks." + default = [] } variable "db_migrate_lambda_zip_file" { @@ -16,25 +126,84 @@ variable "db_migrate_lambda_zip_file" { default = null } -variable "enable_custom_batch_container_registry" { +variable "metadata_service_enable_api_basic_auth" { + type = bool + default = true + description = "Enable basic auth for API Gateway? (requires key export)" +} + +variable "metadata_service_enable_api_gateway" { + type = bool + default = true + description = "Enable API Gateway for public metadata service endpoint" +} + +variable "metadata_service_container_image" { + type = string + default = "" + description = "Container image for metadata service" +} + +################################################################################# +# AWS Managed: Metaflow UI +################################################################################# + +variable "create_managed_metaflow_ui" { + description = "Set to create metaflow UI in managed AWS ECS service. This value should be set to false if the UI is deployed within a kubernetes cluster" type = bool default = false - description = "Provisions infrastructure for custom Amazon ECR container registry if enabled" } -variable "enable_step_functions" { +variable "metaflow_ui_is_public" { + description = "Set to true if you would like to make the metaflow UI load balancer publicly accessible" type = bool - description = "Provisions infrastructure for step functions if enabled" + default = false } -variable "resource_prefix" { - default = "metaflow" - description = "string prefix for all resources" +variable "ui_certificate_arn" { + type = string + default = "" + description = "SSL certificate for UI. This value must be set if create_managed_metaflow_ui is set to true." } -variable "resource_suffix" { +variable "ui_allow_list" { + type = list(string) + default = [] + description = "List of CIDRs we want to grant access to our Metaflow UI Service. Usually this is our VPN's CIDR blocks."
+} + +variable "extra_ui_static_env_vars" { + type = map(string) + default = {} + description = "Additional environment variables for UI static app" +} + +variable "extra_ui_backend_env_vars" { + type = map(string) + default = {} + description = "Additional environment variables for UI backend container" +} + +variable "ui_static_container_image" { + type = string default = "" - description = "string suffix for all resources" + description = "Container image for the UI frontend app" +} + +################################################################################# +# AWS Managed: Metaflow Compute +################################################################################# + +variable "create_managed_compute" { + description = "Set to create metaflow compute resources in AWS Batch. This value should be set to false if the compute resources are deployed within a kubernetes cluster" + type = bool + default = true +} + +variable "batch_type" { + type = string + description = "AWS Batch Compute Type ('ec2', 'fargate')" + default = "ec2" } variable "compute_environment_desired_vcpus" { @@ -46,7 +215,7 @@ variable "compute_environment_desired_vcpus" { variable "compute_environment_instance_types" { type = list(string) description = "The instance types for the compute environment" - default = ["c4.large", "c4.xlarge", "c4.2xlarge", "c4.4xlarge", "c4.8xlarge"] + default = ["c5.large", "c5.xlarge", "c5.2xlarge", "c5.4xlarge", "c5.9xlarge"] } variable "compute_environment_min_vcpus" { @@ -67,17 +236,6 @@ variable "compute_environment_egress_cidr_blocks" { description = "CIDR blocks to which egress is allowed from the Batch Compute environment's security group" } -variable "db_instance_type" { - type = string - description = "RDS instance type to launch for PostgresQL database." - default = "db.t2.small" -} - -variable "db_engine_version" { - type = string - default = "11" -} - variable "launch_template_http_endpoint" { type = string description = "Whether the metadata service is available. Can be 'enabled' or 'disabled'" @@ -96,106 +254,133 @@ variable "launch_template_http_put_response_hop_limit" { default = 2 } -variable "iam_partition" { - type = string - default = "aws" - description = "IAM Partition (Select aws-us-gov for AWS GovCloud, otherwise leave as is)" +################################################################################# +# Step Functions +################################################################################# + +variable "create_step_functions" { + type = bool + description = "Provisions infrastructure for step functions if enabled" + default = false } -variable "metadata_service_container_image" { - type = string - default = "" - description = "Container image for metadata service" +################################################################################# +# ECR +################################################################################# + +variable "enable_custom_batch_container_registry" { + type = bool + default = false + description = "Provisions infrastructure for custom Amazon ECR container registry if enabled" }
(requires key export)" + default = false } -variable "metadata_service_enable_api_gateway" { +variable "create_public_subnets_only" { + description = "Set to create a VPC with only public subnets. Using only public subnets helps reduce AWS costs by removing the need to create a NAT gateway. However, it also increases security risk to your infrastructure since a misconfigured security group can expose your infrastructure on the public internet. Hence we only recommend setting this for experimental deployments." type = bool - default = true - description = "Enable API Gateway for public metadata service endpoint" + default = false } -variable "ui_static_container_image" { +variable "vpc_cidr" { + description = "The CIDR block for the VPC" type = string - default = "" - description = "Container image for the UI frontend app" + default = "10.0.0.0/16" } -variable "tags" { - description = "aws tags" - type = map(string) +variable "azs" { + description = "A list of availability zones names in the region" + type = list(string) + default = [] } -variable "ui_alb_internal" { - type = bool - description = "Defines whether the ALB for the UI is internal" - default = false +variable "private_subnet_tags" { + description = "Additional tags for the private subnets" + type = map(string) + default = {} } -# variables from infra project that defines the VPC we will deploy to +variable "public_subnet_tags" { + description = "Additional tags for the public subnets" + type = map(string) + default = {} +} -variable "subnet1_id" { - type = string - description = "First subnet used for availability zone redundancy" +variable "existing_vpc_cidr_blocks" { + type = list(string) + description = "The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications. Needs to be set if create_vpc is set to false" + default = [] } -variable "subnet2_id" { +variable "existing_vpc_id" { type = string - description = "Second subnet used for availability zone redundancy" + description = "The id of the single VPC we stood up for all Metaflow resources to exist in. Needs to be set if create_vpc is set to false" + default = "" } -variable "vpc_cidr_blocks" { +variable "existing_private_subnet_ids" { type = list(string) - description = "The VPC CIDR blocks that we'll access list on our Metadata Service API to allow all internal communications" + description = "List of private subnet ids that will be used to create metaflow components in. If create_vpc is set to false, either private_subnet_ids, public_subnet_ids or both need to be set. Setting private_subnet_ids will result in a more " + default = [] } -variable "vpc_id" { - type = string - description = "The id of the single VPC we stood up for all Metaflow resources to exist in." +variable "existing_public_subnet_ids" { + type = list(string) + description = "List of public subnet_ids that will be used to create metaflow components that you want to expose on the public internet. This may need to be set if create_vpc is set to false" + default = [] } -variable "ui_certificate_arn" { - type = string - default = "" - description = "SSL certificate for UI. If set to empty string, UI is disabled. 
" +################################################################################# +# EKS +################################################################################# + +variable "create_eks_cluster" { + description = "Set to create an EKS cluster" + type = bool + default = false } -variable "ui_allow_list" { - type = list(string) - default = [] - description = "List of CIDRs we want to grant access to our Metaflow UI Service. Usually this is our VPN's CIDR blocks." +variable "node_groups" { + type = any + description = "A key value map of EKS node group definitions that will directly override the inputs top the upstream EKS terraform module." + default = {} } -variable "extra_ui_backend_env_vars" { - type = map(string) +variable "node_group_defaults" { + type = any + description = "A key value map of EKS node group default configurations that will directly override the inputs top the upstream EKS terraform module." default = {} - description = "Additional environment variables for UI backend container" } -variable "extra_ui_static_env_vars" { +variable "node_group_iam_role_additional_policies" { type = map(string) + description = "A list of additional IAM policies to attach to the EKS worker nodes. This value directly overrides the input to the upstream EKS terraform module" default = {} - description = "Additional environment variables for UI static app" } -variable "with_public_ip" { +variable "deploy_cluster_autoscaler" { type = bool - description = "Enable public IP assignment for the Metadata Service. If the subnets specified for subnet1_id and subnet2_id are public subnets, you will NEED to set this to true to allow pulling container images from public registries. Otherwise this should be set to false." + description = "Set to deploy the cluster autoscaler" + default = false + } -variable "force_destroy_s3_bucket" { +variable "deploy_metaflow_services_in_eks" { + description = "Set to deploy metaflow metadata service and metaflow ui via the helm chart." type = bool - description = "Empty S3 bucket before destroying via terraform destroy" default = false } -variable "enable_key_rotation" { - type = bool - description = "Enable key rotation for KMS keys" - default = false +variable "metaflow_helm_values" { + description = "These are used to override the default values of the metaflow helm chart" + type = any + default = {} }