
Commit d2481f6

initial implementation of standalone metaflow module (#3)
* initial metaflow module
* remove comments
* make final snapshot unique to prevent issues when recreating db instances
* add comments for replace
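The datastore changes mentioned in the third bullet live in files hidden from this view. As a hypothetical sketch of the general pattern that bullet describes (none of these names come from the actual commit), making the final snapshot name unique avoids a collision with the snapshot left behind the last time the instance was destroyed:

```hcl
# Hypothetical sketch; not the commit's actual datastore code.
resource "random_pet" "final_snapshot_suffix" {}

resource "aws_db_instance" "example" {
  identifier        = "metaflow-example-db"
  engine            = "postgres"
  instance_class    = "db.t3.micro"
  allocated_storage = 20
  username          = "example"
  password          = "example-only-change-me"

  # A unique suffix keeps the final snapshot name from colliding with one left by
  # a previously destroyed instance when the database is recreated.
  final_snapshot_identifier = "metaflow-db-final-snapshot-${random_pet.final_snapshot_suffix.id}"
}
```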


45 files changed: 2137 additions, 0 deletions
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# README

This project is composed of modules that break up the responsibility into logical parts. See each module's
corresponding `README.md` for more details.

Provides the core functionality for Metaflow, which includes:

- on demand processing (`computation`)
- blob and tabular storage (`datastore`)
- an API to record and query past executions (`metadata-service`)
- orchestrated processing (`step-functions`)

Depends on the output of the project `infra`.

## ECR

Sets up an AWS ECR repository to hold the Docker image we want to use with Metaflow.
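The module is meant to sit on top of the `infra` project's outputs. As a non-authoritative sketch of what a consumer might write (the `infra` output names and the module path are assumptions; only the input names appear in this commit's main.tf):

```hcl
# Sketch only; `module.infra` outputs and the source path are assumptions.
module "metaflow" {
  source = "./aws/terraform/modules/metaflow"

  vpc_id                  = module.infra.vpc_id
  vpc_cidr_block          = module.infra.vpc_cidr_block
  subnet_private_1_id     = module.infra.subnet_private_1_id
  subnet_private_2_id     = module.infra.subnet_private_2_id
  access_list_cidr_blocks = []
  enable_step_functions   = true

  # Remaining inputs (metaflow_policy_arn, resource prefix/suffix, vCPU sizing) omitted for brevity.

  tags = {
    project = "metaflow"
  }
}
```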

aws/terraform/modules/metaflow/ecr.tf

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
resource "aws_ecr_repository" "metaflow_batch_image" {
  name = local.metaflow_batch_image_name

  tags = var.tags
}
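The module's outputs file is not visible in this view. A hedged sketch of how the repository URL could be exposed so the Docker image can be pushed and referenced by Batch job definitions (the output name is an assumption):

```hcl
# Assumed output; not part of the visible diff.
output "metaflow_batch_container_image" {
  value       = aws_ecr_repository.metaflow_batch_image.repository_url
  description = "URL of the ECR repository holding the Docker image used for Metaflow Batch jobs"
}
```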
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
locals {
  resource_prefix = length(var.resource_prefix) > 0 ? "${var.resource_prefix}-" : ""
  resource_suffix = length(var.resource_suffix) > 0 ? "-${var.resource_suffix}" : ""

  metaflow_batch_image_name = "${local.resource_prefix}batch${local.resource_suffix}"
}
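To make the ternaries concrete, here is how these locals evaluate for a couple of hypothetical inputs (illustration only):

```hcl
# Illustration only, with hypothetical inputs:
#   resource_prefix = "metaflow", resource_suffix = "prod"
#     local.resource_prefix           = "metaflow-"
#     local.resource_suffix           = "-prod"
#     local.metaflow_batch_image_name = "metaflow-batch-prod"
#
#   resource_prefix = "", resource_suffix = ""
#     local.metaflow_batch_image_name = "batch"
```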
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
module "metaflow-datastore" {
  source = "./modules/datastore"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id                    = var.vpc_id
  ecs_instance_role_arn              = module.metaflow-computation.ecs_instance_role_arn
  ecs_execution_role_arn             = module.metaflow-computation.ecs_execution_role_arn
  aws_batch_service_role_arn         = module.metaflow-computation.batch_service_role_arn
  subnet_private_1_id                = var.subnet_private_1_id
  subnet_private_2_id                = var.subnet_private_2_id
  metadata_service_security_group_id = module.metaflow-metadata-service.metadata_service_security_group_id

  standard_tags = var.tags
}

module "metaflow-metadata-service" {
  source = "./modules/metadata-service"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id              = var.vpc_id
  vpc_cidr_block               = var.vpc_cidr_block
  subnet_private_1_id          = var.subnet_private_1_id
  subnet_private_2_id          = var.subnet_private_2_id
  rds_master_instance_endpoint = module.metaflow-datastore.rds_master_instance_endpoint
  database_username            = module.metaflow-datastore.database_username
  database_password            = module.metaflow-datastore.database_password
  fargate_task_role_arn        = module.metaflow-datastore.iam_s3_access_role_arn
  fargate_execution_role_arn   = module.metaflow-computation.ecs_execution_role_arn
  access_list_cidr_blocks      = var.access_list_cidr_blocks

  standard_tags = var.tags
}

module "metaflow-computation" {
  source = "./modules/computation"

  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  metaflow_vpc_id                                   = var.vpc_id
  subnet_private_1_id                               = var.subnet_private_1_id
  subnet_private_2_id                               = var.subnet_private_2_id
  s3_kms_policy_arn                                 = module.metaflow-datastore.metaflow_kms_s3_policy_arn
  metaflow_policy_arn                               = var.metaflow_policy_arn
  metaflow_step_functions_dynamodb_policy           = module.metaflow-step-functions.metaflow_step_functions_dynamodb_policy
  batch_compute_environment_cpu_max_vcpus           = var.cpu_max_compute_vcpus
  batch_compute_environment_cpu_desired_vcpus       = var.cpu_desired_compute_vcpus
  batch_compute_environment_cpu_min_vcpus           = var.cpu_min_compute_vcpus
  batch_compute_environment_large_cpu_max_vcpus     = var.large_cpu_max_compute_vcpus
  batch_compute_environment_large_cpu_desired_vcpus = var.large_cpu_desired_compute_vcpus
  batch_compute_environment_large_cpu_min_vcpus     = var.large_cpu_min_compute_vcpus
  batch_compute_environment_gpu_max_vcpus           = var.gpu_max_compute_vcpus
  batch_compute_environment_gpu_desired_vcpus       = var.gpu_desired_compute_vcpus
  batch_compute_environment_gpu_min_vcpus           = var.gpu_min_compute_vcpus
  enable_step_functions                             = var.enable_step_functions

  standard_tags = var.tags
}

module "metaflow-step-functions" {
  source = "./modules/step-functions"

  active          = var.enable_step_functions
  resource_prefix = local.resource_prefix
  resource_suffix = local.resource_suffix

  batch_job_queue_arn = module.metaflow-computation.METAFLOW_BATCH_JOB_QUEUE
  s3_bucket_arn       = module.metaflow-datastore.s3_bucket_arn
  s3_bucket_kms_arn   = module.metaflow-datastore.datastore_s3_bucket_kms_key_arn

  standard_tags = var.tags
}
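The root module's variables.tf is part of this commit but hidden in this view. A minimal sketch of declarations implied by the references above (types, descriptions, and defaults are assumptions):

```hcl
# Assumed declarations; the actual variables.tf in this commit is not shown above.
variable "vpc_id" {
  type        = string
  description = "VPC to deploy Metaflow into, provided by the `infra` project"
}

variable "vpc_cidr_block" {
  type = string
}

variable "subnet_private_1_id" {
  type = string
}

variable "subnet_private_2_id" {
  type = string
}

variable "enable_step_functions" {
  type        = bool
  description = "Toggles the step-functions module on or off"
}

variable "tags" {
  type    = map(string)
  default = {}
}
```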
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
# Modules

Our Metaflow Terraform code has been split into separate modules based on the service architecture.

## Computation

Sets up remote computation resources so flows can be run on EC2 instances. These resources do not perform
orchestration and rely on the data scientist's computer to perform this coordination.

## Datastore

Sets up blob and tabular data storage. Records all flows, the steps they took, their conda environments, artifacts
and results.

Should exist for the lifetime of the stack.

## Metadata Service

Sets up an API entrypoint to interact with all other services, both for running flows and for interacting with the
Datastore to explore historic runs.

## Step Functions

Sets up remote computation resources that come with orchestration. This allows data scientists to schedule flows
using crons, as well as to kick off flows and then shut down their machines, since the remote resources handle all
coordination.
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
# Computation

This module sets up the resources required to perform remote AWS Batch executions. One can modify how many resources
we want to have available at a given moment, as well as which resources we want to allow to scale up.

One can use Metaflow without this module by running purely locally and leveraging the Datastore.

To read more, see [the Metaflow docs](https://docs.metaflow.org/metaflow-on-aws/metaflow-on-aws#compute).
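As a rough illustration (values are arbitrary; the variable names come from the root main.tf shown earlier), the amount of Batch capacity this module manages could be tuned from a tfvars file along these lines:

```hcl
# terraform.tfvars (illustrative values only)
cpu_min_compute_vcpus     = 0
cpu_desired_compute_vcpus = 4
cpu_max_compute_vcpus     = 64

large_cpu_min_compute_vcpus     = 0
large_cpu_desired_compute_vcpus = 0
large_cpu_max_compute_vcpus     = 128

gpu_min_compute_vcpus     = 0
gpu_desired_compute_vcpus = 0
gpu_max_compute_vcpus     = 8
```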
Lines changed: 213 additions & 0 deletions
@@ -0,0 +1,213 @@
resource "aws_batch_compute_environment" "cpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.cpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_cpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_cpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_cpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_cpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_compute_environment" "large-cpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.large_cpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_large_cpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_large_cpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_large_cpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_large_cpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_compute_environment" "gpu" {
  /* Unique name for the compute environment.
     We use compute_environment_name_prefix as opposed to just compute_environment_name because Batch compute
     environments must be created and destroyed, never edited. This way, when we go to make a "modification", we
     stand up a new compute environment with a new unique name and, once that succeeds, the old one is torn down.
     If we had just used compute_environment_name, there would be a name conflict when we went to stand up the new
     compute environment with the modifications applied, and the process would fail.
  */
  compute_environment_name_prefix = local.gpu_compute_env_prefix_name

  # Give permissions so the Batch service can make API calls.
  service_role = aws_iam_role.batch_service_role.arn
  type         = "MANAGED"
  depends_on   = [aws_iam_role_policy_attachment.batch_service_role]

  compute_resources {
    # Give permissions so the ECS container instances can make API calls.
    instance_role = aws_iam_instance_profile.ecs_instance_role.arn

    # List of instance types that can be launched.
    instance_type = var.batch_gpu_instance_types

    # Allowed range of vCPUs.
    max_vcpus     = var.batch_compute_environment_gpu_max_vcpus
    min_vcpus     = var.batch_compute_environment_gpu_min_vcpus
    desired_vcpus = var.batch_compute_environment_gpu_desired_vcpus

    # Prefers the cheapest suitable instance types.
    allocation_strategy = "BEST_FIT"

    /* Links to a launch template that has more than the standard 8 GB of disk space, so we can download training
       data. Always uses the "default version", which means we can update the launch template to a smaller or
       larger disk size and this compute environment will not have to be destroyed and recreated to point to a
       new launch template.
    */
    launch_template {
      launch_template_id = aws_launch_template.this.id
      version            = aws_launch_template.this.latest_version
    }

    # Security group to apply to the launched instances.
    security_group_ids = [
      aws_security_group.batch.id,
    ]

    # Which subnets to launch the instances into.
    subnets = [
      var.subnet_private_1_id,
      var.subnet_private_2_id
    ]

    # "EC2" launches on-demand instances. Can use "SPOT" to use unused instances at a discount, if available.
    type = "EC2"

    tags = var.standard_tags
  }

  lifecycle {
    /* From https://github.com/terraform-providers/terraform-provider-aws/issues/11077#issuecomment-560416740
       Helps with "modifying" Batch compute environments, which requires creating new ones and deleting old ones,
       as no in-place modification can be made.
    */
    create_before_destroy = true
    # Ensures Terraform redeploys do not silently overwrite an up-to-date desired_vcpus that Metaflow may modify.
    ignore_changes = [compute_resources.0.desired_vcpus]
  }
}

resource "aws_batch_job_queue" "this" {
  name     = local.batch_queue_name
  state    = "ENABLED"
  priority = 1
  compute_environments = [
    aws_batch_compute_environment.cpu.arn,
    aws_batch_compute_environment.large-cpu.arn,
    aws_batch_compute_environment.gpu.arn
  ]

  tags = var.standard_tags
}
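The aws_launch_template.this referenced by all three compute environments is defined in a file that is hidden in this commit view. As a non-authoritative sketch, assuming its purpose is the larger-than-8 GB root volume mentioned in the comments above (device name, volume size, and naming are hypothetical), it might look roughly like this:

```hcl
# Hypothetical sketch only; the real launch template in this commit is not visible here.
resource "aws_launch_template" "this" {
  name_prefix = "metaflow-batch-launch-tmpl-"

  block_device_mappings {
    # Device name and size are assumptions; the intent is simply a root volume
    # larger than the default 8 GB so training data fits on disk.
    device_name = "/dev/xvda"

    ebs {
      volume_size = 100
      volume_type = "gp2"
    }
  }

  tags = var.standard_tags
}
```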
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
data "aws_region" "current" {}
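Presumably this data source is referenced elsewhere in the module (not visible in this view), for example as `data.aws_region.current.name` when constructing region-specific ARNs or endpoints; that usage is an assumption rather than something shown in this diff.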
