Skip to content

Commit 9b24cca

Browse files
authored
DPF-226 remove crawlers from the source module and recreate it for each department (#2042)
* DPF-226 remove crawlers from the source module and recreate it for each department * change the role arn of crawler * add temporary triggers on crawlers * add an index to trigger action * add tags to make the terraform linter happy
1 parent 0cd761f commit 9b24cca

File tree

3 files changed

+103
-12
lines changed

3 files changed

+103
-12
lines changed

terraform/etl/60-airflow-etl-used-crawlers.tf

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,3 +90,105 @@ resource "aws_glue_crawler" "parking_spatially_enriched_refined_zone" {
9090
}
9191
})
9292
}
93+
94+
# Glue crawler over the parking department's google-sheets/ prefix in the
# raw zone bucket; discovered tables are registered in the parking raw zone
# catalog database.
resource "aws_glue_crawler" "parking_google_sheet_ingestion_raw_zone" {
  # Instantiated only when local.is_live_environment is true.
  count = local.is_live_environment ? 1 : 0

  name          = "${local.short_identifier_prefix}${module.department_parking_data_source.identifier}-google-sheet-ingestion-raw-zone"
  database_name = module.department_parking_data_source.raw_zone_catalog_database_name
  role          = data.aws_iam_role.glue_role.arn

  s3_target {
    path = "s3://${module.raw_zone_data_source.bucket_id}/${module.department_parking_data_source.identifier}/google-sheets/"
  }

  configuration = jsonencode({
    Version = 1.0

    # NOTE(review): Glue counts the bucket as level 1, so level 4 should be the
    # folder directly beneath google-sheets/ - confirm against the S3 layout.
    Grouping = {
      TableLevelConfiguration = 4
    }

    CrawlerOutput = {
      Partitions = {
        AddOrUpdateBehavior = "InheritFromTable"
      }
      Tables = {
        AddOrUpdateBehavior = "MergeNewColumns"
      }
    }
  })
}
114+
115+
# Glue crawler over the housing department's google-sheets/ prefix in the
# raw zone bucket; discovered tables are registered in the housing raw zone
# catalog database.
resource "aws_glue_crawler" "housing_google_sheet_ingestion_raw_zone" {
  # Instantiated only when local.is_live_environment is true.
  count = local.is_live_environment ? 1 : 0

  name          = "${local.short_identifier_prefix}${module.department_housing_data_source.identifier}-google-sheet-ingestion-raw-zone"
  database_name = module.department_housing_data_source.raw_zone_catalog_database_name
  role          = data.aws_iam_role.glue_role.arn

  s3_target {
    path = "s3://${module.raw_zone_data_source.bucket_id}/${module.department_housing_data_source.identifier}/google-sheets/"
  }

  configuration = jsonencode({
    Version = 1.0

    # NOTE(review): Glue counts the bucket as level 1, so level 4 should be the
    # folder directly beneath google-sheets/ - confirm against the S3 layout.
    Grouping = {
      TableLevelConfiguration = 4
    }

    CrawlerOutput = {
      Partitions = {
        AddOrUpdateBehavior = "InheritFromTable"
      }
      Tables = {
        AddOrUpdateBehavior = "MergeNewColumns"
      }
    }
  })
}
135+
136+
# Glue crawler over the data-and-insight department's google-sheets/ prefix
# in the raw zone bucket; discovered tables are registered in that
# department's raw zone catalog database.
resource "aws_glue_crawler" "data_and_insight_google_sheet_ingestion_raw_zone" {
  # Instantiated only when local.is_live_environment is true.
  count = local.is_live_environment ? 1 : 0

  name          = "${local.short_identifier_prefix}${module.department_data_and_insight_data_source.identifier}-google-sheet-ingestion-raw-zone"
  database_name = module.department_data_and_insight_data_source.raw_zone_catalog_database_name
  role          = data.aws_iam_role.glue_role.arn

  s3_target {
    path = "s3://${module.raw_zone_data_source.bucket_id}/${module.department_data_and_insight_data_source.identifier}/google-sheets/"
  }

  configuration = jsonencode({
    Version = 1.0

    # NOTE(review): Glue counts the bucket as level 1, so level 4 should be the
    # folder directly beneath google-sheets/ - confirm against the S3 layout.
    Grouping = {
      TableLevelConfiguration = 4
    }

    CrawlerOutput = {
      Partitions = {
        AddOrUpdateBehavior = "InheritFromTable"
      }
      Tables = {
        AddOrUpdateBehavior = "MergeNewColumns"
      }
    }
  })
}
156+
157+
# The crawler triggers below are temporary and will be removed once the Airflow Google Sheets ingestion DAG is enabled.
158+
# Scheduled trigger that starts the parking Google Sheets raw-zone crawler.
resource "aws_glue_trigger" "parking_google_sheet_ingestion_raw_zone_trigger" {
  # The crawler referenced in `actions` is declared with this same count guard,
  # so it has no [0] instance when local.is_live_environment is false. Without
  # a matching guard here, the [0] reference below is an invalid index and
  # planning fails in those environments.
  count = local.is_live_environment ? 1 : 0

  name              = "${local.short_identifier_prefix}${module.department_parking_data_source.identifier}-google-sheet-ingestion-raw-zone-trigger"
  type              = "SCHEDULED"
  schedule          = "cron(0 7 ? * * *)" # daily at 07:00
  start_on_creation = true

  actions {
    crawler_name = aws_glue_crawler.parking_google_sheet_ingestion_raw_zone[0].name
  }

  tags = module.tags.values
}
169+
170+
171+
# Scheduled trigger that starts the housing Google Sheets raw-zone crawler.
resource "aws_glue_trigger" "housing_google_sheet_ingestion_raw_zone_trigger" {
  # The crawler referenced in `actions` is declared with this same count guard,
  # so it has no [0] instance when local.is_live_environment is false. Without
  # a matching guard here, the [0] reference below is an invalid index and
  # planning fails in those environments.
  count = local.is_live_environment ? 1 : 0

  name              = "${local.short_identifier_prefix}${module.department_housing_data_source.identifier}-google-sheet-ingestion-raw-zone-trigger"
  type              = "SCHEDULED"
  schedule          = "cron(0 7 ? * * *)" # daily at 07:00
  start_on_creation = true

  actions {
    crawler_name = aws_glue_crawler.housing_google_sheet_ingestion_raw_zone[0].name
  }

  tags = module.tags.values
}
182+
183+
184+
# Scheduled trigger that starts the data-and-insight Google Sheets raw-zone crawler.
resource "aws_glue_trigger" "data_and_insight_google_sheet_ingestion_raw_zone_trigger" {
  # The crawler referenced in `actions` is declared with this same count guard,
  # so it has no [0] instance when local.is_live_environment is false. Without
  # a matching guard here, the [0] reference below is an invalid index and
  # planning fails in those environments.
  count = local.is_live_environment ? 1 : 0

  name              = "${local.short_identifier_prefix}${module.department_data_and_insight_data_source.identifier}-google-sheet-ingestion-raw-zone-trigger"
  type              = "SCHEDULED"
  schedule          = "cron(0 7 ? * * *)" # daily at 07:00
  start_on_creation = true

  actions {
    crawler_name = aws_glue_crawler.data_and_insight_google_sheet_ingestion_raw_zone[0].name
  }

  tags = module.tags.values
}
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
locals {
22
dataset_name = lower(replace(var.dataset_name, "_", "-"))
33
import_name = "${var.department.identifier}-${local.dataset_name}"
4-
full_output_path = "s3://${var.bucket_id}/${var.department.identifier}/${local.dataset_name}"
4+
full_output_path = "s3://${var.bucket_id}/${var.department.identifier}/google-sheets/${local.dataset_name}"
55
sheets_credentials_name = var.sheets_credentials_name == null ? var.department.google_service_account.credentials_secret.name : var.sheets_credentials_name
66
}

terraform/modules/google-sheets-glue-job/10-aws-glue-job.tf

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,6 @@ module "google_sheet_import" {
2424
schedule = var.google_sheet_import_schedule
2525
max_retries = var.max_retries
2626
trigger_enabled = (var.is_live_environment && var.enable_glue_trigger)
27-
crawler_details = {
28-
database_name = var.glue_catalog_database_name
29-
s3_target_location = local.full_output_path
30-
table_prefix = "${var.department.identifier_snake_case}_"
31-
configuration = jsonencode({
32-
Version = 1.0
33-
Grouping = {
34-
TableGroupingPolicy = "CombineCompatibleSchemas"
35-
}
36-
})
37-
}
3827
}
3928

4029
resource "aws_glue_workflow" "workflow" {

0 commit comments

Comments
 (0)