@@ -1,5 +1,8 @@
 AWSTemplateFormatVersion: '2010-09-09'
-Description: 'RDS Multi-Tenant Cost Visibility Module'
+Description: |
+  RDS Multi-Tenant Cost Visibility Module
+  Collects CloudWatch Database Insights metrics to enable cost allocation for multi-tenant RDS instances.
+  Supports user-level metric collection on all RDS engines and database-level collection on engines that support the db.name dimension.
 
 Parameters:
   DatabaseName:
@@ -68,6 +71,7 @@ Parameters:
     Description: Arn of the Account Collector Lambda
 
 Resources:
+  # IAM Role for the Lambda function to collect CloudWatch Database Insights metrics
   RDSMetricsLambdaRole:
     Type: AWS::IAM::Role
     Properties:
@@ -110,6 +114,7 @@ Resources:
                   - s3:PutObject
                 Resource: !Sub "${DestinationBucketARN}/*"
 
+  # Lambda function that collects CloudWatch Database Insights metrics for multi-tenant cost allocation
   RDSPerformanceInsightsFnHourly:
     Type: AWS::Lambda::Function
     Properties:
@@ -152,6 +157,10 @@ Resources:
 
 
           def lambda_handler(event, context):
+              """
+              Main Lambda handler for collecting CloudWatch Database Insights metrics from RDS instances.
+              Enables multi-tenant cost allocation by collecting db.load metrics by user and database dimensions.
+              """
               if 'account' not in event:
                   raise ValueError(
                       "Please do not trigger this Lambda manually."
@@ -164,7 +173,7 @@ Resources:
               account_name = account["account_name"]
               payer_id = account["payer_id"]
 
-              print(f"Collecting RDS Performance Insights data for account: {account_id}")
+              print(f"Collecting CloudWatch Database Insights data for account: {account_id}")
 
               # Get all active AWS regions
               ec2_client = boto3.client('ec2')
@@ -236,32 +245,43 @@ Resources:
 
 
           def should_collect_database_metrics(engine):
+              """
+              Determines if database dimension metrics are supported for the given engine.
+              Oracle and SQL Server engines don't support the db.name dimension in CloudWatch Database Insights.
+              https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PerfInsights.UsingDashboard.Components.html#USER_PerfInsights.UsingDashboard.Components.AvgActiveSessions
+              """
               excluded_engines = ['oracle-ee', 'oracle-se2', 'oracle-se1', 'oracle-se',
                                   'sqlserver-ee', 'sqlserver-se', 'sqlserver-ex', 'sqlserver-web']
               return engine.lower() not in excluded_engines
 
           def get_performance_metrics(pi_client, instance_id, engine):
+              """
+              Retrieves CloudWatch Database Insights metrics for the specified RDS instance.
+              Collects CPU metrics and db.load metrics grouped by user and database dimensions.
+              """
               current_time = datetime.utcnow()
-              end_time = current_time.replace(minute=0, second=0, microsecond=0)
+              end_time = current_time.replace(minute=0, second=0, microsecond=0)  # Round to top of hour
               start_time = end_time - timedelta(hours=hour_delta)
 
+              # Base metric: vCPU count (no dimensions)
               metric_queries = [
                   {
-                      'Metric': 'os.general.numVCPUs.avg'
+                      'Metric': 'os.general.numVCPUs.avg'  # Used for cost allocation calculations
                   },
                   {
-                      'Metric': 'db.load.avg',
+                      'Metric': 'db.load.avg',  # Database load by user (tenant)
                       'GroupBy': {
                           'Group': 'db.user',
                           'Dimensions': ['db.user.name']
                       }
                   }
               ]
 
-              # Add database dimension query if engine supports it
+              # Add database dimension if supported by engine
+              # Oracle and SQL Server don't support database-level grouping
              if should_collect_database_metrics(engine):
                   metric_queries.append({
-                      'Metric': 'db.load.avg',
+                      'Metric': 'db.load.avg',  # Database load by database name
                       'GroupBy': {
                           'Group': 'db',
                           'Dimensions': ['db.name']
@@ -281,42 +301,48 @@ Resources:
 
 
           def process_metrics(instance_id, instance_arn, metrics, region, engine):
+              """
+              Processes CloudWatch Database Insights metrics and flattens them for storage.
+              Creates records for each metric data point with proper dimension mapping.
+              """
               all_flattened_metrics = []
               num_cpus = ''
 
               print(f"Processing metrics for instance {instance_id} in region {region}")
               print(f"Total metrics received: {len(metrics)}")
 
               for metric in metrics:
-                  # Debug print for each metric
                   print(f"Processing metric: {metric['Key']['Metric']}")
 
+                  # Extract CPU count for cost allocation calculations
                   if metric["Key"]["Metric"] == 'os.general.numVCPUs.avg':
                       for datapoint in metric["DataPoints"]:
                           num_cpus = datapoint.get("Value", 0)
                           if num_cpus != '':
                               break
 
+                  # Process metrics with dimensions (user or database level)
                   if "Dimensions" in metric["Key"]:
                       dimensions = metric["Key"]["Dimensions"]
 
-                      # Determine dimension type based on available dimensions
+                      # Determine dimension type for multi-tenant cost allocation
                       if 'db.user.name' in dimensions:
-                          dimension_type = 'user'
+                          dimension_type = 'user'  # Cost allocation by database user
                       elif 'db.name' in dimensions:
-                          dimension_type = 'database'
+                          dimension_type = 'database'  # Cost allocation by database name
                       else:
                           dimension_type = 'unknown'
 
+                      # Create base record with all metadata
                       base_entry = {
                           "metric": metric["Key"]["Metric"],
                           "resourcearn": instance_arn,
                           "instance_id": instance_id,
-                          "engine": engine,
-                          "num_vcpus": num_cpus,
-                          "dimension_type": dimension_type,
-                          "db_user_name": dimensions.get('db.user.name', None),
-                          "db_database_name": dimensions.get('db.name', None)
+                          "engine": engine,  # Database engine type
+                          "num_vcpus": num_cpus,  # For cost calculations
+                          "dimension_type": dimension_type,  # user|database|unknown
+                          "db_user_name": dimensions.get('db.user.name', None),  # Tenant user
+                          "db_database_name": dimensions.get('db.name', None)  # Database name
                       }
 
                       for datapoint in metric["DataPoints"]:
@@ -347,14 +373,17 @@ Resources:
 
               )
           def write_metrics_to_s3(s3_client, region_metrics, account_id, payer_id):
-
-              # Group metrics by their individual timestamps
+              """
+              Writes collected metrics to S3 in Parquet format with proper partitioning.
+              Groups metrics by timestamp and stores them in hourly partitions for efficient querying.
+              """
+              # Process metrics for each region
               for region, metrics in region_metrics.items():
                   if not metrics:
                       print(f"No metrics to process for region {region}")
                       continue
 
-                  # Group metrics by their unique timestamp keys
+                  # Group metrics by hourly timestamps for efficient storage
                   timestamp_grouped_metrics = {}
                   for metric in metrics:
                       timestamp = datetime.strptime(metric['timestamp'], "%Y-%m-%d %H:%M:%S%z")
@@ -363,33 +392,33 @@ Resources:
                       day = timestamp.strftime('%d')
                       hour = timestamp.strftime('%H')
 
-                      # Create a unique key for each timestamp
+                      # Create partition key for S3 organization
                       timestamp_key = f"{year}/{month}/{day}/{hour}"
 
                       if timestamp_key not in timestamp_grouped_metrics:
                           timestamp_grouped_metrics[timestamp_key] = []
 
                       timestamp_grouped_metrics[timestamp_key].append(metric)
 
-                  # Write metrics for each unique timestamp
+                  # Write each hourly batch to S3 as separate Parquet files
                   for timestamp_key, grouped_metrics in timestamp_grouped_metrics.items():
                       year, month, day, hour = timestamp_key.split('/')
 
-                      # Create S3 key for this specific timestamp group with payer_id partitioning
+                      # S3 key with Hive-style partitioning for Athena compatibility
                       s3_key = f"{metrics_s3_prefix}/payer_id={payer_id}/account_id={account_id}/region={region}/year={year}/month={month}/day={day}/hour={hour}/metrics.parquet"
                       print(f"Writing metrics to S3 key: {s3_key}")
                       print(f"Total number of metrics: {len(grouped_metrics)}")
 
-                      # Convert to Arrow table
+                      # Convert to Apache Arrow table for efficient Parquet storage
                       table = pa.Table.from_pylist(grouped_metrics)
 
-                      # Write to Parquet format
+                      # Serialize to Parquet format in memory
                       buf = BytesIO()
                       pq.write_table(table, buf)
                       buf.seek(0)
 
-                      # Upload to S3
+                      # Upload to S3 bucket
                       s3_client.put_object(
                           Bucket=os.environ['METRICS_BUCKET'],
                           Key=s3_key,
@@ -403,6 +432,7 @@ Resources:
           METRICS_S3_PREFIX: 'rds-multitenant'
           ROLENAME: !Ref MultiAccountRoleName
 
+  # EventBridge Scheduler to run the data collection hourly
   ModuleRefreshSchedule:
     Type: 'AWS::Scheduler::Schedule'
     Properties:
@@ -418,13 +448,14 @@ Resources:
         RoleArn: !Ref SchedulerExecutionRoleARN
         Input: !Sub '{"module_lambda":"${RDSPerformanceInsightsFnHourly.Arn}","crawlers": ["${ResourcePrefix}PerformanceInsightsRDSCrawler", "${ResourcePrefix}PerformanceInsightsRDSCrawlerHourly"]}'
 
+  # Glue Database to store CloudWatch Database Insights table metadata
   GlueDatabase:
     Type: AWS::Glue::Database
     Properties:
       CatalogId: !Ref AWS::AccountId
       DatabaseInput:
         Name: rds_performance_insights_db
-        Description: Database for RDS Performance Insights data
+        Description: Database for CloudWatch Database Insights multi-tenant cost allocation data
 
 
 
@@ -452,6 +483,7 @@ Resources:
                   - s3:PutObject
                 Resource: !Sub "${DestinationBucketARN}/*"
 
+  # Glue Crawler to automatically discover and catalog the Parquet data schema
   PerformanceInsightsRDSCrawler:
     Type: AWS::Glue::Crawler
     Properties:
@@ -527,6 +559,7 @@ Resources:
                   - !Sub "${DestinationBucketARN}"
                   - !Sub "${DestinationBucketARN}/*"
 
+  # Lambda function to create Athena views for cost allocation analysis
   CreateAthenaViewsLambda:
     Type: AWS::Lambda::Function
     Properties:
@@ -644,6 +677,7 @@ Resources:
           GlueDatabase: !Ref GlueDatabase
           S3OutputLocation: !Sub 's3://${DestinationBucket}/athena_output/'
 
+  # Step Function to orchestrate the data collection workflow across accounts
   ModuleStepFunction:
     Type: AWS::StepFunctions::StateMachine
     Properties: