
Commit 0c37148

docs: Add comprehensive comments to RDS multitenant module
- Add docstrings to all Lambda functions explaining their purpose
- Document multi-tenant cost allocation logic and dimension types
- Explain engine compatibility limitations for Oracle/SQL Server
- Add CloudFormation template comments for key resources
- Document S3 partitioning strategy and Parquet storage optimization
- Improve code maintainability and understanding
1 parent 78806ab commit 0c37148

File tree

1 file changed (+60 −26 lines)
data-collection/deploy/module-rds-multitenant.yaml

Lines changed: 60 additions & 26 deletions
@@ -1,5 +1,8 @@
 AWSTemplateFormatVersion: '2010-09-09'
-Description: 'RDS Multi-Tenant Cost Visibility Module'
+Description: |
+  RDS Multi-Tenant Cost Visibility Module
+  Collects CloudWatch Database Insights metrics to enable cost allocation for multi-tenant RDS instances.
+  Supports both user-level and database-level metric collection across all RDS engines.
 
 Parameters:
   DatabaseName:
@@ -68,6 +71,7 @@ Parameters:
     Description: Arn of the Account Collector Lambda
 
 Resources:
+  # IAM Role for the Lambda function to collect CloudWatch Database Insights metrics
   RDSMetricsLambdaRole:
     Type: AWS::IAM::Role
     Properties:
@@ -110,6 +114,7 @@ Resources:
               - s3:PutObject
             Resource: !Sub "${DestinationBucketARN}/*"
 
+  # Lambda function that collects CloudWatch Database Insights metrics for multi-tenant cost allocation
   RDSPerformanceInsightsFnHourly:
     Type: AWS::Lambda::Function
     Properties:
@@ -152,6 +157,10 @@ Resources:
 
 
           def lambda_handler(event, context):
+              """
+              Main Lambda handler for collecting CloudWatch Database Insights metrics from RDS instances.
+              Enables multi-tenant cost allocation by collecting db.load metrics by user and database dimensions.
+              """
               if 'account' not in event:
                   raise ValueError(
                       "Please do not trigger this Lambda manually."
@@ -164,7 +173,7 @@ Resources:
               account_name = account["account_name"]
               payer_id = account["payer_id"]
 
-              print(f"Collecting RDS Performance Insights data for account: {account_id}")
+              print(f"Collecting CloudWatch Database Insights data for account: {account_id}")
 
               # Get all active AWS regions
               ec2_client = boto3.client('ec2')
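
Note: the handler above refuses manual invocation and reads account_id, account_name, and payer_id from the "account" object supplied by the Account Collector. A minimal sketch of that payload shape, inferred only from the fields this diff reads; the real event may carry more keys, the account object may arrive as a JSON string in the deployed module, and all values here are hypothetical:

    event = {
        "account": {
            "account_id": "123456789012",       # hypothetical
            "account_name": "example-tenant",   # hypothetical
            "payer_id": "210987654321"          # hypothetical
        }
    }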
@@ -236,32 +245,43 @@ Resources:
 
 
           def should_collect_database_metrics(engine):
+              """
+              Determines if database dimension metrics are supported for the given engine.
+              Oracle and SQL Server engines don't support db.name dimension in CloudWatch Database Insights.
+              https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/USER_PerfInsights.UsingDashboard.Components.html#USER_PerfInsights.UsingDashboard.Components.AvgActiveSessions
+              """
               excluded_engines = ['oracle-ee', 'oracle-se2', 'oracle-se1', 'oracle-se',
                                   'sqlserver-ee', 'sqlserver-se', 'sqlserver-ex', 'sqlserver-web']
               return engine.lower() not in excluded_engines
 
           def get_performance_metrics(pi_client, instance_id, engine):
+              """
+              Retrieves CloudWatch Database Insights metrics for the specified RDS instance.
+              Collects CPU metrics and db.load metrics grouped by user and database dimensions.
+              """
               current_time = datetime.utcnow()
-              end_time = current_time.replace(minute=0, second=0, microsecond=0)
+              end_time = current_time.replace(minute=0, second=0, microsecond=0)  # Round to top of hour
               start_time = end_time - timedelta(hours=hour_delta)
 
+              # Base metrics: CPU utilization (no dimensions)
               metric_queries = [
                   {
-                      'Metric': 'os.general.numVCPUs.avg'
+                      'Metric': 'os.general.numVCPUs.avg'  # Used for cost allocation calculations
                   },
                   {
-                      'Metric': 'db.load.avg',
+                      'Metric': 'db.load.avg',  # Database load by user (tenant)
                       'GroupBy': {
                           'Group': 'db.user',
                           'Dimensions': ['db.user.name']
                       }
                   }
               ]
 
-              # Add database dimension query if engine supports it
+              # Add database dimension if supported by engine
+              # Oracle and SQL Server don't support database-level grouping
               if should_collect_database_metrics(engine):
                   metric_queries.append({
-                      'Metric': 'db.load.avg',
+                      'Metric': 'db.load.avg',  # Database load by database name
                       'GroupBy': {
                           'Group': 'db',
                           'Dimensions': ['db.name']
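
The API call that consumes metric_queries falls outside this hunk. A minimal sketch, assuming the module passes the queries to the Performance Insights GetResourceMetrics API (the service behind CloudWatch Database Insights) via boto3's 'pi' client; the hourly period and the DbiResourceId-style identifier are assumptions, not shown in this diff:

    # Sketch only: how the queries built above could be sent to Performance Insights.
    response = pi_client.get_resource_metrics(
        ServiceType='RDS',
        Identifier=instance_id,        # assumed to be the DbiResourceId (db-XXXXXXXX)
        MetricQueries=metric_queries,  # numVCPUs plus db.load.avg grouped by user/database
        StartTime=start_time,
        EndTime=end_time,
        PeriodInSeconds=3600           # hourly datapoints; assumed
    )
    metrics = response.get('MetricList', [])  # each item carries 'Key' and 'DataPoints', as used below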
@@ -281,42 +301,48 @@ Resources:
 
 
           def process_metrics(instance_id, instance_arn, metrics, region, engine):
+              """
+              Processes CloudWatch Database Insights metrics and flattens them for storage.
+              Creates records for each metric data point with proper dimension mapping.
+              """
               all_flattened_metrics = []
               num_cpus = ''
 
               print(f"Processing metrics for instance {instance_id} in region {region}")
               print(f"Total metrics received: {len(metrics)}")
 
               for metric in metrics:
-                  # Debug print for each metric
                   print(f"Processing metric: {metric['Key']['Metric']}")
 
+                  # Extract CPU count for cost allocation calculations
                   if metric["Key"]["Metric"] == 'os.general.numVCPUs.avg':
                       for datapoint in metric["DataPoints"]:
                           num_cpus = datapoint.get("Value", 0)
                           if num_cpus != '':
                               break
 
+                  # Process metrics with dimensions (user or database level)
                   if "Dimensions" in metric["Key"]:
                       dimensions = metric["Key"]["Dimensions"]
 
-                      # Determine dimension type based on available dimensions
+                      # Determine dimension type for multi-tenant cost allocation
                       if 'db.user.name' in dimensions:
-                          dimension_type = 'user'
+                          dimension_type = 'user'  # Cost allocation by database user
                       elif 'db.name' in dimensions:
-                          dimension_type = 'database'
+                          dimension_type = 'database'  # Cost allocation by database name
                       else:
                           dimension_type = 'unknown'
 
+                      # Create base record with all metadata
                       base_entry = {
                           "metric": metric["Key"]["Metric"],
                           "resourcearn": instance_arn,
                           "instance_id": instance_id,
-                          "engine": engine,
-                          "num_vcpus": num_cpus,
-                          "dimension_type": dimension_type,
-                          "db_user_name": dimensions.get('db.user.name', None),
-                          "db_database_name": dimensions.get('db.name', None)
+                          "engine": engine,  # Database engine type
+                          "num_vcpus": num_cpus,  # For cost calculations
+                          "dimension_type": dimension_type,  # user|database|unknown
+                          "db_user_name": dimensions.get('db.user.name', None),  # Tenant user
+                          "db_database_name": dimensions.get('db.name', None)  # Database name
                       }
 
                       for datapoint in metric["DataPoints"]:
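
The hunk ends at the datapoint loop, so the rest of the body is not visible here. A hedged sketch of how the flattening presumably continues; the timestamp/value field names and the timestamp format are inferred from the strptime() call in write_metrics_to_s3 later in this diff, not confirmed by this hunk:

    # Sketch only: merge each hourly datapoint with the shared base_entry metadata.
    for datapoint in metric["DataPoints"]:
        entry = dict(base_entry)
        entry["timestamp"] = datapoint["Timestamp"].strftime("%Y-%m-%d %H:%M:%S%z")
        entry["value"] = datapoint.get("Value", 0)
        all_flattened_metrics.append(entry)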
@@ -347,14 +373,17 @@ Resources:
               )
 
           def write_metrics_to_s3(s3_client, region_metrics, account_id, payer_id):
-
-              # Group metrics by their individual timestamps
+              """
+              Writes collected metrics to S3 in Parquet format with proper partitioning.
+              Groups metrics by timestamp and stores them in hourly partitions for efficient querying.
+              """
+              # Process metrics for each region
               for region, metrics in region_metrics.items():
                   if not metrics:
                       print(f"No metrics to process for region {region}")
                       continue
 
-                  # Group metrics by their unique timestamp keys
+                  # Group metrics by hourly timestamps for efficient storage
                   timestamp_grouped_metrics = {}
                   for metric in metrics:
                       timestamp = datetime.strptime(metric['timestamp'], "%Y-%m-%d %H:%M:%S%z")
@@ -363,33 +392,33 @@ Resources:
                       day = timestamp.strftime('%d')
                       hour = timestamp.strftime('%H')
 
-                      # Create a unique key for each timestamp
+                      # Create partition key for S3 organization
                       timestamp_key = f"{year}/{month}/{day}/{hour}"
 
                       if timestamp_key not in timestamp_grouped_metrics:
                           timestamp_grouped_metrics[timestamp_key] = []
 
                       timestamp_grouped_metrics[timestamp_key].append(metric)
 
-                  # Write metrics for each unique timestamp
+                  # Write each hourly batch to S3 as separate Parquet files
                   for timestamp_key, grouped_metrics in timestamp_grouped_metrics.items():
                       year, month, day, hour = timestamp_key.split('/')
 
-                      # Create S3 key for this specific timestamp group with payer_id partitioning
+                      # S3 key with Hive-style partitioning for Athena compatibility
                       s3_key = f"{metrics_s3_prefix}/payer_id={payer_id}/account_id={account_id}/region={region}/year={year}/month={month}/day={day}/hour={hour}/metrics.parquet"
 
                       print(f"Writing metrics to S3 key: {s3_key}")
                       print(f"Total number of metrics: {len(grouped_metrics)}")
 
-                      # Convert to Arrow table
+                      # Convert to Apache Arrow table for efficient Parquet storage
                       table = pa.Table.from_pylist(grouped_metrics)
 
-                      # Write to Parquet format
+                      # Serialize to Parquet format in memory
                       buf = BytesIO()
                       pq.write_table(table, buf)
                       buf.seek(0)
 
-                      # Upload to S3
+                      # Upload to S3 bucket
                       s3_client.put_object(
                           Bucket=os.environ['METRICS_BUCKET'],
                           Key=s3_key,
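
Because each hourly batch lands under a Hive-style prefix, a single account/region/hour can be read back directly. A minimal read-back sketch (not part of the module); the bucket name and all partition values are placeholders:

    import boto3
    import pyarrow.parquet as pq
    from io import BytesIO

    s3 = boto3.client('s3')
    # Placeholder key mirroring the s3_key format written above.
    key = ("rds-multitenant/payer_id=111111111111/account_id=222222222222/"
           "region=us-east-1/year=2024/month=01/day=15/hour=13/metrics.parquet")
    obj = s3.get_object(Bucket="<destination-bucket>", Key=key)  # placeholder bucket
    table = pq.read_table(BytesIO(obj['Body'].read()))
    print(table.to_pylist()[:3])  # first few flattened metric records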
@@ -403,6 +432,7 @@ Resources:
           METRICS_S3_PREFIX: 'rds-multitenant'
           ROLENAME: !Ref MultiAccountRoleName
 
+  # EventBridge Scheduler to run the data collection hourly
   ModuleRefreshSchedule:
     Type: 'AWS::Scheduler::Schedule'
     Properties:
@@ -418,13 +448,14 @@ Resources:
       RoleArn: !Ref SchedulerExecutionRoleARN
       Input: !Sub '{"module_lambda":"${RDSPerformanceInsightsFnHourly.Arn}","crawlers": ["${ResourcePrefix}PerformanceInsightsRDSCrawler", "${ResourcePrefix}PerformanceInsightsRDSCrawlerHourly"]}'
 
+  # Glue Database to store CloudWatch Database Insights table metadata
   GlueDatabase:
     Type: AWS::Glue::Database
     Properties:
       CatalogId: !Ref AWS::AccountId
       DatabaseInput:
         Name: rds_performance_insights_db
-        Description: Database for RDS Performance Insights data
+        Description: Database for CloudWatch Database Insights multi-tenant cost allocation data
 
 
 
@@ -452,6 +483,7 @@ Resources:
               - s3:PutObject
             Resource: !Sub "${DestinationBucketARN}/*"
 
+  # Glue Crawler to automatically discover and catalog the Parquet data schema
   PerformanceInsightsRDSCrawler:
     Type: AWS::Glue::Crawler
     Properties:
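
Once the crawler has cataloged the Parquet partitions into the Glue database above, the data is queryable from Athena. A hedged sketch using boto3; the table name is hypothetical (the real name is whatever the Glue crawler derives from the S3 prefix), the value/dimension_type column names follow the flattened records built in the Lambda above, and the output location mirrors the athena_output path seen later in this diff:

    import boto3

    athena = boto3.client('athena')
    query = """
        SELECT db_user_name, SUM(value) AS total_db_load
        FROM rds_multitenant                      -- hypothetical crawler-generated table name
        WHERE dimension_type = 'user'
          AND year = '2024' AND month = '01'      -- partition pruning on the Hive-style keys
        GROUP BY db_user_name
    """
    athena.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': 'rds_performance_insights_db'},
        ResultConfiguration={'OutputLocation': 's3://<destination-bucket>/athena_output/'}  # placeholder
    )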
@@ -527,6 +559,7 @@ Resources:
                   - !Sub "${DestinationBucketARN}"
                   - !Sub "${DestinationBucketARN}/*"
 
+  # Lambda function to create Athena views for cost allocation analysis
   CreateAthenaViewsLambda:
     Type: AWS::Lambda::Function
     Properties:
@@ -644,6 +677,7 @@ Resources:
       GlueDatabase: !Ref GlueDatabase
       S3OutputLocation: !Sub 's3://${DestinationBucket}/athena_output/'
 
+  # Step Function to orchestrate the data collection workflow across accounts
   ModuleStepFunction:
     Type: AWS::StepFunctions::StateMachine
     Properties:

0 commit comments
