Skip to content

Commit 6e590ad

Browse files
Merge pull request #59 from AwsAndrewf/feature/spread-placement-support
feat: add spread placement group support with --placement flag
2 parents b82db3d + 919d675 commit 6e590ad

File tree

4 files changed

+137
-57
lines changed

4 files changed

+137
-57
lines changed

deployment/cdk/lib/base-latency-hunting-stack.ts

Lines changed: 47 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -35,12 +35,19 @@ export interface BaseLatencyHuntingStackProps extends cdk.StackProps {
3535
* Base class for latency hunting stacks.
3636
* Implements the correct approach for creating instances across diverse instance types
3737
* with proper state tracking, resilient Lambda-based provisioning, and cleanup.
38+
*
39+
* Supports CDK context parameters:
40+
* placementStrategy - 'cluster' (default) or 'spread'
41+
* maxInstances - cap on number of instances to deploy (0 = no limit)
42+
* Automatically set to 7 for spread placement by the deploy script.
3843
*/
3944
export abstract class BaseLatencyHuntingStack extends cdk.Stack {
4045
protected readonly stateTable: dynamodb.Table;
4146
protected readonly instanceCreatorLambda: lambda.Function;
4247
protected readonly provider: Provider;
4348
protected readonly elasticIps?: string[];
49+
protected readonly placementStrategy: string;
50+
protected readonly maxInstances: number;
4451

4552
constructor(scope: cdk.App, id: string, props: BaseLatencyHuntingStackProps) {
4653
super(scope, id, props);
@@ -49,6 +56,15 @@ export abstract class BaseLatencyHuntingStack extends cdk.Stack {
4956
const managedByTag = props.managedByTag || 'CDK-LatencyHunting';
5057
this.elasticIps = props.elasticIps;
5158

59+
// Read placement strategy from CDK context (passed by deploy script --placement flag)
60+
// Valid values: 'cluster' (default) or 'spread'
61+
this.placementStrategy = this.node.tryGetContext('placementStrategy') || 'cluster';
62+
63+
// Read max instances cap from CDK context
64+
// When spread placement is selected, the deploy script sets this to 7
65+
// to respect the AWS limit of 7 instances per AZ per spread placement group
66+
this.maxInstances = parseInt(this.node.tryGetContext('maxInstances') || '0', 10);
67+
5268
// Create DynamoDB table for instance state tracking
5369
this.stateTable = new dynamodb.Table(this, 'InstanceStateTable', {
5470
partitionKey: { name: 'InstanceType', type: dynamodb.AttributeType.STRING },
@@ -423,7 +439,7 @@ def handler(event, context):
423439
ami_id = sorted_images[0]['ImageId']
424440
print(f"Auto-detected AMI: {ami_id}")
425441
426-
user_data_script = '''#!/bin/bash
442+
user_data_script = """#!/bin/bash
427443
set -e
428444
exec > >(tee /var/log/user-data.log)
429445
exec 2>&1
@@ -443,7 +459,7 @@ fi
443459
chown -R ec2-user:ec2-user /home/ec2-user/benchmark
444460
touch /home/ec2-user/benchmark/setup_complete
445461
echo "EC2 Hunting setup completed at $(date)"
446-
'''
462+
"""
447463
448464
run_params = {
449465
'ImageId': ami_id,
@@ -614,8 +630,33 @@ echo "EC2 Hunting setup completed at $(date)"
614630
}
615631

616632
/**
617-
* Create instances for the provided instance configurations
618-
* @param instances Array of instance configurations with unique IDs
633+
* Apply the maxInstances cap to an instance list.
634+
* When spread placement is selected, the deploy script sets maxInstances=7
635+
* to respect the AWS limit of 7 instances per AZ per spread placement group.
636+
*
637+
* Call this BEFORE passing instances to createInstances() and addCommonOutputs()
638+
* so both methods operate on the same capped list.
639+
*
640+
* @param instances Full array of instance configurations
641+
* @returns Capped array (or the original if no cap is set)
642+
*/
643+
protected applyInstanceCap(instances: InstanceConfig[]): InstanceConfig[] {
  // Snapshot the values the decision depends on.
  const limit = this.maxInstances;
  const total = instances.length;

  // No cap applies when the limit is not a positive number (0 means "no limit"),
  // or when the list already fits. In both cases the caller gets back the very
  // same array, not a copy.
  const capActive = limit > 0;
  const overLimit = total > limit;
  if (!capActive || !overLimit) {
    return instances;
  }

  // Report the truncation at synth time so it is visible in the deploy output.
  const message = `Capping instance count from ${total} to ${limit} (placement strategy: ${this.placementStrategy})`;
  console.log(message);

  // Keep the first `limit` entries, preserving their original order.
  return instances.slice(0, limit);
}
653+
654+
/**
655+
* Create instances for the provided instance configurations.
656+
*
657+
* The placement group strategy is read from CDK context ('cluster' or 'spread').
658+
*
659+
* @param instances Array of instance configurations with unique IDs (should already be capped via applyInstanceCap)
619660
* @param vpc VPC to deploy instances in
620661
* @param securityGroup Security group for instances
621662
* @param keyPair Key pair for SSH access
@@ -650,12 +691,12 @@ echo "EC2 Hunting setup completed at $(date)"
650691

651692
// Create instances using explicit IDs
652693
instances.forEach((instance, index) => {
653-
// Create unique placement group for this instance
694+
// Create unique placement group for this instance using the configured strategy
654695
const placementGroup = new CfnPlacementGroup(
655696
this,
656697
`PlacementGroup-${instance.id}`,
657698
{
658-
strategy: 'cluster'
699+
strategy: this.placementStrategy
659700
}
660701
);
661702
placementGroup.applyRemovalPolicy(RemovalPolicy.DESTROY);
@@ -800,23 +841,6 @@ echo "EC2 Hunting setup completed at $(date)"
800841
{ id: 'arm-c8g-8', instanceType: 'c8g.48xlarge' },
801842
{ id: 'arm-c8g-9', instanceType: 'c8g.metal-24xl' },
802843
{ id: 'arm-c8g-10', instanceType: 'c8g.metal-48xl' },
803-
804-
// // Current generation - Intel (auto-detected AMI)
805-
// { id: 'intel-c7i-1', instanceType: 'c7i.xlarge' },
806-
// { id: 'intel-c7i-2', instanceType: 'c7i.xlarge' },
807-
// { id: 'intel-c7i-3', instanceType: 'c7i.xlarge' },
808-
// { id: 'intel-c6i-1', instanceType: 'c6i.xlarge' },
809-
// { id: 'intel-c6i-2', instanceType: 'c6i.xlarge' },
810-
// { id: 'intel-c6i-3', instanceType: 'c6i.xlarge' },
811-
812-
// // Current generation - Graviton ARM (auto-detected AMI)
813-
// { id: 'arm-c8g-1', instanceType: 'c8g.xlarge' },
814-
// { id: 'arm-c8g-2', instanceType: 'c8g.xlarge' },
815-
// { id: 'arm-c8g-3', instanceType: 'c8g.xlarge' },
816-
// { id: 'arm-c8g-4', instanceType: 'c8g.xlarge' },
817-
// { id: 'arm-c8g-5', instanceType: 'c8g.xlarge' },
818-
// { id: 'arm-c8g-6', instanceType: 'c8g.xlarge' },
819-
// { id: 'arm-c8g-7', instanceType: 'c8g.xlarge' },
820844
];
821845
}
822846
}

deployment/cdk/lib/latency-hunting-byovpc-stack.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,8 @@ export class LatencyHuntingBYOVPCStack extends BaseLatencyHuntingStack {
3939
throw new Error('keyPairName is required for BYOVPC stack');
4040
}
4141

42-
// Use default instances from base class
43-
const instances = this.getDefaultInstances();
42+
// Get instances from base class and apply cap (e.g., 7 for spread placement)
43+
const instances = this.applyInstanceCap(this.getDefaultInstances());
4444

4545
// Import existing VPC
4646
const vpc = Vpc.fromLookup(this, 'ExistingVpc', {
@@ -95,7 +95,7 @@ export class LatencyHuntingBYOVPCStack extends BaseLatencyHuntingStack {
9595
this.elasticIps
9696
);
9797

98-
// Add outputs using base class method
98+
// Add outputs using base class method (uses same capped list)
9999
this.addCommonOutputs(
100100
instances.map(i => i.instanceType),
101101
props.vpcId,

deployment/cdk/lib/latency-hunting-stack.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ export class LatencyHuntingStack extends BaseLatencyHuntingStack {
3131
const vpcCidr = props?.vpcCidr || '10.100.0.0/16'; // Default non-standard CIDR
3232
const vpcId = props?.vpcId;
3333

34-
// Get instances from overridden method
35-
const instances = this.getDefaultInstances();
34+
// Get instances from base class and apply cap (e.g., 7 for spread placement)
35+
const instances = this.applyInstanceCap(this.getDefaultInstances());
3636

3737
// Use existing VPC or create new one
3838
let vpc: cdk.aws_ec2.IVpc;
@@ -155,7 +155,7 @@ export class LatencyHuntingStack extends BaseLatencyHuntingStack {
155155
this.elasticIps
156156
);
157157

158-
// Add outputs using base class method
158+
// Add outputs using base class method (uses same capped list)
159159
this.addCommonOutputs(
160160
instances.map(i => i.instanceType),
161161
vpc.vpcId,

deployment/deploy-latency-hunting.sh

Lines changed: 84 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Latency Hunting Deployment Script
44
# This script deploys diverse EC2 instance types with placement groups
55
# to find optimal network placement for latency-sensitive applications
6+
# Supports cluster and spread placement group strategies via --placement flag.
67

78
set -e
89

@@ -22,6 +23,7 @@ SECURITY_GROUP_ID="" # Optional for BYOVPC mode
2223
USE_EXISTING_VPC="false" # Set to true to use BYOVPC stack
2324
ELASTIC_IPS="" # Optional: comma-separated list of Elastic IP allocation IDs
2425
STACK_NAME="LatencyHuntingStack"
26+
PLACEMENT_STRATEGY="cluster" # Default: cluster placement group
2527

2628
# Function to print colored output
2729
print_info() {
@@ -52,30 +54,40 @@ OPTIONS:
5254
-g, --security-group-id SG_ID Security group ID (optional, creates one if not provided)
5355
-e, --elastic-ips EIP_LIST Comma-separated Elastic IP allocation IDs (optional)
5456
--use-existing-vpc Use BYOVPC stack (never creates/manages VPC)
57+
-p, --placement STRATEGY Placement group strategy: 'cluster' or 'spread' (default: cluster)
58+
cluster - packs instances close together in a single AZ
59+
for lowest inter-node latency
60+
spread - distributes instances across distinct hardware
61+
for maximum fault isolation (max 7 instances
62+
per AZ per group)
5563
-h, --help Show this help message
5664
5765
EXAMPLES:
58-
# Deploy with CDK-managed VPC (creates new VPC)
59-
$0 --region ap-northeast-1 --key-pair my-keypair
66+
# Deploy with cluster placement (default, lowest latency)
67+
$0 --region ap-northeast-1 --key-pair my-keypair --placement cluster
6068
61-
# Use existing VPC (RECOMMENDED - never manages VPC)
62-
$0 --use-existing-vpc \
63-
--region ap-northeast-1 \
64-
--vpc-id vpc-02393b8e30c6e3e5d \
65-
--subnet-id subnet-xxxxx \
66-
--key-pair tokyo_keypair
69+
# Deploy with spread placement (fault isolation)
70+
$0 --region ap-northeast-1 --key-pair my-keypair --placement spread
71+
72+
# Use existing VPC with spread placement
73+
$0 --use-existing-vpc \\
74+
--region ap-northeast-1 \\
75+
--vpc-id vpc-02393b8e30c6e3e5d \\
76+
--subnet-id subnet-xxxxx \\
77+
--key-pair tokyo_keypair \\
78+
--placement spread
6779
6880
# With existing security group
69-
$0 --use-existing-vpc \
70-
--region ap-northeast-1 \
71-
--vpc-id vpc-02393b8e30c6e3e5d \
72-
--subnet-id subnet-xxxxx \
73-
--security-group-id sg-xxxxx \
81+
$0 --use-existing-vpc \\
82+
--region ap-northeast-1 \\
83+
--vpc-id vpc-02393b8e30c6e3e5d \\
84+
--subnet-id subnet-xxxxx \\
85+
--security-group-id sg-xxxxx \\
7486
--key-pair my-keypair
7587
7688
# With Elastic IPs (first N instances get EIPs, rest get regular public IPs)
77-
$0 --region eu-central-1 \
78-
--key-pair frankfurt \
89+
$0 --region eu-central-1 \\
90+
--key-pair frankfurt \\
7991
--elastic-ips eipalloc-12345678,eipalloc-87654321,eipalloc-abcdef01
8092
8193
EOF
@@ -117,6 +129,10 @@ while [[ $# -gt 0 ]]; do
117129
USE_EXISTING_VPC="true"
118130
shift
119131
;;
132+
-p|--placement)
133+
PLACEMENT_STRATEGY="$2"
134+
shift 2
135+
;;
120136
-h|--help)
121137
usage
122138
;;
@@ -127,8 +143,32 @@ while [[ $# -gt 0 ]]; do
127143
esac
128144
done
129145

146+
# Validate placement strategy
147+
case "$PLACEMENT_STRATEGY" in
148+
cluster|spread)
149+
;;
150+
*)
151+
print_error "Invalid placement strategy: '$PLACEMENT_STRATEGY'. Must be 'cluster' or 'spread'."
152+
exit 1
153+
;;
154+
esac
155+
156+
# Warn about spread placement group limitations and enforce instance cap
157+
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
158+
print_warning "Spread placement selected. Note the following constraints:"
159+
print_warning " - Maximum of 7 running instances per AZ per placement group"
160+
print_warning " - Instances are placed on distinct underlying hardware (racks)"
161+
print_warning " - Inter-instance latency may be higher than cluster placement"
162+
print_warning " - Deployment will be capped at 7 instances to stay within the limit"
163+
print_warning ""
164+
MAX_INSTANCES=7
165+
else
166+
MAX_INSTANCES=0 # 0 = no limit
167+
fi
168+
130169
print_info "Starting Latency Hunting Deployment"
131170
print_info "================================"
171+
print_info "Placement Strategy: $PLACEMENT_STRATEGY"
132172

133173
if [ "$USE_EXISTING_VPC" = "true" ]; then
134174
print_info "Mode: BYOVPC (Bring Your Own VPC)"
@@ -145,7 +185,7 @@ if [ "$USE_EXISTING_VPC" = "true" ]; then
145185
print_info "Elastic IPs: $ELASTIC_IPS"
146186
fi
147187
STACK_NAME="LatencyHuntingBYOVPCStack"
148-
188+
149189
# Validate required parameters
150190
if [ -z "$VPC_ID" ]; then
151191
print_error "VPC ID is required when using --use-existing-vpc"
@@ -206,6 +246,11 @@ fi
206246
CDK_CONTEXT="--context deploymentType=latency-hunting"
207247
CDK_CONTEXT="$CDK_CONTEXT --context region=$REGION"
208248
CDK_CONTEXT="$CDK_CONTEXT --context keyPairName=$KEY_PAIR_NAME"
249+
CDK_CONTEXT="$CDK_CONTEXT --context placementStrategy=$PLACEMENT_STRATEGY"
250+
if [ "$MAX_INSTANCES" -gt 0 ]; then
251+
CDK_CONTEXT="$CDK_CONTEXT --context maxInstances=$MAX_INSTANCES"
252+
print_info "Instance cap: $MAX_INSTANCES (spread placement limit)"
253+
fi
209254

210255
# Add Elastic IPs if provided
211256
if [ -n "$ELASTIC_IPS" ]; then
@@ -230,33 +275,37 @@ else
230275
fi
231276

232277
# Deploy the stack
233-
print_info "Deploying Latency Hunting Stack..."
234-
print_info "This will launch multiple instance types with placement groups"
278+
print_info "Deploying Latency Hunting Stack (placement: $PLACEMENT_STRATEGY)..."
279+
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
280+
print_info "Instances will be distributed across distinct hardware for fault isolation"
281+
else
282+
print_info "Instances will be packed together in a single AZ for lowest latency"
283+
fi
235284
print_info "Some instance types may fail due to capacity constraints - this is expected"
236285
print_info ""
237286

238287
if cdk deploy $STACK_NAME $CDK_CONTEXT --require-approval never; then
239288
print_info ""
240-
print_info "✅ Deployment successful!"
289+
print_info "✅ Deployment successful! (placement: $PLACEMENT_STRATEGY)"
241290
print_info ""
242-
291+
243292
# Get stack outputs
244293
print_info "Fetching stack outputs..."
245294
OUTPUTS=$(aws cloudformation describe-stacks \
246295
--stack-name "$STACK_NAME" \
247296
--region "$REGION" \
248297
--query 'Stacks[0].Outputs' \
249298
--output json)
250-
299+
251300
# Create output directory and save outputs to file
252301
mkdir -p "$SCRIPT_DIR/latency-hunting"
253302
OUTPUT_FILE="$SCRIPT_DIR/latency-hunting/deployment-outputs.json"
254303
echo "$OUTPUTS" > "$OUTPUT_FILE"
255304
print_info "Stack outputs saved to: $OUTPUT_FILE"
256-
305+
257306
# Get total instance types attempted
258307
TOTAL_TYPES=$(echo "$OUTPUTS" | jq -r '.[] | select(.OutputKey=="TotalInstanceTypes") | .OutputValue')
259-
308+
260309
# Query actual instances via EC2 API using tags
261310
print_info ""
262311
print_info "Querying deployed instances..."
@@ -265,29 +314,36 @@ if cdk deploy $STACK_NAME $CDK_CONTEXT --require-approval never; then
265314
--filters "Name=tag:Architecture,Values=latency-hunting" "Name=instance-state-name,Values=pending,running" \
266315
--query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name,PublicIpAddress]' \
267316
--output json)
268-
317+
269318
SUCCESS_COUNT=$(echo "$INSTANCES" | jq '. | length')
270-
319+
271320
print_info ""
272321
print_info "Summary:"
322+
print_info " Placement strategy: $PLACEMENT_STRATEGY"
273323
print_info " Total instance types attempted: $TOTAL_TYPES"
274324
print_info " Successfully launched: $SUCCESS_COUNT"
275325
print_info " Failed (capacity/compatibility issues): $((TOTAL_TYPES - SUCCESS_COUNT))"
326+
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
327+
print_info " (capped at 7 for spread placement)"
328+
fi
276329
print_info ""
277-
330+
278331
# Show successful instances
279332
print_info "Successfully launched instances:"
280333
echo "$INSTANCES" | jq -r '.[] | "\(.[1]) (\(.[2])): \(.[0]) - \(.[3] // "pending")"' | while read line; do
281334
print_info " $line"
282335
done
283-
336+
284337
print_info ""
285338
print_info "Next steps:"
286339
print_info " 1. Wait for instances to fully initialize (2-3 minutes)"
287340
print_info " 2. Run latency tests: ./latency-hunting/run-hunting-tests.sh"
288341
print_info " 3. Analyze results: ./latency-hunting/analyze-hunting-results.sh"
289-
342+
290343
else
291344
print_error "Deployment failed. Check the error messages above."
345+
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
346+
print_warning "If you hit the 7-instance-per-AZ limit, try reducing instance types or using multiple AZs."
347+
fi
292348
exit 1
293349
fi

0 commit comments

Comments
 (0)