Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 47 additions & 23 deletions deployment/cdk/lib/base-latency-hunting-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,19 @@ export interface BaseLatencyHuntingStackProps extends cdk.StackProps {
* Base class for latency hunting stacks.
* Implements the correct approach for creating instances across diverse instance types
* with proper state tracking, resilient Lambda-based provisioning, and cleanup.
*
* Supports CDK context parameters:
* placementStrategy - 'cluster' (default) or 'spread'
* maxInstances - cap on number of instances to deploy (0 = no limit)
* Automatically set to 7 for spread placement by the deploy script.
*/
export abstract class BaseLatencyHuntingStack extends cdk.Stack {
protected readonly stateTable: dynamodb.Table;
protected readonly instanceCreatorLambda: lambda.Function;
protected readonly provider: Provider;
protected readonly elasticIps?: string[];
protected readonly placementStrategy: string;
protected readonly maxInstances: number;

constructor(scope: cdk.App, id: string, props: BaseLatencyHuntingStackProps) {
super(scope, id, props);
Expand All @@ -49,6 +56,15 @@ export abstract class BaseLatencyHuntingStack extends cdk.Stack {
const managedByTag = props.managedByTag || 'CDK-LatencyHunting';
this.elasticIps = props.elasticIps;

// Read placement strategy from CDK context (passed by deploy script --placement flag)
// Valid values: 'cluster' (default) or 'spread'
this.placementStrategy = this.node.tryGetContext('placementStrategy') || 'cluster';

// Read max instances cap from CDK context (absent, 0, or non-numeric = no limit)
// When spread placement is selected, the deploy script sets this to 7
// to respect the AWS limit of 7 running instances per AZ per spread placement group
this.maxInstances = parseInt(this.node.tryGetContext('maxInstances') || '0', 10);

// Create DynamoDB table for instance state tracking
this.stateTable = new dynamodb.Table(this, 'InstanceStateTable', {
partitionKey: { name: 'InstanceType', type: dynamodb.AttributeType.STRING },
Expand Down Expand Up @@ -423,7 +439,7 @@ def handler(event, context):
ami_id = sorted_images[0]['ImageId']
print(f"Auto-detected AMI: {ami_id}")

user_data_script = '''#!/bin/bash
user_data_script = """#!/bin/bash
set -e
exec > >(tee /var/log/user-data.log)
exec 2>&1
Expand All @@ -443,7 +459,7 @@ fi
chown -R ec2-user:ec2-user /home/ec2-user/benchmark
touch /home/ec2-user/benchmark/setup_complete
echo "EC2 Hunting setup completed at $(date)"
'''
"""

run_params = {
'ImageId': ami_id,
Expand Down Expand Up @@ -614,8 +630,33 @@ echo "EC2 Hunting setup completed at $(date)"
}

/**
* Create instances for the provided instance configurations
* @param instances Array of instance configurations with unique IDs
* Apply the maxInstances cap to an instance list.
* When spread placement is selected, the deploy script sets maxInstances=7
* to respect the AWS limit of 7 instances per AZ per spread placement group.
*
* Call this BEFORE passing instances to createInstances() and addCommonOutputs()
* so both methods operate on the same capped list.
*
* @param instances Full array of instance configurations
* @returns Capped array (or the original if no cap is set)
*/
protected applyInstanceCap(instances: InstanceConfig[]): InstanceConfig[] {
  // A cap that is not a positive number means "no limit"; a list that already
  // fits within the cap is handed back untouched (same array reference).
  const cap = this.maxInstances;
  const exceedsCap = cap > 0 && instances.length > cap;
  if (!exceedsCap) {
    return instances;
  }

  // NOTE(review): createInstances() builds a separate placement group per
  // instance id, so the per-group spread limit may never actually be reached —
  // confirm that truncating the list is still the intended behavior.

  // Report the truncation at synth time so it shows up in the deploy output.
  const strategyNote = `(placement strategy: ${this.placementStrategy})`;
  console.log(`Capping instance count from ${instances.length} to ${cap} ${strategyNote}`);

  // Keep the first `cap` entries in their original order; the input is not mutated.
  return instances.slice(0, cap);
}

/**
* Create instances for the provided instance configurations.
*
* The placement group strategy is read from CDK context ('cluster' or 'spread').
*
* @param instances Array of instance configurations with unique IDs (should already be capped via applyInstanceCap)
* @param vpc VPC to deploy instances in
* @param securityGroup Security group for instances
* @param keyPair Key pair for SSH access
Expand Down Expand Up @@ -650,12 +691,12 @@ echo "EC2 Hunting setup completed at $(date)"

// Create instances using explicit IDs
instances.forEach((instance, index) => {
// Create unique placement group for this instance
// Create unique placement group for this instance using the configured strategy
const placementGroup = new CfnPlacementGroup(
this,
`PlacementGroup-${instance.id}`,
{
strategy: 'cluster'
strategy: this.placementStrategy
}
);
placementGroup.applyRemovalPolicy(RemovalPolicy.DESTROY);
Expand Down Expand Up @@ -800,23 +841,6 @@ echo "EC2 Hunting setup completed at $(date)"
{ id: 'arm-c8g-8', instanceType: 'c8g.48xlarge' },
{ id: 'arm-c8g-9', instanceType: 'c8g.metal-24xl' },
{ id: 'arm-c8g-10', instanceType: 'c8g.metal-48xl' },

// // Current generation - Intel (auto-detected AMI)
// { id: 'intel-c7i-1', instanceType: 'c7i.xlarge' },
// { id: 'intel-c7i-2', instanceType: 'c7i.xlarge' },
// { id: 'intel-c7i-3', instanceType: 'c7i.xlarge' },
// { id: 'intel-c6i-1', instanceType: 'c6i.xlarge' },
// { id: 'intel-c6i-2', instanceType: 'c6i.xlarge' },
// { id: 'intel-c6i-3', instanceType: 'c6i.xlarge' },

// // Current generation - Graviton ARM (auto-detected AMI)
// { id: 'arm-c8g-1', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-2', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-3', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-4', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-5', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-6', instanceType: 'c8g.xlarge' },
// { id: 'arm-c8g-7', instanceType: 'c8g.xlarge' },
];
}
}
6 changes: 3 additions & 3 deletions deployment/cdk/lib/latency-hunting-byovpc-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ export class LatencyHuntingBYOVPCStack extends BaseLatencyHuntingStack {
throw new Error('keyPairName is required for BYOVPC stack');
}

// Use default instances from base class
const instances = this.getDefaultInstances();
// Get instances from base class and apply cap (e.g., 7 for spread placement)
const instances = this.applyInstanceCap(this.getDefaultInstances());

// Import existing VPC
const vpc = Vpc.fromLookup(this, 'ExistingVpc', {
Expand Down Expand Up @@ -95,7 +95,7 @@ export class LatencyHuntingBYOVPCStack extends BaseLatencyHuntingStack {
this.elasticIps
);

// Add outputs using base class method
// Add outputs using base class method (uses same capped list)
this.addCommonOutputs(
instances.map(i => i.instanceType),
props.vpcId,
Expand Down
6 changes: 3 additions & 3 deletions deployment/cdk/lib/latency-hunting-stack.ts
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ export class LatencyHuntingStack extends BaseLatencyHuntingStack {
const vpcCidr = props?.vpcCidr || '10.100.0.0/16'; // Default non-standard CIDR
const vpcId = props?.vpcId;

// Get instances from overridden method
const instances = this.getDefaultInstances();
// Get instances from base class and apply cap (e.g., 7 for spread placement)
const instances = this.applyInstanceCap(this.getDefaultInstances());

// Use existing VPC or create new one
let vpc: cdk.aws_ec2.IVpc;
Expand Down Expand Up @@ -155,7 +155,7 @@ export class LatencyHuntingStack extends BaseLatencyHuntingStack {
this.elasticIps
);

// Add outputs using base class method
// Add outputs using base class method (uses same capped list)
this.addCommonOutputs(
instances.map(i => i.instanceType),
vpc.vpcId,
Expand Down
112 changes: 84 additions & 28 deletions deployment/deploy-latency-hunting.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
# Latency Hunting Deployment Script
# This script deploys diverse EC2 instance types with placement groups
# to find optimal network placement for latency-sensitive applications
# Supports cluster and spread placement group strategies via --placement flag.

set -e

Expand All @@ -22,6 +23,7 @@ SECURITY_GROUP_ID="" # Optional for BYOVPC mode
USE_EXISTING_VPC="false" # Set to true to use BYOVPC stack
ELASTIC_IPS="" # Optional: comma-separated list of Elastic IP allocation IDs
STACK_NAME="LatencyHuntingStack"
PLACEMENT_STRATEGY="cluster" # Default: cluster placement group

# Function to print colored output
print_info() {
Expand Down Expand Up @@ -52,30 +54,40 @@ OPTIONS:
-g, --security-group-id SG_ID Security group ID (optional, creates one if not provided)
-e, --elastic-ips EIP_LIST Comma-separated Elastic IP allocation IDs (optional)
--use-existing-vpc Use BYOVPC stack (never creates/manages VPC)
-p, --placement STRATEGY Placement group strategy: 'cluster' or 'spread' (default: cluster)
cluster - packs instances close together in a single AZ
for lowest inter-node latency
spread - distributes instances across distinct hardware
for maximum fault isolation (max 7 instances
per AZ per group)
-h, --help Show this help message

EXAMPLES:
# Deploy with CDK-managed VPC (creates new VPC)
$0 --region ap-northeast-1 --key-pair my-keypair
# Deploy with cluster placement (default, lowest latency)
$0 --region ap-northeast-1 --key-pair my-keypair --placement cluster

# Use existing VPC (RECOMMENDED - never manages VPC)
$0 --use-existing-vpc \
--region ap-northeast-1 \
--vpc-id vpc-02393b8e30c6e3e5d \
--subnet-id subnet-xxxxx \
--key-pair tokyo_keypair
# Deploy with spread placement (fault isolation)
$0 --region ap-northeast-1 --key-pair my-keypair --placement spread

# Use existing VPC with spread placement
$0 --use-existing-vpc \\
--region ap-northeast-1 \\
--vpc-id vpc-02393b8e30c6e3e5d \\
--subnet-id subnet-xxxxx \\
--key-pair tokyo_keypair \\
--placement spread

# With existing security group
$0 --use-existing-vpc \
--region ap-northeast-1 \
--vpc-id vpc-02393b8e30c6e3e5d \
--subnet-id subnet-xxxxx \
--security-group-id sg-xxxxx \
$0 --use-existing-vpc \\
--region ap-northeast-1 \\
--vpc-id vpc-02393b8e30c6e3e5d \\
--subnet-id subnet-xxxxx \\
--security-group-id sg-xxxxx \\
--key-pair my-keypair

# With Elastic IPs (first N instances get EIPs, rest get regular public IPs)
$0 --region eu-central-1 \
--key-pair frankfurt \
$0 --region eu-central-1 \\
--key-pair frankfurt \\
--elastic-ips eipalloc-12345678,eipalloc-87654321,eipalloc-abcdef01

EOF
Expand Down Expand Up @@ -117,6 +129,10 @@ while [[ $# -gt 0 ]]; do
USE_EXISTING_VPC="true"
shift
;;
-p|--placement)
PLACEMENT_STRATEGY="$2"
shift 2
;;
-h|--help)
usage
;;
Expand All @@ -127,8 +143,32 @@ while [[ $# -gt 0 ]]; do
esac
done

# Validate placement strategy: only 'cluster' and 'spread' are accepted;
# anything else aborts before any deployment work starts.
if [[ "$PLACEMENT_STRATEGY" != "cluster" && "$PLACEMENT_STRATEGY" != "spread" ]]; then
    print_error "Invalid placement strategy: '$PLACEMENT_STRATEGY'. Must be 'cluster' or 'spread'."
    exit 1
fi

# Instance cap handed to CDK later in the script. 0 = no limit (cluster placement).
MAX_INSTANCES=0

# Spread placement: surface its constraints to the operator and cap the deployment at 7.
if [[ "$PLACEMENT_STRATEGY" == "spread" ]]; then
    print_warning "Spread placement selected. Note the following constraints:"
    print_warning "  - Maximum of 7 running instances per AZ per placement group"
    print_warning "  - Instances are placed on distinct underlying hardware (racks)"
    print_warning "  - Inter-instance latency may be higher than cluster placement"
    print_warning "  - Deployment will be capped at 7 instances to stay within the limit"
    print_warning ""
    MAX_INSTANCES=7
fi

print_info "Starting Latency Hunting Deployment"
print_info "================================"
print_info "Placement Strategy: $PLACEMENT_STRATEGY"

if [ "$USE_EXISTING_VPC" = "true" ]; then
print_info "Mode: BYOVPC (Bring Your Own VPC)"
Expand All @@ -145,7 +185,7 @@ if [ "$USE_EXISTING_VPC" = "true" ]; then
print_info "Elastic IPs: $ELASTIC_IPS"
fi
STACK_NAME="LatencyHuntingBYOVPCStack"

# Validate required parameters
if [ -z "$VPC_ID" ]; then
print_error "VPC ID is required when using --use-existing-vpc"
Expand Down Expand Up @@ -206,6 +246,11 @@ fi
CDK_CONTEXT="--context deploymentType=latency-hunting"
CDK_CONTEXT="$CDK_CONTEXT --context region=$REGION"
CDK_CONTEXT="$CDK_CONTEXT --context keyPairName=$KEY_PAIR_NAME"
CDK_CONTEXT="$CDK_CONTEXT --context placementStrategy=$PLACEMENT_STRATEGY"
if [ "$MAX_INSTANCES" -gt 0 ]; then
CDK_CONTEXT="$CDK_CONTEXT --context maxInstances=$MAX_INSTANCES"
print_info "Instance cap: $MAX_INSTANCES (spread placement limit)"
fi

# Add Elastic IPs if provided
if [ -n "$ELASTIC_IPS" ]; then
Expand All @@ -230,33 +275,37 @@ else
fi

# Deploy the stack
print_info "Deploying Latency Hunting Stack..."
print_info "This will launch multiple instance types with placement groups"
print_info "Deploying Latency Hunting Stack (placement: $PLACEMENT_STRATEGY)..."
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
print_info "Instances will be distributed across distinct hardware for fault isolation"
else
print_info "Instances will be packed together in a single AZ for lowest latency"
fi
print_info "Some instance types may fail due to capacity constraints - this is expected"
print_info ""

if cdk deploy $STACK_NAME $CDK_CONTEXT --require-approval never; then
print_info ""
print_info "✅ Deployment successful!"
print_info "✅ Deployment successful! (placement: $PLACEMENT_STRATEGY)"
print_info ""

# Get stack outputs
print_info "Fetching stack outputs..."
OUTPUTS=$(aws cloudformation describe-stacks \
--stack-name "$STACK_NAME" \
--region "$REGION" \
--query 'Stacks[0].Outputs' \
--output json)

# Create output directory and save outputs to file
mkdir -p "$SCRIPT_DIR/latency-hunting"
OUTPUT_FILE="$SCRIPT_DIR/latency-hunting/deployment-outputs.json"
echo "$OUTPUTS" > "$OUTPUT_FILE"
print_info "Stack outputs saved to: $OUTPUT_FILE"

# Get total instance types attempted
TOTAL_TYPES=$(echo "$OUTPUTS" | jq -r '.[] | select(.OutputKey=="TotalInstanceTypes") | .OutputValue')

# Query actual instances via EC2 API using tags
print_info ""
print_info "Querying deployed instances..."
Expand All @@ -265,29 +314,36 @@ if cdk deploy $STACK_NAME $CDK_CONTEXT --require-approval never; then
--filters "Name=tag:Architecture,Values=latency-hunting" "Name=instance-state-name,Values=pending,running" \
--query 'Reservations[].Instances[].[InstanceId,InstanceType,State.Name,PublicIpAddress]' \
--output json)

SUCCESS_COUNT=$(echo "$INSTANCES" | jq '. | length')

print_info ""
print_info "Summary:"
print_info " Placement strategy: $PLACEMENT_STRATEGY"
print_info " Total instance types attempted: $TOTAL_TYPES"
print_info " Successfully launched: $SUCCESS_COUNT"
print_info " Failed (capacity/compatibility issues): $((TOTAL_TYPES - SUCCESS_COUNT))"
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
print_info " (capped at 7 for spread placement)"
fi
print_info ""

# Show successful instances
print_info "Successfully launched instances:"
echo "$INSTANCES" | jq -r '.[] | "\(.[1]) (\(.[2])): \(.[0]) - \(.[3] // "pending")"' | while read line; do
print_info " $line"
done

print_info ""
print_info "Next steps:"
print_info " 1. Wait for instances to fully initialize (2-3 minutes)"
print_info " 2. Run latency tests: ./latency-hunting/run-hunting-tests.sh"
print_info " 3. Analyze results: ./latency-hunting/analyze-hunting-results.sh"

else
print_error "Deployment failed. Check the error messages above."
if [ "$PLACEMENT_STRATEGY" = "spread" ]; then
print_warning "If you hit the 7-instance-per-AZ limit, try reducing instance types or using multiple AZs."
fi
exit 1
fi