@@ -40,6 +40,49 @@ def test_distributed_training(test_helper, framework_version):
4040
4141 image_uri = test_helper .get_custom_image_uri (framework_version , type = "training" , gpu = False )
4242
43+ import threading
44+ import time
45+ import boto3
46+
47+ # Create a safety timer that will terminate any instances from this test after 5 minutes
def safety_termination(delay_seconds=300, max_age_seconds=600, region_name="us-east-1"):
    """Best-effort cleanup: terminate recently launched Ray cluster instances.

    Sleeps for ``delay_seconds``, then looks up EC2 instances in
    ``region_name`` that are running or pending and whose ``ray-cluster-name``
    tag matches ``*ag_ray_aws_default*``, and terminates those launched less
    than ``max_age_seconds`` ago.

    The function blocks for the whole delay, so it is meant to be run off the
    main thread. Any failure is reported via ``print`` and swallowed so the
    cleanup attempt can never fail the caller.

    NOTE(review): matching is by tag pattern and launch age only; nothing ties
    an instance to this particular test run, so concurrently running jobs that
    use the same cluster name could be affected -- confirm this is acceptable.

    Args:
        delay_seconds: Seconds to wait before looking for instances.
        max_age_seconds: Only instances launched more recently than this many
            seconds ago are terminated.
        region_name: AWS region the EC2 client is created for.
    """
    time.sleep(delay_seconds)
    try:
        print("Safety termination triggered - checking for running instances...")
        ec2 = boto3.client("ec2", region_name=region_name)

        # DescribeInstances is a paginated API; a single call may return only
        # the first page. Walk every page so no matching instance is missed.
        paginator = ec2.get_paginator("describe_instances")
        pages = paginator.paginate(
            Filters=[
                {"Name": "instance-state-name", "Values": ["running", "pending"]},
                {"Name": "tag:ray-cluster-name", "Values": ["*ag_ray_aws_default*"]},
            ]
        )

        # Take one reference time so every instance is judged against the same
        # cutoff instead of re-reading the clock per instance.
        now = time.time()
        instance_ids = []
        for page in pages:
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    # Only terminate recent instances; older ones are left alone.
                    age_seconds = now - instance["LaunchTime"].timestamp()
                    if age_seconds < max_age_seconds:
                        instance_ids.append(instance["InstanceId"])

        if instance_ids:
            print(f"Safety termination: Found {len(instance_ids)} instances to terminate: {instance_ids}")
            ec2.terminate_instances(InstanceIds=instance_ids)
            print("Safety termination: Instances terminated successfully")
        else:
            print("Safety termination: No matching instances found")

    except Exception as e:
        # Deliberately broad: this is a last-resort safety net and must not
        # raise, whatever AWS or the network does.
        print(f"Safety termination failed: {e}")
80+
81+ # Start the safety timer in background
82+ safety_thread = threading .Thread (target = safety_termination , daemon = True )
83+ safety_thread .start ()
84+ print ("Started safety termination timer (5 minutes)" )
85+
4386 try :
4487 cp .fit (
4588 predictor_init_args = predictor_init_args ,
@@ -54,15 +97,41 @@ def test_distributed_training(test_helper, framework_version):
5497 "--password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com" ,
5598 # Auto-terminate after 5 minutes (shortened from 20 for testing) as safety for CI
5699 "echo '#!/bin/bash' > /tmp/auto_terminate.sh" ,
57- "echo 'sleep 1200' >> /tmp/auto_terminate.sh" , # 20 minutes
100+ "echo 'exec > >(tee -a /tmp/auto_terminate.log) 2>&1' >> /tmp/auto_terminate.sh" , # Log everything
101+ "echo 'echo \" [$(date)] Auto-termination script started. Will terminate in 5 minutes.\" ' >> /tmp/auto_terminate.sh" ,
102+ "echo 'echo \" [$(date)] Process ID: $$\" ' >> /tmp/auto_terminate.sh" ,
103+ "echo 'sleep 300' >> /tmp/auto_terminate.sh" , # 5 minutes for testing
104+ "echo 'echo \" [$(date)] 5 minutes elapsed. Initiating termination...\" ' >> /tmp/auto_terminate.sh" ,
58105 'echo \' TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" '
59- '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh' ,
60- 'echo \' INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: \\ $TOKEN" '
61- "-s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh" ,
106+ '-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null || echo "failed")\' >> /tmp/auto_terminate.sh' ,
107+ 'echo \' if [ "$TOKEN" = "failed" ]; then\' >> /tmp/auto_terminate.sh' ,
108+ "echo ' echo \" [$(date)] Failed to get metadata token, trying without token...\" ' >> /tmp/auto_terminate.sh" ,
109+ "echo ' INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo \" unknown\" )' >> /tmp/auto_terminate.sh" ,
110+ "echo 'else' >> /tmp/auto_terminate.sh" ,
111+ 'echo \' INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" '
112+ '-s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")\' >> /tmp/auto_terminate.sh' ,
113+ "echo 'fi' >> /tmp/auto_terminate.sh" ,
114+ "echo 'echo \" [$(date)] Instance ID: \\ $INSTANCE_ID\" ' >> /tmp/auto_terminate.sh" ,
115+ "echo 'echo \" [$(date)] Instance ID: \\ $INSTANCE_ID\" ' >> /tmp/auto_terminate.sh" ,
116+ 'echo \' if [ "\\ $INSTANCE_ID" != "unknown" ] && [ -n "\\ $INSTANCE_ID" ]; then\' >> /tmp/auto_terminate.sh' ,
117+ "echo 'echo \" [$(date)] Checking IAM permissions...\" ' >> /tmp/auto_terminate.sh" ,
118+ "echo 'aws sts get-caller-identity 2>&1' >> /tmp/auto_terminate.sh" ,
119+ "echo 'echo \" [$(date)] Attempting termination...\" ' >> /tmp/auto_terminate.sh" ,
62120 "echo 'aws ec2 terminate-instances --instance-ids \\ $INSTANCE_ID "
63- "--region us-east-1' >> /tmp/auto_terminate.sh" ,
121+ "--region us-east-1 2>&1' >> /tmp/auto_terminate.sh" ,
122+ "echo 'echo \" [$(date)] Termination command sent.\" ' >> /tmp/auto_terminate.sh" ,
123+ "echo 'else' >> /tmp/auto_terminate.sh" ,
124+ "echo 'echo \" [$(date)] ERROR: Could not determine instance ID, cannot terminate\" ' >> /tmp/auto_terminate.sh" ,
125+ "echo 'fi' >> /tmp/auto_terminate.sh" ,
64126 "chmod +x /tmp/auto_terminate.sh" ,
65- "nohup /tmp/auto_terminate.sh > /tmp/auto_terminate.log 2>&1 &" ,
127+ # Launch the auto-termination script detached in its own session (setsid); no systemd unit is involved
128+ "echo 'Starting auto-termination script...'" ,
129+ "setsid /tmp/auto_terminate.sh &" , # Use setsid instead of nohup for better process isolation
130+ "echo 'Auto-termination script PID:' $!" ,
131+ "sleep 2" , # Give script time to start
132+ "ps aux | grep auto_terminate | grep -v grep || echo 'Warning: auto_terminate script not found in process list'" ,
133+ "ls -la /tmp/auto_terminate.*" ,
134+ "echo 'Check /tmp/auto_terminate.log for status'" ,
66135 ]
67136 },
68137 )
0 commit comments