Skip to content

Commit 26e61ec

Browse files
author
Ubuntu
committed
test
1 parent 67564bc commit 26e61ec

File tree

1 file changed

+75
-6
lines changed

1 file changed

+75
-6
lines changed

tests/unittests/cluster/test_distributed_training.py

Lines changed: 75 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,49 @@ def test_distributed_training(test_helper, framework_version):
4040

4141
image_uri = test_helper.get_custom_image_uri(framework_version, type="training", gpu=False)
4242

43+
import threading
44+
import time
45+
import boto3
46+
47+
# Create a safety timer that will terminate any instances from this test after 5 minutes
48+
def safety_termination(delay_seconds=300, max_age_seconds=600, region_name="us-east-1"):
    """Best-effort safety net: after a delay, terminate recently launched cluster instances.

    Sleeps for ``delay_seconds``, then queries EC2 in ``region_name`` for instances that
    are ``running`` or ``pending``, carry a ``ray-cluster-name`` tag matching
    ``*ag_ray_aws_default*``, and were launched less than ``max_age_seconds`` ago.
    Every such instance is passed to ``terminate_instances`` in a single call.

    The defaults reproduce the original hard-coded values (5 minute delay, 10 minute
    age window, us-east-1), so the function can still be used as a zero-argument
    thread target.

    Args:
        delay_seconds: How long to wait before looking for instances.
        max_age_seconds: Only instances launched more recently than this are terminated.
        region_name: AWS region passed to the EC2 client.

    Returns:
        None. Any exception raised while talking to EC2 is caught and printed so that
        a failure here never propagates out of the background thread.
    """
    time.sleep(delay_seconds)  # 5 minutes by default, for testing
    try:
        print("Safety termination triggered - checking for running instances...")
        ec2 = boto3.client("ec2", region_name=region_name)

        # Find instances that might be from this test run
        filters = [
            {"Name": "instance-state-name", "Values": ["running", "pending"]},
            {"Name": "tag:ray-cluster-name", "Values": ["*ag_ray_aws_default*"]},
        ]

        # describe_instances returns results in pages; walk every page so that
        # matching instances beyond the first page are not silently left running.
        paginator = ec2.get_paginator("describe_instances")

        # Take the reference time once so every instance is judged against the same cutoff.
        current_time = time.time()

        instance_ids = []
        for page in paginator.paginate(Filters=filters):
            for reservation in page["Reservations"]:
                for instance in reservation["Instances"]:
                    # Only terminate recent instances (launched within the age window)
                    launch_time = instance["LaunchTime"].timestamp()
                    if (current_time - launch_time) < max_age_seconds:
                        instance_ids.append(instance["InstanceId"])

        if instance_ids:
            print(f"Safety termination: Found {len(instance_ids)} instances to terminate: {instance_ids}")
            ec2.terminate_instances(InstanceIds=instance_ids)
            print("Safety termination: Instances terminated successfully")
        else:
            print("Safety termination: No matching instances found")

    except Exception as e:
        # Deliberate catch-all: this is the top of a daemon thread and cleanup is best-effort.
        print(f"Safety termination failed: {e}")
80+
81+
# Start the safety timer in background
82+
safety_thread = threading.Thread(target=safety_termination, daemon=True)
83+
safety_thread.start()
84+
print("Started safety termination timer (5 minutes)")
85+
4386
try:
4487
cp.fit(
4588
predictor_init_args=predictor_init_args,
@@ -54,15 +97,41 @@ def test_distributed_training(test_helper, framework_version):
5497
"--password-stdin 369469875935.dkr.ecr.us-east-1.amazonaws.com",
5598
# Auto-terminate after 5 minutes as safety for CI (delay shortened from 20 minutes for testing)
5699
"echo '#!/bin/bash' > /tmp/auto_terminate.sh",
57-
"echo 'sleep 1200' >> /tmp/auto_terminate.sh", # 20 minutes
100+
"echo 'exec > >(tee -a /tmp/auto_terminate.log) 2>&1' >> /tmp/auto_terminate.sh", # Log everything
101+
"echo 'echo \"[$(date)] Auto-termination script started. Will terminate in 5 minutes.\"' >> /tmp/auto_terminate.sh",
102+
"echo 'echo \"[$(date)] Process ID: $$\"' >> /tmp/auto_terminate.sh",
103+
"echo 'sleep 300' >> /tmp/auto_terminate.sh", # 5 minutes for testing
104+
"echo 'echo \"[$(date)] 5 minutes elapsed. Initiating termination...\"' >> /tmp/auto_terminate.sh",
58105
'echo \'TOKEN=$(curl -X PUT "http://169.254.169.254/latest/api/token" '
59-
'-H "X-aws-ec2-metadata-token-ttl-seconds: 21600")\' >> /tmp/auto_terminate.sh',
60-
'echo \'INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: \\$TOKEN" '
61-
"-s http://169.254.169.254/latest/meta-data/instance-id)' >> /tmp/auto_terminate.sh",
106+
'-H "X-aws-ec2-metadata-token-ttl-seconds: 21600" 2>/dev/null || echo "failed")\' >> /tmp/auto_terminate.sh',
107+
'echo \'if [ "$TOKEN" = "failed" ]; then\' >> /tmp/auto_terminate.sh',
108+
"echo ' echo \"[$(date)] Failed to get metadata token, trying without token...\"' >> /tmp/auto_terminate.sh",
109+
"echo ' INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo \"unknown\")' >> /tmp/auto_terminate.sh",
110+
"echo 'else' >> /tmp/auto_terminate.sh",
111+
'echo \' INSTANCE_ID=$(curl -H "X-aws-ec2-metadata-token: $TOKEN" '
112+
'-s http://169.254.169.254/latest/meta-data/instance-id 2>/dev/null || echo "unknown")\' >> /tmp/auto_terminate.sh',
113+
"echo 'fi' >> /tmp/auto_terminate.sh",
114+
"echo 'echo \"[$(date)] Instance ID: \\$INSTANCE_ID\"' >> /tmp/auto_terminate.sh",
115+
"echo 'echo \"[$(date)] Instance ID: \\$INSTANCE_ID\"' >> /tmp/auto_terminate.sh",
116+
'echo \'if [ "\\$INSTANCE_ID" != "unknown" ] && [ -n "\\$INSTANCE_ID" ]; then\' >> /tmp/auto_terminate.sh',
117+
"echo 'echo \"[$(date)] Checking IAM permissions...\"' >> /tmp/auto_terminate.sh",
118+
"echo 'aws sts get-caller-identity 2>&1' >> /tmp/auto_terminate.sh",
119+
"echo 'echo \"[$(date)] Attempting termination...\"' >> /tmp/auto_terminate.sh",
62120
"echo 'aws ec2 terminate-instances --instance-ids \\$INSTANCE_ID "
63-
"--region us-east-1' >> /tmp/auto_terminate.sh",
121+
"--region us-east-1 2>&1' >> /tmp/auto_terminate.sh",
122+
"echo 'echo \"[$(date)] Termination command sent.\"' >> /tmp/auto_terminate.sh",
123+
"echo 'else' >> /tmp/auto_terminate.sh",
124+
"echo 'echo \"[$(date)] ERROR: Could not determine instance ID, cannot terminate\"' >> /tmp/auto_terminate.sh",
125+
"echo 'fi' >> /tmp/auto_terminate.sh",
64126
"chmod +x /tmp/auto_terminate.sh",
65-
"nohup /tmp/auto_terminate.sh > /tmp/auto_terminate.log 2>&1 &",
127+
# Launch the script detached in its own session via setsid (no systemd unit is created)
128+
"echo 'Starting auto-termination script...'",
129+
"setsid /tmp/auto_terminate.sh &", # Use setsid instead of nohup for better process isolation
130+
"echo 'Auto-termination script PID:' $!",
131+
"sleep 2", # Give script time to start
132+
"ps aux | grep auto_terminate | grep -v grep || echo 'Warning: auto_terminate script not found in process list'",
133+
"ls -la /tmp/auto_terminate.*",
134+
"echo 'Check /tmp/auto_terminate.log for status'",
66135
]
67136
},
68137
)

0 commit comments

Comments
 (0)