Skip to content

Commit 7bc6c74

Browse files
committed
Updated sqswatcher to be more resilient to errors and improved documentation.
1 parent 9ae33e4 commit 7bc6c74

File tree

7 files changed

+41
-30
lines changed

7 files changed

+41
-30
lines changed

docs/source/configuration.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ Defaults to false for the default template. ::
131131

132132
scheduler
133133
"""""""""
134-
Cluster scheduler
134+
Scheduler to be used with the cluster. Valid options are sge, openlava, or torque.
135135

136136
Defaults to sge for the default template. ::
137137

docs/source/getting_started.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ This configure wizard will prompt you for everything you need to create your clu
7373

7474
Cluster Name [mycluster]:
7575

76-
Next, you will be prompted for your AWS Access & Secret Keys. You can leave these blank to use keys defined in your environment variables or aws config. Otherwise, set them here to be used by cfncluster.
76+
Next, you will be prompted for your AWS Access & Secret Keys. Enter the keys for an IAM user with administrative privileges. These can also be read from your environment variables or the AWS CLI config.
7777

7878
::
7979

docs/source/welcome.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
.. _welcome:
22
3-
Working with cfncluster
3+
Working with CfnCluster
44
#######################
55

66
.. toctree::

node/sqswatcher/plugins/openlava.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,6 @@ def addHost(hostname, cluster_user):
3535
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
3636
hosts_key_file = '/home/' + cluster_user + '/.ssh/known_hosts'
3737
user_key_file = '/home/' + cluster_user + '/.ssh/id_rsa'
38-
try:
39-
ssh.load_host_keys(hosts_key_file)
40-
except IOError:
41-
ssh._host_keys_filename = None
42-
pass
4338
iter=0
4439
connected=False
4540
while iter < 3 and connected == False:
@@ -54,6 +49,11 @@ def addHost(hostname, cluster_user):
5449
if iter == 3:
5550
print("Unable to provison host")
5651
return
52+
try:
53+
ssh.load_host_keys(hosts_key_file)
54+
except IOError:
55+
ssh._host_keys_filename = None
56+
pass
5757
ssh.save_host_keys(hosts_key_file)
5858
ssh.close()
5959

node/sqswatcher/plugins/sge.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,11 +59,6 @@ def addHost(hostname, cluster_user):
5959
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
6060
hosts_key_file = '/home/' + cluster_user + '/.ssh/known_hosts'
6161
user_key_file = '/home/' + cluster_user + '/.ssh/id_rsa'
62-
try:
63-
ssh.load_host_keys(hosts_key_file)
64-
except IOError:
65-
ssh._host_keys_filename = None
66-
pass
6762
iter=0
6863
connected=False
6964
while iter < 3 and connected == False:
@@ -78,6 +73,11 @@ def addHost(hostname, cluster_user):
7873
if iter == 3:
7974
print("Unable to provison host")
8075
return
76+
try:
77+
ssh.load_host_keys(hosts_key_file)
78+
except IOError:
79+
ssh._host_keys_filename = None
80+
pass
8181
ssh.save_host_keys(hosts_key_file)
8282
command = "sudo sh -c \'cd /opt/sge && /opt/sge/inst_sge -x -auto /opt/cfncluster/templates/sge/sge_inst.conf\'"
8383
stdin, stdout, stderr = ssh.exec_command(command)

node/sqswatcher/plugins/torque.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,6 @@ def addHost(hostname,cluster_user):
3737
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
3838
hosts_key_file = '/home/' + cluster_user + '/.ssh/known_hosts'
3939
user_key_file = '/home/' + cluster_user + '/.ssh/id_rsa'
40-
try:
41-
ssh.load_host_keys(hosts_key_file)
42-
except IOError:
43-
ssh._host_keys_filename = None
44-
pass
4540
iter=0
4641
connected=False
4742
while iter < 3 and connected == False:
@@ -56,6 +51,11 @@ def addHost(hostname,cluster_user):
5651
if iter == 3:
5752
print("Unable to provison host")
5853
return
54+
try:
55+
ssh.load_host_keys(hosts_key_file)
56+
except IOError:
57+
ssh._host_keys_filename = None
58+
pass
5959
ssh.save_host_keys(hosts_key_file)
6060
ssh.close()
6161

node/sqswatcher/sqswatcher.py

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,21 @@ def pollQueue(scheduler, q, t):
115115
wait = 15
116116
while retry < 3:
117117
try:
118-
hostname = ec2.get_all_instances(instance_ids=instanceId)[
119-
0].instances[0].private_dns_name.split('.')[:1][0]
118+
hostname = ec2.get_all_instances(instance_ids=instanceId)
119+
120+
if not hostname:
121+
print "Unable to find running instance %s." % instanceId
122+
else:
123+
print "Adding Hostname: %s" % hostname
124+
hostname = hostname[0].instances[0].private_dns_name.split('.')[:1][0]
125+
s.addHost(hostname,cluster_user)
126+
127+
t.put_item(data={
128+
'instanceId': instanceId,
129+
'hostname': hostname
130+
})
131+
132+
q.delete_message(result)
120133
break
121134
except boto.exception.BotoServerError as e:
122135
if e.error_code == 'RequestLimitExceeded':
@@ -125,15 +138,9 @@ def pollQueue(scheduler, q, t):
125138
wait = (wait*2+retry)
126139
else:
127140
raise e
128-
129-
s.addHost(hostname,cluster_user)
130-
131-
t.put_item(data={
132-
'instanceId': instanceId,
133-
'hostname': hostname
134-
})
135-
136-
q.delete_message(result)
141+
except:
142+
print "Unexpected error:", sys.exc_info()[0]
143+
raise
137144

138145
elif eventType == 'autoscaling:EC2_INSTANCE_TERMINATE':
139146
print eventType, instanceId
@@ -142,12 +149,16 @@ def pollQueue(scheduler, q, t):
142149
item = t.get_item(consistent=True, instanceId=instanceId)
143150
hostname = item['hostname']
144151

145-
s.removeHost(hostname,cluster_user)
152+
if hostname:
153+
s.removeHost(hostname,cluster_user)
146154

147155
item.delete()
148156

149157
except boto.dynamodb2.exceptions.ItemNotFound:
150158
print ("Did not find %s in the metadb\n" % instanceId)
159+
except:
160+
print "Unexpected error:", sys.exc_info()[0]
161+
raise
151162

152163
q.delete_message(result)
153164

0 commit comments

Comments
 (0)