Skip to content

Commit 90f8be3

Browse files
author
Joe Stubbs
committed
Merge branch 'dev'
2 parents aef0884 + 02a0c93 commit 90f8be3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+158801
-1816
lines changed

CHANGELOG.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
# Change Log
22
All notable changes to this project will be documented in this file.
33

4+
5+
## 0.8.0 - 2018-05-10
6+
### Added
7+
- Added support for a tenant-specific global_mounts config.
8+
9+
### Changed
10+
- Changed RabbitMQ connection handling across all channel objects to greatly reduce cpu load on RabbitMQ server as well as on worker nodes in the cluster.
11+
- Implemented a stop-no-delete command on the command channel to prevent a race condition when updating an actor's image that could cause the new worker to be killed.
12+
- Fixed an issue where Docker fails to report container execution finish time when the compute server is under heavy load. In this case, we note return finish_time as computed from the start_time and the run_time (calculated by Abaco).
13+
- Fixed issues with Actor update: 1) owner can no longer change in case a different user from the original owner updates the actor image, 2) last_update_time is always updated, and 3) ensure updater has permanent permissions for the actor.
14+
15+
### Removed
16+
- No change
17+
18+
## 0.7.0 - 2018-04-08
19+
### Added
20+
- Added support for setting max_workers_per_host to prevent overloading.
21+
- Added support for retrieving the TAS GID on a per user basis from the extended profile within Metadata.
22+
- Initial implementation of autoscaling via Prometheus added.
23+
- Additional fields for each execution are now returned in the executions summary.
24+
25+
### Changed
26+
- The routines used when executing an actor container have been simplified to provide better performance and to prevent some issues such as stats collection generating a UnixHTTPConnectionPool Readtime when compute server is under load.
27+
- Added several safety guards to the health checker code to prevent crashes of the health checker when working with unexpected data (e.g. when a worker's last_execution is not defined)
28+
- Fixed bug due to message formatting issue in message returned from a POST to the /workers endpoint.
29+
30+
### Removed
31+
- The 'ids' collection has been removed from the executions endpoint response in favor of an 'executions' collections providing additional fields for each execution.
32+
33+
434
## 0.6.0 - 2018-03-08
535
### Added
636
- Add support for binary messages through a FIFO mount to the actor.

abaco.log

Lines changed: 157582 additions & 1685 deletions
Large diffs are not rendered by default.
File renamed without changes.

actors/auth.py

Lines changed: 71 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@
1414
from agaveflask.logs import get_logger
1515
logger = get_logger(__name__)
1616

17+
from agavepy.agave import Agave
1718
from config import Config
1819
import codes
1920
from models import Actor, get_permissions, Nonce
2021

21-
from errors import ResourceError, PermissionsException
22+
from errors import ClientException, ResourceError, PermissionsException
2223

2324

2425
jwt.verify_methods['SHA256WITHRSA'] = (
@@ -284,23 +285,47 @@ def get_tenants():
284285
'TACC-PROD',
285286
'VDJSERVER-ORG']
286287

288+
def tenant_can_use_tas(tenant):
289+
"""Return whether a tenant can use TAS for uid/gid resolution. This is equivalent to whether the tenant uses
290+
the TACC IdP"""
291+
if tenant == 'DESIGNSAFE' or \
292+
tenant == 'SD2E' or \
293+
tenant == 'TACC-PROD':
294+
return True
295+
# all other tenants use some other IdP so username will not be a TAS account:
296+
return False
287297

288298
# TAS configuration:
289299
# base URL for TAS API.
290300
TAS_URL_BASE = os.environ.get('TAS_URL_BASE', 'https://tas.tacc.utexas.edu/api/v1')
291301
TAS_ROLE_ACCT = os.environ.get('TAS_ROLE_ACCT', 'tas-jetstream')
292302
TAS_ROLE_PASS = os.environ.get('TAS_ROLE_PASS')
293303

294-
295-
def get_tas_data(username):
304+
def get_service_client(tenant):
305+
"""Returns the service client for a specific tenant."""
306+
service_token = os.environ.get('_abaco_{}_service_token'.format(tenant))
307+
if not service_token:
308+
raise ClientException("No service token configured for tenant: {}".format(tenant))
309+
api_server = get_api_server(tenant)
310+
verify = get_tenant_verify(tenant)
311+
# generate an Agave client with the service token
312+
logger.info("Attempting to generate an agave client.")
313+
return Agave(api_server=api_server,
314+
token=service_token,
315+
verify=verify)
316+
317+
def get_tas_data(username, tenant):
296318
"""Get the TACC uid, gid and homedir for this user from the TAS API."""
297-
logger.debug("Top of get_tas_data for username: {}".format(username))
319+
logger.debug("Top of get_tas_data for username: {}; tenant: {}".format(username, tenant))
298320
if not TAS_ROLE_ACCT:
299321
logger.error("No TAS_ROLE_ACCT configured. Aborting.")
300-
return
322+
return None, None, None
301323
if not TAS_ROLE_PASS:
302324
logger.error("No TAS_ROLE_PASS configured. Aborting.")
303-
return
325+
return None, None, None
326+
if not tenant_can_use_tas(tenant):
327+
logger.debug("Tenant {} cannot use TAS".format(tenant))
328+
return None, None, None
304329
url = '{}/users/username/{}'.format(TAS_URL_BASE, username)
305330
headers = {'Content-type': 'application/json',
306331
'Accept': 'application/json'
@@ -312,20 +337,57 @@ def get_tas_data(username):
312337
except Exception as e:
313338
logger.error("Got an exception from TAS API. "
314339
"Exception: {}. url: {}. TAS_ROLE_ACCT: {}".format(e, url, TAS_ROLE_ACCT))
315-
return
340+
return None, None, None
316341
try:
317342
data = rsp.json()
318343
except Exception as e:
319344
logger.error("Did not get JSON from TAS API. rsp: {}"
320345
"Exception: {}. url: {}. TAS_ROLE_ACCT: {}".format(rsp, e, url, TAS_ROLE_ACCT))
321-
return
346+
return None, None, None
322347
try:
323348
tas_uid = data['result']['uid']
324349
tas_homedir = data['result']['homeDirectory']
325350
except Exception as e:
326351
logger.error("Did not get attributes from TAS API. rsp: {}"
327352
"Exception: {}. url: {}. TAS_ROLE_ACCT: {}".format(rsp, e, url, TAS_ROLE_ACCT))
328-
return
353+
return None, None, None
354+
355+
# first look for an "extended profile" record in agave metadata. such a record might have the
356+
# gid to use for this user. to do this search we need a service client for the tenant:
357+
ag = None
358+
tas_gid = None
359+
try:
360+
ag = get_service_client(tenant)
361+
except ClientException as e:
362+
logger.info("got ClientException trying to generate the service client; e: {}".format(e))
363+
except Exception as e:
364+
logger.error("Unexpected exception trying to generate service client; e: {}".format(e))
365+
# if we get a service client, try to look up extended profile:
366+
if ag:
367+
meta_name = 'profile.{}.{}'.format(tenant.lower(), username)
368+
q = "{'name': '" + meta_name + "'}"
369+
logger.debug("using query: {}".format(q))
370+
try:
371+
rsp = ag.meta.listMetadata(q=q)
372+
except Exception as e:
373+
logger.error("Got an exception trying to retrieve the extended profile. Exception: {}".format(e))
374+
try:
375+
tas_gid = rsp[0].value['posix_gid']
376+
except IndexError:
377+
logger.info("Got an index error - returning None. response: {}".format(rsp))
378+
tas_gid = None
379+
except Exception as e:
380+
logger.error("Got an exception trying to retrieve the gid from the extended profile. Exception: {}".format(e))
381+
if tas_gid:
382+
logger.debug("Got a tas gid from the extended profile.")
383+
logger.info("Setting the following TAS data: uid:{} gid:{} homedir:{}".format(tas_uid,
384+
tas_gid,
385+
tas_homedir))
386+
return tas_uid, tas_gid, tas_homedir
387+
else:
388+
logger.error("got a valid response but did not get a tas_gid. Full rsp: {}".format(rsp))
389+
# if we are here, we didn't get a TAS_GID from the extended profile.
390+
logger.debug("did not get an extended profile.")
329391
# if the instance has a configured TAS_GID to use we will use that; otherwise,
330392
# we fall back on using the user's uid as the gid, which is (almost) always safe)
331393
tas_gid = os.environ.get('TAS_GID', tas_uid)

actors/channels.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,15 @@ def get(self, timeout=float('inf')):
119119
raise ChannelTimeoutException()
120120
time.sleep(self.POLL_FREQUENCY)
121121

122+
def get_one(self):
123+
"""Blocking method to get a single message without polling."""
124+
if self._queue is None:
125+
raise ChannelClosedException()
126+
for msg in self._queue._queue:
127+
msg.ack()
128+
return self._process(msg.body)
129+
130+
122131

123132
class ActorMsgChannel(BinaryChannel):
124133
"""Work with messages sent to a specific actor.

actors/clients.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from agaveflask.auth import get_api_server
1010

11-
from agave import Agave
11+
from aga import Agave
1212
from auth import get_tenants, get_tenant_verify
1313
from channels import ClientsChannel
1414
from models import Actor, Client, Worker
@@ -60,7 +60,7 @@ def run(self):
6060
to send an anonymous channel together with the actual client request command.
6161
"""
6262
while True:
63-
message = self.ch.get()
63+
message = self.ch.get_one()
6464
logger.info("cleintg processing message: {}".format(message))
6565
anon_ch = message['reply_to']
6666
cmd = message['value']

0 commit comments

Comments
 (0)