|
13 | 13 | import shutil |
14 | 14 | import time |
15 | 15 |
|
16 | | -from agaveflask.logs import get_log_file_strategy |
| 16 | +from agaveflask.auth import get_api_server |
17 | 17 | import channelpy |
18 | 18 |
|
| 19 | +from aga import Agave |
| 20 | +from auth import get_tenants, get_tenant_verify |
19 | 21 | import codes |
20 | 22 | from config import Config |
21 | 23 | from docker_utils import rm_container, DockerError, container_running, run_container_with_docker |
22 | | -from models import Actor, Worker |
| 24 | +from models import Actor, Worker, is_hashid |
23 | 25 | from channels import ClientsChannel, CommandChannel, WorkerChannel |
24 | 26 | from stores import actors_store, clients_store, executions_store, workers_store |
25 | 27 | from worker import shutdown_worker |
@@ -86,6 +88,72 @@ def clean_up_ipc_dirs(): |
86 | 88 | clean_up_socket_dirs() |
87 | 89 | clean_up_fifo_dirs() |
88 | 90 |
|
| 91 | +def delete_client(ag, client_name): |
| 92 | + """Remove a client from the APIM.""" |
| 93 | + try: |
| 94 | + ag.clients.delete(clientName=client_name) |
| 95 | + except Exception as e: |
| 96 | + m = 'Not able to delete client from APIM. Got an exception: {}'.format(e) |
| 97 | + logger.error(m) |
| 98 | + return None |
| 99 | + |
| 100 | +def clean_up_apim_clients(tenant): |
| 101 | + """Check the list of clients registered in APIM and remove any that are associated with retired workers.""" |
| 102 | + username = os.environ.get('_abaco_{}_username'.format(tenant), '') |
| 103 | + password = os.environ.get('_abaco_{}_password'.format(tenant), '') |
| 104 | + if not username: |
| 105 | + msg = "Health process did not get a username for tenant {}; " \ |
| 106 | + "returning from clean_up_apim_clients".format(tenant) |
| 107 | + if tenant in ['SD2E', 'TACC-PROD']: |
| 108 | + logger.error(msg) |
| 109 | + else: |
| 110 | + logger.info(msg) |
| 111 | + return None |
| 112 | + if not password: |
| 113 | + msg = "Health process did not get a password for tenant {}; " \ |
| 114 | + "returning from clean_up_apim_clients".format(tenant) |
| 115 | + if tenant in ['SD2E', 'TACC-PROD']: |
| 116 | + logger.error(msg) |
| 117 | + else: |
| 118 | + logger.info(msg) |
| 119 | + return None |
| 120 | + api_server = get_api_server(tenant) |
| 121 | + verify = get_tenant_verify(tenant) |
| 122 | + ag = Agave(api_server=api_server, |
| 123 | + username=username, |
| 124 | + password=password, |
| 125 | + verify=verify) |
| 126 | + logger.debug("health process created an ag for tenant: {}".format(tenant)) |
| 127 | + try: |
| 128 | + cs = ag.clients.list() |
| 129 | + clients = cs.json()['result'] |
| 130 | + except Exception as e: |
| 131 | + msg = "Health process got an exception trying to retrieve clients; exception: {}".format(e) |
| 132 | + logger.error(msg) |
| 133 | + return None |
| 134 | + for client in clients: |
| 135 | + # check if the name of the client is an abaco hash (i.e., a worker id). if not, we ignore it from the beginning |
| 136 | + name = client.get('name') |
| 137 | + if not is_hashid(name): |
| 138 | + logger.debug("client {} is not an abaco hash id; skipping.".format(name)) |
| 139 | + continue |
| 140 | + # we know this client came from a worker, so we need to check to see if the worker is still active; |
| 141 | + # first check if the worker even exists; if it does, the id will be the client name: |
| 142 | + worker = get_worker(name) |
| 143 | + if not worker: |
| 144 | + logger.info("no worker associated with id: {}; deleting client.".format(name)) |
| 145 | + delete_client(ag, name) |
| 146 | + logger.info("client {} deleted by health process.".format(name)) |
| 147 | + continue |
| 148 | + # if the worker exists, we should check the status: |
| 149 | + status = worker.get('status') |
| 150 | + if status == codes.ERROR: |
| 151 | + logger.info("worker {} was in ERROR status so deleting client; worker: {}.".format(name, worker)) |
| 152 | + delete_client(ag, name) |
| 153 | + logger.info("client {} deleted by health process.".format(name)) |
| 154 | + else: |
| 155 | + logger.debug("worker {} still active; not deleting client.".format(worker)) |
| 156 | + |
89 | 157 | def clean_up_clients_store(): |
90 | 158 | logger.debug("top of clean_up_clients_store") |
91 | 159 | secret = os.environ.get('_abaco_secret') |
@@ -399,6 +467,10 @@ def main(): |
399 | 467 | for id in ids: |
400 | 468 | # manage_workers(id) |
401 | 469 | check_workers(id, ttl) |
| 470 | + tenants = get_tenants() |
| 471 | + for t in tenants: |
| 472 | + logger.debug("health process cleaning up apim_clients for tenant: {}".format(t)) |
| 473 | + clean_up_apim_clients(t) |
402 | 474 |
|
403 | 475 | # TODO - turning off the check_workers_store for now. unclear that removing worker objects |
404 | 476 | # check_workers_store(ttl) |
|
0 commit comments