Skip to content

Commit 93a91d8

Browse files
skip_failed should not raise an exception + adjust logs for Job ACLs
1 parent 0f15338 commit 93a91d8

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

dbclient/JobsClient.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,9 +100,12 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo
100100
jobs_log = self.get_export_dir() + log_file
101101
acl_jobs_log = self.get_export_dir() + acl_file
102102
error_logger = logging_utils.get_error_logger(wmconstants.WM_EXPORT, wmconstants.JOB_OBJECT, self.get_export_dir())
103-
failed_log_file = logging_utils.get_error_log_file(
103+
failed_job_log_file = logging_utils.get_error_log_file(
104104
wmconstants.WM_EXPORT, wmconstants.JOB_OBJECT, self.get_export_dir()
105105
)
106+
failed_acl_log_file = logging_utils.get_error_log_file(
107+
wmconstants.WM_EXPORT, wmconstants.JOB_ACL_OBJECT, self.get_export_dir()
108+
)
106109
# pinned by cluster_user is a flag per cluster
107110
jl_full = self.get_jobs_list(False)
108111
if users_list:
@@ -112,7 +115,8 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo
112115
jl = jl_full
113116
with open(jobs_log, "w") as log_fp, \
114117
open(acl_jobs_log, 'w') as acl_fp, \
115-
open(failed_log_file, "w") as failed_log_fp:
118+
open(failed_job_log_file, "w") as failed_log_fp, \
119+
open(failed_acl_log_file, "w") as failed_acl_log_file:
116120
for x in jl:
117121
job_id = x['job_id']
118122
new_job_name = x['settings']['name'] + ':::' + str(job_id)
@@ -146,13 +150,16 @@ def log_job_configs(self, users_list=None, groups_list = None, log_file='jobs.lo
146150
# job_acl is malformed, the job is written to error output file
147151
logging.error(f"The following job id {job_id} has malformed permissions: {json.dumps(job_perms)}")
148152
failed_log_fp.write(json.dumps(x) + '\n')
153+
failed_acl_log_file.write(json.dumps(job_perms) + '\n')
149154

150155
def import_job_configs(self, log_file='jobs.log', acl_file='acl_jobs.log', job_map_file='job_id_map.log'):
151156
jobs_log = self.get_export_dir() + log_file
152157
acl_jobs_log = self.get_export_dir() + acl_file
153158
job_map_log = self.get_export_dir() + job_map_file
154159
error_logger = logging_utils.get_error_logger(
155160
wmconstants.WM_IMPORT, wmconstants.JOB_OBJECT, self.get_export_dir())
161+
job_acl_error_logger = logging_utils.get_error_logger(
162+
wmconstants.WM_IMPORT, wmconstants.JOB_ACL_OBJECT, self.get_export_dir())
156163
if not os.path.exists(jobs_log):
157164
logging.info("No job configurations to import.")
158165
return
@@ -263,10 +270,13 @@ def adjust_ids_for_cluster(settings): #job_settings or task_settings
263270
acl_perms = self.build_acl_args(acl_conf['access_control_list'], True)
264271
acl_create_args = {'access_control_list': acl_perms}
265272
acl_resp = self.patch(api, acl_create_args)
266-
if not logging_utils.log_response_error(error_logger, acl_resp) and 'object_id' in acl_conf:
273+
if not logging_utils.log_response_error(job_acl_error_logger, acl_resp) and 'object_id' in acl_conf:
267274
checkpoint_job_configs_set.write(acl_conf['object_id'])
268275
else:
269-
raise RuntimeError("Import job has failed. Refer to the previous log messages to investigate.")
276+
if self.is_skip_failed():
277+
logging.error(f"Skipped {acl_conf}")
278+
else:
279+
raise RuntimeError("Import job has failed. Refer to the previous log messages to investigate.")
270280
# update the imported job names
271281
self.update_imported_job_names(error_logger, checkpoint_job_configs_set)
272282

dbclient/dbclient.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,19 @@ def http_req(self, http_type, endpoint, json_params, version='2.0', print_json=F
265265

266266
http_status_code = raw_results.status_code
267267
if http_status_code in dbclient.http_error_codes:
268-
raise Exception("Error: {0} request failed with code {1}\n{2}".format(http_type,
269-
http_status_code,
270-
raw_results.text))
268+
message = "Error: {0} request failed with code {1}\n{2}".format(
269+
http_type, http_status_code, raw_results.text
270+
)
271+
if self.is_skip_failed():
272+
logging.error(message)
273+
return {
274+
'http_status_code': raw_results.status_code,
275+
'error': raw_results.text,
276+
'url': full_endpoint,
277+
'json': json_params,
278+
}
279+
else:
280+
raise Exception(message)
271281
results = raw_results.json()
272282
if logging_utils.check_error(results):
273283
logging.warn(json.dumps(results) + '\n')

wmconstants.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
CLUSTER_OBJECT = "clusters"
1616
INSTANCE_POOL_OBJECT = "instance_pools"
1717
JOB_OBJECT = "jobs"
18+
JOB_ACL_OBJECT = "acl_jobs"
1819
SECRET_OBJECT = "secrets"
1920
MLFLOW_EXPERIMENT_OBJECT = "mlflow_experiments"
2021
MLFLOW_EXPERIMENT_PERMISSION_OBJECT = "mlflow_experiments_permissions"

0 commit comments

Comments
 (0)