Skip to content

Commit 17c40f4

Browse files
author
Ralph Castain
committed
Implement support for proctable queries
Signed-off-by: Ralph Castain <[email protected]>
1 parent 0434b61 commit 17c40f4

File tree

9 files changed

+320
-26
lines changed

9 files changed

+320
-26
lines changed

opal/mca/pmix/pmix3x/pmix/include/pmix_common.h.in

Lines changed: 23 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -573,11 +573,13 @@ typedef uint8_t pmix_proc_state_t;
573573
#define PMIX_PROC_STATE_ABORTED_BY_SIG (PMIX_PROC_STATE_ERROR + 4) /* process aborted by signal */
574574
#define PMIX_PROC_STATE_TERM_WO_SYNC (PMIX_PROC_STATE_ERROR + 5) /* process exit'd w/o calling PMIx_Finalize */
575575
#define PMIX_PROC_STATE_COMM_FAILED (PMIX_PROC_STATE_ERROR + 6) /* process communication has failed */
576-
#define PMIX_PROC_STATE_CALLED_ABORT (PMIX_PROC_STATE_ERROR + 7) /* process called "PMIx_Abort" */
577-
#define PMIX_PROC_STATE_MIGRATING (PMIX_PROC_STATE_ERROR + 8) /* process failed and is waiting for resources before restarting */
578-
#define PMIX_PROC_STATE_CANNOT_RESTART (PMIX_PROC_STATE_ERROR + 9) /* process failed and cannot be restarted */
579-
#define PMIX_PROC_STATE_TERM_NON_ZERO (PMIX_PROC_STATE_ERROR + 10) /* process exited with a non-zero status, indicating abnormal */
580-
#define PMIX_PROC_STATE_FAILED_TO_LAUNCH (PMIX_PROC_STATE_ERROR + 11) /* unable to launch process */
576+
#define PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED (PMIX_PROC_STATE_ERROR + 7) /* process exceeded a sensor limit */
577+
#define PMIX_PROC_STATE_CALLED_ABORT (PMIX_PROC_STATE_ERROR + 8) /* process called "PMIx_Abort" */
578+
#define PMIX_PROC_STATE_HEARTBEAT_FAILED (PMIX_PROC_STATE_ERROR + 9) /* process failed to send heartbeat w/in time limit */
579+
#define PMIX_PROC_STATE_MIGRATING (PMIX_PROC_STATE_ERROR + 10) /* process failed and is waiting for resources before restarting */
580+
#define PMIX_PROC_STATE_CANNOT_RESTART (PMIX_PROC_STATE_ERROR + 11) /* process failed and cannot be restarted */
581+
#define PMIX_PROC_STATE_TERM_NON_ZERO (PMIX_PROC_STATE_ERROR + 12) /* process exited with a non-zero status, indicating abnormal */
582+
#define PMIX_PROC_STATE_FAILED_TO_LAUNCH (PMIX_PROC_STATE_ERROR + 13) /* unable to launch process */
581583

582584

583585
/**** PMIX ERROR CONSTANTS ****/
@@ -1356,16 +1358,20 @@ struct pmix_info_t {
13561358
} \
13571359
} while (0)
13581360

1359-
#define PMIX_INFO_LOAD(m, k, v, t) \
1360-
do { \
1361-
(void)strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \
1362-
pmix_value_load(&((m)->value), (v), (t)); \
1363-
} while (0)
1364-
#define PMIX_INFO_XFER(d, s) \
1361+
#define PMIX_INFO_LOAD(m, k, v, t) \
13651362
do { \
1366-
(void)strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \
1367-
(d)->flags = (s)->flags; \
1368-
pmix_value_xfer(&(d)->value, &(s)->value); \
1363+
if (NULL != (k)) { \
1364+
(void)strncpy((m)->key, (k), PMIX_MAX_KEYLEN); \
1365+
} \
1366+
pmix_value_load(&((m)->value), (v), (t)); \
1367+
} while (0)
1368+
#define PMIX_INFO_XFER(d, s) \
1369+
do { \
1370+
if (NULL != (s)->key) { \
1371+
(void)strncpy((d)->key, (s)->key, PMIX_MAX_KEYLEN); \
1372+
} \
1373+
(d)->flags = (s)->flags; \
1374+
pmix_value_xfer(&(d)->value, &(s)->value); \
13691375
} while(0)
13701376

13711377
#define PMIX_INFO_REQUIRED(m) \
@@ -1386,7 +1392,9 @@ struct pmix_info_t {
13861392
(r) = PMIX_ERR_NOMEM; \
13871393
break; \
13881394
} \
1389-
_kv->key = strdup(_info[_n].key); \
1395+
if (NULL != _info[_n].key) { \
1396+
_kv->key = strdup(_info[_n].key); \
1397+
} \
13901398
PMIX_VALUE_XFER((r), _kv->value, &_info[_n].value);\
13911399
if (PMIX_SUCCESS != (r)) { \
13921400
PMIX_RELEASE(_kv); \

opal/mca/pmix/pmix3x/pmix/src/common/pmix_query.c

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
6363
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->status, &cnt, PMIX_STATUS);
6464
if (PMIX_SUCCESS != rc) {
6565
PMIX_ERROR_LOG(rc);
66+
results->status = rc;
6667
goto complete;
6768
}
6869
if (PMIX_SUCCESS != results->status) {
@@ -74,6 +75,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
7475
PMIX_BFROPS_UNPACK(rc, peer, buf, &results->ninfo, &cnt, PMIX_SIZE);
7576
if (PMIX_SUCCESS != rc) {
7677
PMIX_ERROR_LOG(rc);
78+
results->status = rc;
7779
goto complete;
7880
}
7981
if (0 < results->ninfo) {
@@ -82,6 +84,7 @@ static void query_cbfunc(struct pmix_peer_t *peer,
8284
PMIX_BFROPS_UNPACK(rc, peer, buf, results->info, &cnt, PMIX_INFO);
8385
if (PMIX_SUCCESS != rc) {
8486
PMIX_ERROR_LOG(rc);
87+
results->status = rc;
8588
goto complete;
8689
}
8790
}

opal/mca/pmix/pmix3x/pmix/src/common/pmix_strings.c

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -71,8 +71,12 @@ PMIX_EXPORT const char* PMIx_Proc_state_string(pmix_proc_state_t state)
7171
return "PROC TERMINATED WITHOUT CALLING PMIx_Finalize";
7272
case PMIX_PROC_STATE_COMM_FAILED:
7373
return "PROC LOST COMMUNICATION";
74+
case PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED:
75+
return "PROC SENSOR BOUND EXCEEDED";
7476
case PMIX_PROC_STATE_CALLED_ABORT:
7577
return "PROC CALLED PMIx_Abort";
78+
case PMIX_PROC_STATE_HEARTBEAT_FAILED:
79+
return "PROC FAILED TO REPORT HEARTBEAT";
7680
case PMIX_PROC_STATE_MIGRATING:
7781
return "PROC WAITING TO MIGRATE";
7882
case PMIX_PROC_STATE_CANNOT_RESTART:

opal/mca/pmix/pmix3x/pmix/src/mca/bfrops/base/bfrop_base_copy.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -373,6 +373,7 @@ pmix_status_t pmix_bfrops_base_copy_pinfo(pmix_proc_info_t **dest,
373373
if (NULL == p) {
374374
return PMIX_ERR_NOMEM;
375375
}
376+
memcpy(&p->proc, &src->proc, sizeof(pmix_proc_t));
376377
if (NULL != src->hostname) {
377378
p->hostname = strdup(src->hostname);
378379
}
@@ -623,7 +624,7 @@ pmix_status_t pmix_bfrops_base_copy_darray(pmix_data_array_t **dest,
623624
p1 = (pmix_info_t*)p->array;
624625
s1 = (pmix_info_t*)src->array;
625626
for (n=0; n < src->size; n++) {
626-
PMIX_INFO_LOAD(&p1[n], s1[n].key, &s1[n].value.data.flag, s1[n].value.type);
627+
PMIX_INFO_XFER(&p1[n], &s1[n]);
627628
}
628629
break;
629630
case PMIX_PDATA:
@@ -635,7 +636,7 @@ pmix_status_t pmix_bfrops_base_copy_darray(pmix_data_array_t **dest,
635636
pd = (pmix_pdata_t*)p->array;
636637
sd = (pmix_pdata_t*)src->array;
637638
for (n=0; n < src->size; n++) {
638-
PMIX_PDATA_LOAD(&pd[n], &sd[n].proc, sd[n].key, &sd[n].value.data.flag, sd[n].value.type);
639+
PMIX_PDATA_XFER(&pd[n], &sd[n]);
639640
}
640641
break;
641642
case PMIX_BUFFER:

opal/mca/pmix/pmix3x/pmix3x.c

Lines changed: 161 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -909,14 +909,42 @@ void pmix3x_value_load(pmix_value_t *v,
909909
v->data.darray->array = info;
910910
n=0;
911911
OPAL_LIST_FOREACH(val, list, opal_value_t) {
912-
(void)strncpy(info[n].key, val->key, PMIX_MAX_KEYLEN);
912+
if (NULL != val->key) {
913+
(void)strncpy(info[n].key, val->key, PMIX_MAX_KEYLEN);
914+
}
913915
pmix3x_value_load(&info[n].value, val);
914916
++n;
915917
}
916918
} else {
917919
v->data.darray->array = NULL;
918920
}
919921
break;
922+
case OPAL_PROC_INFO:
923+
v->type = PMIX_PROC_INFO;
924+
PMIX_PROC_INFO_CREATE(v->data.pinfo, 1);
925+
/* see if this job is in our list of known nspaces */
926+
found = false;
927+
OPAL_LIST_FOREACH(job, &mca_pmix_pmix3x_component.jobids, opal_pmix3x_jobid_trkr_t) {
928+
if (job->jobid == kv->data.pinfo.name.jobid) {
929+
(void)strncpy(v->data.pinfo->proc.nspace, job->nspace, PMIX_MAX_NSLEN);
930+
found = true;
931+
break;
932+
}
933+
}
934+
if (!found) {
935+
(void)opal_snprintf_jobid(v->data.pinfo->proc.nspace, PMIX_MAX_NSLEN, kv->data.pinfo.name.jobid);
936+
}
937+
v->data.pinfo->proc.rank = pmix3x_convert_opalrank(kv->data.pinfo.name.vpid);
938+
if (NULL != kv->data.pinfo.hostname) {
939+
v->data.pinfo->hostname = strdup(kv->data.pinfo.hostname);
940+
}
941+
if (NULL != kv->data.pinfo.executable_name) {
942+
v->data.pinfo->executable_name = strdup(kv->data.pinfo.executable_name);
943+
}
944+
v->data.pinfo->pid = kv->data.pinfo.pid;
945+
v->data.pinfo->exit_code = kv->data.pinfo.exit_code;
946+
v->data.pinfo->state = pmix3x_convert_opalstate(kv->data.pinfo.state);
947+
break;
920948
case OPAL_ENVAR:
921949
v->type = PMIX_ENVAR;
922950
PMIX_ENVAR_CONSTRUCT(&v->data.envar);
@@ -1099,7 +1127,9 @@ int pmix3x_value_unload(opal_value_t *kv,
10991127
/* handle the various types */
11001128
if (PMIX_INFO == v->data.darray->type) {
11011129
pmix_info_t *iptr = (pmix_info_t*)v->data.darray->array;
1102-
ival->key = strdup(iptr[n].key);
1130+
if (NULL != iptr[n].key) {
1131+
ival->key = strdup(iptr[n].key);
1132+
}
11031133
rc = pmix3x_value_unload(ival, &iptr[n].value);
11041134
if (OPAL_SUCCESS != rc) {
11051135
OPAL_LIST_RELEASE(lt);
@@ -1110,6 +1140,37 @@ int pmix3x_value_unload(opal_value_t *kv,
11101140
}
11111141
}
11121142
break;
1143+
case PMIX_PROC_INFO:
1144+
kv->type = OPAL_PROC_INFO;
1145+
if (NULL == v->data.pinfo) {
1146+
rc = OPAL_ERR_BAD_PARAM;
1147+
break;
1148+
}
1149+
/* see if this job is in our list of known nspaces */
1150+
found = false;
1151+
OPAL_LIST_FOREACH(job, &mca_pmix_pmix3x_component.jobids, opal_pmix3x_jobid_trkr_t) {
1152+
if (0 == strncmp(job->nspace, v->data.pinfo->proc.nspace, PMIX_MAX_NSLEN)) {
1153+
kv->data.pinfo.name.jobid = job->jobid;
1154+
found = true;
1155+
break;
1156+
}
1157+
}
1158+
if (!found) {
1159+
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.pinfo.name.jobid, v->data.pinfo->proc.nspace))) {
1160+
return pmix3x_convert_opalrc(rc);
1161+
}
1162+
}
1163+
kv->data.pinfo.name.vpid = pmix3x_convert_rank(v->data.pinfo->proc.rank);
1164+
if (NULL != v->data.pinfo->hostname) {
1165+
kv->data.pinfo.hostname = strdup(v->data.pinfo->hostname);
1166+
}
1167+
if (NULL != v->data.pinfo->executable_name) {
1168+
kv->data.pinfo.executable_name = strdup(v->data.pinfo->executable_name);
1169+
}
1170+
kv->data.pinfo.pid = v->data.pinfo->pid;
1171+
kv->data.pinfo.exit_code = v->data.pinfo->exit_code;
1172+
kv->data.pinfo.state = pmix3x_convert_state(v->data.pinfo->state);
1173+
break;
11131174
case PMIX_ENVAR:
11141175
kv->type = OPAL_ENVAR;
11151176
OBJ_CONSTRUCT(&kv->data.envar, opal_envar_t);
@@ -1347,6 +1408,7 @@ static void infocbfunc(pmix_status_t status,
13471408
opal_list_append(results, &iptr->super);
13481409
iptr->key = strdup(info[n].key);
13491410
if (OPAL_SUCCESS != (rc = pmix3x_value_unload(iptr, &info[n].value))) {
1411+
OPAL_ERROR_LOG(rc);
13501412
OPAL_LIST_RELEASE(results);
13511413
results = NULL;
13521414
break;
@@ -1510,6 +1572,103 @@ opal_pmix_alloc_directive_t pmix3x_convert_allocdir(pmix_alloc_directive_t dir)
15101572
}
15111573
}
15121574

1575+
/**
 * Translate a PMIx process state into the integer state code used on the
 * OPAL side of the pmix3x glue.
 *
 * @param state  PMIx process state to translate
 * @return the integer code paired with @p state in the table below, or 0
 *         (undefined) when @p state is not one of the listed values
 */
int pmix3x_convert_state(pmix_proc_state_t state)
{
    /* PMIx state -> integer code. Note that PREPPED and LAUNCH_UNDERWAY
     * both translate to 1. */
    static const struct {
        int pmix_state;
        int opal_state;
    } xlate[] = {
        { PMIX_PROC_STATE_UNDEF,                  0 },
        { PMIX_PROC_STATE_PREPPED,                1 },
        { PMIX_PROC_STATE_LAUNCH_UNDERWAY,        1 },
        { PMIX_PROC_STATE_RESTART,                2 },
        { PMIX_PROC_STATE_TERMINATE,              3 },
        { PMIX_PROC_STATE_RUNNING,                4 },
        { PMIX_PROC_STATE_CONNECTED,              5 },
        { PMIX_PROC_STATE_UNTERMINATED,          15 },
        { PMIX_PROC_STATE_TERMINATED,            20 },
        { PMIX_PROC_STATE_KILLED_BY_CMD,         51 },
        { PMIX_PROC_STATE_ABORTED,               52 },
        { PMIX_PROC_STATE_FAILED_TO_START,       53 },
        { PMIX_PROC_STATE_ABORTED_BY_SIG,        54 },
        { PMIX_PROC_STATE_TERM_WO_SYNC,          55 },
        { PMIX_PROC_STATE_COMM_FAILED,           56 },
        { PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED, 57 },
        { PMIX_PROC_STATE_CALLED_ABORT,          58 },
        { PMIX_PROC_STATE_HEARTBEAT_FAILED,      59 },
        { PMIX_PROC_STATE_MIGRATING,             60 },
        { PMIX_PROC_STATE_CANNOT_RESTART,        61 },
        { PMIX_PROC_STATE_TERM_NON_ZERO,         62 },
        { PMIX_PROC_STATE_FAILED_TO_LAUNCH,      63 }
    };
    const int nentries = (int)(sizeof(xlate) / sizeof(xlate[0]));
    int idx;

    for (idx = 0; idx < nentries; idx++) {
        if (xlate[idx].pmix_state == (int)state) {
            return xlate[idx].opal_state;
        }
    }

    /* anything not listed above is reported as undefined */
    return 0;
}
1625+
1626+
pmix_proc_state_t pmix3x_convert_opalstate(int state)
1627+
{
1628+
switch(state) {
1629+
case 0:
1630+
return PMIX_PROC_STATE_UNDEF;
1631+
case 1:
1632+
return PMIX_PROC_STATE_LAUNCH_UNDERWAY;
1633+
case 2:
1634+
return PMIX_PROC_STATE_RESTART;
1635+
case 3:
1636+
return PMIX_PROC_STATE_TERMINATE;
1637+
case 4:
1638+
return PMIX_PROC_STATE_RUNNING;
1639+
case 5:
1640+
return PMIX_PROC_STATE_CONNECTED;
1641+
case 51:
1642+
return PMIX_PROC_STATE_KILLED_BY_CMD;
1643+
case 52:
1644+
return PMIX_PROC_STATE_ABORTED;
1645+
case 53:
1646+
return PMIX_PROC_STATE_FAILED_TO_START;
1647+
case 54:
1648+
return PMIX_PROC_STATE_ABORTED_BY_SIG;
1649+
case 55:
1650+
return PMIX_PROC_STATE_TERM_WO_SYNC;
1651+
case 56:
1652+
return PMIX_PROC_STATE_COMM_FAILED;
1653+
case 57:
1654+
return PMIX_PROC_STATE_SENSOR_BOUND_EXCEEDED;
1655+
case 58:
1656+
return PMIX_PROC_STATE_CALLED_ABORT;
1657+
case 59:
1658+
return PMIX_PROC_STATE_HEARTBEAT_FAILED;
1659+
case 60:
1660+
return PMIX_PROC_STATE_MIGRATING;
1661+
case 61:
1662+
return PMIX_PROC_STATE_CANNOT_RESTART;
1663+
case 62:
1664+
return PMIX_PROC_STATE_TERM_NON_ZERO;
1665+
case 63:
1666+
return PMIX_PROC_STATE_FAILED_TO_LAUNCH;
1667+
default:
1668+
return PMIX_PROC_STATE_UNDEF;
1669+
}
1670+
}
1671+
15131672
/**** INSTANTIATE INTERNAL CLASSES ****/
15141673
OBJ_CLASS_INSTANCE(opal_pmix3x_jobid_trkr_t,
15151674
opal_list_item_t,

opal/mca/pmix/pmix3x/pmix3x.h

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -342,6 +342,11 @@ OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix3x_convert_allocdir(pmix_al
342342

343343
OPAL_MODULE_DECLSPEC char* pmix3x_convert_jobid(opal_jobid_t jobid);
344344

345+
OPAL_MODULE_DECLSPEC int pmix3x_convert_state(pmix_proc_state_t state);
346+
347+
OPAL_MODULE_DECLSPEC pmix_proc_state_t pmix3x_convert_opalstate(int state);
348+
349+
345350
END_C_DECLS
346351

347352
#endif /* MCA_PMIX_EXTERNAL_H */

opal/mca/pmix/pmix3x/pmix3x_server_north.c

Lines changed: 15 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -954,6 +954,7 @@ static void info_cbfunc(int status,
954954
OPAL_LIST_FOREACH(kv, info, opal_value_t) {
955955
(void)strncpy(pcaddy->info[n].key, kv->key, PMIX_MAX_KEYLEN);
956956
pmix3x_value_load(&pcaddy->info[n].value, kv);
957+
++n;
957958
}
958959
}
959960
/* we are done with the incoming data */
@@ -1012,10 +1013,20 @@ static pmix_status_t server_query(pmix_proc_t *proct,
10121013
for (m=0; m < queries[n].nqual; m++) {
10131014
oinfo = OBJ_NEW(opal_value_t);
10141015
opal_list_append(&q->qualifiers, &oinfo->super);
1015-
oinfo->key = strdup(queries[n].qualifiers[m].key);
1016-
if (OPAL_SUCCESS != (rc = pmix3x_value_unload(oinfo, &queries[n].qualifiers[m].value))) {
1017-
OBJ_RELEASE(opalcaddy);
1018-
return pmix3x_convert_opalrc(rc);
1016+
1017+
if (0 == strcmp(queries[n].qualifiers[m].key, PMIX_NSPACE)) {
1018+
/* must convert this to jobid */
1019+
oinfo->key = strdup(OPAL_PMIX_PROCID);
1020+
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&oinfo->data.name.jobid, queries[n].qualifiers[m].value.data.string))) {
1021+
OBJ_RELEASE(opalcaddy);
1022+
return pmix3x_convert_opalrc(rc);
1023+
}
1024+
} else {
1025+
oinfo->key = strdup(queries[n].qualifiers[m].key);
1026+
if (OPAL_SUCCESS != (rc = pmix3x_value_unload(oinfo, &queries[n].qualifiers[m].value))) {
1027+
OBJ_RELEASE(opalcaddy);
1028+
return pmix3x_convert_opalrc(rc);
1029+
}
10191030
}
10201031
}
10211032
}

opal/mca/pmix/pmix_types.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -534,7 +534,7 @@ OBJ_CLASS_DECLARATION(opal_pmix_modex_data_t);
534534
typedef struct {
535535
opal_list_item_t super;
536536
char **keys;
537-
opal_list_t qualifiers;
537+
opal_list_t qualifiers; // list of opal_value_t
538538
} opal_pmix_query_t;
539539
OBJ_CLASS_DECLARATION(opal_pmix_query_t);
540540

0 commit comments

Comments
 (0)