Skip to content

Commit b0fd430

Browse files
author
Ralph Castain
committed
Try to debug the MTT failures
Provide complete coverage of PMIx data types in the opal transition layer, printing an OPAL_ERROR_LOG where we don't support one so we can see what is missing in the MTT tests. I've been unable to reproduce them locally. Signed-off-by: Ralph Castain <[email protected]>
1 parent 0316e7b commit b0fd430

File tree

2 files changed

+183
-6
lines changed

2 files changed

+183
-6
lines changed

opal/mca/pmix/pmix2x/pmix2x.c

Lines changed: 179 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -949,9 +949,9 @@ int pmix2x_value_unload(opal_value_t *kv,
949949
kv->type = OPAL_STATUS;
950950
kv->data.status = pmix2x_convert_rc(v->data.status);
951951
break;
952-
case PMIX_PROC_RANK:
953-
kv->type = OPAL_VPID;
954-
kv->data.name.vpid = pmix2x_convert_rank(v->data.rank);
952+
case PMIX_VALUE:
953+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
954+
rc = OPAL_ERR_NOT_SUPPORTED;
955955
break;
956956
case PMIX_PROC:
957957
kv->type = OPAL_NAME;
@@ -971,6 +971,18 @@ int pmix2x_value_unload(opal_value_t *kv,
971971
}
972972
kv->data.name.vpid = pmix2x_convert_rank(v->data.proc->rank);
973973
break;
974+
case PMIX_INFO:
975+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
976+
rc = OPAL_ERR_NOT_SUPPORTED;
977+
break;
978+
case PMIX_PDATA:
979+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
980+
rc = OPAL_ERR_NOT_SUPPORTED;
981+
break;
982+
case PMIX_BUFFER:
983+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
984+
rc = OPAL_ERR_NOT_SUPPORTED;
985+
break;
974986
case PMIX_BYTE_OBJECT:
975987
kv->type = OPAL_BYTE_OBJECT;
976988
if (NULL != v->data.bo.bytes && 0 < v->data.bo.size) {
@@ -982,10 +994,22 @@ int pmix2x_value_unload(opal_value_t *kv,
982994
kv->data.bo.size = 0;
983995
}
984996
break;
997+
case PMIX_KVAL:
998+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
999+
rc = OPAL_ERR_NOT_SUPPORTED;
1000+
break;
1001+
case PMIX_MODEX:
1002+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1003+
rc = OPAL_ERR_NOT_SUPPORTED;
1004+
break;
9851005
case PMIX_PERSIST:
9861006
kv->type = OPAL_PERSIST;
9871007
kv->data.uint8 = pmix2x_convert_persist(v->data.persist);
9881008
break;
1009+
case PMIX_POINTER:
1010+
kv->type = OPAL_PTR;
1011+
kv->data.ptr = v->data.ptr;
1012+
break;
9891013
case PMIX_SCOPE:
9901014
kv->type = OPAL_SCOPE;
9911015
kv->data.uint8 = pmix2x_convert_scope(v->data.scope);
@@ -994,15 +1018,54 @@ int pmix2x_value_unload(opal_value_t *kv,
9941018
kv->type = OPAL_DATA_RANGE;
9951019
kv->data.uint8 = pmix2x_convert_range(v->data.range);
9961020
break;
1021+
case PMIX_COMMAND:
1022+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1023+
rc = OPAL_ERR_NOT_SUPPORTED;
1024+
break;
1025+
case PMIX_INFO_DIRECTIVES:
1026+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1027+
rc = OPAL_ERR_NOT_SUPPORTED;
1028+
break;
1029+
case PMIX_DATA_TYPE:
1030+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1031+
rc = OPAL_ERR_NOT_SUPPORTED;
1032+
break;
9971033
case PMIX_PROC_STATE:
9981034
kv->type = OPAL_PROC_STATE;
9991035
/* the OPAL layer doesn't have any concept of proc state,
10001036
* so the ORTE layer is responsible for converting it */
10011037
memcpy(&kv->data.uint8, &v->data.state, sizeof(uint8_t));
10021038
break;
1003-
case PMIX_POINTER:
1004-
kv->type = OPAL_PTR;
1005-
kv->data.ptr = v->data.ptr;
1039+
case PMIX_PROC_INFO:
1040+
kv->type = OPAL_PROC_INFO;
1041+
if (NULL == v->data.pinfo) {
1042+
rc = OPAL_ERR_BAD_PARAM;
1043+
break;
1044+
}
1045+
/* see if this job is in our list of known nspaces */
1046+
found = false;
1047+
OPAL_LIST_FOREACH(job, &mca_pmix_pmix2x_component.jobids, opal_pmix2x_jobid_trkr_t) {
1048+
if (0 == strncmp(job->nspace, v->data.pinfo->proc.nspace, PMIX_MAX_NSLEN)) {
1049+
kv->data.pinfo.name.jobid = job->jobid;
1050+
found = true;
1051+
break;
1052+
}
1053+
}
1054+
if (!found) {
1055+
if (OPAL_SUCCESS != (rc = opal_convert_string_to_jobid(&kv->data.pinfo.name.jobid, v->data.pinfo->proc.nspace))) {
1056+
return pmix2x_convert_opalrc(rc);
1057+
}
1058+
}
1059+
kv->data.pinfo.name.vpid = pmix2x_convert_rank(v->data.pinfo->proc.rank);
1060+
if (NULL != v->data.pinfo->hostname) {
1061+
kv->data.pinfo.hostname = strdup(v->data.pinfo->hostname);
1062+
}
1063+
if (NULL != v->data.pinfo->executable_name) {
1064+
kv->data.pinfo.executable_name = strdup(v->data.pinfo->executable_name);
1065+
}
1066+
kv->data.pinfo.pid = v->data.pinfo->pid;
1067+
kv->data.pinfo.exit_code = v->data.pinfo->exit_code;
1068+
kv->data.pinfo.state = pmix2x_convert_state(v->data.pinfo->state);
10061069
break;
10071070
case PMIX_DATA_ARRAY:
10081071
if (NULL == v->data.darray || NULL == v->data.darray->array) {
@@ -1029,6 +1092,27 @@ int pmix2x_value_unload(opal_value_t *kv,
10291092
}
10301093
}
10311094
break;
1095+
case PMIX_PROC_RANK:
1096+
kv->type = OPAL_VPID;
1097+
kv->data.name.vpid = pmix2x_convert_rank(v->data.rank);
1098+
break;
1099+
case PMIX_QUERY:
1100+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1101+
rc = OPAL_ERR_NOT_SUPPORTED;
1102+
break;
1103+
case PMIX_COMPRESSED_STRING:
1104+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1105+
rc = OPAL_ERR_NOT_SUPPORTED;
1106+
break;
1107+
case PMIX_ALLOC_DIRECTIVE:
1108+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1109+
rc = OPAL_ERR_NOT_SUPPORTED;
1110+
break;
1111+
case PMIX_INFO_ARRAY:
1112+
OPAL_ERROR_LOG(OPAL_ERR_NOT_SUPPORTED);
1113+
rc = OPAL_ERR_NOT_SUPPORTED;
1114+
break;
1115+
10321116
default:
10331117
/* silence warnings */
10341118
rc = OPAL_ERROR;
@@ -1418,6 +1502,95 @@ opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_alloc_directive_t dir)
14181502
}
14191503
}
14201504

1505+
/**
 * Translate a PMIx process state into the integer state code used on
 * the OPAL side of the transition layer.
 *
 * @param state  PMIx process state to translate
 * @return       the matching integer code, or 0 (the "undef" code) when
 *               the PMIx state has no entry in the table below
 *
 * NOTE(review): the integer codes look like the ORTE proc-state values,
 * but nothing visible here establishes that - confirm against the
 * runtime's definitions before relying on it.
 */
int pmix2x_convert_state(pmix_proc_state_t state)
{
    /* PMIx state -> integer code pairs. PREPPED and LAUNCH_UNDERWAY
     * intentionally share code 1. */
    static const struct {
        pmix_proc_state_t pmix_state;
        int opal_code;
    } state_map[] = {
        { PMIX_PROC_STATE_UNDEF,             0 },
        { PMIX_PROC_STATE_PREPPED,           1 },
        { PMIX_PROC_STATE_LAUNCH_UNDERWAY,   1 },
        { PMIX_PROC_STATE_RESTART,           2 },
        { PMIX_PROC_STATE_TERMINATE,         3 },
        { PMIX_PROC_STATE_RUNNING,           4 },
        { PMIX_PROC_STATE_CONNECTED,         5 },
        { PMIX_PROC_STATE_UNTERMINATED,     15 },
        { PMIX_PROC_STATE_TERMINATED,       20 },
        { PMIX_PROC_STATE_KILLED_BY_CMD,    51 },
        { PMIX_PROC_STATE_ABORTED,          52 },
        { PMIX_PROC_STATE_FAILED_TO_START,  53 },
        { PMIX_PROC_STATE_ABORTED_BY_SIG,   54 },
        { PMIX_PROC_STATE_TERM_WO_SYNC,     55 },
        { PMIX_PROC_STATE_COMM_FAILED,      56 },
        { PMIX_PROC_STATE_CALLED_ABORT,     58 },
        { PMIX_PROC_STATE_MIGRATING,        60 },
        { PMIX_PROC_STATE_CANNOT_RESTART,   61 },
        { PMIX_PROC_STATE_TERM_NON_ZERO,    62 },
        { PMIX_PROC_STATE_FAILED_TO_LAUNCH, 63 }
    };
    unsigned int idx;

    for (idx = 0; idx < sizeof(state_map) / sizeof(state_map[0]); ++idx) {
        if (state_map[idx].pmix_state == state) {
            return state_map[idx].opal_code;
        }
    }

    /* unrecognized PMIx state: report it as undefined */
    return 0;
}
1551+
1552+
pmix_proc_state_t pmix2x_convert_opalstate(int state)
1553+
{
1554+
switch(state) {
1555+
case 0:
1556+
return PMIX_PROC_STATE_UNDEF;
1557+
case 1:
1558+
return PMIX_PROC_STATE_LAUNCH_UNDERWAY;
1559+
case 2:
1560+
return PMIX_PROC_STATE_RESTART;
1561+
case 3:
1562+
return PMIX_PROC_STATE_TERMINATE;
1563+
case 4:
1564+
return PMIX_PROC_STATE_RUNNING;
1565+
case 5:
1566+
return PMIX_PROC_STATE_CONNECTED;
1567+
case 51:
1568+
return PMIX_PROC_STATE_KILLED_BY_CMD;
1569+
case 52:
1570+
return PMIX_PROC_STATE_ABORTED;
1571+
case 53:
1572+
return PMIX_PROC_STATE_FAILED_TO_START;
1573+
case 54:
1574+
return PMIX_PROC_STATE_ABORTED_BY_SIG;
1575+
case 55:
1576+
return PMIX_PROC_STATE_TERM_WO_SYNC;
1577+
case 56:
1578+
return PMIX_PROC_STATE_COMM_FAILED;
1579+
case 58:
1580+
return PMIX_PROC_STATE_CALLED_ABORT;
1581+
case 59:
1582+
return PMIX_PROC_STATE_MIGRATING;
1583+
case 61:
1584+
return PMIX_PROC_STATE_CANNOT_RESTART;
1585+
case 62:
1586+
return PMIX_PROC_STATE_TERM_NON_ZERO;
1587+
case 63:
1588+
return PMIX_PROC_STATE_FAILED_TO_LAUNCH;
1589+
default:
1590+
return PMIX_PROC_STATE_UNDEF;
1591+
}
1592+
}
1593+
14211594
/**** INSTANTIATE INTERNAL CLASSES ****/
14221595
OBJ_CLASS_INSTANCE(opal_pmix2x_jobid_trkr_t,
14231596
opal_list_item_t,

opal/mca/pmix/pmix2x/pmix2x.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ OPAL_MODULE_DECLSPEC opal_pmix_alloc_directive_t pmix2x_convert_allocdir(pmix_al
339339

340340
OPAL_MODULE_DECLSPEC char* pmix2x_convert_jobid(opal_jobid_t jobid);
341341

342+
OPAL_MODULE_DECLSPEC int pmix2x_convert_state(pmix_proc_state_t state);
343+
344+
OPAL_MODULE_DECLSPEC pmix_proc_state_t pmix2x_convert_opalstate(int state);
345+
342346
END_C_DECLS
343347

344348
#endif /* MCA_PMIX_EXTERNAL_H */

0 commit comments

Comments
 (0)