Skip to content

Commit fca68b0

Browse files
Author: Ralph Castain
Merge pull request #3934 from rhc54/topic/singleton
Fix the isolated pmix component. Cleanup the ess/singleton component …
2 parents 6cbea90 + 543c16b commit fca68b0

File tree

7 files changed, with 140 additions and 96 deletions.

opal/mca/pmix/isolated/pmix_isolated.c

Lines changed: 74 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -122,12 +122,18 @@ static int isolated_init(opal_list_t *ilist)
122122
{
123123
int rc;
124124
opal_value_t kv;
125+
opal_process_name_t wildcard;
125126

126-
if (0 < isolated_init_count) {
127+
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
128+
++isolated_init_count;
129+
if (1 < isolated_init_count) {
130+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
127131
return OPAL_SUCCESS;
128132
}
129133

130-
++isolated_init_count;
134+
135+
wildcard.jobid = 1;
136+
wildcard.vpid = OPAL_VPID_WILDCARD;
131137

132138
/* store our name in the opal_proc_t so that
133139
* debug messages will make sense - an upper
@@ -178,6 +184,17 @@ static int isolated_init(opal_list_t *ilist)
178184
}
179185
OBJ_DESTRUCT(&kv);
180186

187+
OBJ_CONSTRUCT(&kv, opal_value_t);
188+
kv.key = strdup(OPAL_PMIX_MAX_PROCS);
189+
kv.type = OPAL_UINT32;
190+
kv.data.uint32 = 1;
191+
if (OPAL_SUCCESS != (rc = opal_pmix_base_store(&wildcard, &kv))) {
192+
OPAL_ERROR_LOG(rc);
193+
OBJ_DESTRUCT(&kv);
194+
goto err_exit;
195+
}
196+
OBJ_DESTRUCT(&kv);
197+
181198
OBJ_CONSTRUCT(&kv, opal_value_t);
182199
kv.key = strdup(OPAL_PMIX_JOBID);
183200
kv.type = OPAL_UINT32;
@@ -246,30 +263,35 @@ static int isolated_init(opal_list_t *ilist)
246263
}
247264
OBJ_DESTRUCT(&kv);
248265

266+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
249267
return OPAL_SUCCESS;
250268

251269
err_exit:
270+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
252271
return rc;
253272
}
254273

255274
static int isolated_fini(void)
256275
{
276+
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
277+
--opal_pmix_base.initialized;
278+
257279
if (0 == isolated_init_count) {
258-
return OPAL_SUCCESS;
280+
opal_pmix_base_hash_finalize();
259281
}
260282

261-
if (0 != --isolated_init_count) {
262-
return OPAL_SUCCESS;
263-
}
264-
opal_pmix_base_hash_finalize();
283+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
265284
return OPAL_SUCCESS;
266285
}
267286

268287
static int isolated_initialized(void)
269288
{
289+
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
270290
if (0 < isolated_init_count) {
291+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
271292
return 1;
272293
}
294+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
273295
return 0;
274296
}
275297

@@ -325,13 +347,16 @@ static int isolated_put(opal_pmix_scope_t scope,
325347
{
326348
int rc;
327349

328-
opal_output_verbose(10, opal_pmix_base_framework.framework_output,
329-
"%s pmix:isolated isolated_put key %s scope %d\n",
350+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
351+
"%s pmix:isolated isolated_put key %s scope %d",
330352
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kv->key, scope);
331353

332-
if (!isolated_init_count) {
354+
OPAL_PMIX_ACQUIRE_THREAD(&opal_pmix_base.lock);
355+
if (0 == isolated_init_count) {
356+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
333357
return OPAL_ERROR;
334358
}
359+
OPAL_PMIX_RELEASE_THREAD(&opal_pmix_base.lock);
335360

336361
rc = opal_pmix_base_store(&isolated_pname, kv);
337362

@@ -340,18 +365,31 @@ static int isolated_put(opal_pmix_scope_t scope,
340365

341366
static int isolated_commit(void)
342367
{
368+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
369+
"%s pmix:isolated isolated commit",
370+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
371+
343372
return OPAL_SUCCESS;
344373
}
345374

346375
static int isolated_fence(opal_list_t *procs, int collect_data)
347376
{
377+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
378+
"%s pmix:isolated isolated fence",
379+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
348380
return OPAL_SUCCESS;
349381
}
350382

351383
static int isolated_fence_nb(opal_list_t *procs, int collect_data,
352384
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
353385
{
354-
return OPAL_ERR_NOT_IMPLEMENTED;
386+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
387+
"%s pmix:isolated isolated fence_nb",
388+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
389+
if (NULL != cbfunc) {
390+
cbfunc(OPAL_SUCCESS, cbdata);
391+
}
392+
return OPAL_SUCCESS;
355393
}
356394

357395
static int isolated_get(const opal_process_name_t *id,
@@ -383,39 +421,60 @@ static int isolated_get(const opal_process_name_t *id,
383421
static int isolated_get_nb(const opal_process_name_t *id, const char *key,
384422
opal_list_t *info, opal_pmix_value_cbfunc_t cbfunc, void *cbdata)
385423
{
424+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
425+
"%s pmix:isolated isolated get_nb",
426+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
386427
return OPAL_ERR_NOT_IMPLEMENTED;
387428
}
388429

389430
static int isolated_publish(opal_list_t *info)
390431
{
432+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
433+
"%s pmix:isolated isolated publish",
434+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
391435
return OPAL_ERR_NOT_SUPPORTED;
392436
}
393437

394438
static int isolated_publish_nb(opal_list_t *info,
395439
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
396440
{
441+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
442+
"%s pmix:isolated isolated publish_nb",
443+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
397444
return OPAL_ERR_NOT_SUPPORTED;
398445
}
399446

400447
static int isolated_lookup(opal_list_t *data, opal_list_t *info)
401448
{
449+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
450+
"%s pmix:isolated isolated lookup",
451+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
402452
return OPAL_ERR_NOT_SUPPORTED;
403453
}
404454

405455
static int isolated_lookup_nb(char **keys, opal_list_t *info,
406456
opal_pmix_lookup_cbfunc_t cbfunc, void *cbdata)
407457
{
458+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
459+
"%s pmix:isolated isolated lookup_nb",
460+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
408461
return OPAL_ERR_NOT_SUPPORTED;
409462
}
410463

411464
static int isolated_unpublish(char **keys, opal_list_t *info)
412465
{
466+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
467+
"%s pmix:isolated isolated unpublish",
468+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
413469
return OPAL_ERR_NOT_SUPPORTED;
414470
}
415471

416472
static int isolated_unpublish_nb(char **keys, opal_list_t *info,
417473
opal_pmix_op_cbfunc_t cbfunc, void *cbdata)
418474
{
475+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
476+
"%s pmix:isolated isolated unpublish_nb",
477+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
419478
return OPAL_ERR_NOT_SUPPORTED;
420479
}
421480

@@ -427,6 +486,10 @@ static const char *isolated_get_version(void)
427486
static int isolated_store_local(const opal_process_name_t *proc,
428487
opal_value_t *val)
429488
{
489+
opal_output_verbose(2, opal_pmix_base_framework.framework_output,
490+
"%s pmix:isolated isolated store_local",
491+
OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
492+
430493
opal_pmix_base_store(proc, val);
431494

432495
return OPAL_SUCCESS;

orte/mca/ess/singleton/ess_singleton_module.c

Lines changed: 8 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ static int rte_init(void)
8484
{
8585
int rc, ret;
8686
char *error = NULL;
87-
opal_value_t *kv;
88-
char *val = NULL;
8987
int u32, *u32ptr;
9088
uint16_t u16, *u16ptr;
9189
orte_process_name_t name;
@@ -159,7 +157,7 @@ static int rte_init(void)
159157
} else if (NULL != getenv("SINGULARITY_CONTAINER") ||
160158
mca_ess_singleton_component.isolated) {
161159
/* ensure we use the isolated pmix component */
162-
opal_setenv (OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
160+
opal_setenv(OPAL_MCA_PREFIX"pmix", "isolated", true, &environ);
163161
} else {
164162
/* we want to use PMIX_NAMESPACE that will be sent by the hnp as a jobid */
165163
opal_setenv(OPAL_MCA_PREFIX"orte_launch", "1", true, &environ);
@@ -169,7 +167,7 @@ static int rte_init(void)
169167
return rc;
170168
}
171169
/* our name was given to us by the HNP */
172-
opal_setenv (OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
170+
opal_setenv(OPAL_MCA_PREFIX"pmix", "^s1,s2,cray,isolated", true, &environ);
173171
}
174172

175173
/* get an async event base - we use the opal_async one so
@@ -265,69 +263,13 @@ static int rte_init(void)
265263
* we can use the jobfam and stepid as unique keys
266264
* because they are unique values assigned by the RM
267265
*/
268-
assert (NULL != getenv(OPAL_MCA_PREFIX"orte_precondition_transports"));
269-
270-
/* retrieve our topology */
271-
OPAL_MODEX_RECV_VALUE(ret, OPAL_PMIX_LOCAL_TOPO,
272-
&name, &val, OPAL_STRING);
273-
if (OPAL_SUCCESS == ret && NULL != val) {
274-
/* load the topology */
275-
if (0 != hwloc_topology_init(&opal_hwloc_topology)) {
276-
ret = OPAL_ERROR;
277-
free(val);
278-
error = "setting topology";
279-
goto error;
266+
if (NULL == getenv(OPAL_MCA_PREFIX"orte_precondition_transports")) {
267+
char *key;
268+
ret = orte_pre_condition_transports(NULL, &key);
269+
if (ORTE_SUCCESS == ret) {
270+
opal_setenv(OPAL_MCA_PREFIX"orte_precondition_transports", key, true, &environ);
271+
free(key);
280272
}
281-
if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val))) {
282-
ret = OPAL_ERROR;
283-
free(val);
284-
hwloc_topology_destroy(opal_hwloc_topology);
285-
error = "setting topology";
286-
goto error;
287-
}
288-
/* since we are loading this from an external source, we have to
289-
* explicitly set a flag so hwloc sets things up correctly
290-
*/
291-
if (0 != hwloc_topology_set_flags(opal_hwloc_topology,
292-
(HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM |
293-
HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM |
294-
HWLOC_TOPOLOGY_FLAG_IO_DEVICES))) {
295-
ret = OPAL_ERROR;
296-
hwloc_topology_destroy(opal_hwloc_topology);
297-
free(val);
298-
error = "setting topology";
299-
goto error;
300-
}
301-
/* now load the topology */
302-
if (0 != hwloc_topology_load(opal_hwloc_topology)) {
303-
ret = OPAL_ERROR;
304-
hwloc_topology_destroy(opal_hwloc_topology);
305-
free(val);
306-
error = "setting topology";
307-
goto error;
308-
}
309-
free(val);
310-
} else {
311-
/* it wasn't passed down to us, so go get it */
312-
if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
313-
error = "topology discovery";
314-
goto error;
315-
}
316-
/* push it into the PMIx database in case someone
317-
* tries to retrieve it so we avoid an attempt to
318-
* get it again */
319-
kv = OBJ_NEW(opal_value_t);
320-
kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
321-
kv->type = OPAL_STRING;
322-
if (0 != (ret = hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &kv->data.string, &u32))) {
323-
error = "topology export";
324-
goto error;
325-
}
326-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, kv))) {
327-
error = "topology store";
328-
goto error;
329-
}
330-
OBJ_RELEASE(kv);
331273
}
332274

333275
/* use the std app init to complete the procedure */

orte/mca/plm/base/plm_base_launch_support.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
332332
}
333333
free(key);
334334
} else {
335-
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
335+
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
336336
ORTE_ERROR_LOG(rc);
337337
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
338338
OBJ_RELEASE(caddy);
@@ -342,7 +342,7 @@ void orte_plm_base_setup_job(int fd, short args, void *cbdata)
342342
} else {
343343
/* this will also record the transport key attribute in the job object, and
344344
* adds the key envar to each app */
345-
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata))) {
345+
if (ORTE_SUCCESS != (rc = orte_pre_condition_transports(caddy->jdata, NULL))) {
346346
ORTE_ERROR_LOG(rc);
347347
ORTE_FORCED_TERMINATE(ORTE_ERROR_DEFAULT_EXIT_CODE);
348348
OBJ_RELEASE(caddy);

orte/orted/orted_main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -578,7 +578,7 @@ int orte_daemon(int argc, char *argv[])
578578
ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);
579579

580580
/* set the ORTE_JOB_TRANSPORT_KEY from the environment */
581-
orte_pre_condition_transports(jdata);
581+
orte_pre_condition_transports(jdata, NULL);
582582

583583
/* register the singleton's nspace with our PMIx server */
584584
if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {

orte/test/mpi/hellocycle.pl

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
#!/usr/bin/env perl
2+
#
3+
use strict;
4+
use warnings;
5+
use Date::Parse;
6+
7+
#
8+
$ENV{OMPI_MCA_btl} = "self";
9+
#
10+
sub prtime {
11+
my $count = shift;
12+
my $str = localtime;
13+
print "$count: $str\n";
14+
}
15+
16+
17+
my $totalcount = 5000;
18+
my $count = $totalcount;
19+
prtime($count);
20+
my $start = time();
21+
while ($count > 0) {
22+
system("./hello > /dev/null 2>&1");
23+
$count--;
24+
25+
if ($count % 1000 == 0) {
26+
prtime($count);
27+
}
28+
}
29+
prtime($count);
30+
31+
my $stop = time();
32+
my $rate = $totalcount / ($stop - $start);
33+
print "Rate: $rate\n";

0 commit comments