Skip to content

Commit 04ec013

Browse files
authored
Merge pull request #4847 from alex-mikheev/topic/oshmem_group_cache_refactor
oshmem: refactor group cache
2 parents 3235243 + 292d185 commit 04ec013

21 files changed: +626 / -1088 lines changed

oshmem/proc/proc.c

Lines changed: 153 additions & 100 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013 Mellanox Technologies, Inc.
2+
* Copyright (c) 2013-2018 Mellanox Technologies, Inc.
33
* All rights reserved.
44
* Copyright (c) 2014-2016 Research Organization for Information Science
55
* and Technology (RIST). All rights reserved.
@@ -17,6 +17,7 @@
1717
#include "oshmem/constants.h"
1818
#include "oshmem/runtime/runtime.h"
1919
#include "oshmem/mca/scoll/base/base.h"
20+
#include "oshmem/proc/proc_group_cache.h"
2021

2122
#ifdef HAVE_STRINGS_H
2223
#include <strings.h>
@@ -65,40 +66,67 @@ oshmem_group_t* oshmem_group_null = NULL;
6566

6667
OBJ_CLASS_INSTANCE(oshmem_group_t, opal_object_t, NULL, NULL);
6768

69+
static void oshmem_proc_group_destroy_internal(oshmem_group_t* group,
70+
int scoll_unselect);
71+
6872
int oshmem_proc_group_init(void)
6973
{
74+
int rc;
75+
76+
rc = oshmem_group_cache_init();
77+
if (OSHMEM_SUCCESS != rc) {
78+
return rc;
79+
}
80+
7081
/* Setup communicator array */
7182
OBJ_CONSTRUCT(&oshmem_group_array, opal_pointer_array_t);
72-
if (OPAL_SUCCESS
73-
!= opal_pointer_array_init(&oshmem_group_array,
74-
0,
75-
ORTE_GLOBAL_ARRAY_MAX_SIZE,
76-
1)) {
77-
return OSHMEM_ERROR;
83+
84+
rc = opal_pointer_array_init(&oshmem_group_array, 0,
85+
ORTE_GLOBAL_ARRAY_MAX_SIZE, 1);
86+
if (OPAL_SUCCESS != rc) {
87+
goto err1;
7888
}
7989

8090
/* Setup SHMEM_GROUP_ALL */
81-
if (NULL
82-
== (oshmem_group_all =
83-
oshmem_proc_group_create(0,
84-
1,
85-
ompi_comm_size(oshmem_comm_world)))) {
86-
return OSHMEM_ERROR;
91+
oshmem_group_all = oshmem_proc_group_create(0, 1, ompi_comm_size(oshmem_comm_world));
92+
if (NULL == oshmem_group_all) {
93+
goto err2;
8794
}
8895

8996
/* Setup SHMEM_GROUP_SELF */
90-
if (NULL
91-
== (oshmem_group_self = oshmem_proc_group_create(oshmem_proc_pe(oshmem_proc_local()),
92-
0,
93-
1))) {
94-
oshmem_proc_group_destroy(oshmem_group_self);
95-
return OSHMEM_ERROR;
97+
oshmem_group_self = oshmem_proc_group_create(oshmem_proc_pe(oshmem_proc_local()), 0, 1);
98+
if (NULL == oshmem_group_self) {
99+
goto err3;
96100
}
97101

98102
/* Setup SHMEM_GROUP_NULL */
99103
oshmem_group_null = NULL;
100104

101105
return OSHMEM_SUCCESS;
106+
107+
err3:
108+
oshmem_proc_group_destroy_internal(oshmem_group_all, 1);
109+
err2:
110+
OBJ_DESTRUCT(&oshmem_group_array);
111+
err1:
112+
oshmem_group_cache_destroy();
113+
return OSHMEM_ERROR;
114+
}
115+
116+
void oshmem_proc_group_finalize_scoll(void)
117+
{
118+
int max, i;
119+
oshmem_group_t *group;
120+
121+
/* Check whether we have some left */
122+
max = opal_pointer_array_get_size(&oshmem_group_array);
123+
for (i = 0; i < max; i++) {
124+
group = (oshmem_group_t *) opal_pointer_array_get_item(&oshmem_group_array,
125+
i);
126+
if (NULL != group) {
127+
mca_scoll_base_group_unselect(group);
128+
}
129+
}
102130
}
103131

104132
int oshmem_proc_group_finalize(void)
@@ -114,18 +142,17 @@ int oshmem_proc_group_finalize(void)
114142
i);
115143
if (NULL != group) {
116144
/* Group has not been freed before finalize */
117-
oshmem_proc_group_destroy(group);
145+
oshmem_proc_group_destroy_internal(group, 0);
118146
}
119147
}
120148

121149
OBJ_DESTRUCT(&oshmem_group_array);
122150

151+
oshmem_group_cache_destroy();
123152
return OSHMEM_SUCCESS;
124153
}
125154

126-
oshmem_group_t* oshmem_proc_group_create(int pe_start,
127-
int pe_stride,
128-
size_t pe_size)
155+
oshmem_group_t* oshmem_proc_group_create(int pe_start, int pe_stride, int pe_size)
129156
{
130157
int cur_pe, count_pe;
131158
int i;
@@ -135,107 +162,133 @@ oshmem_group_t* oshmem_proc_group_create(int pe_start,
135162

136163
assert(oshmem_proc_local());
137164

165+
group = oshmem_group_cache_find(pe_start, pe_stride, pe_size);
166+
if (NULL != group) {
167+
return group;
168+
}
169+
138170
group = OBJ_NEW(oshmem_group_t);
171+
if (NULL == group) {
172+
return NULL;
173+
}
139174

140-
if (group) {
141-
cur_pe = 0;
142-
count_pe = 0;
175+
cur_pe = 0;
176+
count_pe = 0;
143177

144-
OPAL_THREAD_LOCK(&oshmem_proc_lock);
178+
OPAL_THREAD_LOCK(&oshmem_proc_lock);
179+
180+
/* allocate an array */
181+
proc_array = (ompi_proc_t**) malloc(pe_size * sizeof(ompi_proc_t*));
182+
if (NULL == proc_array) {
183+
OBJ_RELEASE(group);
184+
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
185+
return NULL ;
186+
}
145187

146-
/* allocate an array */
147-
proc_array = (ompi_proc_t**) malloc(pe_size * sizeof(ompi_proc_t*));
148-
if (NULL == proc_array) {
188+
group->my_pe = oshmem_proc_pe(oshmem_proc_local());
189+
group->is_member = 0;
190+
for (i = 0 ; i < ompi_comm_size(oshmem_comm_world) ; i++) {
191+
proc = oshmem_proc_find(i);
192+
if (NULL == proc) {
193+
opal_output(0,
194+
"Error: Can not find proc object for pe = %d", i);
195+
free(proc_array);
149196
OBJ_RELEASE(group);
150197
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
151-
return NULL ;
198+
return NULL;
152199
}
153-
154-
group->my_pe = oshmem_proc_pe(oshmem_proc_local());
155-
group->is_member = 0;
156-
for (i = 0 ; i < ompi_comm_size(oshmem_comm_world) ; i++) {
157-
proc = oshmem_proc_find(i);
158-
if (NULL == proc) {
159-
opal_output(0,
160-
"Error: Can not find proc object for pe = %d", i);
161-
free(proc_array);
162-
OBJ_RELEASE(group);
163-
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
164-
return NULL;
165-
}
166-
if (count_pe >= (int) pe_size) {
167-
break;
168-
} else if ((cur_pe >= pe_start)
169-
&& ((pe_stride == 0)
170-
|| (((cur_pe - pe_start) % pe_stride) == 0))) {
171-
proc_array[count_pe++] = proc;
172-
if (oshmem_proc_pe(proc) == group->my_pe)
173-
group->is_member = 1;
174-
}
175-
cur_pe++;
200+
if (count_pe >= (int) pe_size) {
201+
break;
202+
} else if ((cur_pe >= pe_start)
203+
&& ((pe_stride == 0)
204+
|| (((cur_pe - pe_start) % pe_stride) == 0))) {
205+
proc_array[count_pe++] = proc;
206+
if (oshmem_proc_pe(proc) == group->my_pe)
207+
group->is_member = 1;
176208
}
177-
group->proc_array = proc_array;
178-
group->proc_count = (int) count_pe;
179-
group->ompi_comm = NULL;
180-
181-
/* Prepare peers list */
182-
OBJ_CONSTRUCT(&(group->peer_list), opal_list_t);
183-
{
184-
orte_namelist_t *peer = NULL;
185-
186-
for (i = 0; i < group->proc_count; i++) {
187-
peer = OBJ_NEW(orte_namelist_t);
188-
peer->name.jobid = OSHMEM_PROC_JOBID(group->proc_array[i]);
189-
peer->name.vpid = OSHMEM_PROC_VPID(group->proc_array[i]);
190-
opal_list_append(&(group->peer_list), &peer->super);
191-
}
209+
cur_pe++;
210+
}
211+
group->proc_array = proc_array;
212+
group->proc_count = (int) count_pe;
213+
group->ompi_comm = NULL;
214+
215+
/* Prepare peers list */
216+
OBJ_CONSTRUCT(&(group->peer_list), opal_list_t);
217+
{
218+
orte_namelist_t *peer = NULL;
219+
220+
for (i = 0; i < group->proc_count; i++) {
221+
peer = OBJ_NEW(orte_namelist_t);
222+
peer->name.jobid = OSHMEM_PROC_JOBID(group->proc_array[i]);
223+
peer->name.vpid = OSHMEM_PROC_VPID(group->proc_array[i]);
224+
opal_list_append(&(group->peer_list), &peer->super);
192225
}
193-
group->id = opal_pointer_array_add(&oshmem_group_array, group);
226+
}
227+
group->id = opal_pointer_array_add(&oshmem_group_array, group);
194228

195-
memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t));
229+
memset(&group->g_scoll, 0, sizeof(mca_scoll_base_group_scoll_t));
196230

197-
if (OSHMEM_SUCCESS != mca_scoll_base_select(group)) {
198-
opal_output(0,
199-
"Error: No collective modules are available: group is not created, returning NULL");
200-
oshmem_proc_group_destroy(group);
201-
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
202-
return NULL;
203-
}
231+
if (OSHMEM_SUCCESS != mca_scoll_base_select(group)) {
232+
opal_output(0,
233+
"Error: No collective modules are available: group is not created, returning NULL");
234+
oshmem_proc_group_destroy_internal(group, 0);
204235
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
236+
return NULL;
205237
}
206238

239+
if (OSHMEM_SUCCESS != oshmem_group_cache_insert(group, pe_start,
240+
pe_stride, pe_size)) {
241+
oshmem_proc_group_destroy_internal(group, 1);
242+
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
243+
return NULL;
244+
}
245+
246+
OPAL_THREAD_UNLOCK(&oshmem_proc_lock);
207247
return group;
208248
}
209249

210-
void oshmem_proc_group_destroy(oshmem_group_t* group)
250+
static void
251+
oshmem_proc_group_destroy_internal(oshmem_group_t* group, int scoll_unselect)
211252
{
212-
if (group) {
253+
if (NULL == group) {
254+
return;
255+
}
256+
257+
if (scoll_unselect) {
213258
mca_scoll_base_group_unselect(group);
259+
}
214260

215-
/* Destroy proc array */
216-
if (group->proc_array) {
217-
free(group->proc_array);
218-
}
261+
/* Destroy proc array */
262+
if (group->proc_array) {
263+
free(group->proc_array);
264+
}
219265

220-
/* Destroy peer list */
221-
{
222-
opal_list_item_t *item;
266+
/* Destroy peer list */
267+
{
268+
opal_list_item_t *item;
223269

224-
while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) {
225-
/* destruct the item (we constructed it), then free the memory chunk */
226-
OBJ_RELEASE(item);
227-
}
228-
OBJ_DESTRUCT(&(group->peer_list));
270+
while (NULL != (item = opal_list_remove_first(&(group->peer_list)))) {
271+
/* destruct the item (we constructed it), then free the memory chunk */
272+
OBJ_RELEASE(item);
229273
}
274+
OBJ_DESTRUCT(&(group->peer_list));
275+
}
230276

231-
/* reset the oshmem_group_array entry - make sure that the
232-
* entry is in the table */
233-
if (NULL
234-
!= opal_pointer_array_get_item(&oshmem_group_array,
235-
group->id)) {
236-
opal_pointer_array_set_item(&oshmem_group_array, group->id, NULL );
237-
}
277+
/* reset the oshmem_group_array entry - make sure that the
278+
* entry is in the table */
279+
if (NULL
280+
!= opal_pointer_array_get_item(&oshmem_group_array,
281+
group->id)) {
282+
opal_pointer_array_set_item(&oshmem_group_array, group->id, NULL );
283+
}
238284

239-
OBJ_RELEASE(group);
285+
OBJ_RELEASE(group);
286+
}
287+
288+
void oshmem_proc_group_destroy(oshmem_group_t* group)
289+
{
290+
if (oshmem_group_cache_enabled()) {
291+
return;
240292
}
293+
oshmem_proc_group_destroy_internal(group, 1);
241294
}

0 commit comments

Comments
 (0)