Skip to content

Commit d28ee62

Browse files
committed
Update the PSM and PSM2 MTLs to detect the entries created by the drivers for
Intel TrueScale and Intel Omni-Path, and to detect a link in the ACTIVE state. This fix addresses the scenario reported in the OMPI users email linked below, including hardware formerly named QLogic IB, now Intel TrueScale. Given the nature of the PSM/PSM2 MTLs, this fix also applies to Omni-Path: https://www.open-mpi.org/community/lists/users/2016/04/29018.php
1 parent 44d95cb commit d28ee62

File tree

2 files changed

+69
-7
lines changed

2 files changed

+69
-7
lines changed

ompi/mca/mtl/psm/mtl_psm_component.c

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <sys/types.h>
3838
#include <sys/stat.h>
3939
#include <unistd.h>
40+
#include <glob.h>
4041

4142
static int param_priority;
4243

@@ -185,12 +186,41 @@ ompi_mtl_psm_component_open(void)
185186
}
186187

187188
/* Component available only if Truescale hardware is present */
188-
if (0 == stat("/dev/ipath", &st)) {
189-
return OMPI_SUCCESS;
189+
if (0 != stat("/dev/ipath", &st)) {
190+
return OPAL_ERR_NOT_AVAILABLE;
191+
}
192+
193+
/* Component available only if at least one qib port is ACTIVE */
194+
bool foundOnlineQibPort = false;
195+
size_t i;
196+
char portState[128];
197+
FILE *devFile;
198+
glob_t globbuf;
199+
globbuf.gl_offs = 0;
200+
if (glob("/sys/class/infiniband/qib*/ports/*/state",
201+
GLOB_DOOFFS, NULL, &globbuf) != 0) {
202+
return OPAL_ERR_NOT_AVAILABLE;
203+
}
204+
205+
for (i=0;i < globbuf.gl_pathc; i++) {
206+
devFile = fopen(globbuf.gl_pathv[i], "r");
207+
fgets(portState, sizeof(portState), devFile);
208+
fclose(devFile);
209+
210+
if (strstr(portState, "ACTIVE") != NULL) {
211+
/* Found at least one ACTIVE port */
212+
foundOnlineQibPort = true;
213+
break;
214+
}
190215
}
191-
else {
216+
217+
globfree(&globbuf);
218+
219+
if (!foundOnlineQibPort) {
192220
return OPAL_ERR_NOT_AVAILABLE;
193221
}
222+
223+
return OMPI_SUCCESS;
194224
}
195225

196226
static int

ompi/mca/mtl/psm2/mtl_psm2_component.c

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <sys/types.h>
3838
#include <sys/stat.h>
3939
#include <unistd.h>
40+
#include <glob.h>
4041

4142
static int param_priority;
4243

@@ -101,15 +102,46 @@ ompi_mtl_psm2_component_register(void)
101102
static int
102103
ompi_mtl_psm2_component_open(void)
103104
{
104-
struct stat st;
105+
glob_t globbuf;
106+
globbuf.gl_offs = 0;
105107

106108
/* Component available only if Omni-Path hardware is present */
107-
if (0 == stat("/dev/hfi1", &st)) {
108-
return OMPI_SUCCESS;
109+
if ((glob("/dev/hfi1_[0-9]", GLOB_DOOFFS, NULL, &globbuf) != 0) &&
110+
(glob("/dev/hfi1_[0-9][0-9]", GLOB_APPEND, NULL, &globbuf) != 0)) {
111+
return OPAL_ERR_NOT_AVAILABLE;
112+
}
113+
114+
globfree(&globbuf);
115+
116+
/* Component available only if at least one hfi1 port is ACTIVE */
117+
bool foundOnlineHfi1Port = false;
118+
size_t i;
119+
char portState[128];
120+
FILE *devFile;
121+
if (glob("/sys/class/infiniband/hfi1_*/ports/*/state",
122+
GLOB_DOOFFS, NULL, &globbuf) != 0) {
123+
return OPAL_ERR_NOT_AVAILABLE;
124+
}
125+
126+
for (i=0;i < globbuf.gl_pathc; i++) {
127+
devFile = fopen(globbuf.gl_pathv[i], "r");
128+
fgets(portState, sizeof(portState), devFile);
129+
fclose(devFile);
130+
131+
if (strstr(portState, "ACTIVE") != NULL) {
132+
/* Found at least one ACTIVE port */
133+
foundOnlineHfi1Port = true;
134+
break;
135+
}
109136
}
110-
else {
137+
138+
globfree(&globbuf);
139+
140+
if (!foundOnlineHfi1Port) {
111141
return OPAL_ERR_NOT_AVAILABLE;
112142
}
143+
144+
return OMPI_SUCCESS;
113145
}
114146

115147
static int

0 commit comments

Comments
 (0)