Skip to content

Commit abae4e9

Browse files
committed
Fix IBVERBS CI failures by probing QP creation before constructing Device
On CI runners without real RDMA hardware, rdma-core software providers let ibv_open_device/ibv_alloc_pd/ibv_create_comp_channel succeed but ibv_create_qp fails with EINVAL. Creating a gloo Device starts a background thread; after fork() in TransportMultiProcTest the thread handle is invalid, causing SIGSEGV (exit 139) in Device::~Device. Fix: probe ibverbs capability using raw APIs (through ibv_create_qp) in the test's createDevice() before constructing a gloo Device. If QP creation fails, mark IBVERBS as unavailable and return nullptr. Also moves GTEST_SKIP() out of worker threads to avoid concurrent calls racing on GTest internals (exit 134), adds a SIGSEGV backtrace handler for test debugging, and builds with RelWithDebInfo.
1 parent 2ba34a6 commit abae4e9

File tree

4 files changed

+122
-5
lines changed

4 files changed

+122
-5
lines changed

.github/workflows/build-linux.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ jobs:
7575
sudo apt-get install -y gcc g++
7676
mkdir -p build
7777
cd build
78-
cmake ../ -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_TEST=ON ${{matrix.cmake_args}} -DOPENSSL_ROOT_DIR=/opt/openssl/
78+
cmake ../ -DCMAKE_VERBOSE_MAKEFILE=ON -DBUILD_TEST=ON -DCMAKE_BUILD_TYPE=RelWithDebInfo ${{matrix.cmake_args}} -DOPENSSL_ROOT_DIR=/opt/openssl/
7979
make
8080
- name: Test
8181
run: |

gloo/test/base_test.cc

Lines changed: 74 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#include "gloo/test/base_test.h"
1010
#include "gloo/test/openssl_utils.h"
1111

12+
#if GLOO_HAVE_TRANSPORT_IBVERBS
13+
#include <infiniband/verbs.h>
14+
#endif
15+
1216
namespace gloo {
1317
namespace test {
1418

@@ -76,12 +80,81 @@ std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport) {
7680
#endif
7781
#if GLOO_HAVE_TRANSPORT_IBVERBS
7882
if (transport == Transport::IBVERBS) {
83+
if (ibverbsUnavailable().load()) {
84+
return nullptr;
85+
}
86+
// Probe ibverbs capability using raw APIs before creating a gloo Device.
87+
// On CI runners without real RDMA hardware, rdma-core software providers
88+
// allow device open, PD alloc, and CQ creation to succeed, but QP creation
89+
// fails with EINVAL. If we create a gloo Device (which starts a background
90+
// thread), destroying it can segfault after fork() in
91+
// TransportMultiProcTest where the thread handle becomes invalid.
92+
{
93+
int numDevices = 0;
94+
struct ibv_device** deviceList = ibv_get_device_list(&numDevices);
95+
if (!deviceList || numDevices == 0) {
96+
if (deviceList)
97+
ibv_free_device_list(deviceList);
98+
ibverbsUnavailable().store(true);
99+
return nullptr;
100+
}
101+
struct ibv_context* ctx = ibv_open_device(deviceList[0]);
102+
ibv_free_device_list(deviceList);
103+
if (!ctx) {
104+
ibverbsUnavailable().store(true);
105+
return nullptr;
106+
}
107+
struct ibv_pd* pd = ibv_alloc_pd(ctx);
108+
if (!pd) {
109+
ibv_close_device(ctx);
110+
ibverbsUnavailable().store(true);
111+
return nullptr;
112+
}
113+
struct ibv_comp_channel* channel = ibv_create_comp_channel(ctx);
114+
if (!channel) {
115+
ibv_dealloc_pd(pd);
116+
ibv_close_device(ctx);
117+
ibverbsUnavailable().store(true);
118+
return nullptr;
119+
}
120+
struct ibv_cq* cq = ibv_create_cq(ctx, 64, nullptr, channel, 0);
121+
if (!cq) {
122+
ibv_destroy_comp_channel(channel);
123+
ibv_dealloc_pd(pd);
124+
ibv_close_device(ctx);
125+
ibverbsUnavailable().store(true);
126+
return nullptr;
127+
}
128+
struct ibv_qp_init_attr qpAttr{};
129+
qpAttr.send_cq = cq;
130+
qpAttr.recv_cq = cq;
131+
qpAttr.cap.max_send_wr = 16;
132+
qpAttr.cap.max_recv_wr = 16;
133+
qpAttr.cap.max_send_sge = 1;
134+
qpAttr.cap.max_recv_sge = 1;
135+
qpAttr.qp_type = IBV_QPT_RC;
136+
struct ibv_qp* qp = ibv_create_qp(pd, &qpAttr);
137+
if (!qp) {
138+
ibv_destroy_cq(cq);
139+
ibv_destroy_comp_channel(channel);
140+
ibv_dealloc_pd(pd);
141+
ibv_close_device(ctx);
142+
ibverbsUnavailable().store(true);
143+
return nullptr;
144+
}
145+
ibv_destroy_qp(qp);
146+
ibv_destroy_cq(cq);
147+
ibv_destroy_comp_channel(channel);
148+
ibv_dealloc_pd(pd);
149+
ibv_close_device(ctx);
150+
}
79151
gloo::transport::ibverbs::attr attr;
80152
attr.port = 1;
81153
try {
82154
return ::gloo::transport::ibverbs::CreateDevice(attr);
83-
} catch (const InvalidOperationException& e) {
155+
} catch (const std::exception& e) {
84156
GLOO_INFO("IBVERBS not available: ", e.what());
157+
ibverbsUnavailable().store(true);
85158
}
86159
}
87160
#endif

gloo/test/base_test.h

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#include <gtest/gtest.h>
1212

13+
#include <atomic>
1314
#include <exception>
1415
#include <functional>
1516
#include <stdexcept>
@@ -75,6 +76,14 @@ extern const std::vector<Transport> kTransportsForClassAlgorithms;
7576
extern const std::vector<Transport> kTransportsForFunctionAlgorithms;
7677
extern const std::vector<Transport> kTransportsForRDMA;
7778

79+
// Flag to prevent repeated IBVERBS device creation on systems without real
80+
// RDMA hardware. Set by spawn() when connectFullMesh fails and by
81+
// createDevice() when the raw ibverbs probe fails or CreateDevice throws.
82+
inline std::atomic<bool>& ibverbsUnavailable() {
83+
static std::atomic<bool> instance{false}; // function-local static; starts out false (IBVERBS not yet ruled out)
84+
return instance;
85+
}
86+
7887
std::shared_ptr<::gloo::transport::Device> createDevice(Transport transport);
7988

8089
class BaseTest : public ::testing::Test {
@@ -115,18 +124,34 @@ class BaseTest : public ::testing::Test {
115124
Barrier barrier(size);
116125
auto store = std::make_shared<::gloo::rendezvous::HashStore>();
117126

127+
// Track whether workers found the transport unavailable so we can
128+
// call GTEST_SKIP() from the main thread after joining.
129+
// GTEST_SKIP() is not safe to call from worker threads — concurrent
130+
// calls race on GTest internals and can cause "terminate called
131+
// recursively" (SIGABRT / exit code 134).
132+
std::atomic<bool> transportUnavailable{false};
133+
118134
spawnThreads(size, [&](int rank) {
119135
auto context =
120136
std::make_shared<::gloo::rendezvous::Context>(rank, size, base);
121137

122-
// Create device per thread to avoid collisions then they are using the
138+
// Create device per thread to avoid collisions when they are using the
123139
// socket address.
124140
auto device = device_creator(transport);
125141
if (!device) {
126-
GTEST_SKIP() << "Skipping test: transport not available";
142+
transportUnavailable.store(true);
143+
return;
144+
}
145+
146+
try {
147+
context->connectFullMesh(store, device);
148+
} catch (const std::exception&) {
149+
if (transport == Transport::IBVERBS) {
150+
ibverbsUnavailable().store(true);
151+
}
152+
transportUnavailable.store(true);
127153
return;
128154
}
129-
context->connectFullMesh(store, device);
130155

131156
try {
132157
fn(context);
@@ -150,6 +175,10 @@ class BaseTest : public ::testing::Test {
150175
context->closeConnections();
151176
}
152177
});
178+
179+
if (transportUnavailable.load()) {
180+
GTEST_SKIP() << "Skipping test: transport not available";
181+
}
153182
}
154183

155184
void spawn(

gloo/test/main.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,25 @@
1010

1111
// One-time init to use EPIPE errors instead of SIGPIPE
1212
#ifndef _WIN32
13+
#include <execinfo.h>
14+
#include <signal.h>
15+
#include <unistd.h>
16+
#include <cstdio>
17+
1318
namespace {
19+
20+
// Writes `len` bytes of `msg` to stderr using write(2) only. Best effort: a
// short or failed write is ignored because the caller is about to _exit().
static void diag_write(const char* msg, size_t len) {
  const ssize_t rc = write(STDERR_FILENO, msg, len);
  (void)rc;
}

// Signal handler used for crash diagnostics in the test binary.
//
// Prints "[DIAG] Signal <sig> caught, backtrace:" followed by up to 30 raw
// stack frames to stderr, then terminates the process with exit status
// 128 + sig (the conventional shell encoding of "killed by signal").
//
// The message is emitted with write(2) rather than fprintf(): stdio is not
// async-signal-safe and may take locks or allocate, which can deadlock or
// fault again when the crash happened inside malloc or stdio itself.
// backtrace_symbols_fd() is documented not to call malloc.
// NOTE(review): backtrace() may still allocate the first time it is called
// (lazy libgcc load on glibc) — acceptable for a best-effort diagnostic.
static void segfault_handler(int sig) {
  void* frames[30];
  const int depth = backtrace(frames, 30);

  // Render the signal number by hand; snprintf is not async-signal-safe.
  char digits[16];
  char* const end = digits + sizeof(digits);
  char* cursor = end;
  unsigned int magnitude = sig < 0 ? 0u - static_cast<unsigned int>(sig)
                                   : static_cast<unsigned int>(sig);
  do {
    *--cursor = static_cast<char>('0' + magnitude % 10u);
    magnitude /= 10u;
  } while (magnitude != 0u);
  if (sig < 0) {
    *--cursor = '-';
  }

  static const char kPrefix[] = "[DIAG] Signal ";
  static const char kSuffix[] = " caught, backtrace:\n";
  diag_write(kPrefix, sizeof(kPrefix) - 1);
  diag_write(cursor, static_cast<size_t>(end - cursor));
  diag_write(kSuffix, sizeof(kSuffix) - 1);

  backtrace_symbols_fd(frames, depth, STDERR_FILENO);

  // _exit (not exit): skip atexit handlers and static destructors, which are
  // not safe to run from a signal handler in a crashed process.
  _exit(128 + sig);
}
27+
1428
struct Initializer {
1529
Initializer() {
1630
signal(SIGPIPE, SIG_IGN); // ignore SIGPIPE so failed writes surface as EPIPE errors instead
31+
signal(SIGSEGV, segfault_handler); // on SIGSEGV, print a backtrace to stderr and _exit(128 + sig)
1732
}
1833
};
1934
Initializer initializer;

0 commit comments

Comments
 (0)