Skip to content

Commit ab1e531

Browse files
authored
[backport] Allow fallback for CUDA virtual memory. (dmlc#11391, dmlc#11398) (dmlc#11434)
- Check CUDA version based on both driver API and nvidia-smi.
1 parent 690c991 commit ab1e531

File tree

7 files changed

+162
-17
lines changed

7 files changed

+162
-17
lines changed

src/common/common.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2015-2024, XGBoost Contributors
2+
* Copyright 2015-2025, XGBoost Contributors
33
* \file common.h
44
* \brief Common utilities
55
*/
@@ -62,6 +62,19 @@ inline std::vector<std::string> Split(const std::string& s, char delim) {
6262
return ret;
6363
}
6464

65+
/**
 * @brief Remove leading whitespace from a string.
 *
 * Whitespace is the set recognised by `std::isspace` in the "C" locale: space, `\t`,
 * `\n`, `\v`, `\f` and `\r`. Trailing whitespace is left untouched.
 *
 * @param str Input string; it is not modified.
 *
 * @return A copy of `str` starting at its first non-whitespace character, or an empty
 *         string when `str` is empty or consists solely of whitespace.
 */
[[nodiscard]] inline std::string TrimFirst(const std::string &str) {
  // `find_first_not_of` yields `npos` for both an empty and an all-whitespace input, so
  // no separate emptiness check is required.
  auto const first = str.find_first_not_of(" \t\n\v\f\r");
  if (first == std::string::npos) {
    return {};
  }
  return str.substr(first);
}
77+
6578
/**
6679
* @brief Add escapes for a UTF-8 string.
6780
*/

src/common/cuda_dr_utils.cc

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2024, XGBoost contributors
2+
* Copyright 2024-2025, XGBoost contributors
33
*/
44
#if defined(XGBOOST_USE_CUDA)
55
#include "cuda_dr_utils.h"
@@ -10,11 +10,12 @@
1010
#include <memory> // for make_unique
1111
#include <mutex> // for call_once
1212
#include <sstream> // for stringstream
13-
#include <string> // for string
13+
#include <string> // for string, stoi
1414

15-
#include "common.h" // for safe_cuda
15+
#include "common.h" // for safe_cuda, TrimFirst, Split
1616
#include "cuda_rt_utils.h" // for CurrentDevice
17-
#include "xgboost/string_view.h" // for StringVie
17+
#include "io.h" // for CmdOutput
18+
#include "xgboost/string_view.h" // for StringView
1819

1920
namespace xgboost::cudr {
2021
CuDriverApi::CuDriverApi() {
@@ -104,5 +105,52 @@ void MakeCuMemLocation(CUmemLocationType type, CUmemLocation *loc) {
104105
MakeCuMemLocation(type, &prop.location);
105106
return prop;
106107
}
108+
109+
[[nodiscard]] bool GetVersionFromSmi(std::int32_t *p_major, std::int32_t *p_minor) {
110+
using ::xgboost::common::Split;
111+
using ::xgboost::common::TrimFirst;
112+
// `nvidia-smi --version` is not available for older versions, as a result, we can't query the
113+
// cuda driver version unless we want to parse the table output.
114+
115+
// Example output on a 2-GPU system:
116+
//
117+
// $ nvidia-smi --query-gpu=driver_version --format=csv
118+
//
119+
// driver_version
120+
// 570.124.06
121+
// 570.124.06
122+
//
123+
auto cmd = "nvidia-smi --query-gpu=driver_version --format=csv";
124+
auto smi_out_str = common::CmdOutput(StringView{cmd});
125+
126+
auto Invalid = [=] {
127+
*p_major = *p_minor = -1;
128+
return false;
129+
};
130+
if (smi_out_str.empty()) {
131+
return Invalid();
132+
}
133+
134+
auto smi_split = Split(smi_out_str, '\n');
135+
if (smi_split.size() < 2) {
136+
return Invalid();
137+
}
138+
139+
// Use the first GPU
140+
auto smi_ver = Split(TrimFirst(smi_split[1]), '.');
141+
// 570.124.06
142+
if (smi_ver.size() != 3) {
143+
return Invalid();
144+
}
145+
try {
146+
*p_major = std::stoi(smi_ver[0]);
147+
*p_minor = std::stoi(smi_ver[1]);
148+
LOG(INFO) << "Driver version: `" << *p_major << "." << *p_minor << "`";
149+
return true;
150+
} catch (std::exception const &) {
151+
}
152+
153+
return Invalid();
154+
}
107155
} // namespace xgboost::cudr
108156
#endif

src/common/cuda_dr_utils.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2024, XGBoost contributors
2+
* Copyright 2024-2025, XGBoost contributors
33
*
44
* @brief Utility for CUDA driver API.
55
*
@@ -102,4 +102,11 @@ void MakeCuMemLocation(CUmemLocationType type, CUmemLocation* loc);
102102
* @brief Construct a `CUmemAllocationProp`.
103103
*/
104104
[[nodiscard]] CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
105+
106+
/**
107+
* @brief Get system driver version from the `nvidia-smi` command.
108+
*
109+
* @return Whether the driver version was obtained and parsed. On failure both outputs are set to -1.
110+
*/
111+
[[nodiscard]] bool GetVersionFromSmi(std::int32_t *p_major, std::int32_t *p_minor);
105112
} // namespace xgboost::cudr

src/common/device_helpers.cu

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,51 @@
11
/**
2-
* Copyright 2024, XGBoost contributors
2+
* Copyright 2024-2025, XGBoost contributors
33
*/
4-
#include "cuda_rt_utils.h" // for RtVersion
4+
#include <mutex> // for once_flag, call_once
5+
6+
#include "../common/cuda_dr_utils.h" // for GetVersionFromSmi
7+
#include "cuda_rt_utils.h" // for RtVersion
58
#include "device_helpers.cuh"
9+
#include "device_vector.cuh" // for GrowOnlyVirtualMemVec
610
#include "xgboost/windefs.h" // for xgboost_IS_WIN
711

812
namespace dh {
13+
namespace {
14+
/**
 * @brief Whether a `major.minor` version is at least 12.5.
 *
 * @param major Major component of the version.
 * @param minor Minor component of the version.
 *
 * @return true if the version is 12.5 or newer.
 */
[[nodiscard]] constexpr bool IsSupportedDrVer(std::int32_t major, std::int32_t minor) noexcept {
  constexpr std::int32_t kMinMajor = 12;
  constexpr std::int32_t kMinMinor = 5;
  // Any newer major qualifies regardless of minor; the minor only matters for 12.x.
  if (major != kMinMajor) {
    return major > kMinMajor;
  }
  return minor >= kMinMinor;
}
17+
18+
// Check whether cuda virtual memory can be used.
19+
// Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable
20+
[[nodiscard]] bool CheckVmAlloc() {
21+
static bool vm_flag = true;
22+
static std::once_flag once;
23+
24+
std::call_once(once, [] {
25+
std::int32_t major{0}, minor{0};
26+
xgboost::curt::DrVersion(&major, &minor);
27+
if (IsSupportedDrVer(major, minor)) {
28+
// The result from the driver api is not reliable. The system driver might not match
29+
// the CUDA driver in some obscure cases.
30+
//
31+
// https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
32+
// Ver Linux Win
33+
// CUDA 12.5 Update 1 >=555.42.06 >=555.85
34+
// CUDA 12.5 GA >=555.42.02 >=555.85
35+
vm_flag = xgboost::cudr::GetVersionFromSmi(&major, &minor) && major >= 555;
36+
} else {
37+
vm_flag = false;
38+
}
39+
});
40+
return vm_flag;
41+
}
42+
} // namespace
43+
944
PinnedMemory::PinnedMemory() {
1045
#if defined(xgboost_IS_WIN)
1146
this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();
1247
#else
13-
std::int32_t major{0}, minor{0};
14-
xgboost::curt::DrVersion(&major, &minor);
15-
// Host NUMA allocation requires driver that supports CTK >= 12.5 to be stable.
16-
if (major >= 12 && minor >= 5) {
48+
if (CheckVmAlloc()) {
1749
this->impl_.emplace<detail::GrowOnlyVirtualMemVec>(CU_MEM_LOCATION_TYPE_HOST_NUMA);
1850
} else {
1951
this->impl_.emplace<detail::GrowOnlyPinnedMemoryImpl>();

src/common/io.cc

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2019-2024, by XGBoost Contributors
2+
* Copyright 2019-2025, by XGBoost Contributors
33
*/
44
#if defined(__unix__) || defined(__APPLE__)
55

@@ -234,7 +234,7 @@ void detail::CloseMmap(MMAPFile* handle) {
234234
}
235235
#if defined(xgboost_IS_WIN)
236236
if (handle->base_ptr) {
237-
CHECK(UnmapViewOfFile(handle->base_ptr)) "Faled to call munmap: " << SystemErrorMsg();
237+
CHECK(UnmapViewOfFile(handle->base_ptr)) << "Failed to call munmap: " << SystemErrorMsg();
238238
}
239239
if (handle->fd != INVALID_HANDLE_VALUE) {
240240
CHECK(CloseHandle(handle->fd)) << "Failed to close handle: " << SystemErrorMsg();
@@ -245,11 +245,11 @@ void detail::CloseMmap(MMAPFile* handle) {
245245
#else
246246
if (handle->base_ptr) {
247247
CHECK_NE(munmap(handle->base_ptr, handle->base_size), -1)
248-
<< "Faled to call munmap: `" << handle->path << "`. " << SystemErrorMsg();
248+
<< "Failed to call munmap: `" << handle->path << "`. " << SystemErrorMsg();
249249
}
250250
if (handle->fd != 0) {
251251
CHECK_NE(close(handle->fd), -1)
252-
<< "Faled to close: `" << handle->path << "`. " << SystemErrorMsg();
252+
<< "Failed to close: `" << handle->path << "`. " << SystemErrorMsg();
253253
}
254254
#endif
255255
delete handle;
@@ -302,4 +302,23 @@ AlignedMemWriteStream::~AlignedMemWriteStream() = default;
302302
[[nodiscard]] std::size_t AlignedMemWriteStream::Tell() const noexcept(true) {
303303
return this->pimpl_->Tell();
304304
}
305+
306+
/**
 * @brief Run a command through `popen` and collect everything it writes to stdout.
 *
 * Not implemented when `xgboost_IS_WIN` or `__i386__` is defined; a fatal error is
 * logged there instead.
 *
 * @param cmd The command line to execute.
 *
 * @return The captured stdout of the command.
 */
[[nodiscard]] std::string CmdOutput(StringView cmd) {
#if defined(xgboost_IS_WIN) || defined(__i386__)
  (void)cmd;
  LOG(FATAL) << "Not implemented";
  return "";
#else
  // popen is convenient, but it always reports success even if the command itself
  // fails. Callers have to validate the returned text.
  std::unique_ptr<FILE, decltype(&pclose)> stream{popen(cmd.c_str(), "r"), pclose};
  CHECK(stream);

  std::string output;
  std::array<char, 128> chunk;
  auto const chunk_size = static_cast<std::int32_t>(chunk.size());
  // fgets null-terminates each chunk, so appending the C string is safe.
  while (std::fgets(chunk.data(), chunk_size, stream.get()) != nullptr) {
    output += chunk.data();
  }
  return output;
#endif
}
305324
} // namespace xgboost::common

src/common/io.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/**
2-
* Copyright 2014-2024, XGBoost Contributors
2+
* Copyright 2014-2025, XGBoost Contributors
33
* \file io.h
44
* \brief general stream interface for serialization, I/O
55
* \author Tianqi Chen
@@ -607,5 +607,8 @@ class AlignedMemWriteStream : public AlignedFileWriteStream {
607607

608608
[[nodiscard]] std::size_t Tell() const noexcept(true);
609609
};
610+
611+
// Run a system command and return its stdout. Not implemented on Windows or 32-bit x86, where it logs a fatal error.
612+
[[nodiscard]] std::string CmdOutput(StringView cmd);
610613
} // namespace xgboost::common
611614
#endif // XGBOOST_COMMON_IO_H_
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
/**
2+
* Copyright 2025, XGBoost Contributors
3+
*/
4+
#include <gtest/gtest.h>
5+
6+
#if defined(XGBOOST_USE_CUDA) && defined(__linux__)
7+
#include "../../../src/common/cuda_dr_utils.h"
8+
9+
namespace xgboost::cudr {
10+
// Smoke test: the machine may or may not have a working `nvidia-smi`, so only the
// contract between the return value and the outputs is checked.
TEST(DrUtils, GetVersionFromSmi) {
  std::int32_t major{0};
  std::int32_t minor{0};

  if (GetVersionFromSmi(&major, &minor)) {
    // Success: both parsed components must be non-negative.
    EXPECT_GE(major, 0);
    EXPECT_GE(minor, 0);
    return;
  }

  // Failure: both outputs carry the -1 sentinel.
  EXPECT_EQ(major, -1);
  EXPECT_EQ(minor, -1);
}
22+
} // namespace xgboost::cudr
23+
#endif  // defined(XGBOOST_USE_CUDA) && defined(__linux__)

0 commit comments

Comments
 (0)