Skip to content

Commit d809a88

Browse files
mozarhua authored and bwbarrett committed
init: Set NCCL_PROTO to simple when GDR is unsupported
Force NCCL_PROTO to simple when GDR is not supported. While NCCL disables the LL128 protocol when using host buffers, it leaves the LL protocol enabled and polls on host memory directly. Many host-only providers, like tcp and sockets, do not guarantee data delivery ordering and can cause corruption when used with the LL protocol. Since polling on host memory is expensive anyway, this has no real performance implications and avoids dealing with the lack of a data ordering hint in the Libfabric API. Note: The warning for a pre-existing NCCL_PROTO setting was removed, as modern NCCL supports complex protocol formats (e.g. "^LL", "allreduce:simple"). We can consider adding protocol format validation and logging an INFO message later. Signed-off-by: Mozar Huang <[email protected]>
1 parent e05fd70 commit d809a88

File tree

3 files changed

+55
-20
lines changed

3 files changed

+55
-20
lines changed

include/nccl_ofi.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -791,4 +791,17 @@ int get_inject_rma_size_opt(struct fid_ep *ofi_ep,
791791
*/
792792
long nccl_net_ofi_gettid(void);
793793

794+
/*
795+
* @brief Configures NCCL_PROTO environment variable to "simple".
796+
*
797+
* @details If NCCL_PROTO is not set, configures it to "simple" protocol.
798+
* If NCCL_PROTO is already set, skip the configuration.
799+
*
800+
* @param log_reason name of the unsupported feature, reported in the log message
801+
*
802+
* @return 0 on success, including when NCCL_PROTO is already set
803+
* -errno in case of any failure
804+
*/
805+
int nccl_net_ofi_configure_nccl_proto_simple(const char *log_reason);
806+
794807
#endif // End NCCL_OFI_H_

src/nccl_ofi_net.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,24 @@ int nccl_net_ofi_create_plugin(nccl_net_ofi_plugin_t **plugin_p)
328328
ret = -ENOTSUP;
329329
goto exit;
330330
}
331+
/* Force SIMPLE protocol when using a provider that does not support
332+
* GDR. NCCL disables the LL128 protocol in this case, but leaves the
333+
* LL protocol enabled. Without GDR, the LL protocol polls on host
334+
* memory for completion flags. In addition to being slow, this assumes
335+
* that host memory is updated in 8 byte segments. However, most
336+
* providers that do not support HMEM (like the tcp or sockets
337+
* providers) do not make any guarantees about data delivery ordering.
338+
* There is not a good way to ask Libfabric providers about their data
339+
* delivery support in the general case, so take a conservative
340+
* approach and force the simple protocol whenever using a provider
341+
* that does not support HMEM.
342+
*/
343+
if (support_gdr != GDR_SUPPORTED) {
344+
ret = nccl_net_ofi_configure_nccl_proto_simple("GDR");
345+
if (ret != 0) {
346+
goto exit;
347+
}
348+
}
331349

332350
*plugin_p = plugin;
333351

@@ -1186,3 +1204,22 @@ int get_inject_rma_size_opt(struct fid_ep *ofi_ep,
11861204
return -FI_ENOPROTOOPT;
11871205
#endif
11881206
}
1207+
1208+
1209+
int nccl_net_ofi_configure_nccl_proto_simple(const char *log_reason)
1210+
{
1211+
int ret;
1212+
1213+
if (getenv("NCCL_PROTO") == NULL) {
1214+
NCCL_OFI_INFO(NCCL_INIT, "Setting NCCL_PROTO='simple' to prevent data corruption (reason: %s not supported)",
1215+
log_reason);
1216+
ret = setenv("NCCL_PROTO", "simple", 1);
1217+
if (ret != 0) {
1218+
NCCL_OFI_WARN("Error setting NCCL_PROTO environment variable: %s",
1219+
strerror(errno));
1220+
return -errno;
1221+
}
1222+
}
1223+
1224+
return 0;
1225+
}

src/platform-aws.cpp

Lines changed: 5 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -275,25 +275,6 @@ static int validate_rdma_write(struct fid_ep *ep)
275275

276276

277277
#if HAVE_CUDA
278-
static int configure_nccl_proto(void)
279-
{
280-
int ret;
281-
282-
if (!getenv("NCCL_PROTO")) {
283-
NCCL_OFI_INFO(NCCL_INIT, "Setting NCCL_PROTO to \"simple\"");
284-
ret = setenv("NCCL_PROTO", "simple", 0);
285-
if (ret != 0) {
286-
NCCL_OFI_WARN("Error setting NCCL_PROTO environment variable: %s",
287-
strerror(errno));
288-
return -errno;
289-
}
290-
} else if (strcasecmp(getenv("NCCL_PROTO"), "simple") != 0) {
291-
NCCL_OFI_WARN("NCCL_PROTO was set to \"LL/LL128\", but the Libfabric endpoint does not support 128 byte in-order aligned stores. This endpoint may corrupt data during communication");
292-
}
293-
294-
return 0;
295-
}
296-
297278
/*
298279
* Try to set one of the in-order flags for either send/recv or rdma
299280
* on the current endpoint to true. have_ordering will be the
@@ -742,7 +723,11 @@ int platform_config_endpoint(struct fi_info *info, struct fid_ep* endpoint) {
742723
nccl_proto_configured = true;
743724

744725
if (!have_ordering) {
745-
ret = configure_nccl_proto();
726+
/* When byte delivery ordering is not guaranteed, force
727+
* the simple protocol as the LL/LL128 protocols can lead
728+
* to data corruption without data delivery ordering.
729+
*/
730+
ret = nccl_net_ofi_configure_nccl_proto_simple("byte delivery ordering");
746731
if (ret != 0) {
747732
NCCL_OFI_WARN("Failed to set NCCL_PROTO: %d", ret);
748733
ret = -ENOTSUP;

0 commit comments

Comments
 (0)