Skip to content

Commit d6695d5

Browse files
authored
Merge pull request #10910 from Sergei-Lebedev/topic/config_gdaki_lat
UCT/GDA/TEST: add config parameter for gda latency thresh
2 parents 566a266 + 981b78d commit d6695d5

File tree

4 files changed

+15
-8
lines changed

4 files changed

+15
-8
lines changed

src/uct/ib/base/ib_md.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ ucs_config_field_t uct_ib_md_config_table[] = {
119119
"Use GPU Direct RDMA for HCA to access GPU pages directly\n",
120120
ucs_offsetof(uct_ib_md_config_t, enable_gpudirect_rdma), UCS_CONFIG_TYPE_TERNARY},
121121

122+
{"GPU_IB_DISTANCE_LATENCY_THRESH", "300ns",
123+
"Skip GPU device if the distance latency to the IB device is greater than this value.",
124+
ucs_offsetof(uct_ib_md_config_t, ext.gpu_ib_distance_latency_thresh),
125+
UCS_CONFIG_TYPE_TIME},
126+
122127
{"PCI_BW", "",
123128
"Maximum effective data transfer rate of PCI bus connected to HCA\n",
124129
ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)},

src/uct/ib/base/ib_md.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ typedef struct uct_ib_md_ext_config {
109109
unsigned long reg_retry_cnt; /**< Memory registration retry count */
110110
unsigned smkey_block_size; /**< Mkey indexes in a symmetric block */
111111
int direct_nic; /**< Direct NIC with GPU functionality */
112+
double gpu_ib_distance_latency_thresh; /**< Latency threshold, in seconds: a GPU whose distance latency to the IB device exceeds it is skipped */
112113
} uct_ib_md_ext_config_t;
113114

114115

src/uct/ib/mlx5/gdaki/gdaki.c

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,15 @@ typedef struct {
2424
} uct_rc_gdaki_iface_config_t;
2525

2626
ucs_config_field_t uct_rc_gdaki_iface_config_table[] = {
27-
{UCT_IB_CONFIG_PREFIX, "", NULL,
28-
ucs_offsetof(uct_rc_gdaki_iface_config_t, super),
29-
UCS_CONFIG_TYPE_TABLE(uct_rc_iface_common_config_table)},
27+
{UCT_IB_CONFIG_PREFIX, "", NULL,
28+
ucs_offsetof(uct_rc_gdaki_iface_config_t, super),
29+
UCS_CONFIG_TYPE_TABLE(uct_rc_iface_common_config_table)},
3030

31-
{UCT_IB_CONFIG_PREFIX, "", NULL,
32-
ucs_offsetof(uct_rc_gdaki_iface_config_t, mlx5),
33-
UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)},
31+
{UCT_IB_CONFIG_PREFIX, "", NULL,
32+
ucs_offsetof(uct_rc_gdaki_iface_config_t, mlx5),
33+
UCS_CONFIG_TYPE_TABLE(uct_rc_mlx5_common_config_table)},
3434

35-
{NULL}
35+
{NULL}
3636
};
3737

3838

@@ -657,7 +657,7 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md,
657657
}
658658

659659
/* TODO this logic should be done in UCP */
660-
if (dist.latency > 300.0 / UCS_NSEC_PER_SEC) {
660+
if (dist.latency > md->super.config.gpu_ib_distance_latency_thresh) {
661661
continue;
662662
}
663663

test/gtest/ucp/test_ucp_device.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ void test_ucp_device::get_test_variants(std::vector<ucp_test_variant> &variants)
7070
void test_ucp_device::init()
7171
{
7272
m_env.push_back(new ucs::scoped_setenv("UCX_CUDA_IPC_ENABLE_SAME_PROCESS", "y"));
73+
m_env.push_back(new ucs::scoped_setenv("UCX_IB_GPU_IB_DISTANCE_LATENCY_THRESH", "1000ns"));
7374
ucp_test::init();
7475
sender().connect(&receiver(), get_ep_params());
7576
if (!is_loopback()) {

0 commit comments

Comments
 (0)