Skip to content

Commit c4ff00c

Browse files
rongbingzhou authored and hershys-aws committed
aws: add platform config for g7e instance type
Enable OFI NCCL plugin to support AWS g7e.8xlarge, g7e.12xlarge, g7e.24xlarge, g7e.48xlarge instance types. Signed-off-by: Rongbing Zhou <rongbiz@amazon.com> (cherry picked from commit 8dc2ce4)
1 parent 8385fd2 commit c4ff00c

File tree

2 files changed

+38
-0
lines changed

2 files changed

+38
-0
lines changed

src/platform-aws.cpp

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -208,6 +208,34 @@ static struct ec2_platform_data platform_data_map[] = {
208208
.domain_per_thread = true,
209209
.env = {},
210210
},
211+
{
212+
.name = "g7e.8xlarge",
213+
.regex = NULL,
214+
.topology = NULL,
215+
.default_dup_conns = 0,
216+
.latency = 35.0,
217+
.gdr_required = false,
218+
.default_protocol = PROTOCOL::RDMA,
219+
.domain_per_thread = true,
220+
.env = {
221+
{ "NCCL_BUFFSIZE", "8388608" },
222+
{ "NCCL_P2P_NET_CHUNKSIZE", "524288" },
223+
},
224+
},
225+
{
226+
.name = "g7e",
227+
.regex = "^g7e\\.(12|24|48)xlarge",
228+
.topology = NULL,
229+
.default_dup_conns = 0,
230+
.latency = 35.0,
231+
.gdr_required = true,
232+
.default_protocol = PROTOCOL::RDMA,
233+
.domain_per_thread = true,
234+
.env = {
235+
{ "NCCL_BUFFSIZE", "8388608" },
236+
{ "NCCL_P2P_NET_CHUNKSIZE", "524288" },
237+
},
238+
},
211239
{
212240
.name = "trn1",
213241
.regex = "^trn1.*",

tests/unit/aws_platform_mapper.cpp

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -65,6 +65,16 @@ static int check_known_platforms(void)
6565
ret += check_value(platform_data_list, len, "p6e-gb200.36xlarge", "p-series");
6666
ret += check_value(platform_data_list, len, "g5.48xlarge", "g5.48xlarge");
6767
ret += check_value(platform_data_list, len, "g6.16xlarge", NULL);
68+
ret += check_value(platform_data_list, len, "g7e.8xlarge", "g7e.8xlarge");
69+
ret += check_value(platform_data_list, len, "g7.8xlarge", NULL);
70+
ret += check_value(platform_data_list, len, "g7e.12xlarge", "g7e");
71+
ret += check_value(platform_data_list, len, "g7e.24xlarge", "g7e");
72+
ret += check_value(platform_data_list, len, "g7e.48xlarge", "g7e");
73+
ret += check_value(platform_data_list, len, "g7e.xlarge", NULL);
74+
ret += check_value(platform_data_list, len, "g7e.1xlarge", NULL);
75+
ret += check_value(platform_data_list, len, "g7e.2xlarge", NULL);
76+
ret += check_value(platform_data_list, len, "g7e.4xlarge", NULL);
77+
ret += check_value(platform_data_list, len, "g7.48xlarge", NULL);
6878

6979
// obviously future platforms
7080
ret += check_value(platform_data_list, len, "p100.2048xlarge", "p-series");

0 commit comments

Comments
 (0)