Skip to content

Commit eece99d

Browse files
committed
Revert "tuner: transform x coordinates to log2 scale."
This reverts commit 09c2351. The commit changed the OFI NCCL plugin's region-based NCCL tuner to select nonoptimal NCCL algorithm and protocol combinations for 0x7 AllGather and 0x7 ReduceScatter. Signed-off-by: Aviv Benchorin <benchori@amazon.com>
1 parent 53e92ef commit eece99d

File tree

3 files changed

+35
-104
lines changed

3 files changed

+35
-104
lines changed

include/tuner/nccl_ofi_tuner_region.h

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#ifndef NCCL_OFI_TUNER_REGION_H_
66
#define NCCL_OFI_TUNER_REGION_H_
77

8-
#include <cmath>
98
#include <stddef.h>
109
#include "tuner/nccl_ofi_tuner_common.h"
1110

@@ -58,28 +57,6 @@ typedef struct nccl_ofi_tuner_point
5857
{
5958
double x;
6059
double y;
61-
enum COORD_SCALE {
62-
UNSPECIFIED,
63-
ORIGINAL,
64-
X_LOG2
65-
66-
} coord_scale = UNSPECIFIED;
67-
68-
inline void transform_log2_x() {
69-
if (coord_scale == X_LOG2) return;
70-
71-
if (x > 0) {
72-
x = std::log2(x);
73-
coord_scale = X_LOG2;
74-
}
75-
}
76-
77-
inline void transform_pow2_x() {
78-
if (coord_scale != X_LOG2) return;
79-
80-
x = std::pow(2.0, x);
81-
coord_scale = ORIGINAL;
82-
}
8360
} nccl_ofi_tuner_point_t;
8461

8562
typedef struct nccl_ofi_tuner_region {
@@ -93,8 +70,7 @@ nccl_ofi_tuner_point_t extend_region(nccl_ofi_tuner_point_t a,
9370
nccl_ofi_tuner_point_t b,
9471
nccl_ofi_tuner_point_t z);
9572

96-
int is_inside_region(
97-
nccl_ofi_tuner_point_t point,
98-
const nccl_ofi_tuner_region_t *region);
73+
int is_inside_region(nccl_ofi_tuner_point_t point,
74+
nccl_ofi_tuner_region_t *region);
9975

10076
#endif /* NCCL_OFI_TUNER_REGION_H_ */

src/tuner/nccl_ofi_regions.cpp

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,7 @@ static inline double distance(nccl_ofi_tuner_point_t x,
127127
double eps)
128128
{
129129
nccl_ofi_tuner_point_t dy = vsub(y1, y0);
130-
nccl_ofi_tuner_point_t x1, s = {0, 0};
130+
nccl_ofi_tuner_point_t x1, s;
131131
int r;
132132

133133
x1.x = x.x + dy.y;
@@ -155,12 +155,12 @@ static inline double distance(nccl_ofi_tuner_point_t x,
155155
* -1 for outside
156156
* 0 for on edge.
157157
*/
158-
int is_inside_region(nccl_ofi_tuner_point_t point, const nccl_ofi_tuner_region_t *region)
158+
int is_inside_region(nccl_ofi_tuner_point_t point, nccl_ofi_tuner_region_t *region)
159159
{
160160
assert(region->num_vertices > 1);
161161

162162
size_t i, k;
163-
const nccl_ofi_tuner_point_t *pv;
163+
nccl_ofi_tuner_point_t *pv;
164164
double min_x, max_x, min_y, max_y;
165165
const double eps = 1e-10;
166166

@@ -233,14 +233,6 @@ static ncclResult_t set_regions(nccl_ofi_tuner_region_context_t *region_ctx,
233233
}
234234

235235
memcpy(region_ctx->regions[collType], &regions[0], num_regions * sizeof(nccl_ofi_tuner_region_t));
236-
237-
for (size_t i = 0; i < num_regions; i++) {
238-
nccl_ofi_tuner_region& region = region_ctx->regions[collType][i];
239-
for (size_t j = 0; j < region.num_vertices; j++) {
240-
region.vertices[j].transform_log2_x();
241-
}
242-
}
243-
244236
return ncclSuccess;
245237
}
246238

@@ -253,22 +245,15 @@ nccl_ofi_tuner_point_t extend_region(nccl_ofi_tuner_point_t a, nccl_ofi_tuner_po
253245
{
254246
nccl_ofi_tuner_point_t ret;
255247

256-
a.transform_log2_x();
257-
b.transform_log2_x();
258-
z.transform_log2_x();
259-
ret.coord_scale = nccl_ofi_tuner_point_t::X_LOG2;
260-
261248
if (a.x == b.x) {
262249
/* a and b are on the same vertical line */
263-
ret.x = a.x, ret.y = z.y;
264-
ret.transform_pow2_x();
250+
ret = (nccl_ofi_tuner_point_t){.x = a.x, .y = z.y};
265251
return ret;
266252
}
267253

268254
if (a.y == b.y) {
269255
/* a and b are on the same horizontal line */
270-
ret.x = z.x, ret.y = a.y;
271-
ret.transform_pow2_x();
256+
ret = (nccl_ofi_tuner_point_t){.x = z.x, .y = a.y};
272257
return ret;
273258
}
274259

@@ -277,12 +262,11 @@ nccl_ofi_tuner_point_t extend_region(nccl_ofi_tuner_point_t a, nccl_ofi_tuner_po
277262
double projected_zy = m * z.x + c;
278263

279264
if (projected_zy < z.y) {
280-
ret.x = z.x, ret.y = projected_zy;
265+
ret = (nccl_ofi_tuner_point_t){.x = z.x, .y = projected_zy};
281266
} else {
282-
ret.x = (z.y - c) / m, ret.y = z.y;
267+
ret = (nccl_ofi_tuner_point_t){.x = (z.y - c) / m, .y = z.y};
283268
}
284269

285-
ret.transform_pow2_x();
286270
return ret;
287271
}
288272

@@ -1449,7 +1433,6 @@ ncclResult_t region_get_coll_info_internal_v2(nccl_ofi_tuner_context_t *ctx,
14491433

14501434
p.x = (double)nBytes;
14511435
p.y = (double)region_ctx->dims.num_ranks;
1452-
p.transform_log2_x();
14531436

14541437
/* Check all regions */
14551438
for (size_t i = 0; i < region_ctx->num_regions[collType] && in_out < 0; i++) {
@@ -1517,7 +1500,6 @@ ncclResult_t region_get_coll_info_internal_v3(nccl_ofi_tuner_context_t *ctx,
15171500

15181501
p.x = (double)nBytes;
15191502
p.y = (double)region_ctx->dims.num_ranks;
1520-
p.transform_log2_x();
15211503

15221504
/* Check all regions */
15231505
for (size_t i = 0; i < region_ctx->num_regions[collType] && in_out < 0; i++) {

tests/unit/region_based_tuner.cpp

Lines changed: 26 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,6 @@
1111
#include "tuner/nccl_ofi_tuner_region.h"
1212
#include "nccl_ofi_param.h"
1313

14-
using std::abs;
15-
using std::log2;
16-
using std::pow;
17-
const double eps = 1e-4;
18-
1914
static int test_extend_region(void)
2015
{
2116
nccl_ofi_tuner_point_t extended_point;
@@ -26,9 +21,8 @@ static int test_extend_region(void)
2621
extended_point = extend_region((nccl_ofi_tuner_point_t){2, 8},
2722
(nccl_ofi_tuner_point_t){4, 8},
2823
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
29-
if (abs(extended_point.x - TUNER_MAX_SIZE) > eps || extended_point.y != 8) {
30-
printf("X-Axis Extend Test Failed : Extended Points : x = %f (diff = %f) y = %f\n", extended_point.x,
31-
extended_point.x - TUNER_MAX_SIZE, extended_point.y);
24+
if (extended_point.x != TUNER_MAX_SIZE || extended_point.y != 8) {
25+
printf("X-Axis Extend Test Failed : Extended Points : x = %f y = %f\n", extended_point.x, extended_point.y);
3226
return -1;
3327
}
3428

@@ -45,26 +39,22 @@ static int test_extend_region(void)
4539
extended_point = extend_region((nccl_ofi_tuner_point_t){8, 64},
4640
(nccl_ofi_tuner_point_t){8290304, 72},
4741
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
48-
slope = (72.0 - 64.0) / (log2(8290304.0) - log2(8.0)); // slope = (y2 - y1)/(x2 - x1)
42+
slope = (72.0 - 64.0) / (8290304.0 - 8.0); // slope = (y2 - y1)/(x2 - x1)
4943
// y3 = mx3 + c and substitute for m=(y2-y1)/(x2-x1) and c = y2 - mx2
50-
projected_y = 72.0 + slope * (log2(TUNER_MAX_SIZE) - log2(8290304.0)); // y3 = y2 + mx3 - mx2
51-
if (abs(extended_point.x - TUNER_MAX_SIZE) > eps || extended_point.y != projected_y) {
52-
printf("X-Axis Upper Bound Test Failed : Extended Points : x = %f (diff = %f) y = %f (diff = %f) \n",
53-
extended_point.x, extended_point.x - TUNER_MAX_SIZE,
54-
extended_point.y, extended_point.y - projected_y);
44+
projected_y = 72.0 + slope * (TUNER_MAX_SIZE - 8290304.0); // y3 = y2 + mx3 - mx2
45+
if (extended_point.x != TUNER_MAX_SIZE || extended_point.y != projected_y) {
46+
printf("X-Axis Upper Bound Test Failed : Extended Points : x = %f y = %f\n", extended_point.x, extended_point.y);
5547
return -1;
5648
}
5749

5850
/* Extend the line to TUNER_MAX_RANKS (y-axis) */
5951
extended_point = extend_region((nccl_ofi_tuner_point_t){8, 64},
60-
(nccl_ofi_tuner_point_t){8.01, 1024},
52+
(nccl_ofi_tuner_point_t){16, 1024},
6153
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
62-
slope = (1024.0 - 64.0) / (log2(8.01) - log2(8.0));
63-
projected_x = pow(2, ((TUNER_MAX_RANKS - 1024.0) / slope) + log2(8.01));
64-
if (abs(extended_point.x - projected_x) > eps || extended_point.y != TUNER_MAX_RANKS) {
65-
printf("X-Axis Upper Bound Test 2 Failed : Extended Points : x = %f (diff = %f) y = %f (diff = %f) \n",
66-
extended_point.x, extended_point.x - projected_x,
67-
extended_point.y, extended_point.y - TUNER_MAX_RANKS);
54+
slope = (1024.0 - 64.0) / (16.0 - 8.0);
55+
projected_x = ((TUNER_MAX_RANKS - 1024.0) / slope) + 16;
56+
if (extended_point.x != projected_x || extended_point.y != TUNER_MAX_RANKS) {
57+
printf("X-Axis Upper Bound Test Failed : Extended Points : x = %f y = %f\n", extended_point.x, extended_point.y);
6858
return -1;
6959
}
7060

@@ -85,7 +75,7 @@ static int test_extend_region(void)
8575
| . |
8676
| . |
8777
|--------*------*----------*----------*---------*--------*---
88-
| p3(4M, 2) |p5(TUNER_MAX_SIZE, 2))
78+
| p3(4M, 2) |p4(TUNER_MAX_SIZE, 2))
8979
| |
9080
*/
9181
static int test_is_inside_region(void) {
@@ -97,14 +87,6 @@ static int test_is_inside_region(void) {
9787
(nccl_ofi_tuner_point_t){(double)48.0 * 1024 * 1024, 16},
9888
(nccl_ofi_tuner_point_t){(double)288.0 * 1024 * 1024, 128},
9989
(nccl_ofi_tuner_point_t){TUNER_MAX_SIZE, TUNER_MAX_RANKS});
100-
printf("INFO extended point: %f %f \n", e_48M_16_288M_128.x, e_48M_16_288M_128.y );
101-
102-
p1_288M_128.transform_log2_x();
103-
p2_38M_16.transform_log2_x();
104-
p3_4M_2.transform_log2_x();
105-
p5_maxM_2.transform_log2_x();
106-
e_48M_16_288M_128.transform_log2_x();
107-
printf("INFO extended point after transform_log2_x: %f %f \n", e_48M_16_288M_128.x, e_48M_16_288M_128.y );
10890

10991
nccl_ofi_tuner_region_t region = {
11092
.algorithm = NCCL_ALGO_RING,
@@ -116,7 +98,6 @@ static int test_is_inside_region(void) {
11698
p3_4M_2,
11799
p5_maxM_2}};
118100

119-
120101
/* Points on the vertices of the polygon should be classified to be on the edge of the region */
121102
if (is_inside_region(e_48M_16_288M_128, &region) != 0)
122103
return -1;
@@ -135,17 +116,15 @@ static int test_is_inside_region(void) {
135116
To find the points on the edge of the polygons:
136117
1. Consider two vertices of the polygon
137118
2. Calculate the slope and y-intercept of the line.
138-
3. Using the equation y = m * x + c, get multiple points on the line in powers of 2.
119+
3. Using the equation y = mx + c, get multiple points on the line in powers of 2.
139120
*/
140121
for (size_t i = 0; i < region.num_vertices; i++) {
141122
size_t k = (i + 1) % region.num_vertices;
142123
double slope = (region.vertices[k].y - region.vertices[i].y) / (region.vertices[k].x - region.vertices[i].x);
143124
double c = region.vertices[k].y - (slope * (region.vertices[i].x));
144125
for (double x = region.vertices[i].x; x < region.vertices[k].x; x = x * 2) {
145126
double y = (slope * x) + c;
146-
nccl_ofi_tuner_point_t test_point {x, y, nccl_ofi_tuner_point_t::X_LOG2};
147-
148-
if (is_inside_region(test_point, &region) != 0)
127+
if (is_inside_region((nccl_ofi_tuner_point_t){x, y}, &region) != 0)
149128
return -1;
150129
// printf(" Is (%.10f, %.10f) inside the region : %d\n", x, y, is_inside_region(
151130
// (nccl_ofi_tuner_point_t){x, y}, &region));
@@ -154,8 +133,8 @@ static int test_is_inside_region(void) {
154133

155134
printf("All points on the edges of the polygon are detected correcltly\n");
156135

157-
const size_t num_points = 20;
158-
nccl_ofi_tuner_point_t inside_vertices[] = {{16.0 * 1024 * 1024, 4},
136+
size_t num_points = 20;
137+
const nccl_ofi_tuner_point_t inside_vertices[] = {{16.0 * 1024 * 1024, 4},
159138
{128.0 * 1024 * 1024, 4},
160139
{1.0 * 1024 * 1024 * 1024, 4},
161140
{4.0 * 1024 * 1024 * 1024, 4},
@@ -173,30 +152,25 @@ static int test_is_inside_region(void) {
173152
{32.0 * 1024 * 1024 * 1024, 128},
174153
{64.0 * 1024 * 1024 * 1024, 128},
175154
{64.0 * 1024 * 1024 * 1024, 256},
176-
// Note, set a big enough diff (10.0) below, otherwise
177-
// the delta after log2 is within floating error (eps).
178-
{TUNER_MAX_SIZE - 10.0, 128},
179-
{e_48M_16_288M_128.x - 0.1, e_48M_16_288M_128.y - 10.0, nccl_ofi_tuner_point_t::X_LOG2}};
155+
{TUNER_MAX_SIZE - 1.0, 128},
156+
{e_48M_16_288M_128.x - 1.0, e_48M_16_288M_128.y - 1.0}};
180157

181158
/* These points should be inside the polygon */
182159
for (size_t i = 0; i < num_points; i++) {
183-
inside_vertices[i].transform_log2_x();
184-
int d = is_inside_region(inside_vertices[i], &region);
185-
if (d != 1) {
186-
printf("%ld: %.10f, %.10f is_inside_region: %d\n", i, inside_vertices[i].x, inside_vertices[i].y, d);
160+
if (is_inside_region(inside_vertices[i], &region) != 1) {
161+
printf("%.10f, %.10f\n", inside_vertices[i].x, inside_vertices[i].y);
187162
return -1;
188163
};
189164
}
190165

191166
printf("All points inside the polygon are detected correcltly\n");
192167

193-
const size_t outside_num_points = 24;
194-
const nccl_ofi_tuner_point_t outside_vertices[] = {{8.0 * 1024 * 1024, 6},
168+
const nccl_ofi_tuner_point_t outside_vertices[] = {{8.0 * 1024 * 1024, 4},
195169
{8.0 * 1024 * 1024, 32},
196170
{8.0 * 1024 * 1024, 128},
197171
{8.0 * 1024 * 1024, 512},
198172
{8.0 * 1024 * 1024, TUNER_MAX_RANKS},
199-
{16.0 * 1024 * 1024, 10},
173+
{16.0 * 1024 * 1024, 8},
200174
{16.0 * 1024 * 1024, 32},
201175
{16.0 * 1024 * 1024, 128},
202176
{16.0 * 1024 * 1024, 512},
@@ -206,7 +180,7 @@ static int test_is_inside_region(void) {
206180
{32.0 * 1024 * 1024, 64},
207181
{32.0 * 1024 * 1024, 128},
208182
{32.0 * 1024 * 1024, 256},
209-
{64 * 1024 * 1024, 35},
183+
{64 * 1024 * 1024, 32},
210184
{64.0 * 1024 * 1024, 64},
211185
{64.0 * 1024 * 1024, 256},
212186
{64.0 * 1024 * 1024, 1024},
@@ -217,10 +191,9 @@ static int test_is_inside_region(void) {
217191
{e_48M_16_288M_128.x + 1.0, e_48M_16_288M_128.y + 1.0}};
218192

219193
/* These points should be outside the polygons */
220-
for (size_t i = 0; i < outside_num_points; i++) {
221-
int d = is_inside_region(outside_vertices[i], &region);
222-
if ( d != -1) {
223-
printf("%ld: %.10f, %.10f is_inside_region: %d\n", i, outside_vertices[i].x, outside_vertices[i].y, d);
194+
for (size_t i = 0; i < num_points; i++) {
195+
if (is_inside_region(outside_vertices[i], &region) != -1) {
196+
printf("%.10f, %.10f\n", outside_vertices[i].x, outside_vertices[i].y);
224197
return -1;
225198
};
226199
}

0 commit comments

Comments
 (0)