|
4 | 4 | * reserved. |
5 | 5 | * Copyright (c) 2019 Arm Ltd. All rights reserved. |
6 | 6 | * Copyright (c) 2024 NVIDIA Corporation. All rights reserved. |
| 7 | + * Copyright (c) 2024 Research Organization for Information Science |
| 8 | + * and Technology (RIST). All rights reserved. |
7 | 9 | * |
8 | 10 | * $COPYRIGHT$ |
9 | 11 | * |
@@ -140,20 +142,18 @@ _Generic((*(out)), \ |
140 | 142 | struct ompi_datatype_t **dtype, \ |
141 | 143 | struct ompi_op_base_module_1_0_0_t *module) \ |
142 | 144 | { \ |
143 | | - int types_per_step = svcnt(*((type##type_size##_t *) _in)); \ |
144 | | - size_t idx = 0, left_over = *count; \ |
| 145 | + const int types_per_step = svcnt(*((type##type_size##_t *) _in)); \ |
| 146 | + const int cnt = *count; \ |
145 | 147 | type##type_size##_t *in = (type##type_size##_t *) _in, \ |
146 | 148 | *out = (type##type_size##_t *) _out; \ |
147 | 149 | OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \ |
148 | | - svbool_t pred = svwhilelt_b##type_size(idx, left_over); \ |
149 | | - do { \ |
| 150 | + for (int idx=0; idx < cnt; idx += types_per_step) { \ |
| 151 | + svbool_t pred = svwhilelt_b##type_size(idx, cnt); \ |
150 | 152 | vsrc = svld1(pred, &in[idx]); \ |
151 | 153 | vdst = svld1(pred, &out[idx]); \ |
152 | 154 | vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \ |
153 | 155 | OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \ |
154 | | - idx += types_per_step; \ |
155 | | - pred = svwhilelt_b##type_size(idx, left_over); \ |
156 | | - } while (svptest_any(svptrue_b##type_size(), pred)); \ |
| 156 | + } \ |
157 | 157 | } |
158 | 158 | #endif |
159 | 159 |
|
@@ -308,21 +308,19 @@ static void OP_CONCAT(ompi_op_aarch64_3buff_##name##_##type##type_size##_t, APPE |
308 | 308 | struct ompi_datatype_t **dtype, \ |
309 | 309 | struct ompi_op_base_module_1_0_0_t *module) \ |
310 | 310 | { \ |
311 | | - int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \ |
| 311 | + const int types_per_step = svcnt(*((type##type_size##_t *) _in1)); \ |
312 | 312 | type##type_size##_t *in1 = (type##type_size##_t *) _in1, \ |
313 | 313 | *in2 = (type##type_size##_t *) _in2, \ |
314 | 314 | *out = (type##type_size##_t *) _out; \ |
315 | | - size_t idx = 0, left_over = *count; \ |
| 315 | + const int cnt = *count; \ |
316 | 316 | OP_CONCAT(OMPI_OP_TYPE_PREPEND, type##type_size##_t) vsrc, vdst; \ |
317 | | - svbool_t pred = svwhilelt_b##type_size(idx, left_over); \ |
318 | | - do { \ |
| 317 | + for (int idx=0; idx < cnt; idx += types_per_step) { \ |
| 318 | + svbool_t pred = svwhilelt_b##type_size(idx, cnt); \ |
319 | 319 | vsrc = svld1(pred, &in1[idx]); \ |
320 | 320 | vdst = svld1(pred, &in2[idx]); \ |
321 | 321 | vdst = OP_CONCAT(OMPI_OP_OP_PREPEND, op##_x)(pred, vdst, vsrc); \ |
322 | 322 | OP_CONCAT(OMPI_OP_OP_PREPEND, st1)(pred, &out[idx], vdst); \ |
323 | | - idx += types_per_step; \ |
324 | | - pred = svwhilelt_b##type_size(idx, left_over); \ |
325 | | - } while (svptest_any(svptrue_b##type_size(), pred)); \ |
| 323 | + } \ |
326 | 324 | } |
327 | 325 | #endif /* defined(GENERATE_SVE_CODE) */ |
328 | 326 |
|
|
0 commit comments