Skip to content

Commit ea90d0f

Browse files
Changes to permit vectorization of most loops by ICX
Some loops are not vectorized because of the compiler's cost-model analysis. Added a CMake option, OPTIMIZATION_REPORT (OFF by default); when enabled, it instructs the compiler to generate an optimization report for the mkl_umath library.
1 parent de82deb commit ea90d0f

File tree

3 files changed

+378
-231
lines changed

3 files changed

+378
-231
lines changed

CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,11 @@ project(mkl_umath
77
DESCRIPTION "mkl_umath module"
88
)
99

10+
option(OPTIMIZATION_REPORT
11+
"Whether to generate optimization vectorization report"
12+
OFF
13+
)
14+
1015
find_package(Python COMPONENTS Interpreter Development REQUIRED)
1116
find_package(NumPy REQUIRED)
1217

@@ -91,6 +96,9 @@ target_link_libraries(${_trgt} PRIVATE mkl_rt ${Python_LIBRARIES})
9196
target_link_options(${_trgt} PRIVATE ${_linker_options})
9297
target_compile_options(${_trgt} PRIVATE -fveclib=SVML)
9398
target_compile_options(${_trgt} PRIVATE -fvectorize)
99+
if(OPTIMIZATION_REPORT)
100+
target_compile_options(${_trgt} PRIVATE -qopt-report=3)
101+
endif()
94102
install(TARGETS ${_trgt}
95103
LIBRARY DESTINATION mkl_umath
96104
ARCHIVE DESTINATION mkl_umath

mkl_umath/src/fast_loop_macros.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -74,19 +74,19 @@
7474
npy_intp is1 = steps[0], os1 = steps[1];\
7575
npy_intp n = dimensions[0];\
7676
npy_intp i;\
77-
for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
77+
for(i = 0; i < n; ++i, ip1 += is1, op1 += os1)
7878

79-
#define UNARY_LOOP_VECTORIZED\
80-
char *ip1 = args[0], *op1 = args[1];\
81-
npy_intp is1 = steps[0], os1 = steps[1];\
79+
#define UNARY_LOOP_VECTORIZED(tin, tout)\
80+
tin *ip1 = (tin *) args[0];\
81+
tout *op1 = (tout *) args[1]; \
8282
npy_intp n = dimensions[0];\
8383
npy_intp i;\
8484
NPY_PRAGMA_VECTOR\
85-
for(i = 0; i < n; i++, ip1 += is1, op1 += os1)
85+
for(i = 0; i < n; ++i, ++ip1, ++op1)
8686

87-
#define UNARY_LOOP_DISPATCH(cond, body)\
87+
#define UNARY_LOOP_DISPATCH(tin, tout, cond, body)\
8888
if (cond) {\
89-
UNARY_LOOP_VECTORIZED { body; }\
89+
UNARY_LOOP_VECTORIZED(tin, tout) { body; }\
9090
} else {\
9191
UNARY_LOOP { body; }\
9292
}
@@ -97,31 +97,31 @@
9797
npy_intp is1 = steps[0], os1 = steps[1], os2 = steps[2];\
9898
npy_intp n = dimensions[0];\
9999
npy_intp i;\
100-
for(i = 0; i < n; i++, ip1 += is1, op1 += os1, op2 += os2)
100+
for(i = 0; i < n; ++i, ip1 += is1, op1 += os1, op2 += os2)
101101

102102
/** (ip1, ip2) -> (op1) */
103103
#define BINARY_LOOP\
104104
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2];\
105105
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2];\
106106
npy_intp n = dimensions[0];\
107107
npy_intp i;\
108-
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1)
108+
for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1)
109109

110110
/** (ip1, ip2) -> (op1, op2) */
111111
#define BINARY_LOOP_TWO_OUT\
112112
char *ip1 = args[0], *ip2 = args[1], *op1 = args[2], *op2 = args[3];\
113113
npy_intp is1 = steps[0], is2 = steps[1], os1 = steps[2], os2 = steps[3];\
114114
npy_intp n = dimensions[0];\
115115
npy_intp i;\
116-
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
116+
for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, op1 += os1, op2 += os2)
117117

118118
/** (ip1, ip2, ip3) -> (op1) */
119119
#define TERNARY_LOOP\
120120
char *ip1 = args[0], *ip2 = args[1], *ip3 = args[2], *op1 = args[3];\
121121
npy_intp is1 = steps[0], is2 = steps[1], is3 = steps[2], os1 = steps[3];\
122122
npy_intp n = dimensions[0];\
123123
npy_intp i;\
124-
for(i = 0; i < n; i++, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
124+
for(i = 0; i < n; ++i, ip1 += is1, ip2 += is2, ip3 += is3, op1 += os1)
125125

126126
/** @} */
127127

0 commit comments

Comments
 (0)