-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Description
Consider the C code below, which has the following characteristics: 1) the streaming attribute is used so that the function executes in streaming mode; 2) scalable vectorization is enabled via a pragma; 3) the loop body contains predicated control flow that is data-dependent on at least one of the arrays/vectors.
For example:
// Reproducer: for each idx in [0, N), when EE[idx] >= DD[idx], update
// AA[idx] (add CC[idx]) and BB[idx] (multiply by CC[idx]) in place.
// The parameter T is not referenced in the body.
// NOTE(review): keep this code token-for-token unchanged; the reported
// codegen presumably depends on this exact loop shape.
__arm_locally_streaming
void foo1 (char *AA, char *BB, char *CC, char *DD, int N, int T, char *EE) {
// Ask clang to vectorize the following loop with a scalable width of 16.
#pragma clang loop vectorize_width(16, scalable)
for (int idx = 0; idx < N; idx++) {
// Data-dependent guard: the loads/stores below only happen for elements
// where this comparison holds.
if (EE[idx] >= DD[idx]) {
AA[idx] = AA[idx] + CC[idx];
BB[idx] = BB[idx] * CC[idx];
}
}
}
When compiled with -march=armv8+sme, the compiler generates a disassembly that does not contain masked loads/stores.
For reference, a section of the generated disassembly for the above code is shown below (the generated code contains scalar indexed loads):
.LBB0_21:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[5]
fmov w30, s19
tbnz w30, #0, .LBB0_39
.LBB0_22:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[6]
fmov w30, s19
tbnz w30, #0, .LBB0_40
.LBB0_23:
cmphs p8.b, p1/z, z16.b, z17.b
mov z19.b, p8/z, #-1
mov z19.b, z19.b[7]
fmov w30, s19
tbnz w30, #0, .LBB0_41
.LBB0_24:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[8]
fmov w30, s19
tbnz w30, #0, .LBB0_42
.LBB0_25:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[9]
fmov w30, s19
tbnz w30, #0, .LBB0_43
.LBB0_26:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[10]
fmov w30, s19
tbnz w30, #0, .LBB0_44
.LBB0_27:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[11]
fmov w30, s19
tbnz w30, #0, .LBB0_45
.LBB0_28:
ptrue p3.b, vl16
cmphs p3.b, p3/z, z16.b, z17.b
mov z19.b, p3/z, #-1
mov z19.b, z19.b[12]
fmov w30, s19
tbnz w30, #0, .LBB0_46
Whereas, if we remove the streaming attribute from the same C code and compile it with -march=armv8+sve, the disassembly contains fully masked loads/stores:
.LBB0_15:
ld1b { z0.b }, p0/z, [x6, x11]
ld1b { z1.b }, p0/z, [x3, x11]
cmphs p1.b, p0/z, z0.b, z1.b
ld1b { z0.b }, p1/z, [x0, x11]
ld1b { z1.b }, p1/z, [x2, x11]
ld1b { z2.b }, p1/z, [x1, x11]
ld1b { z3.b }, p1/z, [x2, x11]
add z0.b, z1.b, z0.b
add z1.b, z3.b, z2.b
st1b { z0.b }, p1, [x0, x11]
st1b { z1.b }, p1, [x1, x11]
add x11, x11, x9
cmp x10, x11
b.ne .LBB0_15
cmp x10, x8
b.ne .LBB0_3
Thus, the same behavior (masked load/store instructions) is also desired when compiling with -march=armv8+sme.
For more details, please refer to this Godbolt reproducer: https://godbolt.org/z/5bd3zhvTP