Skip to content

[AArch64] Masked load/store generation missing for Streaming-SVE with -march=armv8-a+sme. #162797

@ShikharjQUIC

Description

@ShikharjQUIC

Consider C code with the following characteristics: 1) the streaming attribute is used so that the function executes in streaming mode; 2) scalable vectorization is enabled via a pragma; 3) the loop body contains predicated control flow that is data-dependent on at least one of the arrays/vectors.

For example:

__arm_locally_streaming
void foo1 (char *AA, char *BB, char *CC, char *DD, int N, int T, char *EE) {
   #pragma clang loop vectorize_width(16, scalable)
  for (int idx = 0; idx < N; idx++) {
    if (EE[idx] >= DD[idx]) {
       AA[idx] = AA[idx] + CC[idx];
       BB[idx] = BB[idx] * CC[idx];
    }
  }
}

When this code is compiled with -march=armv8-a+sme, the generated disassembly does not contain masked loads/stores.

For reference, a section of the disassembly generated for the code above is shown below (the generated code contains scalar indexed loads):

.LBB0_21:
        cmphs   p8.b, p1/z, z16.b, z17.b
        mov     z19.b, p8/z, #-1
        mov     z19.b, z19.b[5]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_39
.LBB0_22:
        cmphs   p8.b, p1/z, z16.b, z17.b
        mov     z19.b, p8/z, #-1
        mov     z19.b, z19.b[6]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_40
.LBB0_23:
        cmphs   p8.b, p1/z, z16.b, z17.b
        mov     z19.b, p8/z, #-1
        mov     z19.b, z19.b[7]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_41
.LBB0_24:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[8]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_42
.LBB0_25:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[9]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_43
.LBB0_26:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[10]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_44
.LBB0_27:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[11]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_45
.LBB0_28:
        ptrue   p3.b, vl16
        cmphs   p3.b, p3/z, z16.b, z17.b
        mov     z19.b, p3/z, #-1
        mov     z19.b, z19.b[12]
        fmov    w30, s19
        tbnz    w30, #0, .LBB0_46

Whereas if we remove the streaming attribute from the same C code and compile it with -march=armv8-a+sve, the disassembly contains fully masked loads/stores:

.LBB0_15:
        ld1b    { z0.b }, p0/z, [x6, x11]
        ld1b    { z1.b }, p0/z, [x3, x11]
        cmphs   p1.b, p0/z, z0.b, z1.b
        ld1b    { z0.b }, p1/z, [x0, x11]
        ld1b    { z1.b }, p1/z, [x2, x11]
        ld1b    { z2.b }, p1/z, [x1, x11]
        ld1b    { z3.b }, p1/z, [x2, x11]
        add     z0.b, z1.b, z0.b
        add     z1.b, z3.b, z2.b
        st1b    { z0.b }, p1, [x0, x11]
        st1b    { z1.b }, p1, [x1, x11]
        add     x11, x11, x9
        cmp     x10, x11
        b.ne    .LBB0_15
        cmp     x10, x8
        b.ne    .LBB0_3 		

Thus, the same behavior (masked load/store instructions) is also desired when compiling with -march=armv8-a+sme.

For more details, please refer to this Godbolt reproducer: https://godbolt.org/z/5bd3zhvTP

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions