Skip to content

[AVX2] SAD pattern detection is too strict #143456

@adworacz

Description

@adworacz

Reference code: Zig Godbolt

Some opportunities for producing optimized sum of absolute differences (SAD) calculations are being missed. It looks like prior support for this was overly restrictive.

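Essentially, the absolute difference is being computed explicitly (min, max, subtract) and only then summed with a SAD against zero, when the whole computation should be handled by a single dedicated SAD instruction applied directly to the two inputs.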

Here's the code inline:

// Number of lanes in the vector type used by the reproducer.
const block_width = 8;
// Element type of each lane (unsigned 8-bit).
const T = u8;
// Vector of `block_width` lanes of `T`, i.e. 8 x u8.
const VT = @Vector(block_width, T);

/// Reproducer: sums the absolute differences between the first `block_width`
/// bytes at `srcp` and the first `block_width` bytes at `refp`.
///
/// `height` and `stride` are only used to bound the slices taken from the two
/// pointers; only the row at offset `0*stride` is read.
///
/// Returns the accumulated sum as a `u32`.
export fn sad(noalias srcp: [*]const u8, noalias refp: [*]const u8, height: usize, stride: usize) u32 {
    // Turn the many-item pointers into slices of `height * stride` bytes.
    const src = srcp[0..height * stride];
    const ref = refp[0..height * stride];
    
    var sum: u32 = 0;

    // Load one `block_width`-byte row from each input as a vector.
    const s: VT = src[0*stride..][0..block_width].*;
    const r: VT = ref[0*stride..][0..block_width].*;

    // Variant 1 (active): absolute difference computed on the u8 lanes as max - min.
    // Should work, but doesn't.
    // NOTE(review): `absdiff` has u8 lanes, so `@reduce` presumably yields a u8 that is
    // only widened when added to `sum` -- confirm this is meant to match the widened
    // variants below, which reduce over i16/i32 lanes.
    const absdiff = @max(s,r) - @min(s,r);
    sum += @reduce(.Add, absdiff);
        
    // Variant 2: widen to i16 lanes, subtract, take @abs, then reduce.
    // Should work, but doesn't
    //const VTI = @Vector(block_width, i16);
    //sum += @reduce(.Add, @abs(@as(VTI, s) - @as(VTI, r)));

    // Variant 3: same as variant 2 but widened to i32 lanes.
    // Does work
    //const VTI = @Vector(block_width, i32);
    //sum += @reduce(.Add, @abs(@as(VTI, s) - @as(VTI, r)));
    
    return sum;
}

Which produces:

# Actual output: the absolute difference is materialized with min/max/sub and
# only then summed by vpsadbw against a zero vector.
sad:
        push    rbp
        mov     rbp, rsp
        # Load 8 bytes from each input pointer into the low half of an xmm register.
        vmovq   xmm0, qword ptr [rdi]
        vmovq   xmm1, qword ptr [rsi]
        # Per-byte unsigned min and max of the two inputs.
        vpminub xmm2, xmm0, xmm1
        vpmaxub xmm0, xmm0, xmm1
        # Zero xmm1 so it can serve as the second vpsadbw operand.
        vpxor   xmm1, xmm1, xmm1
        # max - min per byte = absolute difference per byte.
        vpsubb  xmm0, xmm0, xmm2
        # SAD against zero: horizontally sums the already-computed byte differences.
        vpsadbw xmm0, xmm0, xmm1
        # Extract only byte 0 of the result into eax.
        vpextrb eax, xmm0, 0
        pop     rbp
        ret

But it should be:

# Expected output: a single vpsadbw applied directly to the two loaded inputs.
sad:
        push    rbp
        mov     rbp, rsp
        # Load 8 bytes from each input pointer into the low half of an xmm register.
        vmovq   xmm0, qword ptr [rdi]
        vmovq   xmm1, qword ptr [rsi]
        # Sum of absolute byte differences of the two inputs in one instruction.
        vpsadbw xmm0, xmm0, xmm1
        # Move the low 32 bits of the result into eax.
        vmovd   eax, xmm0
        pop     rbp
        ret

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions