|  | 
// RUN: %{ispc} %s --target=avx2-i32x8 --arch=x86-64 --nostdlib --emit-asm -o - | FileCheck %s

// Test produces different IR/ASM with LLVM_21_0+ due to SROA improvements. Multiple stores
// filling the same alloca are tree-optimized - https://github.com/llvm/llvm-project/pull/152793
// REQUIRES: X86_ENABLED && LLVM_21_0+

// The goal of this test is to check that code generation for both versions is the same.
|  | 9 | +struct FVector4 { | 
|  | 10 | +    float<4> V; | 
|  | 11 | +}; | 
|  | 12 | + | 
|  | 13 | +struct WideFVector4 { | 
|  | 14 | +    float V[programCount]; | 
|  | 15 | +}; | 
|  | 16 | + | 
|  | 17 | +unmasked inline uniform WideFVector4 operator+(const uniform WideFVector4 &A, const uniform WideFVector4 &B) { | 
|  | 18 | +    uniform WideFVector4 Result; | 
|  | 19 | +    Result.V[programIndex] = A.V[programIndex] + B.V[programIndex]; | 
|  | 20 | +    return Result; | 
|  | 21 | +} | 
|  | 22 | + | 
|  | 23 | +unmasked inline void LoadWideFVector4(uniform FVector4 *uniform DstPtr, const uniform FVector4 *uniform SrcPtr) { | 
|  | 24 | +    *DstPtr = *SrcPtr; | 
|  | 25 | +    *(DstPtr + 1) = *(SrcPtr + 1); | 
|  | 26 | +} | 
|  | 27 | + | 
|  | 28 | +unmasked inline void StoreWideFVector4(uniform FVector4 *uniform DstPtr, const uniform FVector4 *uniform SrcPtr) { | 
|  | 29 | +    *DstPtr = *SrcPtr; | 
|  | 30 | +    *(DstPtr + 1) = *(SrcPtr + 1); | 
|  | 31 | +} | 
|  | 32 | + | 
|  | 33 | +// CHECK-LABEL: AddWide___ | 
|  | 34 | +// CHECK-COUNT-2: vmovaps | 
|  | 35 | +// CHECK-NOT: vmovups | 
|  | 36 | +// CHECK-COUNT-2: vaddps | 
|  | 37 | +// CHECK-COUNT-2: vmovaps | 
|  | 38 | +// CHECK-NOT: vmovaps | 
|  | 39 | +unmasked void AddWide(uniform FVector4 Result[], uniform FVector4 Source1[], uniform FVector4 Source2[]) { | 
|  | 40 | +    uniform int Index = 0; | 
|  | 41 | +    uniform WideFVector4 S1, S2; | 
|  | 42 | + | 
|  | 43 | +    LoadWideFVector4((uniform FVector4 * uniform) & S1, (uniform FVector4 * uniform) & Source1[Index]); | 
|  | 44 | +    LoadWideFVector4((uniform FVector4 * uniform) & S2, (uniform FVector4 * uniform) & Source2[Index]); | 
|  | 45 | +    const uniform WideFVector4 R = S1 + S2; | 
|  | 46 | +    StoreWideFVector4((uniform FVector4 * uniform) & Result[Index], (uniform FVector4 * uniform) & R); | 
|  | 47 | +} | 
|  | 48 | + | 
|  | 49 | +unmasked inline void LoadWideFVector4_2(uniform FVector4 *uniform DstPtr, const uniform FVector4 *uniform SrcPtr) { | 
|  | 50 | +    for (uniform int i = 0; i < (programCount / 4); i++) { | 
|  | 51 | +        *(DstPtr + i) = *(SrcPtr + i); | 
|  | 52 | +    } | 
|  | 53 | +} | 
|  | 54 | + | 
|  | 55 | +unmasked inline void StoreWideFVector4_2(uniform FVector4 *uniform DstPtr, const uniform FVector4 *uniform SrcPtr) { | 
|  | 56 | +    for (uniform int i = 0; i < (programCount / 4); i++) { | 
|  | 57 | +        *(DstPtr + i) = *(SrcPtr + i); | 
|  | 58 | +    } | 
|  | 59 | +} | 
|  | 60 | + | 
|  | 61 | +// CHECK-LABEL: AddWide_2___ | 
|  | 62 | +// CHECK-COUNT-2: vmovaps | 
|  | 63 | +// CHECK-NOT: vmovups | 
|  | 64 | +// CHECK-COUNT-2: vaddps | 
|  | 65 | +// CHECK-COUNT-2: vmovaps | 
|  | 66 | +// CHECK-NOT: vmovaps | 
|  | 67 | +unmasked void AddWide_2(uniform FVector4 Result[], uniform FVector4 Source1[], uniform FVector4 Source2[]) { | 
|  | 68 | +    uniform int Index = 0; | 
|  | 69 | +    uniform WideFVector4 S1, S2; | 
|  | 70 | + | 
|  | 71 | +    LoadWideFVector4_2((uniform FVector4 * uniform) & S1, (uniform FVector4 * uniform) & Source1[Index]); | 
|  | 72 | +    LoadWideFVector4_2((uniform FVector4 * uniform) & S2, (uniform FVector4 * uniform) & Source2[Index]); | 
|  | 73 | +    const uniform WideFVector4 R = S1 + S2; | 
|  | 74 | +    StoreWideFVector4_2((uniform FVector4 * uniform) & Result[Index], (uniform FVector4 * uniform) & R); | 
|  | 75 | +} | 