|
| 1 | +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -o - %s |
| 2 | +; RUN: llc -mattr=-packed-fp32-ops -mtriple=amdgcn -mcpu=gfx942 -o - %s |
| 3 | +; add_kernel: a work-item that passes the bounds check loads four consecutive floats from %0 and from %1, adds them with two <2 x float> fadds, and stores the four sums to %2. |
| 4 | +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) |
| 5 | +define amdgpu_kernel void @add_kernel(ptr addrspace(1) nocapture readonly %0, ptr addrspace(1) nocapture readonly %1, ptr addrspace(1) nocapture writeonly %2, i32 %3) local_unnamed_addr #0 { |
| 6 | + %5 = tail call i32 @llvm.amdgcn.workgroup.id.x() |
| 7 | + %6 = shl i32 %5, 10 ; workgroup id * 1024 |
| 8 | + %7 = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 9 | + %8 = shl i32 %7, 2 ; workitem id * 4 |
| 10 | + %9 = and i32 %8, 1020 ; keep the offset in 0..1020, a multiple of 4 |
| 11 | + %10 = or disjoint i32 %9, %6 ; index of the first of this work-item's four elements |
| 12 | + %11 = icmp slt i32 %10, %3 ; only the first index is compared against %3 |
| 13 | + br i1 %11, label %.critedge, label %.critedge2 |
| 14 | + |
| 15 | +.critedge: ; preds = %4 |
| 16 | + %12 = or disjoint i32 %10, 3 |
| 17 | + %13 = or disjoint i32 %10, 2 |
| 18 | + %14 = or disjoint i32 %10, 1 |
| 19 | + %15 = sext i32 %10 to i64 |
| 20 | + %16 = getelementptr float, ptr addrspace(1) %0, i64 %15 |
| 21 | + %17 = addrspacecast ptr addrspace(1) %16 to ptr |
| 22 | + %18 = load float, ptr %17, align 16 |
| 23 | + %19 = getelementptr inbounds i8, ptr %17, i64 4 |
| 24 | + %20 = load float, ptr %19, align 4 |
| 25 | + ; %v_102 = <float at byte offset 0 of %17, float at byte offset 4 of %17> |
| 26 | + %v_100 = insertelement <2 x float> undef, float %18, i32 0 |
| 27 | + %v_102 = insertelement <2 x float> %v_100, float %20, i32 1 |
| 28 | + |
| 29 | + %21 = getelementptr inbounds i8, ptr %17, i64 8 |
| 30 | + %22 = load float, ptr %21, align 8 |
| 31 | + %23 = getelementptr inbounds i8, ptr %17, i64 12 |
| 32 | + %24 = load float, ptr %23, align 4 |
| 33 | + ; %v_202 = <float at byte offset 8 of %17, float at byte offset 12 of %17> |
| 34 | + %v_200 = insertelement <2 x float> undef, float %22, i32 0 |
| 35 | + %v_202 = insertelement <2 x float> %v_200, float %24, i32 1 |
| 36 | + ; %26 addresses the same element index in the second input; %27-%32 below have no later uses in this function. |
| 37 | + %25 = getelementptr float, ptr addrspace(1) %1, i64 %15 |
| 38 | + %26 = addrspacecast ptr addrspace(1) %25 to ptr |
| 39 | + %27 = sext i32 %12 to i64 |
| 40 | + %28 = getelementptr float, ptr addrspace(1) %2, i64 %27 |
| 41 | + %29 = sext i32 %13 to i64 |
| 42 | + %30 = getelementptr float, ptr addrspace(1) %2, i64 %29 |
| 43 | + %31 = sext i32 %14 to i64 |
| 44 | + %32 = getelementptr float, ptr addrspace(1) %2, i64 %31 |
| 45 | + %33 = getelementptr inbounds i8, ptr %26, i64 12 |
| 46 | + %34 = load float, ptr %33, align 4 |
| 47 | + |
| 48 | + %36 = getelementptr inbounds i8, ptr %26, i64 8 |
| 49 | + %37 = load float, ptr %36, align 8 |
| 50 | + ; %v_302 = <float at byte offset 12 of %26, float at byte offset 8 of %26>, i.e. the reverse of %v_202's offset order |
| 51 | + %v_300 = insertelement <2 x float> undef, float %34, i32 0 |
| 52 | + %v_302 = insertelement <2 x float> %v_300, float %37, i32 1 |
| 53 | + |
| 54 | + %39 = getelementptr inbounds i8, ptr %26, i64 4 |
| 55 | + %40 = load float, ptr %39, align 4 |
| 56 | + %42 = load float, ptr %26, align 16 |
| 57 | + ; %v_402 = <float at byte offset 4 of %26, float at byte offset 0 of %26>, i.e. the reverse of %v_102's offset order |
| 58 | + %v_400 = insertelement <2 x float> undef, float %40, i32 0 |
| 59 | + %v_402 = insertelement <2 x float> %v_400, float %42, i32 1 |
| 60 | + |
| 61 | + %v_500 = fadd <2 x float> %v_102, %v_402 |
| 62 | + %v_501 = fadd <2 x float> %v_202, %v_302 |
| 63 | + ; tail call void @llvm.amdgcn.iglp.opt(i32 4) |
| 64 | + |
| 65 | + %v_45 = extractelement <2 x float> %v_501, i32 1 |
| 66 | + %v_32 = extractelement <2 x float> %v_501, i32 0 |
| 67 | + %v_30 = extractelement <2 x float> %v_500, i32 1 |
| 68 | + %v_28 = extractelement <2 x float> %v_500, i32 0 |
| 69 | + ; Each of the four lanes is stored exactly once: index %10 <- %v_45, %14 <- %v_32, %13 <- %v_30, %12 <- %v_28. |
| 70 | + %i_44 = sext i32 %10 to i64 |
| 71 | + %p_45 = getelementptr float, ptr addrspace(1) %2, i64 %i_44 |
| 72 | + store float %v_45, ptr addrspace(1) %p_45, align 4 |
| 73 | + |
| 74 | + %i_31 = sext i32 %14 to i64 |
| 75 | + %p_32 = getelementptr float, ptr addrspace(1) %2, i64 %i_31 |
| 76 | + store float %v_32, ptr addrspace(1) %p_32, align 4 |
| 77 | + |
| 78 | + %i_29 = sext i32 %13 to i64 |
| 79 | + %p_30 = getelementptr float, ptr addrspace(1) %2, i64 %i_29 |
| 80 | + store float %v_30, ptr addrspace(1) %p_30, align 4 |
| 81 | + |
| 82 | + %i_27 = sext i32 %12 to i64 |
| 83 | + %p_28 = getelementptr float, ptr addrspace(1) %2, i64 %i_27 |
| 84 | + store float %v_28, ptr addrspace(1) %p_28, align 4 |
| 85 | + |
| 86 | + br label %.critedge2 |
| 87 | + |
| 88 | +.critedge2: ; preds = %4, %.critedge |
| 89 | + ret void |
| 90 | +} |
0 commit comments