|
; Test driver: this file is piped through `llc -O3 -mcpu=gfx942` and the
; resulting assembly is handed to FileCheck.
; The CHECK sequence below requires a v_add_f32_e32, then on the very next
; lines the inline-asm region (;;#ASMSTART, one v_add_f32_e32 line, ;;#ASMEND),
; and finally another v_add_f32_e32 somewhere after that region.
| 1 | +; RUN: llc -O3 -mcpu=gfx942 < %s | FileCheck %s |
| 2 | +; CHECK: v_add_f32_e32 |
| 3 | +; CHECK-NEXT: ;;#ASMSTART |
| 4 | +; CHECK-NEXT: v_add_f32_e32 |
| 5 | +; CHECK-NEXT: ;;#ASMEND |
| 6 | +; CHECK: v_add_f32_e32 |
| 7 | +; ModuleID = '<stdin>' |
| 8 | +source_filename = "llvm-link" |
| 9 | +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128:128:48-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" |
| 10 | +target triple = "amdgcn-amd-amdhsa" |
| 11 | + |
; @llvm.compiler.used holds a one-element list whose entry is the
; @__hip_cuid_* symbol (addrspacecast from addrspace(1) to a generic ptr).
; The symbol itself is a single zero-initialised i8 in addrspace(1).
; NOTE(review): presumably this is the HIP compilation-unit id emitted by the
; front end and is unrelated to what the test checks - confirm before removing.
| 12 | +@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_bffb86447932ec40 to ptr)], section "llvm.metadata" |
| 13 | +@__hip_cuid_bffb86447932ec40 = addrspace(1) global i8 0 |
| 14 | + |
| 15 | +; Function Attrs: convergent mustprogress norecurse nounwind |
; Kernel under test. A work-item whose index is below %length loads four
; consecutive floats from %from.coerce, adds them together and stores the one
; result to %to.coerce. One of the three additions is written as inline asm so
; the test can observe where that asm ends up relative to the compiler's own
; v_add_f32 instructions (see the CHECK lines at the top of the file).
;   %to.coerce   - addrspace(1) destination pointer (writeonly)
;   %from.coerce - addrspace(1) source pointer (readonly)
;   %length      - exclusive upper bound for the index, compared as signed i32
| 16 | +define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi(ptr addrspace(1) noalias noundef writeonly captures(none) %to.coerce, ptr addrspace(1) noalias noundef readonly captures(none) %from.coerce, i32 noundef %length) local_unnamed_addr #0 { |
| 17 | +entry: |
; index (%add) = workgroup.id.x * 64 + workitem.id.x   (the shl by 6 is the *64)
| 18 | + %0 = tail call i32 @llvm.amdgcn.workgroup.id.x() |
| 19 | + %mul = shl i32 %0, 6 |
| 20 | + %1 = tail call i32 @llvm.amdgcn.workitem.id.x() |
| 21 | + %add = add i32 %mul, %1 |
; Work-items with index >= %length skip straight to the return.
| 22 | + %cmp = icmp slt i32 %add, %length |
| 23 | + br i1 %cmp, label %if.then, label %if.end |
| 24 | + |
| 25 | +if.then: ; preds = %entry |
; Destination element: &to[index].
| 26 | + %idx.ext = sext i32 %add to i64 |
| 27 | + %add.ptr = getelementptr inbounds float, ptr addrspace(1) %to.coerce, i64 %idx.ext |
; Source element: &from[index * 4] (shl by 2), read as one 16-byte-aligned
; <4 x float>.
| 28 | + %mul3 = shl nsw i32 %add, 2 |
| 29 | + %idx.ext4 = sext i32 %mul3 to i64 |
| 30 | + %add.ptr5 = getelementptr inbounds float, ptr addrspace(1) %from.coerce, i64 %idx.ext4 |
| 31 | + %2 = load <4 x float>, ptr addrspace(1) %add.ptr5, align 16, !tbaa !0 |
; Lanes 3 and 0 are added by a hand-written v_add_f32_e32 with VGPR operands
; ("=v,v,v"). The "; SGMASK:0x1" suffix sits inside the asm string as an
; assembler comment.
; NOTE(review): presumably that suffix tells the scheduler which sched-group
; mask the inline asm should be classified under - confirm against the change
; this test accompanies.
| 32 | + %3 = extractelement <4 x float> %2, i64 3 |
| 33 | + %4 = extractelement <4 x float> %2, i64 0 |
| 34 | + %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2 ; SGMASK:0x1", "=v,v,v"(float %3, float %4) #3, !srcloc !3 |
; Lanes 1 and 2 are added with a regular fadd, then combined with the asm
; result; the total is stored to &to[index].
| 35 | + %6 = extractelement <4 x float> %2, i64 1 |
| 36 | + %7 = extractelement <4 x float> %2, i64 2 |
| 37 | + %add6 = fadd contract float %6, %7 |
| 38 | + %add7 = fadd contract float %5, %add6 |
| 39 | + store float %add7, ptr addrspace(1) %add.ptr, align 4, !tbaa !4 |
; Six sched.group.barrier requests, all with third operand 0. First operands
; are 16, 2, 16, 2, 1, 2 and second operands are 1, 5, 1, 1, 1, 1.
; NOTE(review): per the AMDGPU usage guide the operands are believed to be
; (mask, group size, sync id) with 0x10 = VMEM, 0x2 = VALU, 0x1 = ALU - confirm
; against the documentation before relying on this reading.
| 40 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0) |
| 41 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 5, i32 0) |
| 42 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 16, i32 1, i32 0) |
| 43 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0) |
| 44 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 1, i32 1, i32 0) |
| 45 | + tail call void @llvm.amdgcn.sched.group.barrier(i32 2, i32 1, i32 0) |
| 46 | + br label %if.end |
| 47 | + |
| 48 | +if.end: ; preds = %if.then, %entry |
| 49 | + ret void |
| 50 | +} |
| 51 | + |
; Declarations of the three intrinsics the kernel calls: the two id queries
; take no arguments and return i32 (workitem.id.x is declared with result range
; [0, 1024)); sched.group.barrier takes three immediate i32 operands and
; returns nothing.
| 52 | +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) |
| 53 | +declare noundef i32 @llvm.amdgcn.workgroup.id.x() #1 |
| 54 | + |
| 55 | +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) |
| 56 | +declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1 |
| 57 | + |
| 58 | +; Function Attrs: convergent nocallback nofree nounwind willreturn |
| 59 | +declare void @llvm.amdgcn.sched.group.barrier(i32 immarg, i32 immarg, i32 immarg) #2 |
| 60 | + |
; Attribute groups and who uses them:
;   #0 - the kernel definition (target-cpu gfx942, wavefrontsize64, ...)
;   #1 - the workgroup.id.x / workitem.id.x declarations
;   #2 - the sched.group.barrier declaration
;   #3 - the inline-asm call site
; NOTE(review): #0 carries "amdgpu-no-workgroup-id-x" and
; "amdgpu-no-workitem-id-x" even though the kernel calls both id intrinsics -
; confirm this is what the producing toolchain emitted and that it is harmless
; for this test.
| 61 | +attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" } |
| 62 | +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } |
| 63 | +attributes #2 = { convergent nocallback nofree nounwind willreturn } |
| 64 | +attributes #3 = { convergent nounwind memory(none) } |
| 65 | + |
; Metadata nodes:
;   !0 - !tbaa tag on the <4 x float> load ("omnipotent char" access type)
;   !1, !2 - TBAA type node for "omnipotent char" and the "Simple C++ TBAA" root
;   !3 - !srcloc attached to the inline-asm call
;   !4 - !tbaa tag on the float store; !5 is its "float" type node
| 66 | +!0 = !{!1, !1, i64 0} |
| 67 | +!1 = !{!"omnipotent char", !2, i64 0} |
| 68 | +!2 = !{!"Simple C++ TBAA"} |
| 69 | +!3 = !{i64 129} |
| 70 | +!4 = !{!5, !5, i64 0} |
| 71 | +!5 = !{!"float", !1, i64 0} |
0 commit comments