11; RUN: llc -mcpu=gfx942 < %s | FileCheck %s
2+ ; CHECK-LABEL: test_mfma
23; CHECK: v_add_f32_e32
34; CHECK-NEXT: ;;#ASMSTART
45; CHECK-NEXT: v_mfma_f64
89target triple = "amdgcn-amd-amdhsa"
910
1011; Function Attrs: convergent mustprogress norecurse nounwind
11- define protected amdgpu_kernel void @_Z17group4_sum_floaatPfPKfi (ptr addrspace (1 ) noalias noundef writeonly captures(none) %to.coerce , ptr addrspace (1 ) noalias noundef readonly captures(none) %from.coerce , i32 noundef %length ) local_unnamed_addr #0 {
; test_valu: kernel that sums four floats per work-item, using an inline-asm
; v_add_f32_e32 for one of the additions, and then issues three
; llvm.amdgcn.sched.group.barrier calls.
;   %to.coerce   - global (addrspace 1) output pointer, written at element [idx]
;   %from.coerce - global (addrspace 1) input pointer, read at elements [4*idx .. 4*idx+3]
;   %k           - integer; 3*%k is stored through %ret.coerce
;   %ret.coerce  - global (addrspace 1) pointer receiving the i32 result
;   %length      - upper bound (signed compare) on the work-item index
12+ define protected amdgpu_kernel void @test_valu (ptr addrspace (1 ) noalias noundef writeonly captures(none) %to.coerce , ptr addrspace (1 ) noalias noundef readonly captures(none) %from.coerce , i32 noundef %k , ptr addrspace (1 ) noundef writeonly captures(none) %ret.coerce , i32 noundef %length ) local_unnamed_addr #0 {
13+ entry:
; idx (%add) = workgroup.id.x * 64 + workitem.id.x; the shift by 6 is the multiply by 64.
14+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x ()
15+ %mul = shl i32 %0 , 6
16+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x ()
17+ %add = add i32 %mul , %1
; Only work-items with idx < %length (signed) do any work.
18+ %cmp = icmp slt i32 %add , %length
19+ br i1 %cmp , label %if.then , label %if.end
20+
21+ if.then: ; preds = %entry
22+ %idx.ext = sext i32 %add to i64
23+ %add.ptr = getelementptr inbounds float , ptr addrspace (1 ) %to.coerce , i64 %idx.ext
; Input element offset is 4*idx, i.e. one <4 x float> vector per work-item.
24+ %mul4 = shl nsw i32 %add , 2
25+ %idx.ext5 = sext i32 %mul4 to i64
26+ %add.ptr6 = getelementptr inbounds float , ptr addrspace (1 ) %from.coerce , i64 %idx.ext5
27+ %2 = load <4 x float >, ptr addrspace (1 ) %add.ptr6 , align 16
28+ %3 = extractelement <4 x float > %2 , i64 3
29+ %4 = extractelement <4 x float > %2 , i64 0
; Elements 3 and 0 are added by hand-written inline asm (attribute group #3)
; rather than by an IR fadd.
30+ %5 = tail call contract noundef float asm "v_add_f32_e32 $0, $1, $2" , "=v,v,v" (float %3 , float %4 ) #3
31+ %6 = extractelement <4 x float > %2 , i64 1
32+ %7 = extractelement <4 x float > %2 , i64 2
; Elements 1 and 2 are added with ordinary fadd, then combined with the asm result.
33+ %add7 = fadd contract float %6 , %7
34+ %add8 = fadd contract float %5 , %add7
35+ store float %add8 , ptr addrspace (1 ) %add.ptr , align 4
; Independent integer work: store 3 * %k through %ret.coerce.
36+ %mul9 = mul nsw i32 %k , 3
37+ store i32 %mul9 , ptr addrspace (1 ) %ret.coerce , align 4
; Scheduling groups, arguments are (mask, size, syncID): (2, 6, 0), (4, 1, 0), (2, 4, 0).
; NOTE(review): per the AMDGPU usage guide mask 2 should select VALU and mask 4
; SALU instructions - confirm against the intrinsic documentation.
38+ tail call void @llvm.amdgcn.sched.group.barrier (i32 2 , i32 6 , i32 0 )
39+ tail call void @llvm.amdgcn.sched.group.barrier (i32 4 , i32 1 , i32 0 )
40+ tail call void @llvm.amdgcn.sched.group.barrier (i32 2 , i32 4 , i32 0 )
41+ br label %if.end
42+
43+ if.end: ; preds = %if.then, %entry
44+ ret void
45+ }
46+
47+ ; Function Attrs: convergent mustprogress norecurse nounwind
48+ define protected amdgpu_kernel void @test_mfma (ptr addrspace (1 ) noalias noundef writeonly captures(none) %to.coerce , ptr addrspace (1 ) noalias noundef readonly captures(none) %from.coerce , i32 noundef %length ) local_unnamed_addr #0 {
1249entry:
1350 %0 = tail call i32 @llvm.amdgcn.workgroup.id.x ()
1451 %mul = shl i32 %0 , 6
@@ -47,6 +84,45 @@ if.end: ; preds = %if.then, %entry
4784 ret void
4885}
4986
; Internal array of 8 floats in address space 3, undef-initialized and 16-byte
; aligned. Its address is the operand of the ds_read_b32 inline asm in @test_ds.
; NOTE(review): address space 3 is presumably the AMDGPU local (LDS) address
; space - confirm against the AMDGPU usage guide.
87+ @_ZZ16group4_sum_floatPfPKfE6cpymem = internal addrspace (3 ) global [8 x float ] undef , align 16
88+
89+ ; Function Attrs: convergent mustprogress norecurse nounwind
; test_ds: kernel that combines two floats loaded from global memory with a
; value read by an inline-asm ds_read_b32, and then issues three
; llvm.amdgcn.sched.group.barrier calls.
;   %to.coerce   - global (addrspace 1) output pointer, written at element [idx]
;   %from.coerce - global (addrspace 1) input pointer, read at elements [4*idx .. 4*idx+3]
;   %length      - upper bound (signed compare) on the work-item index
90+ define protected amdgpu_kernel void @test_ds (ptr addrspace (1 ) noalias noundef writeonly captures(none) %to.coerce , ptr addrspace (1 ) noalias noundef readonly captures(none) %from.coerce , i32 noundef %length ) local_unnamed_addr #0 {
91+ entry:
; idx (%add) = workgroup.id.x * 64 + workitem.id.x; the shift by 6 is the multiply by 64.
92+ %0 = tail call i32 @llvm.amdgcn.workgroup.id.x ()
93+ %mul = shl i32 %0 , 6
94+ %1 = tail call i32 @llvm.amdgcn.workitem.id.x ()
95+ %add = add i32 %mul , %1
; Only work-items with idx < %length (signed) do any work.
96+ %cmp = icmp slt i32 %add , %length
97+ br i1 %cmp , label %if.then , label %if.end
98+
99+ if.then: ; preds = %entry
100+ %idx.ext = sext i32 %add to i64
101+ %add.ptr = getelementptr inbounds float , ptr addrspace (1 ) %to.coerce , i64 %idx.ext
; Input element offset is 4*idx; the four floats are fetched as two <2 x float> loads.
102+ %mul3 = shl nsw i32 %add , 2
103+ %idx.ext4 = sext i32 %mul3 to i64
104+ %add.ptr5 = getelementptr inbounds float , ptr addrspace (1 ) %from.coerce , i64 %idx.ext4
105+ %2 = load <2 x float >, ptr addrspace (1 ) %add.ptr5 , align 16
; Second half: elements [4*idx+2 .. 4*idx+3].
; NOTE(review): this address is 8 bytes past %add.ptr5, yet both loads are
; marked align 16; both cannot hold at once - confirm whether align 8 was meant.
106+ %a20 = add i64 %idx.ext4 , 2
107+ %a21 = getelementptr inbounds float , ptr addrspace (1 ) %from.coerce , i64 %a20
108+ %a22 = load <2 x float >, ptr addrspace (1 ) %a21 , align 16
; %3 and %4 are extracted but have no users in this function.
109+ %3 = extractelement <2 x float > %a22 , i64 1
110+ %4 = extractelement <2 x float > %2 , i64 0
; Inline-asm ds_read_b32 whose address operand is @..cpymem cast to a generic
; pointer and then converted to i32. The asm carries a memory clobber and uses
; attribute group #4 (no memory(none), unlike #3).
111+ %5 = tail call contract noundef float asm "ds_read_b32 $0, $1 offset:0" , "=v,v,~{memory}" (i32 ptrtoint (ptr addrspacecast (ptr addrspace (3 ) @_ZZ16group4_sum_floatPfPKfE6cpymem to ptr ) to i32 )) #4
112+ %6 = extractelement <2 x float > %2 , i64 1
113+ %7 = extractelement <2 x float > %a22 , i64 0
; Result = asm value + (element 1 + element 2), stored to to[idx].
114+ %add6 = fadd contract float %6 , %7
115+ %add7 = fadd contract float %5 , %add6
116+ store float %add7 , ptr addrspace (1 ) %add.ptr , align 4
; Scheduling groups, arguments are (mask, size, syncID): (2, 7, 0), (128, 1, 0), (2, 1, 0).
; NOTE(review): per the AMDGPU usage guide mask 2 should select VALU and mask
; 128 (0x80) DS instructions - confirm against the intrinsic documentation.
117+ tail call void @llvm.amdgcn.sched.group.barrier (i32 2 , i32 7 , i32 0 )
118+ tail call void @llvm.amdgcn.sched.group.barrier (i32 128 , i32 1 , i32 0 )
119+ tail call void @llvm.amdgcn.sched.group.barrier (i32 2 , i32 1 , i32 0 )
120+ br label %if.end
121+
122+ if.end: ; preds = %if.then, %entry
123+ ret void
124+ }
125+
50126; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
51127declare noundef i32 @llvm.amdgcn.workgroup.id.x () #1
52128
@@ -56,7 +132,8 @@ declare noundef range(i32 0, 1024) i32 @llvm.amdgcn.workitem.id.x() #1
56132; Function Attrs: convergent nocallback nofree nounwind willreturn
57133declare void @llvm.amdgcn.sched.group.barrier (i32 immarg, i32 immarg, i32 immarg) #2
58134
59- attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="0" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
135+ attributes #0 = { convergent mustprogress norecurse nounwind "amdgpu-agpr-alloc"="1" "amdgpu-flat-work-group-size"="1,1024" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx942" "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" "uniform-work-group-size"="true" }
; #1: attached to the @llvm.amdgcn.workgroup.id.x / workitem.id.x declarations.
60136attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; #2: attached to the @llvm.amdgcn.sched.group.barrier declaration.
61137attributes #2 = { convergent nocallback nofree nounwind willreturn }
; #3: used by the v_add_f32_e32 inline-asm call in @test_valu (declares no memory access).
62138attributes #3 = { convergent nounwind memory(none) }
; #4: used by the ds_read_b32 inline-asm call in @test_ds; same as #3 but without memory(none).
139+ attributes #4 = { convergent nounwind }
0 commit comments