@@ -59,6 +59,91 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
   ret void
 }
 
+; Function Attrs: nounwind
+define dso_local void @__tile_dpbf8ps(ptr %dst, ptr %src1, ptr %src2) #0 {
+; CHECK-LABEL: __tile_dpbf8ps:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    subq $4976, %rsp # imm = 0x1370
+; CHECK-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl (%rsi), %eax
+; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movswq 2(%rdx), %rcx
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movswq 2(%rsi), %r8
+; CHECK-NEXT:    movw %r8w, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movzwl %r8w, %r9d
+; CHECK-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    shrl $2, %r9d
+; CHECK-NEXT:    movb %r9b, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    addq $64, %rdi
+; CHECK-NEXT:    tileloadd (%rdi,%rcx), %tmm0
+; CHECK-NEXT:    addq $64, %rsi
+; CHECK-NEXT:    tileloadd (%rsi,%r8), %tmm1
+; CHECK-NEXT:    addq $64, %rdx
+; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
+; CHECK-NEXT:    movabsq $64, %rbp
+; CHECK-NEXT:    tilestored %tmm0, 896(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd 896(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tdpbf8ps %tmm2, %tmm1, %tmm3
+; CHECK-NEXT:    tilestored %tmm3, (%rdi,%rcx)
+; CHECK-NEXT:    tilestored %tmm0, 1920(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd 1920(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tdpbhf8ps %tmm2, %tmm1, %tmm3
+; CHECK-NEXT:    tilestored %tmm3, (%rdi,%rcx)
+; CHECK-NEXT:    tilestored %tmm0, 2944(%rsp,%rbp) # 1024-byte Folded Spill
+; CHECK-NEXT:    tileloadd 2944(%rsp,%rbp), %tmm3 # 1024-byte Folded Reload
+; CHECK-NEXT:    tdphbf8ps %tmm2, %tmm1, %tmm3
+; CHECK-NEXT:    tilestored %tmm3, (%rdi,%rcx)
+; CHECK-NEXT:    tdphf8ps %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rcx)
+; CHECK-NEXT:    addq $4976, %rsp # imm = 0x1370
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i16, ptr %src1, align 64
+  %col = getelementptr inbounds nuw i8, ptr %src2, i64 2
+  %1 = load i16, ptr %col, align 2
+  %col1 = getelementptr inbounds nuw i8, ptr %src1, i64 2
+  %2 = load i16, ptr %col1, align 2
+  %tile = getelementptr inbounds nuw i8, ptr %dst, i64 64
+  %3 = load <256 x i32>, ptr %tile, align 64
+  %tile2 = getelementptr inbounds nuw i8, ptr %src1, i64 64
+  %4 = load <256 x i32>, ptr %tile2, align 64
+  %tile3 = getelementptr inbounds nuw i8, ptr %src2, i64 64
+  %5 = load <256 x i32>, ptr %tile3, align 64
+  %6 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %3)
+  %7 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %4)
+  %8 = tail call x86_amx @llvm.x86.cast.vector.to.tile.v256i32(<256 x i32> %5)
+  ; tdpbf8ps: BF8 x BF8 dot product accumulated into FP32
+  %9 = tail call x86_amx @llvm.x86.tdpbf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+  %10 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %9)
+  store <256 x i32> %10, ptr %tile, align 64
+  ; tdpbhf8ps: BF8 x HF8 dot product accumulated into FP32
+  %11 = tail call x86_amx @llvm.x86.tdpbhf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+  %12 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %11)
+  store <256 x i32> %12, ptr %tile, align 64
+  ; tdphbf8ps: HF8 x BF8 dot product accumulated into FP32
+  %13 = tail call x86_amx @llvm.x86.tdphbf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+  %14 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %13)
+  store <256 x i32> %14, ptr %tile, align 64
+  ; tdphf8ps: HF8 x HF8 dot product accumulated into FP32
+  %15 = tail call x86_amx @llvm.x86.tdphf8ps.internal(i16 %0, i16 %1, i16 %2, x86_amx %6, x86_amx %7, x86_amx %8)
+  %16 = tail call <256 x i32> @llvm.x86.cast.tile.to.vector.v256i32(x86_amx %15)
+  store <256 x i32> %16, ptr %tile, align 64
+
+  ret void
+}
+
 declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
@@ -67,3 +152,6 @@ declare x86_amx @llvm.x86.tdpbf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86
 declare x86_amx @llvm.x86.tdpbhf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdphbf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare x86_amx @llvm.x86.tdphf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+
+attributes #0 = { nounwind }
+
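
For context, here is a minimal C sketch (not part of the test) of how the four FP8 tile dot products above could be driven through the _tile_* intrinsics. It assumes the _tile_dpbf8ps/_tile_dpbhf8ps/_tile_dphbf8ps/_tile_dphf8ps macros that recent compilers expose for AMX-FP8 via <immintrin.h>, a build with -mamx-tile -mamx-fp8, and a hypothetical M=16, K=64, N=16 shape so the fp32 accumulator and both fp8 source tiles share one 16-row x 64-byte geometry; the struct mirrors the 64-byte tile-config format the CHECK lines build on the stack, and all function and buffer names are illustrative.

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

// 64-byte ldtilecfg operand: palette at byte 0, per-tile widths in
// bytes at bytes 16..47, per-tile row counts at bytes 48..63.
struct tilecfg {
  uint8_t palette_id;
  uint8_t start_row;
  uint8_t reserved[14];
  uint16_t colsb[16];
  uint8_t rows[16];
};

void fp8_tdp_sketch(const void *a, const void *b, float *c) {
  struct tilecfg cfg;
  memset(&cfg, 0, sizeof(cfg));
  cfg.palette_id = 1;
  for (int t = 0; t < 3; ++t) {  // tmm0..tmm2: 16 rows x 64 bytes each
    cfg.colsb[t] = 64;
    cfg.rows[t] = 16;
  }
  _tile_loadconfig(&cfg);
  _tile_loadd(0, c, 64);         // tmm0: fp32 accumulator (16x16 floats)
  _tile_loadd(1, a, 64);         // tmm1: fp8 source tile
  _tile_loadd(2, b, 64);         // tmm2: fp8 source tile
  _tile_dpbf8ps(0, 1, 2);        // BF8 x BF8 -> fp32 accumulate
  _tile_dpbhf8ps(0, 1, 2);       // BF8 x HF8 -> fp32 accumulate
  _tile_dphbf8ps(0, 1, 2);       // HF8 x BF8 -> fp32 accumulate
  _tile_dphf8ps(0, 1, 2);        // HF8 x HF8 -> fp32 accumulate
  _tile_stored(0, c, 64);
  _tile_release();
}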