@@ -59,63 +59,53 @@ define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
5959 ret void
6060}
6161
62- %struct.__tile1024i_str = type <{ i16 , i16 , [60 x i8 ], <256 x i32 > }>
63-
64- define dso_local void @__tile_dpbf8ps (ptr nocapture noundef %dst , ptr nocapture noundef readonly byval (%struct.__tile1024i_str ) align 64 %src1 , ptr nocapture noundef readonly byval (%struct.__tile1024i_str ) align 64 %src2 ) {
62+ ; Function Attrs: nounwind
63+ define dso_local void @__tile_dpbf8ps (ptr %dst , ptr %src1 , ptr %src2 ) #0 {
6564; CHECK-LABEL: __tile_dpbf8ps:
6665; CHECK: # %bb.0: # %entry
6766; CHECK-NEXT: pushq %rbp
68- ; CHECK-NEXT: .cfi_def_cfa_offset 16
69- ; CHECK-NEXT: .cfi_offset %rbp, -16
70- ; CHECK-NEXT: movq %rsp, %rbp
71- ; CHECK-NEXT: .cfi_def_cfa_register %rbp
72- ; CHECK-NEXT: pushq %rbx
73- ; CHECK-NEXT: andq $-1024, %rsp # imm = 0xFC00
74- ; CHECK-NEXT: subq $5120, %rsp # imm = 0x1400
75- ; CHECK-NEXT: .cfi_offset %rbx, -24
67+ ; CHECK-NEXT: subq $4976, %rsp # imm = 0x1370
7668; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
7769; CHECK-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
7870; CHECK-NEXT: movb $1, {{[0-9]+}}(%rsp)
79- ; CHECK-NEXT: movzwl 16(%rbp ), %eax
71+ ; CHECK-NEXT: movzwl (%rsi ), %eax
8072; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
8173; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
8274; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp)
83- ; CHECK-NEXT: movswq 1106(%rbp ), %rcx
75+ ; CHECK-NEXT: movswq 2(%rdx ), %rcx
8476; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
8577; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
8678; CHECK-NEXT: movw %cx, {{[0-9]+}}(%rsp)
87- ; CHECK-NEXT: movswq 18(%rbp ), %rdx
88- ; CHECK-NEXT: movw %dx , {{[0-9]+}}(%rsp)
89- ; CHECK-NEXT: movzwl %dx , %esi
90- ; CHECK-NEXT: movb %sil , {{[0-9]+}}(%rsp)
91- ; CHECK-NEXT: shrl $2, %esi
92- ; CHECK-NEXT: movb %sil , {{[0-9]+}}(%rsp)
79+ ; CHECK-NEXT: movswq 2(%rsi ), %r8
80+ ; CHECK-NEXT: movw %r8w , {{[0-9]+}}(%rsp)
81+ ; CHECK-NEXT: movzwl %r8w , %r9d
82+ ; CHECK-NEXT: movb %r9b , {{[0-9]+}}(%rsp)
83+ ; CHECK-NEXT: shrl $2, %r9d
84+ ; CHECK-NEXT: movb %r9b , {{[0-9]+}}(%rsp)
9385; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp)
9486; CHECK-NEXT: addq $64, %rdi
9587; CHECK-NEXT: tileloadd (%rdi,%rcx), %tmm0
96- ; CHECK-NEXT: leaq 80(%rbp) , %r8
97- ; CHECK-NEXT: tileloadd (%r8,%rdx ), %tmm1
98- ; CHECK-NEXT: leaq 1168(%rbp) , %r8
99- ; CHECK-NEXT: tileloadd (%r8 ,%rcx), %tmm2
100- ; CHECK-NEXT: movabsq $64, %rbx
101- ; CHECK-NEXT: tilestored %tmm0, 1024 (%rsp,%rbx ) # 1024-byte Folded Spill
102- ; CHECK-NEXT: tileloadd 1024 (%rsp,%rbx ), %tmm3 # 1024-byte Folded Reload
88+ ; CHECK-NEXT: addq $64 , %rsi
89+ ; CHECK-NEXT: tileloadd (%rsi,%r8 ), %tmm1
90+ ; CHECK-NEXT: addq $64 , %rdx
91+ ; CHECK-NEXT: tileloadd (%rdx ,%rcx), %tmm2
92+ ; CHECK-NEXT: movabsq $64, %rbp
93+ ; CHECK-NEXT: tilestored %tmm0, 896 (%rsp,%rbp ) # 1024-byte Folded Spill
94+ ; CHECK-NEXT: tileloadd 896 (%rsp,%rbp ), %tmm3 # 1024-byte Folded Reload
10395; CHECK-NEXT: tdpbf8ps %tmm2, %tmm1, %tmm3
10496; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
105- ; CHECK-NEXT: tilestored %tmm0, 2048 (%rsp,%rbx ) # 1024-byte Folded Spill
106- ; CHECK-NEXT: tileloadd 2048 (%rsp,%rbx ), %tmm3 # 1024-byte Folded Reload
97+ ; CHECK-NEXT: tilestored %tmm0, 1920 (%rsp,%rbp ) # 1024-byte Folded Spill
98+ ; CHECK-NEXT: tileloadd 1920 (%rsp,%rbp ), %tmm3 # 1024-byte Folded Reload
10799; CHECK-NEXT: tdpbhf8ps %tmm2, %tmm1, %tmm3
108100; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
109- ; CHECK-NEXT: tilestored %tmm0, 3072 (%rsp,%rbx ) # 1024-byte Folded Spill
110- ; CHECK-NEXT: tileloadd 3072 (%rsp,%rbx ), %tmm3 # 1024-byte Folded Reload
101+ ; CHECK-NEXT: tilestored %tmm0, 2944 (%rsp,%rbp ) # 1024-byte Folded Spill
102+ ; CHECK-NEXT: tileloadd 2944 (%rsp,%rbp ), %tmm3 # 1024-byte Folded Reload
111103; CHECK-NEXT: tdphbf8ps %tmm2, %tmm1, %tmm3
112104; CHECK-NEXT: tilestored %tmm3, (%rdi,%rcx)
113105; CHECK-NEXT: tdphf8ps %tmm2, %tmm1, %tmm0
114106; CHECK-NEXT: tilestored %tmm0, (%rdi,%rcx)
115- ; CHECK-NEXT: leaq -8(%rbp), %rsp
116- ; CHECK-NEXT: popq %rbx
107+ ; CHECK-NEXT: addq $4976, %rsp # imm = 0x1370
117108; CHECK-NEXT: popq %rbp
118- ; CHECK-NEXT: .cfi_def_cfa %rsp, 8
119109; CHECK-NEXT: tilerelease
120110; CHECK-NEXT: vzeroupper
121111; CHECK-NEXT: retq
@@ -162,3 +152,6 @@ declare x86_amx @llvm.x86.tdpbf8ps.internal(i16, i16, i16, x86_amx, x86_amx, x86
162152declare x86_amx @llvm.x86.tdpbhf8ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
163153declare x86_amx @llvm.x86.tdphbf8ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
164154declare x86_amx @llvm.x86.tdphf8ps.internal (i16 , i16 , i16 , x86_amx, x86_amx, x86_amx)
155+
156+ attributes #0 = { nounwind }
157+
0 commit comments