Skip to content

Commit a629119

Browse files
authored
[AMDGPU] Remove wave64 functions (#153690)
gfx1250 only supports wave32.
1 parent 2775c79 commit a629119

File tree

4 files changed

+253
-3
lines changed

4 files changed

+253
-3
lines changed

llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -195,13 +195,17 @@ bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
195195

196196
// Delete FeatureWavefrontSize32 functions for
197197
// gfx9 and below targets that don't support the mode.
198-
// gfx10+ is implied to support both wave32 and 64 features.
198+
// gfx10, gfx11, gfx12 are implied to support both wave32 and 64 features.
199199
// They are not in the feature set. So, we need a separate check
200-
if (ST->getGeneration() < AMDGPUSubtarget::GFX10 &&
201-
ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
200+
if (!ST->supportsWave32() && ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
202201
reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32);
203202
return true;
204203
}
204+
// gfx125x targets only support FeatureWavefrontSize32, so remove wave64 functions.
205+
if (!ST->supportsWave64() && ST->hasFeature(AMDGPU::FeatureWavefrontSize64)) {
206+
reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize64);
207+
return true;
208+
}
205209
return false;
206210
}
207211

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1726,6 +1726,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
17261726
/// unit requirement.
17271727
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
17281728

1729+
/// \returns true if this subtarget's generation is GFX10 or later.
bool supportsWave32() const { return getGeneration() >= GFX10; }
1730+
1731+
/// \returns true unless this subtarget has GFX1250 instructions.
bool supportsWave64() const { return !hasGFX1250Insts(); }
1732+
17291733
bool isWave32() const {
17301734
return getWavefrontSize() == 32;
17311735
}
Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,208 @@
1+
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1250 <%s | FileCheck %s --check-prefixes=CHECK
2+
3+
; CHECK-LABEL: {{^}}_amdgpu_cs_main:
4+
; CHECK: ; TotalNumSgprs: 4
5+
; CHECK: ; NumVgprs: 2
6+
; CHECK: .amdgpu_pal_metadata
7+
; CHECK-NEXT: ---
8+
; CHECK-NEXT: amdpal.pipelines:
9+
; CHECK-NEXT: - .api: Vulkan
10+
; CHECK-NEXT: .compute_registers:
11+
; CHECK-NEXT: .tg_size_en: true
12+
; CHECK-NEXT: .tgid_x_en: false
13+
; CHECK-NEXT: .tgid_y_en: false
14+
; CHECK-NEXT: .tgid_z_en: false
15+
; CHECK-NEXT: .tidig_comp_cnt: 0x1
16+
; CHECK-NEXT: .graphics_registers:
17+
; CHECK-NEXT: .ps_extra_lds_size: 0
18+
; CHECK-NEXT: .spi_ps_input_addr:
19+
; CHECK-NEXT: .ancillary_ena: false
20+
; CHECK-NEXT: .front_face_ena: true
21+
; CHECK-NEXT: .line_stipple_tex_ena: false
22+
; CHECK-NEXT: .linear_center_ena: true
23+
; CHECK-NEXT: .linear_centroid_ena: true
24+
; CHECK-NEXT: .linear_sample_ena: true
25+
; CHECK-NEXT: .persp_center_ena: true
26+
; CHECK-NEXT: .persp_centroid_ena: true
27+
; CHECK-NEXT: .persp_pull_model_ena: false
28+
; CHECK-NEXT: .persp_sample_ena: true
29+
; CHECK-NEXT: .pos_fixed_pt_ena: true
30+
; CHECK-NEXT: .pos_w_float_ena: false
31+
; CHECK-NEXT: .pos_x_float_ena: false
32+
; CHECK-NEXT: .pos_y_float_ena: false
33+
; CHECK-NEXT: .pos_z_float_ena: false
34+
; CHECK-NEXT: .sample_coverage_ena: false
35+
; CHECK-NEXT: .spi_ps_input_ena:
36+
; CHECK-NEXT: .ancillary_ena: false
37+
; CHECK-NEXT: .front_face_ena: false
38+
; CHECK-NEXT: .line_stipple_tex_ena: false
39+
; CHECK-NEXT: .linear_center_ena: false
40+
; CHECK-NEXT: .linear_centroid_ena: false
41+
; CHECK-NEXT: .linear_sample_ena: false
42+
; CHECK-NEXT: .persp_center_ena: false
43+
; CHECK-NEXT: .persp_centroid_ena: false
44+
; CHECK-NEXT: .persp_pull_model_ena: false
45+
; CHECK-NEXT: .persp_sample_ena: true
46+
; CHECK-NEXT: .pos_fixed_pt_ena: false
47+
; CHECK-NEXT: .pos_w_float_ena: false
48+
; CHECK-NEXT: .pos_x_float_ena: false
49+
; CHECK-NEXT: .pos_y_float_ena: false
50+
; CHECK-NEXT: .pos_z_float_ena: false
51+
; CHECK-NEXT: .sample_coverage_ena: false
52+
; CHECK-NEXT: .hardware_stages:
53+
; CHECK-NEXT: .cs:
54+
; CHECK-NEXT: .checksum_value: 0x9444d7d0
55+
; CHECK-NEXT: .debug_mode: false
56+
; CHECK-NEXT: .entry_point: _amdgpu_cs
57+
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
58+
; CHECK-NEXT: .excp_en: 0
59+
; CHECK-NEXT: .float_mode: 0xc0
60+
; CHECK-NEXT: .forward_progress: true
61+
; GFX11-NEXT: .ieee_mode: false
62+
; CHECK-NEXT: .image_op: false
63+
; CHECK-NEXT: .lds_size: 0
64+
; CHECK-NEXT: .mem_ordered: true
65+
; CHECK-NEXT: .scratch_en: false
66+
; CHECK-NEXT: .scratch_memory_size: 0
67+
; CHECK-NEXT: .sgpr_count: 0x4
68+
; CHECK-NEXT: .sgpr_limit: 0x6a
69+
; CHECK-NEXT: .threadgroup_dimensions:
70+
; CHECK-NEXT: - 0x1
71+
; CHECK-NEXT: - 0x400
72+
; CHECK-NEXT: - 0x1
73+
; CHECK-NEXT: .trap_present: false
74+
; CHECK-NEXT: .user_data_reg_map:
75+
; CHECK-NEXT: - 0x10000000
76+
; CHECK-NEXT: - 0xffffffff
77+
; CHECK-NEXT: - 0
78+
; CHECK-NEXT: - 0xffffffff
79+
; CHECK-NEXT: - 0xffffffff
80+
; CHECK-NEXT: - 0xffffffff
81+
; CHECK-NEXT: - 0xffffffff
82+
; CHECK-NEXT: - 0xffffffff
83+
; CHECK-NEXT: - 0xffffffff
84+
; CHECK-NEXT: - 0xffffffff
85+
; CHECK-NEXT: - 0xffffffff
86+
; CHECK-NEXT: - 0xffffffff
87+
; CHECK-NEXT: - 0xffffffff
88+
; CHECK-NEXT: - 0xffffffff
89+
; CHECK-NEXT: - 0xffffffff
90+
; CHECK-NEXT: - 0xffffffff
91+
; CHECK-NEXT: - 0xffffffff
92+
; CHECK-NEXT: - 0xffffffff
93+
; CHECK-NEXT: - 0xffffffff
94+
; CHECK-NEXT: - 0xffffffff
95+
; CHECK-NEXT: - 0xffffffff
96+
; CHECK-NEXT: - 0xffffffff
97+
; CHECK-NEXT: - 0xffffffff
98+
; CHECK-NEXT: - 0xffffffff
99+
; CHECK-NEXT: - 0xffffffff
100+
; CHECK-NEXT: - 0xffffffff
101+
; CHECK-NEXT: - 0xffffffff
102+
; CHECK-NEXT: - 0xffffffff
103+
; CHECK-NEXT: - 0xffffffff
104+
; CHECK-NEXT: - 0xffffffff
105+
; CHECK-NEXT: - 0xffffffff
106+
; CHECK-NEXT: - 0xffffffff
107+
; CHECK-NEXT: .user_sgprs: 0x3
108+
; CHECK-NEXT: .vgpr_count: 0x2
109+
; CHECK-NEXT: .vgpr_limit: 0x100
110+
; CHECK-NEXT: .wavefront_size: 0x20
111+
; CHECK-NEXT: .wgp_mode: false
112+
; CHECK-NEXT: .gs:
113+
; CHECK-NEXT: .debug_mode: false
114+
; CHECK-NEXT: .entry_point: _amdgpu_gs
115+
; CHECK-NEXT: .entry_point_symbol: gs_shader
116+
; CHECK-NEXT: .forward_progress: true
117+
; CHECK-NEXT: .lds_size: 0x400
118+
; CHECK-NEXT: .mem_ordered: true
119+
; CHECK-NEXT: .scratch_en: false
120+
; CHECK-NEXT: .scratch_memory_size: 0
121+
; CHECK-NEXT: .sgpr_count: 0x1
122+
; CHECK-NEXT: .vgpr_count: 0x1
123+
; CHECK-NEXT: .wgp_mode: false
124+
; CHECK-NEXT: .hs:
125+
; CHECK-NEXT: .debug_mode: false
126+
; CHECK-NEXT: .entry_point: _amdgpu_hs
127+
; CHECK-NEXT: .entry_point_symbol: hs_shader
128+
; CHECK-NEXT: .forward_progress: true
129+
; CHECK-NEXT: .lds_size: 0x1000
130+
; CHECK-NEXT: .mem_ordered: true
131+
; CHECK-NEXT: .scratch_en: false
132+
; CHECK-NEXT: .scratch_memory_size: 0
133+
; CHECK-NEXT: .sgpr_count: 0x1
134+
; CHECK-NEXT: .vgpr_count: 0x1
135+
; CHECK-NEXT: .wgp_mode: false
136+
; CHECK-NEXT: .ps:
137+
; CHECK-NEXT: .debug_mode: false
138+
; CHECK-NEXT: .entry_point: _amdgpu_ps
139+
; CHECK-NEXT: .entry_point_symbol: ps_shader
140+
; CHECK-NEXT: .forward_progress: true
141+
; CHECK-NEXT: .lds_size: 0
142+
; CHECK-NEXT: .mem_ordered: true
143+
; CHECK-NEXT: .scratch_en: false
144+
; CHECK-NEXT: .scratch_memory_size: 0
145+
; CHECK-NEXT: .sgpr_count: 0x1
146+
; CHECK-NEXT: .vgpr_count: 0x1
147+
; CHECK-NEXT: .wgp_mode: false
148+
; CHECK: .registers: {}
149+
; CHECK:amdpal.version:
150+
; CHECK-NEXT: - 0x3
151+
; CHECK-NEXT: - 0
152+
; CHECK-NEXT:...
153+
; CHECK-NEXT: .end_amdgpu_pal_metadata
154+
155+
define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
156+
.entry:
157+
%i = call i64 @llvm.amdgcn.s.getpc()
158+
%i1 = and i64 %i, -4294967296
159+
%i2 = zext i32 %arg1 to i64
160+
%i3 = or i64 %i1, %i2
161+
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
162+
%i5 = and i32 %arg2, 1023
163+
%i6 = lshr i32 %arg2, 10
164+
%i7 = and i32 %i6, 1023
165+
%i8 = add nuw nsw i32 %i7, %i5
166+
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
167+
%.idx = shl nuw nsw i32 %i8, 2
168+
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
169+
ret void
170+
}
171+
172+
define dllexport amdgpu_ps void @ps_shader() #1 {
173+
ret void
174+
}
175+
176+
@LDS.GS = external addrspace(3) global [1 x i32], align 4
177+
178+
define dllexport amdgpu_gs void @gs_shader() #2 {
179+
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
180+
store i32 0, ptr addrspace(3) %ptr, align 4
181+
ret void
182+
}
183+
184+
@LDS.HS = external addrspace(3) global [1024 x i32], align 4
185+
186+
define dllexport amdgpu_hs void @hs_shader() #2 {
187+
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
188+
store i32 0, ptr addrspace(3) %ptr, align 4
189+
ret void
190+
}
191+
192+
!amdgpu.pal.metadata.msgpack = !{!0}
193+
194+
; Function Attrs: nounwind willreturn memory(none)
195+
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1
196+
197+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
198+
declare i64 @llvm.amdgcn.s.getpc() #2
199+
200+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
201+
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3
202+
203+
attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" }
204+
205+
attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }
206+
207+
!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
208+
!1 = !{i32 7}
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 -stop-after=amdgpu-remove-incompatible-functions\
2+
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1250 %s
3+
; RUN: FileCheck --check-prefix=WARN-GFX1250 %s < %t
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+wavefrontsize32 < %s
5+
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 -stop-after=amdgpu-remove-incompatible-functions\
7+
; RUN: -pass-remarks=amdgpu-remove-incompatible-functions < %s 2>%t | FileCheck -check-prefixes=GFX1200 %s
8+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+wavefrontsize32,-wavefrontsize64 < %s
9+
10+
; WARN-GFX1250: removing function 'needs_wavefrontsize64': +wavefrontsize64 is not supported on the current target
11+
; WARN-GFX1250-NOT: not supported
12+
13+
define void @needs_wavefrontsize64(ptr %out) #0 {
14+
; GFX1250-NOT: @needs_wavefrontsize64
15+
; GFX1200: define void @needs_wavefrontsize64(
16+
%1 = tail call i64 @llvm.read_register.i64(metadata !0)
17+
%2 = tail call i64 @llvm.ctpop.i64(i64 %1)
18+
store i64 %2, ptr %out, align 4
19+
ret void
20+
}
21+
22+
define void @caller(ptr %out) {
23+
; GFX1250: call void null(
24+
; GFX1200: call void @needs_wavefrontsize64(
25+
call void @needs_wavefrontsize64(ptr %out)
26+
ret void
27+
}
28+
29+
declare i64 @llvm.read_register.i64(metadata)
30+
declare i64 @llvm.ctpop.i64(i64)
31+
32+
!0 = !{!"exec"}
33+
34+
attributes #0 = { "target-features"="+wavefrontsize64" }

0 commit comments

Comments
 (0)