Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion llvm/docs/AMDGPUUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5855,7 +5855,7 @@ The fields used by CP for code objects before V3 also match those specified in
GFX950
roundup(lds-size / (320 * 4))
GFX125*
roundup(lds-size / (256 * 4))
roundup(lds-size / (512 * 4))

24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution
_INVALID_OPERATION with specified exceptions
Expand Down
23 changes: 11 additions & 12 deletions llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1160,21 +1160,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make the clamp modifier return 0 on NaN input.
ProgInfo.DX10Clamp = Mode.DX10Clamp;

unsigned LDSAlignShift;
if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) {
// LDS is allocated in 256 dword blocks.
LDSAlignShift = 10;
} else if (STM.getFeatureBits().test(
FeatureAddressableLocalMemorySize163840)) {
// LDS is allocated in 320 dword blocks.
unsigned LDSAlignShift = 8;
switch (getLdsDwGranularity(STM)) {
case 512:
case 320:
LDSAlignShift = 11;
} else if (STM.getFeatureBits().test(
FeatureAddressableLocalMemorySize65536)) {
// LDS is allocated in 128 dword blocks.
break;
case 128:
LDSAlignShift = 9;
} else {
// LDS is allocated in 64 dword blocks.
break;
case 64:
LDSAlignShift = 8;
break;
default:
llvm_unreachable("invalid LDS block size");
}

ProgInfo.SGPRSpill = MFI->getNumSpilledSGPRs();
Expand Down
11 changes: 9 additions & 2 deletions llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3546,8 +3546,15 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCInstrInfo &MII,
}

unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) {
return ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256
: 128;
if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize32768))
return 64;
if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize65536))
return 128;
if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize163840))
return 320;
if (ST.getFeatureBits().test(FeatureAddressableLocalMemorySize327680))
return 512;
return 64;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

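This final `return 64;` fallback should probably be an llvm_unreachable instead, so that a subtarget with no matching addressable-LDS-size feature is caught rather than silently given a granularity of 64?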

}

bool isPackedFP32Inst(unsigned Opc) {
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AMDGPU/extra-lds-size.ll
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@
; GFX1200-MESA: .long 45100
; GFX1200-MESA-NEXT: .long 1024

; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200
; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x100
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure how this test ends up showing the allocation granularity


; GFX1250-MESA: .long 45100
; GFX1250-MESA-NEXT: .long 512
; GFX1250-MESA-NEXT: .long 256

@lds = internal addrspace(3) global [4096 x i8] poison

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ define amdgpu_kernel void @test_lds_i32(i32 %val) {
; GCN-LABEL: test_lds_array_i8:
; GCN: .amdhsa_group_segment_fixed_size 327680
; GCN: ; LDSByteSize: 327680 bytes/workgroup
; MESA: granulated_lds_size = 320
; MESA: granulated_lds_size = 160
define amdgpu_kernel void @test_lds_array_i8() {
%gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5
%val = load i8, ptr addrspace(3) %gep
Expand All @@ -52,7 +52,7 @@ define amdgpu_kernel void @test_lds_array_i8() {
; GCN-LABEL: test_lds_array_i16:
; GCN: .amdhsa_group_segment_fixed_size 327680
; GCN: ; LDSByteSize: 327680 bytes/workgroup
; MESA: granulated_lds_size = 320
; MESA: granulated_lds_size = 160
define amdgpu_kernel void @test_lds_array_i16() {
%gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10
%val = load i16, ptr addrspace(3) %gep
Expand All @@ -63,7 +63,7 @@ define amdgpu_kernel void @test_lds_array_i16() {
; GCN-LABEL: test_lds_array_i32:
; GCN: .amdhsa_group_segment_fixed_size 327680
; GCN: ; LDSByteSize: 327680 bytes/workgroup
; MESA: granulated_lds_size = 320
; MESA: granulated_lds_size = 160
define amdgpu_kernel void @test_lds_array_i32() {
%gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20
%val = load i32, ptr addrspace(3) %gep
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx1250.ll
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .forward_progress: true
; CHECK-NEXT: .lds_size: 0x400
; CHECK-NEXT: .lds_size: 0x800
; CHECK-NEXT: .mem_ordered: true
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
Expand Down
215 changes: 215 additions & 0 deletions llvm/test/CodeGen/AMDGPU/pal-metadata-3.0.gfx950.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx950 <%s | FileCheck %s --check-prefixes=CHECK

; CHECK-LABEL: {{^}}_amdgpu_cs_main:
; CHECK: ; TotalNumSgprs: 6
; CHECK: ; NumVgprs: 1
; CHECK: .amdgpu_pal_metadata
; CHECK-NEXT: ---
; CHECK-NEXT: amdpal.pipelines:
; CHECK-NEXT: - .api: Vulkan
; CHECK-NEXT: .compute_registers:
; CHECK-NEXT: .tg_size_en: true
; CHECK-NEXT: .tgid_x_en: false
; CHECK-NEXT: .tgid_y_en: false
; CHECK-NEXT: .tgid_z_en: false
; CHECK-NEXT: .tidig_comp_cnt: 0x1
; CHECK-NEXT: .graphics_registers:
; CHECK-NEXT: .ps_extra_lds_size: 0
; CHECK-NEXT: .spi_ps_input_addr:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: true
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: true
; CHECK-NEXT: .linear_centroid_ena: true
; CHECK-NEXT: .linear_sample_ena: true
; CHECK-NEXT: .persp_center_ena: true
; CHECK-NEXT: .persp_centroid_ena: true
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: true
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .spi_ps_input_ena:
; CHECK-NEXT: .ancillary_ena: false
; CHECK-NEXT: .front_face_ena: false
; CHECK-NEXT: .line_stipple_tex_ena: false
; CHECK-NEXT: .linear_center_ena: false
; CHECK-NEXT: .linear_centroid_ena: false
; CHECK-NEXT: .linear_sample_ena: false
; CHECK-NEXT: .persp_center_ena: false
; CHECK-NEXT: .persp_centroid_ena: false
; CHECK-NEXT: .persp_pull_model_ena: false
; CHECK-NEXT: .persp_sample_ena: true
; CHECK-NEXT: .pos_fixed_pt_ena: false
; CHECK-NEXT: .pos_w_float_ena: false
; CHECK-NEXT: .pos_x_float_ena: false
; CHECK-NEXT: .pos_y_float_ena: false
; CHECK-NEXT: .pos_z_float_ena: false
; CHECK-NEXT: .sample_coverage_ena: false
; CHECK-NEXT: .hardware_stages:
; CHECK-NEXT: .cs:
; CHECK-NEXT: .agpr_count: 0
; CHECK-NEXT: .checksum_value: 0x9444d7d0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_cs
; CHECK-NEXT: .entry_point_symbol: _amdgpu_cs_main
; CHECK-NEXT: .excp_en: 0
; CHECK-NEXT: .float_mode: 0xc0
; CHECK-NEXT: .forward_progress: false
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .image_op: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: false
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0xa
; CHECK-NEXT: .sgpr_limit: 0x6a
; CHECK-NEXT: .threadgroup_dimensions:
; CHECK-NEXT: - 0x1
; CHECK-NEXT: - 0x400
; CHECK-NEXT: - 0x1
; CHECK-NEXT: .trap_present: false
; CHECK-NEXT: .user_data_reg_map:
; CHECK-NEXT: - 0x10000000
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: - 0xffffffff
; CHECK-NEXT: .user_sgprs: 0x3
; CHECK-NEXT: .vgpr_count: 0x2
; CHECK-NEXT: .vgpr_limit: 0x100
; CHECK-NEXT: .wavefront_size: 0x20
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .gs:
; CHECK-NEXT: .agpr_count: 0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_gs
; CHECK-NEXT: .entry_point_symbol: gs_shader
; CHECK-NEXT: .forward_progress: false
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0x500
; CHECK-NEXT: .mem_ordered: false
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x6
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .hs:
; CHECK-NEXT: .agpr_count: 0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_hs
; CHECK-NEXT: .entry_point_symbol: hs_shader
; CHECK-NEXT: .forward_progress: false
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0xa00
; CHECK-NEXT: .mem_ordered: false
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x6
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: false
; CHECK-NEXT: .ps:
; CHECK-NEXT: .agpr_count: 0
; CHECK-NEXT: .debug_mode: false
; CHECK-NEXT: .entry_point: _amdgpu_ps
; CHECK-NEXT: .entry_point_symbol: ps_shader
; CHECK-NEXT: .forward_progress: false
; CHECK-NEXT: .ieee_mode: false
; CHECK-NEXT: .lds_size: 0
; CHECK-NEXT: .mem_ordered: false
; CHECK-NEXT: .scratch_en: false
; CHECK-NEXT: .scratch_memory_size: 0
; CHECK-NEXT: .sgpr_count: 0x6
; CHECK-NEXT: .vgpr_count: 0x1
; CHECK-NEXT: .wgp_mode: false
; CHECK: .registers: {}
; CHECK:amdpal.version:
; CHECK-NEXT: - 0x3
; CHECK-NEXT: - 0
; CHECK-NEXT:...
; CHECK-NEXT: .end_amdgpu_pal_metadata

define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg %arg1, i32 %arg2) #0 !lgc.shaderstage !1 {
.entry:
%i = call i64 @llvm.amdgcn.s.getpc()
%i1 = and i64 %i, -4294967296
%i2 = zext i32 %arg1 to i64
%i3 = or i64 %i1, %i2
%i4 = inttoptr i64 %i3 to ptr addrspace(4)
%i5 = and i32 %arg2, 1023
%i6 = lshr i32 %arg2, 10
%i7 = and i32 %i6, 1023
%i8 = add nuw nsw i32 %i7, %i5
%i9 = load <4 x i32>, ptr addrspace(4) %i4, align 16
%.idx = shl nuw nsw i32 %i8, 2
call void @llvm.amdgcn.raw.buffer.store.i32(i32 1, <4 x i32> %i9, i32 %.idx, i32 0, i32 0)
ret void
}

define dllexport amdgpu_ps void @ps_shader() #1 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
define dllexport amdgpu_ps void @ps_shader() #1 {
define amdgpu_ps void @ps_shader() #1 {

ret void
}

@LDS.GS = external addrspace(3) global [1 x i32], align 4

define dllexport amdgpu_gs void @gs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.GS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}

@LDS.HS = external addrspace(3) global [1024 x i32], align 4

define dllexport amdgpu_hs void @hs_shader() #2 {
%ptr = getelementptr i32, ptr addrspace(3) @LDS.HS, i32 0
store i32 0, ptr addrspace(3) %ptr, align 4
ret void
}

!amdgpu.pal.metadata.msgpack = !{!0}

; Function Attrs: nounwind willreturn memory(none)
declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>) #1

; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
declare i64 @llvm.amdgcn.s.getpc() #2

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg) #3

attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1024,1024" "amdgpu-memory-bound"="false" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "amdgpu-work-group-info-arg-no"="4" "denormal-fp-math-f32"="preserve-sign" }

attributes #1 = { nounwind memory(readwrite) "InitialPSInputAddr"="36983" }

!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size \B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
Comment on lines +210 to +214
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a lot of extra stuff here that shouldn't be necessary to test allocation granularity

!1 = !{i32 7}
6 changes: 3 additions & 3 deletions llvm/test/MC/AMDGPU/hsa-gfx1250-v4.s
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000
// max_lds_size
// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000
// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000
// OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000
Expand Down Expand Up @@ -231,13 +231,13 @@ max_vgprs:

.p2align 6
.amdhsa_kernel max_lds_size
.amdhsa_group_segment_fixed_size 393216
.amdhsa_group_segment_fixed_size 327680
.amdhsa_next_free_vgpr 1
.amdhsa_next_free_sgpr 1
.end_amdhsa_kernel

// ASM: .amdhsa_kernel max_lds_size
// ASM: .amdhsa_group_segment_fixed_size 393216
// ASM: .amdhsa_group_segment_fixed_size 327680
// ASM: .end_amdhsa_kernel

// Test maximum VGPR allocation
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/MC/AMDGPU/hsa-gfx1251-v4.s
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 00f0 00000cc0 80000000 00040000 00000000
// max_lds_size
// OBJDUMP-NEXT: 0100 00000600 00000000 00000000 00000000
// OBJDUMP-NEXT: 0100 00000500 00000000 00000000 00000000
// OBJDUMP-NEXT: 0110 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0120 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0130 00000cc0 80000000 00040000 00000000
Expand Down Expand Up @@ -231,13 +231,13 @@ max_vgprs:

.p2align 6
.amdhsa_kernel max_lds_size
.amdhsa_group_segment_fixed_size 393216
.amdhsa_group_segment_fixed_size 327680
.amdhsa_next_free_vgpr 1
.amdhsa_next_free_sgpr 1
.end_amdhsa_kernel

// ASM: .amdhsa_kernel max_lds_size
// ASM: .amdhsa_group_segment_fixed_size 393216
// ASM: .amdhsa_group_segment_fixed_size 327680
// ASM: .end_amdhsa_kernel

// Test maximum VGPR allocation
Expand Down
Loading