diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5343d66b083c7..8d0786ab0440d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -5598,6 +5598,8 @@ The fields used by CP for code objects before V3 also match those specified in roundup(lds-size / (128 * 4)) GFX950 roundup(lds-size / (320 * 4)) + GFX125* + roundup(lds-size / (256 * 4)) 24 1 bit ENABLE_EXCEPTION_IEEE_754_FP Wavefront starts execution _INVALID_OPERATION with specified exceptions diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index f26639847be75..8e4b6365dc06b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1548,7 +1548,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11", def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12", "gfx12", - [FeatureFP64, FeatureAddressableLocalMemorySize65536, FeatureMIMG_R128, + [FeatureFP64, FeatureMIMG_R128, FeatureFlatAddressSpace, Feature16BitInsts, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts, @@ -1977,6 +1977,7 @@ def FeatureISAVersion11_5_3 : FeatureSet< def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, + FeatureAddressableLocalMemorySize65536, FeatureLDSBankCount32, FeatureDLInsts, FeatureDot7Insts, @@ -2019,6 +2020,7 @@ def FeatureISAVersion12_50 : FeatureSet< [FeatureGFX12, FeatureGFX1250Insts, FeatureCUStores, + FeatureAddressableLocalMemorySize327680, FeatureCuMode, Feature64BitLiterals, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 626734a4752f3..c7d2d268a2707 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1103,7 +1103,11 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, ProgInfo.DX10Clamp = Mode.DX10Clamp; unsigned LDSAlignShift; - if 
(STM.getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) { + if (STM.getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) { + // LDS is allocated in 256 dword blocks. + LDSAlignShift = 10; + } else if (STM.getFeatureBits().test( + FeatureAddressableLocalMemorySize163840)) { // LDS is allocated in 320 dword blocks. LDSAlignShift = 11; } else if (STM.getFeatureBits().test( diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td index 74d1faeb6f545..d14b5ce80d28e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -30,6 +30,7 @@ class SubtargetFeatureAddressableLocalMemorySize : SubtargetFeature< def FeatureAddressableLocalMemorySize32768 : SubtargetFeatureAddressableLocalMemorySize<32768>; def FeatureAddressableLocalMemorySize65536 : SubtargetFeatureAddressableLocalMemorySize<65536>; def FeatureAddressableLocalMemorySize163840 : SubtargetFeatureAddressableLocalMemorySize<163840>; +def FeatureAddressableLocalMemorySize327680 : SubtargetFeatureAddressableLocalMemorySize<327680>; class SubtargetFeatureWavefrontSize : SubtargetFeature< "wavefrontsize"#!shl(1, ValueLog2), diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index e0ac040bdd226..ec9f1abdd8467 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1160,6 +1160,8 @@ unsigned getAddressableLocalMemorySize(const MCSubtargetInfo *STI) { return 65536; if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize163840)) return 163840; + if (STI->getFeatureBits().test(FeatureAddressableLocalMemorySize327680)) + return 327680; return 0; } @@ -3340,8 +3342,8 @@ bool isDPALU_DPP(const MCInstrDesc &OpDesc, const MCSubtargetInfo &ST) { } unsigned getLdsDwGranularity(const MCSubtargetInfo &ST) { - // Currently this is 128 for all subtargets - return 128; + return 
ST.hasFeature(AMDGPU::FeatureAddressableLocalMemorySize327680) ? 256 + : 128; } bool isPackedFP32Inst(unsigned Opc) { diff --git a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll index e1ce5341efdd1..4349b18fd394c 100644 --- a/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll +++ b/llvm/test/CodeGen/AMDGPU/extra-lds-size.ll @@ -6,6 +6,8 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefix=GFX950-MESA %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-PAL %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200-MESA %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-PAL %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250-MESA %s ; Check EXTRA_LDS_SIZE in SPI_SHADER_PGM_RSRC2_PS. @@ -29,6 +31,11 @@ ; GFX1200-MESA: .long 45100 ; GFX1200-MESA-NEXT: .long 1024 +; GFX1250-PAL: '0x2c0b (SPI_SHADER_PGM_RSRC2_PS)': 0x200 + +; GFX1250-MESA: .long 45100 +; GFX1250-MESA-NEXT: .long 512 + @lds = internal addrspace(3) global [4096 x i8] poison define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset) { diff --git a/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll new file mode 100644 index 0000000000000..da92dcdd7104e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-limit-diagnostics-gfx1250.ll @@ -0,0 +1,13 @@ +; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; GFX1250 supports up to 320 KB LDS memory. +; This is a negative test to check when the LDS size exceeds the max usable limit. 
+ +; ERROR: error: :0:0: local memory (327684) exceeds limit (327680) in function 'test_lds_limit' +@dst = addrspace(3) global [81921 x i32] undef + +define amdgpu_kernel void @test_lds_limit(i32 %val) { + %gep = getelementptr [81921 x i32], ptr addrspace(3) @dst, i32 0, i32 100 + store i32 %val, ptr addrspace(3) %gep + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll new file mode 100644 index 0000000000000..3db0fa8f21759 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-size-hsa-gfx1250.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefix=MESA %s + +; GFX1250 supports up to 320 KB configurable LDS memory. +; This test checks the min and max size of LDS that can be allocated. + +@lds.i8 = addrspace(3) global i8 undef +@lds.array.i8 = addrspace(3) global [327679 x i8] undef +@lds.i16 = addrspace(3) global i16 undef +@lds.array.i16 = addrspace(3) global [163839 x i16] undef +@lds.i32 = addrspace(3) global i32 undef +@lds.array.i32 = addrspace(3) global [81919 x i32] undef + +; GCN-LABEL: test_lds_i8: +; GCN: .amdhsa_group_segment_fixed_size 1 +; GCN: ; LDSByteSize: 1 bytes/workgroup +; MESA: granulated_lds_size = 1 +define amdgpu_kernel void @test_lds_i8(i8 %val) { + store i8 %val, ptr addrspace(3) @lds.i8 + ret void +} + +; GCN-LABEL: test_lds_i16: +; GCN: .amdhsa_group_segment_fixed_size 2 +; GCN: ; LDSByteSize: 2 bytes/workgroup +; MESA: granulated_lds_size = 1 +define amdgpu_kernel void @test_lds_i16(i16 %val) { + store i16 %val, ptr addrspace(3) @lds.i16 + ret void +} + +; GCN-LABEL: test_lds_i32: +; GCN: .amdhsa_group_segment_fixed_size 4 +; GCN: ; LDSByteSize: 4 bytes/workgroup +; MESA: granulated_lds_size = 1 +define amdgpu_kernel void @test_lds_i32(i32 %val) { + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} + +; GCN-LABEL: 
test_lds_array_i8: +; GCN: .amdhsa_group_segment_fixed_size 327680 +; GCN: ; LDSByteSize: 327680 bytes/workgroup +; MESA: granulated_lds_size = 320 +define amdgpu_kernel void @test_lds_array_i8() { + %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5 + %val = load i8, ptr addrspace(3) %gep + store i8 %val, ptr addrspace(3) @lds.i8 + ret void +} + +; GCN-LABEL: test_lds_array_i16: +; GCN: .amdhsa_group_segment_fixed_size 327680 +; GCN: ; LDSByteSize: 327680 bytes/workgroup +; MESA: granulated_lds_size = 320 +define amdgpu_kernel void @test_lds_array_i16() { + %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10 + %val = load i16, ptr addrspace(3) %gep + store i16 %val, ptr addrspace(3) @lds.i16 + ret void +} + +; GCN-LABEL: test_lds_array_i32: +; GCN: .amdhsa_group_segment_fixed_size 327680 +; GCN: ; LDSByteSize: 327680 bytes/workgroup +; MESA: granulated_lds_size = 320 +define amdgpu_kernel void @test_lds_array_i32() { + %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20 + %val = load i32, ptr addrspace(3) %gep + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll new file mode 100644 index 0000000000000..bfa7d37ce63a7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-size-pal-gfx1250.ll @@ -0,0 +1,61 @@ +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 < %s | FileCheck -check-prefix=PAL %s + +; GFX1250 supports up to 320 KB configurable LDS memory. +; This test checks the min and max size of LDS that can be allocated. 
+ +; PAL: .shader_functions: +; PAL: test_lds_array_i16: +; PAL: .lds_size: 0x50000 +; PAL: test_lds_array_i32: +; PAL: .lds_size: 0x50000 +; PAL: test_lds_array_i8: +; PAL: .lds_size: 0x50000 +; PAL: test_lds_i16: +; PAL: .lds_size: 0x2 +; PAL: test_lds_i32: +; PAL: .lds_size: 0x4 +; PAL: test_lds_i8: +; PAL: .lds_size: 0x1 + +@lds.i8 = addrspace(3) global i8 undef +@lds.array.i8 = addrspace(3) global [327679 x i8] undef +@lds.i16 = addrspace(3) global i16 undef +@lds.array.i16 = addrspace(3) global [163839 x i16] undef +@lds.i32 = addrspace(3) global i32 undef +@lds.array.i32 = addrspace(3) global [81919 x i32] undef + +define amdgpu_gfx void @test_lds_i8(i8 %val) { + store i8 %val, ptr addrspace(3) @lds.i8 + ret void +} + +define amdgpu_gfx void @test_lds_i16(i16 %val) { + store i16 %val, ptr addrspace(3) @lds.i16 + ret void +} + +define amdgpu_gfx void @test_lds_i32(i32 %val) { + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +} + +define amdgpu_gfx void @test_lds_array_i8() { + %gep = getelementptr inbounds [327679 x i8], ptr addrspace(3) @lds.array.i8, i32 0, i32 5 + %val = load i8, ptr addrspace(3) %gep + store i8 %val, ptr addrspace(3) @lds.i8 + ret void +} + +define amdgpu_gfx void @test_lds_array_i16() { + %gep = getelementptr inbounds [163839 x i16], ptr addrspace(3) @lds.array.i16, i32 0, i32 10 + %val = load i16, ptr addrspace(3) %gep + store i16 %val, ptr addrspace(3) @lds.i16 + ret void +} + +define amdgpu_gfx void @test_lds_array_i32() { + %gep = getelementptr inbounds [81919 x i32], ptr addrspace(3) @lds.array.i32, i32 0, i32 20 + %val = load i32, ptr addrspace(3) %gep + store i32 %val, ptr addrspace(3) @lds.i32 + ret void +}