diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 13de93e829fab..412692bd53229 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3267,9 +3267,14 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const { MIB.add(MI.getOperand(1)); // rsrc MIB.add(MI.getOperand(5 + OpOffset)); // soffset MIB.add(MI.getOperand(6 + OpOffset)); // imm offset + bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI); unsigned Aux = MI.getOperand(7 + OpOffset).getImm(); - MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol - MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz + MIB.addImm(Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL + : AMDGPU::CPol::ALL_pregfx12)); // cpol + MIB.addImm( + Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) + ? 1 + : 0); // swz MachineMemOperand *LoadMMO = *MI.memoperands_begin(); MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 428a19c391374..344028c4b4868 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -160,7 +160,7 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo, O << " dlc"; if ((Imm & CPol::SCC) && AMDGPU::isGFX90A(STI)) O << (AMDGPU::isGFX940(STI) ? " sc1" : " scc"); - if (Imm & ~CPol::ALL) + if (Imm & ~CPol::ALL_pregfx12) O << " /* unexpected cache policy bit */"; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index b186dafb4c0de..748ee872c8e1f 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9840,11 +9840,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Ops.push_back(Rsrc); Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset + bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget); unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); - Ops.push_back( - DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol Ops.push_back(DAG.getTargetConstant( - Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz + Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12), + DL, MVT::i8)); // cpol + Ops.push_back(DAG.getTargetConstant( + Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12) + ? 1 + : 0, + DL, MVT::i8)); // swz Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index 0c6bba2426947..b42ba7d75094a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -1312,8 +1312,8 @@ main_body: ret void } -define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { -; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc) { +; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12: ; PREGFX10: ; %bb.0: ; %main_body ; PREGFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; PREGFX10-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 @@ -1327,7 +1327,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i ; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm ; PREGFX10-NEXT: s_endpgm ; -; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12: ; GFX10: ; %bb.0: ; %main_body ; GFX10-NEXT: s_clause 0x5 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 @@ -1342,7 +1342,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i ; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12: ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 offset:4 @@ -1357,7 +1357,7 @@ define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> i ; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done ; GFX11-NEXT: s_endpgm ; -; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged_pregfx12: ; GFX12: ; %bb.0: ; %main_body ; GFX12-NEXT: s_clause 0x1 ; GFX12-NEXT: buffer_load_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE @@ -1379,6 +1379,65 @@ main_body: ret void } +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { +; PREGFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; PREGFX10: ; %bb.0: ; %main_body +; PREGFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; PREGFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; PREGFX10-NEXT: s_waitcnt vmcnt(1) +; PREGFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; PREGFX10-NEXT: s_waitcnt vmcnt(0) +; PREGFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm +; PREGFX10-NEXT: s_endpgm +; +; GFX10-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; GFX10-NEXT: buffer_load_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; GFX10-NEXT: s_waitcnt vmcnt(1) +; GFX10-NEXT: exp mrt0 v0, v1, v2, v3 done vm +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: exp mrt0 v4, v5, v0, v0 done vm +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: buffer_load_b128 v[0:3], off, s[0:3], 0 offset:4 +; GFX11-NEXT: buffer_load_b64 v[4:5], off, s[0:3], 0 offset:28 +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: exp mrt0 v0, v1, v2, v3 done +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: exp mrt0 v4, v5, v0, v0 done +; GFX11-NEXT: s_endpgm +; +; GFX12-LABEL: raw_buffer_load_x1_offset_swizzled_not_merged: +; GFX12: ; %bb.0: ; %main_body +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: buffer_load_b32 v0, off, s[0:3], null offset:4 +; GFX12-NEXT: buffer_load_b32 v1, off, s[0:3], null offset:8 +; GFX12-NEXT: buffer_load_b32 v2, off, s[0:3], null offset:12 +; GFX12-NEXT: buffer_load_b32 v3, off, s[0:3], null offset:16 +; GFX12-NEXT: buffer_load_b32 v4, off, s[0:3], null offset:28 +; GFX12-NEXT: buffer_load_b32 v5, off, s[0:3], null offset:32 +; GFX12-NEXT: s_wait_loadcnt 0x2 +; GFX12-NEXT: export mrt0 v0, v1, v2, v3 done +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: export mrt0 v4, v5, v0, v0 done +; GFX12-NEXT: s_endpgm +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 64) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 64) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 64) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 64) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 64) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 64) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index fd6e354b274a4..3493de1497d11 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefixes=GFX68,VERDE %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefixes=GFX68,GFX8 %s ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck -check-prefixes=GFX11 %s +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck -check-prefixes=GFX12 %s define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { ; GFX68-LABEL: buffer_store: @@ -497,8 +498,8 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, ret void } -define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { -; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12: ; GFX68: ; %bb.0: ; GFX68-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; GFX68-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 @@ -508,7 +509,7 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> ; GFX68-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 ; GFX68-NEXT: s_endpgm ; -; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x5 ; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 offset:4 @@ -527,6 +528,26 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> ret void } +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x5 +; GFX12-NEXT: buffer_store_b32 v0, off, s[0:3], null offset:4 +; GFX12-NEXT: buffer_store_b32 v1, off, s[0:3], null offset:8 +; GFX12-NEXT: buffer_store_b32 v2, off, s[0:3], null offset:12 +; GFX12-NEXT: buffer_store_b32 v3, off, s[0:3], null offset:16 +; GFX12-NEXT: buffer_store_b32 v4, off, s[0:3], null offset:28 +; GFX12-NEXT: buffer_store_b32 v5, off, s[0:3], null offset:32 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 64) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 64) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 64) + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0