Skip to content

Commit 22a2d74

Browse files
authored
[NVPTX] Emit ld.v4.b16 for loading <4 x bfloat> (#109069)
This PR enables emitting a single load instruction for <4 x bfloat>; without it, two ld.b32 loads are generated.
1 parent f8eceb4 commit 22a2d74

File tree

2 files changed

+10
-0
lines changed

2 files changed

+10
-0
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6179,6 +6179,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
61796179
case MVT::v4i16:
61806180
case MVT::v4i32:
61816181
case MVT::v4f16:
6182+
case MVT::v4bf16:
61826183
case MVT::v4f32:
61836184
case MVT::v8f16: // <4 x f16x2>
61846185
case MVT::v8bf16: // <4 x bf16x2>

llvm/test/CodeGen/NVPTX/vector-loads.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -198,3 +198,12 @@ define void @extv8f16_generic_a4(ptr noalias readonly align 16 %dst, ptr noalias
198198

199199

200200
!1 = !{i32 0, i32 64}
201+
202+
; CHECK-LABEL: bf16_v4_align_load_store
203+
define dso_local void @bf16_v4_align_load_store(ptr noundef %0, ptr noundef %1) #0 {
204+
; CHECK: ld.v4.b16
205+
; CHECK: st.v4.b16
206+
%3 = load <4 x bfloat>, ptr %1, align 8
207+
store <4 x bfloat> %3, ptr %0, align 8
208+
ret void
209+
}

0 commit comments

Comments
 (0)