Skip to content

Commit 2787983

Browse files
committed
fixup! [AMDGPU] Allow casts between the Global and Constant Addr Spaces in isValidAddrSpaceCast
Relax and reorganize isValidAddrSpaceCast further
1 parent aea773c commit 2787983

File tree

3 files changed

+204
-34
lines changed

3 files changed

+204
-34
lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -174,27 +174,45 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
174174
bool isAlwaysUniform(const Value *V) const;
175175

176176
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
177-
if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
178-
switch (FromAS) {
177+
// Address space casts must cast between different address spaces.
178+
if (FromAS == ToAS)
179+
return false;
180+
181+
if (FromAS == AMDGPUAS::FLAT_ADDRESS ||
182+
FromAS == AMDGPUAS::GLOBAL_ADDRESS ||
183+
FromAS == AMDGPUAS::CONSTANT_ADDRESS ||
184+
FromAS > AMDGPUAS::MAX_AMDGPU_ADDRESS) {
185+
// Casting any 64-bit AS to another 64-bit AS or to a 32-bit AS is
186+
// valid.
187+
switch (ToAS) {
188+
case AMDGPUAS::FLAT_ADDRESS:
179189
case AMDGPUAS::GLOBAL_ADDRESS:
180-
case AMDGPUAS::CONSTANT_ADDRESS:
181-
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
182190
case AMDGPUAS::LOCAL_ADDRESS:
191+
case AMDGPUAS::CONSTANT_ADDRESS:
183192
case AMDGPUAS::PRIVATE_ADDRESS:
193+
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
184194
return true;
185195
default:
186196
break;
187197
}
188-
return false;
198+
if (ToAS > AMDGPUAS::MAX_AMDGPU_ADDRESS)
199+
return true;
200+
} else if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
201+
FromAS == AMDGPUAS::PRIVATE_ADDRESS ||
202+
FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
203+
// Casting from a 32-bit AS to any 64-bit AS is valid.
204+
switch (ToAS) {
205+
case AMDGPUAS::FLAT_ADDRESS:
206+
case AMDGPUAS::GLOBAL_ADDRESS:
207+
case AMDGPUAS::CONSTANT_ADDRESS:
208+
return true;
209+
default:
210+
break;
211+
}
212+
if (ToAS > AMDGPUAS::MAX_AMDGPU_ADDRESS)
213+
return true;
189214
}
190-
if (FromAS != ToAS &&
191-
(FromAS == AMDGPUAS::GLOBAL_ADDRESS ||
192-
FromAS == AMDGPUAS::CONSTANT_ADDRESS ||
193-
FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
194-
(ToAS == AMDGPUAS::GLOBAL_ADDRESS ||
195-
ToAS == AMDGPUAS::CONSTANT_ADDRESS ||
196-
ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
197-
return true;
215+
// Everything else is not valid.
198216
return false;
199217
}
200218

llvm/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll

Lines changed: 167 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1336,8 +1336,8 @@ define amdgpu_kernel void @memmove_flat_align1_global_align1(ptr %dst, ptr addrs
13361336
; MAX1024-NEXT: ret void
13371337
;
13381338
; ALL-LABEL: @memmove_flat_align1_global_align1(
1339-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(1) [[SRC:%.*]] to ptr
1340-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1339+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(1)
1340+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP1]]
13411341
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
13421342
; ALL: memmove_bwd_loop:
13431343
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1404,8 +1404,8 @@ define amdgpu_kernel void @memmove_flat_align1_private_align1(ptr %dst, ptr addr
14041404
; MAX1024-NEXT: ret void
14051405
;
14061406
; ALL-LABEL: @memmove_flat_align1_private_align1(
1407-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[SRC:%.*]] to ptr
1408-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1407+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(5)
1408+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(5) [[SRC:%.*]], [[TMP1]]
14091409
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
14101410
; ALL: memmove_bwd_loop:
14111411
; ALL-NEXT: [[TMP2:%.*]] = phi i64 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 16, [[TMP0:%.*]] ]
@@ -1514,7 +1514,59 @@ define amdgpu_kernel void @memmove_global_align1_private_align1(ptr addrspace(1)
15141514

15151515
define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size) {
15161516
; OPT-LABEL: @memmove_global_align1_p999_align1(
1517-
; OPT-NEXT: call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) [[DST:%.*]], ptr addrspace(999) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1517+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1518+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1519+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1520+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1521+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1522+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(1) [[DST:%.*]] to ptr addrspace(999)
1523+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(999) [[SRC:%.*]], [[TMP4]]
1524+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1525+
; OPT: memmove_copy_backwards:
1526+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1527+
; OPT: memmove_bwd_residual_loop:
1528+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1529+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1530+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1531+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(999) [[TMP6]], align 1
1532+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1533+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(1) [[TMP7]], align 1
1534+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1535+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1536+
; OPT: memmove_bwd_middle:
1537+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1538+
; OPT: memmove_bwd_main_loop:
1539+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1540+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1541+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1542+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP10]], align 1
1543+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[BWD_MAIN_INDEX]]
1544+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(1) [[TMP11]], align 1
1545+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1546+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1547+
; OPT: memmove_copy_forward:
1548+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1549+
; OPT: memmove_fwd_main_loop:
1550+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1551+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1552+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(999) [[TMP13]], align 1
1553+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[DST]], i64 [[FWD_MAIN_INDEX]]
1554+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(1) [[TMP14]], align 1
1555+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1556+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1557+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1558+
; OPT: memmove_fwd_middle:
1559+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1560+
; OPT: memmove_fwd_residual_loop:
1561+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1562+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1563+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(999) [[TMP17]], align 1
1564+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1565+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(1) [[TMP18]], align 1
1566+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1567+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1568+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1569+
; OPT: memmove_done:
15181570
; OPT-NEXT: ret void
15191571
;
15201572
call void @llvm.memmove.p1.p999.i64(ptr addrspace(1) %dst, ptr addrspace(999) %src, i64 %size, i1 false)
@@ -1523,7 +1575,59 @@ define amdgpu_kernel void @memmove_global_align1_p999_align1(ptr addrspace(1) %d
15231575

15241576
define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size) {
15251577
; OPT-LABEL: @memmove_p999_align1_p1_align1(
1526-
; OPT-NEXT: call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(1) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1578+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1579+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1580+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1581+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1582+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1583+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(1)
1584+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(1) [[SRC:%.*]], [[TMP4]]
1585+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1586+
; OPT: memmove_copy_backwards:
1587+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1588+
; OPT: memmove_bwd_residual_loop:
1589+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1590+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1591+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1592+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(1) [[TMP6]], align 1
1593+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1594+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
1595+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1596+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1597+
; OPT: memmove_bwd_middle:
1598+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1599+
; OPT: memmove_bwd_main_loop:
1600+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1601+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1602+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1603+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP10]], align 1
1604+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
1605+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
1606+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1607+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1608+
; OPT: memmove_copy_forward:
1609+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1610+
; OPT: memmove_fwd_main_loop:
1611+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1612+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(1) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1613+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(1) [[TMP13]], align 1
1614+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
1615+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
1616+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1617+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1618+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1619+
; OPT: memmove_fwd_middle:
1620+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1621+
; OPT: memmove_fwd_residual_loop:
1622+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1623+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1624+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(1) [[TMP17]], align 1
1625+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1626+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
1627+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1628+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1629+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1630+
; OPT: memmove_done:
15271631
; OPT-NEXT: ret void
15281632
;
15291633
call void @llvm.memmove.p999.p1.i64(ptr addrspace(999) %dst, ptr addrspace(1) %src, i64 %size, i1 false)
@@ -1532,7 +1636,59 @@ define amdgpu_kernel void @memmove_p999_align1_p1_align1(ptr addrspace(999) %dst
15321636

15331637
define amdgpu_kernel void @memmove_p999_align1_p998_align1(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size) {
15341638
; OPT-LABEL: @memmove_p999_align1_p998_align1(
1535-
; OPT-NEXT: call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) [[DST:%.*]], ptr addrspace(998) [[SRC:%.*]], i64 [[SIZE:%.*]], i1 false)
1639+
; OPT-NEXT: [[TMP1:%.*]] = lshr i64 [[SIZE:%.*]], 4
1640+
; OPT-NEXT: [[TMP2:%.*]] = and i64 [[SIZE]], 15
1641+
; OPT-NEXT: [[TMP3:%.*]] = sub i64 [[SIZE]], [[TMP2]]
1642+
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i64 [[TMP2]], 0
1643+
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i64 [[TMP1]], 0
1644+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(999) [[DST:%.*]] to ptr addrspace(998)
1645+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(998) [[SRC:%.*]], [[TMP4]]
1646+
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
1647+
; OPT: memmove_copy_backwards:
1648+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]
1649+
; OPT: memmove_bwd_residual_loop:
1650+
; OPT-NEXT: [[TMP5:%.*]] = phi i64 [ [[BWD_RESIDUAL_INDEX:%.*]], [[MEMMOVE_BWD_RESIDUAL_LOOP]] ], [ [[SIZE]], [[MEMMOVE_COPY_BACKWARDS]] ]
1651+
; OPT-NEXT: [[BWD_RESIDUAL_INDEX]] = sub i64 [[TMP5]], 1
1652+
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[BWD_RESIDUAL_INDEX]]
1653+
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, ptr addrspace(998) [[TMP6]], align 1
1654+
; OPT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[BWD_RESIDUAL_INDEX]]
1655+
; OPT-NEXT: store i8 [[ELEMENT]], ptr addrspace(999) [[TMP7]], align 1
1656+
; OPT-NEXT: [[TMP8:%.*]] = icmp eq i64 [[BWD_RESIDUAL_INDEX]], [[TMP3]]
1657+
; OPT-NEXT: br i1 [[TMP8]], label [[MEMMOVE_BWD_MIDDLE]], label [[MEMMOVE_BWD_RESIDUAL_LOOP]]
1658+
; OPT: memmove_bwd_middle:
1659+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_DONE:%.*]], label [[MEMMOVE_BWD_MAIN_LOOP:%.*]]
1660+
; OPT: memmove_bwd_main_loop:
1661+
; OPT-NEXT: [[TMP9:%.*]] = phi i64 [ [[BWD_MAIN_INDEX:%.*]], [[MEMMOVE_BWD_MAIN_LOOP]] ], [ [[TMP1]], [[MEMMOVE_BWD_MIDDLE]] ]
1662+
; OPT-NEXT: [[BWD_MAIN_INDEX]] = sub i64 [[TMP9]], 1
1663+
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[BWD_MAIN_INDEX]]
1664+
; OPT-NEXT: [[ELEMENT1:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP10]], align 1
1665+
; OPT-NEXT: [[TMP11:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[BWD_MAIN_INDEX]]
1666+
; OPT-NEXT: store <4 x i32> [[ELEMENT1]], ptr addrspace(999) [[TMP11]], align 1
1667+
; OPT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[BWD_MAIN_INDEX]], 0
1668+
; OPT-NEXT: br i1 [[TMP12]], label [[MEMMOVE_DONE]], label [[MEMMOVE_BWD_MAIN_LOOP]]
1669+
; OPT: memmove_copy_forward:
1670+
; OPT-NEXT: br i1 [[SKIP_MAIN]], label [[MEMMOVE_FWD_MIDDLE:%.*]], label [[MEMMOVE_FWD_MAIN_LOOP:%.*]]
1671+
; OPT: memmove_fwd_main_loop:
1672+
; OPT-NEXT: [[FWD_MAIN_INDEX:%.*]] = phi i64 [ [[TMP15:%.*]], [[MEMMOVE_FWD_MAIN_LOOP]] ], [ 0, [[MEMMOVE_COPY_FORWARD]] ]
1673+
; OPT-NEXT: [[TMP13:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(998) [[SRC]], i64 [[FWD_MAIN_INDEX]]
1674+
; OPT-NEXT: [[ELEMENT2:%.*]] = load <4 x i32>, ptr addrspace(998) [[TMP13]], align 1
1675+
; OPT-NEXT: [[TMP14:%.*]] = getelementptr inbounds <4 x i32>, ptr addrspace(999) [[DST]], i64 [[FWD_MAIN_INDEX]]
1676+
; OPT-NEXT: store <4 x i32> [[ELEMENT2]], ptr addrspace(999) [[TMP14]], align 1
1677+
; OPT-NEXT: [[TMP15]] = add i64 [[FWD_MAIN_INDEX]], 1
1678+
; OPT-NEXT: [[TMP16:%.*]] = icmp eq i64 [[TMP15]], [[TMP1]]
1679+
; OPT-NEXT: br i1 [[TMP16]], label [[MEMMOVE_FWD_MIDDLE]], label [[MEMMOVE_FWD_MAIN_LOOP]]
1680+
; OPT: memmove_fwd_middle:
1681+
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP:%.*]]
1682+
; OPT: memmove_fwd_residual_loop:
1683+
; OPT-NEXT: [[FWD_RESIDUAL_INDEX:%.*]] = phi i64 [ [[TMP19:%.*]], [[MEMMOVE_FWD_RESIDUAL_LOOP]] ], [ [[TMP3]], [[MEMMOVE_FWD_MIDDLE]] ]
1684+
; OPT-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(998) [[SRC]], i64 [[FWD_RESIDUAL_INDEX]]
1685+
; OPT-NEXT: [[ELEMENT3:%.*]] = load i8, ptr addrspace(998) [[TMP17]], align 1
1686+
; OPT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(999) [[DST]], i64 [[FWD_RESIDUAL_INDEX]]
1687+
; OPT-NEXT: store i8 [[ELEMENT3]], ptr addrspace(999) [[TMP18]], align 1
1688+
; OPT-NEXT: [[TMP19]] = add i64 [[FWD_RESIDUAL_INDEX]], 1
1689+
; OPT-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], [[SIZE]]
1690+
; OPT-NEXT: br i1 [[TMP20]], label [[MEMMOVE_DONE]], label [[MEMMOVE_FWD_RESIDUAL_LOOP]]
1691+
; OPT: memmove_done:
15361692
; OPT-NEXT: ret void
15371693
;
15381694
call void @llvm.memmove.p999.p998.i64(ptr addrspace(999) %dst, ptr addrspace(998) %src, i64 %size, i1 false)
@@ -1726,8 +1882,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1(ptr addrspace(0) %ds
17261882
; MAX1024-NEXT: ret void
17271883
;
17281884
; ALL-LABEL: @memmove_flat_align1_local_align1(
1729-
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
1730-
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP1]], [[DST:%.*]]
1885+
; ALL-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
1886+
; ALL-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP1]]
17311887
; ALL-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_BWD_LOOP:%.*]], label [[MEMMOVE_FWD_LOOP:%.*]]
17321888
; ALL: memmove_bwd_loop:
17331889
; ALL-NEXT: [[TMP2:%.*]] = phi i32 [ [[BWD_INDEX:%.*]], [[MEMMOVE_BWD_LOOP]] ], [ 32, [[TMP0:%.*]] ]
@@ -1761,8 +1917,8 @@ define amdgpu_kernel void @memmove_flat_align1_local_align1_unknown_size(ptr add
17611917
; OPT-NEXT: [[TMP3:%.*]] = sub i32 [[SIZE]], [[TMP2]]
17621918
; OPT-NEXT: [[SKIP_RESIDUAL:%.*]] = icmp eq i32 [[TMP2]], 0
17631919
; OPT-NEXT: [[SKIP_MAIN:%.*]] = icmp eq i32 [[TMP1]], 0
1764-
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(3) [[SRC:%.*]] to ptr
1765-
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr [[TMP4]], [[DST:%.*]]
1920+
; OPT-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[DST:%.*]] to ptr addrspace(3)
1921+
; OPT-NEXT: [[COMPARE_SRC_DST:%.*]] = icmp ult ptr addrspace(3) [[SRC:%.*]], [[TMP4]]
17661922
; OPT-NEXT: br i1 [[COMPARE_SRC_DST]], label [[MEMMOVE_COPY_BACKWARDS:%.*]], label [[MEMMOVE_COPY_FORWARD:%.*]]
17671923
; OPT: memmove_copy_backwards:
17681924
; OPT-NEXT: br i1 [[SKIP_RESIDUAL]], label [[MEMMOVE_BWD_MIDDLE:%.*]], label [[MEMMOVE_BWD_RESIDUAL_LOOP:%.*]]

0 commit comments

Comments
 (0)