Skip to content

Commit 6e9448d

Browse files
committed
[X86] narrowBitOpRMW - use reachesChainWithoutSideEffects instead of direct chain matching
This will allow us to match RMW load/store chains through TokenFactor nodes if there are additional loads in the chain before the store
1 parent 8f7efa0 commit 6e9448d

File tree

2 files changed

+48
-155
lines changed

2 files changed

+48
-155
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -53351,21 +53351,11 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5335153351
SelectionDAG &DAG,
5335253352
const X86Subtarget &Subtarget) {
5335353353
using namespace SDPatternMatch;
53354-
53355-
// Only handle normal stores whose chain is a matching normal load.
53356-
auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
53357-
if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
53358-
!ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
53359-
Ld->getBasePtr() != St->getBasePtr() ||
53360-
Ld->getOffset() != St->getOffset())
53361-
return SDValue();
53362-
53363-
SDValue LoadVal(Ld, 0);
5336453354
SDValue StoredVal = St->getValue();
5336553355
EVT VT = StoredVal.getValueType();
5336653356

53367-
// Only narrow larger than legal scalar integers.
53368-
if (!VT.isScalarInteger() ||
53357+
// Only narrow normal stores of larger than legal scalar integers.
53358+
if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
5336953359
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
5337053360
return SDValue();
5337153361

@@ -53374,18 +53364,26 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5337453364
// BTC: X ^ (1 << ShAmt)
5337553365
//
5337653366
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
53377-
SDValue InsertBit, ShAmt;
53367+
SDValue SrcVal, InsertBit, ShAmt;
5337853368
if (!StoredVal.hasOneUse() ||
53379-
!(sd_match(StoredVal, m_And(m_Specific(LoadVal),
53369+
!(sd_match(StoredVal, m_And(m_Value(SrcVal),
5338053370
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
5338153371
sd_match(StoredVal,
53382-
m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53372+
m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
5338353373
sd_match(StoredVal,
53384-
m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53385-
sd_match(StoredVal,
53386-
m_Or(m_And(m_Specific(LoadVal),
53387-
m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
53388-
m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
53374+
m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
53375+
sd_match(
53376+
StoredVal,
53377+
m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
53378+
m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
53379+
return SDValue();
53380+
53381+
// SrcVal must be a matching normal load further up the chain.
53382+
auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
53383+
if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
53384+
Ld->getBasePtr() != St->getBasePtr() ||
53385+
Ld->getOffset() != St->getOffset() ||
53386+
!St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
5338953387
return SDValue();
5339053388

5339153389
// Ensure the shift amount is in bounds.
@@ -53419,7 +53417,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
5341953417
SDNodeFlags::NoUnsignedWrap);
5342053418

5342153419
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
53422-
SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
53420+
SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
5342353421
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
5342453422

5342553423
SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,

llvm/test/CodeGen/X86/bittest-big-integer.ll

Lines changed: 29 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefixes=X86
33
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,SSE
44
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE
5-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX
6-
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX
5+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
6+
; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
77

88
; bt/btc/btr/bts patterns + 'init' to set single bit value in large integers
99

@@ -1029,151 +1029,46 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {
10291029
define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
10301030
; X86-LABEL: reset_multiload_i128:
10311031
; X86: # %bb.0:
1032-
; X86-NEXT: pushl %ebp
1033-
; X86-NEXT: movl %esp, %ebp
10341032
; X86-NEXT: pushl %ebx
10351033
; X86-NEXT: pushl %edi
10361034
; X86-NEXT: pushl %esi
1037-
; X86-NEXT: andl $-16, %esp
1038-
; X86-NEXT: subl $64, %esp
1039-
; X86-NEXT: movl 12(%ebp), %ecx
1040-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1041-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1042-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1043-
; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
1044-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1045-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1046-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1047-
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
1048-
; X86-NEXT: movl %ecx, %eax
1049-
; X86-NEXT: shrb $3, %al
1050-
; X86-NEXT: andb $12, %al
1051-
; X86-NEXT: negb %al
1052-
; X86-NEXT: movsbl %al, %edi
1053-
; X86-NEXT: movl 36(%esp,%edi), %edx
1054-
; X86-NEXT: movl 40(%esp,%edi), %ebx
1055-
; X86-NEXT: movl %ebx, %esi
1056-
; X86-NEXT: shldl %cl, %edx, %esi
1057-
; X86-NEXT: movl 32(%esp,%edi), %eax
1058-
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1059-
; X86-NEXT: movl 44(%esp,%edi), %edi
1060-
; X86-NEXT: shldl %cl, %ebx, %edi
1061-
; X86-NEXT: movl %eax, %ebx
1062-
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
1063-
; X86-NEXT: shll %cl, %ebx
1064-
; X86-NEXT: notl %ebx
1065-
; X86-NEXT: movl 16(%ebp), %eax
1035+
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
1036+
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
1037+
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
10661038
; X86-NEXT: movl (%eax), %eax
1067-
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1068-
; X86-NEXT: movl 12(%ebp), %eax
1069-
; X86-NEXT: andl $96, %eax
1070-
; X86-NEXT: shrl $3, %eax
1071-
; X86-NEXT: movl 8(%ebp), %ecx
1072-
; X86-NEXT: movl (%ecx,%eax), %eax
1073-
; X86-NEXT: andl %ebx, (%ecx)
1074-
; X86-NEXT: movl 12(%ebp), %ecx
1075-
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
1076-
; X86-NEXT: shldl %cl, %ebx, %edx
1077-
; X86-NEXT: notl %edx
1078-
; X86-NEXT: movl 8(%ebp), %ebx
1079-
; X86-NEXT: andl %edx, 4(%ebx)
1080-
; X86-NEXT: notl %esi
1081-
; X86-NEXT: andl %esi, 8(%ebx)
1082-
; X86-NEXT: notl %edi
1083-
; X86-NEXT: andl %edi, 12(%ebx)
1084-
; X86-NEXT: btl %ecx, %eax
1085-
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
1039+
; X86-NEXT: movl %edx, %esi
1040+
; X86-NEXT: andl $96, %esi
1041+
; X86-NEXT: shrl $3, %esi
1042+
; X86-NEXT: movl (%ecx,%esi), %edi
1043+
; X86-NEXT: movl %edi, %ebx
1044+
; X86-NEXT: btrl %edx, %ebx
1045+
; X86-NEXT: btl %edx, %edi
1046+
; X86-NEXT: movl %ebx, (%ecx,%esi)
10861047
; X86-NEXT: jae .LBB22_2
10871048
; X86-NEXT: # %bb.1:
10881049
; X86-NEXT: xorl %eax, %eax
10891050
; X86-NEXT: .LBB22_2:
1090-
; X86-NEXT: leal -12(%ebp), %esp
10911051
; X86-NEXT: popl %esi
10921052
; X86-NEXT: popl %edi
10931053
; X86-NEXT: popl %ebx
1094-
; X86-NEXT: popl %ebp
10951054
; X86-NEXT: retl
10961055
;
1097-
; SSE-LABEL: reset_multiload_i128:
1098-
; SSE: # %bb.0:
1099-
; SSE-NEXT: movl %esi, %ecx
1100-
; SSE-NEXT: movl $1, %esi
1101-
; SSE-NEXT: xorl %r8d, %r8d
1102-
; SSE-NEXT: shldq %cl, %rsi, %r8
1103-
; SSE-NEXT: xorl %eax, %eax
1104-
; SSE-NEXT: shlq %cl, %rsi
1105-
; SSE-NEXT: testb $64, %cl
1106-
; SSE-NEXT: cmovneq %rsi, %r8
1107-
; SSE-NEXT: cmovneq %rax, %rsi
1108-
; SSE-NEXT: notq %r8
1109-
; SSE-NEXT: notq %rsi
1110-
; SSE-NEXT: movl %ecx, %r9d
1111-
; SSE-NEXT: andl $96, %r9d
1112-
; SSE-NEXT: shrl $3, %r9d
1113-
; SSE-NEXT: movl (%rdi,%r9), %r9d
1114-
; SSE-NEXT: btl %ecx, %r9d
1115-
; SSE-NEXT: jb .LBB22_2
1116-
; SSE-NEXT: # %bb.1:
1117-
; SSE-NEXT: movl (%rdx), %eax
1118-
; SSE-NEXT: .LBB22_2:
1119-
; SSE-NEXT: andq %r8, 8(%rdi)
1120-
; SSE-NEXT: andq %rsi, (%rdi)
1121-
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
1122-
; SSE-NEXT: retq
1123-
;
1124-
; AVX2-LABEL: reset_multiload_i128:
1125-
; AVX2: # %bb.0:
1126-
; AVX2-NEXT: movl %esi, %ecx
1127-
; AVX2-NEXT: xorl %eax, %eax
1128-
; AVX2-NEXT: movl $1, %r8d
1129-
; AVX2-NEXT: xorl %esi, %esi
1130-
; AVX2-NEXT: shldq %cl, %r8, %rsi
1131-
; AVX2-NEXT: shlxq %rcx, %r8, %r8
1132-
; AVX2-NEXT: testb $64, %cl
1133-
; AVX2-NEXT: cmovneq %r8, %rsi
1134-
; AVX2-NEXT: cmovneq %rax, %r8
1135-
; AVX2-NEXT: notq %rsi
1136-
; AVX2-NEXT: notq %r8
1137-
; AVX2-NEXT: movl %ecx, %r9d
1138-
; AVX2-NEXT: andl $96, %r9d
1139-
; AVX2-NEXT: shrl $3, %r9d
1140-
; AVX2-NEXT: movl (%rdi,%r9), %r9d
1141-
; AVX2-NEXT: btl %ecx, %r9d
1142-
; AVX2-NEXT: jb .LBB22_2
1143-
; AVX2-NEXT: # %bb.1:
1144-
; AVX2-NEXT: movl (%rdx), %eax
1145-
; AVX2-NEXT: .LBB22_2:
1146-
; AVX2-NEXT: andq %rsi, 8(%rdi)
1147-
; AVX2-NEXT: andq %r8, (%rdi)
1148-
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
1149-
; AVX2-NEXT: retq
1150-
;
1151-
; AVX512-LABEL: reset_multiload_i128:
1152-
; AVX512: # %bb.0:
1153-
; AVX512-NEXT: movl %esi, %ecx
1154-
; AVX512-NEXT: movl $1, %r8d
1155-
; AVX512-NEXT: xorl %esi, %esi
1156-
; AVX512-NEXT: shldq %cl, %r8, %rsi
1157-
; AVX512-NEXT: xorl %eax, %eax
1158-
; AVX512-NEXT: shlxq %rcx, %r8, %r8
1159-
; AVX512-NEXT: testb $64, %cl
1160-
; AVX512-NEXT: cmovneq %r8, %rsi
1161-
; AVX512-NEXT: cmovneq %rax, %r8
1162-
; AVX512-NEXT: notq %rsi
1163-
; AVX512-NEXT: notq %r8
1164-
; AVX512-NEXT: movl %ecx, %r9d
1165-
; AVX512-NEXT: andl $96, %r9d
1166-
; AVX512-NEXT: shrl $3, %r9d
1167-
; AVX512-NEXT: movl (%rdi,%r9), %r9d
1168-
; AVX512-NEXT: btl %ecx, %r9d
1169-
; AVX512-NEXT: jb .LBB22_2
1170-
; AVX512-NEXT: # %bb.1:
1171-
; AVX512-NEXT: movl (%rdx), %eax
1172-
; AVX512-NEXT: .LBB22_2:
1173-
; AVX512-NEXT: andq %rsi, 8(%rdi)
1174-
; AVX512-NEXT: andq %r8, (%rdi)
1175-
; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
1176-
; AVX512-NEXT: retq
1056+
; X64-LABEL: reset_multiload_i128:
1057+
; X64: # %bb.0:
1058+
; X64-NEXT: movl %esi, %ecx
1059+
; X64-NEXT: andl $96, %ecx
1060+
; X64-NEXT: shrl $3, %ecx
1061+
; X64-NEXT: movl (%rdi,%rcx), %r9d
1062+
; X64-NEXT: movl %r9d, %r8d
1063+
; X64-NEXT: btrl %esi, %r8d
1064+
; X64-NEXT: xorl %eax, %eax
1065+
; X64-NEXT: btl %esi, %r9d
1066+
; X64-NEXT: jb .LBB22_2
1067+
; X64-NEXT: # %bb.1:
1068+
; X64-NEXT: movl (%rdx), %eax
1069+
; X64-NEXT: .LBB22_2:
1070+
; X64-NEXT: movl %r8d, (%rdi,%rcx)
1071+
; X64-NEXT: retq
11771072
%rem = and i32 %position, 127
11781073
%ofs = zext nneg i32 %rem to i128
11791074
%bit = shl nuw i128 1, %ofs

0 commit comments

Comments
 (0)