Skip to content

Commit ae57ca5

Browse files
committed
[X86] Prevent APX NDD compression when it creates a partial write
APX NDD instructions may be compressed when the result is also a source. For 8/16b instructions, this may create partial register write hazards if a previous super-register def is within the partial reg update clearance, or incorrect code if the super-register is not dead. This change prevents compression when the super-register is marked as an implicit define, which the virtual rewriter already adds in the case where a subregister is defined but the super-register is not dead. The BreakFalseDeps interface is also updated to add implicit super-register defs for NDD instructions that would incur partial-write stalls if compressed to legacy ops.
1 parent b3d280b commit ae57ca5

File tree

4 files changed

+246
-9
lines changed

4 files changed

+246
-9
lines changed

llvm/lib/Target/X86/X86CompressEVEX.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,23 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) {
237237
return 0;
238238
return I->NewOpc;
239239
};
240+
241+
// Redundant NDD ops cannot be safely compressed if either:
242+
// - the legacy op would introduce a partial write that BreakFalseDeps
243+
// identified as a potential stall, or
244+
// - the op is writing to a subregister of a live register, i.e. the
245+
// full (zeroed) result is used.
246+
// Both cases are indicated by an implicit def of the superregister.
247+
if (IsRedundantNDD) {
248+
Register Dst = MI.getOperand(0).getReg();
249+
if (Dst &&
250+
(X86::GR16RegClass.contains(Dst) || X86::GR8RegClass.contains(Dst))) {
251+
Register Super = getX86SubSuperRegister(Dst, ST.is64Bit() ? 64 : 32);
252+
if (MI.definesRegister(Super, /*TRI=*/nullptr))
253+
IsRedundantNDD = false;
254+
}
255+
}
256+
240257
// NonNF -> NF only if it's not a compressible NDD instruction and eflags is
241258
// dead.
242259
unsigned NewOpc = IsRedundantNDD

llvm/lib/Target/X86/X86InstrInfo.cpp

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6793,19 +6793,42 @@ static bool hasPartialRegUpdate(unsigned Opcode, const X86Subtarget &Subtarget,
67936793
unsigned X86InstrInfo::getPartialRegUpdateClearance(
67946794
const MachineInstr &MI, unsigned OpNum,
67956795
const TargetRegisterInfo *TRI) const {
6796-
if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
6796+
6797+
// With the NDD/ZU features, ISel may generate NDD/ZU ops which
6798+
// appear to perform partial writes. We detect these based on flags
6799+
// and register class.
6800+
bool HasNDDPartialWrite = false;
6801+
if (OpNum == 0 && (Subtarget.hasNDD() || Subtarget.hasZU()) &&
6802+
X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
6803+
Register Reg = MI.getOperand(0).getReg();
6804+
if (Reg.isVirtual()) {
6805+
auto &MRI = MI.getParent()->getParent()->getRegInfo();
6806+
if (auto *TRC = MRI.getRegClassOrNull(Reg))
6807+
HasNDDPartialWrite = (TRC->getID() == X86::GR16RegClassID ||
6808+
TRC->getID() == X86::GR8RegClassID);
6809+
} else
6810+
HasNDDPartialWrite =
6811+
X86::GR8RegClass.contains(Reg) || X86::GR16RegClass.contains(Reg);
6812+
}
6813+
6814+
if (OpNum != 0 ||
6815+
!(HasNDDPartialWrite || hasPartialRegUpdate(MI.getOpcode(), Subtarget)))
67976816
return 0;
67986817

6799-
// If MI is marked as reading Reg, the partial register update is wanted.
6818+
// For non-NDD ops, if MI is marked as reading Reg, the partial register
6819+
// update is wanted, hence we return 0.
6820+
// For NDD ops, if MI is marked as reading Reg, then it is possible to
6821+
// compress to a legacy form in CompressEVEX, which would create an
6822+
// unwanted partial update, so we return the clearance.
68006823
const MachineOperand &MO = MI.getOperand(0);
68016824
Register Reg = MO.getReg();
6802-
if (Reg.isVirtual()) {
6803-
if (MO.readsReg() || MI.readsVirtualRegister(Reg))
6804-
return 0;
6805-
} else {
6806-
if (MI.readsRegister(Reg, TRI))
6807-
return 0;
6808-
}
6825+
bool ReadsReg = false;
6826+
if (Reg.isVirtual())
6827+
ReadsReg = (MO.readsReg() || MI.readsVirtualRegister(Reg));
6828+
else
6829+
ReadsReg = MI.readsRegister(Reg, TRI);
6830+
if (ReadsReg != HasNDDPartialWrite)
6831+
return 0;
68096832

68106833
// If any instructions in the clearance range are reading Reg, insert a
68116834
// dependency breaking instruction, which is inexpensive and is likely to
@@ -7229,6 +7252,18 @@ void X86InstrInfo::breakPartialRegDependency(
72297252
.addReg(Reg, RegState::Undef)
72307253
.addReg(Reg, RegState::Undef);
72317254
MI.addRegisterKilled(Reg, TRI, true);
7255+
} else if ((X86::GR16RegClass.contains(Reg) ||
7256+
X86::GR8RegClass.contains(Reg)) &&
7257+
X86II::hasNewDataDest(MI.getDesc().TSFlags)) {
7258+
// This case is only expected for NDD ops which appear to be partial
7259+
// writes, but are not due to the zeroing of the upper part. Here
7260+
// we add an implicit def of the superegister, which prevents
7261+
// CompressEVEX from converting this to a legacy form.
7262+
Register SuperReg =
7263+
getX86SubSuperRegister(Reg, Subtarget.is64Bit() ? 64 : 32);
7264+
MachineInstrBuilder BuildMI(*MI.getParent()->getParent(), &MI);
7265+
if (!MI.definesRegister(SuperReg, /*TRI=*/nullptr))
7266+
BuildMI.addReg(SuperReg, RegState::ImplicitDefine);
72327267
}
72337268
}
72347269

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RCDEFAULT %s
3+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -start-before=break-false-deps -partial-reg-update-clearance=1 -show-mc-encoding -o - | FileCheck --check-prefixes=ASM,RC1 %s
4+
#
5+
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
6+
# if compressed to a legacy op. MIR has been modified to force different register assignments.
7+
#
8+
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
9+
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
10+
#
11+
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
12+
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
13+
#
14+
--- |
15+
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
16+
; RCDEFAULT-LABEL: partial_write:
17+
; RCDEFAULT: # %bb.0: # %entry
18+
; RCDEFAULT-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
19+
; RCDEFAULT-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
20+
; RCDEFAULT-NEXT: addw %cx, %ax, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xc8]
21+
; RCDEFAULT-NEXT: retq # encoding: [0xc3]
22+
;
23+
; RC1-LABEL: partial_write:
24+
; RC1: # %bb.0: # %entry
25+
; RC1-NEXT: addl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xf2]
26+
; RC1-NEXT: movl %eax, (%rdi) # encoding: [0x89,0x07]
27+
; RC1-NEXT: addw %cx, %ax # EVEX TO LEGACY Compression encoding: [0x66,0x01,0xc8]
28+
; RC1-NEXT: retq # encoding: [0xc3]
29+
entry:
30+
%add = add nsw i32 %b, %a
31+
store i32 %add, ptr %p, align 4, !tbaa !1
32+
%add1 = trunc i32 %add to i16
33+
%add2 = add i16 %add1, %x
34+
ret i16 %add2
35+
}
36+
37+
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
38+
; ASM-LABEL: no_partial_write:
39+
; ASM: # %bb.0: # %entry
40+
; ASM-NEXT: addl %esi, %edx # EVEX TO LEGACY Compression encoding: [0x01,0xf2]
41+
; ASM-NEXT: movl %edx, (%rdi) # encoding: [0x89,0x17]
42+
; ASM-NEXT: addw %cx, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x01,0xca]
43+
; ASM-NEXT: retq # encoding: [0xc3]
44+
entry:
45+
%add = add nsw i32 %b, %a
46+
store i32 %add, ptr %p, align 4, !tbaa !1
47+
%add1 = trunc i32 %add to i16
48+
%add2 = add i16 %add1, %x
49+
ret i16 %add2
50+
}
51+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
52+
53+
!llvm.module.flags = !{!0}
54+
55+
!0 = !{i32 1, !"wchar_size", i32 4}
56+
!1 = !{!2, !2, i64 0}
57+
!2 = !{!"int", !3, i64 0}
58+
!3 = !{!"omnipotent char", !4, i64 0}
59+
!4 = !{!"Simple C/C++ TBAA"}
60+
...
61+
---
62+
name: partial_write
63+
tracksRegLiveness: true
64+
noVRegs: true
65+
noPhis: true
66+
isSSA: false
67+
body: |
68+
bb.0.entry:
69+
liveins: $ecx, $edx, $esi, $rdi
70+
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
71+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
72+
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
73+
RET64 $ax
74+
...
75+
---
76+
name: no_partial_write
77+
tracksRegLiveness: true
78+
noVRegs: true
79+
noPhis: true
80+
isSSA: false
81+
body: |
82+
bb.0.entry:
83+
liveins: $ecx, $edx, $esi, $rdi
84+
85+
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
86+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
87+
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
88+
RET64 $ax
89+
...
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -o - | FileCheck --check-prefixes=MIR,RCDEFAULT %s
3+
# RUN: llc %s -mtriple=x86_64-unknown -mattr=+ndd,+egpr -run-pass=break-false-deps -partial-reg-update-clearance=1 -o - | FileCheck --check-prefixes=MIR,RC1 %s
4+
#
5+
# Check that BreakFalseDeps detects cases where an ND instruction would cause a partial register write
6+
# if compressed to a legacy op. MIR has been modified to force different register assignments.
7+
#
8+
# For partial_write, the ADD16rr_ND is compressible, but will become a partial write after compression.
9+
# Compression is inhibited if the eax definition is within the partial-reg-update-clearance threshold.
10+
#
11+
# For no_partial_write, the ADD16rr_ND is incompressible hence it cannot become a partial write.
12+
# This case checks that an implicit-def of eax is not added by breakPartialRegDependency.
13+
#
14+
--- |
15+
define signext i16 @partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
16+
entry:
17+
%add = add nsw i32 %b, %a
18+
store i32 %add, ptr %p, align 4, !tbaa !1
19+
%add1 = trunc i32 %add to i16
20+
%add2 = add i16 %add1, %x
21+
ret i16 %add2
22+
}
23+
24+
define signext i16 @no_partial_write(ptr %p, i32 %a, i32 %b, i16 signext %x, i16 signext %y) #0 {
25+
entry:
26+
%add = add nsw i32 %b, %a
27+
store i32 %add, ptr %p, align 4, !tbaa !1
28+
%add1 = trunc i32 %add to i16
29+
%add2 = add i16 %add1, %x
30+
ret i16 %add2
31+
}
32+
attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx16,+cx8,+egpr,+fxsr,+mmx,+ndd,+sse,+sse2,+x87" "tune-cpu"="generic" }
33+
34+
!llvm.module.flags = !{!0}
35+
36+
!0 = !{i32 1, !"wchar_size", i32 4}
37+
!1 = !{!2, !2, i64 0}
38+
!2 = !{!"int", !3, i64 0}
39+
!3 = !{!"omnipotent char", !4, i64 0}
40+
!4 = !{!"Simple C/C++ TBAA"}
41+
...
42+
---
43+
name: partial_write
44+
tracksRegLiveness: true
45+
noVRegs: true
46+
noPhis: true
47+
isSSA: false
48+
body: |
49+
bb.0.entry:
50+
liveins: $ecx, $edx, $esi, $rdi
51+
; RCDEFAULT-LABEL: name: partial_write
52+
; RCDEFAULT: liveins: $ecx, $edx, $esi, $rdi
53+
; RCDEFAULT-NEXT: {{ $}}
54+
; RCDEFAULT-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
55+
; RCDEFAULT-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
56+
; RCDEFAULT-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax, implicit-def $rax
57+
; RCDEFAULT-NEXT: RET64 $ax
58+
;
59+
; RC1-LABEL: name: partial_write
60+
; RC1: liveins: $ecx, $edx, $esi, $rdi
61+
; RC1-NEXT: {{ $}}
62+
; RC1-NEXT: renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
63+
; RC1-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
64+
; RC1-NEXT: renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
65+
; RC1-NEXT: RET64 $ax
66+
renamable $eax = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
67+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $eax :: (store (s32) into %ir.p, !tbaa !1)
68+
renamable $ax = ADD16rr_ND renamable $ax, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit $eax
69+
RET64 $ax
70+
...
71+
---
72+
name: no_partial_write
73+
tracksRegLiveness: true
74+
noVRegs: true
75+
noPhis: true
76+
isSSA: false
77+
body: |
78+
bb.0.entry:
79+
liveins: $ecx, $edx, $esi, $rdi
80+
81+
; MIR-LABEL: name: no_partial_write
82+
; MIR: liveins: $ecx, $edx, $esi, $rdi
83+
; MIR-NEXT: {{ $}}
84+
; MIR-NEXT: renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
85+
; MIR-NEXT: MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
86+
; MIR-NEXT: renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
87+
; MIR-NEXT: RET64 $ax
88+
renamable $edx = nsw ADD32rr_ND killed renamable $edx, killed renamable $esi, implicit-def dead $eflags
89+
MOV32mr killed renamable $rdi, 1, $noreg, 0, $noreg, renamable $edx :: (store (s32) into %ir.p, !tbaa !1)
90+
renamable $ax = ADD16rr_ND renamable $dx, renamable $cx, implicit-def dead $eflags, implicit killed $ecx, implicit killed $edx
91+
RET64 $ax
92+
...
93+
## NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
94+
# MIR: {{.*}}
95+
# RC1: {{.*}}
96+
# RCDEFAULT: {{.*}}

0 commit comments

Comments
 (0)