-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[NVPTX] Stop using 16-bit CAS instructions from PTX #119349
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-nvptx Author: Akshay Deodhar (akshayrdeodhar) Changes: Increases the minimum CAS size from 16 bits to 32 bits, as 16-bit CAS is not natively supported - atom.cas.b16 is emulated. Full diff: https://github.com/llvm/llvm-project/pull/119349.diff 5 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ce94dded815b8f..2c3ceba03da2a5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -912,7 +912,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
// actions
computeRegisterProperties(STI.getRegisterInfo());
- setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
+ // PTX support for 16-bit CAS is emulated. Only use 32+
+ setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
setMaxAtomicSizeInBitsSupported(64);
setMaxDivRemBitWidthSupported(64);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e785bbf830da62..04bfcc6ded4aa1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -123,6 +123,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
// set of equivalent memory operations with a scalar data-type, executed in
// an unspecified order on the elements in the vector.
unsigned getMaxRequiredAlignment() const { return 8; }
+ // Get the smallest cmpxchg word size that the hardware supports.
+ unsigned getMinCmpXchgSizeInBits() const { return 32; }
unsigned getPTXVersion() const { return PTXVersion; }
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index 0c1ca8cb7ac166..a5b81dfc0cd009 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -132,10 +132,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
; CHECKPTX62-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX62-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX62-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
- %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+ %r1 = atomicrmw fadd ptr %dp0, half %val monotonic
+ %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
+ %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
+ %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 8bae18dcc5eef8..16e7baced67838 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -46,65 +46,105 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
; CHECKPTX71-LABEL: test(
; CHECKPTX71: {
; CHECKPTX71-NEXT: .reg .pred %p<5>;
-; CHECKPTX71-NEXT: .reg .b16 %rs<34>;
-; CHECKPTX71-NEXT: .reg .b32 %r<4>;
+; CHECKPTX71-NEXT: .reg .b16 %rs<18>;
+; CHECKPTX71-NEXT: .reg .b32 %r<58>;
; CHECKPTX71-NEXT: .reg .f32 %f<12>;
; CHECKPTX71-EMPTY:
; CHECKPTX71-NEXT: // %bb.0:
-; CHECKPTX71-NEXT: ld.param.b16 %rs13, [test_param_3];
-; CHECKPTX71-NEXT: ld.param.u32 %r3, [test_param_2];
-; CHECKPTX71-NEXT: ld.param.u32 %r2, [test_param_1];
-; CHECKPTX71-NEXT: ld.param.u32 %r1, [test_param_0];
-; CHECKPTX71-NEXT: ld.b16 %rs30, [%r1];
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs13;
-; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start14
+; CHECKPTX71-NEXT: ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX71-NEXT: ld.param.u32 %r23, [test_param_2];
+; CHECKPTX71-NEXT: ld.param.u32 %r22, [test_param_1];
+; CHECKPTX71-NEXT: ld.param.u32 %r24, [test_param_0];
+; CHECKPTX71-NEXT: and.b32 %r1, %r24, -4;
+; CHECKPTX71-NEXT: and.b32 %r25, %r24, 3;
+; CHECKPTX71-NEXT: shl.b32 %r2, %r25, 3;
+; CHECKPTX71-NEXT: mov.b32 %r26, 65535;
+; CHECKPTX71-NEXT: shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT: not.b32 %r3, %r27;
+; CHECKPTX71-NEXT: ld.u32 %r54, [%r1];
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs1;
+; CHECKPTX71-NEXT: $L__BB0_1: // %atomicrmw.start45
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f2, %rs30;
-; CHECKPTX71-NEXT: add.rn.f32 %f3, %f2, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs14, %f3;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
-; CHECKPTX71-NEXT: setp.ne.s16 %p1, %rs17, %rs30;
-; CHECKPTX71-NEXT: mov.u16 %rs30, %rs17;
+; CHECKPTX71-NEXT: shr.u32 %r28, %r54, %r2;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f1, %rs2;
+; CHECKPTX71-NEXT: add.rn.f32 %f3, %f1, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs4, %f3;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r29, %rs4;
+; CHECKPTX71-NEXT: shl.b32 %r30, %r29, %r2;
+; CHECKPTX71-NEXT: and.b32 %r31, %r54, %r3;
+; CHECKPTX71-NEXT: or.b32 %r32, %r31, %r30;
+; CHECKPTX71-NEXT: atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT: setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX71-NEXT: mov.u32 %r54, %r6;
; CHECKPTX71-NEXT: @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end13
-; CHECKPTX71-NEXT: ld.b16 %rs31, [%r1];
-; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start8
+; CHECKPTX71-NEXT: // %bb.2: // %atomicrmw.end44
+; CHECKPTX71-NEXT: ld.u32 %r55, [%r1];
+; CHECKPTX71-NEXT: $L__BB0_3: // %atomicrmw.start27
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs31;
+; CHECKPTX71-NEXT: shr.u32 %r33, %r55, %r2;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs6, %r33;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f4, %rs6;
; CHECKPTX71-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs18, %f5;
-; CHECKPTX71-NEXT: atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
-; CHECKPTX71-NEXT: setp.ne.s16 %p2, %rs21, %rs31;
-; CHECKPTX71-NEXT: mov.u16 %rs31, %rs21;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs8, %f5;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r34, %rs8;
+; CHECKPTX71-NEXT: shl.b32 %r35, %r34, %r2;
+; CHECKPTX71-NEXT: and.b32 %r36, %r55, %r3;
+; CHECKPTX71-NEXT: or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT: atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT: setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX71-NEXT: mov.u32 %r55, %r9;
; CHECKPTX71-NEXT: @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end7
-; CHECKPTX71-NEXT: ld.global.b16 %rs32, [%r2];
-; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start2
+; CHECKPTX71-NEXT: // %bb.4: // %atomicrmw.end26
+; CHECKPTX71-NEXT: and.b32 %r10, %r22, -4;
+; CHECKPTX71-NEXT: shl.b32 %r38, %r22, 3;
+; CHECKPTX71-NEXT: and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT: shl.b32 %r40, %r26, %r11;
+; CHECKPTX71-NEXT: not.b32 %r12, %r40;
+; CHECKPTX71-NEXT: ld.global.u32 %r56, [%r10];
+; CHECKPTX71-NEXT: $L__BB0_5: // %atomicrmw.start9
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f7, %rs32;
-; CHECKPTX71-NEXT: add.rn.f32 %f8, %f7, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs22, %f8;
-; CHECKPTX71-NEXT: atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
-; CHECKPTX71-NEXT: setp.ne.s16 %p3, %rs25, %rs32;
-; CHECKPTX71-NEXT: mov.u16 %rs32, %rs25;
+; CHECKPTX71-NEXT: shr.u32 %r41, %r56, %r11;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs10, %r41;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f6, %rs10;
+; CHECKPTX71-NEXT: add.rn.f32 %f8, %f6, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs12, %f8;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r42, %rs12;
+; CHECKPTX71-NEXT: shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT: and.b32 %r44, %r56, %r12;
+; CHECKPTX71-NEXT: or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT: atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT: setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX71-NEXT: mov.u32 %r56, %r15;
; CHECKPTX71-NEXT: @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end1
-; CHECKPTX71-NEXT: ld.shared.b16 %rs33, [%r3];
+; CHECKPTX71-NEXT: // %bb.6: // %atomicrmw.end8
+; CHECKPTX71-NEXT: and.b32 %r16, %r23, -4;
+; CHECKPTX71-NEXT: shl.b32 %r46, %r23, 3;
+; CHECKPTX71-NEXT: and.b32 %r17, %r46, 24;
+; CHECKPTX71-NEXT: shl.b32 %r48, %r26, %r17;
+; CHECKPTX71-NEXT: not.b32 %r18, %r48;
+; CHECKPTX71-NEXT: ld.shared.u32 %r57, [%r16];
; CHECKPTX71-NEXT: $L__BB0_7: // %atomicrmw.start
; CHECKPTX71-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT: cvt.f32.bf16 %f10, %rs33;
-; CHECKPTX71-NEXT: add.rn.f32 %f11, %f10, %f1;
-; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs26, %f11;
-; CHECKPTX71-NEXT: atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
-; CHECKPTX71-NEXT: setp.ne.s16 %p4, %rs29, %rs33;
-; CHECKPTX71-NEXT: mov.u16 %rs33, %rs29;
+; CHECKPTX71-NEXT: shr.u32 %r49, %r57, %r17;
+; CHECKPTX71-NEXT: cvt.u16.u32 %rs14, %r49;
+; CHECKPTX71-NEXT: cvt.f32.bf16 %f9, %rs14;
+; CHECKPTX71-NEXT: add.rn.f32 %f11, %f9, %f2;
+; CHECKPTX71-NEXT: cvt.rn.bf16.f32 %rs16, %f11;
+; CHECKPTX71-NEXT: cvt.u32.u16 %r50, %rs16;
+; CHECKPTX71-NEXT: shl.b32 %r51, %r50, %r17;
+; CHECKPTX71-NEXT: and.b32 %r52, %r57, %r18;
+; CHECKPTX71-NEXT: or.b32 %r53, %r52, %r51;
+; CHECKPTX71-NEXT: atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT: setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX71-NEXT: mov.u32 %r57, %r21;
; CHECKPTX71-NEXT: @%p4 bra $L__BB0_7;
; CHECKPTX71-NEXT: // %bb.8: // %atomicrmw.end
; CHECKPTX71-NEXT: ret;
- %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
- %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
- %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
- %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
+ %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
+ %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
+ %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
+ %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
ret void
}
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index f7cc32b962b9c8..dd4bd078ee8ccf 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
; SM70-LABEL: relaxed_sys_i8(
; SM70: {
; SM70-NEXT: .reg .pred %p<3>;
-; SM70-NEXT: .reg .b16 %rs<17>;
-; SM70-NEXT: .reg .b32 %r<3>;
-; SM70-NEXT: .reg .b64 %rd<5>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<21>;
+; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT: ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
-; SM70-NEXT: and.b64 %rd1, %rd2, -2;
-; SM70-NEXT: ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
-; SM70-NEXT: and.b64 %rd3, %rd2, 1;
-; SM70-NEXT: shl.b64 %rd4, %rd3, 3;
-; SM70-NEXT: cvt.u32.u64 %r1, %rd4;
-; SM70-NEXT: mov.u16 %rs11, 255;
-; SM70-NEXT: shl.b16 %rs12, %rs11, %r1;
-; SM70-NEXT: not.b16 %rs2, %rs12;
-; SM70-NEXT: shl.b16 %rs3, %rs9, %r1;
-; SM70-NEXT: shl.b16 %rs4, %rs10, %r1;
-; SM70-NEXT: ld.u16 %rs13, [%rd1];
-; SM70-NEXT: and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT: and.b32 %r10, %r9, 3;
+; SM70-NEXT: shl.b32 %r1, %r10, 3;
+; SM70-NEXT: mov.b32 %r11, 255;
+; SM70-NEXT: shl.b32 %r12, %r11, %r1;
+; SM70-NEXT: not.b32 %r2, %r12;
+; SM70-NEXT: cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT: and.b32 %r14, %r13, 255;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT: shl.b32 %r4, %r15, %r1;
+; SM70-NEXT: ld.u32 %r16, [%rd1];
+; SM70-NEXT: and.b32 %r20, %r16, %r2;
; SM70-NEXT: $L__BB0_1: // %partword.cmpxchg.loop
; SM70-NEXT: // =>This Inner Loop Header: Depth=1
-; SM70-NEXT: or.b16 %rs14, %rs16, %rs3;
-; SM70-NEXT: or.b16 %rs15, %rs16, %rs4;
-; SM70-NEXT: atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
-; SM70-NEXT: setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT: or.b32 %r17, %r20, %r3;
+; SM70-NEXT: or.b32 %r18, %r20, %r4;
+; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r18;
; SM70-NEXT: @%p1 bra $L__BB0_3;
; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
; SM70-NEXT: // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT: and.b16 %rs8, %rs7, %rs2;
-; SM70-NEXT: setp.ne.s16 %p2, %rs16, %rs8;
-; SM70-NEXT: mov.u16 %rs16, %rs8;
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT: mov.u32 %r20, %r8;
; SM70-NEXT: @%p2 bra $L__BB0_1;
; SM70-NEXT: $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT: cvt.u32.u16 %r2, %rs9;
-; SM70-NEXT: st.param.b32 [func_retval0], %r2;
+; SM70-NEXT: st.param.b32 [func_retval0], %r13;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
ret i8 %new
}
@@ -137,19 +138,44 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
;
; SM70-LABEL: relaxed_sys_i16(
; SM70: {
-; SM70-NEXT: .reg .b16 %rs<4>;
-; SM70-NEXT: .reg .b32 %r<2>;
-; SM70-NEXT: .reg .b64 %rd<2>;
+; SM70-NEXT: .reg .pred %p<3>;
+; SM70-NEXT: .reg .b16 %rs<2>;
+; SM70-NEXT: .reg .b32 %r<20>;
+; SM70-NEXT: .reg .b64 %rd<3>;
; SM70-EMPTY:
; SM70-NEXT: // %bb.0:
-; SM70-NEXT: ld.param.u64 %rd1, [relaxed_sys_i16_param_0];
-; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_1];
-; SM70-NEXT: ld.param.u16 %rs2, [relaxed_sys_i16_param_2];
-; SM70-NEXT: atom.cas.b16 %rs3, [%rd1], %rs1, %rs2;
-; SM70-NEXT: cvt.u32.u16 %r1, %rs2;
-; SM70-NEXT: st.param.b32 [func_retval0], %r1;
+; SM70-NEXT: ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT: ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT: and.b64 %rd1, %rd2, -4;
+; SM70-NEXT: ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT: cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT: and.b32 %r11, %r10, 3;
+; SM70-NEXT: shl.b32 %r1, %r11, 3;
+; SM70-NEXT: mov.b32 %r12, 65535;
+; SM70-NEXT: shl.b32 %r13, %r12, %r1;
+; SM70-NEXT: not.b32 %r2, %r13;
+; SM70-NEXT: cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT: shl.b32 %r3, %r14, %r1;
+; SM70-NEXT: shl.b32 %r4, %r9, %r1;
+; SM70-NEXT: ld.u32 %r15, [%rd1];
+; SM70-NEXT: and.b32 %r19, %r15, %r2;
+; SM70-NEXT: $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT: // =>This Inner Loop Header: Depth=1
+; SM70-NEXT: or.b32 %r16, %r19, %r3;
+; SM70-NEXT: or.b32 %r17, %r19, %r4;
+; SM70-NEXT: atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT: setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT: @%p1 bra $L__BB1_3;
+; SM70-NEXT: // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT: // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT: and.b32 %r8, %r7, %r2;
+; SM70-NEXT: setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT: mov.u32 %r19, %r8;
+; SM70-NEXT: @%p2 bra $L__BB1_1;
+; SM70-NEXT: $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT: st.param.b32 [func_retval0], %r14;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
ret i16 %new
}
@@ -180,7 +206,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
; SM70-NEXT: atom.cas.b32 %r3, [%rd1], %r1, %r2;
; SM70-NEXT: st.param.b32 [func_retval0], %r2;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
ret i32 %new
}
@@ -209,7 +235,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
; SM70-NEXT: atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
; SM70-NEXT: st.param.b64 [func_retval0], %rd3;
; SM70-NEXT: ret;
- %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+ %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
ret i64 %new
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
|
|
CC: @gonzalobg |
|
Thanks @akshayrdeodhar! The change looks good to me! |
|
Not sure if the pull request was closed intentionally. Is there a particular reason for not allowing 16-bit CAS? AFAICT, it is supported by PTX. What was the motivation for this change? |
|
Recreated PR here: #120220. SASS codegen for cmpxchg is better when emulated using 32-bit CAS. |
Increases minimum CAS size from 16 bit to 32 bit.