Skip to content

Conversation

@akshayrdeodhar
Copy link
Contributor

@akshayrdeodhar akshayrdeodhar commented Dec 10, 2024

Increases the minimum CAS size from 16-bit to 32-bit.

@llvmbot
Copy link
Member

llvmbot commented Dec 10, 2024

@llvm/pr-subscribers-backend-nvptx

Author: Akshay Deodhar (akshayrdeodhar)

Changes

Increases the minimum CAS size from 16-bit to 32-bit, as 16-bit CAS is not natively supported — atom.cas.b16 is emulated.


Full diff: https://github.com/llvm/llvm-project/pull/119349.diff

5 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp (+2-1)
  • (modified) llvm/lib/Target/NVPTX/NVPTXSubtarget.h (+2)
  • (modified) llvm/test/CodeGen/NVPTX/atomics-sm70.ll (+4-4)
  • (modified) llvm/test/CodeGen/NVPTX/atomics-sm90.ll (+84-44)
  • (modified) llvm/test/CodeGen/NVPTX/cmpxchg.ll (+64-38)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ce94dded815b8f..2c3ceba03da2a5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -912,7 +912,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // actions
   computeRegisterProperties(STI.getRegisterInfo());
 
-  setMinCmpXchgSizeInBits(STI.hasAtomCas16() ? 16 : 32);
+  // PTX support for 16-bit CAS is emulated. Only use 32+
+  setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
   setMaxAtomicSizeInBitsSupported(64);
   setMaxDivRemBitWidthSupported(64);
 }
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
index e785bbf830da62..04bfcc6ded4aa1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -123,6 +123,8 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   //  set of equivalent memory operations with a scalar data-type, executed in
   //  an unspecified order on the elements in the vector.
   unsigned getMaxRequiredAlignment() const { return 8; }
+  // Get the smallest cmpxchg word size that the hardware supports.
+  unsigned getMinCmpXchgSizeInBits() const { return 32; }
 
   unsigned getPTXVersion() const { return PTXVersion; }
 
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
index 0c1ca8cb7ac166..a5b81dfc0cd009 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm70.ll
@@ -132,10 +132,10 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, half %
 ; CHECKPTX62-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX62-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX62-NEXT:    ret;
-  %r1 = atomicrmw fadd ptr %dp0, half %val seq_cst
-  %r2 = atomicrmw fadd ptr %dp0, half 1.0 seq_cst
-  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val seq_cst
-  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val seq_cst
+  %r1 = atomicrmw fadd ptr %dp0, half %val monotonic
+  %r2 = atomicrmw fadd ptr %dp0, half 1.0 monotonic
+  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, half %val monotonic
+  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, half %val monotonic
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
index 8bae18dcc5eef8..16e7baced67838 100644
--- a/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/atomics-sm90.ll
@@ -46,65 +46,105 @@ define void @test(ptr %dp0, ptr addrspace(1) %dp1, ptr addrspace(3) %dp3, bfloat
 ; CHECKPTX71-LABEL: test(
 ; CHECKPTX71:       {
 ; CHECKPTX71-NEXT:    .reg .pred %p<5>;
-; CHECKPTX71-NEXT:    .reg .b16 %rs<34>;
-; CHECKPTX71-NEXT:    .reg .b32 %r<4>;
+; CHECKPTX71-NEXT:    .reg .b16 %rs<18>;
+; CHECKPTX71-NEXT:    .reg .b32 %r<58>;
 ; CHECKPTX71-NEXT:    .reg .f32 %f<12>;
 ; CHECKPTX71-EMPTY:
 ; CHECKPTX71-NEXT:  // %bb.0:
-; CHECKPTX71-NEXT:    ld.param.b16 %rs13, [test_param_3];
-; CHECKPTX71-NEXT:    ld.param.u32 %r3, [test_param_2];
-; CHECKPTX71-NEXT:    ld.param.u32 %r2, [test_param_1];
-; CHECKPTX71-NEXT:    ld.param.u32 %r1, [test_param_0];
-; CHECKPTX71-NEXT:    ld.b16 %rs30, [%r1];
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f1, %rs13;
-; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start14
+; CHECKPTX71-NEXT:    ld.param.b16 %rs1, [test_param_3];
+; CHECKPTX71-NEXT:    ld.param.u32 %r23, [test_param_2];
+; CHECKPTX71-NEXT:    ld.param.u32 %r22, [test_param_1];
+; CHECKPTX71-NEXT:    ld.param.u32 %r24, [test_param_0];
+; CHECKPTX71-NEXT:    and.b32 %r1, %r24, -4;
+; CHECKPTX71-NEXT:    and.b32 %r25, %r24, 3;
+; CHECKPTX71-NEXT:    shl.b32 %r2, %r25, 3;
+; CHECKPTX71-NEXT:    mov.b32 %r26, 65535;
+; CHECKPTX71-NEXT:    shl.b32 %r27, %r26, %r2;
+; CHECKPTX71-NEXT:    not.b32 %r3, %r27;
+; CHECKPTX71-NEXT:    ld.u32 %r54, [%r1];
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f2, %rs1;
+; CHECKPTX71-NEXT:  $L__BB0_1: // %atomicrmw.start45
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f2, %rs30;
-; CHECKPTX71-NEXT:    add.rn.f32 %f3, %f2, %f1;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs14, %f3;
-; CHECKPTX71-NEXT:    atom.cas.b16 %rs17, [%r1], %rs30, %rs14;
-; CHECKPTX71-NEXT:    setp.ne.s16 %p1, %rs17, %rs30;
-; CHECKPTX71-NEXT:    mov.u16 %rs30, %rs17;
+; CHECKPTX71-NEXT:    shr.u32 %r28, %r54, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs2, %r28;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f1, %rs2;
+; CHECKPTX71-NEXT:    add.rn.f32 %f3, %f1, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs4, %f3;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r29, %rs4;
+; CHECKPTX71-NEXT:    shl.b32 %r30, %r29, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r31, %r54, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r32, %r31, %r30;
+; CHECKPTX71-NEXT:    atom.cas.b32 %r6, [%r1], %r54, %r32;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p1, %r6, %r54;
+; CHECKPTX71-NEXT:    mov.u32 %r54, %r6;
 ; CHECKPTX71-NEXT:    @%p1 bra $L__BB0_1;
-; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end13
-; CHECKPTX71-NEXT:    ld.b16 %rs31, [%r1];
-; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start8
+; CHECKPTX71-NEXT:  // %bb.2: // %atomicrmw.end44
+; CHECKPTX71-NEXT:    ld.u32 %r55, [%r1];
+; CHECKPTX71-NEXT:  $L__BB0_3: // %atomicrmw.start27
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f4, %rs31;
+; CHECKPTX71-NEXT:    shr.u32 %r33, %r55, %r2;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs6, %r33;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f4, %rs6;
 ; CHECKPTX71-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs18, %f5;
-; CHECKPTX71-NEXT:    atom.cas.b16 %rs21, [%r1], %rs31, %rs18;
-; CHECKPTX71-NEXT:    setp.ne.s16 %p2, %rs21, %rs31;
-; CHECKPTX71-NEXT:    mov.u16 %rs31, %rs21;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs8, %f5;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r34, %rs8;
+; CHECKPTX71-NEXT:    shl.b32 %r35, %r34, %r2;
+; CHECKPTX71-NEXT:    and.b32 %r36, %r55, %r3;
+; CHECKPTX71-NEXT:    or.b32 %r37, %r36, %r35;
+; CHECKPTX71-NEXT:    atom.cas.b32 %r9, [%r1], %r55, %r37;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p2, %r9, %r55;
+; CHECKPTX71-NEXT:    mov.u32 %r55, %r9;
 ; CHECKPTX71-NEXT:    @%p2 bra $L__BB0_3;
-; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end7
-; CHECKPTX71-NEXT:    ld.global.b16 %rs32, [%r2];
-; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start2
+; CHECKPTX71-NEXT:  // %bb.4: // %atomicrmw.end26
+; CHECKPTX71-NEXT:    and.b32 %r10, %r22, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r38, %r22, 3;
+; CHECKPTX71-NEXT:    and.b32 %r11, %r38, 24;
+; CHECKPTX71-NEXT:    shl.b32 %r40, %r26, %r11;
+; CHECKPTX71-NEXT:    not.b32 %r12, %r40;
+; CHECKPTX71-NEXT:    ld.global.u32 %r56, [%r10];
+; CHECKPTX71-NEXT:  $L__BB0_5: // %atomicrmw.start9
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f7, %rs32;
-; CHECKPTX71-NEXT:    add.rn.f32 %f8, %f7, %f1;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs22, %f8;
-; CHECKPTX71-NEXT:    atom.global.cas.b16 %rs25, [%r2], %rs32, %rs22;
-; CHECKPTX71-NEXT:    setp.ne.s16 %p3, %rs25, %rs32;
-; CHECKPTX71-NEXT:    mov.u16 %rs32, %rs25;
+; CHECKPTX71-NEXT:    shr.u32 %r41, %r56, %r11;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs10, %r41;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f6, %rs10;
+; CHECKPTX71-NEXT:    add.rn.f32 %f8, %f6, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs12, %f8;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r42, %rs12;
+; CHECKPTX71-NEXT:    shl.b32 %r43, %r42, %r11;
+; CHECKPTX71-NEXT:    and.b32 %r44, %r56, %r12;
+; CHECKPTX71-NEXT:    or.b32 %r45, %r44, %r43;
+; CHECKPTX71-NEXT:    atom.global.cas.b32 %r15, [%r10], %r56, %r45;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p3, %r15, %r56;
+; CHECKPTX71-NEXT:    mov.u32 %r56, %r15;
 ; CHECKPTX71-NEXT:    @%p3 bra $L__BB0_5;
-; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end1
-; CHECKPTX71-NEXT:    ld.shared.b16 %rs33, [%r3];
+; CHECKPTX71-NEXT:  // %bb.6: // %atomicrmw.end8
+; CHECKPTX71-NEXT:    and.b32 %r16, %r23, -4;
+; CHECKPTX71-NEXT:    shl.b32 %r46, %r23, 3;
+; CHECKPTX71-NEXT:    and.b32 %r17, %r46, 24;
+; CHECKPTX71-NEXT:    shl.b32 %r48, %r26, %r17;
+; CHECKPTX71-NEXT:    not.b32 %r18, %r48;
+; CHECKPTX71-NEXT:    ld.shared.u32 %r57, [%r16];
 ; CHECKPTX71-NEXT:  $L__BB0_7: // %atomicrmw.start
 ; CHECKPTX71-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECKPTX71-NEXT:    cvt.f32.bf16 %f10, %rs33;
-; CHECKPTX71-NEXT:    add.rn.f32 %f11, %f10, %f1;
-; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs26, %f11;
-; CHECKPTX71-NEXT:    atom.shared.cas.b16 %rs29, [%r3], %rs33, %rs26;
-; CHECKPTX71-NEXT:    setp.ne.s16 %p4, %rs29, %rs33;
-; CHECKPTX71-NEXT:    mov.u16 %rs33, %rs29;
+; CHECKPTX71-NEXT:    shr.u32 %r49, %r57, %r17;
+; CHECKPTX71-NEXT:    cvt.u16.u32 %rs14, %r49;
+; CHECKPTX71-NEXT:    cvt.f32.bf16 %f9, %rs14;
+; CHECKPTX71-NEXT:    add.rn.f32 %f11, %f9, %f2;
+; CHECKPTX71-NEXT:    cvt.rn.bf16.f32 %rs16, %f11;
+; CHECKPTX71-NEXT:    cvt.u32.u16 %r50, %rs16;
+; CHECKPTX71-NEXT:    shl.b32 %r51, %r50, %r17;
+; CHECKPTX71-NEXT:    and.b32 %r52, %r57, %r18;
+; CHECKPTX71-NEXT:    or.b32 %r53, %r52, %r51;
+; CHECKPTX71-NEXT:    atom.shared.cas.b32 %r21, [%r16], %r57, %r53;
+; CHECKPTX71-NEXT:    setp.ne.s32 %p4, %r21, %r57;
+; CHECKPTX71-NEXT:    mov.u32 %r57, %r21;
 ; CHECKPTX71-NEXT:    @%p4 bra $L__BB0_7;
 ; CHECKPTX71-NEXT:  // %bb.8: // %atomicrmw.end
 ; CHECKPTX71-NEXT:    ret;
-  %r1 = atomicrmw fadd ptr %dp0, bfloat %val seq_cst
-  %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 seq_cst
-  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val seq_cst
-  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val seq_cst
+  %r1 = atomicrmw fadd ptr %dp0, bfloat %val monotonic
+  %r2 = atomicrmw fadd ptr %dp0, bfloat 1.0 monotonic
+  %r3 = atomicrmw fadd ptr addrspace(1) %dp1, bfloat %val monotonic
+  %r4 = atomicrmw fadd ptr addrspace(3) %dp3, bfloat %val monotonic
   ret void
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg.ll b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
index f7cc32b962b9c8..dd4bd078ee8ccf 100644
--- a/llvm/test/CodeGen/NVPTX/cmpxchg.ll
+++ b/llvm/test/CodeGen/NVPTX/cmpxchg.ll
@@ -53,43 +53,44 @@ define i8 @relaxed_sys_i8(ptr %addr, i8 %cmp, i8 %new) {
 ; SM70-LABEL: relaxed_sys_i8(
 ; SM70:       {
 ; SM70-NEXT:    .reg .pred %p<3>;
-; SM70-NEXT:    .reg .b16 %rs<17>;
-; SM70-NEXT:    .reg .b32 %r<3>;
-; SM70-NEXT:    .reg .b64 %rd<5>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<21>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u8 %rs9, [relaxed_sys_i8_param_2];
+; SM70-NEXT:    ld.param.u8 %rs1, [relaxed_sys_i8_param_2];
 ; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i8_param_0];
-; SM70-NEXT:    and.b64 %rd1, %rd2, -2;
-; SM70-NEXT:    ld.param.u8 %rs10, [relaxed_sys_i8_param_1];
-; SM70-NEXT:    and.b64 %rd3, %rd2, 1;
-; SM70-NEXT:    shl.b64 %rd4, %rd3, 3;
-; SM70-NEXT:    cvt.u32.u64 %r1, %rd4;
-; SM70-NEXT:    mov.u16 %rs11, 255;
-; SM70-NEXT:    shl.b16 %rs12, %rs11, %r1;
-; SM70-NEXT:    not.b16 %rs2, %rs12;
-; SM70-NEXT:    shl.b16 %rs3, %rs9, %r1;
-; SM70-NEXT:    shl.b16 %rs4, %rs10, %r1;
-; SM70-NEXT:    ld.u16 %rs13, [%rd1];
-; SM70-NEXT:    and.b16 %rs16, %rs13, %rs2;
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    cvt.u32.u64 %r9, %rd2;
+; SM70-NEXT:    and.b32 %r10, %r9, 3;
+; SM70-NEXT:    shl.b32 %r1, %r10, 3;
+; SM70-NEXT:    mov.b32 %r11, 255;
+; SM70-NEXT:    shl.b32 %r12, %r11, %r1;
+; SM70-NEXT:    not.b32 %r2, %r12;
+; SM70-NEXT:    cvt.u32.u16 %r13, %rs1;
+; SM70-NEXT:    and.b32 %r14, %r13, 255;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    ld.param.u8 %r15, [relaxed_sys_i8_param_1];
+; SM70-NEXT:    shl.b32 %r4, %r15, %r1;
+; SM70-NEXT:    ld.u32 %r16, [%rd1];
+; SM70-NEXT:    and.b32 %r20, %r16, %r2;
 ; SM70-NEXT:  $L__BB0_1: // %partword.cmpxchg.loop
 ; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
-; SM70-NEXT:    or.b16 %rs14, %rs16, %rs3;
-; SM70-NEXT:    or.b16 %rs15, %rs16, %rs4;
-; SM70-NEXT:    atom.cas.b16 %rs7, [%rd1], %rs15, %rs14;
-; SM70-NEXT:    setp.eq.s16 %p1, %rs7, %rs15;
+; SM70-NEXT:    or.b32 %r17, %r20, %r3;
+; SM70-NEXT:    or.b32 %r18, %r20, %r4;
+; SM70-NEXT:    atom.cas.b32 %r7, [%rd1], %r18, %r17;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r18;
 ; SM70-NEXT:    @%p1 bra $L__BB0_3;
 ; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
 ; SM70-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; SM70-NEXT:    and.b16 %rs8, %rs7, %rs2;
-; SM70-NEXT:    setp.ne.s16 %p2, %rs16, %rs8;
-; SM70-NEXT:    mov.u16 %rs16, %rs8;
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r20, %r8;
+; SM70-NEXT:    mov.u32 %r20, %r8;
 ; SM70-NEXT:    @%p2 bra $L__BB0_1;
 ; SM70-NEXT:  $L__BB0_3: // %partword.cmpxchg.end
-; SM70-NEXT:    cvt.u32.u16 %r2, %rs9;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
+; SM70-NEXT:    st.param.b32 [func_retval0], %r13;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new seq_cst seq_cst
+  %pairold = cmpxchg ptr %addr, i8 %cmp, i8 %new monotonic monotonic
   ret i8 %new
 }
 
@@ -137,19 +138,44 @@ define i16 @relaxed_sys_i16(ptr %addr, i16 %cmp, i16 %new) {
 ;
 ; SM70-LABEL: relaxed_sys_i16(
 ; SM70:       {
-; SM70-NEXT:    .reg .b16 %rs<4>;
-; SM70-NEXT:    .reg .b32 %r<2>;
-; SM70-NEXT:    .reg .b64 %rd<2>;
+; SM70-NEXT:    .reg .pred %p<3>;
+; SM70-NEXT:    .reg .b16 %rs<2>;
+; SM70-NEXT:    .reg .b32 %r<20>;
+; SM70-NEXT:    .reg .b64 %rd<3>;
 ; SM70-EMPTY:
 ; SM70-NEXT:  // %bb.0:
-; SM70-NEXT:    ld.param.u64 %rd1, [relaxed_sys_i16_param_0];
-; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_1];
-; SM70-NEXT:    ld.param.u16 %rs2, [relaxed_sys_i16_param_2];
-; SM70-NEXT:    atom.cas.b16 %rs3, [%rd1], %rs1, %rs2;
-; SM70-NEXT:    cvt.u32.u16 %r1, %rs2;
-; SM70-NEXT:    st.param.b32 [func_retval0], %r1;
+; SM70-NEXT:    ld.param.u16 %rs1, [relaxed_sys_i16_param_2];
+; SM70-NEXT:    ld.param.u64 %rd2, [relaxed_sys_i16_param_0];
+; SM70-NEXT:    and.b64 %rd1, %rd2, -4;
+; SM70-NEXT:    ld.param.u16 %r9, [relaxed_sys_i16_param_1];
+; SM70-NEXT:    cvt.u32.u64 %r10, %rd2;
+; SM70-NEXT:    and.b32 %r11, %r10, 3;
+; SM70-NEXT:    shl.b32 %r1, %r11, 3;
+; SM70-NEXT:    mov.b32 %r12, 65535;
+; SM70-NEXT:    shl.b32 %r13, %r12, %r1;
+; SM70-NEXT:    not.b32 %r2, %r13;
+; SM70-NEXT:    cvt.u32.u16 %r14, %rs1;
+; SM70-NEXT:    shl.b32 %r3, %r14, %r1;
+; SM70-NEXT:    shl.b32 %r4, %r9, %r1;
+; SM70-NEXT:    ld.u32 %r15, [%rd1];
+; SM70-NEXT:    and.b32 %r19, %r15, %r2;
+; SM70-NEXT:  $L__BB1_1: // %partword.cmpxchg.loop
+; SM70-NEXT:    // =>This Inner Loop Header: Depth=1
+; SM70-NEXT:    or.b32 %r16, %r19, %r3;
+; SM70-NEXT:    or.b32 %r17, %r19, %r4;
+; SM70-NEXT:    atom.cas.b32 %r7, [%rd1], %r17, %r16;
+; SM70-NEXT:    setp.eq.s32 %p1, %r7, %r17;
+; SM70-NEXT:    @%p1 bra $L__BB1_3;
+; SM70-NEXT:  // %bb.2: // %partword.cmpxchg.failure
+; SM70-NEXT:    // in Loop: Header=BB1_1 Depth=1
+; SM70-NEXT:    and.b32 %r8, %r7, %r2;
+; SM70-NEXT:    setp.ne.s32 %p2, %r19, %r8;
+; SM70-NEXT:    mov.u32 %r19, %r8;
+; SM70-NEXT:    @%p2 bra $L__BB1_1;
+; SM70-NEXT:  $L__BB1_3: // %partword.cmpxchg.end
+; SM70-NEXT:    st.param.b32 [func_retval0], %r14;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new seq_cst seq_cst
+  %pairold = cmpxchg ptr %addr, i16 %cmp, i16 %new monotonic monotonic
   ret i16 %new
 }
 
@@ -180,7 +206,7 @@ define i32 @relaxed_sys_i32(ptr %addr, i32 %cmp, i32 %new) {
 ; SM70-NEXT:    atom.cas.b32 %r3, [%rd1], %r1, %r2;
 ; SM70-NEXT:    st.param.b32 [func_retval0], %r2;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new seq_cst seq_cst
+  %pairold = cmpxchg ptr %addr, i32 %cmp, i32 %new monotonic monotonic
   ret i32 %new
 }
 
@@ -209,7 +235,7 @@ define i64 @relaxed_sys_i64(ptr %addr, i64 %cmp, i64 %new) {
 ; SM70-NEXT:    atom.cas.b64 %rd4, [%rd1], %rd2, %rd3;
 ; SM70-NEXT:    st.param.b64 [func_retval0], %rd3;
 ; SM70-NEXT:    ret;
-  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new seq_cst seq_cst
+  %pairold = cmpxchg ptr %addr, i64 %cmp, i64 %new monotonic monotonic
   ret i64 %new
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

@akshayrdeodhar
Copy link
Contributor Author

CC: @gonzalobg

@schwarzschild-radius
Copy link
Contributor

Thanks @akshayrdeodhar! The change looks good to me!
Will let Artem review and approve the changes

@akshayrdeodhar akshayrdeodhar deleted the dev/upstream/casatomics branch December 10, 2024 18:25
@akshayrdeodhar akshayrdeodhar removed the request for review from Artem-B December 10, 2024 18:25
@Artem-B
Copy link
Member

Artem-B commented Dec 10, 2024

Not sure if the pull request was closed intentionally.

Is there a particular reason for not allowing 16-bit CAS? AFAICT, it is supported by PTX. What was the motivation for this change?

@akshayrdeodhar
Copy link
Contributor Author

Recreated PR here: #120220

SASS codegen for cmpxchg is better when emulated using 32-bit CAS.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants