Feat: atomics 3; return of the atomics

RDambrosio016 · RDambrosio016 · commit 8d264add3637 · 2022-01-13T16:00:04.000-05:00
diff --git a/crates/cuda_std/src/atomic.rs b/crates/cuda_std/src/atomic.rs
@@ -89,6 +89,11 @@ macro_rules! atomic_float {
                     }
                 }
 
+                /// Consumes the atomic and returns the contained value. Takes `self` by value, so no [`Ordering`] argument is involved.
+                pub fn into_inner(self) -> $float_ty {
+                    self.v.into_inner()
+                }
+
                 #[cfg(not(target_os = "cuda"))]
                 fn as_atomic_bits(&self) -> &core::sync::atomic::[<AtomicU $width>] {
                     // SAFETY: AtomicU32/U64 pointers are compatible with UnsafeCell<u32/u64>.
@@ -120,6 +125,60 @@ macro_rules! atomic_float {
                     self.update_with(order, |v| v + val)
                 }
 
+                /// Subtracts from the current value, returning the previous value **before** the subtraction.
+                ///
+                /// Note: on CUDA this is performed as an atomic add of the negated operand (`old + (-val)`), because PTX has no dedicated atomic subtract instruction.
+                ///
+                $(#[doc = safety_doc!($unsafety)])?
+                pub $($unsafety)? fn fetch_sub(&self, val: $float_ty, order: Ordering) -> $float_ty {
+                    #[cfg(target_os = "cuda")]
+                    // SAFETY: data races are prevented by atomic intrinsics and the pointer we get is valid.
+                    unsafe {
+                        mid::[<atomic_fetch_sub_ $float_ty _ $scope>](self.v.get(), order, val)
+                    }
+                    #[cfg(not(target_os = "cuda"))]
+                    self.update_with(order, |v| v - val)
+                }
+
+                /// Bitwise "and" with the current value, applied to the raw bit patterns (`to_bits`/`from_bits` on the host path). Returns the value **before** the "and".
+                ///
+                $(#[doc = safety_doc!($unsafety)])?
+                pub $($unsafety)? fn fetch_and(&self, val: $float_ty, order: Ordering) -> $float_ty {
+                    #[cfg(target_os = "cuda")]
+                    // SAFETY: data races are prevented by atomic intrinsics and the pointer we get is valid.
+                    unsafe {
+                        mid::[<atomic_fetch_and_ $float_ty _ $scope>](self.v.get(), order, val)
+                    }
+                    #[cfg(not(target_os = "cuda"))]
+                    self.update_with(order, |v| $float_ty::from_bits(v.to_bits() & val.to_bits()))
+                }
+
+                /// Bitwise "or" with the current value, applied to the raw bit patterns (`to_bits`/`from_bits` on the host path). Returns the value **before** the "or".
+                ///
+                $(#[doc = safety_doc!($unsafety)])?
+                pub $($unsafety)? fn fetch_or(&self, val: $float_ty, order: Ordering) -> $float_ty {
+                    #[cfg(target_os = "cuda")]
+                    // SAFETY: data races are prevented by atomic intrinsics and the pointer we get is valid.
+                    unsafe {
+                        mid::[<atomic_fetch_or_ $float_ty _ $scope>](self.v.get(), order, val)
+                    }
+                    #[cfg(not(target_os = "cuda"))]
+                    self.update_with(order, |v| $float_ty::from_bits(v.to_bits() | val.to_bits()))
+                }
+
+                /// Bitwise "xor" with the current value, applied to the raw bit patterns (`to_bits`/`from_bits` on the host path). Returns the value **before** the "xor".
+                ///
+                $(#[doc = safety_doc!($unsafety)])?
+                pub $($unsafety)? fn fetch_xor(&self, val: $float_ty, order: Ordering) -> $float_ty {
+                    #[cfg(target_os = "cuda")]
+                    // SAFETY: data races are prevented by atomic intrinsics and the pointer we get is valid.
+                    unsafe {
+                        mid::[<atomic_fetch_xor_ $float_ty _ $scope>](self.v.get(), order, val)
+                    }
+                    #[cfg(not(target_os = "cuda"))]
+                    self.update_with(order, |v| $float_ty::from_bits(v.to_bits() ^ val.to_bits()))
+                }
+
                 /// Atomically loads the value behind this atomic.
                 ///
                 /// `load` takes an [`Ordering`] argument which describes the memory ordering of this operation.
diff --git a/crates/cuda_std/src/atomic/intrinsics.rs b/crates/cuda_std/src/atomic/intrinsics.rs
@@ -6,47 +6,56 @@ use paste::paste;
 
 #[gpu_only]
 pub unsafe fn membar_device() {
-    asm!("membar.gl");
+    asm!("membar.gl;"); // `.gl` is the device-wide `membar` level; PTX statements must end with `;`
 }
 
 #[gpu_only]
 pub unsafe fn membar_block() {
-    asm!("membar.cta");
+    asm!("membar.cta;"); // `.cta` is the thread-block `membar` level; PTX statements must end with `;`
 }
 
 #[gpu_only]
 pub unsafe fn membar_system() {
-    asm!("membar.sys");
+    asm!("membar.sys;"); // `.sys` is the system-wide `membar` level; PTX statements must end with `;`
 }
 
 #[gpu_only]
 pub unsafe fn fence_sc_device() {
-    asm!("fence.sc.gl");
+    asm!("fence.sc.gpu;"); // `fence` takes a scope (`.cta`/`.gpu`/`.sys`); `.gl` is only valid as a `membar` level
 }
 
 #[gpu_only]
 pub unsafe fn fence_sc_block() {
-    asm!("fence.sc.cta");
+    asm!("fence.sc.cta;"); // sequentially consistent fence at `.cta` (thread block) scope
 }
 
 #[gpu_only]
 pub unsafe fn fence_sc_system() {
-    asm!("fence.sc.sys");
+    asm!("fence.sc.sys;"); // sequentially consistent fence at `.sys` (system) scope
 }
 
 #[gpu_only]
 pub unsafe fn fence_acqrel_device() {
-    asm!("fence.acq_rel.gl");
+    asm!("fence.acq_rel.gpu;"); // `fence` takes a scope (`.cta`/`.gpu`/`.sys`); `.gl` is only valid as a `membar` level
 }
 
 #[gpu_only]
 pub unsafe fn fence_acqrel_block() {
-    asm!("fence.acq_rel.sys");
+    asm!("fence.acq_rel.cta;"); // block scope is `.cta`; the previous `.sys` was copied from the system-scope variant
 }
 
 #[gpu_only]
 pub unsafe fn fence_acqrel_system() {
-    asm!("fence.acq_rel.sys");
+    asm!("fence.acq_rel.sys;"); // acquire-release fence at `.sys` (system) scope
+}
+
+macro_rules! load_scope {
+    // `volatile` accesses get no scope qualifier at all.
+    (volatile, $scope:ident) => { "" };
+    // Any other ordering is followed by `.<scope>` in the mnemonic.
+    ($ordering:ident, $scope:ident) => {
+        concat!(".", stringify!($scope))
+    };
 }
 
 macro_rules! load {
@@ -59,7 +68,7 @@ macro_rules! load {
                 pub unsafe fn [<atomic_load_ $ordering _ $width _ $scope>](ptr: *const [<u $width>]) -> [<u $width>] {
                     let mut out;
                     asm!(
-                        concat!("ld.", stringify!($ordering), ".", stringify!($scope_asm), ".", stringify!([<u $width>]), "{}, [{}]"),
+                        concat!("ld.", stringify!($ordering), load_scope!($ordering, $scope_asm), ".", stringify!([<u $width>]), " {}, [{}];"), // the mnemonic needs the PTX scope spelling (`$scope_asm`), not the Rust-facing name
                         out([<reg $width>]) out,
                         in(reg64) ptr
                     );
@@ -105,7 +114,7 @@ macro_rules! store {
                 #[doc = concat!("Performs a ", stringify!($ordering), " atomic store at the ", stringify!($scope), " level with a width of ", stringify!($width), " bits")]
                 pub unsafe fn [<atomic_store_ $ordering _ $width _ $scope>](ptr: *mut [<u $width>], val: [<u $width>]) {
                     asm!(
-                        concat!("st.", stringify!($ordering), ".", stringify!($scope_asm), ".", stringify!([<u $width>]), "[{}], {}"),
+                        concat!("st.", stringify!($ordering), load_scope!($ordering, $scope_asm), ".", stringify!([<u $width>]), " [{}], {};"), // the mnemonic needs the PTX scope spelling (`$scope_asm`), not the Rust-facing name
                         in(reg64) ptr,
                         in([<reg $width>]) val,
                     );
@@ -141,6 +150,19 @@ store! {
     volatile, 64, system, sys,
 }
 
+// Maps a Rust scalar type name to the type suffix that is spliced into the
+// instruction template of the atomic-op macros.
+#[allow(unused_macros)]
+macro_rules! ptx_type {
+    // Signed integers are spelled `sN` rather than `iN`.
+    (i32) => { "s32" };
+    (i64) => { "s64" };
+    // Every other type name is passed through unchanged.
+    ($other:ident) => {
+        stringify!($other)
+    };
+}
+
 #[allow(unused_macros)]
 macro_rules! ordering {
     (volatile) => {
@@ -172,7 +194,8 @@ macro_rules! atomic_fetch_op_2_reg {
                             ".",
                             stringify!($op),
                             ".",
-                            "{}, [{}]"
+                            ptx_type!($type),
+                            " {}, [{}];"
                         ),
                         out([<reg $width>]) out,
                         in(reg64) ptr,
@@ -359,7 +382,8 @@ macro_rules! atomic_fetch_op_3_reg {
                             ".",
                             stringify!($op),
                             ".",
-                            "{}, [{}], {}"
+                            ptx_type!($type),
+                            " {}, [{}], {};"
                         ),
                         out([<reg $width>]) out,
                         in(reg64) ptr,
@@ -1101,7 +1125,8 @@ macro_rules! atomic_fetch_op_4_reg {
                             ".",
                             stringify!($op),
                             ".",
-                            "{}, [{}], {}, {}"
+                            ptx_type!($type),
+                            " {}, [{}], {}, {};"
                         ),
                         out([<reg $width>]) out,
                         in(reg64) ptr,
@@ -1227,6 +1252,19 @@ atomic_fetch_op_4_reg! {
     volatile, cas, 64, f64, system, sys,
 }
 
+// Negates `$val` for `atomic_sub`. Integers use `wrapping_neg` so the minimum value cannot trip an overflow panic.
+#[allow(unused_macros)]
+macro_rules! negation {
+    // Unsigned operands are reinterpreted as signed first, so the result type stays `i32`/`i64` as before.
+    (u32, $val:ident) => {{ ($val as i32).wrapping_neg() }};
+    (u64, $val:ident) => {{ ($val as i64).wrapping_neg() }};
+    // Signed operands: plain `-MIN` would overflow, and two's-complement wrap-around is what the add needs.
+    (i32, $val:ident) => {{ $val.wrapping_neg() }};
+    (i64, $val:ident) => {{ $val.wrapping_neg() }};
+    // Everything else (the float types) negates with the unary operator.
+    ($type:ty, $val:ident) => {{ -$val }};
+}
+
 // atomic sub is a little special, nvcc implements it as an atomic add with a negated operand. PTX
 // does not have atom.sub.
 macro_rules! atomic_sub {
@@ -1246,11 +1284,12 @@ macro_rules! atomic_sub {
                             ".",
                             "add",
                             ".",
-                            "{}, [{}], {}"
+                            ptx_type!($type),
+                            " {}, [{}], {};"
                         ),
                         out([<reg $width>]) out,
                         in(reg64) ptr,
-                        in([<reg $width>]) -(val as [<i $width>]),
+                        in([<reg $width>]) negation!($type, val),
                     );
                     out
                 }