|
18 | 18 | //! Therefore we chose to go with the approach of implementing all atomics inside cuda_std. In the future, we may support
|
19 | 19 | //! a subset of core atomics, but for now, you will have to use cuda_std atomics.
|
20 | 20 |
|
| 21 | +#![allow(unused_unsafe)] |
| 22 | + |
21 | 23 | pub mod intrinsics;
|
| 24 | +pub mod mid; |
| 25 | + |
| 26 | +use core::cell::UnsafeCell; |
| 27 | +use core::sync::atomic::Ordering; |
| 28 | +use paste::paste; |
| 29 | + |
/// Derives the failure ordering for the CPU `fetch_update` emulation from the
/// caller-supplied success ordering: a compare-exchange failure may not carry
/// `Release` semantics, so `Release`/`AcqRel` are weakened accordingly.
#[allow(dead_code)] // only referenced on the non-CUDA (CPU emulation) paths
fn fail_order(order: Ordering) -> Ordering {
    match order {
        Ordering::Acquire | Ordering::AcqRel => Ordering::Acquire,
        Ordering::Release | Ordering::Relaxed => Ordering::Relaxed,
        other => other,
    }
}
| 39 | + |
/// Expands to the trailing sentence of a generated type's doc comment,
/// describing the synchronization scope the atomic operates at.
macro_rules! scope_doc {
    (system) => {
        "the entire system."
    };
    (device) => {
        "a single device (GPU)."
    };
    (block) => {
        "a single thread block (also called a CTA, cooperative thread array)."
    };
}
| 51 | + |
/// Expands to a `# Safety` doc section when an unsafety marker ident is
/// supplied (e.g. `unsafe`); expands to nothing when the argument is absent.
macro_rules! safety_doc {
    ($($unsafety:ident)?) => {
        $(
            concat!(
                "# Safety\n",
                "This function is ",
                stringify!($unsafety),
                " because it does not synchronize\n",
                "across the entire GPU or System, which leaves it open for data races if used incorrectly"
            )
        )?
    };
}
| 63 | + |
// Generates an atomic float wrapper type over `$float_ty`:
// - `$atomic_ty`: name of the generated struct
// - `$align`:     required alignment in bytes (matches the same-width integer atomic)
// - `$scope`:     synchronization scope ident (`device`, `block`, or `system`); pasted
//                 into the `mid::atomic_*_<scope>` intrinsic names on the CUDA path
// - `$width`:     bit width, pasted into `AtomicU<width>` for the CPU fallback path
// - optional trailing `unsafe` token: marks every operation `unsafe` and attaches the
//   `safety_doc!` section (used for scopes narrower than the whole GPU/system)
macro_rules! atomic_float {
    ($float_ty:ident, $atomic_ty:ident, $align:tt, $scope:ident, $width:tt $(,$unsafety:ident)?) => {
        #[doc = concat!("A ", stringify!($width), "-bit float type which can be safely shared between threads and synchronizes across ", scope_doc!($scope))]
        ///
        /// This type is guaranteed to have the same memory representation as the underlying
        /// floating-point type [`
        #[doc = stringify!($float_ty)]
        /// `].
        ///
        /// The functions on this type map to hardware instructions on CUDA platforms, and are emulated
        /// with a CAS loop on the CPU (non-CUDA targets).
        #[repr(C, align($align))]
        pub struct $atomic_ty {
            v: UnsafeCell<$float_ty>,
        }

        // SAFETY: atomic ops make sure this is fine
        unsafe impl Sync for $atomic_ty {}

        impl $atomic_ty {
            paste! {
                /// Creates a new atomic float.
                pub const fn new(v: $float_ty) -> $atomic_ty {
                    Self {
                        v: UnsafeCell::new(v),
                    }
                }

                // CPU-only: reinterpret this atomic as the integer atomic of the same
                // width so std's atomics can emulate the float operations bitwise.
                #[cfg(not(target_os = "cuda"))]
                fn as_atomic_bits(&self) -> &core::sync::atomic::[<AtomicU $width>] {
                    // SAFETY: AtomicU32/U64 pointers are compatible with UnsafeCell<u32/u64>.
                    unsafe {
                        core::mem::transmute(self)
                    }
                }

                // CPU-only: compare-exchange-loop emulation of a float read-modify-write
                // on the bit pattern; returns the value *before* the update. `fetch_update`
                // only errors when the closure returns `None`, which it never does here,
                // so the `unwrap` cannot fail.
                #[cfg(not(target_os = "cuda"))]
                fn update_with(&self, order: Ordering, mut func: impl FnMut($float_ty) -> $float_ty) -> $float_ty {
                    let res = self
                        .as_atomic_bits()
                        .fetch_update(order, fail_order(order), |prev| {
                            Some(func($float_ty::from_bits(prev))).map($float_ty::to_bits)
                        }).unwrap();
                    $float_ty::from_bits(res)
                }

                /// Adds to the current value, returning the previous value **before** the addition.
                ///
                $(#[doc = safety_doc!($unsafety)])?
                pub $($unsafety)? fn fetch_add(&self, val: $float_ty, order: Ordering) -> $float_ty {
                    #[cfg(target_os = "cuda")]
                    // SAFETY: data races are prevented by atomic intrinsics and the pointer we get is valid.
                    unsafe {
                        // NOTE(review): assumes `mid`'s scoped fetch-add maps `order` to
                        // std-compatible semantics — confirm in the `mid` module.
                        mid::[<atomic_fetch_add_ $float_ty _ $scope>](self.v.get(), order, val)
                    }
                    #[cfg(not(target_os = "cuda"))]
                    self.update_with(order, |v| v + val)
                }

                /// Atomically loads the value behind this atomic.
                ///
                /// `load` takes an [`Ordering`] argument which describes the memory ordering of this operation.
                /// Possible values are [`Ordering::SeqCst`], [`Ordering::Acquire`], and [`Ordering::Relaxed`].
                ///
                /// # Panics
                ///
                /// Panics if `order` is [`Ordering::Release`] or [`Ordering::AcqRel`].
                ///
                $(#[doc = safety_doc!($unsafety)])?
                pub $($unsafety)? fn load(&self, order: Ordering) -> $float_ty {
                    #[cfg(target_os = "cuda")]
                    // SAFETY: the pointer is valid, and the scoped load intrinsic prevents data races.
                    unsafe {
                        let val = mid::[<atomic_load_ $width _ $scope>](self.v.get().cast(), order);
                        $float_ty::from_bits(val)
                    }
                    #[cfg(not(target_os = "cuda"))]
                    {
                        let val = self.as_atomic_bits().load(order);
                        $float_ty::from_bits(val)
                    }
                }

                /// Atomically stores a value into this atomic.
                ///
                /// `store` takes an [`Ordering`] argument which describes the memory ordering of this operation.
                /// Possible values are [`Ordering::SeqCst`], [`Ordering::Release`], and [`Ordering::Relaxed`].
                ///
                /// # Panics
                ///
                /// Panics if `order` is [`Ordering::Acquire`] or [`Ordering::AcqRel`].
                ///
                $(#[doc = safety_doc!($unsafety)])?
                pub $($unsafety)? fn store(&self, val: $float_ty, order: Ordering) {
                    #[cfg(target_os = "cuda")]
                    // SAFETY: the pointer is valid, and the scoped store intrinsic prevents data races.
                    unsafe {
                        mid::[<atomic_store_ $width _ $scope>](self.v.get().cast(), order, val.to_bits());
                    }
                    #[cfg(not(target_os = "cuda"))]
                    self.as_atomic_bits().store(val.to_bits(), order);
                }
            }
        }
    };
}
| 168 | + |
| 169 | +atomic_float!(f32, AtomicF32, 4, device, 32); |
| 170 | +atomic_float!(f64, AtomicF64, 8, device, 64); |
| 171 | +atomic_float!(f32, BlockAtomicF32, 4, block, 32, unsafe); |
| 172 | +atomic_float!(f64, BlockAtomicF64, 8, block, 64, unsafe); |
| 173 | +atomic_float!(f32, SystemAtomicF32, 4, device, 32); |
| 174 | +atomic_float!(f64, SystemAtomicF64, 8, device, 64); |
0 commit comments