Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ members = [
"rstsr-native-impl",
"rstsr-sci-traits",
"crates-device/rstsr-openblas",
"crates-device/rstsr-accelerate",
"crates-device/rstsr-mkl",
"crates-device/rstsr-blis",
"crates-device/rstsr-aocl",
Expand Down Expand Up @@ -40,6 +41,7 @@ rstsr-linalg-traits = { path = "./rstsr-linalg-traits", default-features = false
rstsr-sci-traits = { path = "./rstsr-sci-traits", default-features = false, version = "0.6.2" }
# members (device)
rstsr-openblas = { path = "./crates-device/rstsr-openblas", default-features = false, version = "0.6.2" }
rstsr-accelerate = { path = "./crates-device/rstsr-accelerate", default-features = false, version = "0.6.2" }
rstsr-mkl = { path = "./crates-device/rstsr-mkl", default-features = false, version = "0.6.2" }
rstsr-blis = { path = "./crates-device/rstsr-blis", default-features = false, version = "0.6.2" }
rstsr-aocl = { path = "./crates-device/rstsr-aocl", default-features = false, version = "0.6.2" }
Expand All @@ -51,6 +53,7 @@ rstsr-test-manifest = { path = "./rstsr-test-manifest", default-features = false
# ffi dependencies
rstsr-cblas-base = { version = "0.1" }
rstsr-openblas-ffi = { version = "0.5", default-features = false, features = ["blas", "cblas", "lapack"] }
rstsr-lapack-ffi = { version = "0.5", default-features = false, features = ["blas", "cblas", "lapack"] }
rstsr-mkl-ffi = { version = "0.2", default-features = false, features = ["blas", "cblas", "lapack"] }
rstsr-blis-ffi = { version = "0.2", default-features = false, features = ["lapack"] }
rstsr-aocl-ffi = { version = "0.2", default-features = false, features = ["blis", "lapack"] }
Expand Down
35 changes: 35 additions & 0 deletions crates-device/rstsr-accelerate/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[package]
name = "rstsr-accelerate"
version.workspace = true
edition.workspace = true
description.workspace = true
repository.workspace = true
keywords.workspace = true
categories.workspace = true
license.workspace = true
readme = "readme.md"

[dependencies]
rayon = { workspace = true }
num = { workspace = true }
duplicate = { workspace = true }
rstsr-lapack-ffi = { workspace = true }
rstsr-native-impl = { workspace = true, features = ["rayon"] }
rstsr-core = { workspace = true, features = ["rayon"] }
rstsr-common = { workspace = true, features = ["rayon"] }
rstsr-dtype-traits = { workspace = true, features = ["half"] }
rstsr-blas-traits = { workspace = true }
rstsr-linalg-traits = { workspace = true, optional = true }
rstsr-sci-traits = { workspace = true, optional = true }

[dev-dependencies]
rstsr = { path = "../../rstsr", default-features = false, features = ["linalg"] }
rstsr-test-manifest = { workspace = true }

[features]
default = ["linalg"]
dynamic_loading = ["rstsr-lapack-ffi/dynamic_loading"]
faer = ["rstsr-core/faer"]
ilp64 = ["rstsr-lapack-ffi/ilp64", "rstsr-blas-traits/ilp64"]
linalg = ["dep:rstsr-linalg-traits"]
sci = ["dep:rstsr-sci-traits"]
12 changes: 12 additions & 0 deletions crates-device/rstsr-accelerate/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
/// Build script entry point: links Apple's Accelerate framework.
///
/// The decision is based on the *target* operating system, which Cargo passes
/// to build scripts through the `CARGO_CFG_TARGET_OS` environment variable.
///
/// # Panics
///
/// Panics when the target operating system is not macOS (or when the variable
/// is absent, i.e. the script is not being run by Cargo).
fn main() {
    // A `#[cfg(target_os = ...)]` attribute inside a build script describes the
    // machine that compiles and runs the script (the host), not the platform
    // the crate is being built for. Reading the variable at run time keeps
    // cross-compilation correct in both directions.
    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();

    if target_os != "macos" {
        panic!("'accelerate' feature is only available for macOS target.");
    }

    println!("cargo:rustc-link-lib=framework=Accelerate");
}
40 changes: 40 additions & 0 deletions crates-device/rstsr-accelerate/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# RSTSR Accelerate device

This crate enables the Apple Accelerate device (macOS targets only).

For more information on the underlying BLAS/LAPACK bindings and their usage, we refer to the [documentation of rstsr-lapack-ffi](https://docs.rs/rstsr-lapack-ffi/).

## Usage

```rust
use rstsr_core::prelude::*;
use rstsr_openblas::DeviceOpenBLAS;

// specify the number of threads of 16
let device = DeviceOpenBLAS::new(16);
// if you want to use the default number of threads, use the following line
// let device = DeviceOpenBLAS::default();

let a = rt::linspace((0.0, 1.0, 1048576, &device)).into_shape([16, 256, 256]);
let b = rt::linspace((1.0, 2.0, 1048576, &device)).into_shape([16, 256, 256]);

// by optimized BLAS, the following operation is very fast
let c = &a % &b;

// mean of all elements is also performed in parallel
let c_mean = c.mean_all();

println!("{:?}", c_mean);
// compare the absolute difference: without `.abs()` the check would pass for
// any value smaller than the reference, however far off it is
assert!((c_mean - 213.2503660477036).abs() < 1e-6);
```

## Important Notes

- We do not provide automatic linkage:
- Please add `-l openblas` in `RUSTFLAGS`, or `cargo:rustc-link-lib=openblas` in build.rs, or something similar, to your project.
We do not use external FFI crates `blas` or `blas-sys`, and do not automatically search OpenBLAS library for linking.
- If feature `openmp` activated, please add `-l gomp` or `-l omp` in `RUSTFLAGS`, or `cargo:rustc-link-lib=gomp` or `cargo:rustc-link-lib=omp` in build.rs, or something similar, to your project.
We do not use external FFI crate `openmp-sys`, and do not automatically search for OpenMP library for linking.

- If your OpenBLAS is compiled with OpenMP, please add `openmp` feature to either this crate or `rstsr-openblas-ffi`.
- In our testing, OpenBLAS with OpenMP is probably more efficient than pthreads. However, we currently decided not make `openmp` as default feature.
79 changes: 79 additions & 0 deletions crates-device/rstsr-accelerate/src/conversion.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
use crate::prelude_dev::*;

// Generates a `DeviceChangeAPI` implementation that moves a tensor from device
// `$DevA` to device `$DevB`.
//
// The source representation must expose `Vec<T>` data; the data and layout are
// handed over unchanged and only the device handle is replaced by a clone of
// the destination device.
macro_rules! impl_change_device {
    ($DevA: ty, $DevB: ty) => {
        impl<'a, R, T, D> DeviceChangeAPI<'a, $DevB, R, T, D> for $DevA
        where
            T: Clone + Send + Sync + 'a,
            D: DimAPI,
            R: DataCloneAPI<Data = Vec<T>>,
        {
            type Repr = R;
            type ReprTo = DataRef<'a, Vec<T>>;

            // Keeps the representation `R`: the tensor is taken apart, the source
            // device is dropped and the same data is wrapped with `device`.
            fn change_device(
                tensor: TensorAny<R, T, $DevA, D>,
                device: &$DevB,
            ) -> Result<TensorAny<Self::Repr, T, $DevB, D>> {
                let (src_storage, layout) = tensor.into_raw_parts();
                let (data, _) = src_storage.into_raw_parts();
                Ok(TensorAny::new(Storage::new(data, device.clone()), layout))
            }

            // Converts the tensor to an owned one first, then changes its device.
            fn into_device(
                tensor: TensorAny<R, T, $DevA, D>,
                device: &$DevB,
            ) -> Result<TensorAny<DataOwned<Vec<T>>, T, $DevB, D>> {
                DeviceChangeAPI::change_device(tensor.into_owned(), device)
            }

            // Takes a view of the borrowed tensor and changes the device of that view.
            fn to_device(tensor: &'a TensorAny<R, T, $DevA, D>, device: &$DevB) -> Result<TensorView<'a, T, $DevB, D>> {
                DeviceChangeAPI::change_device(tensor.view(), device)
            }
        }
    };
}

// Conversions between the serial CPU device and the BLAS device (both
// directions), plus the BLAS-to-BLAS case.
impl_change_device!(DeviceCpuSerial, DeviceBLAS);
impl_change_device!(DeviceBLAS, DeviceCpuSerial);
impl_change_device!(DeviceBLAS, DeviceBLAS);
// Conversions from and to the Faer device are only generated when the `faer`
// feature is enabled.
#[cfg(feature = "faer")]
impl_change_device!(DeviceFaer, DeviceBLAS);
#[cfg(feature = "faer")]
impl_change_device!(DeviceBLAS, DeviceFaer);

#[cfg(test)]
mod test {
    use super::*;

    // Smoke test: converts BLAS -> serial CPU and a serial-CPU view -> BLAS,
    // printing each result. Nothing is asserted beyond the calls succeeding.
    #[test]
    fn test_device_conversion_cpu_serial() {
        let serial = DeviceCpuSerial::default();
        let blas = DeviceBLAS::new(0);

        let on_blas = linspace((1.0, 5.0, 5, &blas));
        let b = on_blas.to_device(&serial);
        println!("{b:?}");

        let on_serial = linspace((1.0, 5.0, 5, &serial));
        // the view is bound to a name because the converted tensor borrows from it
        let serial_view = on_serial.view();
        let b = serial_view.to_device(&blas);
        println!("{b:?}");
    }

    // Same smoke test against the Faer device; only built with the `faer` feature.
    #[test]
    #[cfg(feature = "faer")]
    fn test_device_conversion_faer() {
        let faer = DeviceFaer::new(0);
        let blas = DeviceBLAS::new(0);

        let on_blas = linspace((1.0, 5.0, 5, &blas));
        let b = on_blas.to_device(&faer);
        println!("{b:?}");

        let on_faer = linspace((1.0, 5.0, 5, &faer));
        // the view is bound to a name because the converted tensor borrows from it
        let faer_view = on_faer.view();
        let b = faer_view.to_device(&blas);
        println!("{b:?}");
    }
}
134 changes: 134 additions & 0 deletions crates-device/rstsr-accelerate/src/creation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
use crate::prelude_dev::*;
use num::{complex::ComplexFloat, Num};

// for creation, we use most of the functions from DeviceCpuSerial
/// Storage creation on `DeviceBLAS` for arbitrary element types.
///
/// Where `DeviceCpuSerial` already provides the operation, the call is
/// forwarded to it and the resulting `Vec`-backed data is re-wrapped in a
/// `Storage` that carries a clone of this device.
impl<T> DeviceCreationAnyAPI<T> for DeviceBLAS
where
    Self: DeviceRawAPI<T, Raw = Vec<T>> + DeviceRawAPI<MaybeUninit<T>, Raw = Vec<MaybeUninit<T>>>,
{
    /// Creates storage of `len` elements through `DeviceCpuSerial::empty_impl`.
    ///
    /// # Safety
    ///
    /// Same contract as `DeviceCpuSerial::empty_impl`, to which this call is
    /// forwarded unchanged.
    unsafe fn empty_impl(&self, len: usize) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        // SAFETY: the caller upholds the contract of `empty_impl`, which is
        // passed on as-is to the serial CPU implementation.
        let storage = unsafe { DeviceCpuSerial::default().empty_impl(len) }?;
        let (data, _) = storage.into_raw_parts();
        Ok(Storage::new(data, self.clone()))
    }

    /// Creates storage of `len` elements through `DeviceCpuSerial::full_impl`,
    /// passing `fill` along.
    fn full_impl(&self, len: usize, fill: T) -> Result<Storage<DataOwned<Vec<T>>, T, Self>>
    where
        T: Clone,
    {
        let storage = DeviceCpuSerial::default().full_impl(len, fill)?;
        let (data, _) = storage.into_raw_parts();
        Ok(Storage::new(data, self.clone()))
    }

    /// Takes ownership of `vec` and wraps it as storage on this device.
    fn outof_cpu_vec(&self, vec: Vec<T>) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        Ok(Storage::new(DataOwned::from(vec), self.clone()))
    }

    /// Clones the elements of `vec` into a new vector and wraps it as storage
    /// on this device.
    fn from_cpu_vec(&self, vec: &[T]) -> Result<Storage<DataOwned<Vec<T>>, T, Self>>
    where
        T: Clone,
    {
        let raw = vec.to_vec();
        Ok(Storage::new(DataOwned::from(raw), self.clone()))
    }

    /// Creates storage of `len` `MaybeUninit<T>` elements via `uninitialized_vec`.
    fn uninit_impl(&self, len: usize) -> Result<Storage<DataOwned<Vec<MaybeUninit<T>>>, MaybeUninit<T>, Self>> {
        // NOTE(review): `uninitialized_vec` is defined elsewhere; presumably this is
        // sound because the elements are `MaybeUninit<T>` - confirm against its contract.
        let raw = unsafe { uninitialized_vec(len) }?;
        Ok(Storage::new(raw.into(), self.clone()))
    }

    /// Reinterprets `MaybeUninit<T>` storage as `T` storage on the same device,
    /// reusing the existing allocation.
    ///
    /// # Safety
    ///
    /// Every element of `storage` must have been initialized.
    unsafe fn assume_init_impl(
        storage: Storage<DataOwned<Vec<MaybeUninit<T>>>, MaybeUninit<T>, Self>,
    ) -> Result<Storage<DataOwned<Vec<T>>, T, Self>>
    where
        Self: DeviceRawAPI<MaybeUninit<T>>,
    {
        let (data, device) = storage.into_raw_parts();
        let raw: Vec<MaybeUninit<T>> = data.into_raw();
        // The layout of `Vec<A>` versus `Vec<B>` is not guaranteed to match, so the
        // vector is rebuilt from its raw parts instead of being transmuted as a whole.
        // `ManuallyDrop` keeps the original vector from freeing the buffer.
        let mut raw = core::mem::ManuallyDrop::new(raw);
        let (ptr, len, cap) = (raw.as_mut_ptr(), raw.len(), raw.capacity());
        // SAFETY: `ptr`, `len` and `cap` come from a live `Vec<MaybeUninit<T>>` that
        // is never dropped; `MaybeUninit<T>` has the same size and alignment as `T`,
        // so the allocation is valid for `Vec<T>`; the caller guarantees that all
        // `len` elements are initialized.
        let vec = unsafe { Vec::from_raw_parts(ptr.cast::<T>(), len, cap) };
        let data = vec.into();
        Ok(Storage::new(data, device))
    }
}

/// Numeric storage constructors for `DeviceBLAS`.
///
/// Each method runs the corresponding `DeviceCpuSerial` routine, discards the
/// serial device and wraps the produced data with a clone of this device.
impl<T> DeviceCreationNumAPI<T> for DeviceBLAS
where
    T: Num + Clone,
    Self: DeviceRawAPI<T, Raw = Vec<T>>,
{
    /// Forwards to `DeviceCpuSerial::zeros_impl` for `len` elements.
    fn zeros_impl(&self, len: usize) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        let (cpu_data, _) = DeviceCpuSerial::default().zeros_impl(len)?.into_raw_parts();
        Ok(Storage::new(cpu_data, self.clone()))
    }

    /// Forwards to `DeviceCpuSerial::ones_impl` for `len` elements.
    fn ones_impl(&self, len: usize) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        let (cpu_data, _) = DeviceCpuSerial::default().ones_impl(len)?.into_raw_parts();
        Ok(Storage::new(cpu_data, self.clone()))
    }

    /// Forwards to `DeviceCpuSerial::arange_int_impl` for `len` elements.
    fn arange_int_impl(&self, len: usize) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        let (cpu_data, _) = DeviceCpuSerial::default().arange_int_impl(len)?.into_raw_parts();
        Ok(Storage::new(cpu_data, self.clone()))
    }
}

/// Range construction for `DeviceBLAS`, backed by `DeviceCpuSerial`.
impl<T> DeviceCreationPartialOrdNumAPI<T> for DeviceBLAS
where
    T: Num + PartialOrd + Clone,
    Self: DeviceRawAPI<T, Raw = Vec<T>>,
{
    /// Forwards `start`, `end` and `step` to `DeviceCpuSerial::arange_impl` and
    /// wraps the produced data with a clone of this device.
    fn arange_impl(&self, start: T, end: T, step: T) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        let (cpu_data, _) = DeviceCpuSerial::default().arange_impl(start, end, step)?.into_raw_parts();
        Ok(Storage::new(cpu_data, self.clone()))
    }
}

/// `linspace` construction for `DeviceBLAS`, backed by `DeviceCpuSerial`.
impl<T> DeviceCreationComplexFloatAPI<T> for DeviceBLAS
where
    T: ComplexFloat + Clone + Send + Sync,
    Self: DeviceRawAPI<T, Raw = Vec<T>>,
{
    /// Forwards all arguments to `DeviceCpuSerial::linspace_impl` and wraps the
    /// produced data with a clone of this device.
    fn linspace_impl(&self, start: T, end: T, n: usize, endpoint: bool) -> Result<Storage<DataOwned<Vec<T>>, T, Self>> {
        let (cpu_data, _) = DeviceCpuSerial::default().linspace_impl(start, end, n, endpoint)?.into_raw_parts();
        Ok(Storage::new(cpu_data, self.clone()))
    }
}

/// Triangular-part operations for `DeviceBLAS`.
///
/// Both methods hand `raw`, `layout` and `k` straight to the `DeviceCpuSerial`
/// implementation, which works on `raw` in place.
impl<T> DeviceCreationTriAPI<T> for DeviceBLAS
where
    T: Num + Clone,
    Self: DeviceRawAPI<T, Raw = Vec<T>>,
{
    /// Forwards to `DeviceCpuSerial::tril_impl`.
    fn tril_impl<D>(&self, raw: &mut Self::Raw, layout: &Layout<D>, k: isize) -> Result<()>
    where
        D: DimAPI,
    {
        let serial = DeviceCpuSerial::default();
        serial.tril_impl(raw, layout, k)
    }

    /// Forwards to `DeviceCpuSerial::triu_impl`.
    fn triu_impl<D>(&self, raw: &mut Self::Raw, layout: &Layout<D>, k: isize) -> Result<()>
    where
        D: DimAPI,
    {
        let serial = DeviceCpuSerial::default();
        serial.triu_impl(raw, layout, k)
    }
}

#[cfg(test)]
mod test {
    use super::*;

    // Five points from 1.0 to 5.0 created on the default BLAS device must be
    // exactly 1, 2, 3, 4, 5 in the raw buffer.
    #[test]
    fn test_linspace() {
        let blas = DeviceBLAS::default();
        let tensor = linspace((1.0, 5.0, 5, &blas));
        assert_eq!(tensor.raw(), &vec![1., 2., 3., 4., 5.]);
    }
}
Loading