unsafe changes to reduce copies going in/out of BEAM for already allocated binaries

mortont · mortont · commit da274a90de03 · 2023-05-19T16:41:20.000-04:00
diff --git a/native/ortex/src/tensor.rs b/native/ortex/src/tensor.rs
@@ -1,5 +1,6 @@
 //! Conversions for packing/unpacking `OrtexTensor`s into different types
 use ndarray::prelude::*;
+use ndarray::Data;
 use ort::tensor::{DynOrtTensor, FromArray, InputTensor, TensorElementDataType};
 use ort::OrtError;
 use rustler::Atom;
@@ -78,88 +79,36 @@ impl OrtexTensor {
         }
     }
 
-    pub fn to_bytes(&self) -> Vec<u8> {
-        // Annoying and not DRY, Traits are probably the answer here...
-        // Once num_traits has from_<endian>_bytes we can pull that in
-        // https://github.com/rust-num/num-traits/pull/224
-        let contents = match self {
-            OrtexTensor::s8(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::s16(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::s32(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::s64(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::u8(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::u16(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::u32(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::u64(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::f16(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::bf16(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::f32(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
-            OrtexTensor::f64(y) => y
-                .clone()
-                .into_raw_vec()
-                .iter()
-                .flat_map(|f| f.to_ne_bytes().to_vec())
-                .collect(),
+    /// Returns the tensor's data as a byte slice borrowed from `self`.
+    ///
+    /// Every dtype variant is handled the same way: its array is passed to
+    /// `get_bytes`, and the resulting slice is returned for the lifetime of the
+    /// borrow of `self`.
+    pub fn to_bytes<'a>(&'a self) -> &'a [u8] {
+        let contents: &'a [u8] = match self {
+            OrtexTensor::s8(y) => get_bytes(y),
+            OrtexTensor::s16(y) => get_bytes(y),
+            OrtexTensor::s32(y) => get_bytes(y),
+            OrtexTensor::s64(y) => get_bytes(y),
+            OrtexTensor::u8(y) => get_bytes(y),
+            OrtexTensor::u16(y) => get_bytes(y),
+            OrtexTensor::u32(y) => get_bytes(y),
+            OrtexTensor::u64(y) => get_bytes(y),
+            OrtexTensor::f16(y) => get_bytes(y),
+            OrtexTensor::bf16(y) => get_bytes(y),
+            OrtexTensor::f32(y) => get_bytes(y),
+            OrtexTensor::f64(y) => get_bytes(y),
         };
         contents
     }
 }
 
+/// Borrows the tensor's element buffer as raw native-endian bytes without copying.
+///
+/// The returned slice aliases `array`'s storage and lives as long as the borrow.
+///
+/// # Panics
+///
+/// Panics if `array` is not in standard (contiguous, row-major) layout, because a
+/// flat byte view starting at `as_ptr()` would not match its logical elements.
+fn get_bytes<'a, T>(array: &'a ArrayBase<T, IxDyn>) -> &'a [u8]
+where
+    T: Data,
+{
+    assert!(
+        array.is_standard_layout(),
+        "cannot view a non-contiguous tensor as bytes"
+    );
+    // Take the element size from the type itself: this is correct for empty arrays
+    // too and avoids conjuring a zeroed dummy element just to measure it.
+    let byte_len = array.len() * std::mem::size_of::<T::Elem>();
+    // SAFETY: a standard-layout array stores exactly `len()` initialized elements
+    // contiguously from `as_ptr()`, `u8` has no alignment requirement, and the
+    // `'a` borrow of `array` keeps that storage alive for the returned slice.
+    unsafe { std::slice::from_raw_parts(array.as_ptr() as *const u8, byte_len) }
+}
+
 impl std::convert::TryFrom<&DynOrtTensor<'_, IxDyn>> for OrtexTensor {
     type Error = OrtError;
     fn try_from(e: &DynOrtTensor<IxDyn>) -> Result<OrtexTensor, Self::Error> {
diff --git a/native/ortex/src/utils.rs b/native/ortex/src/utils.rs
@@ -3,16 +3,22 @@
 
 use crate::constants::*;
 use crate::tensor::OrtexTensor;
+use ndarray::{ArrayViewMut, Ix, IxDyn};
 
-use ndarray::prelude::*;
 use ndarray::ShapeError;
 
 use rustler::resource::ResourceArc;
-use rustler::types::{Binary, OwnedBinary};
-use rustler::{Atom, Env, Error, NifResult};
+use rustler::types::Binary;
+use rustler::{Atom, Env, NifResult};
 
 use ort::{ExecutionProvider, GraphOptimizationLevel};
 
+/// Creates a zero-copy, read-only view of `shape` over the memory at `ptr`.
+///
+/// Used to reinterpret an Erlang binary's bytes as tensor elements before they
+/// are copied into an owned array.
+///
+/// The caller must guarantee that `ptr` is non-null, aligned for `T`, and valid
+/// for reads of `shape.iter().product()` elements for as long as the view is used.
+/// NOTE(review): nothing here checks the binary's length or alignment against
+/// `shape` and `T` — confirm the callers validate this.
+fn initialize_from_raw_ptr<T>(ptr: *const T, shape: &[Ix]) -> ndarray::ArrayView<T, IxDyn> {
+    // A shared view is enough because callers only copy out of it, and it avoids
+    // casting away constness to build a mutable view over immutable binary data.
+    // SAFETY: relies on the caller contract documented above.
+    unsafe { ndarray::ArrayView::from_shape_ptr(shape, ptr) }
+}
+
 /// Given a Binary term, shape, and dtype from the BEAM, constructs an OrtexTensor and
 /// returns the reference to be used as an Nx.Backend representation.
 ///
@@ -32,115 +38,42 @@ pub fn from_binary(
     dtype_str: String,
     dtype_bits: usize,
 ) -> Result<ResourceArc<OrtexTensor>, ShapeError> {
-    // TODO: make this more DRY, pull out into an impl
     match (dtype_str.as_ref(), dtype_bits) {
         ("bf", 16) => Ok(ResourceArc::new(OrtexTensor::bf16(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(2)
-                    .map(|c| half::bf16::from_ne_bytes([c[0], c[1]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const half::bf16, &shape).to_owned(),
         ))),
         ("f", 16) => Ok(ResourceArc::new(OrtexTensor::f16(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(2)
-                    .map(|c| half::f16::from_ne_bytes([c[0], c[1]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const half::f16, &shape).to_owned(),
         ))),
         ("f", 32) => Ok(ResourceArc::new(OrtexTensor::f32(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(4)
-                    .map(|c| f32::from_ne_bytes([c[0], c[1], c[2], c[3]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const f32, &shape).to_owned(),
         ))),
         ("f", 64) => Ok(ResourceArc::new(OrtexTensor::f64(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(8)
-                    .map(|c| f64::from_ne_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const f64, &shape).to_owned(),
         ))),
         ("s", 8) => Ok(ResourceArc::new(OrtexTensor::s8(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(1)
-                    .map(|c| i8::from_ne_bytes([c[0]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const i8, &shape).to_owned(),
         ))),
         ("s", 16) => Ok(ResourceArc::new(OrtexTensor::s16(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(2)
-                    .map(|c| i16::from_ne_bytes([c[0], c[1]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const i16, &shape).to_owned(),
         ))),
         ("s", 32) => Ok(ResourceArc::new(OrtexTensor::s32(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(4)
-                    .map(|c| i32::from_ne_bytes([c[0], c[1], c[2], c[3]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const i32, &shape).to_owned(),
         ))),
         ("s", 64) => Ok(ResourceArc::new(OrtexTensor::s64(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(8)
-                    .map(|c| i64::from_ne_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const i64, &shape).to_owned(),
         ))),
         ("u", 8) => Ok(ResourceArc::new(OrtexTensor::u8(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(1)
-                    .map(|c| u8::from_ne_bytes([c[0]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const u8, &shape).to_owned(),
         ))),
         ("u", 16) => Ok(ResourceArc::new(OrtexTensor::u16(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(2)
-                    .map(|c| u16::from_ne_bytes([c[0], c[1]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const u16, &shape).to_owned(),
         ))),
         ("u", 32) => Ok(ResourceArc::new(OrtexTensor::u32(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(4)
-                    .map(|c| u32::from_ne_bytes([c[0], c[1], c[2], c[3]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const u32, &shape).to_owned(),
         ))),
         ("u", 64) => Ok(ResourceArc::new(OrtexTensor::u64(
-            Array::from_vec(
-                bin.as_slice()
-                    .chunks_exact(8)
-                    .map(|c| u64::from_ne_bytes([c[0], c[1], c[2], c[3], c[4], c[5], c[6], c[7]]))
-                    .collect(),
-            )
-            .into_shape(shape)?,
+            initialize_from_raw_ptr(bin.as_ptr() as *const u64, &shape).to_owned(),
         ))),
         (&_, _) => unimplemented!(),
     }
@@ -154,12 +87,7 @@ pub fn to_binary<'a>(
     _bits: usize,
     _limit: usize,
 ) -> NifResult<Binary<'a>> {
-    // TODO: implement limit and size so we aren't dumping the entire binary on every
-    // IO.inspect call
-    let bytes = reference.to_bytes();
-    let mut bin = OwnedBinary::new(bytes.len()).ok_or(Error::Term(Box::new("Out of memory")))?;
-    bin.as_mut_slice().copy_from_slice(&bytes);
-    Ok(Binary::from_owned(bin, env))
+    Ok(reference.make_binary(env, |x| x.to_bytes()))
 }
 
 /// Takes a vec of Atoms and transforms them into a vec of ExecutionProvider Enums
diff --git a/test/dtype/dtype_test.exs b/test/dtype/dtype_test.exs
@@ -0,0 +1,80 @@
+defmodule Ortex.TestDtypes do
+  # Round-trip tests: for each dtype, the binary obtained after transferring a
+  # tensor to Ortex.Backend and back to Nx.BinaryBackend must equal the binary of
+  # the same tensor that never left its original backend.
+  use ExUnit.Case
+
+  # Fixed-seed random fixture, built once at compile time and shared by all tests.
+  {tensor, _} = Nx.Random.uniform(Nx.Random.key(42), 0, 256, shape: {100, 100})
+  @tensor tensor
+
+  # Raw binary of the fixture cast to `dtype`, without any backend transfer.
+  defp bin_binary(dtype) do
+    %{data: %{state: bin}} = @tensor |> Nx.as_type(dtype)
+    bin
+  end
+
+  # Raw binary of the fixture cast to `dtype` after a round trip through
+  # Ortex.Backend.
+  defp bin_ortex(dtype) do
+    %{data: %{state: bin}} =
+      @tensor
+      |> Nx.as_type(dtype)
+      |> Nx.backend_transfer(Ortex.Backend)
+      |> Nx.backend_transfer(Nx.BinaryBackend)
+
+    bin
+  end
+
+  # Nx.tensor(0) is a scalar: rank 0 with a single element, not an empty tensor.
+  test "rank 0 (scalar) tensor" do
+    %{data: %{state: bin1}} = Nx.tensor(0)
+
+    %{data: %{state: bin2}} =
+      Nx.tensor(0)
+      |> Nx.backend_transfer(Ortex.Backend)
+      |> Nx.backend_transfer(Nx.BinaryBackend)
+
+    assert bin1 == bin2
+  end
+
+  test "u8 conversion" do
+    assert bin_binary(:u8) == bin_ortex(:u8)
+  end
+
+  test "u16 conversion" do
+    assert bin_binary(:u16) == bin_ortex(:u16)
+  end
+
+  test "u32 conversion" do
+    assert bin_binary(:u32) == bin_ortex(:u32)
+  end
+
+  test "u64 conversion" do
+    assert bin_binary(:u64) == bin_ortex(:u64)
+  end
+
+  test "s8 conversion" do
+    assert bin_binary(:s8) == bin_ortex(:s8)
+  end
+
+  test "s16 conversion" do
+    assert bin_binary(:s16) == bin_ortex(:s16)
+  end
+
+  test "s32 conversion" do
+    assert bin_binary(:s32) == bin_ortex(:s32)
+  end
+
+  test "s64 conversion" do
+    assert bin_binary(:s64) == bin_ortex(:s64)
+  end
+
+  test "f16 conversion" do
+    assert bin_binary(:f16) == bin_ortex(:f16)
+  end
+
+  test "bf16 conversion" do
+    assert bin_binary(:bf16) == bin_ortex(:bf16)
+  end
+
+  test "f32 conversion" do
+    assert bin_binary(:f32) == bin_ortex(:f32)
+  end
+
+  test "f64 conversion" do
+    assert bin_binary(:f64) == bin_ortex(:f64)
+  end
+end
diff --git a/test/ortex_test.exs b/test/ortex_test.exs
@@ -12,14 +12,6 @@ defmodule OrtexTest do
     assert argmax == Nx.tensor([499])
   end
 
-  test "transfer to Ortex.Backend" do
-    assert true
-  end
-
-  test "transfer from Ortex.Backend" do
-    assert true
-  end
-
   test "Nx.Serving with resnet50" do
     model = Ortex.load("./models/resnet50.onnx")