Add storage module.

sebpuetz · sebpuetz · commit c6c049e2e6ae · 2019-10-07T16:12:32.000+02:00
Split storage-related methods into their own module and add
PyStorage as an interface to it. Add a shape method to PyStorage.
diff --git a/src/embeddings.rs b/src/embeddings.rs
@@ -12,14 +12,14 @@ use finalfusion::io as ffio;
 use finalfusion::prelude::*;
 use finalfusion::similarity::*;
 use itertools::Itertools;
-use ndarray::Array2;
-use numpy::{IntoPyArray, NpyDataType, PyArray1, PyArray2, ToPyArray};
+use numpy::{IntoPyArray, NpyDataType, PyArray1};
 use pyo3::class::iter::PyIterProtocol;
 use pyo3::prelude::*;
 use pyo3::types::{PyAny, PyTuple};
 use pyo3::{exceptions, PyMappingProtocol};
 use toml::{self, Value};
 
+use crate::storage::PyStorage;
 use crate::{EmbeddingsWrap, PyEmbeddingIterator, PyVocab, PyWordSimilarity};
 
 /// finalfusion embeddings.
@@ -34,24 +34,6 @@ pub struct PyEmbeddings {
     embeddings: Rc<RefCell<EmbeddingsWrap>>,
 }
 
-impl PyEmbeddings {
-    /// Copy storage to an array.
-    ///
-    /// This should only be used for storage types that do not provide
-    /// an ndarray view that can be copied trivially, such as quantized
-    /// storage.
-    fn copy_storage_to_array(storage: &dyn Storage) -> Array2<f32> {
-        let (rows, dims) = storage.shape();
-
-        let mut array = Array2::<f32>::zeros((rows, dims));
-        for idx in 0..rows {
-            array.row_mut(idx).assign(&storage.embedding(idx).as_view());
-        }
-
-        array
-    }
-}
-
 #[pymethods]
 impl PyEmbeddings {
     /// Load embeddings from the given `path`.
@@ -156,6 +138,11 @@ impl PyEmbeddings {
         Ok(PyVocab::new(self.embeddings.clone()))
     }
 
+    /// Get the model's storage.
+    fn storage(&self) -> PyStorage {
+        PyStorage::new(self.embeddings.clone())
+    }
+
     /// Perform an anology query.
     ///
     /// This returns words for the analogy query *w1* is to *w2*
@@ -222,31 +209,6 @@ impl PyEmbeddings {
         })
     }
 
-    /// Copy the entire embeddings matrix.
-    fn matrix_copy(&self) -> Py<PyArray2<f32>> {
-        let embeddings = self.embeddings.borrow();
-
-        use EmbeddingsWrap::*;
-        let gil = pyo3::Python::acquire_gil();
-        let matrix_view = match &*embeddings {
-            View(e) => e.storage().view(),
-            NonView(e) => match e.storage() {
-                StorageWrap::MmapArray(mmap) => mmap.view(),
-                StorageWrap::NdArray(array) => array.view(),
-                StorageWrap::QuantizedArray(quantized) => {
-                    let array = Self::copy_storage_to_array(quantized.as_ref());
-                    return array.to_pyarray(gil.python()).to_owned();
-                }
-                StorageWrap::MmapQuantizedArray(quantized) => {
-                    let array = Self::copy_storage_to_array(quantized);
-                    return array.to_pyarray(gil.python()).to_owned();
-                }
-            },
-        };
-
-        matrix_view.to_pyarray(gil.python()).to_owned()
-    }
-
     /// Embeddings metadata.
     #[getter]
     fn metadata(&self) -> PyResult<Option<String>> {
diff --git a/src/lib.rs b/src/lib.rs
@@ -17,6 +17,9 @@ use similarity::PyWordSimilarity;
 mod vocab;
 use vocab::PyVocab;
 
+mod storage;
+use storage::PyStorage;
+
 /// This is a Python module for using finalfusion embeddings.
 ///
 /// finalfusion is a format for word embeddings that supports words,
@@ -25,6 +28,7 @@ use vocab::PyVocab;
 fn finalfusion(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<PyEmbeddings>()?;
     m.add_class::<PyEmbedding>()?;
+    m.add_class::<PyStorage>()?;
     m.add_class::<PyWordSimilarity>()?;
     m.add_class::<PyVocab>()?;
     Ok(())
diff --git a/src/storage.rs b/src/storage.rs
@@ -0,0 +1,96 @@
+use std::cell::RefCell;
+use std::rc::Rc;
+
+use finalfusion::prelude::{Storage, StorageView, StorageWrap};
+use ndarray::Array2;
+use numpy::{PyArray1, PyArray2, ToPyArray};
+use pyo3::class::sequence::PySequenceProtocol;
+use pyo3::exceptions;
+use pyo3::prelude::*;
+
+use crate::EmbeddingsWrap;
+
+/// finalfusion storage.
+#[pyclass(name=Storage)]
+pub struct PyStorage {
+    embeddings: Rc<RefCell<EmbeddingsWrap>>,
+}
+
+impl PyStorage {
+    pub fn new(embeddings: Rc<RefCell<EmbeddingsWrap>>) -> Self {
+        PyStorage { embeddings }
+    }
+    /// Copy storage to an array.
+    ///
+    /// This should only be used for storage types that do not provide
+    /// an ndarray view that can be copied trivially, such as quantized
+    /// storage.
+    fn copy_storage_to_array(storage: &dyn Storage) -> Array2<f32> {
+        let (rows, dims) = storage.shape();
+
+        let mut array = Array2::<f32>::zeros((rows, dims));
+        for idx in 0..rows {
+            array.row_mut(idx).assign(&storage.embedding(idx).as_view());
+        }
+
+        array
+    }
+}
+
+#[pymethods]
+impl PyStorage {
+    /// Copy the entire embeddings matrix.
+    fn matrix_copy(&self) -> Py<PyArray2<f32>> {
+        let embeddings = self.embeddings.borrow();
+
+        use EmbeddingsWrap::*;
+        let gil = pyo3::Python::acquire_gil();
+        let matrix_view = match &*embeddings {
+            View(e) => e.storage().view(),
+            NonView(e) => match e.storage() {
+                StorageWrap::MmapArray(mmap) => mmap.view(),
+                StorageWrap::NdArray(array) => array.view(),
+                StorageWrap::QuantizedArray(quantized) => {
+                    let array = Self::copy_storage_to_array(quantized.as_ref());
+                    return array.to_pyarray(gil.python()).to_owned();
+                }
+                StorageWrap::MmapQuantizedArray(quantized) => {
+                    let array = Self::copy_storage_to_array(quantized);
+                    return array.to_pyarray(gil.python()).to_owned();
+                }
+            },
+        };
+
+        matrix_view.to_pyarray(gil.python()).to_owned()
+    }
+
+    /// Get the shape of the storage.
+    fn shape(&self) -> (usize, usize) {
+        let embeddings = self.embeddings.borrow();
+        embeddings.storage().shape()
+    }
+}
+
+#[pyproto]
+impl PySequenceProtocol for PyStorage {
+    fn __len__(&self) -> PyResult<usize> {
+        let embeds = self.embeddings.borrow();
+        Ok(embeds.storage().shape().0)
+    }
+
+    fn __getitem__(&self, idx: isize) -> PyResult<Py<PyArray1<f32>>> {
+        let embeds = self.embeddings.borrow();
+        let storage = embeds.storage();
+
+        if idx >= storage.shape().0 as isize || idx < 0 {
+            Err(exceptions::IndexError::py_err("list index out of range"))
+        } else {
+            let gil = Python::acquire_gil();
+            Ok(storage
+                .embedding(idx as usize)
+                .into_owned()
+                .to_pyarray(gil.python())
+                .to_owned())
+        }
+    }
+}
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py
@@ -17,17 +17,19 @@ def test_embeddings(embeddings_fifu, embeddings_text, embeddings_text_dims):
     assert len(embeddings_fifu.vocab()) == 7
     assert len(embeddings_text.vocab()) == 7
     assert len(embeddings_text_dims.vocab()) == 7
-
+    fifu_storage = embeddings_fifu.storage()
     # Check that the finalfusion embeddings have the correct dimensionality
     # The correct dimensionality of the other embedding types is asserted
     # in the pairwise comparisons below.
-    assert embeddings_fifu.matrix_copy().shape == (7, 10)
+    assert fifu_storage.shape() == (7, 10)
     
-    for embedding in embeddings_fifu:
+    for embedding, storage_row in zip(embeddings_fifu, fifu_storage):
         assert numpy.allclose(
             embedding.embedding, embeddings_text[embedding.word]), "FiFu and text embedding mismatch"
         assert numpy.allclose(
             embedding.embedding, embeddings_text_dims[embedding.word]), "FiFu and textdims embedding mismatch"
+        assert numpy.allclose(
+            embedding.embedding, storage_row), "FiFu and storage row  mismatch"
 
 
 def test_embeddings_pq(similarity_fifu, similarity_pq):