Expose norms functionality, add pytest unit, fix travis ci

NianhengWu · Daniël de Kok · commit 7a04721bca83 · 2019-07-08T09:10:02.000+02:00
diff --git a/.travis.yml b/.travis.yml
@@ -28,13 +28,13 @@ install:
     if [ "$TRAVIS_OS_NAME" == "osx" ]; then
       python3 -m venv venv
       source venv/bin/activate
-      pip install cffi virtualenv pytest
+      pip install cffi virtualenv pytest numpy
     fi
   - |
     if [ "$TRAVIS_OS_NAME" == "linux" ]; then
       python3.6 -m venv venv
       source venv/bin/activate
-      pip install cffi virtualenv pytest
+      pip install cffi virtualenv pytest numpy
     fi
   - cargo install pyo3-pack --vers 0.6.1
   - rustup default nightly-2019-02-07
diff --git a/src/embeddings.rs b/src/embeddings.rs
@@ -12,9 +12,12 @@ use numpy::{IntoPyArray, PyArray1, PyArray2};
 use pyo3::class::iter::PyIterProtocol;
 use pyo3::exceptions;
 use pyo3::prelude::*;
+use pyo3::types::PyTuple;
 use toml::{self, Value};
 
-use crate::{EmbeddingsWrap, PyEmbeddingIterator, PyVocab, PyWordSimilarity};
+use crate::{
+    EmbeddingsWrap, PyEmbeddingIterator, PyEmbeddingWithNormIterator, PyVocab, PyWordSimilarity,
+};
 
 /// finalfusion embeddings.
 #[pyclass(name=Embeddings)]
@@ -126,6 +129,29 @@ impl PyEmbeddings {
         }
     }
 
+    /// Look up the embedding of a word together with its norm.
+    ///
+    /// Returns a tuple of the embedding array and the norm. Raises
+    /// `KeyError` when no embedding could be found for `word`.
+    fn embedding_with_norm(&self, word: &str) -> PyResult<Py<PyTuple>> {
+        let embeddings = self.embeddings.borrow();
+
+        let lookup = match &*embeddings {
+            EmbeddingsWrap::View(e) => e.embedding_with_norm(word),
+            EmbeddingsWrap::NonView(e) => e.embedding_with_norm(word),
+        };
+
+        // Bail out on a miss first, so the GIL is only acquired for a hit.
+        let found = match lookup {
+            Some(found) => found,
+            None => return Err(exceptions::KeyError::py_err("Unknown word and n-grams")),
+        };
+
+        let gil = pyo3::Python::acquire_gil();
+        let py = gil.python();
+        let array = found.embedding.into_owned().into_pyarray(py);
+        Ok((array, found.norm).into_py(py))
+    }
+
     /// Copy the entire embeddings matrix.
     fn matrix_copy(&self) -> PyResult<Py<PyArray2<f32>>> {
         let embeddings = self.embeddings.borrow();
@@ -247,6 +273,10 @@ impl PyEmbeddings {
                 .map_err(|err| exceptions::IOError::py_err(err.to_string())),
         }
     }
+
+    /// Create an iterator over the embeddings that also yields norms.
+    ///
+    /// Iteration starts at index 0.
+    fn iter_with_norm(&self) -> PyResult<PyEmbeddingWithNormIterator> {
+        let embeddings = self.embeddings.clone();
+        let iter = PyEmbeddingWithNormIterator::new(embeddings, 0);
+        Ok(iter)
+    }
 }
 
 #[pyproto]
diff --git a/src/embeddings_wrap.rs b/src/embeddings_wrap.rs
@@ -1,3 +1,4 @@
+use finalfusion::norms::NdNorms;
 use finalfusion::prelude::*;
 
 pub enum EmbeddingsWrap {
@@ -21,4 +22,12 @@ impl EmbeddingsWrap {
             View(e) => e.vocab(),
         }
     }
+
+    /// Get the norms of the wrapped embeddings, if there are any.
+    pub fn norms(&self) -> Option<&NdNorms> {
+        match self {
+            EmbeddingsWrap::NonView(inner) => inner.norms(),
+            EmbeddingsWrap::View(inner) => inner.norms(),
+        }
+    }
 }
diff --git a/src/iter.rs b/src/iter.rs
@@ -48,3 +48,46 @@ impl PyIterProtocol for PyEmbeddingIterator {
         }
     }
 }
+
+// Python-visible iterator that walks the vocabulary and yields each word
+// with its embedding and norm.
+#[pyclass(name=EmbeddingWithNormIterator)]
+pub struct PyEmbeddingWithNormIterator {
+    // Handle to the embeddings being iterated over.
+    embeddings: Rc<RefCell<EmbeddingsWrap>>,
+    // Index of the next entry to yield.
+    idx: usize,
+}
+
+impl PyEmbeddingWithNormIterator {
+    /// Construct an iterator over `embeddings` that starts at index `idx`.
+    pub fn new(embeddings: Rc<RefCell<EmbeddingsWrap>>, idx: usize) -> Self {
+        Self { embeddings, idx }
+    }
+}
+
+#[pyproto]
+impl PyIterProtocol for PyEmbeddingWithNormIterator {
+    // The iterator is its own iterable: hand back a reference to itself.
+    fn __iter__(slf: PyRefMut<Self>) -> PyResult<Py<PyEmbeddingWithNormIterator>> {
+        Ok(slf.into())
+    }
+
+    // Yield the next `(word, embedding, norm)` triple, or `None` once every
+    // word in the vocabulary has been visited.
+    fn __next__(mut slf: PyRefMut<Self>) -> PyResult<Option<(String, Py<PyArray1<f32>>, f32)>> {
+        // Reborrow as a plain `&mut Self` so `embeddings` can stay borrowed
+        // while `idx` is updated below.
+        let this = &mut *slf;
+
+        let embeddings = this.embeddings.borrow();
+        let vocab = embeddings.vocab();
+        let idx = this.idx;
+
+        if idx >= vocab.len() {
+            return Ok(None);
+        }
+
+        let word = vocab.words()[idx].to_string();
+        let embed = embeddings.storage().embedding(idx);
+        // Embeddings without stored norms report a norm of 1.
+        let norm = embeddings.norms().map_or(1., |norms| norms.0[idx]);
+
+        this.idx = idx + 1;
+
+        let gil = pyo3::Python::acquire_gil();
+        let array = embed.into_owned().into_pyarray(gil.python()).to_owned();
+        Ok(Some((word, array, norm)))
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -9,7 +9,7 @@ mod embeddings_wrap;
 use embeddings_wrap::EmbeddingsWrap;
 
 mod iter;
-use iter::PyEmbeddingIterator;
+use iter::{PyEmbeddingIterator, PyEmbeddingWithNormIterator};
 
 mod similarity;
 use similarity::PyWordSimilarity;
diff --git a/tests/embeddings.fifu b/tests/embeddings.fifu
diff --git a/tests/embeddings.txt b/tests/embeddings.txt
@@ -0,0 +1,7 @@
+one 3.0 1.0 0.0 0.0 0.0 0.0 2.0 2.0 4.0 3.0
+two 2.0 3.0 3.0 3.0 3.0 2.0 0.0 3.0 3.0 4.0
+three 0.0 0.0 2.0 0.0 2.0 1.0 2.0 4.0 0.0 3.0
+four 1.0 4.0 4.0 2.0 4.0 2.0 4.0 1.0 3.0 1.0
+five 0.0 4.0 1.0 2.0 0.0 4.0 0.0 3.0 1.0 3.0
+six 3.0 3.0 4.0 2.0 0.0 0.0 0.0 3.0 2.0 1.0
+seven 1.0 4.0 0.0 2.0 2.0 2.0 4.0 3.0 1.0 1.0
diff --git a/tests/test_embedding.py b/tests/test_embedding.py
@@ -0,0 +1,63 @@
+import finalfusion
+import pytest
+import numpy
+
+# Euclidean (L2) norm of each embedding in tests/embeddings.txt, in file order.
+TEST_NORMS = [
+    6.557438373565674,
+    8.83176040649414,
+    6.164413928985596,
+    9.165151596069336,
+    7.4833149909973145,
+    7.211102485656738,
+    7.4833149909973145
+]
+
+
+def test_embeddings_with_norms():
+    embeds = finalfusion.Embeddings(
+        "tests/embeddings.fifu")
+    embeds_dict = dict()
+    with open("tests/embeddings.txt", "r", encoding="utf8") as lines:
+        for line in lines:
+            line_list = line.split(' ')
+            embeds_dict[line_list[0]] = [float(val) for val in line_list[1:]]
+
+    for embedding_with_norm, norm in zip(embeds.iter_with_norm(), TEST_NORMS):
+        unnormed_embed = embedding_with_norm[1] * norm
+        test_embed = embeds_dict[embedding_with_norm[0]]
+        assert numpy.allclose(
+            unnormed_embed, test_embed), "Embedding from 'iter_with_norm()' fails to match!"
+        assert len(
+            embedding_with_norm) == 3, "The number of values returned by 'iter_with_norm()' does not match!"
+
+
+def test_embeddings():
+    embeds = finalfusion.Embeddings(
+        "tests/embeddings.fifu")
+    embeds_dict = dict()
+    with open("tests/embeddings.txt", "r", encoding="utf8") as lines:
+        for line in lines:
+            line_list = line.split(' ')
+            embeds_dict[line_list[0]] = [float(i) for i in line_list[1:]]
+
+    for embedding_with_norm, norm in zip(embeds, TEST_NORMS):
+        unnormed_embed = embedding_with_norm[1] * norm
+        test_embed = embeds_dict[embedding_with_norm[0]]
+        assert numpy.allclose(
+            unnormed_embed, test_embed), "Embedding from normal iterator fails to match!"
+        assert len(
+            embedding_with_norm) == 2, "The number of values returned by normal iterator does not match!"
+
+
+def test_norms():
+    embeds = finalfusion.Embeddings(
+        "tests/embeddings.fifu")
+    embeds_dict = dict()
+    with open("tests/embeddings.txt", "r", encoding="utf8") as lines:
+        for line in lines:
+            line_list = line.split(' ')
+            embeds_dict[line_list[0]] = [float(val) for val in line_list[1:]]
+
+    for embedding_with_norm, norm in zip(embeds.iter_with_norm(), TEST_NORMS):
+        assert pytest.approx(
+            embedding_with_norm[2] == norm), "Norm fails to match!"

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+use finalfusion::norms::NdNorms;`
`1`	`2`	`use finalfusion::prelude::*;`
`2`	`3`
`3`	`4`	`pub enum EmbeddingsWrap {`
`@@ -21,4 +22,12 @@ impl EmbeddingsWrap {`
`21`	`22`	`View(e) => e.vocab(),`
`22`	`23`	`}`
`23`	`24`	`}`
	`25`	`+`
	`26`	`+ pub fn norms(&self) -> Option<&NdNorms> {`
	`27`	`+ use EmbeddingsWrap::*;`
	`28`	`+ match self {`
	`29`	`+ NonView(e) => e.norms(),`
	`30`	`+ View(e) => e.norms(),`
	`31`	`+ }`
	`32`	`+ }`
`24`	`33`	`}`