Add the static read_{text,text_dims,word2vec} methods

danieldk · Daniël de Kok · commit d0da7fe1639f · 2019-07-30T17:39:16.000+02:00
diff --git a/src/embeddings.rs b/src/embeddings.rs
@@ -4,9 +4,12 @@ use std::io::{BufReader, BufWriter};
 use std::rc::Rc;
 
 use failure::Error;
+use finalfusion::io as ffio;
 use finalfusion::metadata::Metadata;
 use finalfusion::prelude::*;
 use finalfusion::similarity::*;
+use finalfusion::text::{ReadText, ReadTextDims};
+use finalfusion::word2vec::ReadWord2Vec;
 use itertools::Itertools;
 use ndarray::Array2;
 use numpy::{IntoPyArray, PyArray1, PyArray2};
@@ -44,9 +47,9 @@ impl PyEmbeddings {
         // First try to load embeddings with viewable storage. If that
         // fails, attempt to load the embeddings as non-viewable
         // storage.
-        let embeddings = match load_embeddings(path, mmap) {
+        let embeddings = match read_embeddings(path, mmap) {
             Ok(e) => Rc::new(RefCell::new(EmbeddingsWrap::View(e))),
-            Err(_) => load_embeddings(path, mmap)
+            Err(_) => read_embeddings(path, mmap)
                 .map(|e| Rc::new(RefCell::new(EmbeddingsWrap::NonView(e))))
                 .map_err(|err| exceptions::IOError::py_err(err.to_string()))?,
         };
@@ -56,6 +59,42 @@ impl PyEmbeddings {
         Ok(())
     }
 
+    /// read_text(path,/)
+    /// --
+    ///
+    /// Read embeddings in text format, which uses one line per embedding:
+    /// the word in UTF-8 followed by its vector components encoded in
+    /// ASCII, all separated by spaces. Raises `IOError` if the file
+    /// cannot be opened or parsed.
+    #[staticmethod]
+    fn read_text(path: &str) -> PyResult<PyEmbeddings> {
+        read_non_fifu_embeddings(path, |r| Embeddings::read_text(r))
+    }
+
+    /// read_text_dims(path,/)
+    /// --
+    ///
+    /// Read embeddings in text format with dimensions. The first line
+    /// states the shape of the embedding matrix: the number of rows
+    /// (words) and columns (embedding dimensionality), separated by a
+    /// space. Every following line holds one embedding: the word in
+    /// UTF-8, followed by its vector components encoded in ASCII, all
+    /// separated by spaces. Raises `IOError` if the file cannot be
+    /// opened or parsed.
+    #[staticmethod]
+    fn read_text_dims(path: &str) -> PyResult<PyEmbeddings> {
+        read_non_fifu_embeddings(path, |r| Embeddings::read_text_dims(r))
+    }
+
+    /// read_word2vec(path,/)
+    /// --
+    ///
+    /// Read embeddings in the word2vec binary format.
+    #[staticmethod]
+    fn read_word2vec(path: &str) -> PyResult<PyEmbeddings> {
+        read_non_fifu_embeddings(path, |r| Embeddings::read_word2vec_binary(r))
+    }
+
     /// Get the model's vocabulary.
     fn vocab(&self) -> PyResult<PyVocab> {
         Ok(PyVocab::new(self.embeddings.clone()))
@@ -283,7 +322,7 @@ impl PyIterProtocol for PyEmbeddings {
     }
 }
 
-fn load_embeddings<S>(path: &str, mmap: bool) -> Result<Embeddings<VocabWrap, S>, Error>
+fn read_embeddings<S>(path: &str, mmap: bool) -> Result<Embeddings<VocabWrap, S>, Error>
 where
     Embeddings<VocabWrap, S>: ReadEmbeddings + MmapEmbeddings,
 {
@@ -298,3 +337,27 @@ where
 
     Ok(embeddings)
 }
+
+/// Read embeddings in a non-finalfusion format from the file at `path`.
+///
+/// `read_embeddings` does the format-specific parsing on a buffered reader
+/// over the opened file. A failure to open or to parse the file is returned
+/// as a Python `IOError` whose message contains the path and the cause.
+fn read_non_fifu_embeddings<R>(path: &str, read_embeddings: R) -> PyResult<PyEmbeddings>
+where
+    R: FnOnce(&mut BufReader<File>) -> ffio::Result<Embeddings<SimpleVocab, NdArray>>,
+{
+    let f = File::open(path).map_err(|err| {
+        exceptions::IOError::py_err(format!("Cannot read embeddings from '{}': {}", path, err))
+    })?;
+    let mut reader = BufReader::new(f);
+
+    let embeddings = read_embeddings(&mut reader).map_err(|err| {
+        exceptions::IOError::py_err(format!("Cannot read embeddings from '{}': {}", path, err))
+    })?;
+
+    // Stored in the `View` variant, as for embeddings loaded with viewable storage.
+    Ok(PyEmbeddings {
+        embeddings: Rc::new(RefCell::new(EmbeddingsWrap::View(embeddings.into()))),
+    })
+}
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -17,22 +17,19 @@ def embeddings_fifu(tests_root):
 
 @pytest.fixture
 def embeddings_text(tests_root):
-    embeds = dict()
-
-    with open(os.path.join(tests_root, "embeddings.txt"), "r", encoding="utf8") as lines:
-        for line in lines:
-            line_list = line.split(' ')
-            embeds[line_list[0]] = numpy.array(
-                [float(c) for c in line_list[1:]])
-
-    yield embeds
+    yield finalfusion.Embeddings.read_text(os.path.join(tests_root, "embeddings.txt"))
 
 
 @pytest.fixture
 def similarity_fifu(tests_root):
     yield finalfusion.Embeddings(os.path.join(tests_root, "similarity.fifu"))
 
 
+@pytest.fixture
+def embeddings_text_dims(tests_root):
+    yield finalfusion.Embeddings.read_text_dims(os.path.join(tests_root, "embeddings.dims.txt"))
+
+
 @pytest.fixture
 def tests_root():
     yield os.path.dirname(__file__)
diff --git a/tests/embeddings.dims.txt b/tests/embeddings.dims.txt
@@ -0,0 +1,8 @@
+7 10
+one 3.0 1.0 0.0 0.0 0.0 0.0 2.0 2.0 4.0 3.0
+two 2.0 3.0 3.0 3.0 3.0 2.0 0.0 3.0 3.0 4.0
+three 0.0 0.0 2.0 0.0 2.0 1.0 2.0 4.0 0.0 3.0
+four 1.0 4.0 4.0 2.0 4.0 2.0 4.0 1.0 3.0 1.0
+five 0.0 4.0 1.0 2.0 0.0 4.0 0.0 3.0 1.0 3.0
+six 3.0 3.0 4.0 2.0 0.0 0.0 0.0 3.0 2.0 1.0
+seven 1.0 4.0 0.0 2.0 2.0 2.0 4.0 3.0 1.0 1.0
diff --git a/tests/test_embedding.py b/tests/test_embedding.py
@@ -12,13 +12,22 @@
 ]
 
 
-def test_embeddings(embeddings_fifu, embeddings_text):
-    for embedding, norm in zip(
-            embeddings_fifu, TEST_NORMS):
-        unnormed_embed = embedding.embedding * embedding.norm
-        test_embed = embeddings_text[embedding.word]
+def test_embeddings(embeddings_fifu, embeddings_text, embeddings_text_dims):
+    # Check that we cover all words from all embeddings below.
+    assert len(embeddings_fifu.vocab()) == 7
+    assert len(embeddings_text.vocab()) == 7
+    assert len(embeddings_text_dims.vocab()) == 7
+
+    # Check that the finalfusion embeddings have the correct dimensionality
+    # The correct dimensionality of the other embedding types is asserted
+    # in the pairwise comparisons below.
+    assert embeddings_fifu.matrix_copy().shape == (7, 10)
+    
+    for embedding in embeddings_fifu:
+        assert numpy.allclose(
+            embedding.embedding, embeddings_text[embedding.word]), "FiFu and text embedding mismatch"
         assert numpy.allclose(
-            unnormed_embed, test_embed), "Embedding from 'iter_with_norm()' fails to match!"
+            embedding.embedding, embeddings_text_dims[embedding.word]), "FiFu and textdims embedding mismatch"
 
 
 def test_embeddings_with_norms_oov(embeddings_fifu):