Implement compat readers and writers.

sebpuetz · sebpuetz · commit c1009a2c64ee · 2020-05-28T10:36:34.000+02:00
Implement reading and writing for word2vec binary and text(-dims).
diff --git a/src/finalfusion/_util.py b/src/finalfusion/_util.py
@@ -0,0 +1,11 @@
+# pylint: disable=missing-module-docstring
+import numpy as np
+
+from finalfusion.norms import Norms
+from finalfusion.storage import NdArray
+
+
+def _normalize_ndarray_storage(storage: NdArray) -> Norms:
+    norms = np.linalg.norm(storage, axis=1)
+    storage /= np.where(norms == 0, 1, norms)[:, None]  # all-zero rows stay zero, not NaN
+    return Norms(norms)
diff --git a/src/finalfusion/compat/__init__.py b/src/finalfusion/compat/__init__.py
@@ -0,0 +1,15 @@
+"""
+Compatibility Module for Embedding formats
+
+This module contains read and write methods for other common embedding formats such as:
+    * text(-dims)
+    * word2vec binary
+"""
+
+from finalfusion.compat.text import load_text, load_text_dims, write_text, write_text_dims
+from finalfusion.compat.word2vec import load_word2vec, write_word2vec
+
+__all__ = [
+    'load_text_dims', 'load_word2vec', 'load_text', 'write_word2vec',
+    'write_text', 'write_text_dims'
+]
diff --git a/src/finalfusion/compat/text.py b/src/finalfusion/compat/text.py
@@ -0,0 +1,160 @@
+"""
+Text based embedding formats.
+"""
+
+import re
+from os import PathLike
+from typing import Union, TextIO
+
+import numpy as np
+
+from finalfusion import Embeddings
+from finalfusion._util import _normalize_ndarray_storage
+from finalfusion.storage import NdArray
+from finalfusion.vocab import SimpleVocab
+
+_ASCII_WHITESPACE_PAT = re.compile(r'(?a)\s+')  # (?a): \s matches ascii whitespace only
+
+
+def load_text_dims(file: Union[str, bytes, int, PathLike]) -> Embeddings:
+    """
+    Read embeddings in text-dims format.
+
+    The first line contains whitespace separated rows and cols, the rest of the file contains
+    utf-8 encoded lines with whitespace separated word and vector components.
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        Path to a file with embeddings in text-dims format.
+
+    Returns
+    -------
+    embeddings : Embeddings
+        Embeddings with a SimpleVocab, NdArray storage and Norms. The storage is l2-normalized,
+        the norms of the original rows are stored in the Norms.
+    """
+    # An empty file or malformed header fails the unpacking below with a ValueError.
+    with open(file, encoding='utf-8') as inf:
+        rows, cols = map(int, inf.readline().split())
+        return _load_text(inf, rows, cols)
+
+
+def load_text(file: Union[str, bytes, int, PathLike]) -> Embeddings:
+    """
+    Read embeddings in text format.
+
+    Expects a file with utf-8 encoded lines with:
+        * word at the start of the line
+        * followed by whitespace
+        * followed by whitespace separated vector components
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        Path to a file with embeddings in text format.
+
+    Returns
+    -------
+    embeddings : Embeddings
+        Embeddings with a SimpleVocab, l2-normalized NdArray storage and the original row Norms.
+
+    Raises
+    ------
+    ValueError
+        If the file is empty.
+    """
+    with open(file, encoding='utf-8') as inf:
+        try:
+            first = next(inf)
+        except StopIteration:
+            raise ValueError("Can't read from empty embeddings file.") from None
+        cols = len(_ASCII_WHITESPACE_PAT.split(first.rstrip())) - 1
+        rows = sum(1 for _ in inf) + 1  # + 1 for the line consumed above
+        inf.seek(0)  # rewind, the pass above only determined the shape
+        return _load_text(inf, rows, cols)
+
+
+def write_text(file: Union[str, bytes, int, PathLike],
+               embeddings: Embeddings,
+               sep=" "):
+    """
+    Serialize embeddings to a text file.
+
+    If norms are present, every embedding is multiplied by its norm before it is written, i.e.
+    the embeddings are stored un-normalized.
+
+    Every line of the output holds:
+        * the word
+        * the separator `sep`
+        * the vector components, separated by a single space
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        File to write to.
+    embeddings : Embeddings
+        Embeddings to serialize.
+    sep : str
+        Separator between the word and its vector components.
+    """
+    _write_text(file, embeddings, dims=False, sep=sep)
+
+
+def write_text_dims(file: Union[str, bytes, int, PathLike],
+                    embeddings: Embeddings,
+                    sep=" "):
+    """
+    Serialize embeddings to a text-dims file.
+
+    If norms are present, every embedding is multiplied by its norm before it is written, i.e.
+    the embeddings are stored un-normalized.
+
+    The **first** line of the output holds the matrix shape as `rows cols`. Every following
+    line holds:
+        * the word
+        * the separator `sep`
+        * the vector components, separated by a single space
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        File to write to.
+    embeddings : Embeddings
+        Embeddings to serialize.
+    sep : str
+        Separator between the word and its vector components.
+    """
+    _write_text(file, embeddings, dims=True, sep=sep)
+
+
+def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
+    """Parse one `word components...` line per matrix row into l2-normalized Embeddings."""
+    vocab = []
+    data = np.zeros((rows, cols), dtype=np.float32)
+    for target, text in zip(data, file):
+        word, *components = _ASCII_WHITESPACE_PAT.split(text.rstrip())
+        vocab.append(word)
+        target[:] = components  # numpy converts the component strings to float32
+    storage = NdArray(data)
+    norms = _normalize_ndarray_storage(storage)
+    return Embeddings(storage=storage, norms=norms, vocab=SimpleVocab(vocab))
+
+
+def _write_text(file: Union[str, bytes, int, PathLike], embeddings: Embeddings,
+                dims: bool, sep=" "):
+    """Write embeddings as text lines; with `dims`, a `rows cols` line comes first."""
+    vocab = embeddings.vocab
+    # Only the first len(vocab) rows of the storage are written.
+    matrix = embeddings.storage[:len(vocab)]
+    norms = embeddings.norms
+    # The format is documented as utf-8, so don't depend on the locale's default encoding.
+    with open(file, 'w', encoding='utf-8') as outf:
+        if dims:
+            print(*matrix.shape, file=outf)
+        for idx, word in enumerate(vocab):
+            row = matrix[idx] if norms is None else matrix[idx] * norms[idx]
+            print(word, ' '.join(map(str, row)), sep=sep, file=outf)
+
+
+__all__ = ['load_text', 'load_text_dims', 'write_text', 'write_text_dims']
diff --git a/src/finalfusion/compat/word2vec.py b/src/finalfusion/compat/word2vec.py
@@ -0,0 +1,109 @@
+"""
+Word2vec binary format.
+"""
+
+import sys
+from os import PathLike
+from typing import Union, BinaryIO, AnyStr
+
+import numpy as np
+
+from finalfusion import Embeddings
+from finalfusion.io import _serialize_array_as_le
+from finalfusion.storage import NdArray
+from finalfusion._util import _normalize_ndarray_storage
+from finalfusion.vocab import SimpleVocab
+
+
+def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
+    """
+    Load embeddings from a file in word2vec binary format.
+
+    The file is expected to start with an ascii line holding rows and cols. Every entry
+    after that consists of a utf-8 encoded word, a single space and `cols` little-endian
+    single-precision floats.
+
+    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage
+    is l2-normalized and the norms of the original vectors are stored in the Norms.
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        Path to a file with embeddings in word2vec binary format.
+
+    Returns
+    -------
+    embeddings : Embeddings
+        The embeddings read from `file`.
+    """
+    with open(file, 'rb') as inf:
+        header = inf.readline().decode("ascii")
+        rows, cols = (int(dim) for dim in header.split())
+        matrix = np.zeros((rows, cols), dtype=np.float32)
+        words = []
+        for row in matrix:
+            # strip() drops the newline that terminates the preceding entry
+            words.append(_read_binary_word(inf, b' ').strip())
+            values = np.fromfile(file=inf, count=cols, dtype=np.float32)
+            # components are stored little-endian, swap them on big-endian hosts
+            row[:] = values.byteswap() if sys.byteorder == "big" else values
+    storage = NdArray(matrix)
+    norms = _normalize_ndarray_storage(storage)
+    return Embeddings(storage=storage, norms=norms, vocab=SimpleVocab(words))
+
+
+def write_word2vec(file: Union[str, bytes, int, PathLike],
+                   embeddings: Embeddings):
+    """
+    Serialize embeddings to a file in word2vec binary format.
+
+    Only the words of the vocab and the embeddings stored for them are written. Parts that
+    the w2v format can't represent, e.g. the subword matrix of embeddings with a
+    SubwordVocab, are discarded.
+
+    If norms are present, every embedding is multiplied by its norm before it is written, i.e.
+    the embeddings are stored un-normalized.
+
+    The first line of the output holds the shape as ascii encoded `rows columns`.
+
+    Each of the following entries consists of:
+        * the utf-8 encoded word
+        * a single space
+        * `cols` single-precision floating point numbers
+        * a newline
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        File to write to.
+    embeddings : Embeddings
+        Embeddings to serialize.
+    """
+    vocab = embeddings.vocab
+    norms = embeddings.norms
+    # rows beyond the vocab are not part of the output
+    matrix = embeddings.storage[:len(vocab)]
+    with open(file, 'wb') as outf:
+        # header line with the shape of the written matrix
+        outf.write(f'{matrix.shape[0]} {matrix.shape[1]}\n'.encode('ascii'))
+        for idx, word in enumerate(vocab):
+            row = matrix[idx] if norms is None else matrix[idx] * norms[idx]
+            outf.write(word.encode('utf-8') + b' ')
+            _serialize_array_as_le(outf, row)
+            outf.write(b'\n')
+
+
+def _read_binary_word(inf: BinaryIO, delim: AnyStr):
+    """Read bytes from `inf` up to and including `delim`; return the preceding bytes as str."""
+    buffer = bytearray()
+    byte = inf.read(1)
+    while byte != delim:
+        if byte == b'':
+            raise EOFError
+        buffer += byte
+        byte = inf.read(1)
+    return buffer.decode('utf-8')
+
+
+__all__ = ['load_word2vec', 'write_word2vec']
diff --git a/src/finalfusion/vocab/vocab.py b/src/finalfusion/vocab/vocab.py
@@ -3,7 +3,7 @@
 """
 import abc
 import struct
-from typing import List, Optional, Dict, Tuple, BinaryIO, Iterable, Any, Union, Sequence
+from typing import List, Optional, Dict, Tuple, BinaryIO, Iterable, Any, Union, Sequence, Iterator
 
 from finalfusion.io import Chunk, _read_required_binary, _write_binary
 
@@ -82,7 +82,7 @@ def __contains__(self, item: Any) -> bool:
             return all(w in self for w in item)
         return False
 
-    def __iter__(self) -> Iterable[str]:
+    def __iter__(self) -> Iterator[str]:
         return iter(self.words)
 
     def __len__(self) -> int:
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -6,6 +6,7 @@
 
 import finalfusion
 import finalfusion.vocab
+import finalfusion.compat
 
 
 @pytest.fixture
@@ -40,3 +41,21 @@ def embeddings_fifu(tests_root):
 @pytest.fixture
 def bucket_vocab_embeddings_fifu(tests_root):
     yield finalfusion.load_finalfusion(tests_root / "data" / "ff_buckets.fifu")
+
+
+@pytest.fixture
+def embeddings_text(tests_root):
+    """Embeddings loaded from the text format test file."""
+    yield finalfusion.compat.load_text(tests_root / "data" / "embeddings.txt")
+
+
+@pytest.fixture
+def embeddings_text_dims(tests_root):
+    """Embeddings loaded from the text-dims format test file."""
+    yield finalfusion.compat.load_text_dims(tests_root / "data" / "embeddings.dims.txt")
+
+
+@pytest.fixture
+def embeddings_w2v(tests_root):
+    """Embeddings loaded from the word2vec binary test file."""
+    yield finalfusion.compat.load_word2vec(tests_root / "data" / "embeddings.w2v")
diff --git a/tests/data/embeddings.dims.txt b/tests/data/embeddings.dims.txt
@@ -0,0 +1,8 @@
+7 10
+one 3.0 1.0 0.0 0.0 0.0 0.0 2.0 2.0 4.0 3.0
+two 2.0 3.0 3.0 3.0 3.0 2.0 0.0 3.0 3.0 4.0
+three 0.0 0.0 2.0 0.0 2.0 1.0 2.0 4.0 0.0 3.0
+four 1.0 4.0 4.0 2.0 4.0 2.0 4.0 1.0 3.0 1.0
+five 0.0 4.0 1.0 2.0 0.0 4.0 0.0 3.0 1.0 3.0
+six 3.0 3.0 4.0 2.0 0.0 0.0 0.0 3.0 2.0 1.0
+seven 1.0 4.0 0.0 2.0 2.0 2.0 4.0 3.0 1.0 1.0
diff --git a/tests/data/embeddings.w2v b/tests/data/embeddings.w2v
diff --git a/tests/test_compat.py b/tests/test_compat.py
diff --git a/tests/test_embeddings.py b/tests/test_embeddings.py