Implement ExplicitVocab.

sebpuetz · sebpuetz · commit ea1992f1fb43 · 2020-05-26T11:28:32.000+02:00
diff --git a/src/finalfusion/embeddings.py b/src/finalfusion/embeddings.py
@@ -11,7 +11,8 @@
 from finalfusion.metadata import Metadata
 from finalfusion.norms import Norms
 from finalfusion.storage import Storage, NdArray
-from finalfusion.vocab import Vocab, SimpleVocab, FinalfusionBucketVocab, FastTextVocab
+from finalfusion.vocab import Vocab, SimpleVocab, FinalfusionBucketVocab, FastTextVocab, \
+    ExplicitVocab
 
 
 class Embeddings:  # pylint: disable=too-many-instance-attributes
@@ -40,6 +41,7 @@ class Embeddings:  # pylint: disable=too-many-instance-attributes
         * :class:`~finalfusion.vocab.simple_vocab.SimpleVocab`,
         * :class:`~finalfusion.vocab.subword.FinalfusionBucketVocab`
         * :class:`~finalfusion.vocab.subword.FastTextVocab`
+        * :class:`~finalfusion.vocab.subword.ExplicitVocab`
     3. :class:`~finalfusion.metadata.Metadata`
     4. :class:`~finalfusion.norms.Norms`
 
@@ -463,6 +465,8 @@ def load_finalfusion(file: Union[str, bytes, int, PathLike],
             vocab = FinalfusionBucketVocab.read_chunk(inf)
         elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
             vocab = FastTextVocab.read_chunk(inf)
+        elif chunk_id == ChunkIdentifier.ExplicitSubwordVocab:
+            vocab = ExplicitVocab.read_chunk(inf)
         else:
             raise FinalfusionFormatError(
                 f'Expected vocab chunk, not {str(chunk_id)}')
diff --git a/src/finalfusion/subword/explicit_indexer.pyx b/src/finalfusion/subword/explicit_indexer.pyx
@@ -207,7 +207,6 @@ cdef class ExplicitIndexer:
                f"\tmin_n={self.min_n},\n" \
                f"\tmax_n={self.max_n},\n" \
                "\tngrams=[...],\n" \
-               "\tngram_index={{...}}\n" \
-               ")"
+               "\tngram_index={{...}})"
 
 __all__ = ['ExplicitIndexer']
diff --git a/src/finalfusion/vocab/__init__.py b/src/finalfusion/vocab/__init__.py
@@ -7,7 +7,7 @@
 from finalfusion.io import ChunkIdentifier, find_chunk
 from finalfusion.vocab.simple_vocab import SimpleVocab, load_simple_vocab
 from finalfusion.vocab.subword import FinalfusionBucketVocab, load_finalfusion_bucket_vocab, \
-    FastTextVocab, load_fasttext_vocab
+    FastTextVocab, load_fasttext_vocab, ExplicitVocab, load_explicit_vocab
 from finalfusion.vocab.vocab import Vocab
 
 
@@ -17,14 +17,20 @@ def load_vocab(file: Union[str, bytes, int, PathLike]) -> Vocab:
 
     Loads the first known vocabulary from a finalfusion file.
 
+    One of:
+        * :class:`~finalfusion.vocab.simple_vocab.SimpleVocab`,
+        * :class:`~finalfusion.vocab.subword.FinalfusionBucketVocab`
+        * :class:`~finalfusion.vocab.subword.FastTextVocab`
+        * :class:`~finalfusion.vocab.subword.ExplicitVocab`
+
     Parameters
     ----------
     file: str, bytes, int, PathLike
         Path to file containing a finalfusion vocab chunk.
 
     Returns
     -------
-    vocab : Union[SimpleVocab, FinalfusionBucketVocab]
+    vocab : Vocab
         First vocabulary in the file.
 
     Raises
@@ -46,16 +52,13 @@ def load_vocab(file: Union[str, bytes, int, PathLike]) -> Vocab:
             return FinalfusionBucketVocab.read_chunk(inf)
         if chunk == ChunkIdentifier.FastTextSubwordVocab:
             return FastTextVocab.read_chunk(inf)
-        raise NotImplementedError('Vocab type is not yet supported.')
+        if chunk == ChunkIdentifier.ExplicitSubwordVocab:
+            return ExplicitVocab.read_chunk(inf)
+        raise ValueError(f'Unexpected chunk type {chunk}.')
 
 
 __all__ = [
-    'Vocab',
-    'load_vocab',
-    'SimpleVocab',
-    'load_simple_vocab',
-    'FinalfusionBucketVocab',
-    'load_finalfusion_bucket_vocab',
-    'FastTextVocab',
-    'load_fasttext_vocab',
+    'Vocab', 'load_vocab', 'SimpleVocab', 'load_simple_vocab',
+    'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab', 'FastTextVocab',
+    'load_fasttext_vocab', 'ExplicitVocab', 'load_explicit_vocab'
 ]
diff --git a/src/finalfusion/vocab/subword.py b/src/finalfusion/vocab/subword.py
@@ -10,7 +10,7 @@
 from finalfusion.io import ChunkIdentifier, find_chunk, _write_binary, _read_required_binary
 from finalfusion.subword import ExplicitIndexer, FastTextIndexer, FinalfusionHashIndexer, ngrams
 from finalfusion.vocab.vocab import Vocab, _validate_items_and_create_index, \
-    _calculate_binary_list_size, _write_words_binary, _read_items
+    _calculate_binary_list_size, _write_words_binary, _read_items, _read_items_with_indices
 
 
 class SubwordVocab(Vocab):
@@ -133,8 +133,7 @@ def __repr__(self) -> str:
         return f"{type(self).__name__}(\n" \
                f"\tindexer={self.subword_indexer}\n" \
                "\twords=[...]\n" \
-               "\tword_index={{...}}\n" \
-               ")"
+               "\tword_index={{...}})"
 
     def __eq__(self, other: Any) -> bool:
         return isinstance(other, type(self)) and \
@@ -272,6 +271,84 @@ def chunk_identifier():
         return ChunkIdentifier.FastTextSubwordVocab
 
 
+class ExplicitVocab(SubwordVocab):
+    """
+    A vocabulary with explicitly stored n-grams.
+    """
+    def __init__(self, words: List[str], indexer: ExplicitIndexer):
+        """
+        Initialize an ExplicitVocab.
+
+        Initializes the vocabulary with the given words and ExplicitIndexer.
+
+        The word list cannot contain duplicate entries.
+
+        Parameters
+        ----------
+        words : List[str]
+            List of unique words
+        indexer : ExplicitIndexer
+            Subword indexer to use for the vocabulary.
+
+        Raises
+        ------
+        AssertionError
+            If the indexer is not an ExplicitIndexer.
+
+        See Also
+        --------
+        :class:`.ExplicitIndexer`
+        """
+        assert isinstance(indexer, ExplicitIndexer)
+        super().__init__()
+        self._index = _validate_items_and_create_index(words)
+        self._words = words
+        self._indexer = indexer
+
+    @property
+    def word_index(self) -> dict:
+        return self._index
+
+    @property
+    def subword_indexer(self) -> ExplicitIndexer:
+        return self._indexer
+
+    @property
+    def words(self) -> list:
+        return self._words
+
+    @staticmethod
+    def chunk_identifier():
+        return ChunkIdentifier.ExplicitSubwordVocab
+
+    @staticmethod
+    def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
+        length, ngram_length, min_n, max_n = _read_required_binary(
+            file, "<QQII")
+        words = _read_items(file, length)
+        ngram_list, ngram_index = _read_items_with_indices(file, ngram_length)
+        indexer = ExplicitIndexer(ngram_list, min_n, max_n, ngram_index)
+        return ExplicitVocab(words, indexer)
+
+    def write_chunk(self, file) -> None:
+        chunk_length = _calculate_binary_list_size(self.words)
+        chunk_length += _calculate_binary_list_size(
+            self.subword_indexer.ngrams)
+        min_n_max_n_size = struct.calcsize("<II")
+        chunk_length += min_n_max_n_size
+        chunk_header = (int(self.chunk_identifier()), chunk_length,
+                        len(self.words), len(self.subword_indexer.ngrams),
+                        self.min_n, self.max_n)
+        _write_binary(file, "<IQQQII", *chunk_header)
+        _write_words_binary((bytes(word, "utf-8") for word in self.words),
+                            file)
+        for ngram in self.subword_indexer.ngrams:
+            b_ngram = ngram.encode("utf-8")
+            _write_binary(file, "<I", len(b_ngram))
+            file.write(b_ngram)
+            _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])
+
+
 def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
                                   ) -> FinalfusionBucketVocab:
     """
@@ -316,6 +393,28 @@ def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
         return FastTextVocab.read_chunk(inf)
 
 
+def load_explicit_vocab(file: Union[str, bytes, int, PathLike]
+                        ) -> ExplicitVocab:
+    """
+    Load an ExplicitVocab from the given finalfusion file.
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        Path to file containing an ExplicitVocab chunk.
+
+    Returns
+    -------
+    vocab : ExplicitVocab
+        Returns the first ExplicitVocab in the file.
+    """
+    with open(file, "rb") as inf:
+        chunk = find_chunk(inf, [ChunkIdentifier.ExplicitSubwordVocab])
+        if chunk is None:
+            raise ValueError('File did not contain an ExplicitVocab')
+        return ExplicitVocab.read_chunk(inf)
+
+
 def _write_bucket_vocab(file: BinaryIO,
                         vocab: Union[FastTextVocab, FinalfusionBucketVocab]):
     min_n_max_n_size = struct.calcsize("<II")
@@ -339,5 +438,6 @@ def _write_bucket_vocab(file: BinaryIO,
 
 __all__ = [
     'SubwordVocab', 'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab',
-    'FastTextVocab', 'load_fasttext_vocab'
+    'FastTextVocab', 'load_fasttext_vocab', 'ExplicitVocab',
+    'load_explicit_vocab'
 ]
diff --git a/src/finalfusion/vocab/vocab.py b/src/finalfusion/vocab/vocab.py
@@ -131,6 +131,34 @@ def _read_items(file: BinaryIO, length: int) -> List[str]:
     return items
 
 
+def _read_items_with_indices(file: BinaryIO,
+                             length: int) -> Tuple[List[str], Dict[str, int]]:
+    """
+    Helper method to read items and their stored indices from a vocabulary chunk.
+
+    Parameters
+    ----------
+    file : BinaryIO
+        input file
+    length : int
+        number of items to read
+
+    Returns
+    -------
+    items, index : Tuple[List[str], Dict[str, int]]
+        The items in read order and a dict mapping each item to its stored index
+    """
+    items = []
+    index = dict()
+    for _ in range(length):
+        item_length = _read_required_binary(file, "<I")[0]
+        item = file.read(item_length).decode("utf-8")
+        idx = _read_required_binary(file, "<Q")[0]
+        items.append(item)
+        index[item] = idx
+    return items, index
+
+
 def _calculate_binary_list_size(items: List[str]):
     size = sum(len(bytes(item, "utf-8")) for item in items)
     size += struct.calcsize("<Q")
diff --git a/tests/test_subwords.py b/tests/test_subwords.py
@@ -81,8 +81,7 @@ def test_explicit():
                                "\tmin_n=3,\n" \
                                "\tmax_n=6,\n" \
                                "\tngrams=[...],\n" \
-                               "\tngram_index={{...}}\n" \
-                               ")"
+                               "\tngram_index={{...}})"
     assert indexer["0"] == 0
     assert indexer.ngrams[0] == "0"
     assert indexer("0") == 0
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
@@ -2,8 +2,8 @@
 import finalfusion.vocab
 
 from finalfusion.io import FinalfusionFormatError
-from finalfusion.subword import FinalfusionHashIndexer, FastTextIndexer
-from finalfusion.vocab import FinalfusionBucketVocab, SimpleVocab, load_vocab, FastTextVocab
+from finalfusion.subword import FinalfusionHashIndexer, FastTextIndexer, ExplicitIndexer
+from finalfusion.vocab import FinalfusionBucketVocab, SimpleVocab, load_vocab, FastTextVocab, ExplicitVocab
 
 
 def test_reading(tests_root):
@@ -86,8 +86,7 @@ def test_fifu_buckets_constructor():
     assert repr(v) == f"FinalfusionBucketVocab(\n" \
                       f"\tindexer={repr(v.subword_indexer)}\n" \
                       "\twords=[...]\n" \
-                      "\tword_index={{...}}\n" \
-                      ")"
+                      "\tword_index={{...}})"
 
 
 def test_fasttext_constructor():
@@ -106,8 +105,7 @@ def test_fasttext_constructor():
     assert repr(v) == f"FastTextVocab(\n" \
                       f"\tindexer={repr(v.subword_indexer)}\n" \
                       "\twords=[...]\n" \
-                      "\tword_index={{...}}\n" \
-                      ")"
+                      "\tword_index={{...}})"
 
 
 def test_fasttext_vocab_roundtrip(tmp_path):
@@ -118,6 +116,33 @@ def test_fasttext_vocab_roundtrip(tmp_path):
     assert v == v2
 
 
+def test_explicit_constructor():
+    i = ExplicitIndexer([str(i) for i in range(10)])
+    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=i)
+    assert [v[str(i)] for i in range(10, 100)] == [i for i in range(90)]
+    with pytest.raises(AssertionError):
+        _ = ExplicitVocab(v.words, FinalfusionHashIndexer(21))
+    assert len(v) == 90
+    assert v.upper_bound == len(v) + 10
+    assert v == v
+    assert v in v
+    assert v != SimpleVocab(v.words)
+    assert v != FastTextVocab(v.words, FastTextIndexer(20))
+    assert repr(v) == f"ExplicitVocab(\n" \
+                      f"\tindexer={repr(v.subword_indexer)}\n" \
+                      "\twords=[...]\n" \
+                      "\tword_index={{...}})"
+
+
+def test_explicit_vocab_roundtrip(tmp_path):
+    filename = tmp_path / "write_explicit_vocab.fifu"
+    i = ExplicitIndexer([str(i) for i in range(10)])
+    v = ExplicitVocab([str(i) for i in range(10, 100)], indexer=i)
+    v.write(filename)
+    v2 = load_vocab(filename)
+    assert v == v2
+
+
 def test_fifu_buckets_roundtrip(tests_root, tmp_path):
     filename = tmp_path / "write_ff_buckets.fifu"
     v = load_vocab(tests_root / "data" / "ff_buckets.fifu")