Implement FastTextVocab.

sebpuetz · sebpuetz · commit b7237978b863 · 2020-05-26T09:41:56.000+02:00
diff --git a/src/finalfusion/embeddings.py b/src/finalfusion/embeddings.py
@@ -11,7 +11,7 @@
 from finalfusion.metadata import Metadata
 from finalfusion.norms import Norms
 from finalfusion.storage import Storage, NdArray
-from finalfusion.vocab import Vocab, SimpleVocab, FinalfusionBucketVocab
+from finalfusion.vocab import Vocab, SimpleVocab, FinalfusionBucketVocab, FastTextVocab
 
 
 class Embeddings:  # pylint: disable=too-many-instance-attributes
@@ -38,7 +38,8 @@ class Embeddings:  # pylint: disable=too-many-instance-attributes
         * :class:`~finalfusion.storage.ndarray.NdArray`
     2. :class:`~finalfusion.vocab.Vocab` *(required)*:
         * :class:`~finalfusion.vocab.simple_vocab.SimpleVocab`,
-          :class:`~finalfusion.vocab.subword.FinalfusionBucketVocab`
+        * :class:`~finalfusion.vocab.subword.FinalfusionBucketVocab`
+        * :class:`~finalfusion.vocab.subword.FastTextVocab`
     3. :class:`~finalfusion.metadata.Metadata`
     4. :class:`~finalfusion.norms.Norms`
 
@@ -460,6 +461,8 @@ def load_finalfusion(file: Union[str, bytes, int, PathLike],
             vocab = SimpleVocab.read_chunk(inf)  # type: Vocab
         elif chunk_id == ChunkIdentifier.BucketSubwordVocab:
             vocab = FinalfusionBucketVocab.read_chunk(inf)
+        elif chunk_id == ChunkIdentifier.FastTextSubwordVocab:
+            vocab = FastTextVocab.read_chunk(inf)
         else:
             raise FinalfusionFormatError(
                 f'Expected vocab chunk, not {str(chunk_id)}')
diff --git a/src/finalfusion/vocab/__init__.py b/src/finalfusion/vocab/__init__.py
@@ -6,7 +6,8 @@
 
 from finalfusion.io import ChunkIdentifier, find_chunk
 from finalfusion.vocab.simple_vocab import SimpleVocab, load_simple_vocab
-from finalfusion.vocab.subword import FinalfusionBucketVocab, load_finalfusion_bucket_vocab
+from finalfusion.vocab.subword import FinalfusionBucketVocab, load_finalfusion_bucket_vocab, \
+    FastTextVocab, load_fasttext_vocab
 from finalfusion.vocab.vocab import Vocab
 
 
@@ -43,10 +44,18 @@ def load_vocab(file: Union[str, bytes, int, PathLike]) -> Vocab:
             return SimpleVocab.read_chunk(inf)
         if chunk == ChunkIdentifier.BucketSubwordVocab:
             return FinalfusionBucketVocab.read_chunk(inf)
+        if chunk == ChunkIdentifier.FastTextSubwordVocab:
+            return FastTextVocab.read_chunk(inf)
         raise NotImplementedError('Vocab type is not yet supported.')
 
 
 __all__ = [
-    'Vocab', 'load_vocab', 'SimpleVocab', 'load_simple_vocab',
-    'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab'
+    'Vocab',
+    'load_vocab',
+    'SimpleVocab',
+    'load_simple_vocab',
+    'FinalfusionBucketVocab',
+    'load_finalfusion_bucket_vocab',
+    'FastTextVocab',
+    'load_fasttext_vocab',
 ]
diff --git a/src/finalfusion/vocab/subword.py b/src/finalfusion/vocab/subword.py
@@ -148,19 +148,16 @@ class FinalfusionBucketVocab(SubwordVocab):
     """
     def __init__(self,
                  words: List[str],
-                 indexer: FinalfusionHashIndexer = None):
+                 indexer: Optional[FinalfusionHashIndexer] = None):
         """
         Initialize a FinalfusionBucketVocab.
 
-        Initializes the vocabulary with the given words and optional index and
-        indexer.
+        Initializes the vocabulary with the given words.
 
         If no indexer is passed, a FinalfusionHashIndexer with bucket exponent
         21 is used.
 
-        If no index is given, the nth word in the `words` list is assigned
-        index `n`. The word list cannot contain duplicate entries and it needs
-        to be of same length as the index.
+        The word list cannot contain duplicate entries.
 
         Parameters
         ----------
@@ -211,6 +208,70 @@ def chunk_identifier() -> ChunkIdentifier:
         return ChunkIdentifier.BucketSubwordVocab
 
 
+class FastTextVocab(SubwordVocab):
+    """
+    Vocabulary pairing a list of unique words with a FastTextIndexer.
+    """
+    def __init__(self,
+                 words: List[str],
+                 indexer: Optional[FastTextIndexer] = None):
+        """
+        Initialize a FastTextVocab.
+
+        Initializes the vocabulary with the given words.
+
+        If no indexer is passed, a FastTextIndexer with 2_000_000 buckets is used.
+
+        The word list cannot contain duplicate entries.
+
+        Parameters
+        ----------
+        words : List[str]
+            List of unique words
+        indexer : FastTextIndexer, optional
+            Subword indexer to use for the vocabulary. Defaults to an indexer
+            with 2_000_000 buckets and range 3-6.
+
+        Raises
+        ------
+        AssertionError
+            If the indexer is not a FastTextIndexer or ``words`` contains duplicate entries.
+        """
+        if indexer is None:
+            indexer = FastTextIndexer(2000000)  # documented default bucket count
+        assert isinstance(indexer, FastTextIndexer)
+        super().__init__()
+        self._index = _validate_items_and_create_index(words)
+        self._words = words
+        self._indexer = indexer
+
+    @property
+    def subword_indexer(self) -> FastTextIndexer:
+        return self._indexer
+
+    @property
+    def words(self) -> List[str]:
+        return self._words
+
+    @property
+    def word_index(self) -> Dict[str, int]:
+        return self._index
+
+    @staticmethod
+    def read_chunk(file: BinaryIO) -> 'FastTextVocab':
+        length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
+        words = _read_items(file, length)
+        indexer = FastTextIndexer(buckets, min_n, max_n)
+        return FastTextVocab(words, indexer)
+
+    def write_chunk(self, file: BinaryIO) -> None:
+        _write_bucket_vocab(file, self)
+
+    @staticmethod
+    def chunk_identifier() -> ChunkIdentifier:
+        return ChunkIdentifier.FastTextSubwordVocab
+
+
+
 def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
                                   ) -> FinalfusionBucketVocab:
     """
@@ -233,7 +294,30 @@ def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
         return FinalfusionBucketVocab.read_chunk(inf)
 
 
-def _write_bucket_vocab(file: BinaryIO, vocab: FinalfusionBucketVocab):
+def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
+                        ) -> FastTextVocab:
+    """
+    Load a FastTextVocab from the given finalfusion file.
+
+    Parameters
+    ----------
+    file : str, bytes, int, PathLike
+        Path to file containing a FastTextVocab chunk.
+
+    Returns
+    -------
+    vocab : FastTextVocab
+        Returns the first FastTextVocab in the file.
+    """
+    with open(file, "rb") as inf:
+        chunk = find_chunk(inf, [ChunkIdentifier.FastTextSubwordVocab])
+        if chunk is None:
+            raise ValueError('File did not contain a FastTextVocab')
+        return FastTextVocab.read_chunk(inf)
+
+
+def _write_bucket_vocab(file: BinaryIO,
+                        vocab: Union[FastTextVocab, FinalfusionBucketVocab]):
     min_n_max_n_size = struct.calcsize("<II")
     buckets_size = struct.calcsize("<I")
     chunk_length = _calculate_binary_list_size(vocab.words)
@@ -254,5 +338,6 @@ def _write_bucket_vocab(file: BinaryIO, vocab: FinalfusionBucketVocab):
 
 
 __all__ = [
-    'SubwordVocab', 'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab'
+    'SubwordVocab', 'FinalfusionBucketVocab', 'load_finalfusion_bucket_vocab',
+    'FastTextVocab', 'load_fasttext_vocab'
 ]
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
@@ -3,7 +3,7 @@
 
 from finalfusion.io import FinalfusionFormatError
 from finalfusion.subword import FinalfusionHashIndexer, FastTextIndexer
-from finalfusion.vocab import FinalfusionBucketVocab, SimpleVocab, load_vocab
+from finalfusion.vocab import FinalfusionBucketVocab, SimpleVocab, load_vocab, FastTextVocab
 
 
 def test_reading(tests_root):
@@ -90,6 +90,34 @@ def test_fifu_buckets_constructor():
                       ")"
 
 
+def test_fasttext_constructor():
+    v = FastTextVocab([str(i) for i in range(10)])
+    assert [v[str(i)] for i in range(10)] == list(range(10))
+    with pytest.raises(AssertionError):
+        _ = FastTextVocab(["a"] * 2)  # duplicate words are rejected
+    with pytest.raises(AssertionError):
+        _ = FastTextVocab(v.words, FinalfusionHashIndexer(21))  # wrong indexer type
+    assert len(v) == 10
+    assert v.upper_bound == len(v) + 2_000_000  # words + default bucket count
+    assert v == v
+    assert v in v
+    assert v != SimpleVocab(v.words)
+    assert v != FastTextVocab(v.words, FastTextIndexer(20))  # indexer affects equality
+    assert repr(v) == "FastTextVocab(\n" \
+                      f"\tindexer={repr(v.subword_indexer)}\n" \
+                      "\twords=[...]\n" \
+                      "\tword_index={{...}}\n" \
+                      ")"
+
+
+def test_fasttext_vocab_roundtrip(tmp_path):
+    path = tmp_path / "write_ft_vocab.fifu"
+    original = FastTextVocab(list(map(str, range(10))))
+    original.write(path)
+    # Reading the written file back must yield an equal vocab.
+    assert original == load_vocab(path)
+
+
 def test_fifu_buckets_roundtrip(tests_root, tmp_path):
     filename = tmp_path / "write_ff_buckets.fifu"
     v = load_vocab(tests_root / "data" / "ff_buckets.fifu")