Commit 006cf92

Bucket to explicit conversion.
Implement conversion of bucket vocabularies to explicit ones and conversion of embeddings with bucket to explicit vocabs.
1 parent ea1992f

File tree

4 files changed: +147 −0 lines changed

src/finalfusion/embeddings.py
src/finalfusion/vocab/subword.py
tests/test_embeddings.py
tests/test_vocab.py

src/finalfusion/embeddings.py

Lines changed: 37 additions & 0 deletions
@@ -381,6 +381,43 @@ def write(self, file: str):
             for chunk in chunks:
                 chunk.write_chunk(outf)
 
+    def bucket_to_explicit(self) -> 'Embeddings':
+        """
+        Bucket to explicit Embeddings conversion.
+
+        Multiple embeddings can still map to the same bucket, but all buckets
+        that are not indexed by in-vocabulary n-grams are eliminated. This can
+        have a big impact on the size of the embedding matrix.
+
+        Metadata is **not** copied to the new embeddings since it doesn't
+        reflect the changes. You can manually set the metadata and update the
+        values accordingly.
+
+        Returns
+        -------
+        embeddings : Embeddings
+            Embeddings with an ExplicitVocab instead of a hash-based vocabulary.
+
+        Raises
+        ------
+        TypeError
+            If the current vocabulary is not a hash-based vocabulary
+            (FinalfusionBucketVocab or FastTextVocab).
+        """
+        bucket_vocabs = (FastTextVocab, FinalfusionBucketVocab)
+        if not isinstance(self.vocab, bucket_vocabs):
+            raise TypeError(
+                "Only bucketed embeddings can be converted to explicit.")
+        vocab = self.vocab.to_explicit()
+        storage = np.zeros((vocab.upper_bound, self._storage.shape[1]),
+                           dtype=np.float32)
+        storage[:len(vocab)] = self._storage[:len(vocab)]
+        for ngram in vocab.subword_indexer:
+            storage[len(vocab) + vocab.subword_indexer[ngram]] = self._storage[
+                len(vocab) + self.vocab.subword_indexer(ngram)]
+        return Embeddings(vocab=vocab,
+                          storage=NdArray(storage),
+                          norms=self.norms)
+
     def __contains__(self, item):
         return item in self._vocab
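
The method above composes with the existing load/write API. A minimal usage sketch (not part of the commit; the file paths are hypothetical, and the import assumes load_finalfusion is exposed at the package root, as in the tests below):

    from finalfusion import load_finalfusion

    # Must contain a FinalfusionBucketVocab or FastTextVocab,
    # otherwise bucket_to_explicit raises TypeError.
    embeds = load_finalfusion("bucketed.fifu")   # hypothetical path
    explicit = embeds.bucket_to_explicit()
    explicit.write("explicit.fifu")              # hypothetical path

Only buckets reachable from in-vocabulary ngrams survive the conversion, so the written file is typically much smaller than the bucketed input.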

src/finalfusion/vocab/subword.py

Lines changed: 57 additions & 0 deletions
@@ -180,6 +180,25 @@ def __init__(self,
         self._words = words
         self._indexer = indexer
 
+    def to_explicit(self) -> 'ExplicitVocab':
+        """
+        Return an ExplicitVocab built from this vocab.
+
+        This method iterates over the known words and extracts all ngrams
+        within this vocab's bounds. Each ngram is hashed and mapped to an
+        index. This index is not necessarily unique per ngram: if hashes
+        collide, multiple ngrams are mapped to the same index.
+
+        The returned vocab will be unable to produce indices for unknown
+        ngrams.
+
+        The known indices of the new vocab cover `[0, vocab.upper_bound)`
+        without gaps.
+
+        Returns
+        -------
+        explicit_vocab : ExplicitVocab
+            The converted vocabulary.
+        """
+        return _bucket_to_explicit(self)
+
     def write_chunk(self, file: BinaryIO):
         _write_bucket_vocab(file, self)

@@ -244,6 +263,25 @@ def __init__(self,
         self._words = words
         self._indexer = indexer
 
+    def to_explicit(self) -> 'ExplicitVocab':
+        """
+        Return an ExplicitVocab built from this vocab.
+
+        This method iterates over the known words and extracts all ngrams
+        within this vocab's bounds. Each ngram is hashed and mapped to an
+        index. This index is not necessarily unique per ngram: if hashes
+        collide, multiple ngrams are mapped to the same index.
+
+        The returned vocab will be unable to produce indices for unknown
+        ngrams.
+
+        The known indices of the new vocab cover `[0, vocab.upper_bound)`
+        without gaps.
+
+        Returns
+        -------
+        explicit_vocab : ExplicitVocab
+            The converted vocabulary.
+        """
+        return _bucket_to_explicit(self)
+
     @property
     def subword_indexer(self) -> FastTextIndexer:
         return self._indexer

@@ -415,6 +453,25 @@ def load_explicit_vocab(file: Union[str, bytes, int, PathLike]
         return ExplicitVocab.read_chunk(inf)
 
 
+def _bucket_to_explicit(vocab: Union[FinalfusionBucketVocab, FastTextVocab]
+                        ) -> 'ExplicitVocab':
+    ngram_index = dict()
+    idx_index = dict()  # type: Dict[int, int]
+    ngram_list = []
+    for word in vocab.words:
+        token_ngrams = vocab.subwords(word)
+        for ngram in token_ngrams:
+            if ngram not in ngram_index:
+                ngram_list.append(ngram)
+                idx = vocab.subword_indexer(ngram)
+                if idx not in idx_index:
+                    idx_index[idx] = len(idx_index)
+                ngram_index[ngram] = idx_index[idx]
+    indexer = ExplicitIndexer(ngram_list, vocab.min_n, vocab.max_n,
+                              ngram_index)
+    return ExplicitVocab(vocab.words, indexer)
+
+
 def _write_bucket_vocab(file: BinaryIO,
                         vocab: Union[FastTextVocab, FinalfusionBucketVocab]):
     min_n_max_n_size = struct.calcsize("<II")
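
The heart of _bucket_to_explicit is a two-level remapping: idx_index compacts the bucket indices that are actually hit, and ngram_index points each ngram at its compacted index, so ngrams whose hashes collide keep sharing a single index. A self-contained toy illustration (not library code; the "hash" is a made-up lookup table):

    def remap(ngrams, bucket_index):
        idx_index = {}    # bucket index -> compact explicit index
        ngram_index = {}  # ngram -> compact explicit index
        for ngram in ngrams:
            idx = bucket_index(ngram)
            if idx not in idx_index:
                idx_index[idx] = len(idx_index)
            ngram_index[ngram] = idx_index[idx]
        return ngram_index

    # "dings" and "<gro" collide under the pretend hash, so they share index 0:
    print(remap(["dings", "<gro", "oß>"],
                lambda ngram: {"dings": 7, "<gro": 7, "oß>": 3}[ngram]))
    # {'dings': 0, '<gro': 0, 'oß>': 1}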

tests/test_embeddings.py

Lines changed: 41 additions & 0 deletions
@@ -192,3 +192,44 @@ def test_no_norms(vocab_array_tuple):
     embeddings = Embeddings(vocab=SimpleVocab(vocab), storage=NdArray(matrix))
     with pytest.raises(TypeError):
         _ = embeddings.embedding_with_norm("bla")
+
+
+def test_buckets_to_explicit(bucket_vocab_embeddings_fifu):
+    explicit = bucket_vocab_embeddings_fifu.bucket_to_explicit()
+    assert bucket_vocab_embeddings_fifu.vocab.words == explicit.vocab.words
+    for e1, e2 in zip(bucket_vocab_embeddings_fifu, explicit):
+        assert e1[0] == e2[0]
+        assert np.allclose(e1[1], e2[1])
+    assert bucket_vocab_embeddings_fifu.vocab.upper_bound == 1024 + len(
+        bucket_vocab_embeddings_fifu.vocab)
+    assert explicit.vocab.upper_bound == len(
+        bucket_vocab_embeddings_fifu.vocab) + 16
+    known = len(bucket_vocab_embeddings_fifu.vocab)
+    assert np.allclose(bucket_vocab_embeddings_fifu.storage[:known],
+                       explicit.storage[:known])
+    bucket_indexer = bucket_vocab_embeddings_fifu.vocab.subword_indexer
+    explicit_indexer = explicit.vocab.subword_indexer
+    for ngram in explicit_indexer:
+        assert np.allclose(
+            bucket_vocab_embeddings_fifu.storage[2 + bucket_indexer(ngram)],
+            explicit.storage[2 + explicit_indexer(ngram)])
+
+
+def test_buckets_to_explicit_roundtrip(bucket_vocab_embeddings_fifu, tmp_path):
+    filename = tmp_path / "bucket_to_explicit_embeds.fifu"
+    explicit = bucket_vocab_embeddings_fifu.bucket_to_explicit()
+    explicit.write(filename)
+    explicit2 = load_finalfusion(filename)
+    assert explicit.vocab == explicit2.vocab
+    assert np.allclose(explicit.storage, explicit2.storage)
+    assert np.allclose(explicit.norms, explicit2.norms)
+    assert np.allclose(bucket_vocab_embeddings_fifu.norms, explicit2.norms)
+    known = len(bucket_vocab_embeddings_fifu.vocab)
+    assert np.allclose(bucket_vocab_embeddings_fifu.storage[:known],
+                       explicit2.storage[:known])
+    bucket_indexer = bucket_vocab_embeddings_fifu.vocab.subword_indexer
+    explicit_indexer = explicit.vocab.subword_indexer
+    for ngram in explicit_indexer:
+        assert np.allclose(
+            bucket_vocab_embeddings_fifu.storage[2 + bucket_indexer(ngram)],
+            explicit2.storage[2 + explicit_indexer(ngram)])
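
A back-of-envelope reading of the fixture's constants (the vocabulary size of 2 is inferred from the hard-coded offset, not stated in the commit): conversion shrinks the subword part of the matrix from 1024 hash buckets to the 16 buckets actually hit by in-vocabulary ngrams.

    rows_bucketed = 2 + 1024   # words + hash buckets
    rows_explicit = 2 + 16     # words + surviving buckets
    print(rows_bucketed / rows_explicit)  # 57.0, i.e. a 57x smaller matrix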

tests/test_vocab.py

Lines changed: 12 additions & 0 deletions
@@ -143,6 +143,18 @@ def test_explicit_vocab_roundtrip(tmp_path):
     assert v == v2
 
 
+def test_bucket_to_explicit():
+    v = FinalfusionBucketVocab(["allerdings", "groß"])
+    explicit = v.to_explicit()
+    assert v.words == explicit.words
+    assert explicit.upper_bound == len(v) + 43
+    assert explicit.subword_indexer.upper_bound == 43
+    assert explicit.subword_indexer("dings") == explicit.subword_indexer(
+        "<gro")
+    assert v.subword_indexer("dings") == v.subword_indexer("<gro")
+    assert len(explicit.subword_indexer) == 44
+
+
 def test_fifu_buckets_roundtrip(tests_root, tmp_path):
     filename = tmp_path / "write_ff_buckets.fifu"
     v = load_vocab(tests_root / "data" / "ff_buckets.fifu")
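
Where the constants in this test come from (derived under the assumption that FinalfusionBucketVocab defaults to min_n=3, max_n=6; the commit does not restate the defaults):

    def n_ngrams(length, min_n=3, max_n=6):
        # A string of `length` characters has length - n + 1 ngrams of size n.
        return sum(length - n + 1 for n in range(min_n, max_n + 1))

    total = n_ngrams(len("<allerdings>")) + n_ngrams(len("<groß>"))  # 34 + 10
    assert total == 44       # len(explicit.subword_indexer)
    assert total - 1 == 43   # "dings" and "<gro" collide, so one index is shared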
