|
| 1 | +import numpy as np |
| 2 | +from finalfusion.scripts.util import Format |
| 3 | +from finalfusion.subword import ngrams |
| 4 | +from finalfusion.vocab.subword import FastTextVocab, ExplicitVocab, FinalfusionBucketVocab |
| 5 | + |
| 6 | + |
def test(inp, input_format, output):
    """Check that a conversion to an explicit n-gram vocabulary kept the embeddings intact.

    Loads the original embeddings from ``inp`` using ``input_format`` and the
    converted embeddings from ``output`` (always read as ``"finalfusion"``),
    then asserts that the word lists, n-gram inventories, index ranges and
    embedding matrices of the two agree.

    Args:
        inp: Location of the original embeddings, handed to
            ``Format(input_format).load``.
        input_format: Format name accepted by ``Format``.
        output: Location of the converted embeddings.

    Raises:
        AssertionError: If any consistency check fails.
    """
    e1 = Format(input_format).load(inp)
    e2 = Format("finalfusion").load(output)

    v1 = e1.vocab
    v2 = e2.vocab
    assert isinstance(v1, (FinalfusionBucketVocab, FastTextVocab))
    assert isinstance(v2, ExplicitVocab)

    # The in-vocabulary words and their indices must be unchanged.
    assert v1.words == v2.words
    assert v1.word_index == v2.word_index

    # Both indexers must cover the same n-gram length range.
    assert v1.subword_indexer.min_n == v2.subword_indexer.min_n, \
        f"{v1.subword_indexer.min_n} == {v2.subword_indexer.min_n}"
    assert v1.subword_indexer.max_n == v2.subword_indexer.max_n, \
        f"{v1.subword_indexer.max_n} == {v2.subword_indexer.max_n}"

    # Extract n-grams with the range the indexer is configured for rather than
    # the defaults of ``ngrams``; otherwise models using a non-default range
    # would be compared against the wrong n-gram inventory.
    min_n = v1.subword_indexer.min_n
    max_n = v1.subword_indexer.max_n
    v1_ngrams = {
        ngram
        for word in v1.words
        for ngram in ngrams(word, min_n, max_n)
    }
    # Distinct indices that the original indexer assigns to those n-grams.
    v1_unique_indices = {v1.subword_indexer(ngram) for ngram in v1_ngrams}

    assert v1_ngrams == set(v2.subword_indexer.ngrams)
    assert len(v1_unique_indices) == v2.subword_indexer.upper_bound, \
        f"{len(v1_unique_indices)} == {v2.subword_indexer.upper_bound}"
    assert len(v1_unique_indices) + len(v1) == v2.upper_bound, \
        f"{len(v1_unique_indices)} + {len(v1)} == {v2.upper_bound}"
    assert e2.storage.shape[0] == v2.upper_bound, \
        f"{e2.storage.shape[0]} == {v2.upper_bound}"

    # The leading rows are compared as the word embeddings.
    assert np.allclose(e1.storage[:len(v1)], e2.storage[:len(v2)])

    # Subword rows are looked up at the indexer's index offset by the
    # vocabulary size of the respective embeddings.
    for ngram in v1_ngrams:
        e1_ngram_embed = e1.storage[v1.subword_indexer(ngram) + len(v1)]
        e2_ngram_embed = e2.storage[v2.subword_indexer(ngram) + len(v2)]
        assert np.allclose(e1_ngram_embed, e2_ngram_embed), \
            f"embedding mismatch for ngram {ngram!r}"
| 35 | + |
| 36 | + |
if __name__ == '__main__':
    # Script entry point: forward every command-line argument after the
    # program name to ``test`` as positional arguments.
    from sys import argv

    cli_args = argv[1:]
    test(*cli_args)
0 commit comments