Skip to content

Commit a20bc46

Browse files
committed
Add bucket to explicit conversion script.
1 parent e17ef91 commit a20bc46

File tree

7 files changed

+116
-3
lines changed

7 files changed

+116
-3
lines changed

.github/workflows/python.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,4 @@ jobs:
4242
pytest --doctest-modules
4343
- name: Conversion tests
4444
if: ${{ matrix.python-version == '3.7' }}
45-
run: |
46-
bash ./tests/integration/conversion.sh
47-
45+
run: bash ./tests/integration/all.sh

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ between the supported formats.
122122
$ ffp-convert -f fasttext fasttext.bin -t finalfusion embeddings.fifu
123123
~~~
124124

125+
`ffp-bucket-to-explicit` can be used to convert bucket embeddings to embeddings
126+
with an explicit ngram lookup.
127+
~~~shell
128+
# convert finalfusion bucket embeddings to explicit
129+
$ ffp-bucket-to-explicit -f finalfusion embeddings.fifu explicit.fifu
130+
~~~
131+
125132
## Where to go from here
126133

127134
* [finalfrontier](https://finalfusion.github.io/finalfrontier)

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ def run(self):
103103
},
104104
entry_points=dict(console_scripts=[
105105
'ffp-convert=finalfusion.scripts.convert:main',
106+
'ffp-bucket-to-explicit=finalfusion.scripts.bucket_to_explicit:main',
106107
]),
107108
version="0.7.0-pre"
108109
)
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""
2+
Conversion from bucket embeddings to explicit.
3+
"""
4+
import argparse
5+
6+
from finalfusion.scripts.util import Format
7+
8+
9+
def main() -> None:
    """
    Entry point of the ``ffp-bucket-to-explicit`` command.

    Reads the input path, the output path and the input format from the
    command line, loads the embeddings in that format, converts them with
    ``bucket_to_explicit()`` and writes the result to the output path.
    """
    input_formats = ['finalfusion', 'fasttext']

    cli = argparse.ArgumentParser(
        prog="ffp-bucket-to-explicit",
        description="Convert bucket embeddings to explicit lookups.",
    )
    cli.add_argument(
        "input",
        metavar="INPUT",
        type=str,
        help="Input bucket embeddings",
    )
    cli.add_argument(
        "output",
        metavar="OUTPUT",
        type=str,
        help="Output path",
    )
    # ``from`` is a Python keyword, so the parsed value is stored under an
    # explicit destination name instead of being fetched through getattr().
    cli.add_argument(
        "-f",
        "--from",
        dest="input_format",
        metavar="INPUT_FORMAT",
        type=str,
        choices=input_formats,
        default="finalfusion",
        help="Valid choices: ['finalfusion', 'fasttext'] Default: 'finalfusion'",
    )
    options = cli.parse_args()

    bucket_embeddings = Format(options.input_format).load(options.input)
    explicit_embeddings = bucket_embeddings.bucket_to_explicit()
    explicit_embeddings.write(options.output)


if __name__ == '__main__':
    main()

tests/integration/all.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
#!/usr/bin/env bash
# Runs the conversion and bucket-to-explicit integration test scripts that
# live next to this file.
set -eu

# Physical directory containing this script. `cd` and `pwd` are joined with
# `&&` so that a failing `cd` makes the substitution fail (and `set -e` abort)
# instead of silently yielding the caller's working directory.
TESTDIR="$( cd "$(dirname "$0")" >/dev/null 2>&1 && pwd -P )"

echo conversions >&2
"${TESTDIR}"/conversion.sh

echo bucket-to-explicit >&2
"${TESTDIR}"/bucket_to_explicit.sh
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import numpy as np
2+
from finalfusion.scripts.util import Format
3+
from finalfusion.subword import ngrams
4+
from finalfusion.vocab.subword import FastTextVocab, ExplicitVocab, FinalfusionBucketVocab
5+
6+
7+
def test(inp, input_format, output):
    """
    Check that ``output`` is a faithful explicit conversion of ``inp``.

    ``inp`` is loaded with ``input_format``; ``output`` is always loaded as
    finalfusion. The vocabularies, n-gram ranges, index bounds, storage shape
    and the word and n-gram embedding rows are then compared.

    Raises ``AssertionError`` on the first check that does not hold.
    """
    e1 = Format(input_format).load(inp)
    e2 = Format("finalfusion").load(output)

    v1 = e1.vocab
    v2 = e2.vocab
    assert isinstance(v1, (FinalfusionBucketVocab, FastTextVocab))
    assert isinstance(v2, ExplicitVocab)
    assert v1.words == v2.words
    assert v1.word_index == v2.word_index
    assert v1.subword_indexer.min_n == v2.subword_indexer.min_n, \
        f"{v1.subword_indexer.min_n} == {v2.subword_indexer.min_n}"
    assert v1.subword_indexer.max_n == v2.subword_indexer.max_n, \
        f"{v1.subword_indexer.max_n} == {v2.subword_indexer.max_n}"

    # Set comprehensions avoid building an intermediate list/generator that
    # is only fed straight into set().
    v1_ngrams = {ngram for word in v1.words for ngram in ngrams(word)}
    v1_unique_indices = {v1.subword_indexer(ngram) for ngram in v1_ngrams}
    assert v1_ngrams == set(v2.subword_indexer.ngrams)
    assert len(v1_unique_indices) == v2.subword_indexer.upper_bound, \
        f"{len(v1_unique_indices)} == {v2.subword_indexer.upper_bound}"
    assert len(v1_unique_indices) + len(v1) == v2.upper_bound, \
        f"{len(v1_unique_indices)} + {len(v1)} == {v2.upper_bound}"
    assert e2.storage.shape[0] == v2.upper_bound, \
        f"{e2.storage.shape[0]} == {v2.upper_bound}"

    # Word rows come first in both storages; subword rows follow, so each
    # subword index is offset by the length of the vocab it belongs to.
    n_words_1 = len(v1)
    n_words_2 = len(v2)
    assert np.allclose(e1.storage[:n_words_1], e2.storage[:n_words_2])
    for ngram in v1_ngrams:
        e1_ngram_embed = e1.storage[v1.subword_indexer(ngram) + n_words_1]
        e2_ngram_embed = e2.storage[v2.subword_indexer(ngram) + n_words_2]
        assert np.allclose(e1_ngram_embed, e2_ngram_embed)


if __name__ == '__main__':
    import sys
    test(*sys.argv[1:])
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/usr/bin/env bash
# Integration test for ffp-bucket-to-explicit: converts the bucket test
# embeddings to explicit ones and verifies each result with
# bucket_to_explicit.py.
set -eu

# Scratch directory for the converted embeddings; removed on exit.
tmp_dir=$(mktemp -d /tmp/bucket_to_explicit.XXXXXX)

function finish() {
  rm -rf "$tmp_dir"
}

trap finish EXIT

# Physical directory containing this script. `&&` makes a failing `cd` abort
# the script instead of silently yielding the caller's working directory.
TESTDIR="$( cd "$(dirname "$0")" >/dev/null 2>&1 && pwd -P )"

# convert_and_verify INPUT INPUT_FORMAT OUTPUT
# Converts INPUT (read as INPUT_FORMAT) to OUTPUT, then runs the Python
# verification script on the same three arguments.
function convert_and_verify() {
  echo ffp-bucket-to-explicit "${1}" -f "${2}" "${3}" >&2
  ffp-bucket-to-explicit "${1}" -f "${2}" "${3}"
  python "${TESTDIR}"/bucket_to_explicit.py "${1}" "${2}" "${3}"
}

# Outputs go into the scratch directory so the test leaves nothing behind in
# the caller's working directory.
convert_and_verify "${TESTDIR}/../data/ff_buckets.fifu" finalfusion "${tmp_dir}/fifu_bucket_to_expl.fifu"

convert_and_verify "${TESTDIR}/../data/fasttext.bin" fasttext "${tmp_dir}/fasttext_to_expl.fifu"

0 commit comments

Comments
 (0)