Skip to content

Commit 1524ba8

Browse files
committed
Add conversion script.
1 parent 91776c0 commit 1524ba8

File tree

8 files changed

+236
-3
lines changed

8 files changed

+236
-3
lines changed

.github/workflows/python.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,7 @@ jobs:
4040
run: |
4141
pip install pytest
4242
pytest --doctest-modules
43+
- name: Conversion tests
44+
if: ${{ matrix.python-version == '3.7' }}
45+
run: bash ./tests/conversion_integration.sh
46+

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,6 @@ def run(self):
8888
include_package_data=True,
8989
package_dir={'': 'src'},
9090
package_data={'finalfusion': ['py.typed', '*.pyi', '*.h', '*.c']},
91-
url="https://github.com/finalfusion/ffp")
91+
url="https://github.com/finalfusion/ffp",
92+
entry_points=dict(console_scripts=['ffp-convert=finalfusion.scripts.convert:main']),
93+
)

src/finalfusion/embeddings.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -363,15 +363,15 @@ def chunks(self) -> List[Chunk]:
363363
chunks.append(self.norms)
364364
return chunks
365365

366-
def write(self, file: str):
366+
def write(self, file: Union[str, bytes, int, PathLike]):
367367
"""
368368
Write the Embeddings to the given file.
369369
370370
Writes the Embeddings to a finalfusion file at the given file.
371371
372372
Parameters
373373
----------
374-
file : str
374+
file : str, bytes, int, PathLike
375375
Path of the output file.
376376
"""
377377
with open(file, 'wb') as outf:

src/finalfusion/scripts/__init__.py

Whitespace-only changes.

src/finalfusion/scripts/convert.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
"""
2+
Conversion for Word Embeddings.
3+
4+
Offers conversion from and to any combination of:
5+
* finalfusion
6+
* fastText
7+
* word2vec
8+
* textdims
9+
* text
10+
11+
Conversion of finalfusion files with FinalfusionBucketVocab or ExplicitVocab to fastText
12+
fails.
13+
"""
14+
import argparse
15+
16+
from finalfusion.scripts.util import Format
17+
18+
19+
def main():
    """
    Command-line entry point of ``ffp-convert``.

    Parses the input path, the output path and the ``--from`` / ``--to``
    format options, loads the embeddings with the loader of the input format
    and writes them with the writer of the output format.
    """
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-convert",
                                     description="Convert embeddings.")

    # Positional arguments: (name, help text, metavar).
    positionals = (
        ("input", "Input embeddings", "INPUT"),
        ("output", "Output path", "OUTPUT"),
    )
    for name, description, metavar in positionals:
        parser.add_argument(name, type=str, help=description, metavar=metavar)

    # Format options: (short flag, long flag, default format, metavar).
    format_options = (
        ("-f", "--from", "word2vec", "INPUT_FORMAT"),
        ("-t", "--to", "finalfusion", "OUTPUT_FORMAT"),
    )
    for short_flag, long_flag, default, metavar in format_options:
        parser.add_argument(
            short_flag,
            long_flag,
            type=str,
            choices=formats,
            default=default,
            help=f"Valid choices: {formats} Default: '{default}'",
            metavar=metavar)

    # ``from`` is a Python keyword, so the parsed value cannot be read as
    # ``args.from``; go through the namespace dictionary instead.
    options = vars(parser.parse_args())
    embeds = Format(options["from"]).load(options["input"])
    Format(options["to"]).write(options["output"], embeds)
49+
50+
51+
# Allow running the module directly in addition to the ``ffp-convert`` script.
if __name__ == "__main__":
    main()

src/finalfusion/scripts/util.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# pylint: disable=missing-docstring
2+
from enum import Enum
3+
from functools import partial
4+
from os import PathLike
5+
from typing import Union, Callable
6+
7+
from finalfusion import Embeddings, load_finalfusion
8+
from finalfusion.compat import write_word2vec, write_text, write_text_dims, write_fasttext, \
9+
load_word2vec, load_text, load_text_dims, load_fasttext
10+
11+
12+
class Format(Enum):
    """
    Supported embedding formats.

    Each member exposes the matching reader and writer through the ``load``
    and ``write`` properties.
    """
    finalfusion = "finalfusion"
    fasttext = "fasttext"
    word2vec = "word2vec"
    textdims = "textdims"
    text = "text"

    @property
    def write(
            self
    ) -> Callable[[Union[str, bytes, int, PathLike], Embeddings], None]:
        """
        Return the writer for this format.

        The returned callable takes the output path and the embeddings to
        write, in that order.

        Raises
        ------
        ValueError
            If no writer is registered for this format.
        """

        def write_fifu(path: Union[str, bytes, int, PathLike],
                       embeddings: Embeddings):
            # finalfusion output goes through the Embeddings method; wrap it
            # so it takes (path, embeddings) like the other writers.
            embeddings.write(path)

        writers = {
            Format.finalfusion: write_fifu,
            Format.word2vec: write_word2vec,
            Format.text: write_text,
            Format.textdims: write_text_dims,
            Format.fasttext: write_fasttext,
        }
        writer = writers.get(self)
        if writer is None:
            raise ValueError(f"Unknown format {str(self)}")
        return writer

    @property
    def load(self) -> Callable[[Union[str, bytes, int, PathLike]], Embeddings]:
        """
        Return the loader for this format.

        The returned callable takes the input path and returns the loaded
        embeddings. finalfusion files are loaded with ``mmap=True``.

        Raises
        ------
        ValueError
            If no loader is registered for this format.
        """
        loaders = {
            Format.finalfusion: partial(load_finalfusion, mmap=True),
            Format.word2vec: load_word2vec,
            Format.text: load_text,
            Format.textdims: load_text_dims,
            Format.fasttext: load_fasttext,
        }
        loader = loaders.get(self)
        if loader is None:
            raise ValueError(f"Unknown format {str(self)}")
        return loader

tests/conversion_integration.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
import sys
2+
3+
import numpy as np
4+
5+
from finalfusion.scripts.util import Format
6+
from finalfusion.vocab.subword import FastTextVocab, SubwordVocab
7+
8+
9+
def test(inp, input_format, output, output_format):
    """
    Compare converted embeddings with their source and exit the process.

    Loads ``inp`` as ``input_format`` and ``output`` as ``output_format``,
    picks the comparison matching the vocabulary type and format pair, and
    exits with the comparison's return value (0 on success). A failed
    comparison raises ``AssertionError``. If no comparison covers the
    combination, a message is printed to stderr and the process exits
    with status 1.

    Parameters
    ----------
    inp : str
        Path of the original embeddings.
    input_format : str
        Format name of ``inp``; must be a valid ``Format`` value.
    output : str
        Path of the converted embeddings.
    output_format : str
        Format name of ``output``; must be a valid ``Format`` value.
    """
    e1 = Format(input_format).load(inp)
    e2 = Format(output_format).load(output)

    # ``sys.exit`` is used instead of the bare ``exit`` builtin: the latter is
    # injected by the ``site`` module and is missing under ``python -S``.
    if isinstance(e1.vocab, FastTextVocab) and \
            output_format in ["finalfusion", "fasttext"]:
        sys.exit(cmp_subword_embeds(e1, e2))

    # Subword embeddings written to a format without subword information:
    # only the word part of the storage can be compared.
    if isinstance(e1.vocab, SubwordVocab) and \
            input_format in ["fasttext", "finalfusion"] and \
            output_format in ["word2vec", "text", "textdims"]:
        sys.exit(cmp_subword_embeds_to_simple(e1, e2))

    if input_format in ["finalfusion", "word2vec", "text", "textdims"]:
        sys.exit(cmp_simple_embeds(e1, e2))

    print(
        f"missing testcase for {input_format} to {output_format} ({inp} to {output})",
        file=sys.stderr)
    sys.exit(1)
27+
28+
29+
def cmp_simple_embeds(e1, e2):
    """
    Assert that two embeddings without subword information are equivalent.

    Vocabularies must compare equal and the storages must match within an
    absolute tolerance of 1e-5. Norms are compared with the same tolerance,
    but only when ``e1`` carries norms; this mirrors the other comparison
    helpers in this file and avoids handing ``None`` to ``np.allclose``.

    Returns
    -------
    int
        0 when every check passes, for use as a process exit status.

    Raises
    ------
    AssertionError
        If any of the comparisons fails.
    """
    assert e1.vocab == e2.vocab
    assert np.allclose(e1.storage, e2.storage, atol=1e-5)
    if e1.norms is not None:
        assert np.allclose(e1.norms, e2.norms, atol=1e-5)
    return 0
34+
35+
36+
def cmp_subword_embeds_to_simple(e1, e2):
    """
    Assert that subword embeddings ``e1`` match their simple conversion ``e2``.

    The word lists and word indices must be equal, and the first
    ``len(e1.vocab)`` rows of ``e1``'s storage must match ``e2``'s storage
    within an absolute tolerance of 1e-5. Norms are compared with the same
    tolerance when ``e1`` has norms.

    Returns
    -------
    int
        0 when every check passes.

    Raises
    ------
    AssertionError
        If any of the comparisons fails.
    """
    source_vocab = e1.vocab
    target_vocab = e2.vocab
    assert source_vocab.words == target_vocab.words
    assert source_vocab.word_index == target_vocab.word_index

    # Only the leading rows of the source storage are compared; the rows
    # beyond the vocabulary length are left out.
    word_rows = e1.storage[:len(source_vocab)]
    assert np.allclose(word_rows, e2.storage, atol=1e-5)

    if e1.norms is None:
        return 0
    assert np.allclose(e1.norms, e2.norms, atol=1e-5)
    return 0
43+
44+
45+
def cmp_subword_embeds(e1, e2):
    """
    Assert that two subword embeddings are equivalent.

    Vocabularies must compare equal and the full storages must match within
    an absolute tolerance of 1e-5. Norms are compared with the same tolerance
    when ``e1`` has norms.

    Returns
    -------
    int
        0 when every check passes.

    Raises
    ------
    AssertionError
        If any of the comparisons fails.
    """
    tolerance = 1e-5
    assert e1.vocab == e2.vocab
    assert np.allclose(e1.storage, e2.storage, atol=tolerance)
    if e1.norms is None:
        return 0
    assert np.allclose(e1.norms, e2.norms, atol=tolerance)
    return 0
51+
52+
53+
# Invoked as: conversion_integration.py INPUT INPUT_FORMAT OUTPUT OUTPUT_FORMAT
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    test(*cli_args)

tests/conversion_integration.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/usr/bin/env bash
2+
# Abort on the first failing command and on any use of an unset variable.
set -o errexit -o nounset

# Scratch directory that receives the converted files.
tmp_dir="$(mktemp -d /tmp/run_conversion.XXXXXX)"
5+
6+
# Delete the scratch directory and everything written into it.
finish() {
  rm -rf "$tmp_dir"
}
9+
10+
# Run the cleanup whenever the script exits, whether it succeeded or failed.
trap finish EXIT

# Physical (symlink-free) path of the directory containing this script.
TESTDIR="$(cd "$(dirname "$0")" > /dev/null 2>&1; pwd -P)"
13+
14+
# Convert one file with ffp-convert and check the result.
#   $1 input path    $2 input format
#   $3 output path   $4 output format
# The command line is echoed to stderr before it is run; the comparison is
# delegated to conversion_integration.py next to this script.
convert_and_verify() {
  local src="${1}"
  local src_format="${2}"
  local dst="${3}"
  local dst_format="${4}"

  echo ffp-convert "${src}" -f "${src_format}" "${dst}" -t "${dst_format}" >&2
  ffp-convert "${src}" -f "${src_format}" "${dst}" -t "${dst_format}"
  python "${TESTDIR}"/conversion_integration.py "${src}" "${src_format}" "${dst}" "${dst_format}"
}
19+
20+
# Convert one input to every supported output format and verify each result.
#   $1 input path    $2 input format
# Outputs are written to "${tmp_dir}/<input format>_to_<suffix>".
verify_all_conversions() {
  local input="${1}"
  local in_format="${2}"
  local out_path_prefix="${tmp_dir}/${in_format}"
  local target

  # Each entry is "<output file suffix> <output format>".
  for target in \
    "fifu.fifu finalfusion" \
    "w2v.w2v word2vec" \
    "ft.bin fasttext" \
    "text.txt text" \
    "text.dims.txt textdims"; do
    convert_and_verify "${input}" "${in_format}" \
      "${out_path_prefix}_to_${target% *}" "${target#* }"
  done
}
30+
31+
# Inputs that are converted to every output format (txt dims, txt, w2v, fifu).
# Each entry is "<file under tests/data> <input format>".
for pair in \
  "embeddings.dims.txt textdims" \
  "embeddings.txt text" \
  "embeddings.w2v word2vec" \
  "embeddings.fifu finalfusion"; do
  input="${TESTDIR}/data/${pair% *}"
  verify_all_conversions "${input}" "${pair#* }"
done

# fasttext doesn't support fifu bucket indexers
# so we're making explicit calls for the other formats
# Each entry is "<output file suffix> <output format>".
input="${TESTDIR}/data/ff_buckets.fifu"
for pair in \
  "fifu.fifu finalfusion" \
  "w2v.w2v word2vec" \
  "text.txt text" \
  "text.dims.txt textdims"; do
  convert_and_verify "${input}" finalfusion \
    "${tmp_dir}/fifu_buckets_to_${pair% *}" "${pair#* }"
done

# fasttext input is converted to every output format.
input="${TESTDIR}/data/fasttext.bin"
verify_all_conversions "${input}" fasttext

0 commit comments

Comments
 (0)