Skip to content

Commit c1009a2

Browse files
committed
Implement compat readers and writers.
Implement reading and writing for word2vec binary and text(-dims).
1 parent 006cf92 commit c1009a2

File tree

10 files changed

+408
-2
lines changed

10 files changed

+408
-2
lines changed

src/finalfusion/_util.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# pylint: disable=missing-module-docstring
2+
import numpy as np
3+
4+
from finalfusion.norms import Norms
5+
from finalfusion.storage import NdArray
6+
7+
8+
def _normalize_ndarray_storage(storage: NdArray) -> Norms:
    """
    L2-normalize the rows of ``storage`` in place and return their norms.

    Rows whose norm is zero are left as all-zero rows instead of being
    turned into NaN by a division by zero. Their reported norm stays 0.

    Parameters
    ----------
    storage : NdArray
        Matrix whose rows are normalized in place.

    Returns
    -------
    norms : Norms
        The l2 norm of each row, computed before normalization.
    """
    norms = np.linalg.norm(storage, axis=1)
    # Divide zero-norm rows by 1: 0 / 0 would poison the row with NaN.
    divisors = norms.copy()
    divisors[divisors == 0] = 1
    storage /= divisors[:, None]
    return Norms(norms)

src/finalfusion/compat/__init__.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""
2+
Compatibility Module for Embedding formats
3+
4+
This module contains read and write methods for other common embedding formats such as:
5+
* text(-dims)
6+
* word2vec binary
7+
"""
8+
9+
from finalfusion.compat.text import load_text, load_text_dims, write_text, write_text_dims
10+
from finalfusion.compat.word2vec import load_word2vec, write_word2vec
11+
12+
__all__ = [
13+
'load_text_dims', 'load_word2vec', 'load_text', 'write_word2vec',
14+
'write_text', 'write_text_dims'
15+
]

src/finalfusion/compat/text.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
"""
2+
Text based embedding formats.
3+
"""
4+
5+
import re
6+
from os import PathLike
7+
from typing import Union, TextIO
8+
9+
import numpy as np
10+
11+
from finalfusion import Embeddings
12+
from finalfusion._util import _normalize_ndarray_storage
13+
from finalfusion.storage import NdArray
14+
from finalfusion.vocab import SimpleVocab
15+
16+
_ASCII_WHITESPACE_PAT = re.compile(r'(?a)\s+')
17+
18+
19+
def load_text_dims(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in text-dims format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    The first line contains whitespace separated rows and cols, the rest of the file contains
    whitespace separated word and vector components. The file is read as utf-8.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in text-dims format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    ValueError
        If the file is empty or the first line does not consist of exactly two integers.
    """
    # Decode explicitly as utf-8; the platform default encoding is locale dependent.
    with open(file, encoding='utf8') as inf:
        try:
            header = next(inf)
        except StopIteration:
            # Same error as load_text instead of leaking a bare StopIteration.
            raise ValueError("Can't read from empty embeddings file.") from None
        rows, cols = header.split()
        return _load_text(inf, int(rows), int(cols))
41+
42+
43+
def load_text(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in text format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    Expects a file with utf-8 encoded lines with:
        * word at the start of the line
        * followed by whitespace
        * followed by whitespace separated vector components

    The number of columns is taken from the first line and the number of rows is the number of
    lines in the file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in text format.

    Returns
    -------
    embeddings : Embeddings
        Embeddings from the input file. The resulting Embeddings will have a
        SimpleVocab, NdArray and Norms.

    Raises
    ------
    ValueError
        If the file is empty.
    """
    # Decode explicitly as utf-8; the platform default encoding is locale dependent.
    with open(file, encoding='utf8') as inf:
        try:
            first = next(inf)
        except StopIteration:
            raise ValueError("Can't read from empty embeddings file.") from None
        # Everything after the leading word is a vector component.
        cols = len(_ASCII_WHITESPACE_PAT.split(first.rstrip())) - 1
        # The first line was already consumed, count it in addition to the rest.
        rows = sum(1 for _ in inf) + 1
        # Rewind so that the actual parsing starts at the first embedding again.
        inf.seek(0)
        return _load_text(inf, rows, cols)
76+
77+
78+
def write_text(file: Union[str, bytes, int, PathLike],
               embeddings: Embeddings,
               sep=" "):
    """
    Serialize embeddings in text format.

    If norms are present, every embedding is scaled by its norm before it is written, i.e. the
    un-normalized embeddings are serialized.

    Each line of the utf-8 encoded output holds:
        * the word at the start of the line
        * followed by whitespace
        * followed by the whitespace separated vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator of word and embeddings.
    """
    # Plain text format: no leading `rows cols` line.
    _write_text(file, embeddings, dims=False, sep=sep)
102+
103+
104+
def write_text_dims(file: Union[str, bytes, int, PathLike],
                    embeddings: Embeddings,
                    sep=" "):
    """
    Serialize embeddings in text-dims format.

    If norms are present, every embedding is scaled by its norm before it is written, i.e. the
    un-normalized embeddings are serialized.

    The utf-8 encoded output holds:
        * `rows cols` on the **first** line

    followed by one line per embedding with:
        * the word at the start of the line
        * followed by whitespace
        * followed by the whitespace separated vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator of word and embeddings.
    """
    # text-dims format: the matrix shape is written as the first line.
    _write_text(file, embeddings, dims=True, sep=sep)
129+
130+
131+
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    """
    Read ``rows`` embeddings with ``cols`` components each from an open text file.

    Every line is split on ASCII whitespace; the first field is the word, the remaining fields
    are the vector components. Reading stops after ``rows`` lines, further lines are ignored.

    Parameters
    ----------
    file : TextIO
        Open text file positioned at the first embedding line.
    rows : int
        Number of embeddings to read.
    cols : int
        Number of components per embedding.

    Returns
    -------
    embeddings : Embeddings
        Embeddings with a SimpleVocab, NdArray storage and Norms.

    Raises
    ------
    ValueError
        If the file holds fewer than ``rows`` lines.
    """
    words = []
    matrix = np.zeros((rows, cols), dtype=np.float32)
    # zip pulls from the matrix first, so no line past the last row is consumed.
    for row, line in zip(matrix, file):
        word, *components = _ASCII_WHITESPACE_PAT.split(line.rstrip())
        words.append(word)
        row[:] = components
    # A short file would otherwise leave word-less all-zero rows in the matrix.
    if len(words) != rows:
        raise ValueError(
            f"Expected {rows} embeddings but the file only contains {len(words)}.")
    storage = NdArray(matrix)
    return Embeddings(storage=storage,
                      norms=_normalize_ndarray_storage(storage),
                      vocab=SimpleVocab(words))
142+
143+
144+
def _write_text(file: Union[str, bytes, int, PathLike],
                embeddings: Embeddings,
                dims: bool,
                sep=" "):
    """
    Write embeddings as utf-8 encoded text, optionally preceded by a shape line.

    Only the first ``len(embeddings.vocab)`` rows of the storage are written. If norms are
    present, each row is multiplied by its norm before it is written.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    dims : bool
        Whether to write the matrix shape as the first line.
    sep : str
        Separator between the word and its vector. The vector components themselves are always
        separated by a single space.
    """
    vocab = embeddings.vocab
    matrix = embeddings.storage[:len(vocab)]
    norms = embeddings.norms
    # Encode explicitly as utf-8; the platform default encoding is locale dependent.
    with open(file, 'w', encoding='utf8') as outf:
        if dims:
            print(*matrix.shape, file=outf)
        for idx, word in enumerate(vocab):
            row = matrix[idx]
            if norms is not None:
                row = row * norms[idx]
            print(word, ' '.join(map(str, row)), sep=sep, file=outf)
158+
159+
160+
__all__ = ['load_text', 'load_text_dims', 'write_text', 'write_text_dims']

src/finalfusion/compat/word2vec.py

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""
2+
Word2vec binary format.
3+
"""
4+
5+
import sys
6+
from os import PathLike
7+
from typing import Union, BinaryIO, AnyStr
8+
9+
import numpy as np
10+
11+
from finalfusion import Embeddings
12+
from finalfusion.io import _serialize_array_as_le
13+
from finalfusion.storage import NdArray
14+
from finalfusion._util import _normalize_ndarray_storage
15+
from finalfusion.vocab import SimpleVocab
16+
17+
18+
def load_word2vec(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Load embeddings stored in word2vec binary format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    The file has to start with a line holding rows and cols. Each word is encoded in utf-8 and
    followed by a single space. The space is followed by the embedding components as
    little-endian single-precision floats.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in word2vec binary format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.
    """
    vocab_words = []
    with open(file, 'rb') as stream:
        header = stream.readline().decode("ascii")
        rows, cols = (int(field) for field in header.split())
        matrix = np.zeros((rows, cols), dtype=np.float32)
        for row in matrix:
            word = _read_binary_word(stream, b' ')
            # strip() drops surrounding whitespace, e.g. the newline that
            # write_word2vec emits after the previous embedding.
            vocab_words.append(word.strip())
            components = np.fromfile(file=stream, count=cols, dtype=np.float32)
            # The data is little-endian, swap after a native read on big-endian hosts.
            if sys.byteorder == "big":
                components.byteswap(inplace=True)
            row[:] = components
    storage = NdArray(matrix)
    norms = _normalize_ndarray_storage(storage)
    return Embeddings(storage=storage,
                      norms=norms,
                      vocab=SimpleVocab(vocab_words))
53+
54+
55+
def write_word2vec(file: Union[str, bytes, int, PathLike],
                   embeddings: Embeddings):
    """
    Serialize embeddings in word2vec binary format.

    If the embeddings are not compatible with the w2v format (e.g. include a SubwordVocab), only
    the known words and embeddings are serialized. I.e. the subword matrix is discarded.

    If norms are present, every embedding is scaled by its norm before it is written, i.e. the
    un-normalized embeddings are serialized.

    The first line of the output holds the shape as `rows columns`, followed by the embeddings.

    Each embedding consists of:
        * utf-8 encoded word
        * single space `' '` following the word
        * `cols` single-precision floating point numbers
        * `'\\n'` newline at the end of each line.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        The embeddings to serialize.
    """
    vocab = embeddings.vocab
    # Restrict the storage to the rows that belong to in-vocabulary words.
    matrix = embeddings.storage[:len(vocab)]
    with open(file, 'wb') as outf:
        header = f'{matrix.shape[0]} {matrix.shape[1]}\n'
        outf.write(header.encode('ascii'))
        for idx, word in enumerate(vocab):
            if embeddings.norms is None:
                row = matrix[idx]
            else:
                row = matrix[idx] * embeddings.norms[idx]
            outf.write(word.encode('utf-8') + b' ')
            _serialize_array_as_le(outf, row)
            outf.write(b'\n')
95+
96+
97+
def _read_binary_word(inf: BinaryIO, delim: bytes) -> str:
    """
    Read bytes from ``inf`` up to the next ``delim`` and decode them as utf-8.

    The delimiter is consumed from the stream but is not part of the returned word.

    Parameters
    ----------
    inf : BinaryIO
        Binary stream positioned at the start of a word.
    delim : bytes
        Single byte that terminates the word. The stream is read one byte at a time, so a
        longer delimiter never matches.

    Returns
    -------
    word : str
        The decoded word without the delimiter.

    Raises
    ------
    EOFError
        If the stream ends before ``delim`` is found.
    """
    word = bytearray()
    while True:
        byte = inf.read(1)
        if byte == delim:
            break
        if not byte:
            raise EOFError(
                "Reached the end of the file before finding the word delimiter.")
        word += byte
    return word.decode('utf-8')
107+
108+
109+
__all__ = ['load_word2vec', 'write_word2vec']

src/finalfusion/vocab/vocab.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"""
44
import abc
55
import struct
6-
from typing import List, Optional, Dict, Tuple, BinaryIO, Iterable, Any, Union, Sequence
6+
from typing import List, Optional, Dict, Tuple, BinaryIO, Iterable, Any, Union, Sequence, Iterator
77

88
from finalfusion.io import Chunk, _read_required_binary, _write_binary
99

@@ -82,7 +82,7 @@ def __contains__(self, item: Any) -> bool:
8282
return all(w in self for w in item)
8383
return False
8484

85-
def __iter__(self) -> Iterable[str]:
85+
def __iter__(self) -> Iterator[str]:
8686
return iter(self.words)
8787

8888
def __len__(self) -> int:

tests/conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import finalfusion
88
import finalfusion.vocab
9+
import finalfusion.compat
910

1011

1112
@pytest.fixture
@@ -40,3 +41,21 @@ def embeddings_fifu(tests_root):
4041
@pytest.fixture
4142
def bucket_vocab_embeddings_fifu(tests_root):
4243
yield finalfusion.load_finalfusion(tests_root / "data" / "ff_buckets.fifu")
44+
45+
46+
@pytest.fixture
def embeddings_text(tests_root):
    """Embeddings loaded from the text format file in the test data directory."""
    # Build the path with `/` like the finalfusion fixtures in this file do.
    yield finalfusion.compat.load_text(tests_root / "data" / "embeddings.txt")
50+
51+
52+
@pytest.fixture
def embeddings_text_dims(tests_root):
    """Embeddings loaded from the text-dims format file in the test data directory."""
    # Build the path with `/` like the finalfusion fixtures in this file do.
    yield finalfusion.compat.load_text_dims(
        tests_root / "data" / "embeddings.dims.txt")
56+
57+
58+
@pytest.fixture
def embeddings_w2v(tests_root):
    """Embeddings loaded from the word2vec binary file in the test data directory."""
    # Build the path with `/` like the finalfusion fixtures in this file do.
    yield finalfusion.compat.load_word2vec(tests_root / "data" / "embeddings.w2v")

tests/data/embeddings.dims.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
7 10
2+
one 3.0 1.0 0.0 0.0 0.0 0.0 2.0 2.0 4.0 3.0
3+
two 2.0 3.0 3.0 3.0 3.0 2.0 0.0 3.0 3.0 4.0
4+
three 0.0 0.0 2.0 0.0 2.0 1.0 2.0 4.0 0.0 3.0
5+
four 1.0 4.0 4.0 2.0 4.0 2.0 4.0 1.0 3.0 1.0
6+
five 0.0 4.0 1.0 2.0 0.0 4.0 0.0 3.0 1.0 3.0
7+
six 3.0 3.0 4.0 2.0 0.0 0.0 0.0 3.0 2.0 1.0
8+
seven 1.0 4.0 0.0 2.0 2.0 2.0 4.0 3.0 1.0 1.0

tests/data/embeddings.w2v

326 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)