|
| 1 | +""" |
| 2 | +Text based embedding formats. |
| 3 | +""" |
| 4 | + |
| 5 | +import re |
| 6 | +from os import PathLike |
| 7 | +from typing import Union, TextIO |
| 8 | + |
| 9 | +import numpy as np |
| 10 | + |
| 11 | +from finalfusion import Embeddings |
| 12 | +from finalfusion._util import _normalize_ndarray_storage |
| 13 | +from finalfusion.storage import NdArray |
| 14 | +from finalfusion.vocab import SimpleVocab |
| 15 | + |
# Separator used to split a line into the word and its vector components.
# The inline ``(?a)`` flag restricts ``\s`` to ASCII whitespace, so non-ASCII
# whitespace characters (e.g. a no-break space) stay part of a token.
_ASCII_WHITESPACE_PAT = re.compile(
    r'(?a)\s+'
)
| 17 | + |
| 18 | + |
def load_text_dims(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in text-dims format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    The first line contains whitespace separated rows and cols, the rest of the file contains
    whitespace separated word and vector components. The file is read as utf-8.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in text-dims format.

    Returns
    -------
    embeddings : Embeddings
        The embeddings from the input file.

    Raises
    ------
    ValueError
        If the file is empty or its first line is not exactly two integers.
    """
    # Decode explicitly as utf-8 instead of relying on the platform's locale encoding.
    with open(file, encoding='utf8') as inf:
        try:
            header = next(inf)
        except StopIteration:
            # Same error type and message as load_text uses for an empty file.
            raise ValueError("Can't read from empty embeddings file.") from None
        try:
            rows, cols = (int(dim) for dim in header.split())
        except ValueError:
            # Covers both a wrong number of fields and non-integer fields.
            raise ValueError(
                f"Expected 'rows cols' on the first line, got: {header!r}") from None
        return _load_text(inf, rows, cols)
| 41 | + |
| 42 | + |
def load_text(file: Union[str, bytes, int, PathLike]) -> Embeddings:
    """
    Read embeddings in text format.

    The returned embeddings have a SimpleVocab, NdArray storage and a Norms chunk. The storage is
    l2-normalized per default and the corresponding norms are stored in the Norms.

    Expects a file with utf-8 encoded lines with:
     * word at the start of the line
     * followed by whitespace
     * followed by whitespace separated vector components

    The number of columns is taken from the first line and the number of rows is the number of
    lines in the file, so the file is read twice.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to a file with embeddings in text format.

    Returns
    -------
    embeddings : Embeddings
        Embeddings from the input file. The resulting Embeddings will have a
        SimpleVocab, NdArray and Norms.

    Raises
    ------
    ValueError
        If the file is empty.
    """
    # Decode explicitly as utf-8 (the documented format) rather than the locale encoding.
    with open(file, encoding='utf8') as inf:
        try:
            first = next(inf)
        except StopIteration:
            raise ValueError("Can't read from empty embeddings file.") from None
        # Everything after the leading word on the first line is a vector component.
        cols = len(_ASCII_WHITESPACE_PAT.split(first.rstrip())) - 1
        # The first line was already consumed above, hence the + 1.
        rows = sum(1 for _ in inf) + 1
        # Rewind so that the actual parse starts from the first line again.
        inf.seek(0)
        return _load_text(inf, rows, cols)
| 76 | + |
| 77 | + |
def write_text(file: Union[str, bytes, int, PathLike],
               embeddings: Embeddings,
               sep=" "):
    """
    Serialize embeddings to a text file without a dimensions header.

    If the embeddings carry norms, every vector is multiplied by its norm before it is
    written, i.e. the un-normalized vectors are serialized.

    Each output line holds:
     * the word
     * followed by `sep`
     * followed by the vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator placed between the word and its vector. The vector components
        themselves are always separated by a single space.
    """
    _write_text(file, embeddings, dims=False, sep=sep)
| 102 | + |
| 103 | + |
def write_text_dims(file: Union[str, bytes, int, PathLike],
                    embeddings: Embeddings,
                    sep=" "):
    """
    Serialize embeddings to a text file that starts with a dimensions header.

    If the embeddings carry norms, every vector is multiplied by its norm before it is
    written, i.e. the un-normalized vectors are serialized.

    The output holds:
     * `rows cols` on the **first** line
     * then one line per word, consisting of the word, followed by `sep`, followed by
       the vector components

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write
    sep : str
        Separator placed between the word and its vector. The vector components
        themselves are always separated by a single space.
    """
    _write_text(file, embeddings, dims=True, sep=sep)
| 129 | + |
| 130 | + |
def _load_text(file: TextIO, rows: int, cols: int) -> Embeddings:
    """
    Parse `word component ...` lines from an open text file into Embeddings.

    Parameters
    ----------
    file : TextIO
        Open text file positioned at the first embedding line.
    rows : int
        Number of rows to allocate in the embedding matrix.
    cols : int
        Number of components per embedding.

    Returns
    -------
    embeddings : Embeddings
        Embeddings built from a SimpleVocab over the words that were read, an NdArray
        wrapping the parsed matrix and the result of `_normalize_ndarray_storage`.
    """
    matrix = np.zeros((rows, cols), dtype=np.float32)
    words = []
    # zip ends as soon as either the preallocated rows or the lines of the file run out.
    for vector, text_line in zip(matrix, file):
        word, *components = _ASCII_WHITESPACE_PAT.split(text_line.rstrip())
        words.append(word)
        # `vector` is a row of `matrix`, so this fills the matrix in place; it relies on
        # numpy converting the string components to float32 on assignment.
        vector[:] = components
    storage = NdArray(matrix)
    # NOTE(review): presumably normalizes `storage` in place and returns the norms --
    # confirm against finalfusion._util.
    norms = _normalize_ndarray_storage(storage)
    return Embeddings(storage=storage, norms=norms, vocab=SimpleVocab(words))
| 142 | + |
| 143 | + |
def _write_text(file: Union[str, bytes, int, PathLike],
                embeddings: Embeddings,
                dims: bool,
                sep=" "):
    """
    Write embeddings as utf-8 encoded text, one word and its vector per line.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Output file
    embeddings : Embeddings
        Embeddings to write. If `embeddings.norms` is not None, each row is multiplied
        by its norm before it is written.
    dims : bool
        If True, the shape of the written matrix (`rows cols`) is emitted as first line.
    sep : str
        Separator placed between the word and its vector. The vector components
        themselves are always separated by a single space.
    """
    vocab = embeddings.vocab
    # Only the rows that belong to vocabulary entries are written.
    matrix = embeddings.storage[:len(vocab)]
    norms = embeddings.norms
    # Encode explicitly as utf-8 instead of relying on the platform's locale encoding.
    with open(file, 'w', encoding='utf8') as outf:
        if dims:
            print(*matrix.shape, file=outf)
        for idx, word in enumerate(vocab):
            row = matrix[idx]
            if norms is not None:
                # Undo the normalization by scaling with the stored norm.
                row = row * norms[idx]
            print(word, ' '.join(map(str, row)), sep=sep, file=outf)
| 158 | + |
| 159 | + |
# Names exported by a star-import of this module.
__all__ = [
    'load_text',
    'load_text_dims',
    'write_text',
    'write_text_dims',
]
0 commit comments