Skip to content

Commit ccabb5e

Browse files
committed
Implement analogy and similarity scripts.
1 parent a20bc46 commit ccabb5e

File tree

7 files changed

+293
-0
lines changed

7 files changed

+293
-0
lines changed

.pylintrc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,5 @@
11
[TYPECHECK]
22
extension-pkg-whitelist=finalfusion.subword.hash_indexers, finalfusion.subword.explicit_indexer
3+
4+
[SIMILARITY]
5+
ignore-imports=yes

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ def run(self):
104104
entry_points=dict(console_scripts=[
105105
'ffp-convert=finalfusion.scripts.convert:main',
106106
'ffp-bucket-to-explicit=finalfusion.scripts.bucket_to_explicit:main',
107+
'ffp-similar=finalfusion.scripts.similar:main',
108+
'ffp-analogy=finalfusion.scripts.analogy:main',
107109
]),
108110
version="0.7.0-pre"
109111
)

src/finalfusion/scripts/analogy.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
"""
2+
Analogy queries for embeddings.
3+
"""
4+
import argparse
5+
import sys
6+
from typing import List, Set
7+
8+
from finalfusion.scripts.util import Format
9+
10+
11+
def main() -> None:
    """
    Entry point of the ``ffp-analogy`` command line tool.

    Parses the command line, loads the embeddings in the requested format and
    answers analogy queries of the form ``a : b, c : ?``.

    Queries are read line by line from the optional ``input`` argument; when
    it is omitted, file descriptor 0 (standard input) is read instead. Every
    query line must hold exactly three whitespace-separated words. For each
    query the answers are printed to stdout as ``word similarity``, one per
    line. Blank lines are skipped; malformed lines and queries for which no
    result could be computed are reported on stderr and processing continues.

    Exits with status 1 if more than three values are passed to
    ``-i/--include``.
    """
    formats = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    parser = argparse.ArgumentParser(prog="ffp-analogy",
                                     description="Analogy queries.")
    parser.add_argument(
        "-f",
        "--format",
        choices=formats,
        type=str,
        default="finalfusion",
        help=f"Valid choices: {formats} Default: 'finalfusion'",
        metavar="INPUT_FORMAT")
    parser.add_argument("embeddings",
                        help="Input embeddings",
                        type=str,
                        metavar="EMBEDDINGS")
    parser.add_argument(
        "-i",
        "--include",
        choices=["a", "b", "c"],
        nargs="+",
        default=[],
        help=
        "Specify query parts that should be allowed as answers. Valid choices: ['a', 'b', 'c']"
    )
    # The help text previously advertised "Default: 1" although the actual
    # default is 10.
    parser.add_argument("-k",
                        type=int,
                        default=10,
                        help="Number of neighbours. Default: 10",
                        metavar="K")
    # Default 0 is the stdin file descriptor, which ``open`` accepts directly.
    parser.add_argument("input", nargs='?', default=0)
    args = parser.parse_args()
    if len(args.include) > 3:
        print("-i/--include can take up to 3 unique values: a, b and c.",
              file=sys.stderr)
        sys.exit(1)
    embeds = Format(args.format).load(args.embeddings)
    with open(args.input) as queries:
        for query in queries:
            parts = query.split()
            if not parts:
                # Skip blank lines instead of failing on tuple unpacking.
                continue
            if len(parts) != 3:
                print(
                    f"Expected 3 query words but got {len(parts)}: {query.strip()}",
                    file=sys.stderr)
                continue
            query_a, query_b, query_c = parts
            skips = get_skips(query_a, query_b, query_c, args.include)
            res = embeds.analogy(query_a,
                                 query_b,
                                 query_c,
                                 k=args.k,
                                 skip=skips)
            if res is None:
                print(
                    f"Could not compute for: {query_a} : {query_b}, {query_c} : ? ",
                    file=sys.stderr)
            else:
                print("\n".join(f"{ws.word} {ws.similarity}" for ws in res))
63+
64+
65+
def get_skips(query_a: str, query_b: str, query_c: str,
              includes: List[str]) -> Set[str]:
    """
    Collect the query words that must not be returned as analogy answers.

    A query word is skipped unless its part name (``'a'``, ``'b'`` or ``'c'``)
    is listed in ``includes``. With an empty ``includes`` all three query
    words are skipped.

    :param query_a: first word of the query (part ``'a'``)
    :param query_b: second word of the query (part ``'b'``)
    :param query_c: third word of the query (part ``'c'``)
    :param includes: part names that are allowed to appear as answers
    :return: set of words to exclude from the results
    """
    parts = (("a", query_a), ("b", query_b), ("c", query_c))
    # Each part maps to its own word; the previous implementation added
    # query_b in the 'c' branch, so query_c was never skipped once any
    # include was given.
    return {word for part, word in parts if part not in includes}
78+
79+
80+
if __name__ == '__main__':
81+
main()

src/finalfusion/scripts/similar.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Similarity queries for embeddings.
3+
"""
4+
import argparse
5+
import sys
6+
7+
from finalfusion.scripts.util import Format
8+
9+
10+
def main() -> None:
    """
    Entry point of the ``ffp-similar`` command line tool.

    Loads the embeddings in the requested format, then reads one query word
    per line from the optional ``input`` argument (file descriptor 0, i.e.
    standard input, when omitted). For each non-empty query the neighbours
    are printed to stdout as ``word similarity``, one per line; queries
    without a result are reported on stderr.
    """
    supported = ["word2vec", "finalfusion", "fasttext", "text", "textdims"]
    format_help = f"Valid choices: {supported} Default: 'finalfusion'"

    cli = argparse.ArgumentParser(prog="ffp-similar",
                                  description="Similarity queries.")
    cli.add_argument("embeddings",
                     type=str,
                     help="Input embeddings",
                     metavar="EMBEDDINGS")
    cli.add_argument("-f",
                     "--format",
                     type=str,
                     choices=supported,
                     default="finalfusion",
                     help=format_help,
                     metavar="INPUT_FORMAT")
    cli.add_argument("-k",
                     type=int,
                     default=10,
                     help="Number of neighbours. Default: 10",
                     metavar="K")
    # Default 0 is the stdin file descriptor, which ``open`` accepts directly.
    cli.add_argument("input", nargs='?', default=0)
    options = cli.parse_args()

    embeddings = Format(options.format).load(options.embeddings)
    with open(options.input) as lines:
        for word in map(str.strip, lines):
            if word == "":
                # Nothing to look up on blank lines.
                continue
            neighbours = embeddings.word_similarity(word, k=options.k)
            if neighbours is not None:
                rendered = [f"{ws.word} {ws.similarity}" for ws in neighbours]
                print("\n".join(rendered))
                continue
            print(f"Could not compute neighbours for: {word}",
                  file=sys.stderr)
45+
46+
47+
if __name__ == '__main__':
48+
main()

tests/integration/all.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,19 @@ set -eu
33

44
TESTDIR="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
55

6+
if [ -v OS ] && [ "${OS}" = Windows_NT ]; then
7+
export PYTHONIOENCODING=utf-8
8+
export PYTHONUTF8=1
9+
fi
10+
611
echo conversions >&2
712
"${TESTDIR}"/conversion.sh
813

914
echo bucket-to-explicit >&2
1015
"${TESTDIR}"/bucket_to_explicit.sh
16+
17+
echo similarity >&2
18+
"${TESTDIR}"/similarity.sh
19+
20+
echo analogy >&2
21+
"${TESTDIR}"/analogy.sh

tests/integration/analogy.sh

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
#!/usr/bin/env bash
# Integration test for the ffp-analogy command line tool.
#
# Each check pipes one analogy query ("a b c") into ffp-analogy, keeps only
# the answer word of every output line (the similarity score is cut off) and
# diffs the result against a fixed expectation. Because of `set -e`, the
# first mismatching diff aborts the script with a non-zero status.
set -eu
export LC_ALL=en_US.UTF-8

# Scratch directory, removed again on exit. The checks below do not write to
# it. The template is named after this script (it used to be a copy of the
# similarity test's "run_similarity" template).
tmp_dir=$(mktemp -d /tmp/run_analogy.XXXXXX)

function finish() {
  rm -rf "$tmp_dir"
}

trap finish EXIT

# Absolute directory of this script, so the test data is found regardless of
# the current working directory.
TESTDIR="$(
  cd "$(dirname "$0")" >/dev/null 2>&1
  pwd -P
)"

EXPECTED="Deutschland
Westdeutschland
Sachsen
Mitteldeutschland
Brandenburg
Polen
Norddeutschland
Dänemark
Schleswig-Holstein
Österreich
Bayern
Thüringen
Bundesrepublik
Ostdeutschland
Preußen
Deutschen
Hessen
Potsdam
Mecklenburg
Niedersachsen
Hamburg
Süddeutschland
Bremen
Russland
Deutschlands
BRD
Litauen
Mecklenburg-Vorpommern
DDR
West-Berlin
Saarland
Lettland
Hannover
Rostock
Sachsen-Anhalt
Pommern
Schweden
Deutsche
deutschen
Westfalen"

# Default behaviour (no -i): 40 answers for Paris : Frankreich, Berlin : ?
diff <(echo Paris Frankreich Berlin | \
  ffp-analogy "${TESTDIR}/../data/simple_vocab.fifu" -k 40 | \
  cut -f 1 -d " ") \
  <(echo "${EXPECTED}")

# -i a b c: every query word is allowed as an answer.
diff <(echo Paris Frankreich Paris | \
  ffp-analogy "${TESTDIR}/../data/simple_vocab.fifu" -k 1 -i a b c | \
  cut -f 1 -d " ") \
  <(echo "Frankreich")

# -i a c: only the b word (Frankreich) is excluded from the answers.
diff <(echo Paris Frankreich Paris | \
  ffp-analogy "${TESTDIR}/../data/simple_vocab.fifu" -k 1 -i a c | \
  cut -f 1 -d " ") \
  <(echo "Russland")

# Identical query words with everything included: the word itself may win.
diff <(echo Frankreich Frankreich Frankreich | \
  ffp-analogy "${TESTDIR}/../data/simple_vocab.fifu" -k 1 -i a b c | \
  cut -f 1 -d " ") \
  <(echo "Frankreich")

# Identical query words without -i: the query word is excluded.
diff <(echo Frankreich Frankreich Frankreich | \
  ffp-analogy "${TESTDIR}/../data/simple_vocab.fifu" -k 1 | \
  cut -f 1 -d " ") \
  <(echo "Russland")

tests/integration/similarity.sh

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
#!/usr/bin/env bash
# Integration test for the ffp-similar command line tool.
#
# Each check pipes one query word into ffp-similar, keeps only the neighbour
# word of every output line (the similarity score is cut off) and diffs the
# result against a fixed expectation. Because of `set -e`, the first
# mismatching diff aborts the script with a non-zero status.
set -eu
# Scratch directory, removed again on exit. The checks below do not write to
# it.
tmp_dir=$(mktemp -d /tmp/run_similarity.XXXXXX)

function finish() {
  rm -rf "$tmp_dir"
}

trap finish EXIT

# Absolute directory of this script, so the test data is found regardless of
# the current working directory.
TESTDIR="$( cd "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"

EXPECTED="Karlsruhe
Mannheim
München
Darmstadt
Heidelberg
Wiesbaden
Kassel
Düsseldorf
Leipzig
Berlin"

# Default number of neighbours (10). The embeddings are resolved relative to
# this script instead of the caller's working directory.
diff <(echo Stuttgart | ffp-similar "${TESTDIR}/../data/similarity.fifu" | cut -f 1 -d " ") <(echo "${EXPECTED}")

EXPECTED="Potsdam
Hamburg
Leipzig
Dresden
München
Düsseldorf
Bonn
Stuttgart
Weimar
Berlin-Charlottenburg
Rostock
Karlsruhe
Chemnitz
Breslau
Wiesbaden
Hannover
Mannheim
Kassel
Köln
Danzig
Erfurt
Dessau
Bremen
Charlottenburg
Magdeburg
Neuruppin
Darmstadt
Jena
Wien
Heidelberg
Dortmund
Stettin
Schwerin
Neubrandenburg
Greifswald
Göttingen
Braunschweig
Berliner
Warschau
Berlin-Spandau"
# Explicit -k 40: expect the 40 nearest neighbours.
diff <(echo Berlin | ffp-similar "${TESTDIR}/../data/similarity.fifu" -k 40 | cut -f 1 -d " ") <(echo "${EXPECTED}")

0 commit comments

Comments
 (0)