Skip to content

Commit ee98cdb

Browse files
danieldk Daniël de Kok
authored and committed
Add a Python binding for finalfrontier.
1 parent 6a68c0c commit ee98cdb

File tree

3 files changed

+180
-0
lines changed

3 files changed

+180
-0
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/target
2+
**/*.rs.bk
3+
Cargo.lock

Cargo.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[package]
2+
name = "finalfrontier-python"
3+
version = "0.1.0"
4+
authors = ["Daniël de Kok <[email protected]>"]
5+
6+
[lib]
7+
name = "finalfrontier"
8+
crate-type = ["cdylib"]
9+
10+
[dependencies.pyo3]
11+
version = "0.4"
12+
features = ["extension-module"]
13+
14+
[dependencies]
15+
failure = "0.1"
16+
finalfrontier = { version = "0.1", path = "../finalfrontier/finalfrontier" }

src/lib.rs

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#![feature(specialization)]
2+
3+
extern crate failure;
4+
extern crate finalfrontier;
5+
extern crate pyo3;
6+
7+
use std::fs::File;
8+
use std::io::BufReader;
9+
10+
use failure::Error;
11+
use finalfrontier::similarity::{Analogy, Similarity};
12+
use finalfrontier::{MmapModelBinary, ReadModelBinary};
13+
use pyo3::prelude::*;
14+
15+
/// This is a binding for finalfrontier.
16+
///
17+
/// finalfrontier is a library and set of programs for training
18+
/// word embeddings with subword units. The Python binding can
19+
/// be used to query the resulting embeddings and do similarity
20+
/// queries.
21+
#[pymodinit]
22+
fn finalfrontier(_py: Python, m: &PyModule) -> PyResult<()> {
23+
m.add_class::<PythonModel>()?;
24+
m.add_class::<PythonWordSimilarity>()?;
25+
Ok(())
26+
}
27+
28+
/// A word and its similarity to a query word.
29+
///
30+
/// The similarity is normally a value between -1 (opposite
31+
/// vectors) and 1 (identical vectors).
32+
#[pyclass(name=WordSimilarity)]
33+
struct PythonWordSimilarity {
34+
#[prop(get)]
35+
word: String,
36+
37+
#[prop(get)]
38+
similarity: f32,
39+
40+
token: PyToken,
41+
}
42+
43+
#[pyproto]
44+
impl PyObjectProtocol for PythonWordSimilarity {
45+
fn __repr__(&self) -> PyResult<String> {
46+
Ok(format!(
47+
"WordSimilarity('{}', {})",
48+
self.word, self.similarity
49+
))
50+
}
51+
52+
fn __str__(&self) -> PyResult<String> {
53+
Ok(format!("{}: {}", self.word, self.similarity))
54+
}
55+
}
56+
57+
/// A finalfrontier model.
58+
#[pyclass(name=Model)]
59+
struct PythonModel {
60+
model: finalfrontier::Model,
61+
token: PyToken,
62+
}
63+
64+
#[pymethods]
65+
impl PythonModel {
66+
/// Load a model from the given `path`.
67+
///
68+
/// When the `mmap` argument is `True`, the embedding matrix is
69+
/// not loaded into memory, but memory mapped. This results in
70+
/// lower memory use and shorter model load times, while sacrificing
71+
/// some query efficiency.
72+
#[new]
73+
#[args(mmap = false)]
74+
fn __new__(obj: &PyRawObject, path: &str, mmap: bool) -> PyResult<()> {
75+
let model = match load_model(path, mmap) {
76+
Ok(model) => model,
77+
Err(err) => {
78+
return Err(exc::IOError::new(err.to_string()));
79+
}
80+
};
81+
82+
obj.init(|token| PythonModel { model, token })
83+
}
84+
85+
/// Perform an anology query.
86+
///
87+
/// This returns words for the analogy query *w1* is to *w2*
88+
/// as *w3* is to ?.
89+
#[args(limit = 10)]
90+
fn analogy(
91+
&self,
92+
py: Python,
93+
word1: &str,
94+
word2: &str,
95+
word3: &str,
96+
limit: usize,
97+
) -> PyResult<Vec<PyObject>> {
98+
let results = match self.model.analogy(word1, word2, word3, limit) {
99+
Some(results) => results,
100+
None => return Err(exc::KeyError::new("Unknown word and n-grams")),
101+
};
102+
103+
let mut r = Vec::with_capacity(results.len());
104+
for ws in results {
105+
r.push(
106+
Py::new(py, |token| PythonWordSimilarity {
107+
word: ws.word.to_owned(),
108+
similarity: ws.similarity.into_inner(),
109+
token,
110+
})?.into_object(py),
111+
)
112+
}
113+
114+
Ok(r)
115+
}
116+
117+
/// Get the embedding for the given word.
118+
///
119+
/// If the word is not known, its representation is approximated
120+
/// using subword units.
121+
fn embedding(&self, word: &str) -> PyResult<Vec<f32>> {
122+
match self.model.embedding(word) {
123+
Some(embedding) => Ok(embedding.to_vec()),
124+
None => Err(exc::KeyError::new("Unknown word and n-grams")),
125+
}
126+
}
127+
128+
/// Perform a similarity query.
129+
#[args(limit = 10)]
130+
fn similarity(&self, py: Python, word: &str, limit: usize) -> PyResult<Vec<PyObject>> {
131+
let results = match self.model.similarity(word, limit) {
132+
Some(results) => results,
133+
None => return Err(exc::KeyError::new("Unknown word and n-grams")),
134+
};
135+
136+
let mut r = Vec::with_capacity(results.len());
137+
for ws in results {
138+
r.push(
139+
Py::new(py, |token| PythonWordSimilarity {
140+
word: ws.word.to_owned(),
141+
similarity: ws.similarity.into_inner(),
142+
token,
143+
})?.into_object(py),
144+
)
145+
}
146+
147+
Ok(r)
148+
}
149+
}
150+
151+
fn load_model(path: &str, mmap: bool) -> Result<finalfrontier::Model, Error> {
152+
let f = File::open(path)?;
153+
154+
let model = if mmap {
155+
finalfrontier::Model::mmap_model_binary(f)?
156+
} else {
157+
finalfrontier::Model::read_model_binary(&mut BufReader::new(f))?
158+
};
159+
160+
Ok(model)
161+
}

0 commit comments

Comments
 (0)