@@ -4,9 +4,12 @@ use std::io::{BufReader, BufWriter};
44use std:: rc:: Rc ;
55
66use failure:: Error ;
7+ use finalfusion:: io as ffio;
78use finalfusion:: metadata:: Metadata ;
89use finalfusion:: prelude:: * ;
910use finalfusion:: similarity:: * ;
11+ use finalfusion:: text:: { ReadText , ReadTextDims } ;
12+ use finalfusion:: word2vec:: ReadWord2Vec ;
1013use itertools:: Itertools ;
1114use ndarray:: Array2 ;
1215use numpy:: { IntoPyArray , PyArray1 , PyArray2 } ;
@@ -44,9 +47,9 @@ impl PyEmbeddings {
4447 // First try to load embeddings with viewable storage. If that
4548 // fails, attempt to load the embeddings as non-viewable
4649 // storage.
47- let embeddings = match load_embeddings ( path, mmap) {
50+ let embeddings = match read_embeddings ( path, mmap) {
4851 Ok ( e) => Rc :: new ( RefCell :: new ( EmbeddingsWrap :: View ( e) ) ) ,
49- Err ( _) => load_embeddings ( path, mmap)
52+ Err ( _) => read_embeddings ( path, mmap)
5053 . map ( |e| Rc :: new ( RefCell :: new ( EmbeddingsWrap :: NonView ( e) ) ) )
5154 . map_err ( |err| exceptions:: IOError :: py_err ( err. to_string ( ) ) ) ?,
5255 } ;
@@ -56,6 +59,42 @@ impl PyEmbeddings {
5659 Ok ( ( ) )
5760 }
5861
62+ /// read_text(path,/)
63+ /// --
64+ ///
65+ /// Read embeddings in text format. This format uses one line per
66+ /// embedding. Each line starts with the word in UTF-8, followed
67+ /// by its vector components encoded in ASCII. The word and its
68+ /// components are separated by spaces.
69+ #[ staticmethod]
70+ fn read_text ( path : & str ) -> PyResult < PyEmbeddings > {
71+ read_non_fifu_embeddings ( path, |r| Embeddings :: read_text ( r) )
72+ }
73+
74+ /// read_text_dims(path,/)
75+ /// --
76+ ///
77+ /// Read embeddings in text format with dimensions. In this format,
78+ /// the first line states the shape of the embedding matrix. The
79+ /// number of rows (words) and columns (embedding dimensionality) is
80+ /// separated by a space character. The remainder of the file uses
81+ /// one line per embedding. Each line starts with the word in UTF-8,
82+ /// followed by its vector components encoded in ASCII. The word and
83+ /// its components are separated by spaces.
84+ #[ staticmethod]
85+ fn read_text_dims ( path : & str ) -> PyResult < PyEmbeddings > {
86+ read_non_fifu_embeddings ( path, |r| Embeddings :: read_text_dims ( r) )
87+ }
88+
89+ /// read_text_dims(path,/)
90+ /// --
91+ ///
92+ /// Read embeddings in the word2vec binary format.
93+ #[ staticmethod]
94+ fn read_word2vec ( path : & str ) -> PyResult < PyEmbeddings > {
95+ read_non_fifu_embeddings ( path, |r| Embeddings :: read_word2vec_binary ( r) )
96+ }
97+
5998 /// Get the model's vocabulary.
6099 fn vocab ( & self ) -> PyResult < PyVocab > {
61100 Ok ( PyVocab :: new ( self . embeddings . clone ( ) ) )
@@ -283,7 +322,7 @@ impl PyIterProtocol for PyEmbeddings {
283322 }
284323}
285324
286- fn load_embeddings < S > ( path : & str , mmap : bool ) -> Result < Embeddings < VocabWrap , S > , Error >
325+ fn read_embeddings < S > ( path : & str , mmap : bool ) -> Result < Embeddings < VocabWrap , S > , Error >
287326where
288327 Embeddings < VocabWrap , S > : ReadEmbeddings + MmapEmbeddings ,
289328{
@@ -298,3 +337,27 @@ where
298337
299338 Ok ( embeddings)
300339}
340+
341+ fn read_non_fifu_embeddings < R > ( path : & str , read_embeddings : R ) -> PyResult < PyEmbeddings >
342+ where
343+ R : FnOnce ( & mut BufReader < File > ) -> ffio:: Result < Embeddings < SimpleVocab , NdArray > > ,
344+ {
345+ let f = File :: open ( path) . map_err ( |err| {
346+ exceptions:: IOError :: py_err ( format ! (
347+ "Cannot read text embeddings from '{}': {}'" ,
348+ path, err
349+ ) )
350+ } ) ?;
351+ let mut reader = BufReader :: new ( f) ;
352+
353+ let embeddings = read_embeddings ( & mut reader) . map_err ( |err| {
354+ exceptions:: IOError :: py_err ( format ! (
355+ "Cannot read text embeddings from '{}': {}'" ,
356+ path, err
357+ ) )
358+ } ) ?;
359+
360+ Ok ( PyEmbeddings {
361+ embeddings : Rc :: new ( RefCell :: new ( EmbeddingsWrap :: View ( embeddings. into ( ) ) ) ) ,
362+ } )
363+ }
0 commit comments