@@ -148,19 +148,16 @@ class FinalfusionBucketVocab(SubwordVocab):
148148 """
149149 def __init__ (self ,
150150 words : List [str ],
151- indexer : FinalfusionHashIndexer = None ):
151+ indexer : Optional [ FinalfusionHashIndexer ] = None ):
152152 """
153153 Initialize a FinalfusionBucketVocab.
154154
155- Initializes the vocabulary with the given words and optional index and
156- indexer.
155+ Initializes the vocabulary with the given words.
157156
158157 If no indexer is passed, a FinalfusionHashIndexer with bucket exponent
159158 21 is used.
160159
161- If no index is given, the nth word in the `words` list is assigned
162- index `n`. The word list cannot contain duplicate entries and it needs
163- to be of same length as the index.
160+ The word list cannot contain duplicate entries.
164161
165162 Parameters
166163 ----------
@@ -211,6 +208,70 @@ def chunk_identifier() -> ChunkIdentifier:
211208 return ChunkIdentifier .BucketSubwordVocab
212209
213210
class FastTextVocab(SubwordVocab):
    """
    FastText vocabulary

    Subword vocabulary that pairs a list of unique in-vocabulary words with
    a FastTextIndexer. It is (de)serialized as a
    ``ChunkIdentifier.FastTextSubwordVocab`` chunk.
    """
    def __init__(self,
                 words: List[str],
                 indexer: Optional[FastTextIndexer] = None):
        """
        Initialize a FastTextVocab.

        Initializes the vocabulary with the given words.

        If no indexer is passed, a FastTextIndexer with 2_000_000 buckets is
        used.

        The word list cannot contain duplicate entries.

        Parameters
        ----------
        words : List[str]
            List of unique words
        indexer : FastTextIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2_000_000 buckets and range 3-6.

        Raises
        ------
        AssertionError
            If the indexer is not a FastTextIndexer or ``words`` contains
            duplicate entries.
        """
        if indexer is None:
            indexer = FastTextIndexer(2000000)
        assert isinstance(indexer, FastTextIndexer)
        super().__init__()
        # Build the word -> index mapping first: the helper validates the
        # word list, so nothing is stored on the instance for invalid input.
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer

    @property
    def subword_indexer(self) -> FastTextIndexer:
        """
        Get the subword indexer of this vocabulary.

        Returns
        -------
        indexer : FastTextIndexer
            The indexer passed to (or created by) the constructor.
        """
        return self._indexer

    @property
    def words(self) -> List[str]:
        """
        Get the list of in-vocabulary words.

        Returns
        -------
        words : List[str]
            The word list this vocabulary was constructed with.
        """
        return self._words

    @property
    def word_index(self) -> Dict[str, int]:
        """
        Get the mapping from in-vocabulary words to their indices.

        Returns
        -------
        index : Dict[str, int]
            The index created from the word list at construction time.
        """
        return self._index

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'FastTextVocab':
        """
        Read a FastTextVocab from the given binary file.

        Reads a little-endian header (``"<QIII"``) holding the number of
        words, ``min_n``, ``max_n`` and the number of buckets, followed by
        the words themselves.

        Parameters
        ----------
        file : BinaryIO
            File to read from. NOTE(review): presumably positioned right
            after the chunk header, as left by ``find_chunk`` - confirm.

        Returns
        -------
        vocab : FastTextVocab
            The vocabulary stored in the chunk.
        """
        length, min_n, max_n, buckets = _read_required_binary(file, "<QIII")
        words = _read_items(file, length)
        indexer = FastTextIndexer(buckets, min_n, max_n)
        return FastTextVocab(words, indexer)

    def write_chunk(self, file: BinaryIO):
        """
        Write this vocabulary as a chunk to the given binary file.

        Serialization is delegated to ``_write_bucket_vocab``.

        Parameters
        ----------
        file : BinaryIO
            File to write to.
        """
        _write_bucket_vocab(file, self)

    @staticmethod
    def chunk_identifier() -> ChunkIdentifier:
        """
        Get the chunk identifier of this vocabulary type.

        Returns
        -------
        identifier : ChunkIdentifier
            Always ``ChunkIdentifier.FastTextSubwordVocab``.
        """
        return ChunkIdentifier.FastTextSubwordVocab
273+
274+
214275def load_finalfusion_bucket_vocab (file : Union [str , bytes , int , PathLike ]
215276 ) -> FinalfusionBucketVocab :
216277 """
@@ -233,7 +294,30 @@ def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
233294 return FinalfusionBucketVocab .read_chunk (inf )
234295
235296
236- def _write_bucket_vocab (file : BinaryIO , vocab : FinalfusionBucketVocab ):
def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> FastTextVocab:
    """
    Load a FastTextVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to file containing a FastTextVocab chunk.

    Returns
    -------
    vocab : FastTextVocab
        Returns the first FastTextVocab in the file.

    Raises
    ------
    ValueError
        If the file does not contain a FastTextVocab chunk.
    """
    with open(file, "rb") as inf:
        # find_chunk is asked for the FastText vocab chunk only; a None
        # result is treated as "no such chunk in this file".
        chunk = find_chunk(inf, [ChunkIdentifier.FastTextSubwordVocab])
        if chunk is None:
            # The original message ended in a stray '}'.
            raise ValueError('File did not contain a FastTextVocab')
        return FastTextVocab.read_chunk(inf)
317+
318+
319+ def _write_bucket_vocab (file : BinaryIO ,
320+ vocab : Union [FastTextVocab , FinalfusionBucketVocab ]):
237321 min_n_max_n_size = struct .calcsize ("<II" )
238322 buckets_size = struct .calcsize ("<I" )
239323 chunk_length = _calculate_binary_list_size (vocab .words )
@@ -254,5 +338,6 @@ def _write_bucket_vocab(file: BinaryIO, vocab: FinalfusionBucketVocab):
254338
255339
# Public API of this module: the subword vocabulary types and their loaders.
__all__ = [
    'SubwordVocab',
    'FinalfusionBucketVocab',
    'load_finalfusion_bucket_vocab',
    'FastTextVocab',
    'load_fasttext_vocab',
]
0 commit comments