1010from finalfusion .io import ChunkIdentifier , find_chunk , _write_binary , _read_required_binary
1111from finalfusion .subword import ExplicitIndexer , FastTextIndexer , FinalfusionHashIndexer , ngrams
1212from finalfusion .vocab .vocab import Vocab , _validate_items_and_create_index , \
13- _calculate_binary_list_size , _write_words_binary , _read_items
13+ _calculate_binary_list_size , _write_words_binary , _read_items , _read_items_with_indices
1414
1515
1616class SubwordVocab (Vocab ):
@@ -133,8 +133,7 @@ def __repr__(self) -> str:
133133 return f"{ type (self ).__name__ } (\n " \
134134 f"\t indexer={ self .subword_indexer } \n " \
135135 "\t words=[...]\n " \
136- "\t word_index={{...}}\n " \
137- ")"
136+ "\t word_index={{...}})"
138137
139138 def __eq__ (self , other : Any ) -> bool :
140139 return isinstance (other , type (self )) and \
@@ -272,6 +271,84 @@ def chunk_identifier():
272271 return ChunkIdentifier .FastTextSubwordVocab
273272
274273
class ExplicitVocab(SubwordVocab):
    """
    A vocabulary with explicitly stored n-grams.
    """
    def __init__(self, words: List[str], indexer: ExplicitIndexer):
        """
        Initialize an ExplicitVocab.

        Initializes the vocabulary with the given words and ExplicitIndexer.

        The word list cannot contain duplicate entries.

        Parameters
        ----------
        words : List[str]
            List of unique words
        indexer : ExplicitIndexer
            Subword indexer to use for the vocabulary.

        Raises
        ------
        AssertionError
            If the indexer is not an ExplicitIndexer.

        See Also
        --------
        :class:`.ExplicitIndexer`
        """
        assert isinstance(indexer, ExplicitIndexer)
        super().__init__()
        # The word -> index mapping is built by the shared validation helper.
        self._index = _validate_items_and_create_index(words)
        self._words = words
        self._indexer = indexer

    @property
    def word_index(self) -> dict:
        """
        Get the index created from the word list at construction time.

        Returns
        -------
        index : dict
            The mapping returned by the item validation helper.
        """
        return self._index

    @property
    def subword_indexer(self) -> ExplicitIndexer:
        """
        Get the indexer this vocabulary was constructed with.

        Returns
        -------
        indexer : ExplicitIndexer
            The explicit n-gram indexer.
        """
        return self._indexer

    @property
    def words(self) -> list:
        """
        Get the word list this vocabulary was constructed with.

        Returns
        -------
        words : list
            The list of words.
        """
        return self._words

    @staticmethod
    def chunk_identifier():
        """
        Get the chunk identifier used for this vocabulary type.

        Returns
        -------
        identifier : ChunkIdentifier
            ``ChunkIdentifier.ExplicitSubwordVocab``
        """
        return ChunkIdentifier.ExplicitSubwordVocab

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
        """
        Read an ExplicitVocab from the given file.

        Reads the word count, the n-gram count, ``min_n`` and ``max_n``,
        followed by the words and finally the n-grams with their indices.

        Parameters
        ----------
        file : BinaryIO
            File to read from. Presumably positioned directly after the
            chunk identifier and chunk length, since neither is read here.

        Returns
        -------
        vocab : ExplicitVocab
            The vocabulary built from the serialized words and n-grams.
        """
        length, ngram_length, min_n, max_n = _read_required_binary(
            file, "<QQII")
        words = _read_items(file, length)
        ngram_list, ngram_index = _read_items_with_indices(file, ngram_length)
        indexer = ExplicitIndexer(ngram_list, min_n, max_n, ngram_index)
        return ExplicitVocab(words, indexer)

    def write_chunk(self, file: BinaryIO) -> None:
        """
        Write this vocabulary as a chunk to the given file.

        Writes the chunk identifier, the chunk length, the number of words,
        the number of n-grams, ``min_n`` and ``max_n``, followed by the
        words and then each n-gram as length-prefixed UTF-8 bytes with its
        index.

        Parameters
        ----------
        file : BinaryIO
            File to write to.
        """
        indexer = self.subword_indexer
        ngram_list = indexer.ngrams
        ngram_index = indexer.ngram_index

        chunk_length = _calculate_binary_list_size(self.words)
        chunk_length += _calculate_binary_list_size(ngram_list)
        chunk_length += struct.calcsize("<II")
        # Every n-gram is followed by its "<Q" index. The list-size helper
        # is also used for the plain word list, so it cannot account for
        # these bytes; add them here so that the declared chunk length
        # matches what is actually written below.
        chunk_length += len(ngram_list) * struct.calcsize("<Q")

        chunk_header = (int(self.chunk_identifier()), chunk_length,
                        len(self.words), len(ngram_list),
                        self.min_n, self.max_n)
        _write_binary(file, "<IQQQII", *chunk_header)
        _write_words_binary((bytes(word, "utf-8") for word in self.words),
                            file)
        for ngram in ngram_list:
            b_ngram = ngram.encode("utf-8")
            _write_binary(file, "<I", len(b_ngram))
            file.write(b_ngram)
            _write_binary(file, "<Q", ngram_index[ngram])
351+
275352def load_finalfusion_bucket_vocab (file : Union [str , bytes , int , PathLike ]
276353 ) -> FinalfusionBucketVocab :
277354 """
@@ -316,6 +393,28 @@ def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
316393 return FastTextVocab .read_chunk (inf )
317394
318395
def load_explicit_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> ExplicitVocab:
    """
    Load an ExplicitVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to file containing an ExplicitVocab chunk.

    Returns
    -------
    vocab : ExplicitVocab
        Returns the first ExplicitVocab in the file.

    Raises
    ------
    ValueError
        If the file did not contain an ExplicitVocab chunk.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.ExplicitSubwordVocab])
        if chunk is None:
            # Name the vocab type this loader actually searched for.
            raise ValueError('File did not contain an ExplicitVocab')
        return ExplicitVocab.read_chunk(inf)
416+
417+
319418def _write_bucket_vocab (file : BinaryIO ,
320419 vocab : Union [FastTextVocab , FinalfusionBucketVocab ]):
321420 min_n_max_n_size = struct .calcsize ("<II" )
@@ -339,5 +438,6 @@ def _write_bucket_vocab(file: BinaryIO,
339438
# Names exported by ``from <module> import *``.
__all__ = [
    'SubwordVocab',
    'FinalfusionBucketVocab',
    'load_finalfusion_bucket_vocab',
    'FastTextVocab',
    'load_fasttext_vocab',
    'ExplicitVocab',
    'load_explicit_vocab',
]
0 commit comments