3636from camel_tools .morphology .database import MorphologyDB
3737from camel_tools .morphology .analyzer import Analyzer
3838
39- from camel_tools .data import DataCatalogue
39+ from camel_tools .data import Catalogue
4040from camel_tools .disambig .common import Disambiguator , DisambiguatedWord
4141from camel_tools .disambig .common import ScoredAnalysis
4242
@@ -56,34 +56,35 @@ def _read_json(f_path):
5656
5757
5858class BERTUnfactoredDisambiguator (Disambiguator ):
59- """A disambiguator using a BERT model that predicts unfactored tag.
59+ """A disambiguator using an unfactored BERT model. This model is based on
60+ *Morphosyntactic Tagging with Pre-trained Language Models for Arabic and
61+ its Dialects* by Inoue, Khalifa, and Habash. Findings of ACL 2022.
62+ (https://arxiv.org/abs/2110.06852)
6063
6164 Args:
6265 model_path (:obj:`str`): The path to the fine-tuned model.
6366 analyzer (:obj:`~camel_tools.morphology.analyzer.Analyzer`): Analyzer
64- to use to provide full morphological analysis of a word.
65- features: :obj:`list`, optional): The list of features used in the
66- feature match function . Defaults to a list of all the 16 features.
67+ to use for providing full morphological analysis of a word.
68+ features (:obj:`list`, optional): A list of morphological features
69+ used in the model . Defaults to 14 features.
6770 top (:obj:`int`, optional): The maximum number of top analyses to
6871 return. Defaults to 1.
6972 scorer (:obj:`str`, optional): The scoring function that matches the
7073 predicted features from the model and the values in the analyses.
7174 If uniform, the scoring based on the uniform weight is used.
72- Defaults to uniform.
75+ Defaults to ` uniform` .
7376 tie_breaker (:obj:`str`, optional): The tie breaker used in the feature
74- match function.
75- If tag, tie breaking based on the tag MLE and factored tag MLE is
76- used.
77- Defaults to tag.
77+ match function. If `tag`, tie breaking based on the tag MLE and
78+ factored tag MLE is used. Defaults to `tag`.
7879 use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
7980 Defaults to True.
80- batch_size (:obj:`int`): The batch size. Defaults to 32.
81- ranking_cache (:obj: `dict`): The cache dictionary of pre-computed
82- scored analyses. Defaults to None.
81+ batch_size (:obj:`int`, optional ): The batch size. Defaults to 32.
82+ ranking_cache (:obj:`dict`, optional ): The cache dictionary of
83+ pre-computed scored analyses. Defaults to ` None` .
8384 """
8485
8586 def __init__ (self , model_path , analyzer ,
86- features = FEATURE_SET_MAP ['feats_16 ' ], top = 1 ,
87+ features = FEATURE_SET_MAP ['feats_14 ' ], top = 1 ,
8788 scorer = 'uniform' , tie_breaker = 'tag' , use_gpu = True ,
8889 batch_size = 32 , ranking_cache = None ):
8990 self .model = {
@@ -112,25 +113,26 @@ def pretrained(model_name=None, top=1, use_gpu=True, batch_size=32,
112113
113114 Args:
114115 model_name (:obj:`str`, optional): Name of pre-trained model to
115- load. Two models are available: 'msa', and 'egy'.
116- If None, the default model ('msa') will be loaded. Defaults to
117- None.
116+ load. Three models are available: 'msa', 'egy', and 'glf'.
117+ If ` None` , the default model ('msa') will be loaded.
118+ Defaults to ` None` .
118119 top (:obj:`int`, optional): The maximum number of top analyses to
119120 return. Defaults to 1.
120121 use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
121122 Defaults to True.
122- batch_size (:obj:`int`): The batch size. Defaults to 32.
123- cache_size (:obj:`int`, optional): The number of unique word
124- disambiguations to cache. The cache uses a
125- least-frequently-used eviction policy. Defaults to 100000.
123+ batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
124+ cache_size (:obj:`int`, optional): If greater than zero, then
125+ the analyzer will cache the analyses for the cache_size most
126+ frequent words, otherwise no analyses will be cached.
127+ Defaults to 100000.
126128
127129 Returns:
128130 :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
129131 pre-trained model.
130132 """
131133
132- model_info = DataCatalogue . get_dataset_info ('DisambigBertUnfactored' ,
133- model_name )
134+ model_info = Catalogue . get_dataset ('DisambigBertUnfactored' ,
135+ model_name )
134136 model_config = _read_json (Path (model_info .path , 'default_config.json' ))
135137 model_path = str (model_info .path )
136138 features = FEATURE_SET_MAP [model_config ['feature' ]]
@@ -153,18 +155,22 @@ def pretrained(model_name=None, top=1, use_gpu=True, batch_size=32,
153155
154156 def pretrained_from_config (config , top = 1 , use_gpu = True , batch_size = 32 ,
155157 cache_size = 10000 ):
156- """Load a pre-trained model with custom config file.
158+ """Load a pre-trained model from a config file.
159+
157160 Args:
158161 config (:obj:`str`): Config file that defines the model
159- details. Defaults to None.
162+ details.
160163 top (:obj:`int`, optional): The maximum number of top analyses
161164 to return. Defaults to 1.
162165 use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
163166 Defaults to True.
164- batch_size (:obj:`int`): The batch size. Defaults to 32.
165- cache_size (:obj:`int`, optional): The number of unique word
166- disambiguations to cache. The cache uses a
167- least-frequently-used eviction policy. Defaults to 10000.
167+ batch_size (:obj:`int`, optional): The batch size.
168+ Defaults to 32.
169+ cache_size (:obj:`int`, optional): If greater than zero, then
170+ the analyzer will cache the analyses for the cache_size
171+ most frequent words, otherwise no analyses will be cached.
172+ Defaults to 10000.
173+
168174 Returns:
169175 :obj:`BERTUnfactoredDisambiguator`: Instance with loaded
170176 pre-trained model.
@@ -191,7 +197,7 @@ def pretrained_from_config(config, top=1, use_gpu=True, batch_size=32,
191197 ranking_cache = ranking_cache )
192198
193199 def _predict_sentences (self , sentences ):
194- """Predict the morphosyntactic labels of multiple sentences.
200+ """Predict the morphosyntactic labels of a list of sentences.
195201
196202 Args:
197203 sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
@@ -222,14 +228,14 @@ def _predict_sentences(self, sentences):
222228 return parsed_predictions
223229
224230 def _predict_sentence (self , sentence ):
225- """Predict morphosyntactic labels of a single sentence.
231+ """Predict the morphosyntactic labels of a single sentence.
226232
227233 Args:
228234 sentence (:obj:`list` of :obj:`str`): The input sentence.
229235
230236 Returns:
231- :obj:`list` of :obj:`dict`: The predicted morphosyntactic
232- labels for the given sentence.
237+ :obj:`list` of :obj:`dict`: The predicted morphosyntactic labels
238+ for the given sentence.
233239 """
234240
235241 parsed_predictions = []
@@ -286,12 +292,10 @@ def _disambiguate_word(self, word, pred):
286292 return DisambiguatedWord (word , scored_analyses )
287293
288294 def disambiguate_word (self , sentence , word_ndx ):
289- """Disambiguates a single word in a sentence.
295+ """Disambiguates a single word of a sentence.
290296
291297 Args:
292- sentence (:obj:`list` of :obj:`str`): The list of space and
293- punctuation seperated list of tokens comprising a given
294- sentence.
298+ sentence (:obj:`list` of :obj:`str`): The input sentence.
295299 word_ndx (:obj:`int`): The index of the word token in `sentence` to
296300 disambiguate.
297301
@@ -303,16 +307,14 @@ def disambiguate_word(self, sentence, word_ndx):
303307 return self .disambiguate (sentence )[word_ndx ]
304308
305309 def disambiguate (self , sentence ):
306- """Disambiguate all words given a sentence.
310+ """Disambiguate all words of a single sentence.
307311
308312 Args:
309- sentence (:obj:`list` of :obj:`str`): The list of space and
310- punctuation seperated list of tokens comprising a given
311- sentence.
313+ sentence (:obj:`list` of :obj:`str`): The input sentence.
312314
313315 Returns:
314316 :obj:`list` of :obj:`~camel_tools.disambig.common.DisambiguatedWord`: The
315- list of disambiguations for each word in the given sentence.
317+ disambiguated analyses for the given sentence.
316318 """
317319
318320 predictions = self ._predict_sentence (sentence )
@@ -321,16 +323,15 @@ def disambiguate(self, sentence):
321323 for (w , p ) in zip (sentence , predictions )]
322324
323325 def disambiguate_sentences (self , sentences ):
324- """Disambiguate all words given a list of sentences.
326+ """Disambiguate all words of a list of sentences.
325327
326328 Args:
327- sentences (:obj:`list` of :obj:`list` of :obj:`str`): The list of
328- space and punctuation seperated list of tokens comprising a
329- given sentence.
329+ sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
330+ sentences.
330331
331332 Returns:
332- :obj:`list` of :obj:`~camel_tools.disambig.common.DisambiguatedWord`: The
333- list of disambiguations for each word in the given sentence .
333+ :obj:`list` of :obj:`list` of :obj:` ~camel_tools.disambig.common.DisambiguatedWord`: The
334+ disambiguated analyses for the given sentences .
334335 """
335336
336337 predictions = self ._predict_sentences (sentences )
@@ -346,7 +347,7 @@ def disambiguate_sentences(self, sentences):
346347 return disambiguated_sentences
347348
348349 def tag_sentences (self , sentences , use_analyzer = True ):
349- """Tag morphosyntactic features given a list of sentences.
350+ """Predict the morphosyntactic labels of a list of sentences.
350351
351352 Args:
352353 sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
@@ -356,7 +357,7 @@ def tag_sentences(self, sentences, use_analyzer=True):
356357 Defaults to True.
357358
358359 Returns:
359- :obj:`list` of :obj:`list` of :obj:`dict`: The list of feature tags
360+ :obj:`list` of :obj:`list` of :obj:`dict`: The list of predicted feature tags
360361 for each word in the given sentences
361362 """
362363
@@ -372,7 +373,7 @@ def tag_sentences(self, sentences, use_analyzer=True):
372373 return tagged_sentences
373374
374375 def tag_sentence (self , sentence , use_analyzer = True ):
375- """Tag morphosyntactic features given a sentence .
376+ """Predict the morphosyntactic labels of a single sentence.
376377
377378 Args:
378379 sentence (:obj:`list` of :obj:`str`): The list of space and
@@ -421,7 +422,7 @@ class BERTFeatureTagger:
421422 Args:
422423 model_path (:obj:`str`): The path to the fine-tuned model.
423424 use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
424- Defaults to True.
425+ Defaults to True.
425426 """
426427
427428 def __init__ (self , model_path , use_gpu = True ):
0 commit comments