Skip to content

Commit 20988b9

Browse files
committed
Update docs
1 parent afcb04b commit 20988b9

File tree

1 file changed

+53
-52
lines changed

1 file changed

+53
-52
lines changed

camel_tools/disambig/bert/__init__.py

Lines changed: 53 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@
3636
from camel_tools.morphology.database import MorphologyDB
3737
from camel_tools.morphology.analyzer import Analyzer
3838

39-
from camel_tools.data import DataCatalogue
39+
from camel_tools.data import Catalogue
4040
from camel_tools.disambig.common import Disambiguator, DisambiguatedWord
4141
from camel_tools.disambig.common import ScoredAnalysis
4242

@@ -56,34 +56,35 @@ def _read_json(f_path):
5656

5757

5858
class BERTUnfactoredDisambiguator(Disambiguator):
59-
"""A disambiguator using a BERT model that predicts unfactored tag.
59+
"""A disambiguator using an unfactored BERT model. This model is based on
60+
*Morphosyntactic Tagging with Pre-trained Language Models for Arabic and
61+
its Dialects* by Inoue, Khalifa, and Habash. Findings of ACL 2022.
62+
(https://arxiv.org/abs/2110.06852)
6063
6164
Args:
6265
model_path (:obj:`str`): The path to the fine-tuned model.
6366
analyzer (:obj:`~camel_tools.morphology.analyzer.Analyzer`): Analyzer
64-
to use to provide full morphological analysis of a word.
65-
features: :obj:`list`, optional): The list of features used in the
66-
feature match function. Defaults to a list of all the 16 features.
67+
to use for providing full morphological analysis of a word.
68+
features (:obj:`list`, optional): A list of morphological features
69+
used in the model. Defaults to 14 features.
6770
top (:obj:`int`, optional): The maximum number of top analyses to
6871
return. Defaults to 1.
6972
scorer (:obj:`str`, optional): The scoring function that matches the
7073
predicted features from the model and the values in the analyses.
7174
If uniform, the scoring based on the uniform weight is used.
72-
Defaults to uniform.
75+
Defaults to `uniform`.
7376
tie_breaker (:obj:`str`, optional): The tie breaker used in the feature
74-
match function.
75-
If tag, tie breaking based on the tag MLE and factored tag MLE is
76-
used.
77-
Defaults to tag.
77+
match function. If `tag`, tie breaking based on the tag MLE and
78+
factored tag MLE is used. Defaults to `tag`.
7879
use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
7980
Defaults to True.
80-
batch_size (:obj:`int`): The batch size. Defaults to 32.
81-
ranking_cache (:obj: `dict`): The cache dictionary of pre-computed
82-
scored analyses. Defaults to None.
81+
batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
82+
ranking_cache (:obj:`dict`, optional): The cache dictionary of
83+
pre-computed scored analyses. Defaults to `None`.
8384
"""
8485

8586
def __init__(self, model_path, analyzer,
86-
features=FEATURE_SET_MAP['feats_16'], top=1,
87+
features=FEATURE_SET_MAP['feats_14'], top=1,
8788
scorer='uniform', tie_breaker='tag', use_gpu=True,
8889
batch_size=32, ranking_cache=None):
8990
self.model = {
@@ -112,25 +113,26 @@ def pretrained(model_name=None, top=1, use_gpu=True, batch_size=32,
112113
113114
Args:
114115
model_name (:obj:`str`, optional): Name of pre-trained model to
115-
load. Two models are available: 'msa', and 'egy'.
116-
If None, the default model ('msa') will be loaded. Defaults to
117-
None.
116+
load. Three models are available: 'msa', 'egy', and 'glf'.
117+
If `None`, the default model ('msa') will be loaded.
118+
Defaults to `None`.
118119
top (:obj:`int`, optional): The maximum number of top analyses to
119120
return. Defaults to 1.
120121
use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
121122
Defaults to True.
122-
batch_size (:obj:`int`): The batch size. Defaults to 32.
123-
cache_size (:obj:`int`, optional): The number of unique word
124-
disambiguations to cache. The cache uses a
125-
least-frequently-used eviction policy. Defaults to 100000.
123+
batch_size (:obj:`int`, optional): The batch size. Defaults to 32.
124+
cache_size (:obj:`int`, optional): If greater than zero, then
125+
the analyzer will cache the analyses for the cache_size most
126+
frequent words, otherwise no analyses will be cached.
127+
Defaults to 100000.
126128
127129
Returns:
128130
:obj:`BERTUnfactoredDisambiguator`: Instance with loaded
129131
pre-trained model.
130132
"""
131133

132-
model_info = DataCatalogue.get_dataset_info('DisambigBertUnfactored',
133-
model_name)
134+
model_info = Catalogue.get_dataset('DisambigBertUnfactored',
135+
model_name)
134136
model_config = _read_json(Path(model_info.path, 'default_config.json'))
135137
model_path = str(model_info.path)
136138
features = FEATURE_SET_MAP[model_config['feature']]
@@ -153,18 +155,22 @@ def pretrained(model_name=None, top=1, use_gpu=True, batch_size=32,
153155

154156
def pretrained_from_config(config, top=1, use_gpu=True, batch_size=32,
155157
cache_size=10000):
156-
"""Load a pre-trained model with custom config file.
158+
"""Load a pre-trained model from a config file.
159+
157160
Args:
158161
config (:obj:`str`): Config file that defines the model
159-
details. Defaults to None.
162+
details.
160163
top (:obj:`int`, optional): The maximum number of top analyses
161164
to return. Defaults to 1.
162165
use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
163166
Defaults to True.
164-
batch_size (:obj:`int`): The batch size. Defaults to 32.
165-
cache_size (:obj:`int`, optional): The number of unique word
166-
disambiguations to cache. The cache uses a
167-
least-frequently-used eviction policy. Defaults to 10000.
167+
batch_size (:obj:`int`, optional): The batch size.
168+
Defaults to 32.
169+
cache_size (:obj:`int`, optional): If greater than zero, then
170+
the analyzer will cache the analyses for the cache_size
171+
most frequent words, otherwise no analyses will be cached.
172+
Defaults to 10000.
173+
168174
Returns:
169175
:obj:`BERTUnfactoredDisambiguator`: Instance with loaded
170176
pre-trained model.
@@ -191,7 +197,7 @@ def pretrained_from_config(config, top=1, use_gpu=True, batch_size=32,
191197
ranking_cache=ranking_cache)
192198

193199
def _predict_sentences(self, sentences):
194-
"""Predict the morphosyntactic labels of multiple sentences.
200+
"""Predict the morphosyntactic labels of a list of sentences.
195201
196202
Args:
197203
sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
@@ -222,14 +228,14 @@ def _predict_sentences(self, sentences):
222228
return parsed_predictions
223229

224230
def _predict_sentence(self, sentence):
225-
"""Predict morphosyntactic labels of a single sentence.
231+
"""Predict the morphosyntactic labels of a single sentence.
226232
227233
Args:
228234
sentence (:obj:`list` of :obj:`str`): The input sentence.
229235
230236
Returns:
231-
:obj:`list` of :obj:`dict`: The predicted morphosyntactic
232-
labels for the given sentence.
237+
:obj:`list` of :obj:`dict`: The predicted morphosyntactic labels
238+
for the given sentence.
233239
"""
234240

235241
parsed_predictions = []
@@ -286,12 +292,10 @@ def _disambiguate_word(self, word, pred):
286292
return DisambiguatedWord(word, scored_analyses)
287293

288294
def disambiguate_word(self, sentence, word_ndx):
289-
"""Disambiguates a single word in a sentence.
295+
"""Disambiguates a single word of a sentence.
290296
291297
Args:
292-
sentence (:obj:`list` of :obj:`str`): The list of space and
293-
punctuation seperated list of tokens comprising a given
294-
sentence.
298+
sentence (:obj:`list` of :obj:`str`): The input sentence.
295299
word_ndx (:obj:`int`): The index of the word token in `sentence` to
296300
disambiguate.
297301
@@ -303,16 +307,14 @@ def disambiguate_word(self, sentence, word_ndx):
303307
return self.disambiguate(sentence)[word_ndx]
304308

305309
def disambiguate(self, sentence):
306-
"""Disambiguate all words given a sentence.
310+
"""Disambiguate all words of a single sentence.
307311
308312
Args:
309-
sentence (:obj:`list` of :obj:`str`): The list of space and
310-
punctuation seperated list of tokens comprising a given
311-
sentence.
313+
sentence (:obj:`list` of :obj:`str`): The input sentence.
312314
313315
Returns:
314316
:obj:`list` of :obj:`~camel_tools.disambig.common.DisambiguatedWord`: The
315-
list of disambiguations for each word in the given sentence.
317+
disambiguated analyses for the given sentence.
316318
"""
317319

318320
predictions = self._predict_sentence(sentence)
@@ -321,16 +323,15 @@ def disambiguate(self, sentence):
321323
for (w, p) in zip(sentence, predictions)]
322324

323325
def disambiguate_sentences(self, sentences):
324-
"""Disambiguate all words given a list of sentences.
326+
"""Disambiguate all words of a list of sentences.
325327
326328
Args:
327-
sentences (:obj:`list` of :obj:`list` of :obj:`str`): The list of
328-
space and punctuation seperated list of tokens comprising a
329-
given sentence.
329+
sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
330+
sentences.
330331
331332
Returns:
332-
:obj:`list` of :obj:`~camel_tools.disambig.common.DisambiguatedWord`: The
333-
list of disambiguations for each word in the given sentence.
333+
:obj:`list` of :obj:`list` of :obj:`~camel_tools.disambig.common.DisambiguatedWord`: The
334+
disambiguated analyses for the given sentences.
334335
"""
335336

336337
predictions = self._predict_sentences(sentences)
@@ -346,7 +347,7 @@ def disambiguate_sentences(self, sentences):
346347
return disambiguated_sentences
347348

348349
def tag_sentences(self, sentences, use_analyzer=True):
349-
"""Tag morphosyntactic features given a list of sentences.
350+
"""Predict the morphosyntactic labels of a list of sentences.
350351
351352
Args:
352353
sentences (:obj:`list` of :obj:`list` of :obj:`str`): The input
@@ -356,7 +357,7 @@ def tag_sentences(self, sentences, use_analyzer=True):
356357
Defaults to True.
357358
358359
Returns:
359-
:obj:`list` of :obj:`list` of :obj:`dict`: The list of feature tags
360+
:obj:`list` of :obj:`list` of :obj:`dict`: The predicted feature tags
360361
for each word in the given sentences
361362
"""
362363

@@ -372,7 +373,7 @@ def tag_sentences(self, sentences, use_analyzer=True):
372373
return tagged_sentences
373374

374375
def tag_sentence(self, sentence, use_analyzer=True):
375-
"""Tag morphosyntactic features given a sentence.
376+
"""Predict the morphosyntactic labels of a single sentence.
376377
377378
Args:
378379
sentence (:obj:`list` of :obj:`str`): The list of space and
@@ -421,7 +422,7 @@ class BERTFeatureTagger:
421422
Args:
422423
model_path (:obj:`str`): The path to the fine-tuned model.
423424
use_gpu (:obj:`bool`, optional): The flag to use a GPU or not.
424-
Defaults to True.
425+
Defaults to True.
425426
"""
426427

427428
def __init__(self, model_path, use_gpu=True):

0 commit comments

Comments
 (0)