Improved analysis sorting using pos-lex log prob.

owo · owo · commit f4e8e0721eb7 · 2023-02-17T13:16:13.000+04:00
diff --git a/camel_tools/disambig/bert/unfactored.py b/camel_tools/disambig/bert/unfactored.py
@@ -476,16 +476,23 @@ def _scored_analyses(self, word_dd, prediction):
                                 tie_breaker=self._tie_breaker,
                                 features=self._features), a)
                   for a in analyses]
-        scored.sort(key=lambda s: (-s[0], s[1]['diac']))
+        # scored.sort(key=lambda s: (-s[0], s[1]['diac']))
 
         max_score = max(s[0] for s in scored)
 
-        if max_score != 0:
-            scored_analyses = [ScoredAnalysis(s[0] / max_score, s[1])
-                               for s in scored]
-        else:
-            # If the max score is 0, do not divide
-            scored_analyses = [ScoredAnalysis(0, s[1]) for s in scored]
+        if max_score == 0:
+            max_score = 1
+
+        scored_analyses = [
+            ScoredAnalysis(
+                s / max_score,                  # score
+                a,                              # analysis
+                a['diac'],                      # diac
+                a.get('pos_lex_logprob', -99),  # pos_lex_logprob
+                a.get('lex_logprob', -99),      # lex_logprob
+            ) for s, a in scored]
+
+        scored_analyses.sort()
 
         return scored_analyses[:self._top]
 
diff --git a/camel_tools/disambig/common.py b/camel_tools/disambig/common.py
@@ -32,17 +32,49 @@
 from collections import namedtuple
 
 
-class ScoredAnalysis(namedtuple('ScoredAnalysis', ['score', 'analysis'])):
+class ScoredAnalysis(namedtuple('ScoredAnalysis',
+                                [
+                                    'score',
+                                    'analysis',
+                                    'diac',
+                                    'pos_lex_logprob',
+                                    'lex_logprob'
+                                ])):
     """A named tuple containing an analysis and its score.
 
     Attributes:
-        score (:obj:`float`): The score of a given analysis.
+        score (:obj:`float`): The overall score of the analysis.
 
         analysis (:obj:`dict`): The analysis dictionary.
-            See :doc:`/reference/camel_morphology_features` for more information on
-            features and their values.
+            See :doc:`/reference/camel_morphology_features` for more
+            information on features and their values.
+
+        diac (:obj:`str`): The diacritized form of the associated analysis.
+            Used for tie-breaking equally scored analyses.
+
+        pos_lex_logprob (:obj:`float`): The log (base 10) of the probability
+            of the associated pos-lex pair values.
+            Used for tie-breaking equally scored analyses.
+
+        lex_logprob (:obj:`float`): The log (base 10) of the probability of
+            the associated lex value.
+            Used for tie-breaking equally scored analyses.
     """
 
+    def __lt__(self, other):
+        if self.score > other.score:
+            return True
+        elif self.score == other.score:
+            if self.pos_lex_logprob > other.pos_lex_logprob:
+                return True
+            elif self.pos_lex_logprob == other.pos_lex_logprob:
+                if self.lex_logprob > other.lex_logprob:
+                    return True
+                elif self.lex_logprob == other.lex_logprob:
+                    return self.diac < other.diac
+
+        return False
+
 
 class DisambiguatedWord(namedtuple('DisambiguatedWord', ['word', 'analyses'])):
     """A named tuple containing a word and a sorted list (from high to low
diff --git a/camel_tools/disambig/mle.py b/camel_tools/disambig/mle.py
@@ -204,9 +204,17 @@ def _scored_analyses(self, word_dd):
 
             scored_analyses = [ScoredAnalysis(p / max_prob, a)
                                for a, p in zip(analyses, probabilities)]
-            scored_analyses.sort(key=lambda w: (-w.score,
-                                                len(w.analysis['bw']),
-                                                w.analysis['diac']))
+
+            scored_analyses = [
+                ScoredAnalysis(
+                    p / max_prob,                   # score
+                    a,                              # analysis
+                    a['diac'],                      # diac
+                    a.get('pos_lex_logprob', -99),  # pos_lex_logprob
+                    a.get('lex_logprob', -99),      # lex_logprob
+                ) for a, p in zip(analyses, probabilities)]
+
+            scored_analyses.sort()
 
             return scored_analyses[0:self._top]