Fixed rule building and added TF-IDF weight threshold as a parameter

zStupan · zStupan · commit 5ddd1fffccc1 · 2022-12-22T13:39:20.000+01:00
diff --git a/niaarm/mine.py b/niaarm/mine.py
@@ -54,7 +54,7 @@ def get_rules(dataset, algorithm, metrics, max_evals=np.inf, max_iters=np.inf, l
     return Result(problem.rules, stop_time - start_time)
 
 
-def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, max_evals=np.inf, max_iters=np.inf,
+def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, threshold=0, max_evals=np.inf, max_iters=np.inf,
                    logging=False, **kwargs):
     """Mine association rules in a text corpus.
 
@@ -69,6 +69,8 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
          a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
         smooth (bool): Smooth idf to prevent division by 0 error. Default: ``True``.
         norm (int): Order of norm for normalizing the tf-idf matrix. Default: 2.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+         threshold, the term is not included in the transaction. Default: 0.
         max_evals (Optional[int]): Maximum number of fitness evaluations. Default: ``inf``. At least one of ``max_evals`` or
          ``max_iters`` must be provided.
         max_iters (Optional[int]): Maximum number of iterations. Default: ``inf``.
@@ -78,7 +80,7 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
         Result: A named tuple containing the list of mined rules and the algorithm's run time in seconds.
 
     """
-    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, logging)
+    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, threshold, logging)
     task = Task(problem, max_evals=max_evals, max_iters=max_iters, optimization_type=OptimizationType.MAXIMIZATION)
 
     if isinstance(algorithm, str):
diff --git a/niaarm/text.py b/niaarm/text.py
@@ -197,6 +197,8 @@ class TextRule(Rule):
         consequent (list[str]): A list of consequent terms of the text rule.
         fitness (Optional[float]): Fitness value of the text rule.
         transactions (Optional[pandas.DataFrame]): The tf-idf matrix as a pandas DataFrame.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+         threshold, the term is not included in the transaction. Default: 0.
 
     Attributes:
         aws: The sum of tf-idf values for all the terms in the rule.
@@ -216,11 +218,19 @@ class TextRule(Rule):
         'comprehensibility', 'netconf', 'yulesq', 'aws'
     )
 
-    def __post_init__(self, transactions):
+    def __init__(self, antecedent, consequent, fitness=0.0, transactions=None, threshold=0):
+        super().__init__(antecedent, consequent, fitness, transactions=None)
+
+        if transactions is not None:
+            self.num_transactions = len(transactions)
+            self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
+            self.__post_init__(transactions, threshold)
+
+    def __post_init__(self, transactions, threshold=0):
         self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
         self.__aws = transactions[self.antecedent + self.consequent].values.sum()
-        contains_antecedent = (transactions[self.antecedent] > 0).all(axis=1)
-        contains_consequent = (transactions[self.consequent] > 0).all(axis=1)
+        contains_antecedent = (transactions[self.antecedent] > threshold).all(axis=1)
+        contains_consequent = (transactions[self.consequent] > threshold).all(axis=1)
         self.antecedent_count = contains_antecedent.sum()
         self.consequent_count = contains_consequent.sum()
         self.full_count = (contains_antecedent & contains_consequent).sum()
@@ -232,6 +242,10 @@ def __post_init__(self, transactions):
     def amplitude(self):
         return np.nan
 
+    @property
+    def inclusion(self):
+        return self.__inclusion
+
     @property
     def aws(self):
         return self.__aws
@@ -253,6 +267,8 @@ class NiaARTM(NiaARM):
         metrics (Union[Dict[str, float], Sequence[str]]): Metrics to take into account when computing the fitness.
          Metrics can either be passed as a Dict of pairs {'metric_name': <weight of metric>} or
          a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+         threshold, the term is not included in the transaction. Default: 0.
         logging (bool): Enable logging of fitness improvements. Default: ``False``.
 
     Attributes:
@@ -264,27 +280,36 @@ class NiaARTM(NiaARM):
         'support', 'confidence', 'coverage', 'interestingness', 'comprehensibility', 'inclusion', 'rhs_support', 'aws'
     )
 
-    def __init__(self, max_terms, terms, transactions, metrics, logging=False):
+    def __init__(self, max_terms, terms, transactions, metrics, threshold=0, logging=False):
         super().__init__(max_terms + 1, terms, transactions, metrics, logging)
         self.max_terms = max_terms
+        self.threshold = threshold
 
     def build_rule(self, vector):
-        y = np.zeros(self.num_features, dtype=bool)
-        y[(vector * (self.num_features - 1)).astype(int)] = True
-        return np.array(self.features)[y].tolist()
+        terms = [self.features[int(val * (self.num_features - 1))] for val in vector]
+
+        seen = set()
+        rule = []
+        for term in terms:
+            if term in seen:
+                continue
+            rule.append(term)
+            seen.add(term)
+
+        return rule
 
-    def _evaluate(self, sol):
-        cut_value = sol[self.dimension - 1]
-        solution = sol[:-1]
-        cut = _cut_point(cut_value, self.max_terms)
+    def _evaluate(self, x):
+        cut_value = x[self.dimension - 1]
+        solution = x[:-1]
 
         rule = self.build_rule(solution)
+        cut = _cut_point(cut_value, len(rule))
 
         antecedent = rule[:cut]
         consequent = rule[cut:]
 
         if antecedent and consequent:
-            rule = TextRule(antecedent, consequent, transactions=self.transactions)
+            rule = TextRule(antecedent, consequent, transactions=self.transactions, threshold=self.threshold)
             metrics = [getattr(rule, metric) for metric in self.metrics]
             fitness = np.dot(self.weights, metrics) / self.sum_weights
             rule.fitness = fitness