Commit e46ed94

Merge pull request #78 from zStupan/update-text-mining

Update text mining

2 parents: 5ee5abb + 8733539

File tree (5 files changed: +149, -15 lines)

examples/text_mining.py
niaarm/mine.py
niaarm/text.py
tests/test_data/artm_test_dataset.json
tests/test_text_mining.py

examples/text_mining.py

Lines changed: 8 additions & 1 deletion

@@ -5,7 +5,14 @@
 
 df = pd.read_json('datasets/text/artm_test_dataset.json', orient='records')
 documents = df['text'].tolist()
-corpus = Corpus.from_list(documents)
+
+try:
+    corpus = Corpus.from_list(documents)
+except LookupError:
+    import nltk
+    nltk.download('punkt')
+    nltk.download('stopwords')
+    corpus = Corpus.from_list(documents)
 
 algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
 metrics = ('support', 'confidence', 'aws')
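
The example now fetches the punkt and stopwords resources lazily, only after tokenization raises a LookupError. An equivalent up-front check is possible with nltk.data.find, which raises LookupError for missing resources; the sketch below is an editor's illustration under that assumption, not part of the commit:

```python
import nltk

# Probe for the tokenizer and the stopword corpus before building the
# corpus; download each resource only if the lookup fails.
for resource, name in [('tokenizers/punkt', 'punkt'), ('corpora/stopwords', 'stopwords')]:
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(name)
```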

niaarm/mine.py

Lines changed: 4 additions & 2 deletions

@@ -54,7 +54,7 @@ def get_rules(dataset, algorithm, metrics, max_evals=np.inf, max_iters=np.inf, l
     return Result(problem.rules, stop_time - start_time)
 
 
-def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, max_evals=np.inf, max_iters=np.inf,
+def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, threshold=0, max_evals=np.inf, max_iters=np.inf,
                    logging=False, **kwargs):
     """Mine association rules in a text corpus.
 
@@ -69,6 +69,8 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
             a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
         smooth (bool): Smooth idf to prevent division by 0 error. Default: ``True``.
         norm (int): Order of norm for normalizing the tf-idf matrix. Default: 2.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
         max_evals (Optional[int]): Maximum number of fitness evaluations. Default: ``inf``. At least one of ``max_evals`` or
             ``max_iters`` must be provided.
         max_iters (Optional[int]): Maximum number of iterations. Default: ``inf``.
@@ -78,7 +80,7 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
         Result: A named tuple containing the list of mined rules and the algorithm's run time in seconds.
 
     """
-    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, logging)
+    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, threshold, logging)
     task = Task(problem, max_evals=max_evals, max_iters=max_iters, optimization_type=OptimizationType.MAXIMIZATION)
 
     if isinstance(algorithm, str):
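
To see the new parameter in context, here is a hedged usage sketch of get_text_rules with a nonzero threshold. The toy documents, the threshold value of 0.1, and the small optimizer budget are illustrative assumptions, the result's ``rules`` field is assumed from the Result docstring above, and NLTK's punkt/stopwords resources must already be available:

```python
from niapy.algorithms.basic import ParticleSwarmOptimization
from niaarm.mine import get_text_rules
from niaarm.text import Corpus

# A toy corpus; any list of document strings works.
documents = [
    'swarm intelligence algorithms for association rule mining',
    'association rule mining with differential evolution',
    'monitoring sport training sessions with embedded devices',
]
corpus = Corpus.from_list(documents)

# With threshold=0.1, a term whose tf-idf weight is <= 0.1 is treated as
# absent from a transaction when support and confidence are counted.
algorithm = ParticleSwarmOptimization(population_size=30, seed=123)
result = get_text_rules(corpus, max_terms=3, algorithm=algorithm,
                        metrics=('support', 'confidence', 'aws'),
                        threshold=0.1, max_iters=20)
print(result.rules)
```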

niaarm/text.py

Lines changed: 37 additions & 12 deletions

@@ -197,6 +197,8 @@ class TextRule(Rule):
         consequent (list[str]): A list of consequent terms of the text rule.
         fitness (Optional[float]): Fitness value of the text rule.
         transactions (Optional[pandas.DataFrame]): The tf-idf matrix as a pandas DataFrame.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
 
     Attributes:
         aws: The sum of tf-idf values for all the terms in the rule.
@@ -216,11 +218,19 @@ class TextRule(Rule):
         'comprehensibility', 'netconf', 'yulesq', 'aws'
     )
 
-    def __post_init__(self, transactions):
+    def __init__(self, antecedent, consequent, fitness=0.0, transactions=None, threshold=0):
+        super().__init__(antecedent, consequent, fitness, transactions=None)
+
+        if transactions is not None:
+            self.num_transactions = len(transactions)
+            self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
+            self.__post_init__(transactions, threshold)
+
+    def __post_init__(self, transactions, threshold=0):
         self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
         self.__aws = transactions[self.antecedent + self.consequent].values.sum()
-        contains_antecedent = (transactions[self.antecedent] > 0).all(axis=1)
-        contains_consequent = (transactions[self.consequent] > 0).all(axis=1)
+        contains_antecedent = (transactions[self.antecedent] > threshold).all(axis=1)
+        contains_consequent = (transactions[self.consequent] > threshold).all(axis=1)
         self.antecedent_count = contains_antecedent.sum()
         self.consequent_count = contains_consequent.sum()
         self.full_count = (contains_antecedent & contains_consequent).sum()
@@ -232,6 +242,10 @@ def __post_init__(self, transactions):
     def amplitude(self):
         return np.nan
 
+    @property
+    def inclusion(self):
+        return self.__inclusion
+
     @property
     def aws(self):
         return self.__aws
@@ -253,6 +267,8 @@ class NiaARTM(NiaARM):
         metrics (Union[Dict[str, float], Sequence[str]]): Metrics to take into account when computing the fitness.
             Metrics can either be passed as a Dict of pairs {'metric_name': <weight of metric>} or
             a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
         logging (bool): Enable logging of fitness improvements. Default: ``False``.
 
     Attributes:
@@ -264,27 +280,36 @@
         'support', 'confidence', 'coverage', 'interestingness', 'comprehensibility', 'inclusion', 'rhs_support', 'aws'
     )
 
-    def __init__(self, max_terms, terms, transactions, metrics, logging=False):
+    def __init__(self, max_terms, terms, transactions, metrics, threshold=0, logging=False):
         super().__init__(max_terms + 1, terms, transactions, metrics, logging)
         self.max_terms = max_terms
+        self.threshold = threshold
 
     def build_rule(self, vector):
-        y = np.zeros(self.num_features, dtype=bool)
-        y[(vector * (self.num_features - 1)).astype(int)] = True
-        return np.array(self.features)[y].tolist()
+        terms = [self.features[int(val * (self.num_features - 1))] for val in vector]
+
+        seen = set()
+        rule = []
+        for term in terms:
+            if term in seen:
+                continue
+            rule.append(term)
+            seen.add(term)
+
+        return rule
 
-    def _evaluate(self, sol):
-        cut_value = sol[self.dimension - 1]
-        solution = sol[:-1]
-        cut = _cut_point(cut_value, self.max_terms)
+    def _evaluate(self, x):
+        cut_value = x[self.dimension - 1]
+        solution = x[:-1]
 
         rule = self.build_rule(solution)
+        cut = _cut_point(cut_value, len(rule))
 
         antecedent = rule[:cut]
         consequent = rule[cut:]
 
         if antecedent and consequent:
-            rule = TextRule(antecedent, consequent, transactions=self.transactions)
+            rule = TextRule(antecedent, consequent, transactions=self.transactions, threshold=self.threshold)
             metrics = [getattr(rule, metric) for metric in self.metrics]
             fitness = np.dot(self.weights, metrics) / self.sum_weights
             rule.fitness = fitness
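
The build_rule rewrite changes how a solution vector is decoded: each component now selects a term by index and duplicates are dropped in first-seen order, so a rule may contain fewer than max_terms terms, which is why _evaluate now derives the cut point from len(rule) instead of max_terms. A standalone sketch with a made-up feature list (the real method reads self.features and self.num_features):

```python
def build_rule(vector, features):
    # Map each component in [0, 1] to a feature index.
    terms = [features[int(v * (len(features) - 1))] for v in vector]
    # Keep only the first occurrence of each term, preserving order.
    seen, rule = set(), []
    for term in terms:
        if term not in seen:
            rule.append(term)
            seen.add(term)
    return rule

features = ['mining', 'rules', 'data', 'swarm']
print(build_rule([0.9, 0.1, 0.95, 0.2], features))  # ['data', 'mining']
```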
tests/test_data/artm_test_dataset.json

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
[
  {
    "id": "1",
    "text": "Computational Intelligence methods for automatic generation of sport training plans in individual sport disciplines have achieved a mature phase. In order to confirm their added value, they have been deployed into practice. As a result, several methods have been developed for generating well formulated training plans on computers automatically that, typically, depend on the collection of past sport activities. However, monitoring the realization of the performed training sessions still represents a bottleneck in automating the process of sport training as a whole. The objective of this paper is to present a new low-cost and efficient embedded device for monitoring the realization of sport training sessions that is dedicated to monitor cycling training sessions. We designed and developed a new bike computer, i.e. the AST-Monitor, that can be mounted easily on almost every bicycle. The aforementioned bike computer is based on the Raspberry Pi device that supports different external sensors for capturing the data during the realization of sport training sessions. An adjusted GUI tailored to the needs of athletes is developed, along with the hardware. The proof of concept study, using the AST-Monitor in practice, revealed the potential of the proposed solution for monitoring of realized sport training sessions automatically. The new device also opens the door for the future utilization of Artificial Intelligence in a wide variety of sports.",
    "reference": "https://arxiv.org/abs/2109.13334"
  },
  {
    "id": "2",
    "text": "Numerical Association Rule Mining is a popular variant of Association Rule Mining, where numerical attributes are handled without discretization. This means that the algorithms for dealing with this problem can operate directly, not only with categorical, but also with numerical attributes. Until recently, a big portion of these algorithms were based on a stochastic nature-inspired population-based paradigm. As a result, evolutionary and swarm intelligence-based algorithms showed big efficiency for dealing with the problem. In line with this, the main mission of this chapter is to make a historical overview of swarm intelligence-based algorithms for Numerical Association Rule Mining, as well as to present the main features of these algorithms for the observed problem. A taxonomy of the algorithms was proposed on the basis of the applied features found in this overview. Challenges, waiting in the future, finish this paper.",
    "reference": "https://arxiv.org/abs/2010.15524"
  },
  {
    "id": "3",
    "text": "The paper presents a novel software framework for Association Rule Mining named uARMSolver. The framework is written fully in C++ and runs on all platforms. It allows users to preprocess their data in a transaction database, to make discretization of data, to search for association rules and to guide a presentation/visualization of the best rules found using external tools. As opposed to the existing software packages or frameworks, this also supports numerical and real-valued types of attributes besides the categorical ones. Mining the association rules is defined as an optimization and solved using the nature-inspired algorithms that can be incorporated easily. Because the algorithms normally discover a huge amount of association rules, the framework enables a modular inclusion of so-called visual guiders for extracting the knowledge hidden in data, and visualize these using external tools.",
    "reference": "https://arxiv.org/abs/2010.10884"
  },
  {
    "id": "4",
    "text": "Decisions made nowadays by Artificial Intelligence powered systems are usually hard for users to understand. One of the more important issues faced by developers is exposed as how to create more explainable Machine Learning models. In line with this, more explainable techniques need to be developed, where visual explanation also plays a more important role. This technique could also be applied successfully for explaining the results of Association Rule Mining.This Chapter focuses on two issues: (1) How to discover the relevant association rules, and (2) How to express relations between more attributes visually. For the solution of the first issue, the proposed method uses Differential Evolution, while Sankey diagrams are adopted to solve the second one. This method was applied to a transaction database containing data generated by an amateur cyclist in past seasons, using a mobile device worn during the realization of training sessions that is divided into four time periods. The results of visualization showed that a trend in improving performance of an athlete can be indicated by changing the attributes appearing in the selected association rules in different time periods.",
    "reference": "https://arxiv.org/abs/2010.03834"
  },
  {
    "id": "5",
    "text": "A COVID-19 pandemic has already proven itself to be a global challenge. It proves how vulnerable humanity can be. It has also mobilized researchers from different sciences and different countries in the search for a way to fight this potentially fatal disease. In line with this, our study analyses the abstracts of papers related to COVID-19 and coronavirus-related-research using association rule text mining in order to find the most interestingness words, on the one hand, and relationships between them on the other. Then, a method, called information cartography, was applied for extracting structured knowledge from a huge amount of association rules. On the basis of these methods, the purpose of our study was to show how researchers have responded in similar epidemic/pandemic situations throughout history.",
    "reference": "https://arxiv.org/abs/2004.03397"
  },
  {
    "id": "6",
    "text": "Association Rule Mining is a machine learning method for discovering the interesting relations between the attributes in a huge transaction database. Typically, algorithms for Association Rule Mining generate a huge number of association rules, from which it is hard to extract structured knowledge and present this automatically in a form that would be suitable for the user. Recently, an information cartography has been proposed for creating structured summaries of information and visualizing with methodology called metromaps. This was applied to several problem domains, where pattern mining was necessary. The aim of this study is to develop a method for automatic creation of metro maps of information obtained by Association Rule Mining and, thus, spread its applicability to the other machine learning methods. Although the proposed method consists of multiple steps, its core presents metro map construction that is defined in the study as an optimization problem, which is solved using an evolutionary algorithm. Finally, this was applied to four well-known UCI Machine Learning datasets and one sport dataset. Visualizing the resulted metro maps not only justifies that this is a suitable tool for presenting structured knowledge hidden in data, but also that they can tell stories to users.",
    "reference": "https://arxiv.org/abs/2003.00348"
  },
  {
    "id": "7",
    "text": "Modeling preference time in triathlons means predicting the intermediate times of particular sports disciplines by a given overall finish time in a specific triathlon course for the athlete with the known personal best result. This is a hard task for athletes and sport trainers due to a lot of different factors that need to be taken into account, e.g., athlete's abilities, health, mental preparations and even their current sports form. So far, this process was calculated manually without any specific software tools or using the artificial intelligence. This paper presents the new solution for modeling preference time in middle distance triathlons based on particle swarm optimization algorithm and archive of existing sports results. Initial results are presented, which suggest the usefulness of proposed approach, while remarks for future improvements and use are also emphasized. ",
    "reference": "https://arxiv.org/abs/1707.00718"
  },
  {
    "id": "8",
    "text": "To predict the final result of an athlete in a marathon run thoroughly is the eternal desire of each trainer. Usually, the achieved result is weaker than the predicted one due to the objective (e.g., environmental conditions) as well as subjective factors (e.g., athlete's malaise). Therefore, making up for the deficit between predicted and achieved results is the main ingredient of the analysis performed by trainers after the competition. In the analysis, they search for parts of a marathon course where the athlete lost time. This paper proposes an automatic making up for the deficit by using a Differential Evolution algorithm. In this case study, the results that were obtained by a wearable sports-watch by an athlete in a real marathon are analyzed. The first experiments with Differential Evolution show the possibility of using this method in the future.",
    "reference": "https://arxiv.org/abs/1705.03302"
  },
  {
    "id": "9",
    "text": " The firefly algorithm has become an increasingly important tool of Swarm Intelligence that has been applied in almost all areas of optimization, as well as engineering practice. Many problems from various areas have been successfully solved using the firefly algorithm and its variants. In order to use the algorithm to solve diverse problems, the original firefly algorithm needs to be modified or hybridized. This paper carries out a comprehensive review of this living and evolving discipline of Swarm Intelligence, in order to show that the firefly algorithm could be applied to every problem arising in practice. On the other hand, it encourages new researchers and algorithm developers to use this simple and yet very efficient algorithm for problem solving. It often guarantees that the obtained results will meet the expectations.",
    "reference": "https://arxiv.org/abs/1312.6609"
  }
]

tests/test_text_mining.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
import os
from unittest import TestCase
import numpy as np
import pandas as pd
import nltk

from niaarm.niaarm import _cut_point
from niaarm.text import Corpus, TextRule, NiaARTM


class TestTextMining(TestCase):
    def setUp(self):
        nltk.download('punkt')
        nltk.download('stopwords')
        ds_path = os.path.join(os.path.dirname(__file__), 'test_data', 'artm_test_dataset.json')
        df = pd.read_json(ds_path, orient='records')
        documents = df['text'].tolist()
        self.corpus = Corpus.from_list(documents)
        self.problem = NiaARTM(5, self.corpus.terms(), self.corpus.tf_idf_matrix(), ('support', 'confidence', 'aws'))

    def test_rule_building(self):
        x = np.array([0.7572383073496659, 0.3585746102449889, 0.534521158129176, 0.7394209354120267, 0.08463251670378619,
                      0.6666934805])
        rule = self.problem.build_rule(x[:-1])
        self.assertEqual(rule, ['resulted', 'form', 'mining', 'relations', 'attributes'])

    def test_cut_point(self):
        x = np.array([0.7572383073496659, 0.3585746102449889, 0.534521158129176, 0.7394209354120267, 0.08463251670378619,
                      0.6666934805])

        cut_value = x[-1]
        rule = self.problem.build_rule(x[:-1])
        cut = _cut_point(cut_value, self.problem.max_terms)

        antecedent = rule[:cut]
        consequent = rule[cut:]

        self.assertEqual(cut, 3)
        self.assertEqual(antecedent, ['resulted', 'form', 'mining'])
        self.assertEqual(consequent, ['relations', 'attributes'])

    def test_metrics(self):
        rule = TextRule(['resulted', 'form', 'mining'], ['relations', 'attributes'], transactions=self.problem.transactions)
        self.assertEqual(rule.lift, 4.5)
        self.assertEqual(rule.coverage, 0.1111111111111111)
        self.assertEqual(rule.rhs_support, 0.2222222222222222)
        self.assertEqual(rule.conviction, 3502799710177052.5)
        self.assertEqual(rule.inclusion, 0.011111111111111112)
        self.assertEqual(rule.interestingness, 0.49382716049382713)
        self.assertEqual(rule.comprehensibility, 0.6131471927654585)
        self.assertEqual(rule.netconf, 0.8749999999999999)
        self.assertEqual(rule.yulesq, 1.0)
        self.assertEqual(rule.aws, 1.44320067609805)
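
The module should also run on its own with the standard unittest runner, e.g. python -m unittest tests.test_text_mining from the repository root, provided NLTK can download punkt and stopwords during setUp.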
