Merge pull request #62 from zStupan/text-mining

firefly-cpp · web-flow · commit d6e746ec7f3c · 2022-05-28T12:07:43.000Z
Experimental support for text mining
diff --git a/README.md b/README.md
@@ -28,7 +28,8 @@ The current version includes (but is not limited to) the following functions:
 - searching for association rules,
 - providing output of mined association rules,
 - generating statistics about mined association rules,
-- visualization of association rules.
+- visualization of association rules,
+- association rule text mining (experimental).
 
 ## Installation
 
@@ -159,6 +160,37 @@ plt.show()
 </p>
 
 
+### Text Mining (Experimental)
+
+An experimental implementation of association rule text mining using nature-inspired algorithms, based on ideas from [5]
+is also provided. The `niaarm.text` module contains the `Corpus` and `Document` classes for loading and preprocessing corpora,
+a `TextRule` class, representing a text rule, and the `NiaARTM` class, implementing association rule text mining
+as a continuous optimization problem. The `get_text_rules` function, equivalent to `get_rules`, but for text mining, was also
+added to the `niaarm.mine` module.
+
+```python
+import pandas as pd
+from niaarm.text import Corpus
+from niaarm.mine import get_text_rules
+from niapy.algorithms.basic import ParticleSwarmOptimization
+
+df = pd.read_json('datasets/text/artm_test_dataset.json', orient='records')
+documents = df['text'].tolist()
+corpus = Corpus.from_list(documents)
+
+algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
+metrics = ('support', 'confidence', 'aws')
+rules, time = get_text_rules(corpus, max_terms=5, algorithm=algorithm, metrics=metrics, max_evals=10000, logging=True)
+
+if len(rules):
+    print(rules)
+    print(f'Run time: {time:.2f}s')
+    rules.to_csv('output.csv')
+else:
+    print('No rules generated')
+    print(f'Run time: {time:.2f}s')
+```
+
 For a full list of examples see the [examples folder](https://github.com/firefly-cpp/NiaARM/tree/main/examples)
 in the GitHub repository.
 
@@ -218,6 +250,10 @@ Ideas are based on the following research papers:
     In: Analide, C., Novais, P., Camacho, D., Yin, H. (eds) Intelligent Data Engineering and Automated Learning – IDEAL 2020.
     IDEAL 2020. Lecture Notes in Computer Science(), vol 12489. Springer, Cham. https://doi.org/10.1007/978-3-030-62362-3_10
 
+[5] I. Fister, S. Deb, I. Fister, „Population-based metaheuristics for Association Rule Text Mining“,
+    In: Proceedings of the 2020 4th International Conference on Intelligent Systems, Metaheuristics & Swarm Intelligence,
+    New York, NY, USA, mar. 2020, pp. 19–23. doi: 10.1145/3396474.3396493.
+
 ## License
 
 This package is distributed under the MIT License. This license can be found online at <http://www.opensource.org/licenses/MIT>.
diff --git a/docs/api/index.rst b/docs/api/index.rst
@@ -9,4 +9,5 @@ API Reference
     niaarm
     rule
     rule_list
+    text
     visualize
diff --git a/docs/api/text.rst b/docs/api/text.rst
@@ -0,0 +1,6 @@
+Text
+====
+
+.. automodule:: niaarm.text
+    :members:
+    :show-inheritance:
diff --git a/docs/getting_started.rst b/docs/getting_started.rst
@@ -217,6 +217,68 @@ presented in `this paper <https://link.springer.com/chapter/10.1007/978-3-030-62
 
 .. image:: _static/hill_slopes.png
 
+Text Mining (Experimental)
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+An experimental implementation of association rule text mining using nature-inspired algorithms
+is also provided. The :mod:`niaarm.text` module contains the :class:`~niaarm.text.Corpus` and :class:`~niaarm.text.Document` classes for loading and preprocessing corpora,
+a :class:`~niaarm.text.TextRule` class, representing a text rule, and the :class:`~niaarm.text.NiaARTM` class, implementing association rule text mining
+as a continuous optimization problem. The :func:`~niaarm.mine.get_text_rules` function, equivalent to :func:`~niaarm.mine.get_rules`, but for text mining, was also
+added to the :mod:`niaarm.mine` module.
+
+.. code:: python
+
+    import pandas as pd
+    from niaarm.text import Corpus
+    from niaarm.mine import get_text_rules
+    from niapy.algorithms.basic import ParticleSwarmOptimization
+
+    df = pd.read_json('datasets/text/artm_test_dataset.json', orient='records')
+    documents = df['text'].tolist()
+    corpus = Corpus.from_list(documents)
+
+    algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
+    metrics = ('support', 'confidence', 'aws')
+    rules, time = get_text_rules(corpus, max_terms=5, algorithm=algorithm, metrics=metrics, max_evals=10000, logging=True)
+
+    if len(rules):
+        print(rules)
+        print(f'Run time: {time:.2f}s')
+        rules.to_csv('output.csv')
+    else:
+        print('No rules generated')
+        print(f'Run time: {time:.2f}s')
+
+**Output:**
+
+.. code:: text
+
+    Fitness: 0.53345778328699, Support: 0.1111111111111111, Confidence: 1.0, Aws: 0.48926223874985886
+    Fitness: 0.7155830770302328, Support: 0.1111111111111111, Confidence: 1.0, Aws: 1.0356381199795872
+    Fitness: 0.7279963436805833, Support: 0.1111111111111111, Confidence: 1.0, Aws: 1.072877919930639
+    Fitness: 0.7875917299029188, Support: 0.1111111111111111, Confidence: 1.0, Aws: 1.251664078597645
+    Fitness: 0.8071206688346807, Support: 0.1111111111111111, Confidence: 1.0, Aws: 1.310250895392931
+    STATS:
+    Total rules: 52
+    Average fitness: 0.5179965084882088
+    Average support: 0.11538461538461527
+    Average confidence: 0.7115384615384616
+    Average lift: 5.524038461538462
+    Average coverage: 0.17948717948717943
+    Average consequent support: 0.1517094017094015
+    Average conviction: 1568561408678185.8
+    Average amplitude: nan
+    Average inclusion: 0.007735042735042727
+    Average interestingness: 0.6170069642291859
+    Average comprehensibility: 0.6763685578758655
+    Average netconf: 0.6675824175824177
+    Average Yule's Q: 0.9670329670329672
+    Average antecedent length: 1.6346153846153846
+    Average consequent length: 1.8461538461538463
+
+    Run time: 13.37s
+    Rules exported to output.csv
+
 Interest Measures
 -----------------
 
diff --git a/docs/index.rst b/docs/index.rst
@@ -24,7 +24,8 @@ The current version includes (but is not limited to) the following functions:
 - searching for association rules,
 - providing output of mined association rules,
 - generating statistics about mined association rules,
-- visualization of association rules.
+- visualization of association rules,
+- association rule text mining (experimental).
 
 Documentation
 =============
diff --git a/docs/refs.bib b/docs/refs.bib
@@ -1,27 +1,66 @@
-@inproceedings{fister2018differential,
-  title={Differential evolution for association rule mining using categorical and numerical attributes},
-  author={Fister Jr., Iztok and Iglesias, Andres and Galvez, Akemi and Ser, Javier Del and Osaba, Eneko and Fister, Iztok},
-  booktitle={International conference on intelligent data engineering and automated learning},
-  pages={79--88},
-  year={2018},
-  organization={Springer}
+@inproceedings{fister_differential_2018,
+    address = {Cham},
+    title = {Differential {Evolution} for {Association} {Rule} {Mining} {Using} {Categorical} and {Numerical} {Attributes}},
+    isbn = {9783030034931},
+    doi = {10.1007/978-3-030-03493-1_9},
+    language = {en},
+    booktitle = {Intelligent {Data} {Engineering} and {Automated} {Learning} – {IDEAL} 2018},
+    publisher = {Springer International Publishing},
+    author = {Fister, Iztok and Iglesias, Andres and Galvez, Akemi and Del Ser, Javier and Osaba, Eneko and Fister, Iztok},
+    editor = {Yin, Hujun and Camacho, David and Novais, Paulo and Tallón-Ballesteros, Antonio J.},
+    year = {2018},
+    pages = {79--88},
 }
 
-@inproceedings{fister2020improved,
-  title={Improved nature-inspired algorithms for numeric association rule mining},
-  author={Fister Jr, Iztok and Podgorelec, Vili and Fister, Iztok},
-  booktitle={International Conference on Intelligent Computing \& Optimization},
-  pages={187--195},
-  year={2020},
-  organization={Springer}
+@inproceedings{fister_jr_improved_2021,
+    address = {Cham},
+    title = {Improved {Nature}-{Inspired} {Algorithms} for {Numeric} {Association} {Rule} {Mining}},
+    isbn = {9783030681548},
+    doi = {10.1007/978-3-030-68154-8_19},
+    language = {en},
+    booktitle = {Intelligent {Computing} and {Optimization}},
+    publisher = {Springer International Publishing},
+    author = {Fister Jr., Iztok and Podgorelec, Vili and Fister, Iztok},
+    editor = {Vasant, Pandian and Zelinka, Ivan and Weber, Gerhard-Wilhelm},
+    year = {2021},
+    pages = {187--195},
 }
 
+@article{fister_jr_brief_2020,
+    title = {A brief overview of swarm intelligence-based algorithms for numerical association rule mining},
+	doi = {10.48550/ARXIV.2010.15524},
+    abstract = {Numerical Association Rule Mining is a popular variant of Association Rule Mining, where numerical attributes are handled without discretization. This means that the algorithms for dealing with this problem can operate directly, not only with categorical, but also with numerical attributes. Until recently, a big portion of these algorithms were based on a stochastic nature-inspired population-based paradigm. As a result, evolutionary and swarm intelligence-based algorithms showed big efficiency for dealing with the problem. In line with this, the main mission of this chapter is to make a historical overview of swarm intelligence-based algorithms for Numerical Association Rule Mining, as well as to present the main features of these algorithms for the observed problem. A taxonomy of the algorithms was proposed on the basis of the applied features found in this overview. Challenges, waiting in the future, finish this paper.},
+    journal = {arXiv:2010.15524 [cs]},
+    author = {Fister Jr., Iztok and Fister, Iztok},
+    month = oct,
+    year = {2020},
+}
+
+@inproceedings{fister_population-based_2020,
+	address = {New York, NY, USA},
+	series = {{ISMSI} '20},
+	title = {Population-based metaheuristics for {Association} {Rule} {Text} {Mining}},
+	isbn = {9781450377614},
+	doi = {10.1145/3396474.3396493},
+	booktitle = {Proceedings of the 2020 4th {International} {Conference} on {Intelligent} {Systems}, {Metaheuristics} \& {Swarm} {Intelligence}},
+	publisher = {Association for Computing Machinery},
+	author = {Fister, Iztok and Deb, Suash and Fister, Iztok},
+	month = mar,
+	year = {2020},
+	keywords = {association rule text mining, particle swarm optimization, triathlon, natural language processing, optimization},
+	pages = {19--23},
+}
 
-@article{fister2021brief,
-  title={A Brief Overview of Swarm Intelligence-Based Algorithms for Numerical Association Rule Mining},
-  author={Fister Jr, Iztok and Fister, Iztok},
-  journal={Applied Optimization and Swarm Intelligence},
-  pages={47--59},
-  year={2021},
-  publisher={Springer}
+@inproceedings{fister_visualization_2020,
+	address = {Cham},
+	title = {Visualization of {Numerical} {Association} {Rules} by {Hill} {Slopes}},
+	isbn = {9783030623623},
+	doi = {10.1007/978-3-030-62362-3_10},
+	language = {en},
+	booktitle = {Intelligent {Data} {Engineering} and {Automated} {Learning} – {IDEAL} 2020},
+	publisher = {Springer International Publishing},
+	author = {Fister, Iztok and Fister, Dušan and Iglesias, Andres and Galvez, Akemi and Osaba, Eneko and Del Ser, Javier and Fister, Iztok},
+	editor = {Analide, Cesar and Novais, Paulo and Camacho, David and Yin, Hujun},
+	year = {2020},
+	pages = {101--111},
 }
diff --git a/examples/text_mining.py b/examples/text_mining.py
@@ -0,0 +1,20 @@
+import pandas as pd
+from niaarm.text import Corpus
+from niaarm.mine import get_text_rules
+from niapy.algorithms.basic import ParticleSwarmOptimization
+
+df = pd.read_json('datasets/text/artm_test_dataset.json', orient='records')
+documents = df['text'].tolist()
+corpus = Corpus.from_list(documents)
+
+algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
+metrics = ('support', 'confidence', 'aws')
+rules, time = get_text_rules(corpus, max_terms=5, algorithm=algorithm, metrics=metrics, max_evals=10000, logging=True)
+
+if len(rules):
+    print(rules)
+    print(f'Run time: {time:.2f}s')
+    rules.to_csv('output.csv')
+else:
+    print('No rules generated')
+    print(f'Run time: {time:.2f}s')
diff --git a/niaarm/mine.py b/niaarm/mine.py
@@ -4,6 +4,7 @@
 from niaarm.niaarm import NiaARM
 from niapy.task import OptimizationType, Task
 from niapy.util.factory import get_algorithm
+from niaarm.text import NiaARTM
 
 
 class Result(namedtuple('Result', ('rules', 'run_time'))):
@@ -51,3 +52,42 @@ def get_rules(dataset, algorithm, metrics, max_evals=np.inf, max_iters=np.inf, l
     problem.rules.sort()
 
     return Result(problem.rules, stop_time - start_time)
+
+
+def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, max_evals=np.inf, max_iters=np.inf,
+                   logging=False, **kwargs):
+    """Mine association rules in a text corpus.
+
+    Args:
+        corpus (Corpus): Dataset to mine rules on.
+        max_terms (int): Maximum number of terms in an association rule.
+        algorithm (Union[niapy.algorithms.Algorithm, str]): Algorithm to use.
+         Can be either an Algorithm object or the class name as a string.
+         In that case, algorithm parameters can be passed in as keyword arguments.
+        metrics (Union[Dict[str, float], Sequence[str]]): Metrics to take into account when computing the fitness.
+         Metrics can either be passed as a Dict of pairs {'metric_name': <weight of metric>} or
+         a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
+        smooth (bool): Smooth idf to prevent division by 0 error. Default: ``True``.
+        norm (int): Order of norm for normalizing the tf-idf matrix. Default: 2.
+        max_evals (Optional[int]): Maximum number of fitness evaluations. Default: ``inf``. At least one of ``max_evals`` or
+         ``max_iters`` must be provided.
+        max_iters (Optional[int]): Maximum number of iterations. Default: ``inf``.
+        logging (bool): Enable logging of fitness improvements. Default: ``False``.
+
+    Returns:
+        Result: A named tuple containing the list of mined rules and the algorithm's run time in seconds.
+
+    """
+    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, logging)
+    task = Task(problem, max_evals=max_evals, max_iters=max_iters, optimization_type=OptimizationType.MAXIMIZATION)
+
+    if isinstance(algorithm, str):
+        algorithm = get_algorithm(algorithm, **kwargs)
+
+    start_time = time.perf_counter()
+    algorithm.run(task)
+    stop_time = time.perf_counter()
+
+    problem.rules.sort()
+
+    return Result(problem.rules, stop_time - start_time)
diff --git a/niaarm/niaarm.py b/niaarm/niaarm.py
@@ -68,7 +68,6 @@ def __init__(self, dimension, features, transactions, metrics, logging=False):
         super().__init__(dimension, 0.0, 1.0)
 
     def build_rule(self, vector):
-        r"""Build association rule from the candidate solution."""
         rule = []
 
         permutation = vector[-self.num_features:]
diff --git a/niaarm/rule_list.py b/niaarm/rule_list.py
@@ -1,7 +1,6 @@
 from collections import UserList
 import csv
 import numpy as np
-from niaarm.rule import Rule
 
 
 class RuleList(UserList):
@@ -87,12 +86,14 @@ def to_csv(self, filename):
         with open(filename, 'w', newline='') as f:
             writer = csv.writer(f)
 
+            metrics = self.data[0].metrics
+
             # write header
-            writer.writerow(("antecedent", "consequent", "fitness") + Rule.metrics)
+            writer.writerow(("antecedent", "consequent", "fitness") + metrics)
 
             for rule in self.data:
                 writer.writerow(
-                    [rule.antecedent, rule.consequent, rule.fitness] + [getattr(rule, metric) for metric in Rule.metrics])
+                    [rule.antecedent, rule.consequent, rule.fitness] + [getattr(rule, metric) for metric in metrics])
         print(f"Rules exported to {filename}")
 
     def __str__(self):
diff --git a/niaarm/text.py b/niaarm/text.py
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml