Commit e46ed94

Merge pull request #78 from zStupan/update-text-mining

Update text mining

2 parents: 5ee5abb + 8733539

File tree (5 files changed: +149, -15 lines)

examples/text_mining.py
niaarm/mine.py
niaarm/text.py
tests/test_data/artm_test_dataset.json
tests/test_text_mining.py

examples/text_mining.py

Lines changed: 8 additions & 1 deletion

@@ -5,7 +5,14 @@
 
 df = pd.read_json('datasets/text/artm_test_dataset.json', orient='records')
 documents = df['text'].tolist()
-corpus = Corpus.from_list(documents)
+
+try:
+    corpus = Corpus.from_list(documents)
+except LookupError:
+    import nltk
+    nltk.download('punkt')
+    nltk.download('stopwords')
+    corpus = Corpus.from_list(documents)
 
 algorithm = ParticleSwarmOptimization(population_size=200, seed=123)
 metrics = ('support', 'confidence', 'aws')
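
The example now fetches the punkt and stopwords resources lazily, only after tokenization raises a LookupError. An equivalent up-front check is possible with nltk.data.find, which raises LookupError for missing resources; the sketch below is an editor's illustration under that assumption, not part of the commit:

```python
import nltk

# Probe for the tokenizer and the stopword corpus before building the
# corpus; download each resource only if the lookup fails.
for resource, name in [('tokenizers/punkt', 'punkt'), ('corpora/stopwords', 'stopwords')]:
    try:
        nltk.data.find(resource)
    except LookupError:
        nltk.download(name)
```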

niaarm/mine.py

Lines changed: 4 additions & 2 deletions

@@ -54,7 +54,7 @@ def get_rules(dataset, algorithm, metrics, max_evals=np.inf, max_iters=np.inf, l
     return Result(problem.rules, stop_time - start_time)
 
 
-def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, max_evals=np.inf, max_iters=np.inf,
+def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, threshold=0, max_evals=np.inf, max_iters=np.inf,
                    logging=False, **kwargs):
     """Mine association rules in a text corpus.
 
@@ -69,6 +69,8 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
             a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
         smooth (bool): Smooth idf to prevent division by 0 error. Default: ``True``.
         norm (int): Order of norm for normalizing the tf-idf matrix. Default: 2.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
         max_evals (Optional[int]): Maximum number of fitness evaluations. Default: ``inf``. At least one of ``max_evals`` or
             ``max_iters`` must be provided.
         max_iters (Optional[int]): Maximum number of iterations. Default: ``inf``.
@@ -78,7 +80,7 @@ def get_text_rules(corpus, max_terms, algorithm, metrics, smooth=True, norm=2, m
         Result: A named tuple containing the list of mined rules and the algorithm's run time in seconds.
 
     """
-    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, logging)
+    problem = NiaARTM(max_terms, corpus.terms(), corpus.tf_idf_matrix(smooth=smooth, norm=norm), metrics, threshold, logging)
     task = Task(problem, max_evals=max_evals, max_iters=max_iters, optimization_type=OptimizationType.MAXIMIZATION)
 
     if isinstance(algorithm, str):
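
To see the new parameter in context, here is a hedged usage sketch of get_text_rules with a nonzero threshold. The toy documents, the threshold value of 0.1, and the small optimizer budget are illustrative assumptions, the result's ``rules`` field is assumed from the Result docstring above, and NLTK's punkt/stopwords resources must already be available:

```python
from niapy.algorithms.basic import ParticleSwarmOptimization
from niaarm.mine import get_text_rules
from niaarm.text import Corpus

# A toy corpus; any list of document strings works.
documents = [
    'swarm intelligence algorithms for association rule mining',
    'association rule mining with differential evolution',
    'monitoring sport training sessions with embedded devices',
]
corpus = Corpus.from_list(documents)

# With threshold=0.1, a term whose tf-idf weight is <= 0.1 is treated as
# absent from a transaction when support and confidence are counted.
algorithm = ParticleSwarmOptimization(population_size=30, seed=123)
result = get_text_rules(corpus, max_terms=3, algorithm=algorithm,
                        metrics=('support', 'confidence', 'aws'),
                        threshold=0.1, max_iters=20)
print(result.rules)
```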

niaarm/text.py

Lines changed: 37 additions & 12 deletions

@@ -197,6 +197,8 @@ class TextRule(Rule):
         consequent (list[str]): A list of consequent terms of the text rule.
         fitness (Optional[float]): Fitness value of the text rule.
         transactions (Optional[pandas.DataFrame]): The tf-idf matrix as a pandas DataFrame.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
 
     Attributes:
         aws: The sum of tf-idf values for all the terms in the rule.
@@ -216,11 +218,19 @@ class TextRule(Rule):
         'comprehensibility', 'netconf', 'yulesq', 'aws'
     )
 
-    def __post_init__(self, transactions):
+    def __init__(self, antecedent, consequent, fitness=0.0, transactions=None, threshold=0):
+        super().__init__(antecedent, consequent, fitness, transactions=None)
+
+        if transactions is not None:
+            self.num_transactions = len(transactions)
+            self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
+            self.__post_init__(transactions, threshold)
+
+    def __post_init__(self, transactions, threshold=0):
         self.__inclusion = (len(self.antecedent) + len(self.consequent)) / len(transactions.columns)
         self.__aws = transactions[self.antecedent + self.consequent].values.sum()
-        contains_antecedent = (transactions[self.antecedent] > 0).all(axis=1)
-        contains_consequent = (transactions[self.consequent] > 0).all(axis=1)
+        contains_antecedent = (transactions[self.antecedent] > threshold).all(axis=1)
+        contains_consequent = (transactions[self.consequent] > threshold).all(axis=1)
         self.antecedent_count = contains_antecedent.sum()
         self.consequent_count = contains_consequent.sum()
         self.full_count = (contains_antecedent & contains_consequent).sum()
@@ -232,6 +242,10 @@ def __post_init__(self, transactions):
     def amplitude(self):
         return np.nan
 
+    @property
+    def inclusion(self):
+        return self.__inclusion
+
     @property
     def aws(self):
         return self.__aws
@@ -253,6 +267,8 @@ class NiaARTM(NiaARM):
         metrics (Union[Dict[str, float], Sequence[str]]): Metrics to take into account when computing the fitness.
             Metrics can either be passed as a Dict of pairs {'metric_name': <weight of metric>} or
             a sequence of metrics as strings, in which case, the weights of the metrics will be set to 1.
+        threshold (Optional[float]): Threshold of tf-idf weights. If a weight is less than or equal to the
+            threshold, the term is not included in the transaction. Default: 0.
         logging (bool): Enable logging of fitness improvements. Default: ``False``.
 
     Attributes:
@@ -264,27 +280,36 @@
         'support', 'confidence', 'coverage', 'interestingness', 'comprehensibility', 'inclusion', 'rhs_support', 'aws'
     )
 
-    def __init__(self, max_terms, terms, transactions, metrics, logging=False):
+    def __init__(self, max_terms, terms, transactions, metrics, threshold=0, logging=False):
         super().__init__(max_terms + 1, terms, transactions, metrics, logging)
         self.max_terms = max_terms
+        self.threshold = threshold
 
     def build_rule(self, vector):
-        y = np.zeros(self.num_features, dtype=bool)
-        y[(vector * (self.num_features - 1)).astype(int)] = True
-        return np.array(self.features)[y].tolist()
+        terms = [self.features[int(val * (self.num_features - 1))] for val in vector]
+
+        seen = set()
+        rule = []
+        for term in terms:
+            if term in seen:
+                continue
+            rule.append(term)
+            seen.add(term)
+
+        return rule
 
-    def _evaluate(self, sol):
-        cut_value = sol[self.dimension - 1]
-        solution = sol[:-1]
-        cut = _cut_point(cut_value, self.max_terms)
+    def _evaluate(self, x):
+        cut_value = x[self.dimension - 1]
+        solution = x[:-1]
 
         rule = self.build_rule(solution)
+        cut = _cut_point(cut_value, len(rule))
 
         antecedent = rule[:cut]
         consequent = rule[cut:]
 
         if antecedent and consequent:
-            rule = TextRule(antecedent, consequent, transactions=self.transactions)
+            rule = TextRule(antecedent, consequent, transactions=self.transactions, threshold=self.threshold)
             metrics = [getattr(rule, metric) for metric in self.metrics]
             fitness = np.dot(self.weights, metrics) / self.sum_weights
             rule.fitness = fitness
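
The build_rule rewrite changes how a solution vector is decoded: each component now selects a term by index and duplicates are dropped in first-seen order, so a rule may contain fewer than max_terms terms, which is why _evaluate now derives the cut point from len(rule) instead of max_terms. A standalone sketch with a made-up feature list (the real method reads self.features and self.num_features):

```python
def build_rule(vector, features):
    # Map each component in [0, 1] to a feature index.
    terms = [features[int(v * (len(features) - 1))] for v in vector]
    # Keep only the first occurrence of each term, preserving order.
    seen, rule = set(), []
    for term in terms:
        if term not in seen:
            rule.append(term)
            seen.add(term)
    return rule

features = ['mining', 'rules', 'data', 'swarm']
print(build_rule([0.9, 0.1, 0.95, 0.2], features))  # ['data', 'mining']
```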
tests/test_data/artm_test_dataset.json

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
[
  {
    "id": "1",
    "text": "Computational Intelligence methods for automatic generation of sport training plans in individual sport disciplines have achieved a mature phase. In order to confirm their added value, they have been deployed into practice. As a result, several methods have been developed for generating well formulated training plans on computers automatically that, typically, depend on the collection of past sport activities. However, monitoring the realization of the performed training sessions still represents a bottleneck in automating the process of sport training as a whole. The objective of this paper is to present a new low-cost and efficient embedded device for monitoring the realization of sport training sessions that is dedicated to monitor cycling training sessions. We designed and developed a new bike computer, i.e. the AST-Monitor, that can be mounted easily on almost every bicycle. The aforementioned bike computer is based on the Raspberry Pi device that supports different external sensors for capturing the data during the realization of sport training sessions. An adjusted GUI tailored to the needs of athletes is developed, along with the hardware. The proof of concept study, using the AST-Monitor in practice, revealed the potential of the proposed solution for monitoring of realized sport training sessions automatically. The new device also opens the door for the future utilization of Artificial Intelligence in a wide variety of sports.",
    "reference": "https://arxiv.org/abs/2109.13334"
  },
  {
    "id": "2",
    "text": "Numerical Association Rule Mining is a popular variant of Association Rule Mining, where numerical attributes are handled without discretization. This means that the algorithms for dealing with this problem can operate directly, not only with categorical, but also with numerical attributes. Until recently, a big portion of these algorithms were based on a stochastic nature-inspired population-based paradigm. As a result, evolutionary and swarm intelligence-based algorithms showed big efficiency for dealing with the problem. In line with this, the main mission of this chapter is to make a historical overview of swarm intelligence-based algorithms for Numerical Association Rule Mining, as well as to present the main features of these algorithms for the observed problem. A taxonomy of the algorithms was proposed on the basis of the applied features found in this overview. Challenges, waiting in the future, finish this paper.",
    "reference": "https://arxiv.org/abs/2010.15524"
  },
  {
    "id": "3",
    "text": "The paper presents a novel software framework for Association Rule Mining named uARMSolver. The framework is written fully in C++ and runs on all platforms. It allows users to preprocess their data in a transaction database, to make discretization of data, to search for association rules and to guide a presentation/visualization of the best rules found using external tools. As opposed to the existing software packages or frameworks, this also supports numerical and real-valued types of attributes besides the categorical ones. Mining the association rules is defined as an optimization and solved using the nature-inspired algorithms that can be incorporated easily. Because the algorithms normally discover a huge amount of association rules, the framework enables a modular inclusion of so-called visual guiders for extracting the knowledge hidden in data, and visualize these using external tools.",
    "reference": "https://arxiv.org/abs/2010.10884"
  },
  {
    "id": "4",
    "text": "Decisions made nowadays by Artificial Intelligence powered systems are usually hard for users to understand. One of the more important issues faced by developers is exposed as how to create more explainable Machine Learning models. In line with this, more explainable techniques need to be developed, where visual explanation also plays a more important role. This technique could also be applied successfully for explaining the results of Association Rule Mining.This Chapter focuses on two issues: (1) How to discover the relevant association rules, and (2) How to express relations between more attributes visually. For the solution of the first issue, the proposed method uses Differential Evolution, while Sankey diagrams are adopted to solve the second one. This method was applied to a transaction database containing data generated by an amateur cyclist in past seasons, using a mobile device worn during the realization of training sessions that is divided into four time periods. The results of visualization showed that a trend in improving performance of an athlete can be indicated by changing the attributes appearing in the selected association rules in different time periods.",
    "reference": "https://arxiv.org/abs/2010.03834"
  },
  {
    "id": "5",
    "text": "A COVID-19 pandemic has already proven itself to be a global challenge. It proves how vulnerable humanity can be. It has also mobilized researchers from different sciences and different countries in the search for a way to fight this potentially fatal disease. In line with this, our study analyses the abstracts of papers related to COVID-19 and coronavirus-related-research using association rule text mining in order to find the most interestingness words, on the one hand, and relationships between them on the other. Then, a method, called information cartography, was applied for extracting structured knowledge from a huge amount of association rules. On the basis of these methods, the purpose of our study was to show how researchers have responded in similar epidemic/pandemic situations throughout history.",
    "reference": "https://arxiv.org/abs/2004.03397"
  },
  {
    "id": "6",
    "text": "Association Rule Mining is a machine learning method for discovering the interesting relations between the attributes in a huge transaction database. Typically, algorithms for Association Rule Mining generate a huge number of association rules, from which it is hard to extract structured knowledge and present this automatically in a form that would be suitable for the user. Recently, an information cartography has been proposed for creating structured summaries of information and visualizing with methodology called metromaps. This was applied to several problem domains, where pattern mining was necessary. The aim of this study is to develop a method for automatic creation of metro maps of information obtained by Association Rule Mining and, thus, spread its applicability to the other machine learning methods. Although the proposed method consists of multiple steps, its core presents metro map construction that is defined in the study as an optimization problem, which is solved using an evolutionary algorithm. Finally, this was applied to four well-known UCI Machine Learning datasets and one sport dataset. Visualizing the resulted metro maps not only justifies that this is a suitable tool for presenting structured knowledge hidden in data, but also that they can tell stories to users.",
    "reference": "https://arxiv.org/abs/2003.00348"
  },
  {
    "id": "7",
    "text": "Modeling preference time in triathlons means predicting the intermediate times of particular sports disciplines by a given overall finish time in a specific triathlon course for the athlete with the known personal best result. This is a hard task for athletes and sport trainers due to a lot of different factors that need to be taken into account, e.g., athlete's abilities, health, mental preparations and even their current sports form. So far, this process was calculated manually without any specific software tools or using the artificial intelligence. This paper presents the new solution for modeling preference time in middle distance triathlons based on particle swarm optimization algorithm and archive of existing sports results. Initial results are presented, which suggest the usefulness of proposed approach, while remarks for future improvements and use are also emphasized. ",
    "reference": "https://arxiv.org/abs/1707.00718"
  },
  {
    "id": "8",
    "text": "To predict the final result of an athlete in a marathon run thoroughly is the eternal desire of each trainer. Usually, the achieved result is weaker than the predicted one due to the objective (e.g., environmental conditions) as well as subjective factors (e.g., athlete's malaise). Therefore, making up for the deficit between predicted and achieved results is the main ingredient of the analysis performed by trainers after the competition. In the analysis, they search for parts of a marathon course where the athlete lost time. This paper proposes an automatic making up for the deficit by using a Differential Evolution algorithm. In this case study, the results that were obtained by a wearable sports-watch by an athlete in a real marathon are analyzed. The first experiments with Differential Evolution show the possibility of using this method in the future.",
    "reference": "https://arxiv.org/abs/1705.03302"
  },
  {
    "id": "9",
    "text": " The firefly algorithm has become an increasingly important tool of Swarm Intelligence that has been applied in almost all areas of optimization, as well as engineering practice. Many problems from various areas have been successfully solved using the firefly algorithm and its variants. In order to use the algorithm to solve diverse problems, the original firefly algorithm needs to be modified or hybridized. This paper carries out a comprehensive review of this living and evolving discipline of Swarm Intelligence, in order to show that the firefly algorithm could be applied to every problem arising in practice. On the other hand, it encourages new researchers and algorithm developers to use this simple and yet very efficient algorithm for problem solving. It often guarantees that the obtained results will meet the expectations.",
    "reference": "https://arxiv.org/abs/1312.6609"
  }
]

tests/test_text_mining.py

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
import os
from unittest import TestCase
import numpy as np
import pandas as pd
import nltk

from niaarm.niaarm import _cut_point
from niaarm.text import Corpus, TextRule, NiaARTM


class TestTextMining(TestCase):
    def setUp(self):
        nltk.download('punkt')
        nltk.download('stopwords')
        ds_path = os.path.join(os.path.dirname(__file__), 'test_data', 'artm_test_dataset.json')
        df = pd.read_json(ds_path, orient='records')
        documents = df['text'].tolist()
        self.corpus = Corpus.from_list(documents)
        self.problem = NiaARTM(5, self.corpus.terms(), self.corpus.tf_idf_matrix(), ('support', 'confidence', 'aws'))

    def test_rule_building(self):
        x = np.array([0.7572383073496659, 0.3585746102449889, 0.534521158129176, 0.7394209354120267, 0.08463251670378619,
                      0.6666934805])
        rule = self.problem.build_rule(x[:-1])
        self.assertEqual(rule, ['resulted', 'form', 'mining', 'relations', 'attributes'])

    def test_cut_point(self):
        x = np.array([0.7572383073496659, 0.3585746102449889, 0.534521158129176, 0.7394209354120267, 0.08463251670378619,
                      0.6666934805])

        cut_value = x[-1]
        rule = self.problem.build_rule(x[:-1])
        cut = _cut_point(cut_value, self.problem.max_terms)

        antecedent = rule[:cut]
        consequent = rule[cut:]

        self.assertEqual(cut, 3)
        self.assertEqual(antecedent, ['resulted', 'form', 'mining'])
        self.assertEqual(consequent, ['relations', 'attributes'])

    def test_metrics(self):
        rule = TextRule(['resulted', 'form', 'mining'], ['relations', 'attributes'], transactions=self.problem.transactions)
        self.assertEqual(rule.lift, 4.5)
        self.assertEqual(rule.coverage, 0.1111111111111111)
        self.assertEqual(rule.rhs_support, 0.2222222222222222)
        self.assertEqual(rule.conviction, 3502799710177052.5)
        self.assertEqual(rule.inclusion, 0.011111111111111112)
        self.assertEqual(rule.interestingness, 0.49382716049382713)
        self.assertEqual(rule.comprehensibility, 0.6131471927654585)
        self.assertEqual(rule.netconf, 0.8749999999999999)
        self.assertEqual(rule.yulesq, 1.0)
        self.assertEqual(rule.aws, 1.44320067609805)
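
The module should also run on its own with the standard unittest runner, e.g. python -m unittest tests.test_text_mining from the repository root, provided NLTK can download punkt and stopwords during setUp.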
