diff --git a/examples/visualization_examples/prepare_datasets.py b/examples/visualization_examples/prepare_datasets.py
index 388bea5..564035a 100644
--- a/examples/visualization_examples/prepare_datasets.py
+++ b/examples/visualization_examples/prepare_datasets.py
@@ -165,3 +165,76 @@ def get_data_developer_salary_data():
     ]]
 
     return arm_df
+
+
+def get_abalone_data():
+    # Read the CSV file into a DataFrame
+    df = pd.read_csv("datasets/Abalone.csv")
+
+    ######### DISCRETIZATION #########
+    def get_bin_edges(data_frame, column, bins_num):
+        # Return bin edges for pd.cut: quartile-based edges for five bins,
+        # equal-width edges for three bins
+        stats = data_frame[column].describe()
+        bin_edges = []
+        if bins_num == 5:
+            bin_edges = [
+                stats["min"],
+                stats["25%"],
+                stats["50%"],
+                stats["75%"],
+                stats["max"],
+                stats["max"] + 0.01
+            ]
+        elif bins_num == 3:
+            bin_edges = [
+                stats["min"],
+                (stats["min"] + (stats["max"] - stats["min"]) / 3),
+                (stats["min"] + 2 * (stats["max"] - stats["min"]) / 3),
+                stats["max"] + 0.01
+            ]
+
+        return bin_edges
+
+    # Discretize the size and weight columns into three bins each
+    size_labels = ["Small", "Medium", "Large"]
+    weight_labels = ["Light", "Medium", "Heavy"]
+    column_labels = {
+        "Length": size_labels,
+        "Diameter": size_labels,
+        "Height": size_labels,
+        "Whole weight": weight_labels,
+        "Shucked weight": weight_labels,
+        "Viscera weight": weight_labels,
+        "Shell weight": weight_labels,
+    }
+    for column, labels in column_labels.items():
+        df[column] = pd.cut(
+            df[column],
+            bins=get_bin_edges(df, column, 3),
+            labels=labels,
+            include_lowest=True
+        )
+
+    # AGE: derive a categorical age column from the ring count
+    df["Age"] = pd.cut(
+        df["Rings"],
+        bins=get_bin_edges(df, "Rings", 3),
+        labels=["Young", "Adult", "Old"],
+        include_lowest=True
+    )
+
+    # Select relevant columns for ARM
+    arm_df = df[[
+        "Sex",
+        "Length",
+        "Diameter",
+        "Height",
+        "Whole weight",
+        "Shucked weight",
+        "Viscera weight",
+        "Shell weight",
+        "Age"
+    ]]
+
+    return arm_df
diff --git a/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py
new file mode 100644
index 0000000..4bce801
--- /dev/null
+++ b/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py
@@ -0,0 +1,39 @@
+from examples.visualization_examples.prepare_datasets import get_abalone_data
+from niaarm import Dataset, get_rules
+from niaarm.visualize import sankey_diagram
+
+# Get prepared abalone data
+arm_df = get_abalone_data()
+
+# Prepare Dataset
+dataset = Dataset(
+    path_or_df=arm_df,
+    delimiter=","
+)
+
+# Get rules
+metrics = ("support", "confidence")
+rules, run_time = get_rules(
+    dataset=dataset,
+    algorithm="DifferentialEvolution",
+    metrics=metrics,
+    max_evals=500
+)
+
+# Sort rules
+rules.sort(by="support")
+# Print rule information
+print("\nRules:")
+print(rules)
+print(f"\nTime to generate rules: {run_time:.3f} seconds")
+print("\nRule information: ", rules[3])
+print("Antecedent: ", rules[3].antecedent)
+print("Consequent: ", rules[3].consequent)
+print("Confidence: ", rules[3].confidence)
+print("Support: ", rules[3].support)
+print("Lift: ", rules[3].lift)
+print("\nMetrics:", metrics)
+
+# Visualize Sankey diagram
+fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
+fig.show()
diff --git a/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py
new file mode 100644
index 0000000..5cca6f3
--- /dev/null
+++ b/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py
@@ -0,0 +1,43 @@
+from examples.visualization_examples.prepare_datasets import get_weather_data
+from niaarm import Dataset, get_rules
+from niaarm.visualize import sankey_diagram
+
+# Get prepared weather data
+arm_df = get_weather_data()
+
+# Prepare Dataset
+dataset = Dataset(
+    path_or_df=arm_df,
+    delimiter=","
+)
+
+# Get rules
+metrics = ("support", "confidence")
+rules, run_time = get_rules(
+    dataset=dataset,
+    algorithm="DifferentialEvolution",
+    metrics=metrics,
+    max_evals=500
+)
+
+# Append lift for reporting only: get_rules does not accept lift as an
+# optimization metric, so it is added after the rules have been generated
+metrics = (*metrics, "lift")
+
+# Sort rules
+rules.sort(by="support")
+# Print rule information
+print("\nRules:")
+print(rules)
+print(f"\nTime to generate rules: {run_time:.3f} seconds")
+print("\nRule information: ", rules[3])
+print("Antecedent: ", rules[3].antecedent)
+print("Consequent: ", rules[3].consequent)
+print("Confidence: ", rules[3].confidence)
+print("Support: ", rules[3].support)
+print("Lift: ", rules[3].lift)
+print("\nMetrics:", metrics)
+
+# Visualize Sankey diagram
+fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
+fig.show()
diff --git a/niaarm/visualize.py b/niaarm/visualize.py
index 9e220e5..dbc1fd4 100644
--- a/niaarm/visualize.py
+++ b/niaarm/visualize.py
@@ -3,8 +3,10 @@
 from matplotlib.colors import Normalize
 import numpy as np
 import plotly.express as px
+import plotly.graph_objects as go
 import pandas as pd
 from sklearn.cluster import KMeans
+from itertools import combinations
 
 
 def hill_slopes(rule, transactions):
@@ -554,4 +556,124 @@ def prepare_data(rules, metrics):
     plt.legend(title="Order")
     plt.grid(True)
     return plt
-    
\ No newline at end of file
+
+
+def sankey_diagram(rules, interestingness_measure, M=4):
+    """
+    Visualize association rules as a Sankey diagram.
+
+    Args:
+        rules (list[Rule]): Association rules to visualize.
+        interestingness_measure (str): Interestingness measure Z = {support, confidence, lift}, reflecting the quality of a particular connection.
+        M (int): Maximum number of rules to be selected for visualization. Default: 4.
+
+    Returns:
+        Figure or plot.
+    """
+
+    def compute_similarity(rule1, rule2):
+        """Compute the item-level Jaccard similarity of two rules."""
+        ant_inter = len(set(map(str, rule1.antecedent)) & set(map(str, rule2.antecedent)))
+        ant_union = len(set(map(str, rule1.antecedent)) | set(map(str, rule2.antecedent)))
+        con_inter = len(set(map(str, rule1.consequent)) & set(map(str, rule2.consequent)))
+        con_union = len(set(map(str, rule1.consequent)) | set(map(str, rule2.consequent)))
+        total_union = ant_union + con_union
+        return (ant_inter + con_inter) / total_union if total_union else 0.0
+
+    def build_adjacency_matrix(rules):
+        size = len(rules)
+        adjacency_matrix = np.zeros((size, size))
+
+        for i, j in combinations(range(size), 2):
+            similarity = compute_similarity(rules[i], rules[j])
+            adjacency_matrix[i, j] = similarity
+            adjacency_matrix[j, i] = similarity
+
+        return adjacency_matrix
+
+    def knapsack_selection(adj_matrix, rules, M):
+        fitness_scores = np.array([rule.fitness for rule in rules])
+        N = len(rules)  # number of rules
+        weights = np.ones(N)  # all rules have the same weight
+        similarity_weight = 1.0
+        fitness_weight = 0.5
+        # Profit per rule: its total similarity to the other rules plus its weighted fitness
+        combined_profits = similarity_weight * np.sum(adj_matrix, axis=1) + fitness_weight * fitness_scores
+
+        selected = np.zeros(N, dtype=int)
+
+        # Initialize DP table
+        dp = np.zeros((N + 1, M + 1))
+        for i in range(1, N + 1):
+            for w in range(1, M + 1):
+                if weights[i - 1] <= w:
+                    dp[i, w] = max(dp[i - 1, w], dp[i - 1, w - 1] + combined_profits[i - 1])
+                else:
+                    dp[i, w] = dp[i - 1, w]
+
+        # Backtrack to find selected rules
+        w = M
+        for i in range(N, 0, -1):
+            if dp[i, w] != dp[i - 1, w]:
+                selected[i - 1] = 1
+                w -= 1
+
+        selected_rules = [rules[i] for i in range(N) if selected[i]]
+
+        return selected_rules
+
+    def prepare_data(rules, M, interestingness_measure):
+        if not rules:
+            return [], [], [], []
+
+        adj_matrix = build_adjacency_matrix(rules)
+        selected_rules = knapsack_selection(adj_matrix, rules, M)
+
+        sources = []
+        targets = []
+        values = []
+        labels = []
+        node_indices = {}
+
+        for rule in selected_rules:
+            # Ensure all antecedents and consequents exist in the node list
+            for item in rule.antecedent + rule.consequent:
+                item_str = str(item)
+                if item_str not in node_indices:
+                    node_indices[item_str] = len(labels)
+                    labels.append(item_str)
+
+            # Connect each antecedent to each consequent
+            for antecedent in rule.antecedent:
+                for consequent in rule.consequent:
+                    sources.append(node_indices[str(antecedent)])
+                    targets.append(node_indices[str(consequent)])
+
+                    # Assign measure value for each connection
+                    if hasattr(rule, interestingness_measure):
+                        measure_value = getattr(rule, interestingness_measure)
+                    else:
+                        measure_value = rule.support  # Fall back to support
+                    values.append(measure_value)
+
+        return labels, sources, targets, values
+
+    labels, sources, targets, values = prepare_data(rules, M, interestingness_measure)
+
+    # Visualization using Plotly
+    fig = go.Figure(go.Sankey(
+        node=dict(
+            pad=15,
+            thickness=20,
+            line=dict(color="black", width=0.5),
+            label=labels
+        ),
+        link=dict(
+            source=sources,
+            target=targets,
+            value=values
+        )
+    ))
+    fig.update_layout(title_text=f"Sankey Diagram of Association Rules ({interestingness_measure})", font_size=10)
+
+    return fig
diff --git a/tests/test_sankey_diagram.py b/tests/test_sankey_diagram.py
new file mode 100644
index 0000000..1356a40
--- /dev/null
+++ b/tests/test_sankey_diagram.py
@@ -0,0 +1,47 @@
+import unittest
+from niaarm.visualize import sankey_diagram
+from niaarm import Rule
+
+
+class TestSankeyDiagram(unittest.TestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.rule1 = Rule(antecedent=["A", "B"], consequent=["C"])
+        cls.rule1.fitness = 1.0
+        cls.rule1.num_transactions = 10
+        cls.rule2 = Rule(antecedent=["D"], consequent=["E", "F"])
+        cls.rule2.fitness = 0.8
+        cls.rule2.num_transactions = 15
+        cls.rule3 = Rule(antecedent=["G", "H"], consequent=["I"])
+        cls.rule3.fitness = 0.9
+        cls.rule3.num_transactions = 12
+
+        cls.rules = [cls.rule1, cls.rule2, cls.rule3]
+
+    def test_sankey_output_type(self):
+        fig = sankey_diagram(self.rules, "support", M=3)
+        self.assertEqual(fig.__class__.__name__, "Figure")
+
+    def test_sankey_structure(self):
+        fig = sankey_diagram(self.rules, "support", M=3)
+        self.assertTrue("source" in fig.data[0].link)
+
+    def test_sankey_values(self):
+        fig = sankey_diagram(self.rules, "support", M=3)
+        link_data = fig.data[0].link
+        flow_values = link_data["value"]
+        expected_links = sum(len(rule.antecedent) * len(rule.consequent) for rule in self.rules)
+        self.assertEqual(len(flow_values), expected_links)
+
+    def test_sankey_rule_selection(self):
+        fig = sankey_diagram(self.rules, "support", M=2)
+        link_data = fig.data[0].link
+        flow_values = link_data["value"]
+        self.assertGreater(len(flow_values), 0)
+
+    def test_sankey_empty_rules(self):
+        fig = sankey_diagram([], "support", M=3)
+        self.assertEqual(len(fig.data[0].link["source"]), 0)
+        self.assertEqual(len(fig.data[0].link["target"]), 0)
+        self.assertEqual(len(fig.data[0].link["value"]), 0)
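
Note on the selection step in niaarm/visualize.py: knapsack_selection gives every rule a unit weight, so with capacity M the 0/1 knapsack reduces to keeping the M rules with the highest combined profit (total similarity to the other rules plus weighted fitness). Below is a minimal, self-contained sketch of that behaviour on toy data; the SimpleNamespace stand-ins only mimic the .antecedent, .consequent and .fitness attributes the selection step reads, and are illustrative assumptions rather than real niaarm Rule objects.

from itertools import combinations
from types import SimpleNamespace

import numpy as np

# Illustrative stand-in rules (not real niaarm Rule objects)
rules = [
    SimpleNamespace(antecedent=["A", "B"], consequent=["C"], fitness=1.0),
    SimpleNamespace(antecedent=["A"], consequent=["C"], fitness=0.6),
    SimpleNamespace(antecedent=["D"], consequent=["E"], fitness=0.9),
]


def similarity(r1, r2):
    # Item-level Jaccard similarity over antecedent and consequent items
    ant1, ant2 = set(map(str, r1.antecedent)), set(map(str, r2.antecedent))
    con1, con2 = set(map(str, r1.consequent)), set(map(str, r2.consequent))
    union = len(ant1 | ant2) + len(con1 | con2)
    return (len(ant1 & ant2) + len(con1 & con2)) / union if union else 0.0


# Pairwise similarity matrix, as in build_adjacency_matrix
size = len(rules)
adjacency = np.zeros((size, size))
for i, j in combinations(range(size), 2):
    adjacency[i, j] = adjacency[j, i] = similarity(rules[i], rules[j])

# Profit per rule: total similarity to the other rules plus weighted fitness
profits = 1.0 * adjacency.sum(axis=1) + 0.5 * np.array([r.fitness for r in rules])

# With unit weights, the knapsack with capacity M keeps the M most profitable rules
M = 2
selected = sorted(int(i) for i in np.argsort(profits)[-M:])
print("profits:", np.round(profits, 3))   # approx. [1.167 0.967 0.45]
print("selected rule indices:", selected)  # [0, 1]

Here rules 0 and 1 end up selected: the similar, high-fitness pair outweighs the isolated rule 2, which illustrates how the combined profit favours clusters of similar, high-fitness rules.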