Merge pull request #149 from vrabiczan/Sankey-diagram

firefly-cpp · web-flow · commit 8868354b7b20 · 2025-02-26T19:32:28.000Z
Sankey diagram
diff --git a/examples/visualization_examples/prepare_datasets.py b/examples/visualization_examples/prepare_datasets.py
@@ -165,3 +165,89 @@ def get_data_developer_salary_data():
     ]]
 
     return arm_df
+
+
+def get_abalone_data():
+    """Load the Abalone dataset and discretize it for association rule mining.
+
+    Reads ``datasets/Abalone.csv``, bins every numeric measurement column into
+    three labelled categories and derives an ``Age`` category from ``Rings``.
+
+    Returns:
+        pandas.DataFrame: ``Sex``, the seven discretized measurement columns
+        and ``Age``.
+    """
+    # Read csv and create DataFrame
+    df = pd.read_csv("datasets/Abalone.csv")
+
+    ######### DISCRETIZATION #########
+    def get_descriptive_stats(data_frame, column, bins_num):
+        """Return the ``pd.cut`` bin edges for ``column``.
+
+        For ``bins_num == 3`` the edges are the minimum, one third and two
+        thirds of the value range, and ``max + 0.01``. For ``bins_num == 5``
+        they are the minimum, the three quartiles, the maximum and
+        ``max + 0.01``.
+
+        Raises:
+            ValueError: If ``bins_num`` is neither 3 nor 5.
+        """
+        stats = data_frame[column].describe()
+        if bins_num == 5:
+            return [
+                stats["min"],
+                stats["25%"],
+                stats["50%"],
+                stats["75%"],
+                stats["max"],
+                stats["max"] + 0.01,
+            ]
+        if bins_num == 3:
+            value_range = stats["max"] - stats["min"]
+            return [
+                stats["min"],
+                stats["min"] + value_range / 3,
+                stats["min"] + 2 * value_range / 3,
+                stats["max"] + 0.01,
+            ]
+        # Fail loudly instead of handing pd.cut an empty list of edges.
+        raise ValueError(f"Unsupported number of bins: {bins_num}")
+
+    size_labels = ["Small", "Medium", "Large"]
+    weight_labels = ["Light", "Medium", "Heavy"]
+    age_labels = ["Young", "Adult", "Old"]
+
+    # (source column, target column, labels). Every column is discretized in
+    # place except Rings, which is written to the new Age column.
+    discretization_plan = [
+        ("Length", "Length", size_labels),
+        ("Diameter", "Diameter", size_labels),
+        ("Height", "Height", size_labels),
+        ("Whole weight", "Whole weight", weight_labels),
+        ("Shucked weight", "Shucked weight", weight_labels),
+        ("Viscera weight", "Viscera weight", weight_labels),
+        ("Shell weight", "Shell weight", weight_labels),
+        ("Rings", "Age", age_labels),
+    ]
+    for source, target, labels in discretization_plan:
+        df[target] = pd.cut(
+            df[source],
+            bins=get_descriptive_stats(df, source, 3),
+            labels=labels,
+            include_lowest=True,
+        )
+
+    # Select relevant columns for ARM
+    arm_df = df[[
+        "Sex",
+        "Length",
+        "Diameter",
+        "Height",
+        "Whole weight",
+        "Shucked weight",
+        "Viscera weight",
+        "Shell weight",
+        "Age"
+    ]]
+
+    return arm_df
diff --git a/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py
@@ -0,0 +1,43 @@
+from examples.visualization_examples.prepare_datasets import get_abalone_data
+from niaarm import Dataset, get_rules
+from niaarm.visualize import sankey_diagram
+
+# Get prepared abalone data
+arm_df = get_abalone_data()
+
+# Prepare Dataset
+dataset = Dataset(
+    path_or_df=arm_df,
+    delimiter=","
+)
+
+# Get rules
+metrics = ("support", "confidence")
+rules, run_time = get_rules(
+    dataset=dataset,
+    algorithm="DifferentialEvolution",
+    metrics=metrics,
+    max_evals=500
+)
+
+# Sort rules
+rules.sort(by="support")
+# Print rule information
+print("\nRules:")
+print(rules)
+print(f"\nTime to generate rules: {run_time:.3f} seconds")
+# Show one sample rule in detail. get_rules may return fewer than four
+# rules, so guard the index instead of raising IndexError.
+if len(rules) > 3:
+    sample_rule = rules[3]
+    print("\nRule information: ", sample_rule)
+    print("Antecedent: ", sample_rule.antecedent)
+    print("Consequent: ", sample_rule.consequent)
+    print("Confidence: ", sample_rule.confidence)
+    print("Support: ", sample_rule.support)
+    print("Lift: ", sample_rule.lift)
+print("\nMetrics:", metrics)
+
+# Visualize sankey diagram
+fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
+fig.show()
diff --git a/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py
@@ -0,0 +1,47 @@
+from examples.visualization_examples.prepare_datasets import get_weather_data
+from niaarm import Dataset, get_rules
+from niaarm.visualize import sankey_diagram
+
+# Get prepared weather data
+arm_df = get_weather_data()
+
+# Prepare Dataset
+dataset = Dataset(
+    path_or_df=arm_df,
+    delimiter=","
+)
+
+# Get rules
+metrics = ("support", "confidence")
+rules, run_time = get_rules(
+    dataset=dataset,
+    algorithm="DifferentialEvolution",
+    metrics=metrics,
+    max_evals=500
+)
+
+# Lift is not passed to get_rules; it is appended afterwards so that the
+# metric list printed below also names it.
+metrics = (*metrics, "lift")
+
+# Sort rules
+rules.sort(by="support")
+# Print rule information
+print("\nRules:")
+print(rules)
+print(f"\nTime to generate rules: {run_time:.3f} seconds")
+# Show one sample rule in detail. get_rules may return fewer than four
+# rules, so guard the index instead of raising IndexError.
+if len(rules) > 3:
+    sample_rule = rules[3]
+    print("\nRule information: ", sample_rule)
+    print("Antecedent: ", sample_rule.antecedent)
+    print("Consequent: ", sample_rule.consequent)
+    print("Confidence: ", sample_rule.confidence)
+    print("Support: ", sample_rule.support)
+    print("Lift: ", sample_rule.lift)
+print("\nMetrics:", metrics)
+
+# Visualize sankey diagram
+fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
+fig.show()
diff --git a/niaarm/visualize.py b/niaarm/visualize.py
@@ -3,8 +3,10 @@
 from matplotlib.colors import Normalize
 import numpy as np
 import plotly.express as px
+import plotly.graph_objects as go
 import pandas as pd
 from sklearn.cluster import KMeans
+from itertools import combinations
 
 
 def hill_slopes(rule, transactions):
@@ -554,4 +556,146 @@ def prepare_data(rules, metrics):
         plt.legend(title="Order")
         plt.grid(True)
         return plt
-        
+
+
+def sankey_diagram(rules, interestingness_measure, M=4):
+    """
+    Visualize rules as a sankey diagram.
+
+    At most ``M`` rules are selected; every antecedent item of a selected
+    rule is linked to every consequent item of that rule, and each link is
+    weighted by the chosen interestingness measure.
+
+    Args:
+        rules: Sequence of association rules to visualize.
+        interestingness_measure (str): Name of the rule attribute used as the
+            link value, e.g. "support", "confidence" or "lift". Rules without
+            such an attribute fall back to their support.
+        M (int): Maximum number of rules to be selected for visualization.
+            Default: 4
+
+    Returns:
+        plotly.graph_objects.Figure: The sankey diagram.
+    """
+
+    def item_names(items):
+        """Return the string forms of the given rule items as a set."""
+        return {str(item) for item in items}
+
+    def compute_similarity(rule1, rule2):
+        """Compute similarity between two rules.
+
+        Shared items divided by distinct items, summed over antecedent and
+        consequent. Items are compared by their string form.
+        """
+        ant1 = item_names(rule1.antecedent)
+        ant2 = item_names(rule2.antecedent)
+        con1 = item_names(rule1.consequent)
+        con2 = item_names(rule2.consequent)
+        total_union = len(ant1 | ant2) + len(con1 | con2)
+        if total_union == 0:
+            # Two rules without any items have nothing in common.
+            return 0.0
+        return (len(ant1 & ant2) + len(con1 & con2)) / total_union
+
+    def build_adjacency_matrix(rules):
+        """Return the symmetric rule-by-rule similarity matrix (zero diagonal)."""
+        size = len(rules)
+        adjacency_matrix = np.zeros((size, size))
+
+        for i, j in combinations(range(size), 2):
+            similarity = compute_similarity(rules[i], rules[j])
+            adjacency_matrix[i, j] = similarity
+            adjacency_matrix[j, i] = similarity
+
+        return adjacency_matrix
+
+    def knapsack_selection(adj_matrix, rules, M):
+        """Pick at most ``M`` rules with the highest total profit.
+
+        Every rule has unit weight; its profit combines its summed similarity
+        to all other rules with its fitness.
+        """
+        fitness_scores = np.array([rule.fitness for rule in rules])
+        N = len(rules)  # number of rules
+        similarity_weight = 1.0
+        fitness_weight = 0.5
+        # Row sums give one similarity total per rule. Summing the whole
+        # matrix would add the same constant to every rule's profit.
+        combined_profits = (
+            similarity_weight * np.sum(adj_matrix, axis=1)
+            + fitness_weight * fitness_scores
+        )
+
+        # dp[i, w]: best profit using the first i rules and at most w picks.
+        dp = np.zeros((N + 1, M + 1))
+        for i in range(1, N + 1):
+            for w in range(1, M + 1):
+                dp[i, w] = max(
+                    dp[i - 1, w],
+                    dp[i - 1, w - 1] + combined_profits[i - 1],
+                )
+
+        # Backtrack to find selected rules
+        selected = np.zeros(N, dtype=int)
+        w = M
+        for i in range(N, 0, -1):
+            if dp[i, w] != dp[i - 1, w]:
+                selected[i - 1] = 1
+                w -= 1
+
+        return [rules[i] for i in range(N) if selected[i]]
+
+    def prepare_data(rules, M, interestingness_measure):
+        """Return (labels, sources, targets, values) for the sankey trace."""
+        if not rules:
+            return [], [], [], []
+
+        adj_matrix = build_adjacency_matrix(rules)
+        selected_rules = knapsack_selection(adj_matrix, rules, M)
+
+        sources = []
+        targets = []
+        values = []
+        labels = []
+        node_indices = {}
+
+        for rule in selected_rules:
+            # Ensure all antecedents and consequents exist in the node list
+            for item in rule.antecedent + rule.consequent:
+                item_str = str(item)
+                if item_str not in node_indices:
+                    node_indices[item_str] = len(labels)
+                    labels.append(item_str)
+
+            # The measure is the same for every link of a rule; default to support.
+            measure_value = getattr(rule, interestingness_measure, rule.support)
+
+            # Connect each antecedent to each consequent
+            for antecedent in rule.antecedent:
+                for consequent in rule.consequent:
+                    sources.append(node_indices[str(antecedent)])
+                    targets.append(node_indices[str(consequent)])
+                    values.append(measure_value)
+
+        return labels, sources, targets, values
+
+    labels, sources, targets, values = prepare_data(rules, M, interestingness_measure)
+
+    # Visualization using Plotly
+    fig = go.Figure(go.Sankey(
+        node=dict(
+            pad=15,
+            thickness=20,
+            line=dict(color='black', width=0.5),
+            label=labels
+        ),
+        link=dict(
+            source=sources,
+            target=targets,
+            value=values
+        )
+    ))
+    fig.update_layout(title_text=f'Sankey Diagram of Association Rules ({interestingness_measure})', font_size=10)
+
+    return fig
diff --git a/tests/test_sankey_diagram.py b/tests/test_sankey_diagram.py