Examples, added comments and tests

Zan Vrabic · Zan Vrabic · commit b2aab77d3868 · 2025-02-05T18:20:09.000+01:00
diff --git a/examples/visualization_examples/prepare_datasets.py b/examples/visualization_examples/prepare_datasets.py
@@ -165,3 +165,127 @@ def get_data_developer_salary_data():
     ]]
 
     return arm_df
+
+
+def get_abalone_data():
+    # Load the Abalone CSV (the path is relative to the current working directory).
+    df = pd.read_csv("datasets/Abalone.csv")
+
+    # DISCRETIZATION: helper returning pd.cut bin edges from describe() stats (3 or 5 bins, else []).
+    def get_descriptive_stats(data_frame, column, bins_num):
+        stats = data_frame[column].describe()
+        bins_values = []
+        if bins_num == 5:
+            bins_values = [
+                stats["min"],
+                stats["25%"],
+                stats["50%"],
+                stats["75%"],
+                stats["max"],
+                stats["max"] + 0.01  # NOTE(review): 6th edge -> last bin (max, max + 0.01] would be empty
+            ]
+        elif bins_num == 3:
+            bins_values = [
+                stats["min"],
+                (stats["min"] + (stats["max"] - stats["min"]) / 3),  # min + 1/3 of the range
+                (stats["min"] + 2 * (stats["max"] - stats["min"]) / 3),  # min + 2/3 of the range
+                stats["max"] + 0.01  # small pad above the maximum
+            ]
+
+        return bins_values
+
+    # LENGTH: three labelled bins over the observed min-max range
+    length_stats = get_descriptive_stats(df, "Length", 3)
+    length_labels = ["Small", "Medium", "Large"]
+    df["Length"] = pd.cut(
+        df["Length"],
+        bins=length_stats,
+        labels=length_labels,
+        include_lowest=True  # keep the minimum value inside the first bin
+    )
+
+    # DIAMETER: same three-bin scheme, overwriting the numeric column
+    diameter_stats = get_descriptive_stats(df, "Diameter", 3)
+    diameter_labels = ["Small", "Medium", "Large"]
+    df["Diameter"] = pd.cut(
+        df["Diameter"],
+        bins=diameter_stats,
+        labels=diameter_labels,
+        include_lowest=True
+    )
+
+    # HEIGHT: same three-bin scheme, overwriting the numeric column
+    height_stats = get_descriptive_stats(df, "Height", 3)
+    height_labels = ["Small", "Medium", "Large"]
+    df["Height"] = pd.cut(
+        df["Height"],
+        bins=height_stats,
+        labels=height_labels,
+        include_lowest=True
+    )	
+
+    # WHOLE WEIGHT: three bins labelled Light / Medium / Heavy
+    whole_weight_stats = get_descriptive_stats(df, "Whole weight", 3)
+    whole_weight_labels = ["Light", "Medium", "Heavy"]
+    df["Whole weight"] = pd.cut(
+        df["Whole weight"],
+        bins=whole_weight_stats,
+        labels=whole_weight_labels,
+        include_lowest=True
+    )
+
+    # SHUCKED WEIGHT: three bins labelled Light / Medium / Heavy
+    shucked_weight_stats = get_descriptive_stats(df, "Shucked weight", 3)
+    shucked_weight_labels = ["Light", "Medium", "Heavy"]
+    df["Shucked weight"] = pd.cut(
+        df["Shucked weight"],
+        bins=shucked_weight_stats,
+        labels=shucked_weight_labels,
+        include_lowest=True
+    )
+
+    # VISCERA WEIGHT: three bins labelled Light / Medium / Heavy
+    viscera_weight_stats = get_descriptive_stats(df, "Viscera weight", 3)
+    viscera_weight_labels = ["Light", "Medium", "Heavy"]
+    df["Viscera weight"] = pd.cut(
+        df["Viscera weight"],
+        bins=viscera_weight_stats,
+        labels=viscera_weight_labels,
+        include_lowest=True
+    )
+
+    # SHELL WEIGHT: three bins labelled Light / Medium / Heavy
+    shell_weight_stats = get_descriptive_stats(df, "Shell weight", 3)
+    shell_weight_labels = ["Light", "Medium", "Heavy"]
+    df["Shell weight"] = pd.cut(
+        df["Shell weight"],
+        bins=shell_weight_stats,
+        labels=shell_weight_labels,
+        include_lowest=True
+    )
+
+    # AGE: binned from "Rings" into a new "Age" column (the "Rings" column is not overwritten)
+    age_stats = get_descriptive_stats(df, "Rings", 3)
+    age_labels = ["Young", "Adult", "Old"]
+    df["Age"] = pd.cut(
+        df["Rings"],
+        bins=age_stats,
+        labels=age_labels,
+        include_lowest=True
+    )
+
+    # Select relevant columns for ARM: "Sex" plus the binned columns ("Rings" is left out)
+    arm_df = df[[
+        "Sex",
+        "Length",
+        "Diameter",
+        "Height",
+        "Whole weight",
+        "Shucked weight",
+        "Viscera weight",
+        "Shell weight",
+        "Age"
+    ]]
+
+    return arm_df
+    
diff --git a/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/abalone_sankey_diagram.py
@@ -0,0 +1,39 @@
+from examples.visualization_examples.prepare_datasets import get_abalone_data
+from niaarm import Dataset, get_rules
+from niaarm.visualize import sankey_diagram
+
+# Get the prepared (discretized) Abalone data
+arm_df = get_abalone_data()
+
+# Wrap the DataFrame in a niaarm Dataset
+dataset = Dataset(
+    path_or_df=arm_df,
+    delimiter=","
+)
+
+# Mine rules with Differential Evolution, using support and confidence as metrics
+metrics = ("support", "confidence")
+rules, run_time = get_rules(
+    dataset=dataset,
+    algorithm="DifferentialEvolution",
+    metrics=metrics,
+    max_evals=500
+)
+
+# Sort rules by support
+rules.sort(by="support")
+# Print rule information. NOTE(review): rules[3] assumes at least four rules were mined - confirm.
+print("\nRules:")
+print(rules)
+print(f"\nTime to generate rules: {run_time:.3f} seconds")
+print("\nRule information: ", rules[3])
+print("Antecedent: ", rules[3].antecedent)
+print("Consequent: ", rules[3].consequent)
+print("Confidence: ", rules[3].confidence)
+print("Support: ", rules[3].support)
+print("Lift: ", rules[3].lift)
+print("\nMetrics:", metrics)
+
+# Visualize sankey diagram
+fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
+fig.show()
diff --git a/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py b/examples/visualization_examples/sankey_diagram/weather_data_sankey_diagram.py
@@ -40,6 +40,6 @@
 print("Lift: ", rules[3].lift)
 print("\nMetrics:", metrics)
 
-# Visualize scatter plot
+# Visualize sankey diagram
 fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
 fig.show()
diff --git a/niaarm/visualize.py b/niaarm/visualize.py
@@ -593,11 +593,11 @@ def build_adjacency_matrix(rules):
     
     def knapsack_selection(adj_matrix, rules, M):
         fitness_scores = np.array([rule.fitness for rule in rules])
-        N = len(rules)
-        weights = np.ones(N)
+        N = len(rules)  # number of rules
+        weights = np.ones(N) # all rules have the same weight
         similarity_weight = 1.0
         fitness_weight = 0.5
-        combined_profits = similarity_weight * np.sum(adj_matrix) + fitness_weight * fitness_scores
+        combined_profits = similarity_weight * np.sum(adj_matrix) + fitness_weight * fitness_scores # combined similarities with fitness for values
     
         selected = np.zeros(N, dtype=int)
     
@@ -622,6 +622,9 @@ def knapsack_selection(adj_matrix, rules, M):
         return selected_rules
 
     def prepare_data(rules, M, interestingness_measure):
+        if not rules:
+            return [], [], [], []
+			
         adj_matrix = build_adjacency_matrix(rules)
         selected_rules = knapsack_selection(adj_matrix, rules, M)
 
@@ -644,13 +647,17 @@ def prepare_data(rules, M, interestingness_measure):
                     labels.append(str(consequent))
                 targets.append(node_indices[str(consequent)])
             
-            measure_value = getattr(rule, interestingness_measure, rule.support) #default support
-            values.append(measure_value) 
+            if hasattr(rule, interestingness_measure):
+                measure_value = getattr(rule, interestingness_measure)
+            else:
+                measure_value=rule.support # Default support
+            values.append(measure_value)
 
         return labels, sources, targets, values
 
     labels, sources, targets, values = prepare_data(rules, M, interestingness_measure)
 
+    # Build the Sankey figure with Plotly
     fig = go.Figure(go.Sankey(
         node=dict(
             pad=15, 
diff --git a/tests/test_sankey_diagram.py b/tests/test_sankey_diagram.py
@@ -0,0 +1,46 @@
+import unittest
+from niaarm.visualize import sankey_diagram
+from niaarm import Rule
+
+class TestSankeyDiagram(unittest.TestCase):
+    """Smoke tests for the figure returned by ``sankey_diagram``."""
+    @classmethod
+    def setUpClass(cls):
+        """Build three rules with hand-set fitness and transaction counts."""
+        cls.rule1 = Rule(antecedent=["A", "B"], consequent=["C"])
+        cls.rule1.fitness = 1.0
+        cls.rule1.num_transactions = 10
+        cls.rule2 = Rule(antecedent=["D"], consequent=["E", "F"])
+        cls.rule2.fitness = 0.8
+        cls.rule2.num_transactions = 15
+        cls.rule3 = Rule(antecedent=["G", "H"], consequent=["I"])
+        cls.rule3.fitness = 0.9
+        cls.rule3.num_transactions = 12
+        cls.rules = [cls.rule1, cls.rule2, cls.rule3]
+
+    def test_sankey_output_type(self):
+        fig = sankey_diagram(self.rules, "support", M=3)
+        self.assertEqual(fig.__class__.__name__, "Figure")
+
+    def test_sankey_structure(self):
+        fig = sankey_diagram(self.rules, "support", M=3)
+        self.assertIn("source", fig.data[0].link)
+
+    def test_sankey_values(self):
+        """Expect one flow value per rule when M equals the number of rules."""
+        fig = sankey_diagram(self.rules, "support", M=3)
+        flow_values = fig.data[0].link["value"]
+        self.assertEqual(len(flow_values), len(self.rules))
+
+    def test_sankey_with_custom_fitness(self):
+        """With M=2 (fewer than the three rules) at least one link must remain."""
+        fig = sankey_diagram(self.rules, "support", M=2)
+        flow_values = fig.data[0].link["value"]
+        self.assertGreater(len(flow_values), 0)
+
+    def test_sankey_no_empty_rules(self):
+        """An empty rule list must yield empty source, target and value arrays."""
+        link = sankey_diagram([], "support", M=3).data[0].link
+        for key in ("source", "target", "value"):
+            self.assertEqual(len(link[key]), 0, key)
+