Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
124 changes: 124 additions & 0 deletions examples/visualization_examples/prepare_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,127 @@ def get_data_developer_salary_data():
]]

return arm_df


def get_abalone_data():
    """Load the Abalone dataset and discretize it for association rule mining.

    Reads ``datasets/Abalone.csv`` (relative to the current working
    directory), replaces every numeric measurement column with a three-level
    categorical column built from equal-width bins, derives an ``Age``
    column from ``Rings`` and returns only the columns used for mining.

    Returns:
        pandas.DataFrame: Columns ``Sex``, ``Length``, ``Diameter``,
        ``Height``, ``Whole weight``, ``Shucked weight``, ``Viscera weight``,
        ``Shell weight`` and ``Age``.
    """
    # Read csv and create DataFrame
    df = pd.read_csv("datasets/Abalone.csv")

    ######### DISCRETIZATION #########
    def get_descriptive_stats(data_frame, column, bins_num):
        """Return the bin edges used to discretize ``column``.

        ``bins_num == 5`` yields quartile-based edges, ``bins_num == 3``
        yields equal-width edges. The last edge is pushed 0.01 past the
        maximum so the maximum value itself falls inside the last interval.
        """
        stats = data_frame[column].describe()
        low = stats["min"]
        high = stats["max"]
        if bins_num == 5:
            return [
                low,
                stats["25%"],
                stats["50%"],
                stats["75%"],
                high,
                high + 0.01,
            ]
        if bins_num == 3:
            span = high - low
            return [
                low,
                low + span / 3,
                low + 2 * span / 3,
                high + 0.01,
            ]
        # An empty edge list would only surface later as an opaque pd.cut
        # failure, so reject unsupported values right here.
        raise ValueError(f"Unsupported number of bins: {bins_num!r}")

    size_labels = ["Small", "Medium", "Large"]
    weight_labels = ["Light", "Medium", "Heavy"]
    age_labels = ["Young", "Adult", "Old"]

    # (source column, target column, category labels)
    discretization_plan = [
        ("Length", "Length", size_labels),
        ("Diameter", "Diameter", size_labels),
        ("Height", "Height", size_labels),
        ("Whole weight", "Whole weight", weight_labels),
        ("Shucked weight", "Shucked weight", weight_labels),
        ("Viscera weight", "Viscera weight", weight_labels),
        ("Shell weight", "Shell weight", weight_labels),
        ("Rings", "Age", age_labels),
    ]

    for source, target, labels in discretization_plan:
        df[target] = pd.cut(
            df[source],
            bins=get_descriptive_stats(df, source, len(labels)),
            labels=labels,
            include_lowest=True,
        )

    # Select relevant columns for ARM
    arm_df = df[[
        "Sex",
        "Length",
        "Diameter",
        "Height",
        "Whole weight",
        "Shucked weight",
        "Viscera weight",
        "Shell weight",
        "Age",
    ]]

    return arm_df

Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
from examples.visualization_examples.prepare_datasets import get_abalone_data
from niaarm import Dataset, get_rules
from niaarm.visualize import sankey_diagram

# Example: mine association rules on the discretized Abalone dataset and
# visualize a selection of them as a sankey diagram.

# Get prepared abalone data
arm_df = get_abalone_data()

# Prepare Dataset
dataset = Dataset(
    path_or_df=arm_df,
    delimiter=","
)

# Get rules
metrics = ("support", "confidence")
rules, run_time = get_rules(
    dataset=dataset,
    algorithm="DifferentialEvolution",
    metrics=metrics,
    max_evals=500
)

# Sort rules
rules.sort(by="support")
# Print rule information
print("\nRules:")
print(rules)
print(f"\nTime to generate rules: {run_time:.3f} seconds")
# The search is stochastic and may return fewer than four rules, so only
# print the sample rule when it actually exists.
if len(rules) > 3:
    sample_rule = rules[3]
    print("\nRule information: ", sample_rule)
    print("Antecedent: ", sample_rule.antecedent)
    print("Consequent: ", sample_rule.consequent)
    print("Confidence: ", sample_rule.confidence)
    print("Support: ", sample_rule.support)
    print("Lift: ", sample_rule.lift)
else:
    print("\nFewer than four rules were found; skipping the sample rule.")
print("\nMetrics:", metrics)

# Visualize sankey diagram
fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
fig.show()
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from examples.visualization_examples.prepare_datasets import get_weather_data
from niaarm import Dataset, get_rules
from niaarm.visualize import sankey_diagram

# Example: mine association rules on the prepared weather dataset and
# visualize a selection of them as a sankey diagram.

# Get prepared weather data
arm_df = get_weather_data()

# Prepare Dataset
dataset = Dataset(
    path_or_df=arm_df,
    delimiter=","
)

# Get rules
metrics = ("support", "confidence")
rules, run_time = get_rules(
    dataset=dataset,
    algorithm="DifferentialEvolution",
    metrics=metrics,
    max_evals=500
)

# Add lift after the rules have been generated
# Cannot be in metrics before because get_rules metrics doesn't contain lift, therefore we need to add after
metrics = (*metrics, "lift")

# Sort rules
rules.sort(by="support")
# Print rule information
print("\nRules:")
print(rules)
print(f"\nTime to generate rules: {run_time:.3f} seconds")
# The search is stochastic and may return fewer than four rules, so only
# print the sample rule when it actually exists.
if len(rules) > 3:
    sample_rule = rules[3]
    print("\nRule information: ", sample_rule)
    print("Antecedent: ", sample_rule.antecedent)
    print("Consequent: ", sample_rule.consequent)
    print("Confidence: ", sample_rule.confidence)
    print("Support: ", sample_rule.support)
    print("Lift: ", sample_rule.lift)
else:
    print("\nFewer than four rules were found; skipping the sample rule.")
print("\nMetrics:", metrics)

# Visualize sankey diagram
fig = sankey_diagram(rules=rules, interestingness_measure="support", M=4)
fig.show()
124 changes: 123 additions & 1 deletion niaarm/visualize.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
from matplotlib.colors import Normalize
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd
from sklearn.cluster import KMeans
from itertools import combinations


def hill_slopes(rule, transactions):
Expand Down Expand Up @@ -554,4 +556,124 @@ def prepare_data(rules, metrics):
plt.legend(title="Order")
plt.grid(True)
return plt



def sankey_diagram(rules, interestingness_measure, M=4):
    """
    Visualize rules as a sankey diagram.

    At most ``M`` rules are selected with a unit-weight 0/1 knapsack whose
    per-rule profit combines the rule's summed similarity to all other rules
    with its fitness. Every antecedent item of a selected rule is then linked
    to every consequent item of that rule, and the link width is the chosen
    interestingness measure.

    Args:
        rules (Rule): Association rule or rules to visualize.
        interestingness_measure (str): Name of the rule attribute used as the
            value of each link, e.g. "support", "confidence" or "lift". Rules
            that have no attribute of that name fall back to their support.
        M (int): Maximum number of rules to be selected for visualization.
            Default: 4

    Returns:
        Figure or plot.
    """

    def item_sets(rule):
        """Return the antecedent and consequent of a rule as sets of item strings."""
        return (
            {str(item) for item in rule.antecedent},
            {str(item) for item in rule.consequent},
        )

    def compute_similarity(items1, items2):
        """Compute the item-level Jaccard similarity between two rules.

        Both arguments are ``(antecedent_set, consequent_set)`` pairs as
        produced by ``item_sets``. Comparing whole items (instead of the
        characters of the printed rule) makes the score reflect shared
        attributes rather than shared letters.
        """
        ant1, con1 = items1
        ant2, con2 = items2
        union_size = len(ant1 | ant2) + len(con1 | con2)
        if union_size == 0:
            return 0.0
        return (len(ant1 & ant2) + len(con1 & con2)) / union_size

    def build_adjacency_matrix(rules):
        """Build the symmetric rule-to-rule similarity matrix (zero diagonal)."""
        # Stringify each rule's items once instead of once per compared pair.
        keyed = [item_sets(rule) for rule in rules]
        size = len(keyed)
        adjacency_matrix = np.zeros((size, size))

        for i, j in combinations(range(size), 2):
            similarity = compute_similarity(keyed[i], keyed[j])
            adjacency_matrix[i, j] = similarity
            adjacency_matrix[j, i] = similarity

        return adjacency_matrix

    def knapsack_selection(adj_matrix, rules, M):
        """Select up to ``M`` rules maximizing similarity plus fitness."""
        fitness_scores = np.array([rule.fitness for rule in rules])
        N = len(rules)  # number of rules
        weights = np.ones(N)  # all rules have the same weight
        similarity_weight = 1.0
        fitness_weight = 0.5
        # Sum each row so every rule gets its OWN similarity total; summing
        # the whole matrix would add the same constant to every rule and the
        # similarity would never influence which rules are selected.
        combined_profits = (
            similarity_weight * np.sum(adj_matrix, axis=1)
            + fitness_weight * fitness_scores
        )

        selected = np.zeros(N, dtype=int)

        # Initialize DP table: dp[i, w] is the best profit achievable with
        # the first i rules and a capacity of w rules.
        dp = np.zeros((N + 1, M + 1))
        for i in range(1, N + 1):
            for w in range(1, M + 1):
                if weights[i - 1] <= w:
                    dp[i, w] = max(
                        dp[i - 1, w],
                        dp[i - 1, w - 1] + combined_profits[i - 1],
                    )
                else:
                    dp[i, w] = dp[i - 1, w]

        # Backtrack to find selected rules
        w = M
        for i in range(N, 0, -1):
            if w == 0:
                # Capacity exhausted; no further rule can be part of the solution.
                break
            if dp[i, w] != dp[i - 1, w]:
                selected[i - 1] = 1
                w -= 1

        return [rules[i] for i in range(N) if selected[i]]

    def prepare_data(rules, M, interestingness_measure):
        """Turn the selected rules into node labels and link lists."""
        if not rules:
            return [], [], [], []

        adj_matrix = build_adjacency_matrix(rules)
        selected_rules = knapsack_selection(adj_matrix, rules, M)

        sources = []
        targets = []
        values = []
        labels = []
        node_indices = {}

        for rule in selected_rules:
            # Ensure all antecedents and consequents exist in the node list
            for item in rule.antecedent + rule.consequent:
                item_str = str(item)
                if item_str not in node_indices:
                    node_indices[item_str] = len(labels)
                    labels.append(item_str)

            # The measure belongs to the rule, so look it up once per rule
            # and reuse it for all of the rule's links.
            if hasattr(rule, interestingness_measure):
                measure_value = getattr(rule, interestingness_measure)
            else:
                measure_value = rule.support  # Default support

            # Connect each antecedent to each consequent
            for antecedent in rule.antecedent:
                for consequent in rule.consequent:
                    sources.append(node_indices[str(antecedent)])
                    targets.append(node_indices[str(consequent)])
                    values.append(measure_value)

        return labels, sources, targets, values

    labels, sources, targets, values = prepare_data(rules, M, interestingness_measure)

    # Visualization using Plotly
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color='black', width=0.5),
            label=labels
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    ))
    fig.update_layout(title_text=f'Sankey Diagram of Association Rules ({interestingness_measure})', font_size=10)

    return fig

Loading
Loading