Stroke-Binary-Classification/evaluation_plots.py at main · Jackryd/Stroke-Binary-Classification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

f1_scores = {}


def parse_cm_string(cm_string):
    # Remove whitespace and split into rows
    rows = cm_string.strip().split("\n")
    # Remove brackets and split each row into numbers
    data = [[int(num) for num in re.findall(r"\d+", row)] for row in rows]
    return np.array(data)


def plot_best_f1(file, name):
    # f1,log,pca,cm,colsamle_bytree,gamma,learning_rate,max_depth,n_estimators,reg_alpha,reg_lambda,roc_auc,fpr,tpr,thresholds,
    print(name)
    df = pd.read_csv(file)
    df = df[["f1", "cm", "roc_auc"]]

    df = df.sort_values("f1", axis=0, ascending=False)
    cm = df["cm"][:1].values[0]
    cm = parse_cm_string(cm)

    f1_scores[name] = df["f1"][:1].values[0]

    print(type(cm))

    # Create a figure and axis
    plt.figure(figsize=(10, 8))

    # Plot the confusion matrix using seaborn
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")

    # Set labels and title
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title("Confusion Matrix")

    # Save the plot
    plt.savefig(f"plots/confusion_matrix_{name}.png")
    # plt.show()


def compare_f1_scores(model_performances):
    # Create lists of models and their corresponding performances
    models = list(model_performances.keys())
    performances = list(model_performances.values())

    # Create the bar plot
    plt.figure(figsize=(12, 6))
    bars = plt.bar(models, performances)

    # Customize the plot
    plt.title("Model Performance Comparison", fontsize=16)
    plt.xlabel("Models", fontsize=12)
    plt.ylabel("Performance Score", fontsize=12)
    plt.ylim(0.0, 1.0)  # Adjust y-axis to focus on the range of scores

    # Add value labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{height:.4f}",
            ha="center",
            va="bottom",
        )

    # Rotate x-axis labels for better readability
    plt.xticks(rotation=45, ha="right")

    # Adjust layout to prevent cutting off labels
    plt.tight_layout()

    # Save the plot
    plt.savefig("plots/model_comparison.png")

    # Display the plot (optional)
    # plt.show()


names = [
    "Random Forest",
    "Neural Network",
    "XGBoost",
    "Naïve Bayes",
    "Logistic Regression",
]

for file, name in zip(
    [
        "data_random_forest",
        "data_neural_network",
        "data_xgboost_classifier",
        "BayesCat",
        "data_logistic_regression",
    ],
    names,
):
    plot_best_f1(f"C:\Code\Stroke\{file}.csv", name)


compare_f1_scores(f1_scores)