XAI-HDiff/utils.py at main · SteadfastAsArt/XAI-HDiff · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# @Author  : Andrian Lee
# @Time    : 2022/3/11 12:44
# @File    : utils.py
import logging

import os

import pandas as pd
from pandas.core.base import PandasObject

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

import xgboost as xgb

# Fixed random seed value for the project.
SEED = 42

# File extensions that read_file() and out() dispatch on:
# index 0 is the Excel suffix, index 1 is the CSV suffix.
SUPPORT_TABLE_SUFFIX = ['.xlsx', '.csv']

# Column renames applied by read_file() after loading a table.
# NOTE(review): the "$_{2}$" form looks like matplotlib mathtext so that plot
# labels render the 2 as a subscript -- confirm against the plotting code.
RENAME_MAP = {'H2O': "H$_{2}$O"}


def read_file(file_path: str, sheet_name=0):
    """Load a table from disk and apply ``RENAME_MAP`` to its column names.

    The reader is chosen from the file extension, which is compared as a
    whole suffix and case-insensitively:

    * ``SUPPORT_TABLE_SUFFIX[0]`` (``.xlsx``) or ``.xls`` -> ``pd.read_excel``
    * ``SUPPORT_TABLE_SUFFIX[1]`` (``.csv``) -> ``pd.read_csv``

    Args:
        file_path: Path of the file to read.
        sheet_name: Forwarded unchanged to ``pd.read_excel``; ignored for CSV
            files.

    Returns:
        The loaded ``DataFrame`` with columns renamed through ``RENAME_MAP``.
        When ``pd.read_excel`` returns a dict of sheets (e.g. for
        ``sheet_name=None``), a dict with the same keys whose values are the
        renamed DataFrames.

    Raises:
        TypeError: If the file extension is not one of the supported ones.
    """
    suffix = os.path.splitext(file_path)[-1].lower()

    # Compare the full suffix. A substring test against '.xlsx' would also
    # accept '', '.', '.x' and '.xl' (and '.c' / '.cs' for CSV). '.xls' is
    # kept explicitly because the previous substring test let it through.
    if suffix in (SUPPORT_TABLE_SUFFIX[0], '.xls'):
        loaded = pd.read_excel(file_path, sheet_name=sheet_name)
    elif suffix == SUPPORT_TABLE_SUFFIX[1]:
        loaded = pd.read_csv(file_path)
    else:
        raise TypeError('Other formats not supported yet.')

    # read_excel yields a dict of DataFrames when several sheets are
    # requested; rename the columns of each sheet individually.
    if isinstance(loaded, dict):
        return {
            name: sheet.rename(columns=RENAME_MAP)
            for name, sheet in loaded.items()
        }
    return loaded.rename(columns=RENAME_MAP)


def out(df: pd.DataFrame, out_path):
    """Write ``df`` to ``out_path`` without the index.

    The writer is chosen from the file extension, which is compared as a
    whole suffix and case-insensitively:

    * ``SUPPORT_TABLE_SUFFIX[0]`` (``.xlsx``) or ``.xls`` -> ``df.to_excel``
    * ``SUPPORT_TABLE_SUFFIX[1]`` (``.csv``) -> ``df.to_csv``

    Args:
        df: Object to write; it must provide ``to_excel`` and ``to_csv``.
        out_path: Destination file path.

    Raises:
        TypeError: If the extension of ``out_path`` is not supported. This
            replaces the earlier behaviour of printing the "Output to ..."
            message and then writing nothing.
    """
    suffix = os.path.splitext(out_path)[-1].lower()

    # Compare the full suffix. A substring test against '.xlsx' would also
    # accept '', '.', '.x' and '.xl' (and '.c' / '.cs' for CSV). '.xls' is
    # kept explicitly because the previous substring test let it through.
    if suffix in (SUPPORT_TABLE_SUFFIX[0], '.xls'):
        write = df.to_excel
    elif suffix == SUPPORT_TABLE_SUFFIX[1]:
        write = df.to_csv
    else:
        raise TypeError('Other formats not supported yet.')

    print('Output to {}...'.format(out_path))
    write(out_path, index=False)


# Attach out() as an attribute of pandas' PandasObject base class so that it
# can be called as a method, e.g. ``obj.out(path)``, on instances of its
# subclasses. This is a module-level side effect that runs at import time.
PandasObject.out = out


# def sampling(data_all, label_all):
#     # define pipeline
#     over = SMOTE(random_state=SEED, sampling_strategy=0.5)  # sampling_strategy=.2 | minor:major
#     under = RandomUnderSampler(random_state=SEED, sampling_strategy=0.8)  # sampling_strategy=.5 | minor:major
#     # steps = [('o', over), ('u', under)]  # ('o', over),
#     # pipeline = Pipeline(steps=steps)
#     # # transform the dataset
#     # data_all, label_all = pipeline.fit_resample(data_all, label_all)
#     data_all, label_all = over.fit_resample(data_all, label_all)
#     data_all, label_all = under.fit_resample(data_all, label_all)
#     print(Counter(label_all))

#     return data_all, label_all


def classification_metrics(Y_test, Y_test_pred):
    """Print and log F1, ROC-AUC and accuracy for a set of predictions.

    All three metrics are computed from the same ``(Y_test, Y_test_pred)``
    pair. The formatted line is written to stdout and also emitted through
    ``logging.info``. Nothing is returned.

    Args:
        Y_test: Ground-truth labels.
        Y_test_pred: Predicted values compared against ``Y_test``.
    """
    # NOTE(review): roc_auc_score receives the same predictions as the
    # label-based metrics; confirm whether scores/probabilities were intended.
    scores = (
        f1_score(Y_test, Y_test_pred),
        roc_auc_score(Y_test, Y_test_pred),
        accuracy_score(Y_test, Y_test_pred),
    )
    report = 'f1:{0:.4f} ROC_AUC:{1:.4f} ACC:{2:.4f}'.format(*scores)
    print(report)
    logging.info(report)


def plot_feat_imp(model):
    """Draw xgboost's importance plot for ``model`` and display it.

    Args:
        model: Object passed straight to ``xgb.plot_importance``.
    """
    xgb.plot_importance(model)
    plt.show()


def plot_kde(x, show=False):
    """Draw a seaborn KDE plot of ``x`` with hue taken from 'label'.

    Args:
        x: Data handed to ``sns.kdeplot`` as its ``data`` argument.
        show: When truthy, call ``plt.show()`` after drawing; otherwise the
            figure is left for the caller to display.
    """
    sns.kdeplot(data=x, hue='label')
    if not show:
        return
    plt.show()


def judge_path(_path):
    """Make sure the directory ``_path`` exists, creating parents as needed.

    Nothing happens when ``_path`` already exists (whether as a directory or
    as any other kind of file) or when it is the empty string.

    Args:
        _path: Directory path to create if it is missing.
    """
    # An empty string is what os.path.dirname() returns for a bare file name;
    # it denotes the current directory, and os.makedirs('') would raise
    # FileNotFoundError, so there is nothing to create.
    if _path == '':
        return

    if not os.path.exists(_path):
        # exist_ok=True tolerates the directory being created by another
        # process or thread between the existence check and this call.
        os.makedirs(_path, exist_ok=True)