HPC_internship/inspect_data.py at main · Moggwp/HPC_internship · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

def load_data(path: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame.

    Args:
        path (str): Location of the Parquet file; handed unchanged to
            ``pd.read_parquet``.

    Returns:
        pd.DataFrame: The table stored in the file at ``path``.
    """
    frame = pd.read_parquet(path)
    return frame

# Input table and output directory used when the module is run as a script
DATA_PATH = "job_table.parquet"
PLOTS_DIR = "plots"

# Single label given to out-of-memory and node-failure jobs in the state plots
MERGED_FAILURE_STATE = "OOM+NODE FAIL"

# Jobs whose power traces are drawn as examples
SAMPLE_JOB_IDS = [3848449, 5165227, 2448430, 2652511, 8296, 5029954, 838942]

# Spacing used to place consecutive power samples on the "Seconds" axis
POWER_SAMPLE_STEP = 20


def merge_failure_states(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the rare failure states merged.

    ``OUT_OF_MEMORY`` and ``NODE_FAIL`` in the ``job_state`` column are both
    renamed to ``OOM+NODE FAIL``. Only ``job_state`` is rewritten; every other
    column is carried over untouched.

    Args:
        df (pd.DataFrame): Job table with a ``job_state`` column.

    Returns:
        pd.DataFrame: New frame with the relabelled ``job_state`` column.
    """
    relabelled = df["job_state"].replace(
        {
            "OUT_OF_MEMORY": MERGED_FAILURE_STATE,
            "NODE_FAIL": MERGED_FAILURE_STATE,
        }
    )
    return df.assign(job_state=relabelled)


def month_tick_labels(days, positions, year: str = "2020") -> list:
    """Build tick labels that name each month only once.

    Args:
        days: Indexable sequence of ``"MM-DD"`` strings, in the same order as
            the categories on the axis.
        positions: Tick positions, used as indices into ``days``.
        year (str): Text appended after the ``"MM-"`` prefix of each label.

    Returns:
        list: One label per position: ``"MM-<year>"`` for the first tick of
        a month, and an empty string for later ticks of that month or for a
        position that does not index ``days``.
    """
    labels = []
    seen = set()
    for position in positions:
        index = int(position)
        if not 0 <= index < len(days):
            labels.append("")
            continue
        label = str(days[index])[:3] + year
        if label in seen:
            labels.append("")
        else:
            seen.add(label)
            labels.append(label)
    return labels


def flatten_power_samples(df: pd.DataFrame) -> pd.DataFrame:
    """Expand per-job power traces into one row per power sample.

    Args:
        df (pd.DataFrame): Job table with ``power_consumption`` (a sequence
            of values per job), ``use_gpu`` and ``num_nodes_alloc`` columns.

    Returns:
        pd.DataFrame: Columns ``power``, ``use_gpu`` and ``nodes_allocated``;
        the job-level values are repeated for every sample of that job.
    """
    columns = {"power": [], "use_gpu": [], "nodes_allocated": []}
    job_rows = zip(df["power_consumption"], df["use_gpu"], df["num_nodes_alloc"])
    for trace, use_gpu, nodes in job_rows:
        n_samples = len(trace)
        columns["power"].extend(trace)
        columns["use_gpu"].extend([use_gpu] * n_samples)
        columns["nodes_allocated"].extend([nodes] * n_samples)
    return pd.DataFrame(columns)


def save_figure(filename: str) -> None:
    """Save the current figure under ``PLOTS_DIR`` and clear it for reuse."""
    plt.savefig(os.path.join(PLOTS_DIR, filename))
    plt.clf()


def plot_log_histogram(df: pd.DataFrame, column: str, xlabel: str, filename: str) -> None:
    """Save a histogram of ``column`` with both axes on a log scale.

    Args:
        df (pd.DataFrame): Job table.
        column (str): Column to histogram.
        xlabel (str): Label of the x axis.
        filename (str): File name of the figure inside ``PLOTS_DIR``.
    """
    sns.histplot(df, x=column)
    plt.xlabel(xlabel)
    plt.ylabel("Number of jobs")
    plt.yscale("log")
    plt.xscale("log")
    plt.tight_layout()
    save_figure(filename)


def main() -> None:
    """Load the job table and write all inspection plots to ``PLOTS_DIR``."""
    sns.set_theme(style="whitegrid")
    palette = sns.color_palette("colorblind")

    # savefig does not create missing directories, so make sure it exists
    os.makedirs(PLOTS_DIR, exist_ok=True)

    # Load the dataframe and merge the two rare failure states
    df = merge_failure_states(load_data(DATA_PATH))

    # Exit state pie plot
    state_counts = df.job_state.value_counts()
    plt.pie(
        state_counts.values,
        labels=state_counts.index.values,
        colors=palette,
        explode=[0.03] * len(state_counts),
        autopct="%1.0f%%",
    )
    save_figure("state_pie.png")

    # Plot the duration of the jobs divided by exit state.
    # run_time is divided by 60, truncated, then rounded to the hundreds so
    # that jobs fall into coarse duration bins.
    df["run_time"] = df.run_time.apply(lambda rt: round(int(rt / 60), -2)).values
    sns.histplot(
        df,
        x="run_time",
        hue="job_state",
        multiple="dodge",
        kde=False,
        log_scale=False,
        palette=palette,
        hue_order=["COMPLETED", "FAILED", "CANCELLED", "TIMEOUT", MERGED_FAILURE_STATE],
    )
    plt.xlabel("Duration (in minutes)")
    plt.ylabel("Number of jobs")
    plt.yscale("log")
    plt.tight_layout()
    save_figure("run_time_state.png")

    # Distributions of the resources allocated to the jobs
    plot_log_histogram(
        df, "num_gpus_alloc", "Number of GPUs allocated to the job", "gpus_alloc.png"
    )
    plot_log_histogram(
        df, "num_cores_alloc", "Number of cores allocated to the job", "cores_alloc.png"
    )
    plot_log_histogram(
        df, "num_nodes_alloc", "Number of nodes allocated to the job", "nodes_alloc.png"
    )
    plot_log_histogram(
        df,
        "mem_alloc",
        "Amount of memory allocated to the job (in gigabytes)",
        "mem_alloc.png",
    )

    # Plot the distribution of jobs throughout the days.
    # Characters 5-9 of the submit time are kept as the day key
    # (assumes an ISO-like "YYYY-MM-DD ..." rendering -- TODO confirm).
    df["day"] = df.submit_time.apply(lambda t: str(t)[5:10])
    days = sorted(df["day"].unique())

    # String categories are laid out in order of first appearance, so the
    # column is sorted before plotting: tick position i then matches days[i].
    sns.histplot(df[["day"]].sort_values("day"), x="day", kde=True)
    plt.xlabel("Day")
    plt.ylabel("Number of jobs")

    # Label only the first tick of each month to improve readability
    positions = plt.gca().get_xticks()
    plt.xticks(ticks=positions, labels=month_tick_labels(days, positions))
    save_figure("day_dist.png")

    # Plot several jobs power consumption, ordered by allocated nodes
    sample = df[df.job_id.isin(SAMPLE_JOB_IDS)].sort_values("num_nodes_alloc")
    for rank, trace in enumerate(sample["power_consumption"]):
        seconds = [POWER_SAMPLE_STEP * step for step in range(len(trace))]
        # The three jobs with the fewest nodes are drawn dashed
        plt.plot(seconds, trace, "--" if rank < 3 else "-")

    plt.xlabel("Seconds")
    plt.legend([f"Job {rank + 1}" for rank in range(len(sample))])
    plt.ylabel("Power consumption (W)")
    plt.tight_layout()
    save_figure("power_samples.png")

    # Plot the power consumption values of the jobs with and without the GPU
    df["use_gpu"] = df.num_gpus_alloc > 0
    power_df = flatten_power_samples(df)

    single_node = power_df[power_df["nodes_allocated"] == 1]
    sns.boxplot(single_node, x="use_gpu", y="power")
    plt.xlabel("Jobs using")
    plt.xticks([0, 1], ["Cores", "Cores + GPUs"])
    plt.ylabel("Power consumption per job (W)")
    save_figure("power_consumption_cpu_gpu_box.png")

    sns.histplot(power_df, x="power", hue="use_gpu")
    plt.xlabel("Power consumption (W)")
    # NOTE(review): the label order depends on the order in which the hue
    # levels are drawn -- confirm against the rendered figure.
    plt.legend(["Cores+GPUs", "Cores"])
    plt.ylabel("Number of values")
    plt.yscale("log")
    plt.xscale("log")
    plt.xticks(plt.gca().get_xticks())
    plt.tight_layout()
    save_figure("power_consumption_cpu_gpu_hist.png")


if __name__ == "__main__":
    main()