python-pol-ml-cybersecurity/feature.py at main · keane-fernandes/python-pol-ml-cybersecurity · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import pol_utilities as pu
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np


def feature(input_folder_key, output_folder_key):
    cwd = os.getcwd()
    input_folder_path = os.path.join(cwd, pu.root.get(input_folder_key))
    output_folder_path = os.path.join(cwd, pu.root.get(output_folder_key))

    files_to_process = [
        f
        for f in os.listdir(input_folder_path)
        if os.path.isfile(os.path.join(input_folder_path, f))
        if f.endswith(".csv")
    ]

    processed = [
        d
        for d in os.listdir(output_folder_path)
        if os.path.isfile(os.path.join(output_folder_path, d))
        if d.endswith(".csv")
    ]

    for the_file in files_to_process:
        if not pu.check_if_preprocessed(the_file, processed):
            print("Currently processing: {}".format(the_file))
            file_path = os.path.join(input_folder_path, the_file)
            df_master = pd.read_csv(file_path)

            master_dict = {}
            counter = 0

            # 23 is a prime number and should prevent any cyclic patterns
            number_of_chunks = round(df_master.shape[0] / pu.chunk_size)

            for chunk in np.array_split(df_master, number_of_chunks):

                # Subtract the last timestamp from the first timestamp
                start_time = chunk.iloc[0]["Timestamp"]
                end_time = chunk.iloc[-1]["Timestamp"]

                time_window = end_time - start_time
                features_dict = {}

                # Timeframe
                features_dict["TimeWindow"] = time_window

                # Average timestamp for chunk
                features_dict["Average_Timestamp"] = (
                    chunk.iloc[-1]["Timestamp"] + chunk.iloc[0]["Timestamp"]
                ) / 2

                # Separate master dataframe into smaller, signal specific dataframes
                df_speed = chunk[chunk["StatusType"] == "SPEED"]
                df_throttle = chunk[chunk["StatusType"] == "THROTTLE"]
                df_brake = chunk[chunk["StatusType"] == "BRAKE"]
                df_cruise = chunk[chunk["StatusType"] == "CRUISE"]
                df_rrcp = chunk[chunk["StatusType"] == "RRCP"]
                df_dhcp = chunk[chunk["StatusType"] == "DHCP"]
                df_mdns = chunk[chunk["StatusType"] == "MDNS"]
                df_ssdp = chunk[chunk["StatusType"] == "SSDP"]
                df_arp = chunk[chunk["StatusType"] == "ARP"]
                df_nbns = chunk[chunk["StatusType"] == "NBNS"]
                df_llmnr = chunk[chunk["StatusType"] == "LLMNR"]
                df_malformed = chunk[chunk["StatusType"] == "MALFORMED"]
                df_malicious = chunk[chunk["StatusType"] == "MALICIOUS"]

                # Throughputs
                features_dict["TP_Overall"] = chunk["PacketLength"].sum() / time_window
                features_dict["TP_Speed"] = df_speed["PacketLength"].sum() / time_window
                features_dict["TP_Throttle"] = (
                    df_throttle["PacketLength"].sum() / time_window
                )
                features_dict["TP_Brake"] = df_brake["PacketLength"].sum() / time_window
                features_dict["TP_Cruise"] = (
                    df_cruise["PacketLength"].sum() / time_window
                )
                features_dict["TP_RRCP"] = df_rrcp["PacketLength"].sum() / time_window
                features_dict["TP_DHCP"] = df_dhcp["PacketLength"].sum() / time_window
                features_dict["TP_MDNS"] = df_mdns["PacketLength"].sum() / time_window
                features_dict["TP_SSDP"] = df_ssdp["PacketLength"].sum() / time_window
                features_dict["TP_Malicious"] = (
                    df_malicious["PacketLength"].sum() / time_window
                )
                features_dict["TP_ARP"] = df_arp["PacketLength"].sum() / time_window
                features_dict["TP_NBNS"] = df_nbns["PacketLength"].sum() / time_window
                features_dict["TP_LLMNR"] = df_llmnr["PacketLength"].sum() / time_window
                features_dict["TP_Malformed"] = (
                    df_malformed["PacketLength"].sum() / time_window
                )

                # Average payloads of vehicle speed, throttle, brake and cruise control

                features_dict["VehicleSpeed"] = df_speed["Payload"].mean()
                features_dict["ThrottleDemand"] = df_throttle["Payload"].mean()
                features_dict["BrakePressed"] = df_brake["Payload"].mean()
                features_dict["CruiseDemand"] = df_cruise["Payload"].mean()

                # Behaviour
                features_dict["Behaviour"] = pu.get_behaviour(
                    features_dict["VehicleSpeed"], features_dict["CruiseDemand"]
                )

                # Add the populated features_dict to the master_dict with the counter as the key
                master_dict[counter] = features_dict

                counter += 1

            # Create our Dataframe from the nested dictionary and output to csv

            df_features = pd.DataFrame.from_dict(master_dict, orient="index")
            new_file_path = os.path.join(output_folder_path, str(the_file))
            df_features.to_csv(new_file_path, index=False)
            print("Dataset with {} rows generated.".format(df_features.shape[0]))


def main(input_folder_key="collect", output_folder_key="feature"):
    print("Feature extraction layer started ...")
    start = time.time()
    feature(input_folder_key, output_folder_key)
    end = time.time()
    print("Feature layer execution time: " + str(end - start))


if __name__ == "__main__":
    main()