# Source: network_security_project/isolation_forestv2_withPCA.py (main branch) - AyaanJahanzebAhmed/network_security_project on GitHub
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score,classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score,classification_report
from tabulate import tabulate
import warnings
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
# Silence every warning for the rest of the run.
warnings.filterwarnings("ignore")

# Load the training CSV from the notebook's working directory.
data = pd.read_csv("/content/KDDTrain.csv")
data.head()

# Binarise the label column: "normal" becomes 0, every other value becomes 1.
data["class"] = data["class"].ne("normal").astype("int64")

# Hyper-parameter candidates explored by the grid search over IsolationForest.
param_grid = dict(
    n_estimators=[10, 50],
    max_samples=[0.5, 0.7, 0.9],
    contamination=[0.1, 0.2, 0.3],
)

# Feature selection: two byte counters, the protocol column and the label.
dataa = data[["src_bytes", "dst_bytes", "protocol", "class"]]

X = dataa.drop(columns="class")
y = dataa["class"]

# One-hot encode every object-typed feature column.
cols_to_ohe = X.select_dtypes("object").columns
ohe = OneHotEncoder(sparse_output=False)
num_cols = ohe.fit_transform(X[cols_to_ohe])
num_cols_names = ohe.get_feature_names_out(cols_to_ohe)
# Reuse X's index so the concat below stays row-aligned even when `data`
# does not carry a default RangeIndex (otherwise NaN rows would appear).
ohe_df = pd.DataFrame(num_cols, columns=num_cols_names, index=X.index)
X_ohe = pd.concat([X.drop(columns=cols_to_ohe), ohe_df], axis=1)

# Split BEFORE fitting the scaler and PCA so that no statistic of the
# held-out rows leaks into the transformation applied to the training rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_ohe, y, test_size=0.2, random_state=42
)

# Standardise first: PCA is variance-driven, so unscaled byte counters would
# otherwise dominate both components and drown out the one-hot columns.
scaler = StandardScaler()
pca = PCA(n_components=2)
X_train = pca.fit_transform(scaler.fit_transform(X_train_raw))
X_test = pca.transform(scaler.transform(X_test_raw))
def _attack_precision(estimator, X, y_true):
    """Return the precision of a fitted IsolationForest for the attack class.

    ``IsolationForest.predict`` yields -1 for outliers and +1 for inliers,
    whereas the labels in this script are 0 (normal) / 1 (attack).  Outliers
    are therefore mapped to 1 before the comparison.
    """
    predicted_attack = (estimator.predict(X) == -1).astype(int)
    return precision_score(y_true, predicted_attack, zero_division=0)


# Hyper-parameter search.  The labels are passed to ``fit`` only so that the
# scorer can evaluate each candidate; IsolationForest itself ignores them.
# Without labels a precision-based score cannot be computed at all, and the
# "best" parameters would be arbitrary.
isolation_forest = IsolationForest(random_state=42)
grid_search = GridSearchCV(
    estimator=isolation_forest,
    param_grid=param_grid,
    cv=5,
    scoring=_attack_precision,
)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("best parameters:")
print(best_params)

# Fitting isolation forest with the best parameters (fixed seed so that the
# reported metrics are reproducible between runs).
isolation_forest = IsolationForest(random_state=42, **best_params)
isolation_forest.fit(X_train)
# decision_function returns LOWER values for more abnormal samples, so the
# attack class (label 1) corresponds to scores BELOW the threshold.
scores_prediction = isolation_forest.decision_function(X_test)

# Adjusting threshold: use the mean score of the evaluated samples as cut-off.
# NOTE(review): the cut-off is derived from the test scores themselves;
# consider deriving it from the training scores instead.
threshold = np.mean(scores_prediction)
predictions = np.where(scores_prediction < threshold, 1, 0)

accuracy = accuracy_score(y_test, predictions)
conf_matrix = confusion_matrix(y_test, predictions)
class_report = classification_report(y_test, predictions)

print("Accuracy:", accuracy)
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
# Render the confusion matrix as a heat map with a colour scale.
fig, ax = plt.subplots(figsize=(8, 6))
heatmap = ax.imshow(conf_matrix, cmap='Blues', interpolation='nearest')
ax.set_title('Confusion Matrix')
fig.colorbar(heatmap, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

# ROC analysis needs a continuous ranking score; feeding it hard 0/1
# predictions collapses the curve to a single operating point.  The decision
# function is negated because lower values mean "more abnormal", while
# roc_curve expects higher scores for the positive (attack, label 1) class.
attack_scores = -np.asarray(scores_prediction)
fpr, tpr, _ = roc_curve(y_test, attack_scores)
roc_auc = roc_auc_score(y_test, attack_scores)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='orange', label=f'ROC curve (area = {roc_auc:0.2f})')
# Diagonal reference line: the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()