diff --git a/causallearn/score/LocalScoreFunction.py b/causallearn/score/LocalScoreFunction.py
index 488a2c45..32de5d36 100644
--- a/causallearn/score/LocalScoreFunction.py
+++ b/causallearn/score/LocalScoreFunction.py
@@ -194,11 +194,11 @@ def local_score_cv_general(
         score: local score
     """
-    Data = np.mat(Data)
+    Data = np.asarray(Data)
     PAi = list(PAi)

     T = Data.shape[0]
-    X = Data[:, [Xi]]
+    X = Data[:, Xi].reshape(-1, 1)
     var_lambda = parameters["lambda"]  # regularization parameter
     k = parameters["kfold"]  # k-fold cross validation
     n0 = math.floor(T / k)
@@ -206,24 +206,22 @@
     Thresh = 1e-5
     if len(PAi):
         PA = Data[:, PAi]

         # set the kernel for X
-        GX = np.sum(np.multiply(X, X), axis=1)
+        GX = np.multiply(X, X).reshape(-1, 1)
         Q = np.tile(GX, (1, T))
         R = np.tile(GX.T, (T, 1))
-        dists = Q + R - 2 * X * X.T
+        dists = Q + R - 2 * X @ X.T
         dists = dists - np.tril(dists)
         dists = np.reshape(dists, (T**2, 1))
-        width = np.sqrt(
-            0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0]
-        )  # median value
+        width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)]))
         width = width * 2
         theta = 1 / (width**2)
         Kx, _ = kernel(X, X, (theta, 1))  # Gaussian kernel
         H0 = (
-            np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / T
+            np.eye(T) - np.ones((T, T)) / T
         )  # for centering of the data in feature space
-        Kx = H0 * Kx * H0  # kernel matrix for X
+        Kx = H0 @ Kx @ H0  # kernel matrix for X
@@ -234,24 +232,25 @@
         # mx = len(IIx)

         # set the kernel for PA
-        Kpa = np.mat(np.ones((T, T)))
+        Kpa = np.ones((T, T))
         for m in range(PA.shape[1]):
-            G = np.sum((np.multiply(PA[:, m], PA[:, m])), axis=1)
+            G = np.multiply(PA[:, [m]], PA[:, [m]]).reshape(-1, 1)
             Q = np.tile(G, (1, T))
             R = np.tile(G.T, (T, 1))
-            dists = Q + R - 2 * PA[:, m] * PA[:, m].T
+            dists = Q + R - 2 * PA[:, [m]] @ PA[:, [m]].T
             dists = dists - np.tril(dists)
             dists = np.reshape(dists, (T**2, 1))
-            width = np.sqrt(
-                0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0]
-            )  # median value
+            width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)]))
            width = width * 2
             theta = 1 / (width**2)
-            Kpa = np.multiply(Kpa, kernel(PA[:, m], PA[:, m], (theta, 1))[0])
+            Kpa = np.multiply(
+                Kpa,
+                kernel(PA[:, m].reshape(-1, 1), PA[:, m].reshape(-1, 1), (theta, 1))[0],
+            )

         H0 = (
-            np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / T
+            np.eye(T) - np.ones((T, T)) / T
         )  # for centering of the data in feature space
-        Kpa = H0 * Kpa * H0  # kernel matrix for PA
+        Kpa = H0 @ Kpa @ H0  # kernel matrix for PA
@@ -317,40 +316,36 @@
                 raise ValueError("Not cover all logic path")

             n1 = T - nv
-            tmp1 = pdinv(Kpa_tr + n1 * var_lambda * np.mat(np.eye(n1)))
-            tmp2 = tmp1 * Kx_tr * tmp1
-            tmp3 = (
-                tmp1
-                * pdinv(np.mat(np.eye(n1)) + n1 * var_lambda**2 / gamma * tmp2)
-                * tmp1
-            )
+            tmp1 = pdinv(Kpa_tr + n1 * var_lambda * np.eye(n1))
+            tmp2 = tmp1 @ Kx_tr @ tmp1
+            tmp3 = tmp1 @ pdinv(np.eye(n1) + n1 * var_lambda**2 / gamma * tmp2) @ tmp1
             A = (
                 Kx_te
-                + Kpa_tr_te.T * tmp2 * Kpa_tr_te
-                - 2 * Kx_tr_te.T * tmp1 * Kpa_tr_te
-                - n1 * var_lambda**2 / gamma * Kx_tr_te.T * tmp3 * Kx_tr_te
+                + Kpa_tr_te.T @ tmp2 @ Kpa_tr_te
+                - 2 * Kx_tr_te.T @ tmp1 @ Kpa_tr_te
+                - n1 * var_lambda**2 / gamma * Kx_tr_te.T @ tmp3 @ Kx_tr_te
                 - n1
                 * var_lambda**2
                 / gamma
                 * Kpa_tr_te.T
-                * tmp1
-                * Kx_tr
-                * tmp3
-                * Kx_tr
-                * tmp1
-                * Kpa_tr_te
+                @ tmp1
+                @ Kx_tr
+                @ tmp3
+                @ Kx_tr
+                @ tmp1
+                @ Kpa_tr_te
                 + 2
                 * n1
                 * var_lambda**2
                 / gamma
                 * Kx_tr_te.T
-                * tmp3
-                * Kx_tr
-                * tmp1
-                * Kpa_tr_te
+                @ tmp3
+                @ Kx_tr
+                @ tmp1
+                @ Kpa_tr_te
             ) / gamma
-            B = n1 * var_lambda**2 / gamma * tmp2 + np.mat(np.eye(n1))
+            B
= n1 * var_lambda**2 / gamma * tmp2 + np.eye(n1) L = np.linalg.cholesky(B) C = np.sum(np.log(np.diag(L))) # CV = CV + (nv*nv*log(2*pi) + nv*C + nv*mx*log(gamma) + trace(A))/2; @@ -365,14 +360,12 @@ def local_score_cv_general( dists = Q + R - 2 * X * X.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - width = np.sqrt( - 0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0] - ) # median value + width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) width = width * 2 theta = 1 / (width**2) Kx, _ = kernel(X, X, (theta, 1)) # Gaussian kernel - H0 = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / ( + H0 = np.eye(T) - np.ones((T, T)) / ( T ) # for centering of the data in feature space Kx = H0 * Kx * H0 # kernel matrix for X @@ -423,10 +416,10 @@ def local_score_cv_general( - 1 / (gamma * n1) * Kx_tr_te.T - * pdinv(np.mat(np.eye(n1)) + 1 / (gamma * n1) * Kx_tr) + * pdinv(np.eye(n1) + 1 / (gamma * n1) * Kx_tr) * Kx_tr_te ) / gamma - B = 1 / (gamma * n1) * Kx_tr + np.mat(np.eye(n1)) + B = 1 / (gamma * n1) * Kx_tr + np.eye(n1) L = np.linalg.cholesky(B) C = np.sum(np.log(np.diag(L))) @@ -465,7 +458,7 @@ def local_score_cv_multi( """ T = Data.shape[0] - X = Data[:, parameters["dlabel"][Xi]] + X = Data[:, parameters["dlabel"][Xi]].reshape(-1, 1) var_lambda = parameters["lambda"] # regularization parameter k = parameters["kfold"] # k-fold cross validation n0 = math.floor(T / k) @@ -474,43 +467,39 @@ def local_score_cv_multi( if len(PAi): # set the kernel for X - GX = np.sum(np.multiply(X, X), axis=1) + GX = np.multiply(X, X).reshape(-1, 1) Q = np.tile(GX, (1, T)) R = np.tile(GX.T, (T, 1)) dists = Q + R - 2 * X * X.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - width = np.sqrt( - 0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0] - ) # median value + width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) width = width * 3 ### theta = 1 / (width**2 * X.shape[1]) # Kx, _ = kernel(X, X, (theta, 1)) # Gaussian kernel - H0 = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / ( + H0 = np.eye(T) - np.ones((T, T)) / ( T ) # for centering of the data in feature space Kx = H0 * Kx * H0 # kernel matrix for X # set the kernel for PA - Kpa = np.mat(np.ones((T, T))) + Kpa = np.ones((T, T), dtype=np.float64) for m in range(len(PAi)): - PA = Data[:, parameters["dlabel"][PAi[m]]] - G = np.sum((np.multiply(PA, PA)), axis=1) + PA = Data[:, parameters["dlabel"][PAi[m]]].reshape(-1, 1) + G = np.multiply(PA, PA).reshape(-1, 1) Q = np.tile(G, (1, T)) R = np.tile(G.T, (T, 1)) dists = Q + R - 2 * PA * PA.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - width = np.sqrt( - 0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0] - ) # median value + width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) width = width * 3 ### theta = 1 / (width**2 * PA.shape[1]) Kpa = np.multiply(Kpa, kernel(PA, PA, (theta, 1))[0]) - H0 = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / ( + H0 = np.eye(T) - np.ones((T, T)) / ( T ) # for centering of the data in feature space Kpa = H0 * Kpa * H0 # kernel matrix for PA @@ -577,40 +566,36 @@ def local_score_cv_multi( raise ValueError("Not cover all logic path") n1 = T - nv - tmp1 = pdinv(Kpa_tr + n1 * var_lambda * np.mat(np.eye(n1))) - tmp2 = tmp1 * Kx_tr * tmp1 - tmp3 = ( - tmp1 - * pdinv(np.mat(np.eye(n1)) + n1 * var_lambda**2 / gamma * tmp2) - * tmp1 - ) + tmp1 = pdinv(Kpa_tr + n1 * var_lambda * np.eye(n1)) + tmp2 = tmp1 @ Kx_tr @ tmp1 + tmp3 = tmp1 @ pdinv(np.eye(n1) + n1 * var_lambda**2 / gamma * tmp2) @ 
tmp1 A = ( Kx_te - + Kpa_tr_te.T * tmp2 * Kpa_tr_te - - 2 * Kx_tr_te.T * tmp1 * Kpa_tr_te - - n1 * var_lambda**2 / gamma * Kx_tr_te.T * tmp3 * Kx_tr_te + + Kpa_tr_te.T @ tmp2 @ Kpa_tr_te + - 2 * Kx_tr_te.T @ tmp1 @ Kpa_tr_te + - n1 * var_lambda**2 / gamma * Kx_tr_te.T @ tmp3 @ Kx_tr_te - n1 * var_lambda**2 / gamma * Kpa_tr_te.T - * tmp1 - * Kx_tr - * tmp3 - * Kx_tr - * tmp1 - * Kpa_tr_te + @ tmp1 + @ Kx_tr + @ tmp3 + @ Kx_tr + @ tmp1 + @ Kpa_tr_te + 2 * n1 * var_lambda**2 / gamma * Kx_tr_te.T - * tmp3 - * Kx_tr - * tmp1 - * Kpa_tr_te + @ tmp3 + @ Kx_tr + @ tmp1 + @ Kpa_tr_te ) / gamma - B = n1 * var_lambda**2 / gamma * tmp2 + np.mat(np.eye(n1)) + B = n1 * var_lambda**2 / gamma * tmp2 + np.eye(n1) L = np.linalg.cholesky(B) C = np.sum(np.log(np.diag(L))) # CV = CV + (nv*nv*log(2*pi) + nv*C + nv*mx*log(gamma) + trace(A))/2; @@ -625,14 +610,12 @@ def local_score_cv_multi( dists = Q + R - 2 * X * X.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - width = np.sqrt( - 0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0] - ) # median value + width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) width = width * 3 ### theta = 1 / (width**2 * X.shape[1]) # Kx, _ = kernel(X, X, (theta, 1)) # Gaussian kernel - H0 = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / ( + H0 = np.eye(T) - np.ones((T, T)) / ( T ) # for centering of the data in feature space Kx = H0 * Kx * H0 # kernel matrix for X @@ -679,10 +662,10 @@ def local_score_cv_multi( - 1 / (gamma * n1) * Kx_tr_te.T - * pdinv(np.mat(np.eye(n1)) + 1 / (gamma * n1) * Kx_tr) + * pdinv(np.eye(n1) + 1 / (gamma * n1) * Kx_tr) * Kx_tr_te ) / gamma - B = 1 / (gamma * n1) * Kx_tr + np.mat(np.eye(n1)) + B = 1 / (gamma * n1) * Kx_tr + np.eye(n1) L = np.linalg.cholesky(B) C = np.sum(np.log(np.diag(L))) @@ -715,20 +698,20 @@ def local_score_marginal_general( """ T = Data.shape[0] - X = Data[:, [Xi]] + X = Data[:, Xi].reshape(-1, 1) dX = X.shape[1] # set the kernel for X - GX = np.sum(np.multiply(X, X), axis=1) + GX = np.sum(np.multiply(X, X), axis=1).reshape(-1, 1) Q = np.tile(GX, (1, T)) R = np.tile(GX.T, (T, 1)) dists = Q + R - 2 * X * X.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0]) + width = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) width = width * 2.5 # kernel width theta = 1 / (width**2) - H = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / T + H = np.eye(T) - np.ones((T, T)) / T Kx, _ = kernel(X, X, (theta, 1)) Kx = H * Kx * H @@ -743,21 +726,19 @@ def local_score_marginal_general( if len(PAi): PA = Data[:, PAi] - widthPA = np.mat(np.empty((PA.shape[1], 1))) + widthPA = np.empty((PA.shape[1], 1)) # set the kernel for PA for m in range(PA.shape[1]): - G = np.sum((np.multiply(PA[:, m], PA[:, m])), axis=1) + G = np.multiply(PA[:, [m]], PA[:, [m]]).reshape(-1, 1) Q = np.tile(G, (1, T)) R = np.tile(G.T, (T, 1)) - dists = Q + R - 2 * PA[:, m] * PA[:, m].T + dists = Q + R - 2 * PA[:, [m]] * PA[:, [m]].T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - widthPA[m] = np.sqrt( - 0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0] - ) + widthPA[m] = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) widthPA = widthPA * 2.5 # kernel width - covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]]) + covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]], dtype=object) logtheta0 = np.vstack([np.log(widthPA), 0, np.log(np.sqrt(0.1))]) logtheta, fvals, iter = minimize( logtheta0, @@ -765,34 +746,34 @@ 
def local_score_marginal_general( -300, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), ) nlml, dnlml = gpr_multi_new( logtheta, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), nargout=2, ) else: - covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]]) - PA = np.mat(np.zeros((T, 1))) - logtheta0 = np.mat([100, 0, np.log(np.sqrt(0.1))]).T + covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]], dtype=object) + PA = np.zeros((T, 1)) + logtheta0 = np.array([[100], [0], [np.log(np.sqrt(0.1))]]) logtheta, fvals, iter = minimize( logtheta0, "gpr_multi_new", -300, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), ) nlml, dnlml = gpr_multi_new( logtheta, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), nargout=2, ) score = nlml # negative log-likelihood @@ -821,20 +802,20 @@ def local_score_marginal_multi( score: local score """ T = Data.shape[0] - X = Data[:, parameters["dlabel"][Xi]] + X = Data[:, parameters["dlabel"][Xi]].reshape(-1, 1) dX = X.shape[1] # set the kernel for X - GX = np.sum(np.multiply(X, X), axis=1) + GX = np.sum(np.multiply(X, X), axis=1).reshape(-1, 1) Q = np.tile(GX, (1, T)) R = np.tile(GX.T, (T, 1)) dists = Q + R - 2 * X * X.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - widthX = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0]) + widthX = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) widthX = widthX * 2.5 # kernel width theta = 1 / (widthX**2) - H = np.mat(np.eye(T)) - np.mat(np.ones((T, T))) / T + H = np.eye(T) - np.ones((T, T)) / T Kx, _ = kernel(X, X, (theta, 1)) Kx = H * Kx * H @@ -847,28 +828,27 @@ def local_score_marginal_multi( eix = eix[:, IIx] if len(PAi): - widthPA_all = np.mat(np.empty((1, 0))) + widthPA_all = np.empty((1, 0)) # set the kernel for PA - PA_all = np.mat(np.empty((Data.shape[0], 0))) + PA_all = np.empty((Data.shape[0], 0)) for m in range(len(PAi)): - PA = Data[:, parameters["dlabel"][PAi[m]]] + PA = Data[:, parameters["dlabel"][PAi[m]]].reshape(-1, 1) PA_all = np.hstack([PA_all, PA]) - G = np.sum((np.multiply(PA, PA)), axis=1) + G = np.sum((np.multiply(PA, PA)), axis=1).reshape(-1, 1) Q = np.tile(G, (1, T)) R = np.tile(G.T, (T, 1)) dists = Q + R - 2 * PA * PA.T dists = dists - np.tril(dists) dists = np.reshape(dists, (T**2, 1)) - widthPA = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)], axis=1)[0, 0]) + widthPA = np.sqrt(0.5 * np.median(dists[np.where(dists > 0)])) widthPA_all = np.hstack( [ widthPA_all, - widthPA - * np.mat(np.ones((1, np.size(parameters["dlabel"][PAi[m]])))), + widthPA * np.ones((1, np.size(parameters["dlabel"][PAi[m]]))), ] ) widthPA_all = widthPA_all * 2.5 # kernel width - covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]]) + covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]], dtype=object) logtheta0 = np.vstack([np.log(widthPA_all.T), 0, np.log(np.sqrt(0.1))]) logtheta, fvals, iter = minimize( logtheta0, @@ -876,35 +856,35 @@ def local_score_marginal_multi( -300, covfunc, PA_all, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), ) nlml, 
dnlml = gpr_multi_new( logtheta, covfunc, PA_all, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), nargout=2, ) else: - covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]]) - PA = np.mat(np.zeros((T, 1))) - logtheta0 = np.mat([100, 0, np.log(np.sqrt(0.1))]).T + covfunc = np.asarray(["covSum", ["covSEard", "covNoise"]], dtype=object) + PA = np.zeros((T, 1)) + logtheta0 = np.array([[100], [0], [np.log(np.sqrt(0.1))]]) logtheta, fvals, iter = minimize( logtheta0, "gpr_multi_new", -300, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), ) nlml, dnlml = gpr_multi_new( logtheta, covfunc, PA, - 2 * np.sqrt(T) * eix * np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), + 2 * np.sqrt(T) * eix @ np.diag(np.sqrt(eig_Kx)) / np.sqrt(eig_Kx[0]), nargout=2, ) score = nlml # negative log-likelihood - return score + return score \ No newline at end of file diff --git a/causallearn/search/ScoreBased/GES.py b/causallearn/search/ScoreBased/GES.py index c9475755..018bbf93 100644 --- a/causallearn/search/ScoreBased/GES.py +++ b/causallearn/search/ScoreBased/GES.py @@ -8,8 +8,13 @@ from typing import Union -def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] = None, - parameters: Optional[Dict[str, Any]] = None, node_names: Union[List[str], None] = None,) -> Dict[str, Any]: +def ges( + X: ndarray, + score_func: str = "local_score_BIC", + maxP: Optional[float] = None, + parameters: Optional[Dict[str, Any]] = None, + node_names: Union[List[str], None] = None, +) -> Dict[str, Any]: """ Perform greedy equivalence search (GES) algorithm @@ -39,61 +44,88 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] = if X.shape[0] < X.shape[1]: warnings.warn("The number of features is much larger than the sample size!") - X = np.mat(X) - if score_func == 'local_score_CV_general': # % k-fold negative cross validated likelihood based on regression in RKHS + if ( + score_func == "local_score_CV_general" + ): # % k-fold negative cross validated likelihood based on regression in RKHS if parameters is None: - parameters = {'kfold': 10, # 10 fold cross validation - 'lambda': 0.01} # regularization parameter + parameters = { + "kfold": 10, # 10 fold cross validation + "lambda": 0.01, + } # regularization parameter if maxP is None: maxP = X.shape[1] / 2 # maximum number of parents N = X.shape[1] # number of variables - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_cv_general, parameters=parameters) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_cv_general, parameters=parameters + ) - elif score_func == 'local_score_marginal_general': # negative marginal likelihood based on regression in RKHS + elif ( + score_func == "local_score_marginal_general" + ): # negative marginal likelihood based on regression in RKHS parameters = {} if maxP is None: maxP = X.shape[1] / 2 # maximum number of parents N = X.shape[1] # number of variables - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_marginal_general, parameters=parameters) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_marginal_general, parameters=parameters + ) - elif score_func == 'local_score_CV_multi': # k-fold negative cross validated likelihood based on regression in RKHS + elif ( + score_func == "local_score_CV_multi" + ): # k-fold 
negative cross validated likelihood based on regression in RKHS # for data with multi-variate dimensions if parameters is None: - parameters = {'kfold': 10, 'lambda': 0.01, 'dlabel': {}} # regularization parameter + parameters = { + "kfold": 10, + "lambda": 0.01, + "dlabel": {}, + } # regularization parameter for i in range(X.shape[1]): - parameters['dlabel'][i] = i + parameters["dlabel"][i] = i if maxP is None: - maxP = len(parameters['dlabel']) / 2 - N = len(parameters['dlabel']) - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_cv_multi, parameters=parameters) + maxP = len(parameters["dlabel"]) / 2 + N = len(parameters["dlabel"]) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_cv_multi, parameters=parameters + ) - elif score_func == 'local_score_marginal_multi': # negative marginal likelihood based on regression in RKHS + elif ( + score_func == "local_score_marginal_multi" + ): # negative marginal likelihood based on regression in RKHS # for data with multi-variate dimensions if parameters is None: - parameters = {'dlabel': {}} + parameters = {"dlabel": {}} for i in range(X.shape[1]): - parameters['dlabel'][i] = i + parameters["dlabel"][i] = i if maxP is None: - maxP = len(parameters['dlabel']) / 2 - N = len(parameters['dlabel']) - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_marginal_multi, parameters=parameters) + maxP = len(parameters["dlabel"]) / 2 + N = len(parameters["dlabel"]) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_marginal_multi, parameters=parameters + ) - elif score_func == 'local_score_BIC' or score_func == 'local_score_BIC_from_cov': # Greedy equivalence search with BIC score + elif ( + score_func == "local_score_BIC" or score_func == "local_score_BIC_from_cov" + ): # Greedy equivalence search with BIC score if maxP is None: maxP = X.shape[1] / 2 N = X.shape[1] # number of variables parameters = {} parameters["lambda_value"] = 2 - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_BIC_from_cov, parameters=parameters) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_BIC_from_cov, parameters=parameters + ) - elif score_func == 'local_score_BDeu': # Greedy equivalence search with BDeu score + elif score_func == "local_score_BDeu": # Greedy equivalence search with BDeu score if maxP is None: maxP = X.shape[1] / 2 N = X.shape[1] # number of variables - localScoreClass = LocalScoreClass(data=X, local_score_fun=local_score_BDeu, parameters=None) + localScoreClass = LocalScoreClass( + data=X, local_score_fun=local_score_BDeu, parameters=None + ) else: - raise Exception('Unknown function!') + raise Exception("Unknown function!") score_func = localScoreClass if node_names is None: @@ -113,8 +145,9 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] = ## -------------------------------------------------------------------- ## forward greedy search - record_local_score = [[] for i in range( - N)] # record the local score calculated each time. Thus when we transition to the second phase, + record_local_score = [ + [] for i in range(N) + ] # record the local score calculated each time. 
Thus when we transition to the second phase,
     # many of the operators can be scored without an explicit call to the scoring function
     # record_local_score{trial}{j} records the local scores of Xj under different parent sets
     score_new = score
@@ -132,17 +165,27 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] =
         min_desc = []
         for i in range(N):
             for j in range(N):
-                if (G.graph[i, j] == Endpoint.NULL.value and G.graph[j, i] == Endpoint.NULL.value
-                        and i != j and len(np.where(G.graph[j, :] == Endpoint.ARROW.value)[0]) <= maxP):
+                if (
+                    G.graph[i, j] == Endpoint.NULL.value
+                    and G.graph[j, i] == Endpoint.NULL.value
+                    and i != j
+                    and len(np.where(G.graph[j, :] == Endpoint.ARROW.value)[0]) <= maxP
+                ):
                     # find a pair (Xi, Xj) that is not adjacent in the current graph, and restrict the number of parents
-                    Tj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0],
-                                        np.where(G.graph[j, :] == Endpoint.TAIL.value)[0])  # neighbors of Xj
+                    Tj = np.intersect1d(
+                        np.where(G.graph[:, j] == Endpoint.TAIL.value)[0],
+                        np.where(G.graph[j, :] == Endpoint.TAIL.value)[0],
+                    )  # neighbors of Xj

-                    Ti = np.union1d(np.where(G.graph[:, i] != Endpoint.NULL.value)[0],
-                                    np.where(G.graph[i, :] != Endpoint.NULL.value)[0])  # adjacent to Xi
+                    Ti = np.union1d(
+                        np.where(G.graph[:, i] != Endpoint.NULL.value)[0],
+                        np.where(G.graph[i, :] != Endpoint.NULL.value)[0],
+                    )  # adjacent to Xi
                     NTi = np.setdiff1d(np.arange(N), Ti)
-                    T0 = np.intersect1d(Tj, NTi)  # find the neighbours of Xj that are not adjacent to Xi
+                    T0 = np.intersect1d(
+                        Tj, NTi
+                    )  # find the neighbours of Xj that are not adjacent to Xi
                     # for any subset of T0
                     sub = Combinatorial(T0.tolist())  # find all the subsets for T0
                     S = np.zeros(len(sub))
@@ -151,34 +194,49 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] =
                     # 1: only check the first condition
                     # 2: check nothing and is not valid.
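The rescoring inside this loop leans on the decomposability of the score: an Insert(Xi, Xj, T) only changes Xj's parent set, so insert_changed_score reduces to two local-score calls. A condensed sketch of that rule (here `local_score(j, parents)` is a stand-in callable, not the library's exact interface):

    # Chickering-style delta for Insert(Xi, Xj, T): only Xj is rescored.
    # NA = neighbours of Xj that are adjacent to Xi; Paj = current parents of Xj.
    def insert_delta(local_score, i, j, NA, T, Paj):
        base = set(NA) | set(T) | set(Paj)
        return local_score(j, sorted(base | {i})) - local_score(j, sorted(base))

Because the local scores here are negative likelihoods, a negative delta (the chscore < min_chscore bookkeeping below) means the insertion improves the model.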
for k in range(len(sub)): - if (S[k] < 2): # S indicate whether we need to check subset(k) - V1 = insert_validity_test1(G, i, j, sub[k]) # Insert operator validation test:condition 1 - if (V1): - if (not S[k]): - V2 = insert_validity_test2(G, i, j, - sub[k]) # Insert operator validation test:condition 2 + if S[k] < 2: # S indicate whether we need to check subset(k) + V1 = insert_validity_test1( + G, i, j, sub[k] + ) # Insert operator validation test:condition 1 + if V1: + if not S[k]: + V2 = insert_validity_test2( + G, i, j, sub[k] + ) # Insert operator validation test:condition 2 else: V2 = 1 - if (V2): - Idx = find_subset_include(sub[k], sub) # find those subsets that include sub(k) + if V2: + Idx = find_subset_include( + sub[k], sub + ) # find those subsets that include sub(k) S[np.where(Idx == 1)] = 1 - chscore, desc, record_local_score = insert_changed_score(X, G, i, j, sub[k], - record_local_score, - score_func, - parameters) + chscore, desc, record_local_score = ( + insert_changed_score( + X, + G, + i, + j, + sub[k], + record_local_score, + score_func, + parameters, + ) + ) # calculate the changed score after Insert operator # desc{count} saves the corresponding (i,j,sub{k}) # sub{k}: - if (chscore < min_chscore): + if chscore < min_chscore: min_chscore = chscore min_desc = desc else: - Idx = find_subset_include(sub[k], sub) # find those subsets that include sub(k) + Idx = find_subset_include( + sub[k], sub + ) # find those subsets that include sub(k) S[np.where(Idx == 1)] = 2 - if (len(min_desc) != 0): + if len(min_desc) != 0: score_new = score + min_chscore - if (score - score_new <= 0): + if score - score_new <= 0: break G = insert(G, min_desc[0], min_desc[1], min_desc[2]) update1.append([min_desc[0], min_desc[1], min_desc[2]]) @@ -206,35 +264,54 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] = min_desc = [] for i in range(N): for j in range(N): - if ((G.graph[j, i] == Endpoint.TAIL.value and G.graph[i, j] == Endpoint.TAIL.value) - or G.graph[j, i] == Endpoint.ARROW.value): # if Xi - Xj or Xi -> Xj - Hj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], - np.where(G.graph[j, :] == Endpoint.TAIL.value)[0]) # neighbors of Xj - Hi = np.union1d(np.where(G.graph[i, :] != Endpoint.NULL.value)[0], - np.where(G.graph[:, i] != Endpoint.NULL.value)[0]) # adjacent to Xi - H0 = np.intersect1d(Hj, Hi) # find the neighbours of Xj that are adjacent to Xi + if ( + G.graph[j, i] == Endpoint.TAIL.value + and G.graph[i, j] == Endpoint.TAIL.value + ) or G.graph[ + j, i + ] == Endpoint.ARROW.value: # if Xi - Xj or Xi -> Xj + Hj = np.intersect1d( + np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], + np.where(G.graph[j, :] == Endpoint.TAIL.value)[0], + ) # neighbors of Xj + Hi = np.union1d( + np.where(G.graph[i, :] != Endpoint.NULL.value)[0], + np.where(G.graph[:, i] != Endpoint.NULL.value)[0], + ) # adjacent to Xi + H0 = np.intersect1d( + Hj, Hi + ) # find the neighbours of Xj that are adjacent to Xi # for any subset of H0 sub = Combinatorial(H0.tolist()) # find all the subsets for H0 S = np.ones(len(sub)) # S indicate whether we need to check sub{k}. 
# 1: check the condition, # 2: check nothing and is valid; for k in range(len(sub)): - if (S[k] == 1): - V = delete_validity_test(G, i, j, sub[k]) # Delete operator validation test - if (V): + if S[k] == 1: + V = delete_validity_test( + G, i, j, sub[k] + ) # Delete operator validation test + if V: # find those subsets that include sub(k) Idx = find_subset_include(sub[k], sub) S[np.where(Idx == 1)] = 2 # and set their S to 2 else: V = 1 - if (V): - chscore, desc, record_local_score = delete_changed_score(X, G, i, j, sub[k], - record_local_score, score_func, - parameters) + if V: + chscore, desc, record_local_score = delete_changed_score( + X, + G, + i, + j, + sub[k], + record_local_score, + score_func, + parameters, + ) # calculate the changed score after Insert operator # desc{count} saves the corresponding (i,j,sub{k}) - if (chscore < min_chscore): + if chscore < min_chscore: min_chscore = chscore min_desc = desc @@ -251,5 +328,12 @@ def ges(X: ndarray, score_func: str = 'local_score_BIC', maxP: Optional[float] = score_new = score break - Record = {'update1': update1, 'update2': update2, 'G_step1': G_step1, 'G_step2': G_step2, 'G': G, 'score': score} + Record = { + "update1": update1, + "update2": update2, + "G_step1": G_step1, + "G_step2": G_step2, + "G": G, + "score": score, + } return Record diff --git a/causallearn/utils/DAG2CPDAG.py b/causallearn/utils/DAG2CPDAG.py index 7a619a02..2604b1f7 100644 --- a/causallearn/utils/DAG2CPDAG.py +++ b/causallearn/utils/DAG2CPDAG.py @@ -25,10 +25,11 @@ def dag2cpdag(G: Dag) -> GeneralGraph: # order the edges in G nodes_order = list( - map(lambda x: G.node_map[x], G.get_causal_ordering())) # Perform a topological sort on the nodes of G + map(lambda x: G.node_map[x], G.get_causal_ordering()) + ) # Perform a topological sort on the nodes of G # nodes_order(1) is the node which has the highest order # nodes_order(N) is the node which has the lowest order - edges_order = np.mat([[], []], dtype=np.int64).T + edges_order = np.empty((0, 2), dtype=np.int64) # edges_order(1,:) is the edge which has the highest order # edges_order(M,:) is the edge which has the lowest order M = G.get_num_edges() # the number of edges in this DAG @@ -37,30 +38,44 @@ def dag2cpdag(G: Dag) -> GeneralGraph: while edges_order.shape[0] < M: for ny in range(N - 1, -1, -1): j = nodes_order[ny] - inci_all = np.where(G.graph[j, :] == 1)[0] # all the edges that incident to j + inci_all = np.where(G.graph[j, :] == 1)[ + 0 + ] # all the edges that incident to j if len(inci_all) != 0: if len(edges_order) != 0: - inci = edges_order[np.where(edges_order[:, 1] == j)[0], 0] # ordered edge that incident to j - if len(set(inci_all) - set(inci.T.tolist()[0])) != 0: + inci = edges_order[ + np.where(edges_order[:, 1] == j)[0], 0 + ] # ordered edge that incident to j + if len(set(inci_all) - set(inci.tolist())) != 0: break else: break for nx in range(N): i = nodes_order[nx] if len(edges_order) != 0: - if (len(np.intersect1d(np.where(edges_order[:, 1] == j)[0], - np.where(edges_order[:, 0] == i)[0])) == 0 and G.graph[j, i] == 1): + if ( + len( + np.intersect1d( + np.where(edges_order[:, 1] == j)[0], + np.where(edges_order[:, 0] == i)[0], + ) + ) + == 0 + and G.graph[j, i] == 1 + ): break else: if G.graph[j, i] == 1: break - edges_order = np.r_[edges_order, np.mat([i, j])] + edges_order = np.r_[edges_order, np.array([[i, j]])] ## ---------------------------------------------------------------- sign_edges = np.zeros(M) # 0 means unknown, 1 means compelled, -1 means reversible while 
len(np.where(sign_edges == 0)[0]) != 0:
         ss = 0
-        for m in range(M - 1, -1, -1):  # let x->y be the lowest ordered edge that is labeled "unknown"
+        for m in range(
+            M - 1, -1, -1
+        ):  # let x->y be the lowest ordered edge that is labeled "unknown"
             if sign_edges[m] == 0:
                 i = edges_order[m, 0]
                 j = edges_order[m, 1]
@@ -70,42 +85,82 @@ def dag2cpdag(G: Dag) -> GeneralGraph:
             for m in range(len(k)):
                 if sign_edges[idk[m]] == 1:
                     if G.graph[j, k[m]] != 1:  # if w is not a parent of y
-                        _id = np.where(edges_order[:, 1] == j)[0]  # label every edge that incident into y with "complled"
+                        _id = np.where(edges_order[:, 1] == j)[
+                            0
+                        ]  # label every edge incident into y with "compelled"
                         sign_edges[_id] = 1
                         ss = 1
                         break
                     else:
-                        _id = np.intersect1d(np.where(edges_order[:, 0] == k[m, 0])[0],
-                                             np.where(edges_order[:, 1] == j)[0])  # label w->y with "complled"
+                        _id = np.intersect1d(
+                            np.where(edges_order[:, 0] == k[m])[0],
+                            np.where(edges_order[:, 1] == j)[0],
+                        )  # label w->y with "compelled"
                         sign_edges[_id] = 1

             if ss:
                 continue
             z = np.where(G.graph[j, :] == 1)[0]
-            if (len(np.intersect1d(np.setdiff1d(z, i),
-                                   np.union1d(np.union1d(np.where(G.graph[i, :] == 0)[0], np.where(G.graph[i, :] == -1)[0]),
-                                              np.intersect1d(np.where(G.graph[i, :] == -1)[0],
-                                                             np.where(G.graph[:, i] == -1)[0])))) != 0):
-                _id = np.intersect1d(np.where(edges_order[:, 0] == i)[0], np.where(edges_order[:, 1] == j)[0])
+            if (
+                len(
+                    np.intersect1d(
+                        np.setdiff1d(z, i),
+                        np.union1d(
+                            np.union1d(
+                                np.where(G.graph[i, :] == 0)[0],
+                                np.where(G.graph[i, :] == -1)[0],
+                            ),
+                            np.intersect1d(
+                                np.where(G.graph[i, :] == -1)[0],
+                                np.where(G.graph[:, i] == -1)[0],
+                            ),
+                        ),
+                    )
+                )
+                != 0
+            ):
+                _id = np.intersect1d(
+                    np.where(edges_order[:, 0] == i)[0], np.where(edges_order[:, 1] == j)[0]
+                )
                 sign_edges[_id] = 1  # label x->y with "compelled"
                 id1 = np.where(edges_order[:, 1] == j)[0]
                 id2 = np.intersect1d(np.where(sign_edges == 0)[0], id1)
-                sign_edges[id2] = 1  # label all "unknown" edges incident into y with "complled"
+                sign_edges[id2] = (
+                    1  # label all "unknown" edges incident into y with "compelled"
+                )
             else:
-                _id = np.intersect1d(np.where(edges_order[:, 0] == i)[0], np.where(edges_order[:, 1] == j)[0])
+                _id = np.intersect1d(
+                    np.where(edges_order[:, 0] == i)[0], np.where(edges_order[:, 1] == j)[0]
+                )
                 sign_edges[_id] = -1  # label x->y with "reversible"
                 id1 = np.where(edges_order[:, 1] == j)[0]
                 id2 = np.intersect1d(np.where(sign_edges == 0)[0], id1)
-                sign_edges[id2] = -1  # label all "unknown" edges incident into y with "reversible"
+                sign_edges[id2] = (
+                    -1
+                )  # label all "unknown" edges incident into y with "reversible"

     # create CPDAG according to the labelled edges
     nodes = G.get_nodes()
     CPDAG = GeneralGraph(nodes)
     for m in range(M):
         if sign_edges[m] == 1:
-            CPDAG.add_edge(Edge(nodes[edges_order[m, 0]], nodes[edges_order[m, 1]], Endpoint.TAIL, Endpoint.ARROW))
+            CPDAG.add_edge(
+                Edge(
+                    nodes[edges_order[m, 0]],
+                    nodes[edges_order[m, 1]],
+                    Endpoint.TAIL,
+                    Endpoint.ARROW,
+                )
+            )
         else:
-            CPDAG.add_edge(Edge(nodes[edges_order[m, 0]], nodes[edges_order[m, 1]], Endpoint.TAIL, Endpoint.TAIL))
+            CPDAG.add_edge(
+                Edge(
+                    nodes[edges_order[m, 0]],
+                    nodes[edges_order[m, 1]],
+                    Endpoint.TAIL,
+                    Endpoint.TAIL,
+                )
+            )

     return CPDAG
diff --git a/causallearn/utils/GESUtils.py b/causallearn/utils/GESUtils.py
index 20735676..ee0e1138 100644
--- a/causallearn/utils/GESUtils.py
+++ b/causallearn/utils/GESUtils.py
@@ -2,8 +2,6 @@
 from copy import deepcopy
 from typing import List

-import numpy.matlib
-
 from causallearn.graph.Edge import
Edge from causallearn.graph.Endpoint import Endpoint from causallearn.score.LocalScoreFunction import * @@ -59,12 +57,18 @@ def insert_validity_test1(G, i, j, T) -> int: V = 0 # condition 1 - Tj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], - np.where(G.graph[j, :] == Endpoint.TAIL.value)[0]) # neighbors of Xj - Ti = np.union1d(np.where(G.graph[:, i] != Endpoint.NULL.value)[0], - np.where(G.graph[i, :] != Endpoint.NULL.value)[0]) # adjacent to Xi; + Tj = np.intersect1d( + np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], + np.where(G.graph[j, :] == Endpoint.TAIL.value)[0], + ) # neighbors of Xj + Ti = np.union1d( + np.where(G.graph[:, i] != Endpoint.NULL.value)[0], + np.where(G.graph[i, :] != Endpoint.NULL.value)[0], + ) # adjacent to Xi; NA = np.intersect1d(Tj, Ti) # find the neighbours of Xj and are adjacent to Xi - V = check_clique(G, list(np.union1d(NA, T).astype(int))) # check whether it is a clique + V = check_clique( + G, list(np.union1d(NA, T).astype(int)) + ) # check whether it is a clique return V @@ -81,7 +85,9 @@ def check_clique(G, subnode) -> int: # check whether node subnode is a clique i row, col = np.where(Gs == Endpoint.ARROW.value) Gs[row, col] = Endpoint.TAIL.value Gs[col, row] = Endpoint.TAIL.value - if np.all((np.eye(ns) - np.ones((ns, ns))) == Gs): # check whether it is a clique + if np.all( + (np.eye(ns) - np.ones((ns, ns))) == Gs + ): # check whether it is a clique s = 1 else: s = 0 @@ -92,10 +98,14 @@ def insert_validity_test2(G, i, j, T) -> int: # V=Insert_validity_test(G, X, Y, T,1); % do validity test for the operator Insert; V=1 means valid, V=0 mean invalid; # here G is CPDAG V = 0 - Tj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], - np.where(G.graph[j, :] == Endpoint.TAIL.value)[0]) # neighbors of Xj - Ti = np.union1d(np.where(G.graph[i, :] != Endpoint.NULL.value)[0], - np.where(G.graph[:, i] != Endpoint.NULL.value)[0]) # adjacent to Xi; + Tj = np.intersect1d( + np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], + np.where(G.graph[j, :] == Endpoint.TAIL.value)[0], + ) # neighbors of Xj + Ti = np.union1d( + np.where(G.graph[i, :] != Endpoint.NULL.value)[0], + np.where(G.graph[:, i] != Endpoint.NULL.value)[0], + ) # adjacent to Xi; NA = np.intersect1d(Tj, Ti) # find the neighbours of Xj and are adjacent to Xi # condition 2: every semi-directed path from Xj to Xi contains a node in union(NA,T) @@ -113,47 +123,55 @@ def insert_vc2_new(G, j, i, NAT): # validity test for condition 2 of Insert ope start = j target = i # stack(1)=start; % initialize the stack - stack = [{'value': start, 'pa': {}}] + stack = [{"value": start, "pa": {}}] sign = 1 # If every semi-pathway contains a node in NAT, than sign=1; count = 1 while len(stack): top = stack[0] stack = stack[1:] # pop - if top['value'] == target: # if find the target, search that pathway to see whether NAT is in that pathway + if ( + top["value"] == target + ): # if find the target, search that pathway to see whether NAT is in that pathway curr = top ss = 0 while True: - if len(curr['pa']): - if curr['pa']['value'] in NAT: # contains a node in NAT + if len(curr["pa"]): + if curr["pa"]["value"] in NAT: # contains a node in NAT ss = 1 break else: break - curr = curr['pa'] + curr = curr["pa"] if not ss: # do not include NAT sign = 0 break else: - child = np.concatenate((np.where(G.graph[:, top['value']] == Endpoint.ARROW.value)[0], - np.intersect1d(np.where(G.graph[top['value'], :] == Endpoint.TAIL.value)[0], - np.where(G.graph[:, top['value']] == 
Endpoint.TAIL.value)[0]))) + child = np.concatenate( + ( + np.where(G.graph[:, top["value"]] == Endpoint.ARROW.value)[0], + np.intersect1d( + np.where(G.graph[top["value"], :] == Endpoint.TAIL.value)[0], + np.where(G.graph[:, top["value"]] == Endpoint.TAIL.value)[0], + ), + ) + ) sign_child = np.ones(len(child)) # check each child, whether it has appeared before in the same pathway for k in range(len(child)): curr = top while True: - if len(curr['pa']): - if curr['pa']['value'] == child[k]: + if len(curr["pa"]): + if curr["pa"]["value"] == child[k]: sign_child[k] = 0 # has appeared in that path before break else: break - curr = curr['pa'] + curr = curr["pa"] for k in range(len(sign_child)): if sign_child[k]: - stack.insert(0, {'value': child[k], 'pa': top}) # push + stack.insert(0, {"value": child[k], "pa": top}) # push return sign @@ -173,10 +191,14 @@ def find_subset_include(s0, sub): def insert_changed_score(Data, G, i, j, T, record_local_score, score_func, parameters): # calculate the changed score after the insert operator: i->j - Tj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], - np.where(G.graph[j, :] == Endpoint.TAIL.value)[0]) # neighbors of Xj - Ti = np.union1d(np.where(G.graph[i, :] != Endpoint.NULL.value)[0], - np.where(G.graph[:, i] != Endpoint.NULL.value)[0]) # adjacent to Xi; + Tj = np.intersect1d( + np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], + np.where(G.graph[j, :] == Endpoint.TAIL.value)[0], + ) # neighbors of Xj + Ti = np.union1d( + np.where(G.graph[i, :] != Endpoint.NULL.value)[0], + np.where(G.graph[:, i] != Endpoint.NULL.value)[0], + ) # adjacent to Xi; NA = np.intersect1d(Tj, Ti) # find the neighbours of Xj and are adjacent to Xi Paj = np.where(G.graph[j, :] == Endpoint.ARROW.value)[0] # find the parents of Xj # the function local_score() calculates the local score @@ -197,12 +219,15 @@ def insert_changed_score(Data, G, i, j, T, record_local_score, score_func, param score1 = record_local_score[j][r0][-1] s1 = 1 - if not np.setxor1d(record_local_score[j][r0][0:-1], - tmp2).size: # notice the difference between 0*0 empty matrix and 1*0 empty matrix + if not np.setxor1d( + record_local_score[j][r0][0:-1], tmp2 + ).size: # notice the difference between 0*0 empty matrix and 1*0 empty matrix score2 = record_local_score[j][r0][-1] s2 = 1 else: - if (not np.setxor1d(record_local_score[j][r0][0:-1], [-1]).size) and (not tmp2.size): + if (not np.setxor1d(record_local_score[j][r0][0:-1], [-1]).size) and ( + not tmp2.size + ): score2 = record_local_score[j][r0][-1] s2 = 1 @@ -236,7 +261,9 @@ def insert(G, i, j, T): nodes = G.get_nodes() G.add_edge(Edge(nodes[i], nodes[j], Endpoint.TAIL, Endpoint.ARROW)) - for k in range(len(T)): # directing the previous undirected edge between T and Xj as T->Xj + for k in range( + len(T) + ): # directing the previous undirected edge between T and Xj as T->Xj if G.get_edge(nodes[T[k]], nodes[j]) is not None: G.remove_edge(G.get_edge(nodes[T[k]], nodes[j])) G.add_edge(Edge(nodes[T[k]], nodes[j], Endpoint.TAIL, Endpoint.ARROW)) @@ -250,14 +277,18 @@ def delete_validity_test(G, i, j, H): V = 0 # condition 1 - Hj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], - np.where(G.graph[j, :] == Endpoint.TAIL.value)[0]) # neighbors of Xj - Hi = np.union1d(np.where(G.graph[i, :] != Endpoint.NULL.value)[0], - np.where(G.graph[:, i] != Endpoint.NULL.value)[0]) # adjacent to Xi; + Hj = np.intersect1d( + np.where(G.graph[:, j] == Endpoint.TAIL.value)[0], + np.where(G.graph[j, :] == Endpoint.TAIL.value)[0], + ) # 
neighbors of Xj
+    Hi = np.union1d(
+        np.where(G.graph[i, :] != Endpoint.NULL.value)[0],
+        np.where(G.graph[:, i] != Endpoint.NULL.value)[0],
+    )  # adjacent to Xi;
     NA = np.intersect1d(Hj, Hi)  # find the neighbours of Xj that are adjacent to Xi

     s1 = check_clique(G, list(set(NA) - set(H)))  # check whether it is a clique
-    if (s1):
+    if s1:
         V = 1
     return V
@@ -265,12 +296,18 @@ def delete_changed_score(Data, G, i, j, H, record_local_score, score_func, param
     # calculate the changed score after the Delete operator
-    Hj = np.intersect1d(np.where(G.graph[:, j] == Endpoint.TAIL.value)[0],
-                        np.where(G.graph[j, :] == Endpoint.TAIL.value)[0])  # neighbors of Xj
-    Hi = np.union1d(np.where(G.graph[i, :] != Endpoint.NULL.value)[0],
-                    np.where(G.graph[:, i] != Endpoint.NULL.value)[0])  # adjacent to Xi;
+    Hj = np.intersect1d(
+        np.where(G.graph[:, j] == Endpoint.TAIL.value)[0],
+        np.where(G.graph[j, :] == Endpoint.TAIL.value)[0],
+    )  # neighbors of Xj
+    Hi = np.union1d(
+        np.where(G.graph[i, :] != Endpoint.NULL.value)[0],
+        np.where(G.graph[:, i] != Endpoint.NULL.value)[0],
+    )  # adjacent to Xi;
     NA = np.intersect1d(Hj, Hi)  # find the neighbours of Xj that are adjacent to Xi
-    Paj = np.union1d(np.where(G.graph[j, :] == Endpoint.ARROW.value)[0], [i])  # find the parents of Xj
+    Paj = np.union1d(
+        np.where(G.graph[j, :] == Endpoint.ARROW.value)[0], [i]
+    )  # find the parents of Xj
     # the function local_score() calculates the local score
     tmp1 = set(NA) - set(H)
     tmp2 = set.union(tmp1, set(Paj))
@@ -289,8 +326,9 @@ def delete_changed_score(Data, G, i, j, H, record_local_score, score_func, param
             score1 = record_local_score[j][r0][-1]
             s1 = 1

-        if set(record_local_score[j][r0][
-               0:-1]) == tmp2:  # notice the difference between 0*0 empty matrix and 1*0 empty matrix
+        if (
+            set(record_local_score[j][r0][0:-1]) == tmp2
+        ):  # notice the difference between 0*0 empty matrix and 1*0 empty matrix
             score2 = record_local_score[j][r0][-1]
             s2 = 1
         else:
@@ -356,11 +394,13 @@ def dist2(x, c):
     ndata, dimx = x.shape
     ncentres, dimc = c.shape
     if dimx != dimc:
-        raise Exception('Data dimension does not match dimension of centres')
+        raise Exception("Data dimension does not match dimension of centres")

-    n2 = (np.mat(np.ones((ncentres, 1))) * np.sum(np.multiply(x, x).T, axis=0)).T + \
-         np.mat(np.ones((ndata, 1))) * np.sum(np.multiply(c, c).T, axis=0) - \
-         2 * (x * c.T)
+    n2 = (
+        (np.ones((ncentres, 1)) * np.sum(np.multiply(x, x).T, axis=0)).T
+        + np.ones((ndata, 1)) * np.sum(np.multiply(c, c).T, axis=0)
+        - 2 * (x @ c.T)
+    )

     # Rounding errors occasionally cause negative entries in n2
     n2[np.where(n2 < 0)] = 0
@@ -375,9 +415,9 @@ def pdinv(A):
         invU = np.eye(numData).dot(np.linalg.inv(U))
         Ainv = invU.dot(invU.T)
     except numpy.linalg.LinAlgError as e:
-        warnings.warn('Matrix is not positive definite in pdinv, inverting using svd')
+        warnings.warn("Matrix is not positive definite in pdinv, inverting using svd")
         u, s, vh = np.linalg.svd(A, full_matrices=True)
         Ainv = vh.T.dot(np.diag(1 / s)).dot(u.T)
     except Exception as e:
         raise e
-    return np.mat(Ainv)
+    return Ainv
diff --git a/causallearn/utils/ScoreUtils.py b/causallearn/utils/ScoreUtils.py
index 6bd075aa..1b937451 100644
--- a/causallearn/utils/ScoreUtils.py
+++ b/causallearn/utils/ScoreUtils.py
@@ -10,7 +10,7 @@ def kernel(x, xKern, theta):
     # KERNEL Compute the rbf kernel
     n2 = dist2(x, xKern)
-    if (theta[0] == 0):
+    if theta[0] == 0:
         theta[0] = 2 / np.median(n2[np.where(np.tril(n2) > 0)])
     theta_new = theta[0]
     wi2 = theta[0] / 2
@@ -36,12 +36,14 @@ def dist2(x, c):
     ndata, dimx = x.shape
     ncentres, dimc = c.shape
-    if (dimx != dimc):
-        raise Exception('Data dimension does not match dimension of centres')
+    if dimx != dimc:
+        raise Exception("Data dimension does not match dimension of centres")

-    n2 = (np.mat(np.ones((ncentres, 1))) * np.sum(np.multiply(x, x).T, axis=0)).T + \
-         np.mat(np.ones((ndata, 1))) * np.sum(np.multiply(c, c).T, axis=0) - \
-         2 * (x * c.T)
+    n2 = (
+        (np.ones((ncentres, 1)) * np.sum(np.multiply(x, x).T, axis=0)).T
+        + np.ones((ndata, 1)) * np.sum(np.multiply(c, c).T, axis=0)
+        - 2 * (x @ c.T)
+    )

     # Rounding errors occasionally cause negative entries in n2
     n2[np.where(n2 < 0)] = 0
@@ -56,52 +58,52 @@ def pdinv(A):
         invU = np.eye(numData).dot(np.linalg.inv(U))
         Ainv = invU.dot(invU.T)
     except numpy.linalg.LinAlgError as e:
-        warnings.warn('Matrix is not positive definite in pdinv, inverting using svd')
+        warnings.warn("Matrix is not positive definite in pdinv, inverting using svd")
         u, s, vh = np.linalg.svd(A, full_matrices=True)
         Ainv = vh.T.dot(np.diag(1 / s)).dot(u.T)
     except Exception as e:
         raise e
-    return np.mat(Ainv)
+    return Ainv


 def eigdec(x, N, evals_only=False):
     # EIGDEC	Sorted eigendecomposition
     #
-    # 	Description
-    # 	EVALS = EIGDEC(X, N computes the largest N eigenvalues of the
-    # 	matrix X in descending order.  [EVALS, EVEC] = EIGDEC(X, N) also
-    # 	computes the corresponding eigenvectors.
+    #   Description
+    #   EVALS = EIGDEC(X, N) computes the largest N eigenvalues of the
+    #   matrix X in descending order.  [EVALS, EVEC] = EIGDEC(X, N) also
+    #   computes the corresponding eigenvectors.
     #
-    # 	See also
-    # 	PCA, PPCA
+    #   See also
+    #   PCA, PPCA
     #
-    if (N != np.round(N) or N < 1 or N > x.shape[1]):
-        raise Exception('Number of PCs must be integer, >0, < dim')
+    if N != np.round(N) or N < 1 or N > x.shape[1]:
+        raise Exception("Number of PCs must be integer, >0, < dim")

     # Find the eigenvalues of the data covariance matrix
-    if (evals_only):
+    if evals_only:
         # Use eig function as always more efficient than eigs here
         temp_evals, _ = np.linalg.eig(x)
     else:
         # Use eig function unless fraction of eigenvalues required is tiny
-        if ((N / x.shape[1]) > 0.04):
+        if (N / x.shape[1]) > 0.04:
             temp_evals, temp_evec = np.linalg.eig(x)
         else:
-            temp_evals, temp_evec = scipy.sparse.linalg.eigs(x, k=N, which='LM')
+            temp_evals, temp_evec = scipy.sparse.linalg.eigs(x, k=N, which="LM")

     # Eigenvalues nearly always returned in descending order, but just to make sure.....
     evals = np.sort(-temp_evals)
     perm = np.argsort(-temp_evals)
     evals = -evals[0:N]
-    if (not evals_only):
-        if (np.all(evals == temp_evals[0:N])):
+    if not evals_only:
+        if np.all(evals == temp_evals[0:N]):
             # Originals were in order
-            evec = temp_evec[:, 0: N]
+            evec = temp_evec[:, 0:N]
         else:
             # Need to reorder the eigenvectors
-            evec = np.empty_like(temp_evec[:, 0: N])
+            evec = np.empty_like(temp_evec[:, 0:N])
             for i in range(N):
                 evec[:, i] = temp_evec[:, perm[i]]
@@ -179,9 +181,9 @@ def minimize(X, f, length, *varargin):
     red = 1

     if length > 0:
-        S = 'Linesearch'
+        S = "Linesearch"
     else:
-        S = 'Function evaluation'
+        S = "Function evaluation"

     i = 0  # zero the run length counter
     ls_failed = 0  # no previous line search has failed
@@ -213,7 +215,7 @@ def minimize(X, f, length, *varargin):
             df3 = df0
             success = False

-            while (not success and M > 0):
+            while not success and M > 0:
                 try:
                     M = M - 1
                     i = i + (1 if length < 0 else 0)  # count epochs?!
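These dist2/pdinv hunks are the crux of the whole migration: np.matrix overloads `*` as the matrix product, while on plain ndarrays `*` is elementwise with broadcasting, which is why the products above had to become `@`. A standalone illustration (not part of the patch) of the two behaviours:

    import numpy as np

    a = np.arange(6.0).reshape(2, 3)
    b = np.arange(6.0).reshape(3, 2)
    print((a @ b).shape)    # (2, 2): matrix product, what np.mat's `*` used to mean

    x = np.arange(4.0).reshape(4, 1)
    print((x * x.T).shape)  # (4, 4): broadcasting turns column*row into an outer
                            # product, so `X * X.T` still equals `X @ X.T` for a
                            # single column -- but not for general matrices

This is also why any `H0 * Kx * H0` centering products left over from the np.mat days are worth auditing: on ndarrays they are elementwise, not the intended H0 @ Kx @ H0.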
@@ -221,8 +223,13 @@ def minimize(X, f, length, *varargin): temp.extend(varargin) temp.extend([None, 2]) f3, df3 = feval(temp) - if np.isnan(f3) or np.isinf(f3) or np.any(np.isnan(df3)) or np.any(np.isinf(df3)): - raise Exception('') + if ( + np.isnan(f3) + or np.isinf(f3) + or np.any(np.isnan(df3)) + or np.any(np.isinf(df3)) + ): + raise Exception("") success = True except Exception as e: # catch any error which occurred in f x3 = (x2 + x3) / 2 # bisect and try again @@ -232,7 +239,9 @@ def minimize(X, f, length, *varargin): F0 = f3 dF0 = df3 d3 = (df3.T * s)[0, 0] # new slope - if d3 > SIG * d0 or f3 > f0 + x3 * RHO * d0 or M == 0: # are we done extrapolating? + if ( + d3 > SIG * d0 or f3 > f0 + x3 * RHO * d0 or M == 0 + ): # are we done extrapolating? break x1 = x2 # move point 2 to point 1 @@ -243,8 +252,12 @@ def minimize(X, f, length, *varargin): d2 = d3 A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1) # make cubic extrapolation B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1) - x3 = x1 - d1 * (x2 - x1) ** 2 / (B + np.sqrt(B * B - A * d1 * (x2 - x1))) # num. error possible, ok! - if not np.isreal(x3) or np.isnan(x3) or np.isinf(x3) or x3 < 0: # num prob | wrong sign? + x3 = x1 - d1 * (x2 - x1) ** 2 / ( + B + np.sqrt(B * B - A * d1 * (x2 - x1)) + ) # num. error possible, ok! + if ( + not np.isreal(x3) or np.isnan(x3) or np.isinf(x3) or x3 < 0 + ): # num prob | wrong sign? x3 = x2 * EXT # extrapolate maximum amount elif x3 > x2 * EXT: # new point beyond extrapolation limit? x3 = x2 * EXT # extrapolate maximum amount @@ -252,7 +265,9 @@ def minimize(X, f, length, *varargin): x3 = x2 + INT * (x2 - x1) # end extrapolation - while (abs(d3) > -SIG * d0 or f3 > f0 + x3 * RHO * d0) and M > 0: # keep interpolating + while ( + abs(d3) > -SIG * d0 or f3 > f0 + x3 * RHO * d0 + ) and M > 0: # keep interpolating if d3 > 0 or f3 > f0 + x3 * RHO * d0: # choose subinterval x4 = x3 # move point 3 to point 4 f4 = f3 @@ -263,16 +278,22 @@ def minimize(X, f, length, *varargin): d2 = d3 if f4 > f0: - x3 = x2 - (0.5 * d2 * (x4 - x2) ** 2) / (f4 - f2 - d2 * (x4 - x2)) # quadratic interpolation + x3 = x2 - (0.5 * d2 * (x4 - x2) ** 2) / ( + f4 - f2 - d2 * (x4 - x2) + ) # quadratic interpolation else: A = 6 * (f2 - f4) / (x4 - x2) + 3 * (d4 + d2) # cubic interpolation B = 3 * (f4 - f2) - (2 * d2 + d4) * (x4 - x2) - x3 = x2 + (np.sqrt(B * B - A * d2 * (x4 - x2) ** 2) - B) / A # num. error possible, ok! + x3 = ( + x2 + (np.sqrt(B * B - A * d2 * (x4 - x2) ** 2) - B) / A + ) # num. error possible, ok! 
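For context, minimize() reads as a conjugate-gradient line search in the style of the GPML minimize.m routine: bracket a minimum by cubic extrapolation, then shrink the bracket by quadratic/cubic interpolation until the slope condition (SIG) and the sufficient-decrease condition (RHO) hold. The extrapolation step above, isolated as a sketch with the same variable names ((x1, f1, d1) and (x2, f2, d2) being the two bracketing points and their slopes):

    import numpy as np

    def cubic_extrapolate(x1, f1, d1, x2, f2, d2):
        # fit a cubic through both points with matching slopes, then jump to
        # its stationary point (the same A, B, x3 formulas as in the code above)
        A = 6 * (f1 - f2) + 3 * (d2 + d1) * (x2 - x1)
        B = 3 * (f2 - f1) - (2 * d1 + d2) * (x2 - x1)
        return x1 - d1 * (x2 - x1) ** 2 / (B + np.sqrt(B * B - A * d1 * (x2 - x1)))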
if np.isnan(x3) or np.isinf(x3): x3 = (x2 + x4) / 2 # if we had a numerical problem then bisect - x3 = max(min(x3, x4 - INT * (x4 - x2)), x2 + INT * (x4 - x2)) # don't accept too close + x3 = max( + min(x3, x4 - INT * (x4 - x2)), x2 + INT * (x4 - x2) + ) # don't accept too close temp = [f, X + x3 * s] temp.extend(varargin) temp.extend([None, 2]) @@ -286,24 +307,28 @@ def minimize(X, f, length, *varargin): d3 = (df3.T * s)[0, 0] # new slope # end interpolation - if (abs(d3) < -SIG * d0 and f3 < f0 + x3 * RHO * d0): # if line search succeeded + if abs(d3) < -SIG * d0 and f3 < f0 + x3 * RHO * d0: # if line search succeeded X = X + x3 * s f0 = f3 fX = np.vstack([fX, f0]) # update variables - s = ((df3.T * df3)[0, 0] - df0.T * df3[0, 0]) / (df0.T * df0)[0, 0] * s - df3 # Polack-Ribiere CG direction + s = ((df3.T * df3)[0, 0] - df0.T * df3[0, 0]) / (df0.T * df0)[ + 0, 0 + ] * s - df3 # Polack-Ribiere CG direction df0 = df3 # swap derivatives d3 = d0 d0 = (df0.T * s)[0, 0] - if (d0 > 0): # new slope must be negative + if d0 > 0: # new slope must be negative s = -df0 d0 = -(s.T * s)[0, 0] - x3 = x3 * min(RATIO, d3 / (d0 - sys.float_info.min)) # slope ratio but max RATIO + x3 = x3 * min( + RATIO, d3 / (d0 - sys.float_info.min) + ) # slope ratio but max RATIO ls_failed = 0 # this line search did not fail else: X = X0 # restore best point so far f0 = F0 df0 = dF0 - if (ls_failed or i > abs(length)): # line search failed twice in a row + if ls_failed or i > abs(length): # line search failed twice in a row break # or we ran out of time, so we give up s = -df0 # try steepest d0 = -(s.T * s)[0, 0] @@ -313,72 +338,98 @@ def minimize(X, f, length, *varargin): def feval(parameters): - if parameters[0] == 'covSum': - if (len(parameters) == 1): + if parameters[0] == "covSum": + if len(parameters) == 1: return cov_sum() - elif (len(parameters) == 2): + elif len(parameters) == 2: return cov_sum(parameters[1]) - elif (len(parameters) == 3): + elif len(parameters) == 3: return cov_sum(parameters[1], parameters[2]) - elif (len(parameters) == 4): + elif len(parameters) == 4: return cov_sum(parameters[1], parameters[2], parameters[3]) - elif (len(parameters) == 5): + elif len(parameters) == 5: return cov_sum(parameters[1], parameters[2], parameters[3], parameters[4]) - elif (len(parameters) == 6): - return cov_sum(parameters[1], parameters[2], parameters[3], parameters[4], parameters[5]) - elif parameters[0] == 'covNoise': - if (len(parameters) == 1): + elif len(parameters) == 6: + return cov_sum( + parameters[1], + parameters[2], + parameters[3], + parameters[4], + parameters[5], + ) + elif parameters[0] == "covNoise": + if len(parameters) == 1: return cov_noise() - elif (len(parameters) == 2): + elif len(parameters) == 2: return cov_noise(parameters[1]) - elif (len(parameters) == 3): + elif len(parameters) == 3: return cov_noise(parameters[1], parameters[2]) - elif (len(parameters) == 4): + elif len(parameters) == 4: return cov_noise(parameters[1], parameters[2], parameters[3]) else: return cov_noise(parameters[1], parameters[2], parameters[3], parameters[4]) - elif parameters[0] == 'covSEard': - if (len(parameters) == 1): + elif parameters[0] == "covSEard": + if len(parameters) == 1: return cov_seard() - elif (len(parameters) == 2): + elif len(parameters) == 2: return cov_seard(parameters[1]) - elif (len(parameters) == 3): + elif len(parameters) == 3: return cov_seard(parameters[1], parameters[2]) - elif (len(parameters) == 4): + elif len(parameters) == 4: return cov_seard(parameters[1], parameters[2], 
parameters[3]) else: return cov_seard(parameters[1], parameters[2], parameters[3], parameters[4]) - elif parameters[0] == 'covSum': - if (len(parameters) == 1): + elif parameters[0] == "covSum": + if len(parameters) == 1: return cov_sum() - elif (len(parameters) == 2): + elif len(parameters) == 2: return cov_sum(parameters[1]) - elif (len(parameters) == 3): + elif len(parameters) == 3: return cov_sum(parameters[1], parameters[2]) - elif (len(parameters) == 4): + elif len(parameters) == 4: return cov_sum(parameters[1], parameters[2], parameters[3]) - elif (len(parameters) == 5): + elif len(parameters) == 5: return cov_sum(parameters[1], parameters[2], parameters[3], parameters[4]) - elif (len(parameters) == 6): - return cov_sum(parameters[1], parameters[2], parameters[3], parameters[4], parameters[5]) - elif parameters[0] == 'gpr_multi_new': - if (len(parameters) == 1): + elif len(parameters) == 6: + return cov_sum( + parameters[1], + parameters[2], + parameters[3], + parameters[4], + parameters[5], + ) + elif parameters[0] == "gpr_multi_new": + if len(parameters) == 1: return gpr_multi_new() - elif (len(parameters) == 2): + elif len(parameters) == 2: return gpr_multi_new(parameters[1]) - elif (len(parameters) == 3): + elif len(parameters) == 3: return gpr_multi_new(parameters[1], parameters[2]) - elif (len(parameters) == 4): + elif len(parameters) == 4: return gpr_multi_new(parameters[1], parameters[2], parameters[3]) - elif (len(parameters) == 5): - return gpr_multi_new(parameters[1], parameters[2], parameters[3], parameters[4]) - elif (len(parameters) == 6): - return gpr_multi_new(parameters[1], parameters[2], parameters[3], parameters[4], parameters[5]) - elif (len(parameters) == 7): - return gpr_multi_new(parameters[1], parameters[2], parameters[3], parameters[4], parameters[5], - parameters[6]) + elif len(parameters) == 5: + return gpr_multi_new( + parameters[1], parameters[2], parameters[3], parameters[4] + ) + elif len(parameters) == 6: + return gpr_multi_new( + parameters[1], + parameters[2], + parameters[3], + parameters[4], + parameters[5], + ) + elif len(parameters) == 7: + return gpr_multi_new( + parameters[1], + parameters[2], + parameters[3], + parameters[4], + parameters[5], + parameters[6], + ) else: - raise Exception('Undefined function') + raise Exception("Undefined function") def gpr_multi_new(logtheta=None, covfunc=None, x=None, y=None, xstar=None, nargout=1): @@ -418,7 +469,9 @@ def gpr_multi_new(logtheta=None, covfunc=None, x=None, y=None, xstar=None, nargo n, D = x.shape n, m = y.shape if eval(feval(covfunc)) != logtheta.shape[0]: - raise Exception('Error: Number of parameters do not agree with covariance function') + raise Exception( + "Error: Number of parameters do not agree with covariance function" + ) temp = list(covfunc.copy()) temp.append(logtheta) @@ -429,13 +482,25 @@ def gpr_multi_new(logtheta=None, covfunc=None, x=None, y=None, xstar=None, nargo alpha = solve_chol(L.T, y) if ( - logtheta is not None and covfunc is not None and x is not None and y is not None and xstar is None): # if no test cases, compute the negative log marginal likelihood - out1 = 0.5 * np.trace(y.T * alpha) + m * np.sum(np.log(np.diag(L)), axis=0) + 0.5 * m * n * np.log( - 2 * np.pi) + logtheta is not None + and covfunc is not None + and x is not None + and y is not None + and xstar is None + ): # if no test cases, compute the negative log marginal likelihood + out1 = ( + 0.5 * np.trace(y.T @ alpha) + + m * np.sum(np.log(np.diag(L)), axis=0) + + 0.5 * m * n * np.log(2 * np.pi) + ) if 
nargout == 2: # ... and if requested, its partial derivatives - out2 = np.mat(np.zeros((logtheta.shape[0], 1))) # set the size of the derivative vector - W = m * (np.linalg.inv(L.T) * ( - np.linalg.inv(L) * np.mat(np.eye(n)))) - alpha * alpha.T # precompute for convenience + out2 = np.zeros( + (logtheta.shape[0], 1) + ) # set the size of the derivative vector + W = ( + m * (np.linalg.inv(L.T) * (np.linalg.inv(L) * np.eye(n))) + - alpha @ alpha.T + ) # precompute for convenience for i in range(len(out2) - 1, len(out2)): temp = list(covfunc.copy()) temp.append(logtheta) @@ -453,7 +518,7 @@ def gpr_multi_new(logtheta=None, covfunc=None, x=None, y=None, xstar=None, nargo if nargout == 2: v = np.linalg.inv(L) * Kstar - v = np.mat(v) + v = v out2 = Kss - np.sum(np.multiply(v, v), axis=0).T if nargout == 1: @@ -476,16 +541,16 @@ def solve_chol(A, B): # takes precedence over the matlab code in this file. if A is None or B is None: - raise Exception('Wrong number of arguments.') + raise Exception("Wrong number of arguments.") - if (A.shape[0] != A.shape[1] or A.shape[0] != B.shape[0]): - raise Exception('Wrong sizes of matrix arguments.') + if A.shape[0] != A.shape[1] or A.shape[0] != B.shape[0]: + raise Exception("Wrong sizes of matrix arguments.") - res = np.linalg.inv(A) * (np.linalg.inv(A.T) * B) + res = np.linalg.inv(A) @ (np.linalg.inv(A.T) @ B) return res -K = np.mat(np.empty((0, 0))) +K = np.empty((0, 0)) def cov_noise(logtheta=None, x=None, z=None, nargout=1): @@ -501,22 +566,24 @@ def cov_noise(logtheta=None, x=None, z=None, nargout=1): # # For more help on design of covariance functions, see "covFunctions". - if (logtheta is None and x is None and z is None): # report number of parameters - A = '1' + if logtheta is None and x is None and z is None: # report number of parameters + A = "1" return A - s2 = np.exp(2 * logtheta)[0, 0] # noise variance + s2 = np.exp(2 * logtheta)[0] # noise variance - if (logtheta is not None and x is not None and z is None): # compute covariance matrix - A = s2 * np.mat(np.eye(x.shape[0])) - elif (nargout == 2): # compute test set covariances + if ( + logtheta is not None and x is not None and z is None + ): # compute covariance matrix + A = s2 * np.eye(x.shape[0]) + elif nargout == 2: # compute test set covariances A = s2 B = 0 # zeros cross covariance by independence else: # compute derivative matrix - A = 2 * s2 * np.mat(np.eye(x.shape[0])) + A = 2 * s2 * np.eye(x.shape[0]) - if (nargout == 2): + if nargout == 2: return A, B else: return A @@ -541,8 +608,8 @@ def cov_seard(loghyper=None, x=None, z=None, nargout=1): # For more help on design of covariance functions, see "covFunctions". 
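For reference, the covariance cov_seard evaluates is the ARD squared-exponential k(x, x') = sf2 * exp(-0.5 * sum_d (x_d - x'_d)^2 / ell_d^2), with loghyper stacking the D log length scales followed by the log signal std. A pointwise sketch (an independent reference implementation under those assumptions, not the vectorized sq_dist path used below):

    import numpy as np

    def k_seard(x, xp, loghyper):
        # loghyper[:D] = log length scales, loghyper[D] = log(sf)
        x, xp, loghyper = (np.asarray(v, float).ravel() for v in (x, xp, loghyper))
        D = x.size
        ell = np.exp(loghyper[:D])        # characteristic length scales
        sf2 = np.exp(2 * loghyper[D])     # signal variance
        return sf2 * np.exp(-0.5 * np.sum(((x - xp) / ell) ** 2))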
global K - if (loghyper is None and x is None and z is None): - A = '(D+1)' + if loghyper is None and x is None and z is None: + A = "(D+1)" return A # report number of parameters @@ -551,24 +618,24 @@ def cov_seard(loghyper=None, x=None, z=None, nargout=1): ell = np.exp(loghyper[0:D]) # characteristic length scale sf2 = np.exp(2 * loghyper[D]) # signal variance - if (loghyper is not None and x is not None): - K = sf2 * np.exp(-sq_dist(np.mat(np.diag(1 / ell) * x.T)) / 2) + if loghyper is not None and x is not None: + K = sf2 * np.exp(-sq_dist(np.diag(1 / ell) * x.T) / 2) A = K elif nargout == 2: # compute test set covariances - A = sf2 * np.mat(np.ones((z, 1))) - B = sf2 * np.exp(-sq_dist(np.mat(np.diag(1 / ell)) * x.T, np.mat(np.diag(1 / ell)) * z) / 2) + A = sf2 * np.ones((z, 1)) + B = sf2 * np.exp(-sq_dist(np.diag(1 / ell) * x.T, np.diag(1 / ell) * z) / 2) else: # check for correct dimension of the previously calculated kernel matrix - if (K.shape[0] != n or K.shape[1] != n): - K = sf2 * np.exp(-sq_dist(np.mat(np.diag(1 / ell) * x.T)) / 2) + if K.shape[0] != n or K.shape[1] != n: + K = sf2 * np.exp(-sq_dist(np.diag(1 / ell) * x.T) / 2) if z <= D: # length scale parameters A = np.multiply(K, sq_dist(x[:, z].T / ell[z])) else: # magnitude parameter A = 2 * K - K = np.mat(np.empty((0, 0))) + K = np.empty((0, 0)) - if (nargout == 2): + if nargout == 2: return A, B else: return A @@ -600,30 +667,34 @@ def sq_dist(a, b=None, Q=None): # where a is of size D by n, b is of size D by m (or empty), C and Q are of # size n by m and c is of size D by 1. - if b is None or len(b) == 0: # input arguments are taken to be identical if b is missing or empty + if ( + b is None or len(b) == 0 + ): # input arguments are taken to be identical if b is missing or empty b = a D, n = a.shape d, m = b.shape if d != D: - raise Exception('Error: column lengths must agree.') + raise Exception("Error: column lengths must agree.") if Q is None: - C = np.mat(np.zeros((n, m))) + C = np.zeros((n, m)) for d in range(D): - temp = np.tile(b[d, :], (n, 1)) - np.tile(a[d, :].T, (1, m)) + temp = np.tile(b[d, :], (n, 1)) - np.tile(a[d, :].reshape(-1, 1), (1, m)) C = C + np.multiply(temp, temp) else: if (n, m) == Q.shape: - C = np.mat(np.zeros((D, 1))) + C = np.zeros((D, 1)) for d in range(D): - temp = np.tile(b[d, :], (n, 1)) - np.tile(a[d, :].T, (1, m)) + temp = np.tile(b[d, :], (n, 1)) - np.tile( + a[d, :].reshape(-1, 1), (1, m) + ) temp = np.multiply(temp, temp) temp = np.multiply(temp, Q) C[d] = np.sum(temp) else: - raise Exception('Third argument has wrong size.') + raise Exception("Third argument has wrong size.") return C @@ -640,10 +711,10 @@ def cov_sum(covfunc, logtheta=None, x=None, z=None, nargout=1): f = covfunc[i] j.append([feval([f])]) - if (logtheta is None and x is None and z is None): # report number of parameters + if logtheta is None and x is None and z is None: # report number of parameters A = j[0][0] for i in range(1, len(covfunc)): - A = A + '+' + j[i][0] + A = A + "+" + j[i][0] return A @@ -655,8 +726,10 @@ def cov_sum(covfunc, logtheta=None, x=None, z=None, nargout=1): v.append(i) v = np.asarray(v) - if (logtheta is not None and x is not None and z is None): # compute covariance matrix - A = np.mat(np.zeros((n, n))) # allocate space for covariance matrix + if ( + logtheta is not None and x is not None and z is None + ): # compute covariance matrix + A = np.zeros((n, n)) # allocate space for covariance matrix for i in range(len(covfunc)): # iteration over summand functions f = covfunc[i] temp = [f] @@ 
-666,10 +739,11 @@ def cov_sum(covfunc, logtheta=None, x=None, z=None, nargout=1): A = A + feval(temp) if ( - logtheta is not None and x is not None and z is not None): # compute derivative matrix or test set covariances + logtheta is not None and x is not None and z is not None + ): # compute derivative matrix or test set covariances if nargout == 2: # compute test set cavariances - A = np.mat(np.zeros((z, 1))) - B = np.mat(np.zeros((x.shape[0], z))) # allocate space + A = np.zeros((z, 1)) + B = np.zeros((x.shape[0], z)) # allocate space for i in range(len(covfunc)): f = covfunc[i] temp = [f] @@ -683,7 +757,9 @@ def cov_sum(covfunc, logtheta=None, x=None, z=None, nargout=1): B = B + BB else: # compute derivative matrices i = v[z] # which covariance function - j = np.sum(np.where(v[0:z] == i, 1, 0)) # which parameter in that covariance + j = np.sum( + np.where(v[0:z] == i, 1, 0) + ) # which parameter in that covariance f = covfunc[i] temp = [f] t = logtheta[np.where(v == i)] @@ -692,7 +768,7 @@ def cov_sum(covfunc, logtheta=None, x=None, z=None, nargout=1): temp.append(j) A = feval(temp) - if (nargout == 2): + if nargout == 2: return A, B else: return A diff --git a/tests/TestLocalScore.py b/tests/TestLocalScore.py index f8061be3..b2937843 100644 --- a/tests/TestLocalScore.py +++ b/tests/TestLocalScore.py @@ -2,6 +2,7 @@ from causallearn.utils.GESUtils import * from causallearn.score.LocalScoreFunctionClass import LocalScoreClass + class TestLocalScore(unittest.TestCase): np.random.seed(10) @@ -9,21 +10,25 @@ class TestLocalScore(unittest.TestCase): X_prime = np.random.randn(300, 1) Y = X + 0.5 * np.random.randn(300, 1) Z = Y + 0.5 * np.random.randn(300, 1) - data = np.mat(np.hstack((X, X_prime, Y, Z))) + data = np.hstack((X, X_prime, Y, Z)) np.random.seed(10) X_slash = np.random.randint(0, 10, (300, 1)) X_prime_slash = np.random.randint(0, 10, (300, 1)) Y_slash = X_slash + np.random.randint(0, 10, (300, 1)) Z_slash = Y_slash + np.random.randint(0, 10, (300, 1)) - data_slash = np.mat(np.hstack((X_slash, X_prime_slash, Y_slash, Z_slash))) + data_slash = np.hstack((X_slash, X_prime_slash, Y_slash, Z_slash)) def test_local_score_marginal_multi(self): - parameters = {'dlabel': {}} + parameters = {"dlabel": {}} for i in range(self.data.shape[1]): - parameters['dlabel'][i] = i + parameters["dlabel"][i] = i - localScoreClass = LocalScoreClass(data=self.data, local_score_fun=local_score_marginal_multi, parameters=parameters) + localScoreClass = LocalScoreClass( + data=self.data, + local_score_fun=local_score_marginal_multi, + parameters=parameters, + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1]) @@ -32,11 +37,17 @@ def test_local_score_marginal_multi(self): assert q < v < p def test_local_score_CV_multi(self): - parameters = {'kfold': 10, 'lambda': 0.01, 'dlabel': {}} # regularization parameter + parameters = { + "kfold": 10, + "lambda": 0.01, + "dlabel": {}, + } # regularization parameter for i in range(self.data.shape[1]): - parameters['dlabel'][i] = i + parameters["dlabel"][i] = i - localScoreClass = LocalScoreClass(data=self.data, local_score_fun=local_score_cv_multi, parameters=parameters) + localScoreClass = LocalScoreClass( + data=self.data, local_score_fun=local_score_cv_multi, parameters=parameters + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1]) @@ -48,7 +59,9 @@ def test_local_score_BIC(self): parameters = {} parameters["lambda_value"] = 2 - localScoreClass = LocalScoreClass(data=self.data, local_score_fun=local_score_BIC, 
parameters=parameters) + localScoreClass = LocalScoreClass( + data=self.data, local_score_fun=local_score_BIC, parameters=parameters + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1]) @@ -57,10 +70,16 @@ def test_local_score_BIC(self): assert q < v < p def test_local_score_CV_general(self): - parameters = {'kfold': 10, # 10 fold cross validation - 'lambda': 0.01} # regularization parameter + parameters = { + "kfold": 10, # 10 fold cross validation + "lambda": 0.01, + } # regularization parameter - localScoreClass = LocalScoreClass(data=self.data, local_score_fun=local_score_cv_general, parameters=parameters) + localScoreClass = LocalScoreClass( + data=self.data, + local_score_fun=local_score_cv_general, + parameters=parameters, + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1]) @@ -71,7 +90,11 @@ def test_local_score_CV_general(self): def test_local_score_marginal_general(self): parameters = {} - localScoreClass = LocalScoreClass(data=self.data, local_score_fun=local_score_marginal_general, parameters=parameters) + localScoreClass = LocalScoreClass( + data=self.data, + local_score_fun=local_score_marginal_general, + parameters=parameters, + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1]) @@ -80,7 +103,9 @@ def test_local_score_marginal_general(self): assert q < v < p def test_local_score_BDeu(self): - localScoreClass = LocalScoreClass(data=self.data_slash, local_score_fun=local_score_BDeu, parameters=None) + localScoreClass = LocalScoreClass( + data=self.data_slash, local_score_fun=local_score_BDeu, parameters=None + ) q = localScoreClass.score(0, [0]) p = localScoreClass.score(0, [1])
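
Editor's note: the behavioral core of this patch is the migration from np.matrix to plain ndarray: np.mat(...) wrappers are dropped, matrix products switch from * to @, and column slices gain reshape(-1, 1) because ndarray indexing drops the dimension that np.matrix used to keep. The standalone sketch below (not part of the patch; variable names are illustrative) demonstrates the two semantic differences the hunks above compensate for.

    import numpy as np

    rng = np.random.default_rng(0)
    X = rng.standard_normal((5, 1))      # a 2-D column of samples

    # On np.matrix, `*` meant matrix multiplication; on ndarray it is
    # elementwise, so Gram-style products must use `@`:
    gram = X @ X.T                       # (5, 5) outer product, replaces X * X.T

    # ndarray slicing drops a dimension where np.matrix kept it, hence the
    # reshape(-1, 1) calls added throughout the diff:
    col = X[:, 0]                        # shape (5,)  -- now 1-D
    col2d = X[:, 0].reshape(-1, 1)       # shape (5, 1) -- matches old np.mat behavior

    # Centering matrix built from plain ndarrays, as in the kernel code:
    T = X.shape[0]
    H0 = np.eye(T) - np.ones((T, T)) / T
    K_centered = H0 @ gram @ H0          # H0 * gram * H0 would now be elementwise

One consequence worth double-checking in review: any `*` left between two 2-D ndarrays after this migration (for example the np.linalg.inv(L.T) * (np.linalg.inv(L) * np.eye(n)) term in gpr_multi_new, or np.diag(1 / ell) * x.T inside the sq_dist call sites) now multiplies elementwise rather than as matrices.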