
Commit b8ca66d

ishevche authored and joaopfonseca committed
Code formating
1 parent c442f69 commit b8ca66d

8 files changed: +82 -45 lines changed


experiments/0.2-basic-experiment.py

Lines changed: 1 addition & 1 deletion
@@ -279,4 +279,4 @@
 
 results[dataset["name"]][xai_method["name"]].append(contributions)
 result_df = pd.DataFrame(contributions, columns=X.columns, index=X.index)
-result_df.to_csv(result_fname)
+result_df.to_csv(result_fname)
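Note: the removed and added lines read the same once leading whitespace is stripped, so this appears to be an indentation-only change. For context, the touched lines implement a simple persistence step: the per-method contribution matrix is wrapped in a DataFrame that reuses the feature matrix's columns and index and written to CSV. A minimal sketch of that pattern, using illustrative stand-in data and a hypothetical file name:

import numpy as np
import pandas as pd

# Illustrative stand-ins for the experiment's objects (not taken from the repo).
X = pd.DataFrame(np.random.rand(5, 3), columns=["n1", "n2", "n3"])
contributions = np.random.rand(5, 3)  # one attribution value per cell of X
result_fname = "contributions_example.csv"  # hypothetical output path

# Same pattern as the diffed lines: align contributions with X's schema, then persist.
result_df = pd.DataFrame(contributions, columns=X.columns, index=X.index)
result_df.to_csv(result_fname)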

experiments/0.3-time-experiment.py

Lines changed: 28 additions & 11 deletions
@@ -156,7 +156,6 @@
     + [f"fidelity_{i}" for i in range(N_RUNS)]
 )
 
-
 for dataset in datasets:
     result_df = []
     # Set up basic settings
@@ -174,8 +173,12 @@
 ranking = scores_to_ordering(scores)
 
 # Set experiment size if we deleted too many items
-dataset["n_observations"] = dataset["n_observations"] if X.shape[0] > dataset["n_observations"] else X.shape[0]
-
+dataset["n_observations"] = (
+    dataset["n_observations"]
+    if X.shape[0] > dataset["n_observations"]
+    else X.shape[0]
+)
+
 rng = check_random_state(RNG_SEED)
 
 # rank and score indexes
@@ -184,7 +187,7 @@
     size=dataset["n_observations"],
     replace=False,
 )
-
+
 # pairwise pairs
 combos = list(itertools.combinations(np.indices((X.shape[0],)).squeeze(), 2))
 pairs_indexes = rng.choice(
@@ -193,14 +196,25 @@
     replace=False,
 )
 pairs_sample = [combos[i] for i in pairs_indexes]
-pairs = [(pair[0], pair[1]) if np.random.choice([0,1]) else (pair[1], pair[0]) for pair in pairs_sample]
+pairs = [
+    (pair[0], pair[1]) if np.random.choice([0, 1]) else (pair[1], pair[0])
+    for pair in pairs_sample
+]
 
 for approach in approaches:
     iteration_qoi = approach
     if approach.startswith("pairwise"):
         iteration_qoi = approach.split("-")[1]
         approach = "pairwise"
-    print("----------------", dataset["name"], "|", approach, "|", iteration_qoi, "----------------")
+    print(
+        "----------------",
+        dataset["name"],
+        "|",
+        approach,
+        "|",
+        iteration_qoi,
+        "----------------",
+    )
 
     times = []
     kendall_cons = []
@@ -275,7 +289,7 @@
 [
     dataset["name"],
     dataset["n_observations"],
-    approach+"_"+iteration_qoi,
+    approach + "_" + iteration_qoi,
     np.nan,
     np.nan,
     np.mean(times),
@@ -368,10 +382,13 @@
         contr, baseline_contr, measure="jaccard", n_features=2
     )[0]
 )
-#Eulidean consistency
+# Eulidean consistency
 euclidean_cons.append(
     cross_method_explanation_consistency(
-        contr, baseline_contr, measure="euclidean", normalization=True
+        contr,
+        baseline_contr,
+        measure="euclidean",
+        normalization=True,
     )[0]
 )
 # Iniatialize normalizer
@@ -396,14 +413,14 @@
     target_pairs=target[sam_idx2],
     rank=True,
 )
-
+
 fidelity.append(res_)
 
 results_row = (
     [
         dataset["name"],
         dataset["n_observations"],
-        approach+"_"+iteration_qoi,
+        approach + "_" + iteration_qoi,
         parameter,
         parameter_value,
         np.mean(times),
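The reformatted list comprehension above samples unordered index pairs and then randomly flips each pair's orientation before the pairwise experiments run. A self-contained sketch of that sampling pattern; the constants and the use of len(combos) as the sampling range are illustrative assumptions, since the surrounding lines fall outside the hunk:

import itertools

import numpy as np
from sklearn.utils import check_random_state

N_OBSERVATIONS = 20  # illustrative
N_PAIRS = 10  # illustrative
rng = check_random_state(42)

# All unordered index pairs, as in the diff (np.indices((n,)).squeeze() yields 0..n-1).
combos = list(itertools.combinations(np.indices((N_OBSERVATIONS,)).squeeze(), 2))

# Sample a subset of pairs without replacement, then randomise each pair's order.
pairs_indexes = rng.choice(len(combos), size=N_PAIRS, replace=False)
pairs_sample = [combos[i] for i in pairs_indexes]
pairs = [
    (pair[0], pair[1]) if np.random.choice([0, 1]) else (pair[1], pair[0])
    for pair in pairs_sample
]
print(pairs)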

xai_ranking/_min_dependencies.py

Lines changed: 12 additions & 7 deletions
@@ -8,40 +8,45 @@
     "pandas": ("1.3.5", "metrics, datasets"),
     "scipy": ("1.14.1", "metrics"),
     "scikit-learn": ("1.2.0", "metrics"),
-
     "pytest-cov": ("3.0.0", "tests"),
     "flake8": ("3.8.2", "tests"),
     "black": ("22.3", "tests"),
     "pylint": ("2.12.2", "tests"),
     "mypy": ("1.6.1", "tests"),
     "sphinx": ("4.2.0", "docs"),
-
     # dev
     # "coverage": ("", "tests"),
     # "click": ("", "tests"),
-
     # nutrition labels
     # "matplotlib" : ("", "install"),
     # "seaborn" : ("", "install"),
-
     # L2R
     # "lightgbm" : ("", "install"),
-
     # general?
     # "xai-sharp": ("0.1.a1", "install"),
     # "shap" : ("", "install"),
     # "lime" : ("", "install"),
     # "statsmodels" : ("", "install"),
     # "ml-research" : ("", "install"),
-
     # dataset module
     # "openpyxl" : ("", "install"),
     # "" : ("", "install"),
 }
 
 # create inverse mapping for setuptools
 tag_to_packages: dict = {
-    extra: [] for extra in ["install", "optional", "docs", "examples", "tests", "all", "metrics", "datasets", "scores"]
+    extra: []
+    for extra in [
+        "install",
+        "optional",
+        "docs",
+        "examples",
+        "tests",
+        "all",
+        "metrics",
+        "datasets",
+        "scores",
+    ]
 }
 for package, (min_version, extras) in dependent_packages.items():
     for extra in extras.split(", "):
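The reformatting here splits the dict comprehension that seeds the inverse mapping: dependent_packages maps each package to a minimum version plus a comma-separated list of extras, and tag_to_packages inverts that into extra → list of requirements for setuptools. A reduced sketch of how the inversion behaves; the dict is trimmed and the requirement string format is an assumption, since the loop body sits outside the visible hunk:

# Trimmed-down version of the mapping shown in the diff.
dependent_packages = {
    "pandas": ("1.3.5", "metrics, datasets"),
    "scipy": ("1.14.1", "metrics"),
    "black": ("22.3", "tests"),
}

# One bucket per extra, filled from each package's extras list.
tag_to_packages: dict = {
    extra: [] for extra in ["install", "docs", "tests", "metrics", "datasets"]
}
for package, (min_version, extras) in dependent_packages.items():
    for extra in extras.split(", "):
        tag_to_packages[extra].append(f"{package}>={min_version}")  # assumed format

print(tag_to_packages["metrics"])  # ['pandas>=1.3.5', 'scipy>=1.14.1']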

xai_ranking/datasets/_make_synthetic.py

Lines changed: 11 additions & 7 deletions
@@ -8,9 +8,13 @@
 def fetch_synthetic_data(synth_dt_version=2, item_num=1000):
     # Feature names
     column_names = ["n1", "n2", "n3"]
-
+
     # Check if files exist, if not we will make them
-    filepath = join(dirname(abspath(__file__)), "files", f"Synthetic_{synth_dt_version}_{item_num}.txt")
+    filepath = join(
+        dirname(abspath(__file__)),
+        "files",
+        f"Synthetic_{synth_dt_version}_{item_num}.txt",
+    )
 
     if Path(filepath).is_file():
         df = pd.read_csv(
@@ -22,7 +26,7 @@ def fetch_synthetic_data(synth_dt_version=2, item_num=1000):
     else:
         # Make index names
        ind = range(0, item_num)
-
+
         # Make features based on synthetic data version passed
         if synth_dt_version == 0:
             # All features are independent
@@ -38,7 +42,7 @@ def fetch_synthetic_data(synth_dt_version=2, item_num=1000):
             corr = -0.8
             cov1_2 = math.sqrt(var[0]) * math.sqrt(var[1]) * corr
             covs = [[var[0], cov1_2, 0], [cov1_2, var[1], 0], [0, 0, var[2]]]
-            features = np.random.multivariate_normal(means, covs, item_num)
+            features = np.random.multivariate_normal(means, covs, item_num)
         elif synth_dt_version == 2:
             # Features 1 & 2 are negatively correlated
             # Feature 1 & 3 are positively correlated
@@ -57,15 +61,15 @@ def fetch_synthetic_data(synth_dt_version=2, item_num=1000):
             features = np.random.multivariate_normal(means, covs, item_num)
         else:
             return None
-
+
         # Make dataframe
         df = pd.DataFrame(features, columns=column_names, index=ind)
-
+
         # Normalize data
         for series_name, series in df.items():
             df[series_name] = (series - series.min()) / (series.max() - series.min())
 
         # Write to file
         df.to_csv(filepath, index=False, header=False)
 
-    return df
+    return df
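The re-wrapped calls above sit inside the generator for correlated synthetic features: a pairwise covariance is derived from the two standard deviations and a target correlation, assembled into a covariance matrix, sampled with np.random.multivariate_normal, and the resulting columns are min-max normalised. A minimal sketch of that recipe; the means and variances are illustrative, since the repo's actual values fall outside the visible hunks:

import math

import numpy as np
import pandas as pd

item_num = 1000
means = [0.0, 0.0, 0.0]  # illustrative
var = [1.0, 1.0, 1.0]  # illustrative
corr = -0.8  # features 1 & 2 negatively correlated, as in the diff

# cov(X1, X2) = sd1 * sd2 * corr; the third feature stays independent.
cov1_2 = math.sqrt(var[0]) * math.sqrt(var[1]) * corr
covs = [[var[0], cov1_2, 0], [cov1_2, var[1], 0], [0, 0, var[2]]]

features = np.random.multivariate_normal(means, covs, item_num)
df = pd.DataFrame(features, columns=["n1", "n2", "n3"])

# Min-max normalisation, column by column, mirroring the function's tail.
for series_name, series in df.items():
    df[series_name] = (series - series.min()) / (series.max() - series.min())

print(df.corr().round(2))  # the (n1, n2) entry should be close to -0.8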

xai_ranking/metrics/_base.py

Lines changed: 26 additions & 17 deletions
@@ -5,6 +5,7 @@
 from sharp.utils import scores_to_ordering
 import pandas as pd
 
+
 # Not reviewed
 # Returns neighbors that are either close or far ranking wise
 # AND subselects the top n neighbors in terms of feature similarity
@@ -23,7 +24,7 @@ def _find_neighbors(
         & (rankings <= max_ranking)
         & (rankings != row_rank)
     )
-    else: # Select neighbors that are far ranking wise
+    else:  # Select neighbors that are far ranking wise
         mask = (rankings < min_ranking) | (rankings > max_ranking)
     data_neighbors = np.array(original_data)[mask]
     cont_neighbors = np.array(contributions)[mask]
@@ -41,7 +42,9 @@
 # Not reviewed
 # Returns all neighbors that are similar feature wise
 # The Euclidean distance between items has to be under the threshold
-def _find_all_neighbors(original_data, rankings, contributions, row_idx, threshold=None):
+def _find_all_neighbors(
+    original_data, rankings, contributions, row_idx, threshold=None
+):
     row_data = np.array(original_data)[row_idx]
 
     data_neighbors = np.array(original_data)
@@ -66,11 +69,11 @@ def _find_all_neighbors(original_data, rankings, contributions, row_idx, thresho
     )
     # Or return distances from all items
     return (
-        data_neighbors,
-        cont_neighbors,
-        rank_neighbors,
-        distances,
-    )
+        data_neighbors,
+        cont_neighbors,
+        rank_neighbors,
+        distances,
+    )
 
 
 # Reviewed
@@ -79,10 +82,12 @@ def _get_importance_mask(row_cont, threshold):
         # Calculate order of absolute contributions
         row_abs = np.abs(row_cont)
         # Find n=threshold largest items
-        res = sorted(row_abs.index.values, key = lambda sub: row_abs[sub])[-threshold:]
+        res = sorted(row_abs.index.values, key=lambda sub: row_abs[sub])[-threshold:]
         # Set mask
-        mask = pd.Series(data=[True if i in res else False for i in row_cont.index.values],
-            index=row_cont.index.values)
+        mask = pd.Series(
+            data=[True if i in res else False for i in row_cont.index.values],
+            index=row_cont.index.values,
+        )
     else:
         # Calculate cumulative absolute contribution order
         total_contribution = np.sum(np.abs(row_cont))
@@ -128,10 +133,12 @@ def kendall_similarity(a, b):
     idx_pair = list(combinations(range(len(a)), 2))
     val_pair_a = [(a[i], a[j]) for i, j in idx_pair if a[i] != a[j]]
     val_pair_b = [(b[i], b[j]) for i, j in idx_pair if b[i] != b[j]]
-    inversions=0
+    inversions = 0
     for (val11, val12), (val21, val22) in zip(val_pair_a, val_pair_b):
-        if ((val11 > val12) and (val21 < val22)) or ((val11 < val12) and (val21 > val22)):
-            inversions = inversions+1
+        if ((val11 > val12) and (val21 < val22)) or (
+            (val11 < val12) and (val21 > val22)
+        ):
+            inversions = inversions + 1
     kt = 1 - (2 * inversions) / normalizer
     return (kt + 1) / 2
 
@@ -223,7 +230,7 @@ def row_wise_jaccard(results1, results2, n_features):
     >>> n_features = 2
     >>> row_wise_jaccard(results1, results2, n_features)
     """
-
+
     if n_features is None:
         n_features = results1.shape[1]
 
@@ -246,9 +253,9 @@ def row_wise_euclidean(results1, results2, normalization=True):
         # Make vectors into unit vectors
         v1 = normalize([results1])[0]
         v2 = normalize([results2])[0]
-        return euclidean(v1,v2)/2
+        return euclidean(v1, v2) / 2
     else:
-        return euclidean(results1,results2)
+        return euclidean(results1, results2)
 
 
 # Reviewed
@@ -279,7 +286,8 @@ def euclidean_agreement(results1, results2, normalization):
     vectors in `results1` and `results2` using the Euclidean distance.
     """
     return results1.reset_index(drop=True).apply(
-        lambda row: 1 - row_wise_euclidean(row, results2.iloc[row.name], normalization), axis=1
+        lambda row: 1 - row_wise_euclidean(row, results2.iloc[row.name], normalization),
+        axis=1,
    )
 
 
@@ -315,6 +323,7 @@ def kendall_agreement(results1, results2):
         lambda row: row_wise_kendall(row, results2.iloc[row.name]), axis=1
     )
 
+
 # Reviewed
 def jaccard_agreement(results1, results2, n_features=0.8):
     """

xai_ranking/metrics/_consistency.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ def bootstrapped_explanation_consistency(
     sem = np.std(batch_agreement) / np.sqrt(batch_agreement.size)
     return mean, sem
 
+
 # Reviewed
 def cross_method_explanation_consistency(
     results1, results2, measure="kendall", **kwargs
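The visible tail of bootstrapped_explanation_consistency summarises a vector of per-batch agreement scores by its mean and standard error of the mean. A tiny sketch of that summary step on made-up agreement values:

import numpy as np

batch_agreement = np.array([0.81, 0.78, 0.84, 0.80, 0.79])  # illustrative scores

mean = np.mean(batch_agreement)
# Standard error of the mean, exactly as in the diffed line.
sem = np.std(batch_agreement) / np.sqrt(batch_agreement.size)
print(mean, sem)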

xai_ranking/metrics/_fidelity.py

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 import numpy as np
 
+
 # Reviewed
 def outcome_fidelity(
     contributions, target, avg_target, target_max=1, target_pairs=None, rank=True
@@ -21,7 +22,7 @@ def outcome_fidelity(
             better_than = target < target_pairs
         else:
             better_than = target > target_pairs
-
+
         est_better_than = contributions.sum(axis=1) > 0
         avg_est_err = (better_than == est_better_than).mean()
         return avg_est_err
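The second hunk touches the pairwise branch of outcome_fidelity: each item is compared with a paired item, and the explanation counts as faithful when the sign of its summed contributions agrees with whether the item actually beats its pair. The branch using `<` presumably handles rank targets, where lower is better; the sketch below follows the score-based `>` branch with illustrative arrays:

import numpy as np

target = np.array([0.9, 0.4, 0.7])  # scores of the explained items (illustrative)
target_pairs = np.array([0.5, 0.6, 0.2])  # scores of the paired items (illustrative)
contributions = np.array(
    [[0.3, 0.1], [-0.2, -0.1], [0.4, -0.1]]  # per-feature contributions per item
)

# Same logic as the diffed lines: does the aggregated explanation point the same
# way as the actual pairwise outcome?
better_than = target > target_pairs
est_better_than = contributions.sum(axis=1) > 0
avg_est_err = (better_than == est_better_than).mean()
print(avg_est_err)  # fraction of pairs where explanation and outcome agree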

xai_ranking/metrics/_sensitivity.py

Lines changed: 1 addition & 1 deletion
@@ -190,7 +190,7 @@ def row_wise_explanation_sensitivity_all_neighbors(
 
 
 # Calculates the explanation sensitivity of every row of original data and its
-# closest neighbors,
+# closest neighbors,
 def explanation_sensitivity(
     original_data,
     contributions,
