Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,11 @@ jobs:
pip3 install -r requirements.txt
python3 -m tests.unit.TestBiasDetector

- run:
name: Unit Tests for Risk Calculator
command: |
python3 -m tests.unit.TestRiskCalculator

workflows:
version: 2
test_and_run:
Expand Down
7 changes: 6 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,13 @@ The method `compare_root_variable_conditioned_groups` performs the same calculat

For both methods the output is a tuple similar to the ones described for the FreqsVsRef method, but with an additional element given by the standard deviation of the distances, provided only when the root variable is a multi-class feature.

## Focus on the Risk Analysis
With the latest version (1.2) we introduce a Risk measurement, based on the results of the Bias Detector. Currently it is only accessible via the Python API, but we plan to add a frontend option for it. It is implemented in the `brio/risk` sub-module.

The methodology behind the risk computation will soon be published in a scientific paper. If you want to experiment with it already, you can use the method `compute_hazard_from_freqvsfreq_or_freqvsref` of the class `HazardFromBiasDetectionCalculator` upon each result from FreqVsFreq and FreqVsRef. The computed hazards need to be passed to `compute_risk` from `RiskCalculator`: this function will provide an overall measure of risk.

## What's next
Currently (September 2023) we plan to implement functionalities for the Opacity section, which is now empty. Furthermore, we want to introduce a risk measurement analysis, which will provide an overall risk assessment of a model using a series of bias and opacity checks.
Currently (June 2024) we plan to implement functionalities for the Opacity section, which is now empty, and a more refined and accessible Risk section.

## Call to action!
We hope to raise interest in the data science community and ask for support! Anyone interested in expanding and improving our tool is more than welcome! You can do that by opening a pull request for a functionality you wish to include. Bug reports are also very important and welcome.
Expand Down
2 changes: 1 addition & 1 deletion VERSION.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.0.0
1.2.0
86 changes: 47 additions & 39 deletions brio/bias/FreqVsFreqBiasDetector.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .BiasDetector import BiasDetector
from .threshold_calculator import threshold_calculator
from sklearn.utils.extmath import cartesian
from itertools import combinations
from itertools import combinations, compress
from scipy.spatial.distance import jensenshannon
import numpy as np

Expand Down Expand Up @@ -45,16 +45,22 @@ def compute_distance_between_frequencies(self,
for pair in combinations(observed_distribution, 2):
# Squaring JS given that the scipy implementation has square root
distance = jensenshannon(p=pair[0], q=pair[1], base=2)**2
distances.append(distance)
# If no observation are present for one class, the JS distance will be a nan.
# Changing into None to keep the functionalities of Risk Measurement.
if np.isnan(distance):
distances.append(None)
else:
distances.append(distance)
else:
raise Exception("Only TVD or JS are supported as distances for freq_vs_freq analysis")

overall_distance = self.aggregating_function(distances)

if len(distances) > 1:
#Keeping all the not None distances (relevant for multi-classe root variable scenarios)
distances_not_none = list(compress(distances, [d is not None for d in distances]))
overall_distance = self.aggregating_function(distances_not_none) if len(distances_not_none) > 0 else None
if len(distances_not_none) > 1:
## Computing the standard deviation of the distances in case of
# multi-class root_variable
return overall_distance, np.std(distances)
return overall_distance, np.std(distances_not_none)
else:
return overall_distance, None

Expand Down Expand Up @@ -158,13 +164,11 @@ def compare_root_variable_conditioned_groups(self,

# Second parameter for threshold calculator
A2 = len(root_variable_labels)

conditioned_frequencies = {}

conditioning_variables_subsets = list(self.powerset(conditioning_variables))

# All the possible subsets of conditioning variables are inspected. The first one
# is excluded being the empty set.
conditioned_frequencies = {}
for conditioning_variables_subset in conditioning_variables_subsets[1:]:

combinations = cartesian([dataframe[v].unique() for v in conditioning_variables_subset])
Expand All @@ -177,48 +181,52 @@ def compare_root_variable_conditioned_groups(self,
dataframe_subset = dataframe.query(condition)
num_of_obs = dataframe_subset.shape[0]

if num_of_obs >= min_obs_per_group:
if self.target_variable_type == 'class':
conditioned_frequencies[condition] = (
num_of_obs,
self.get_frequencies_list(
dataframe_subset,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels)[0] #taking the relative freqs, the absolute freqs are not needed here
)
elif self.target_variable_type == 'probability':
conditioned_frequencies[condition] = (
num_of_obs,
self.get_frequencies_list_from_probs(
dataframe_subset,
target_variable,
root_variable,
root_variable_labels,
n_bins)[0] #taking the relative freqs, the absolute freqs are not needed here
)

else:
conditioned_frequencies[condition] = (num_of_obs, None)
if self.target_variable_type == 'class':
conditioned_frequencies[condition] = (
num_of_obs,
self.get_frequencies_list(
dataframe_subset,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels)[0] #taking the relative freqs, the absolute freqs are not needed here
)
elif self.target_variable_type == 'probability':
conditioned_frequencies[condition] = (
num_of_obs,
self.get_frequencies_list_from_probs(
dataframe_subset,
target_variable,
root_variable,
root_variable_labels,
n_bins)[0] #taking the relative freqs, the absolute freqs are not needed here
)

distances = {
# group: (number_of_observations, (overall_distance, standard_deviations) )
group: (
(obs_and_freqs[0],
self.compute_distance_between_frequencies(obs_and_freqs[1]) # (distance, standard_deviations)
) if obs_and_freqs[1] is not None else (obs_and_freqs[0], None)
)
) for group, obs_and_freqs in conditioned_frequencies.items()
}

results = {group: (
(

results = {}
for group, obs_and_dist in distances.items():
# Too small groups
if obs_and_dist[0] < min_obs_per_group:
result = (obs_and_dist[0], None, 'Not enough observations')
# Groups for which distance is not defined (only one class available, JS computed)
elif obs_and_dist[1][0] is None:
result = (obs_and_dist[0], None, 'Distance non defined')
else:
result = (
obs_and_dist[0], #This will also be the A3 for threshold_calculator, being it the number of obs of the group
obs_and_dist[1][0], #distance
obs_and_dist[1][0]<=threshold_calculator(A1=self.A1, A2=A2, A3=obs_and_dist[0], default_threshold=threshold),
threshold_calculator(A1=self.A1, A2=A2, A3=obs_and_dist[0], default_threshold=threshold),
obs_and_dist[1][1] #standard deviation
) if obs_and_dist[1] is not None else (obs_and_dist[0], obs_and_dist[1], 'Not enough observations')
) for group, obs_and_dist in distances.items()}
)
results[group] = result

return results
88 changes: 50 additions & 38 deletions brio/bias/FreqVsRefBiasDetector.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,13 @@ def compute_distance_from_reference(self,
kl = kl_elementwise.sum()
else:
raise Exception("Only 'no','zero' and 'laplace' are supported as divergence adjustment methods.")

divergence = self.normalization_function(kl)
divergences.append(divergence)

if np.isnan(divergence):
divergences.append(None)
else:
divergences.append(divergence)

return divergences

Expand Down Expand Up @@ -230,34 +235,30 @@ def compare_root_variable_conditioned_groups(self,
dataframe_subset = dataframe.query(condition)
num_of_obs = dataframe_subset.shape[0]

if num_of_obs >= min_obs_per_group:
if self.target_variable_type == 'class':
freqs, abs_freqs = self.get_frequencies_list(
dataframe_subset,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels)
conditioned_frequencies[condition] = (
num_of_obs,
freqs,
[sum(x) for x in abs_freqs]
)
elif self.target_variable_type == 'probability':
freqs, abs_freqs = self.get_frequencies_list_from_probs(
dataframe_subset,
target_variable,
root_variable,
root_variable_labels,
n_bins)
conditioned_frequencies[condition] = (
num_of_obs,
freqs,
[sum(x) for x in abs_freqs]
)

else:
conditioned_frequencies[condition] = (num_of_obs, None)
if self.target_variable_type == 'class':
freqs, abs_freqs = self.get_frequencies_list(
dataframe_subset,
target_variable,
target_variable_labels,
root_variable,
root_variable_labels)
conditioned_frequencies[condition] = (
num_of_obs,
freqs,
[sum(x) for x in abs_freqs]
)
elif self.target_variable_type == 'probability':
freqs, abs_freqs = self.get_frequencies_list_from_probs(
dataframe_subset,
target_variable,
root_variable,
root_variable_labels,
n_bins)
conditioned_frequencies[condition] = (
num_of_obs,
freqs,
[sum(x) for x in abs_freqs]
)

distances = {
group: (
Expand All @@ -266,17 +267,28 @@ def compare_root_variable_conditioned_groups(self,
self.compute_distance_from_reference(observed_distribution=obs_and_freqs[1],
reference_distribution=reference_distribution,
n_obs=obs_and_freqs[2])
) if obs_and_freqs[1] is not None else (obs_and_freqs[0], None)
)
) for group, obs_and_freqs in conditioned_frequencies.items()
}

results = {group: (
(
obs_and_dist[0],
obs_and_dist[1],
[d<=threshold_calculator(A1=self.A1, A2=A2, A3=obs_and_dist[0], default_threshold=threshold) for d in obs_and_dist[1]],

results = {}
for group, obs_and_dist in distances.items():
# Too small groups
if obs_and_dist[0] < min_obs_per_group:
result = (obs_and_dist[0], [None for d in obs_and_dist[1]], 'Not enough observations')
else:
result = (
obs_and_dist[0], #obs
obs_and_dist[1], #distance
[d<=threshold_calculator(
A1=self.A1,
A2=A2,
A3=obs_and_dist[0],
default_threshold=threshold
) if d is not None else 'Distance not defined' for d in obs_and_dist[1]],
threshold_calculator(A1=self.A1, A2=A2, A3=obs_and_dist[0], default_threshold=threshold)
) if obs_and_dist[1] is not None else (obs_and_dist[0], obs_and_dist[1], 'Not enough observations')
) for group, obs_and_dist in distances.items()}
)

results[group] = result

return results
96 changes: 96 additions & 0 deletions brio/risk/HazardFromBiasDetectionCalculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import numpy as np

class HazardFromBiasDetectionCalculator:

    '''
    This class manages the calculation of hazards for the
    tests within the Bias module
    '''

    def as_list(self, x):
        '''Return x unchanged if it is already a list, otherwise wrap it in one.

        Used to treat single-reference (FreqVsFreq) and multi-reference
        (FreqVsRef) results uniformly.
        '''
        if isinstance(x, list):
            return x
        return [x]

    def compute_hazard_from_freqvsfreq_or_freqvsref(
            self,
            overall_result,
            conditioned_results,
            tot_observations,
            conditioning_variables,
            weight_logic="group"):

        '''
        Computes the hazard for a FreqVsFreq or a FreqVsRef analysis.

        Args:
            overall_result: tuple with non-conditioned results from FreqVs* analysis
                (element 0: distance(s), element 1: below-threshold boolean(s),
                element 2: threshold)
            conditioned_results: dict with conditioned results from FreqVs* analysis,
                keyed by the query string of each conditioned group
                #TODO handle when only overall result is available
            tot_observations: num, total number of data points analyzed
            conditioning_variables: list, conditioning variables used in FreqVs* analysis
            weight_logic: str, either "group" or "individual"; determines how much
                each single test weighs on the hazard result

        Returns:
            hazard_overall: num, the hazard accumulated over every reference
                distribution (a single iteration in the FreqVsFreq case)

        Raises:
            Exception: if weight_logic is neither "group" nor "individual"
        '''

        # Validate once up front instead of re-checking inside the loops
        # (the original raised the same Exception from within the iteration).
        if weight_logic not in ("group", "individual"):
            raise Exception('Only "group" or "individual" are allowed for parameter weight_logic')

        # tot number of features = conditioning variables + root variable (+1)
        n_features_total = len(conditioning_variables) + 1

        hazard_overall = 0
        # Iterating over each reference distribution, if available (FreqVsRef).
        # In case of FreqVsFreq, there will be a single iteration.
        num_iterations = len(self.as_list(overall_result[0]))
        for k in range(num_iterations):

            # Each entry: (test result, threshold, num_samples, boolean, num_used_features)
            #TODO use dict instead, and use explicit keys for readability
            test_results = [(
                self.as_list(overall_result[0])[k],
                overall_result[2],
                tot_observations,
                self.as_list(overall_result[1])[k],
                1  # for the overall test, only 1 feature used: the root variable
            )]

            for group_name, group in conditioned_results.items():
                # Skip groups whose distance is undefined (e.g. too few observations)
                if self.as_list(group[1])[k] is not None:
                    test_results.append((
                        self.as_list(group[1])[k],       # test result (distance)
                        group[3],                        # threshold
                        group[0],                        # num_samples
                        self.as_list(group[2])[k],       # boolean (distance below threshold)
                        len(group_name.split("&")) + 1   # num_used_features, conditioning + root
                    ))

            if weight_logic == "group":
                # T_i in Risk Function document
                weight_denominator = sum(n_features_total - line[4] + 1 for line in test_results)
            else:  # "individual", already validated above
                # S_i in Risk Function document
                weight_denominator = np.sum([x[4] for x in test_results])

            hazard = 0
            for line in test_results:
                if weight_logic == "group":
                    c_info = n_features_total - line[4] + 1
                    weight = c_info / weight_denominator
                else:
                    weight = line[4] / weight_denominator

                # Only failed tests contribute: delta is 1 only for an explicit
                # False boolean. Kept as `== False` on purpose so that string
                # markers such as 'Distance not defined' give delta == 0.
                delta = 1 if line[3] == False else 0
                q = line[2] / tot_observations            # fraction of data in the group
                e = line[0] - line[1]                     # excess of distance over threshold
                hazard += delta * weight * q * abs(e)**(1./3.) * line[1]**(1./3.)

            hazard_overall += hazard

        return hazard_overall
20 changes: 20 additions & 0 deletions brio/risk/RiskCalculator.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import numpy as np

class RiskCalculator:

    def compute_risk(self, test_hazards):
        '''
        Computes the overall risk using the hazards coming from the
        different Bias and Opacity tests.

        The overall risk is the arithmetic mean of the provided hazards.

        Args:
            test_hazards: list of hazards computed for a set of tests

        Returns:
            risk: num, the overall measure of risk

        Raises:
            ValueError: if test_hazards is empty (the unguarded division
                would otherwise silently produce NaN with a numpy warning)
        '''
        if len(test_hazards) == 0:
            raise ValueError("test_hazards must contain at least one hazard value")

        risk = np.sum(test_hazards) / len(test_hazards)

        return risk
File renamed without changes.
Loading