Add group feature: filter the report's SHAP charts by per-sample group labels

CloseChoice · CloseChoice · commit 457d8dd9057d · 2025-07-19T01:19:43.000+02:00
diff --git a/src/xaiflow/mlflow_plugin.py b/src/xaiflow/mlflow_plugin.py
@@ -32,6 +32,7 @@ def log_feature_importance_report(
         shap_values: Explanation,
         feature_encodings: Optional[Dict[str, Dict[int, str]]] = None,
         importance_values: List[float] | np.ndarray = None,
+        group_labels: Optional[List[str]] = None,
         run_id: Optional[str] = None,
         artifact_path: str = "reports",
         report_name: str = "feature_importance_report.html",
@@ -44,6 +45,7 @@ def log_feature_importance_report(
             feature_names: List of feature names
             importance_values: List of importance values corresponding to features
             shap_values: Optional SHAP values matrix (samples x features)
+            group_labels: Optional list with one group label per sample; its length must equal the number of samples in shap_values
             run_id: MLflow run ID (uses active run if None)
             artifact_path: Path within MLflow artifacts to store the report
             report_name: Name of the HTML report file
@@ -74,6 +76,10 @@ def log_feature_importance_report(
                 shap_values = shap_values[..., -1]
                 base_values = float(base_values[-1])
 
+        if group_labels is not None:
+            if len(group_labels) != shap_values.shape[0]:
+                raise ValueError("group_labels length must match the number of samples in shap_values.")
+
         # Use active run if no run_id provided
         if run_id is None:
             active_run = mlflow.active_run()
@@ -105,6 +111,7 @@ def log_feature_importance_report(
             html_content = self._generate_html_content(
                 importance_data=importance_data,
                 shap_values=shap_values,
+                group_labels=group_labels or [],  # Default to empty list if None
                 feature_values=feature_values,
                 base_values=base_values,
                 feature_encodings=feature_encodings,
@@ -143,6 +150,7 @@ def _generate_html_content(
         importance_data: Dict[str, Any],
         shap_values: List[List[float]],
         feature_values: List[float] = None,
+        group_labels: List[str] = None,
         base_values: List[float] = None,
         feature_encodings: Optional[Dict[str, Dict[int, str]]] = None,
         feature_names: List[str] = None
@@ -202,6 +210,7 @@ def _generate_html_content(
             timestamp=current_time,
             importance_data=importance_data,  # Pass as Python dict
             shap_values=shap_values,  # Pass as Python list
+            group_labels=group_labels or [],  # Pass as Python list or empty list
             feature_values=feature_values,  # Pass as Python list or None
             base_values=base_values or [0] * 10,  # Todo: fix this once we hand over numpy arrays
             feature_encodings=feature_encodings or {},  # Pass as optional dict
diff --git a/src/xaiflow/templates/assets/bundle.js b/src/xaiflow/templates/assets/bundle.js
diff --git a/src/xaiflow/templates/assets/bundle.js.map b/src/xaiflow/templates/assets/bundle.js.map
diff --git a/src/xaiflow/templates/components/ChartManager.svelte b/src/xaiflow/templates/components/ChartManager.svelte
@@ -12,6 +12,7 @@
     baseValues: number[] | number; // Base values for SHAP calculations
     featureNames?: string[]; // Optional prop for feature names
     isHigherOutputBetter?: boolean; // Optional prop to determine if higher output is better
+    groupLabels: string[]; // Optional prop for group labels
   }
   
   let { importanceData,
@@ -21,11 +22,35 @@
         baseValues,
         featureNames,
         isHigherOutputBetter,
+        groupLabels,
        }: Props = $props();
   
   // Reactive state for selected label using $state
   let selectedLabel: string | null = $state(null);
   let showDeepDive = $state(false);
+  let selectedGroup: string | null = $state(null);
+  // Distinct group labels in first-seen order; empty when no labels were passed.
+  let uniqueGroups: string[] = $derived(Array.from(new Set(groupLabels || [])));
+
+  // A filter applies only when a concrete group is chosen: null and "" (the
+  // value of the "All" option) as well as the literal "All" mean "no filter".
+  let isGroupFilterActive = $derived(
+    Boolean(selectedGroup) && selectedGroup !== "All"
+  );
+
+  // Keep the rows whose group label equals the selected group. Inputs that are
+  // not arrays (e.g. a missing featureValues) are returned untouched instead of
+  // throwing on `.filter`, so selecting a group cannot crash the report.
+  function filterBySelectedGroup<T>(rows: T): T {
+    if (!isGroupFilterActive || !Array.isArray(rows) || !Array.isArray(groupLabels)) {
+      return rows;
+    }
+    return rows.filter((_, idx) => groupLabels[idx] === selectedGroup) as T;
+  }
+
+  // Per-sample SHAP rows and feature rows restricted to the selected group.
+  let selectedShapValues = $derived(filterBySelectedGroup(shapValues));
+  let selectedFeatureValues = $derived(filterBySelectedGroup(featureValues));
 
   console.log("ChartManager", importanceData);
   console.log('ChartManager: 1/4 command in file');
@@ -54,9 +79,22 @@
 </script>
 
 <div class="chart-manager">
-  <div style="display: flex; gap: 1.5rem; align-items: center; margin-bottom: 1.5rem;">
-    <button type="button" on:click={() => showDeepDive = false} class:selected={!showDeepDive}>Charts</button>
-    <button id="deepdive-button" type="button" on:click={() => showDeepDive = true} class:selected={showDeepDive}>Deep Dive</button>
+  <div style="display: flex; gap: 1.5rem; align-items: center; margin-bottom: 1.5rem; justify-content: space-between;">
+    <div style="display: flex; gap: 1.5rem; align-items: center;">
+      <button type="button" on:click={() => showDeepDive = false} class:selected={!showDeepDive}>Charts</button>
+      <button id="deepdive-button" type="button" on:click={() => showDeepDive = true} class:selected={showDeepDive}>Deep Dive</button>
+    </div>
+    {#if uniqueGroups.length > 0}
+      <div style="margin-left: auto;">
+        <label for="group-dropdown" style="margin-right: 0.5em; font-size: 1em;">Group:</label>
+        <select id="group-dropdown" bind:value={selectedGroup} on:change={(e) => selectedGroup = e.target.value} style="font-size: 1em; padding: 0.3em 0.7em;">
+          <option value="">All</option>
+          {#each uniqueGroups as group}
+            <option value={group}>{group}</option>
+          {/each}
+        </select>
+      </div>
+    {/if}
   </div>
   {#if !showDeepDive}
     <div class="charts-row">
@@ -75,8 +113,8 @@
         <h3>SHAP Values</h3>
         <div class="chart-container">
           <ScatterShapValues 
-            shapValues={shapValues} 
-            featureValues={featureValues}
+            shapValues={selectedShapValues} 
+            featureValues={selectedFeatureValues}
             bind:selectedFeatureIndex={selectedFeatureIndex} 
             bind:selectedFeature={selectedLabel}
             isHigherOutputBetter={true} 
@@ -87,8 +125,8 @@
     </div>
   {:else}
     <DeepDiveManager
-      shapValues={shapValues}
-      featureValues={featureValues}
+      shapValues={selectedShapValues}
+      featureValues={selectedFeatureValues}
       selectedFeatureIndex={selectedFeatureIndex}
       selectedFeature={selectedLabel}
       baseValues={baseValues}
diff --git a/src/xaiflow/templates/components/DeepDiveManager.svelte b/src/xaiflow/templates/components/DeepDiveManager.svelte
@@ -32,7 +32,7 @@
     });
     let selectedObservationIndex = $state(0);
     let currentPage = $state(0);
-    let totalObservations = shapValues.length;
+    let totalObservations = $derived(shapValues.length);
     let filterText = $state("");
 
     let allObservations = $derived(
diff --git a/src/xaiflow/templates/report.html b/src/xaiflow/templates/report.html
@@ -289,6 +289,7 @@ <h1>Xflow report by cloudexplain</h1>
             const baseValues = {{ base_values | safe }};
             const featureEncodings = {{ feature_encodings | safe }};
             const featureNames = {{ feature_names | safe }};
+            const groupLabels = {{ group_labels | tojson }}; // JSON-serialized so labels containing quotes or markup cannot break out of the script
             
             // Initialize ChartManager with all props needed for both managers
             if (window.ChartManager && importanceData) {
@@ -306,7 +307,8 @@ <h1>Xflow report by cloudexplain</h1>
                             featureValues: featureValues,
                             baseValues: baseValues,
                             featureEncodings: featureEncodings,
-                            featureNames: featureNames
+                            featureNames: featureNames,
+                            groupLabels: groupLabels,
                         }
                     });
                     console.log('ChartManager with DeepDiveManager mounted successfully!');
diff --git a/tests/test_mlflow_plugin.py b/tests/test_mlflow_plugin.py
@@ -207,59 +207,6 @@ def test_no_feature_encodings():
     )
     return html_content
 
-@save_and_click_canvas_wrapper
-def test_fix_previous_bug():
-    importanceData = {'features':
-                      ['acv_score_canc_30d',
-                      'avg_canc_dealer_no_weighted',
-                      'ctr_usa_sec_inc_voice_a6m',
-                      'avg_canc_reseller_id_weighted',
-                      'ctr_usa_kb_data_usg_a3m',
-                      'ctr_sales_channel_current',
-                      'ctr_cancellations_per_year',
-                      'avg_vvl_reseller_id_weighted',
-                      'ctr_start_days',
-                      'ctr_min_duration_date_crm_days',
-                      'rlz',
-                      'vvl_l_event_days',
-                      'avg_vvl_sales_channel',
-                      'ctr_dealer_no_current',
-                      'avg_canc_sales_channel',
-                      'prt_cancellation_page_visit_90d_count',
-                      'acv_score_vvl_30d'],
-                      'values': [0.5000000000000614,
-                                 0.49999999999993844]}
-    shapValues = [[0.05231021109253422, -0.05231021109253736], [0.0073606489440402965, -0.007360648944034653], [-0.01633880222219225, 0.016338802222170094], [0.012322311243639975, -0.012322311243637033], [-0.004445322661143976, 0.004445322661143468], [0.0009611405151175154, -0.0009611405151178431], [0.005596997502669034, -0.0055969975026683915], [-0.0008618250588141368, 0.0008618250587932731], [0.0016991238750754237, -0.0016991238750824476], [0.0048252568432011304, -0.004825256843199152], [-0.00038499217151075256, 0.00038499217151299176], [0.005172501948575322, -0.005172501948575318], [-0.003383349580534422, 0.003383349580535079], [-0.017147577240666855, 0.017147577240670973], [0.008064862968425773, -0.008064862968423504], [0.0018500348673166761, -0.0018500348673163927], [0.006529750924148127, -0.006529750924151195]]
-    featureValues = [0.03421833738684654, 0.022704629679359795, 15.0, 0.022704629679359795, 30193.0, 241.0, 0.0, 0.022739316468840073, 951.5416666666666, 2912717.0, 30.0, 9999.0, 0.019356054262267625, 9.0, 0.02191634567074192, 0.0, 0.2959745228290558]
-    baseValues = [0.9058690282100686, 0.09413097178993132]
-    featureEncodings = None
-    featureNames = ['acv_score_canc_30d',
-                    'avg_canc_dealer_no_weighted',
-                    'ctr_usa_sec_inc_voice_a6m',
-                    'avg_canc_reseller_id_weighted',
-                    'ctr_usa_kb_data_usg_a3m',
-                    'ctr_sales_channel_current',
-                    'ctr_cancellations_per_year',
-                    'avg_vvl_reseller_id_weighted',
-                    'ctr_start_days',
-                    'ctr_min_duration_date_crm_days',
-                    'rlz',
-                    'vvl_l_event_days',
-                    'avg_vvl_sales_channel',
-                    'ctr_dealer_no_current',
-                    'avg_canc_sales_channel',
-                    'prt_cancellation_page_visit_90d_count',
-                    'acv_score_vvl_30d']
-    plugin = XaiflowPlugin()
-    html_content = plugin._generate_html_content(
-        importance_data=importanceData,
-        shap_values=shapValues,
-        feature_values=featureValues,
-        feature_encodings=featureEncodings,
-        feature_names=featureNames,
-    )
-    return html_content
-
 
 def test_classification_case(mocker):
     X, y = shap.datasets.adult(n_points=200)
@@ -322,4 +269,61 @@ def __exit__(self, exc_type, exc_val, exc_tb):
             feature_names=list(X.columns),
         )
         html_content_click_test(Path(output_path))
+    # return html_content
+
+
+def test_classification_case_check_list_feature(mocker):
+    X, y = shap.datasets.adult(n_points=200)
+
+    # Identify categorical columns
+    categorical_cols = [col for col in X.columns if X[col].dtype == 'category' or X[col].dtype == 'object']
+    numeric_cols = [col for col in X.columns if col not in categorical_cols]
+
+    label_encoders = {}
+
+    # Fill missing values manually
+    for col in numeric_cols:
+        X[col] = X[col].astype(float).fillna(X[col].mean())
+    for col in categorical_cols:
+        le = LabelEncoder()
+        X[col + '_encoded'] = le.fit_transform(X[col].astype(str))  # convert to string in case of NaNs
+        label_encoders[col] = le  # Save encoder if needed later
+
+    # Train model
+    rfc = RandomForestClassifier()
+    rfc.fit(X, y)
+    ex = shap.TreeExplainer(rfc)
+    shap_values = ex(X)
+    plugin = XaiflowPlugin()
+
+    feature_encodings = {}
+    for col in categorical_cols:
+        feature_encodings[col + '_encoded'] = dict(zip(range(len(label_encoders[col].classes_)), label_encoders[col].classes_))
+    experiment_name = "dummytest"
+    mlflow.set_experiment(experiment_name=experiment_name)
+
+    output_path = f"tests/outputs/test_classification_case_check_list_feature.html"
+    class DummyTmpFile:
+        name = output_path
+        def __enter__(self):
+            self.name = output_path
+            # import pdb; pdb.set_trace()  # Debugging breakpoint
+            return self
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            pass
+
+    mocker.patch("tempfile.NamedTemporaryFile", return_value=DummyTmpFile())
+    mocker.patch("os.unlink")  # Prevent deletion
+
+    # Optionally patch mlflow.log_artifact if you want to avoid real logging
+    mocker.patch("mlflow.log_artifact")
+
+    with mlflow.start_run(run_name="auto_mpg_test"):
+        plugin.log_feature_importance_report(
+            shap_values=shap_values,
+            feature_encodings=feature_encodings,
+            feature_names=list(X.columns),
+            group_labels=["Group 1", "Group 2", "Group 3", "Group 4"] * int(len(shap_values) / 4)  # Example group labels
+        )
+        html_content_click_test(Path(output_path))
     # return html_content