
Commit 29ab4cf

Merge remote-tracking branch 'upstream/hotfixes' into release

2 parents: 0084807 + 47a6b7f

5 files changed: +32 additions, -17 deletions

pm4py/objects/log/util/dataframe_utils.py

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ class Parameters(Enum):
     CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
     CASE_PREFIX = constants.CASE_ATTRIBUTE_PREFIX
     CASE_ATTRIBUTES = "case_attributes"
+    CONSIDER_ALL_ATTRIBUTES = "consider_all_attributes"
     MANDATORY_ATTRIBUTES = "mandatory_attributes"
     MAX_NO_CASES = "max_no_cases"
     MIN_DIFFERENT_OCC_STR_ATTR = "min_different_occ_str_attr"
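The new enum member becomes usable as a key in the usual pm4py parameters dict. A minimal usage sketch, under the assumptions that the Parameters enum shown above is the one consumed by df_features_utils and that some event-log DataFrame df is already available; the default value True keeps attributes even when they do not appear in every case, while False restores the stricter pre-existing filter:

    from pm4py.objects.log.util import dataframe_utils, df_features_utils

    # Sketch only: switch the new flag off to keep only attributes
    # that occur in every case (the behaviour before this commit).
    parameters = {dataframe_utils.Parameters.CONSIDER_ALL_ATTRIBUTES: False}
    selected = df_features_utils.automatic_feature_selection_df(df, parameters=parameters)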

pm4py/objects/log/util/df_features_utils.py

Lines changed: 22 additions & 11 deletions

@@ -75,6 +75,9 @@ def automatic_feature_selection_df(df, parameters=None):
     max_different_occ_str_attr = exec_utils.get_param_value(
         Parameters.MAX_DIFFERENT_OCC_STR_ATTR, parameters, 50
     )
+    consider_all_attributes = exec_utils.get_param_value(
+        Parameters.CONSIDER_ALL_ATTRIBUTES, parameters, True
+    )
 
     cols_dtypes = {x: str(df[x].dtype) for x in df.columns}
     other_attributes_to_retain = set()
@@ -83,9 +86,10 @@ def automatic_feature_selection_df(df, parameters=None):
     for x, y in cols_dtypes.items():
         attr_df = df.dropna(subset=[x])
         this_cases = attr_df[case_id_key].nunique()
+        attr_in_all_cases = this_cases == no_all_cases
 
         # in any case, keep attributes that appears at least once per case
-        if this_cases == no_all_cases:
+        if attr_in_all_cases or consider_all_attributes:
             if "float" in y or "int" in y:
                 # (as in the classic log version) retain always float/int attributes
                 other_attributes_to_retain.add(x)
@@ -197,14 +201,14 @@ def select_string_column(
         )
         # Fill NaN and convert to float32 for all new columns at once
         new_cols = crosstab.columns.tolist()
-        for col_name in new_cols:
-            if col_name in fea_df.columns:
-                fea_df[col_name] = fea_df[col_name].fillna(0).astype(np.float32)
+        if new_cols:
+            fea_df[new_cols] = fea_df[new_cols].astype(np.float32)
     else:
         # Use pivot_table for binary encoding - much faster than loop
         # Create a dummy column for aggregation
         df_filtered = df_filtered.copy()
         df_filtered["_dummy"] = 1
+        cases_with_values = df_filtered[case_id_key].unique()
 
         # Get unique values
         unique_vals = pandas_utils.format_unique(df_filtered[col].unique())
@@ -217,7 +221,6 @@ def select_string_column(
             columns=col,
             values="_dummy",
             aggfunc="max",
-            fill_value=0,
         )
 
         # Rename columns
@@ -234,13 +237,21 @@ def select_string_column(
         fea_df = fea_df.merge(
             pivot, left_on=case_id_key, right_index=True, how="left"
         )
-        # Fill NaN and convert to float32 for the newly added columns
+        # Fill NaN only for cases having at least one value and convert to float32
         new_cols = pivot.columns.tolist()
-        for col_name in new_cols:
-            if col_name in fea_df.columns:
-                fea_df[col_name] = (
-                    fea_df[col_name].fillna(0).astype(np.float32)
-                )
+        if new_cols:
+            mask = (
+                fea_df[case_id_key].isin(cases_with_values)
+                if case_id_key in fea_df.columns
+                else None
+            )
+            for col_name in new_cols:
+                if col_name in fea_df.columns:
+                    if mask is not None:
+                        fea_df.loc[mask, col_name] = (
+                            fea_df.loc[mask, col_name].fillna(0)
+                        )
+                    fea_df[col_name] = fea_df[col_name].astype(np.float32)
 
     return fea_df
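The net effect of this hunk is that missing values are no longer filled indiscriminately: only rows belonging to cases that carry the attribute at least once get their remaining NaNs set to 0, while cases in which the attribute never appears keep NaN in the one-hot columns. A standalone pandas sketch of that masking logic, with made-up column names rather than pm4py internals:

    import numpy as np
    import pandas as pd

    # Sketch only: c1 and c2 carry the attribute at least once, c3 never does.
    fea_df = pd.DataFrame({
        "case_id": ["c1", "c2", "c3"],
        "attr_A": [1.0, np.nan, np.nan],
        "attr_B": [np.nan, 1.0, np.nan],
    })
    cases_with_values = ["c1", "c2"]

    # Fill NaN with 0 only for cases that have at least one value,
    # then convert the feature columns to float32.
    mask = fea_df["case_id"].isin(cases_with_values)
    for col_name in ["attr_A", "attr_B"]:
        fea_df.loc[mask, col_name] = fea_df.loc[mask, col_name].fillna(0)
        fea_df[col_name] = fea_df[col_name].astype(np.float32)

    # Result: c1 -> (1, 0), c2 -> (0, 1), c3 -> (NaN, NaN)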

pm4py/objects/log/util/pl_lazy_fea_utils.py

Lines changed: 6 additions & 3 deletions

@@ -110,6 +110,9 @@ def automatic_feature_selection_df(
     max_different_occ_str_attr = exec_utils.get_param_value(
         Parameters.MAX_DIFFERENT_OCC_STR_ATTR, parameters, 50
     )
+    consider_all_attributes = exec_utils.get_param_value(
+        Parameters.CONSIDER_ALL_ATTRIBUTES, parameters, True
+    )
 
     other_attributes_to_retain = set()
 
@@ -126,7 +129,7 @@ def automatic_feature_selection_df(
         )
         cases_with_value = int(cases_with_value or 0)
 
-        if cases_with_value != total_cases:
+        if cases_with_value != total_cases and not consider_all_attributes:
             continue
 
         if _is_numeric_dtype(dtype):
@@ -220,7 +223,7 @@ def _select_string_columns(
     for value in unique_values:
         column_name = _sanitize_feature_name(column, value)
 
-        comparison = pl.col(column).eq(value).fill_null(False)
+        comparison = pl.col(column).eq(value)
 
         if count_occurrences:
             agg_expr = comparison.cast(pl.Int64).sum().alias(column_name)
@@ -229,7 +232,7 @@ def _select_string_columns(
 
         agg_exprs.append(agg_expr)
        fill_exprs.append(
-            pl.col(column_name).cast(pl.Float32).fill_null(0.0)
+            pl.col(column_name).cast(pl.Float32)
        )
 
     feature_chunk = (
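The lazy Polars path mirrors the same semantics: with fill_null removed, a case whose attribute is always null now aggregates to null rather than 0. A minimal sketch of that effect, assuming a recent Polars version and illustrative column names:

    import polars as pl

    # Sketch only: c1 carries the attribute, c2 never does.
    df = pl.DataFrame({"case": ["c1", "c1", "c2"], "attr": ["A", "B", None]})

    out = df.group_by("case").agg(
        pl.col("attr").eq("A").max().cast(pl.Float32).alias("attr_A")
    )
    # c1 -> 1.0, c2 -> null (previously fill_null would have yielded 0.0)
    print(out.sort("case"))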

pm4py/statistics/process_cube/pandas/variants/classic.py

Lines changed: 2 additions & 2 deletions

@@ -101,9 +101,9 @@ def apply(
     numeric_y = y_col in df.columns
 
     if not numeric_x:
-        x_prefix_cols = [c for c in df.columns if c.startswith(x_col)]
+        x_prefix_cols = [c for c in df.columns if c.startswith(x_col+"_")]
     if not numeric_y:
-        y_prefix_cols = [c for c in df.columns if c.startswith(y_col)]
+        y_prefix_cols = [c for c in df.columns if c.startswith(y_col+"_")]
 
     # ------------------------------------------------------
     # Handle X dimension binning
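The stricter prefix match keeps a dimension from absorbing columns of another dimension whose name merely starts with the same string. A tiny illustration with hypothetical column names; the Polars variant below applies the identical fix in _prefix_columns:

    # Sketch only: "amount" should not also match "amountCategory_high".
    columns = ["amount_low", "amount_high", "amountCategory_high"]
    x_col = "amount"

    loose = [c for c in columns if c.startswith(x_col)]         # all three columns
    strict = [c for c in columns if c.startswith(x_col + "_")]  # only the "amount_*" columns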

pm4py/statistics/process_cube/polars/variants/classic.py

Lines changed: 1 addition & 1 deletion

@@ -178,7 +178,7 @@ def _aggregation_expression(column: str, agg_fn: str) -> pl.Expr:
 
 
 def _prefix_columns(df: pl.DataFrame, prefix: str) -> List[str]:
-    return [col for col in df.columns if col.startswith(prefix)]
+    return [col for col in df.columns if col.startswith(prefix+"_")]
 
 
 def _numeric_numeric_case(
