
Commit 29ab4cf

Merge remote-tracking branch 'upstream/hotfixes' into release

2 parents: 0084807 + 47a6b7f

5 files changed: +32 additions, -17 deletions

pm4py/objects/log/util/dataframe_utils.py

Lines changed: 1 addition & 0 deletions

@@ -44,6 +44,7 @@ class Parameters(Enum):
     CASE_ID_KEY = constants.PARAMETER_CONSTANT_CASEID_KEY
     CASE_PREFIX = constants.CASE_ATTRIBUTE_PREFIX
     CASE_ATTRIBUTES = "case_attributes"
+    CONSIDER_ALL_ATTRIBUTES = "consider_all_attributes"
     MANDATORY_ATTRIBUTES = "mandatory_attributes"
     MAX_NO_CASES = "max_no_cases"
     MIN_DIFFERENT_OCC_STR_ATTR = "min_different_occ_str_attr"
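The new enum member becomes usable as a key in the usual pm4py parameters dict. A minimal usage sketch, under the assumptions that the Parameters enum shown above is the one consumed by df_features_utils and that some event-log DataFrame df is already available; the default value True keeps attributes even when they do not appear in every case, while False restores the stricter pre-existing filter:

    from pm4py.objects.log.util import dataframe_utils, df_features_utils

    # Sketch only: switch the new flag off to keep only attributes
    # that occur in every case (the behaviour before this commit).
    parameters = {dataframe_utils.Parameters.CONSIDER_ALL_ATTRIBUTES: False}
    selected = df_features_utils.automatic_feature_selection_df(df, parameters=parameters)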

pm4py/objects/log/util/df_features_utils.py

Lines changed: 22 additions & 11 deletions

@@ -75,6 +75,9 @@ def automatic_feature_selection_df(df, parameters=None):
     max_different_occ_str_attr = exec_utils.get_param_value(
         Parameters.MAX_DIFFERENT_OCC_STR_ATTR, parameters, 50
     )
+    consider_all_attributes = exec_utils.get_param_value(
+        Parameters.CONSIDER_ALL_ATTRIBUTES, parameters, True
+    )
 
     cols_dtypes = {x: str(df[x].dtype) for x in df.columns}
     other_attributes_to_retain = set()
@@ -83,9 +86,10 @@ def automatic_feature_selection_df(df, parameters=None):
     for x, y in cols_dtypes.items():
         attr_df = df.dropna(subset=[x])
         this_cases = attr_df[case_id_key].nunique()
+        attr_in_all_cases = this_cases == no_all_cases
 
         # in any case, keep attributes that appears at least once per case
-        if this_cases == no_all_cases:
+        if attr_in_all_cases or consider_all_attributes:
             if "float" in y or "int" in y:
                 # (as in the classic log version) retain always float/int attributes
                 other_attributes_to_retain.add(x)
@@ -197,14 +201,14 @@ def select_string_column(
         )
         # Fill NaN and convert to float32 for all new columns at once
         new_cols = crosstab.columns.tolist()
-        for col_name in new_cols:
-            if col_name in fea_df.columns:
-                fea_df[col_name] = fea_df[col_name].fillna(0).astype(np.float32)
+        if new_cols:
+            fea_df[new_cols] = fea_df[new_cols].astype(np.float32)
     else:
         # Use pivot_table for binary encoding - much faster than loop
         # Create a dummy column for aggregation
         df_filtered = df_filtered.copy()
         df_filtered["_dummy"] = 1
+        cases_with_values = df_filtered[case_id_key].unique()
 
         # Get unique values
         unique_vals = pandas_utils.format_unique(df_filtered[col].unique())
@@ -217,7 +221,6 @@ def select_string_column(
             columns=col,
             values="_dummy",
             aggfunc="max",
-            fill_value=0,
         )
 
         # Rename columns
@@ -234,13 +237,21 @@ def select_string_column(
         fea_df = fea_df.merge(
             pivot, left_on=case_id_key, right_index=True, how="left"
         )
-        # Fill NaN and convert to float32 for the newly added columns
+        # Fill NaN only for cases having at least one value and convert to float32
         new_cols = pivot.columns.tolist()
-        for col_name in new_cols:
-            if col_name in fea_df.columns:
-                fea_df[col_name] = (
-                    fea_df[col_name].fillna(0).astype(np.float32)
-                )
+        if new_cols:
+            mask = (
+                fea_df[case_id_key].isin(cases_with_values)
+                if case_id_key in fea_df.columns
+                else None
+            )
+            for col_name in new_cols:
+                if col_name in fea_df.columns:
+                    if mask is not None:
+                        fea_df.loc[mask, col_name] = (
+                            fea_df.loc[mask, col_name].fillna(0)
+                        )
+                    fea_df[col_name] = fea_df[col_name].astype(np.float32)
 
     return fea_df
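The net effect of this hunk is that missing values are no longer filled indiscriminately: only rows belonging to cases that carry the attribute at least once get their remaining NaNs set to 0, while cases in which the attribute never appears keep NaN in the one-hot columns. A standalone pandas sketch of that masking logic, with made-up column names rather than pm4py internals:

    import numpy as np
    import pandas as pd

    # Sketch only: c1 and c2 carry the attribute at least once, c3 never does.
    fea_df = pd.DataFrame({
        "case_id": ["c1", "c2", "c3"],
        "attr_A": [1.0, np.nan, np.nan],
        "attr_B": [np.nan, 1.0, np.nan],
    })
    cases_with_values = ["c1", "c2"]

    # Fill NaN with 0 only for cases that have at least one value,
    # then convert the feature columns to float32.
    mask = fea_df["case_id"].isin(cases_with_values)
    for col_name in ["attr_A", "attr_B"]:
        fea_df.loc[mask, col_name] = fea_df.loc[mask, col_name].fillna(0)
        fea_df[col_name] = fea_df[col_name].astype(np.float32)

    # Result: c1 -> (1, 0), c2 -> (0, 1), c3 -> (NaN, NaN)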

pm4py/objects/log/util/pl_lazy_fea_utils.py

Lines changed: 6 additions & 3 deletions

@@ -110,6 +110,9 @@ def automatic_feature_selection_df(
     max_different_occ_str_attr = exec_utils.get_param_value(
         Parameters.MAX_DIFFERENT_OCC_STR_ATTR, parameters, 50
     )
+    consider_all_attributes = exec_utils.get_param_value(
+        Parameters.CONSIDER_ALL_ATTRIBUTES, parameters, True
+    )
 
     other_attributes_to_retain = set()
 
@@ -126,7 +129,7 @@ def automatic_feature_selection_df(
         )
         cases_with_value = int(cases_with_value or 0)
 
-        if cases_with_value != total_cases:
+        if cases_with_value != total_cases and not consider_all_attributes:
             continue
 
         if _is_numeric_dtype(dtype):
@@ -220,7 +223,7 @@ def _select_string_columns(
     for value in unique_values:
         column_name = _sanitize_feature_name(column, value)
 
-        comparison = pl.col(column).eq(value).fill_null(False)
+        comparison = pl.col(column).eq(value)
 
         if count_occurrences:
             agg_expr = comparison.cast(pl.Int64).sum().alias(column_name)
@@ -229,7 +232,7 @@ def _select_string_columns(
 
         agg_exprs.append(agg_expr)
        fill_exprs.append(
-            pl.col(column_name).cast(pl.Float32).fill_null(0.0)
+            pl.col(column_name).cast(pl.Float32)
        )
 
     feature_chunk = (
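The lazy Polars path mirrors the same semantics: with fill_null removed, a case whose attribute is always null now aggregates to null rather than 0. A minimal sketch of that effect, assuming a recent Polars version and illustrative column names:

    import polars as pl

    # Sketch only: c1 carries the attribute, c2 never does.
    df = pl.DataFrame({"case": ["c1", "c1", "c2"], "attr": ["A", "B", None]})

    out = df.group_by("case").agg(
        pl.col("attr").eq("A").max().cast(pl.Float32).alias("attr_A")
    )
    # c1 -> 1.0, c2 -> null (previously fill_null would have yielded 0.0)
    print(out.sort("case"))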

pm4py/statistics/process_cube/pandas/variants/classic.py

Lines changed: 2 additions & 2 deletions

@@ -101,9 +101,9 @@ def apply(
     numeric_y = y_col in df.columns
 
     if not numeric_x:
-        x_prefix_cols = [c for c in df.columns if c.startswith(x_col)]
+        x_prefix_cols = [c for c in df.columns if c.startswith(x_col+"_")]
     if not numeric_y:
-        y_prefix_cols = [c for c in df.columns if c.startswith(y_col)]
+        y_prefix_cols = [c for c in df.columns if c.startswith(y_col+"_")]
 
     # ------------------------------------------------------
     # Handle X dimension binning
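The stricter prefix match keeps a dimension from absorbing columns of another dimension whose name merely starts with the same string. A tiny illustration with hypothetical column names; the Polars variant below applies the identical fix in _prefix_columns:

    # Sketch only: "amount" should not also match "amountCategory_high".
    columns = ["amount_low", "amount_high", "amountCategory_high"]
    x_col = "amount"

    loose = [c for c in columns if c.startswith(x_col)]         # all three columns
    strict = [c for c in columns if c.startswith(x_col + "_")]  # only the "amount_*" columns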

pm4py/statistics/process_cube/polars/variants/classic.py

Lines changed: 1 addition & 1 deletion

@@ -178,7 +178,7 @@ def _aggregation_expression(column: str, agg_fn: str) -> pl.Expr:
 
 
 def _prefix_columns(df: pl.DataFrame, prefix: str) -> List[str]:
-    return [col for col in df.columns if col.startswith(prefix)]
+    return [col for col in df.columns if col.startswith(prefix+"_")]
 
 
 def _numeric_numeric_case(
