Skip to content

Commit ffa0278

Browse files
Merge remote-tracking branch 'upstream/hotfixes' into release
2 parents cf5ad80 + 850dcbd commit ffa0278

File tree

5 files changed

+224
-398
lines changed

5 files changed

+224
-398
lines changed

pm4py/algo/discovery/inductive/base_case/single_activity.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
IMDataStructureUVCL,
2525
IMDataStructureDFG,
2626
)
27-
from pm4py.objects.process_tree.obj import ProcessTree
27+
from pm4py.objects.process_tree.obj import ProcessTree, Operator
2828
from typing import Optional, Dict, Any
2929

3030

@@ -76,4 +76,14 @@ def leaf(
7676
obj=IMDataStructureDFG,
7777
parameters: Optional[Dict[str, Any]] = None,
7878
) -> ProcessTree:
79-
return ProcessTree(label=list(obj.dfg.start_activities)[0])
79+
leaf = ProcessTree(label=list(obj.dfg.start_activities)[0])
80+
if obj.data_structure.skip is False:
81+
return leaf
82+
else:
83+
tree = ProcessTree(operator=Operator.XOR)
84+
skip = ProcessTree()
85+
skip.parent = tree
86+
leaf.parent = tree
87+
tree.children.append(leaf)
88+
tree.children.append(skip)
89+
return tree

pm4py/algo/discovery/inductive/cuts/sequence.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ def project(
364364
z = activities_idx[b]
365365
j = activities_idx[a] + 1
366366
while j < z:
367-
skippable[j] = False
367+
skippable[j] = True
368368
j = j + 1
369369

370370
return [

pm4py/objects/log/util/dataframe_utils.py

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ class Parameters(Enum):
4747
CASE_ATTRIBUTES = "case_attributes"
4848
MANDATORY_ATTRIBUTES = "mandatory_attributes"
4949
MAX_NO_CASES = "max_no_cases"
50-
MIN_DIFFERENT_OCC_STR_ATTR = 5
51-
MAX_DIFFERENT_OCC_STR_ATTR = 50
50+
MIN_DIFFERENT_OCC_STR_ATTR = "min_different_occ_str_attr"
51+
MAX_DIFFERENT_OCC_STR_ATTR = "max_different_occ_str_attr"
5252
TIMESTAMP_KEY = constants.PARAMETER_CONSTANT_TIMESTAMP_KEY
5353
ACTIVITY_KEY = constants.PARAMETER_CONSTANT_ACTIVITY_KEY
5454
PARAM_ARTIFICIAL_START_ACTIVITY = constants.PARAM_ARTIFICIAL_START_ACTIVITY
@@ -58,6 +58,7 @@ class Parameters(Enum):
5858
USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"
5959
ADD_CASE_IDENTIFIER_COLUMN = "add_case_identifier_column"
6060
DETERMINISTIC = "deterministic"
61+
COUNT_OCCURRENCES = "count_occurrences"
6162

6263

6364
def insert_partitioning(df, num_partitions, parameters=None):
@@ -396,6 +397,7 @@ def select_string_column(
396397
fea_df: pd.DataFrame,
397398
col: str,
398399
case_id_key=constants.CASE_CONCEPT_NAME,
400+
count_occurrences=False,
399401
) -> pd.DataFrame:
400402
"""
401403
Extract N columns (for N different attribute values; hotencoding) for the features dataframe for the given string attribute
@@ -410,6 +412,9 @@ def select_string_column(
410412
String column
411413
case_id_key
412414
Case ID key
415+
count_occurrences
416+
If True, count the number of occurrences of the attribute value in each case.
417+
If False (default), use binary encoding (1 if present, 0 if not present)
413418
414419
Returns
415420
--------------
@@ -419,18 +424,33 @@ def select_string_column(
419424
vals = pandas_utils.format_unique(df[col].unique())
420425
for val in vals:
421426
if val is not None:
422-
filt_df_cases = pandas_utils.format_unique(
423-
df[df[col] == val][case_id_key].unique()
424-
)
427+
# Convert value to string first to handle all data types
428+
val_str = str(val)
429+
# Remove non-ASCII characters and spaces for column naming
425430
new_col = (
426431
col
427432
+ "_"
428-
+ val.encode("ascii", errors="ignore")
433+
+ val_str.encode("ascii", errors="ignore")
429434
.decode("ascii")
430435
.replace(" ", "")
431436
)
432-
fea_df[new_col] = fea_df[case_id_key].isin(filt_df_cases)
433-
fea_df[new_col] = fea_df[new_col].astype(np.float32)
437+
438+
if count_occurrences:
439+
# Count the number of occurrences of this value per case
440+
counts = df[df[col] == val].groupby(case_id_key).size().reset_index(name='count')
441+
fea_df = fea_df.merge(
442+
counts.rename(columns={'count': new_col}),
443+
on=case_id_key,
444+
how="left"
445+
)
446+
fea_df[new_col] = fea_df[new_col].fillna(0).astype(np.float32)
447+
else:
448+
# Binary encoding (original behavior)
449+
filt_df_cases = pandas_utils.format_unique(
450+
df[df[col] == val][case_id_key].unique()
451+
)
452+
fea_df[new_col] = fea_df[case_id_key].isin(filt_df_cases)
453+
fea_df[new_col] = fea_df[new_col].astype(np.float32)
434454
return fea_df
435455

436456

@@ -451,6 +471,7 @@ def get_features_df(
451471
parameters
452472
Parameters of the algorithm, including:
453473
- Parameters.CASE_ID_KEY: the case ID
474+
- Parameters.COUNT_OCCURRENCES: if True, count occurrences of string attributes instead of binary encoding
454475
455476
Returns
456477
---------------
@@ -466,6 +487,9 @@ def get_features_df(
466487
add_case_identifier_column = exec_utils.get_param_value(
467488
Parameters.ADD_CASE_IDENTIFIER_COLUMN, parameters, False
468489
)
490+
count_occurrences = exec_utils.get_param_value(
491+
Parameters.COUNT_OCCURRENCES, parameters, False
492+
)
469493

470494
fea_df = pandas_utils.instantiate_dataframe(
471495
{
@@ -477,7 +501,7 @@ def get_features_df(
477501
for col in list_columns:
478502
if "obj" in str(df[col].dtype) or "str" in str(df[col].dtype):
479503
fea_df = select_string_column(
480-
df, fea_df, col, case_id_key=case_id_key
504+
df, fea_df, col, case_id_key=case_id_key, count_occurrences=count_occurrences
481505
)
482506
elif "float" in str(df[col].dtype) or "int" in str(df[col].dtype):
483507
fea_df = select_number_column(

0 commit comments

Comments
 (0)