@@ -47,8 +47,8 @@ class Parameters(Enum):
4747 CASE_ATTRIBUTES = "case_attributes"
4848 MANDATORY_ATTRIBUTES = "mandatory_attributes"
4949 MAX_NO_CASES = "max_no_cases"
50- MIN_DIFFERENT_OCC_STR_ATTR = 5
51- MAX_DIFFERENT_OCC_STR_ATTR = 50
50+ MIN_DIFFERENT_OCC_STR_ATTR = "min_different_occ_str_attr"
51+ MAX_DIFFERENT_OCC_STR_ATTR = "max_different_occ_str_attr"
5252 TIMESTAMP_KEY = constants .PARAMETER_CONSTANT_TIMESTAMP_KEY
5353 ACTIVITY_KEY = constants .PARAMETER_CONSTANT_ACTIVITY_KEY
5454 PARAM_ARTIFICIAL_START_ACTIVITY = constants .PARAM_ARTIFICIAL_START_ACTIVITY
@@ -58,6 +58,7 @@ class Parameters(Enum):
5858 USE_EXTREMES_TIMESTAMP = "use_extremes_timestamp"
5959 ADD_CASE_IDENTIFIER_COLUMN = "add_case_identifier_column"
6060 DETERMINISTIC = "deterministic"
61+ COUNT_OCCURRENCES = "count_occurrences"
6162
6263
6364def insert_partitioning (df , num_partitions , parameters = None ):
@@ -396,6 +397,7 @@ def select_string_column(
396397 fea_df : pd .DataFrame ,
397398 col : str ,
398399 case_id_key = constants .CASE_CONCEPT_NAME ,
400+ count_occurrences = False ,
399401) -> pd .DataFrame :
400402 """
401403 Extract N columns (for N different attribute values; hotencoding) for the features dataframe for the given string attribute
@@ -410,6 +412,9 @@ def select_string_column(
410412 String column
411413 case_id_key
412414 Case ID key
415+ count_occurrences
416+ If True, count the number of occurrences of the attribute value in each case.
417+ If False (default), use binary encoding (1 if present, 0 if not present)
413418
414419 Returns
415420 --------------
@@ -419,18 +424,33 @@ def select_string_column(
419424 vals = pandas_utils .format_unique (df [col ].unique ())
420425 for val in vals :
421426 if val is not None :
422- filt_df_cases = pandas_utils . format_unique (
423- df [ df [ col ] == val ][ case_id_key ]. unique ( )
424- )
427+ # Convert value to string first to handle all data types
428+ val_str = str ( val )
429+ # Remove non-ASCII characters and spaces for column naming
425430 new_col = (
426431 col
427432 + "_"
428- + val .encode ("ascii" , errors = "ignore" )
433+ + val_str .encode ("ascii" , errors = "ignore" )
429434 .decode ("ascii" )
430435 .replace (" " , "" )
431436 )
432- fea_df [new_col ] = fea_df [case_id_key ].isin (filt_df_cases )
433- fea_df [new_col ] = fea_df [new_col ].astype (np .float32 )
437+
438+ if count_occurrences :
439+ # Count the number of occurrences of this value per case
440+ counts = df [df [col ] == val ].groupby (case_id_key ).size ().reset_index (name = 'count' )
441+ fea_df = fea_df .merge (
442+ counts .rename (columns = {'count' : new_col }),
443+ on = case_id_key ,
444+ how = "left"
445+ )
446+ fea_df [new_col ] = fea_df [new_col ].fillna (0 ).astype (np .float32 )
447+ else :
448+ # Binary encoding (original behavior)
449+ filt_df_cases = pandas_utils .format_unique (
450+ df [df [col ] == val ][case_id_key ].unique ()
451+ )
452+ fea_df [new_col ] = fea_df [case_id_key ].isin (filt_df_cases )
453+ fea_df [new_col ] = fea_df [new_col ].astype (np .float32 )
434454 return fea_df
435455
436456
@@ -451,6 +471,7 @@ def get_features_df(
451471 parameters
452472 Parameters of the algorithm, including:
453473 - Parameters.CASE_ID_KEY: the case ID
474+ - Parameters.COUNT_OCCURRENCES: if True, count occurrences of string attributes instead of binary encoding
454475
455476 Returns
456477 ---------------
@@ -466,6 +487,9 @@ def get_features_df(
466487 add_case_identifier_column = exec_utils .get_param_value (
467488 Parameters .ADD_CASE_IDENTIFIER_COLUMN , parameters , False
468489 )
490+ count_occurrences = exec_utils .get_param_value (
491+ Parameters .COUNT_OCCURRENCES , parameters , False
492+ )
469493
470494 fea_df = pandas_utils .instantiate_dataframe (
471495 {
@@ -477,7 +501,7 @@ def get_features_df(
477501 for col in list_columns :
478502 if "obj" in str (df [col ].dtype ) or "str" in str (df [col ].dtype ):
479503 fea_df = select_string_column (
480- df , fea_df , col , case_id_key = case_id_key
504+ df , fea_df , col , case_id_key = case_id_key , count_occurrences = count_occurrences
481505 )
482506 elif "float" in str (df [col ].dtype ) or "int" in str (df [col ].dtype ):
483507 fea_df = select_number_column (
0 commit comments