 '''
 from pm4py.util import xes_constants, pandas_utils, constants
 from pm4py.util.business_hours import soj_time_business_hours_diff
+import numpy as np


 def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
                   start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
                   sort_caseid_required=True,
                   sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
-                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
+                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None,
+                  reduce_columns=True):
     """
     Get DFG graph from Pandas dataframe

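Reviewer note: this hunk adds an opt-in `reduce_columns` flag, so callers that need additional columns downstream can switch the internal projection off. A minimal, hypothetical usage sketch (column names are the XES defaults from the signature above; the sample data is invented):

    import pandas as pd

    log = pd.DataFrame({
        "case:concept:name": ["c1", "c1", "c1", "c2", "c2"],
        "concept:name": ["A", "B", "C", "A", "C"],
        "time:timestamp": pd.to_datetime([
            "2024-01-01 10:00", "2024-01-01 11:00", "2024-01-01 12:00",
            "2024-01-02 09:00", "2024-01-02 10:00"]),
    })
    dfg = get_dfg_graph(log)                        # columns are projected away internally
    dfg = get_dfg_graph(log, reduce_columns=False)  # keep every column through the computation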
@@ -65,9 +67,19 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_

     # if not differently specified, set the start timestamp key to the timestamp key
     # to avoid retro-compatibility problems
+    st_eq_ct = start_timestamp_key == timestamp_key
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
         df[start_timestamp_key] = df[timestamp_key]
+        st_eq_ct = True
+
+    # to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly timestamp)
+    # columns
+    if reduce_columns:
+        if measure == "frequency" and not sort_timestamp_along_case_id:
+            df = df[list({case_id_glue, activity_key, target_activity_key})]
+        else:
+            df = df[list({case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key})]

     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
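Reviewer note: the projection now runs before the sort (sorting a narrower frame is cheaper), and `list({...})` deduplicates keys that alias the same column, e.g. when `target_activity_key` has presumably been defaulted to `activity_key` earlier in the function (not shown in this diff). Since a set is unordered, the column order of the reduced frame is arbitrary, which is harmless because every later access is by name:

    # why list({...}) instead of a plain list: selecting with duplicate
    # labels would duplicate columns, while the set collapses them
    case_id_glue = "case:concept:name"
    activity_key = "concept:name"
    target_activity_key = activity_key              # the common default
    cols = list({case_id_glue, activity_key, target_activity_key})
    print(cols)                                     # two unique names, arbitrary order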
@@ -76,18 +88,12 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
     else:
         df = df.sort_values(case_id_glue)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if measure == "frequency":
-        df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
-    else:
-        df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
     # shift the dataframe by the window size, in order to couple successive rows
-    df_reduced_shifted = df_reduced.shift(-window)
+    df_shifted = df.shift(-window)
     # change the column names of the shifted dataframe
-    df_reduced_shifted.columns = [str(col) + '_2' for col in df_reduced_shifted.columns]
+    df_shifted.columns = [str(col) + '_2' for col in df_shifted.columns]
     # concatenate the two dataframes to get a unique dataframe
-    df_successive_rows = pd.concat([df_reduced, df_reduced_shifted], axis=1)
+    df_successive_rows = pd.concat([df, df_shifted], axis=1)
     # as successive rows in the sorted dataframe may belong to different case IDs, we have to restrict ourselves to
     # successive rows belonging to the same case ID
     df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
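Reviewer note: the shift-and-concat trick pairs each event with the event `window` rows below it; the equality filter on the case column then drops pairs that cross case boundaries. A self-contained sketch of the idea, with invented column names:

    import pandas as pd

    df = pd.DataFrame({"case": ["c1", "c1", "c2"], "act": ["A", "B", "C"]})
    shifted = df.shift(-1)                          # window = 1
    shifted.columns = [str(col) + "_2" for col in shifted.columns]
    pairs = pd.concat([df, shifted], axis=1)
    pairs = pairs[pairs["case"] == pairs["case_2"]]
    # -> a single row: A directly followed by B inside case c1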
@@ -99,8 +105,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
         all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

     if measure == "performance" or measure == "both":
-        # in the arc performance calculation, make sure to consider positive or null values
-        df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+        if not st_eq_ct:
+            # in the arc performance calculation, make sure to consider only positive or null values
+            df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+
         # calculate the difference between the timestamps of two successive events
         if business_hours:
             if worktiming is None:
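Reviewer note: the row-wise `max` clamp prevents negative arc durations in interval logs, where a successor may start before its predecessor completes. When `st_eq_ct` holds, the start and complete timestamps carry the same values, the sort already guarantees a non-negative difference, and the clamp would be a no-op, so skipping it saves a full-column operation. What the clamp does on one pair of events:

    import pandas as pd

    pair = pd.DataFrame({"start_2": pd.to_datetime(["2024-01-01 10:00"]),
                         "complete": pd.to_datetime(["2024-01-01 12:00"])})
    # lift the successor's start up to the predecessor's completion,
    # so the computed duration becomes 0 instead of -2 hours
    pair["start_2"] = pair[["start_2", "complete"]].max(axis=1)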
@@ -158,7 +166,8 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
                                 case_id_glue="case:concept:name", activity_key="concept:name",
                                 sort_caseid_required=True,
                                 sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
-                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
+                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR,
+                                event_index=constants.DEFAULT_INDEX_KEY):
     """
     Gets the partial order between events (of the same case) in a Pandas dataframe

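Reviewer note: the new `event_index` parameter (defaulting to `constants.DEFAULT_INDEX_KEY`) lets the caller hand over a dataframe that already carries an event-index column, which the function now reuses instead of always recomputing one. A hedged sketch, assuming `get_partial_order_dataframe` is in scope:

    from pm4py.util import constants

    df[constants.DEFAULT_INDEX_KEY] = range(len(df))   # precomputed event order
    po = get_partial_order_dataframe(df)               # the column is reused, not rebuilt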
@@ -191,29 +200,37 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
     # to avoid retro-compatibility problems
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
+
+    if start_timestamp_key not in df:
         df[start_timestamp_key] = df[timestamp_key]

+    # to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly timestamp)
+    # columns
+    if reduce_dataframe:
+        needed_columns = {case_id_glue, activity_key, start_timestamp_key, timestamp_key}
+        if event_index in df.columns:
+            needed_columns.add(event_index)
+        needed_columns = list(needed_columns)
+        df = df[needed_columns]
+
     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
         if sort_timestamp_along_case_id:
             df = df.sort_values([case_id_glue, start_timestamp_key, timestamp_key])
         else:
             df = df.sort_values(case_id_glue)
+    df.reset_index(drop=True, inplace=True)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if reduce_dataframe:
-        df = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
+    if event_index not in df.columns:
+        df[event_index] = df.index

-    df = pandas_utils.insert_index(df)
-    df = df.set_index(case_id_glue)
-    df_copy = df.copy()
+    df.set_index(case_id_glue, inplace=True)

-    df = df.join(df_copy, rsuffix="_2").dropna()
-    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
-    df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+    df = df.join(df, rsuffix="_2")
+    df = df[df[event_index] < df[event_index + "_2"]]
+    df = df[df[timestamp_key] <= df[start_timestamp_key + '_2']]

-    df = df.reset_index()
+    df.reset_index(inplace=True)

     if business_hours:
         if worktiming is None:
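Reviewer note: the rewritten core replaces the old clamp-based computation with a genuine happens-before filter. After the self-join on the case index, a pair (e1, e2) survives only if e1 comes earlier in the event order and e1's completion does not come after e2's start; the previous `.dropna()` after the join is gone as well. A compact sketch of the same three steps, with invented column names:

    import pandas as pd

    df = pd.DataFrame({"case": ["c1", "c1"], "idx": [0, 1],
                       "start": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 11:00"]),
                       "complete": pd.to_datetime(["2024-01-01 10:30", "2024-01-01 11:30"])})
    j = df.set_index("case").join(df.set_index("case"), rsuffix="_2")
    j = j[j["idx"] < j["idx_2"]]                    # e1 occurs before e2 in the log order
    j = j[j["complete"] <= j["start_2"]]            # e1 finishes before (or when) e2 starts
    # -> exactly one ordered pair for case c1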