
Commit 2d45d5b

feat(pm4py): preparing for 2.2.25
Merge commit 2d45d5b (2 parents: 8a4a835 + 51e84a9)

File tree

11 files changed: +130 −70 lines


CHANGELOG.md

Lines changed: 31 additions & 0 deletions
@@ -1,5 +1,36 @@
 # Changelog of pm4py

+## pm4py 2.2.25 (2022.07.29)
+
+### Added
+
+### Changed
+* ce94110076e3269c96a6eee61d7618f08f44472a
+  * optimization in the calculation of the eventually-follows graph on Pandas dataframes.
+* 3cca8f97bbd09f4ae5644dcc156489d4b2037028
+  * optimization in the calculation of the performance directly-follows graph on Pandas dataframes.
+* 4d8721787a50da397b265678be614c94894ea851
+  * column reduction in DFG calculation on top of Pandas dataframes
+
+### Deprecated
+
+### Fixed
+* d754ccdac680f610b2b628dc9830d92da6954dc1
+  cb76238c29b986026f07261c11a1c09a667c9ab9
+  54970a58927ad0e17b173bff17705a10f5344d92
+  ef575a8bf0519655bcf8a57b981c7fa3c018db7a
+  * small fixes in OCEL utilities
+* d0094fa4ccc815b57ccc519d15ccbda6399c2ef7
+  * bug fix eventually_follows filter in LTL checker when timestamp_diff_boundaries is provided.
+* eb8617de0cfcfebf7374b4545660158e4b4291b6
+  * bug fix eventually_follows filter in LTL checker on EventLog objects.
+
+### Removed
+
+### Other
+
+-----
+
 ## pm4py 2.2.24 (2022.07.12)

 ### Added
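The two optimizations listed under Changed target the dataframe-backed code paths behind the discovery functions. A minimal, hedged sketch of how they are typically exercised through pm4py's simplified interface (the toy dataframe is illustrative; `discover_eventually_follows_graph` and `discover_performance_dfg` are assumed available at this version):

```python
import pandas as pd
import pm4py

# toy event log using pm4py's default case/activity/timestamp column names
df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c1", "c2", "c2"],
    "concept:name": ["A", "B", "C", "A", "C"],
    "time:timestamp": pd.to_datetime([
        "2022-07-01 08:00", "2022-07-01 09:00", "2022-07-01 10:00",
        "2022-07-02 08:00", "2022-07-02 09:00"]),
})

# eventually-follows graph: optimized on Pandas dataframes in this release
efg = pm4py.discover_eventually_follows_graph(df)

# performance DFG: also optimized on Pandas dataframes in this release
perf_dfg, start_activities, end_activities = pm4py.discover_performance_dfg(df)
```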

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = '2.2'
 # The full version, including alpha/beta/rc tags
-release = '2.2.24'
+release = '2.2.25'

 # -- General configuration ---------------------------------------------------

pm4py/algo/discovery/dfg/adapters/pandas/df_statistics.py

Lines changed: 41 additions & 24 deletions
@@ -16,13 +16,15 @@
 '''
 from pm4py.util import xes_constants, pandas_utils, constants
 from pm4py.util.business_hours import soj_time_business_hours_diff
+import numpy as np


 def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
                   start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
                   sort_caseid_required=True,
                   sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
-                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
+                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None,
+                  reduce_columns=True):
     """
     Get DFG graph from Pandas dataframe

@@ -65,9 +67,19 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_

     # if not differently specified, set the start timestamp key to the timestamp key
     # to avoid retro-compatibility problems
+    st_eq_ct = start_timestamp_key == timestamp_key
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
         df[start_timestamp_key] = df[timestamp_key]
+        st_eq_ct = True
+
+    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
+    # columns
+    if reduce_columns:
+        if measure == "frequency" and not sort_timestamp_along_case_id:
+            df = df[list({case_id_glue, activity_key, target_activity_key})]
+        else:
+            df = df[list({case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key})]

     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
@@ -76,18 +88,12 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
         else:
             df = df.sort_values(case_id_glue)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if measure == "frequency":
-        df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
-    else:
-        df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
     # shift the dataframe by 1, in order to couple successive rows
-    df_reduced_shifted = df_reduced.shift(-window)
+    df_shifted = df.shift(-window)
     # change column names to shifted dataframe
-    df_reduced_shifted.columns = [str(col) + '_2' for col in df_reduced_shifted.columns]
+    df_shifted.columns = [str(col) + '_2' for col in df_shifted.columns]
     # concate the two dataframe to get a unique dataframe
-    df_successive_rows = pd.concat([df_reduced, df_reduced_shifted], axis=1)
+    df_successive_rows = pd.concat([df, df_shifted], axis=1)
     # as successive rows in the sorted dataframe may belong to different case IDs we have to restrict ourselves to
     # successive rows belonging to same case ID
     df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
@@ -99,8 +105,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
     all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

     if measure == "performance" or measure == "both":
-        # in the arc performance calculation, make sure to consider positive or null values
-        df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+        if not st_eq_ct:
+            # in the arc performance calculation, make sure to consider positive or null values
+            df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+
     # calculate the difference between the timestamps of two successive events
     if business_hours:
         if worktiming is None:
@@ -158,7 +166,8 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
                                 case_id_glue="case:concept:name", activity_key="concept:name",
                                 sort_caseid_required=True,
                                 sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
-                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
+                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR,
+                                event_index=constants.DEFAULT_INDEX_KEY):
     """
     Gets the partial order between events (of the same case) in a Pandas dataframe

@@ -191,29 +200,37 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
     # to avoid retro-compatibility problems
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
+
+    if start_timestamp_key not in df:
         df[start_timestamp_key] = df[timestamp_key]

+    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
+    # columns
+    if reduce_dataframe:
+        needed_columns = {case_id_glue, activity_key, start_timestamp_key, timestamp_key}
+        if event_index in df.columns:
+            needed_columns.add(event_index)
+        needed_columns = list(needed_columns)
+        df = df[needed_columns]
+
     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
         if sort_timestamp_along_case_id:
             df = df.sort_values([case_id_glue, start_timestamp_key, timestamp_key])
         else:
             df = df.sort_values(case_id_glue)
+    df.reset_index(drop=True, inplace=True)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if reduce_dataframe:
-        df = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
+    if event_index not in df.columns:
+        df[event_index] = df.index

-    df = pandas_utils.insert_index(df)
-    df = df.set_index(case_id_glue)
-    df_copy = df.copy()
+    df.set_index(case_id_glue, inplace=True)

-    df = df.join(df_copy, rsuffix="_2").dropna()
-    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
-    df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+    df = df.join(df, rsuffix="_2")
+    df = df[df[event_index] < df[event_index + "_2"]]
+    df = df[df[timestamp_key] <= df[start_timestamp_key + '_2']]

-    df = df.reset_index()
+    df.reset_index(inplace=True)

     if business_hours:
         if worktiming is None:
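For context, the reworked column reduction can be exercised directly; a hedged sketch against `get_dfg_graph` as it appears above (the dataframe is illustrative, and `reduce_columns=True` is the new default; passing `reduce_columns=False` skips the reduction entirely):

```python
import pandas as pd
from pm4py.algo.discovery.dfg.adapters.pandas import df_statistics

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c1"],
    "concept:name": ["A", "B", "C"],
    "time:timestamp": pd.to_datetime(
        ["2022-07-01 08:00", "2022-07-01 09:00", "2022-07-01 10:00"]),
})

# the reduction now happens before sorting/shifting, so the shift/concat
# pipeline below only ever touches the case/activity/timestamp columns
dfg = df_statistics.get_dfg_graph(df, measure="frequency")
print(dfg)  # expected: {('A', 'B'): 1, ('B', 'C'): 1}
```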

pm4py/algo/filtering/log/ltl/ltl_checker.py

Lines changed: 21 additions & 39 deletions
@@ -26,6 +26,7 @@

 from typing import Optional, Dict, Any, Union, Tuple, List
 from pm4py.objects.log.obj import EventLog, EventStream, Trace
+import itertools


 class Parameters(Enum):
@@ -317,48 +318,29 @@ def eventually_follows(log: EventLog, attribute_values: List[str], parameters: O
                               omni_present=log.omni_present, properties=log.properties)

     for trace in log:
-        if enable_timestamp:
-            occurrences = [[trace[i][timestamp_key].timestamp() for i in range(len(trace))
-                            if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]
-        else:
-            occurrences = [[i for i in range(len(trace))
-                            if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]
+        occurrences = [[i for i in range(len(trace))
+                        if attribute_key in trace[i] and trace[i][attribute_key] == attribute_value] for attribute_value in attribute_values]

+        is_good = False

-        is_good = True
-        if enable_timestamp and timestamp_diff_boundaries:
-            prev_min = min(occurrences[0], default=-1)
-            for i in range(1, len(attribute_values)):
-                if prev_min == -1 or len(occurrences[i]) == 0:
-                    is_good = False
+        for c in itertools.product(*occurrences):
+            ok = True
+            for i in range(len(c)-1):
+                if c[i] > c[i+1]:
+                    ok = False
                     break
-
-                if timestamp_diff_boundaries:
-                    min_diff = timestamp_diff_boundaries[i - 1][0]
-                    max_diff = timestamp_diff_boundaries[i - 1][1]
-                    min_timestamp = min([o for o in occurrences[i] if (o - prev_min) >= min_diff and (o - prev_min) <= max_diff], default=-1)
-                else:
-                    min_timestamp = min([o for o in occurrences[i] if o >= prev_min], default = -1)
-
-                prev_min = min_timestamp
-
-                if prev_min == -1:
-                    is_good = False
-                    break
-
-        else:
-            prev_min = min(occurrences[0], default=-1)
-            for i in range(1, len(attribute_values)):
-                if prev_min == -1:
-                    is_good = False
-                    break
-
-                if len(occurrences[i]) == 0:
-                    is_good = False
-                    break
-
-                min_index = min([o for o in occurrences[i] if o >= prev_min], default = -1)
-                prev_min = min_index
+            if ok:
+                if enable_timestamp and timestamp_diff_boundaries:
+                    for i in range(len(c)-1):
+                        timest_i = trace[i][timestamp_key].timestamp()
+                        timest_j = trace[i+1][timestamp_key].timestamp()
+                        if timest_j - timest_i < timestamp_diff_boundaries[i][0] or timest_j - timest_i > timestamp_diff_boundaries[i][1]:
+                            ok = False
+                            break
+
+                if ok:
+                    is_good = True
+                    break

         if is_good:
             if positive:
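A hedged usage sketch of the repaired EventLog filter (parameter names are taken from the module's Parameters enum as it appears in this diff; the tiny log is illustrative). Boundaries are pairs of minimum/maximum differences in seconds between consecutive matched events:

```python
from datetime import datetime

from pm4py.objects.log.obj import Event, EventLog, Trace
from pm4py.algo.filtering.log.ltl import ltl_checker

trace = Trace()
for name, ts in [("A", "2022-07-01T08:00:00"), ("B", "2022-07-01T09:00:00")]:
    trace.append(Event({"concept:name": name,
                        "time:timestamp": datetime.fromisoformat(ts)}))
log = EventLog([trace])

# keep traces where A is eventually followed by B within at most two hours
filtered = ltl_checker.eventually_follows(
    log, ["A", "B"],
    parameters={ltl_checker.Parameters.ENABLE_TIMESTAMP: True,
                ltl_checker.Parameters.TIMESTAMP_DIFF_BOUNDARIES: [(0, 7200)]})
print(len(filtered))  # 1: the one-hour gap satisfies the (0, 7200) boundary
```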

pm4py/algo/filtering/pandas/ltl/ltl_checker.py

Lines changed: 2 additions & 2 deletions
@@ -316,8 +316,8 @@ def eventually_follows(df0: pd.DataFrame, attribute_values: List[str], parameter
     attribute_key = exec_utils.get_param_value(Parameters.ATTRIBUTE_KEY, parameters, DEFAULT_NAME_KEY)
     timestamp_key = exec_utils.get_param_value(Parameters.TIMESTAMP_KEY, parameters, DEFAULT_TIMESTAMP_KEY)
     positive = exec_utils.get_param_value(Parameters.POSITIVE, parameters, True)
-    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, False)
     timestamp_diff_boundaries = exec_utils.get_param_value(Parameters.TIMESTAMP_DIFF_BOUNDARIES, parameters, [])
+    enable_timestamp = exec_utils.get_param_value(Parameters.ENABLE_TIMESTAMP, parameters, len(timestamp_diff_boundaries) > 0)

     colset = [case_id_glue, attribute_key]
     if enable_timestamp:
@@ -341,7 +341,7 @@ def eventually_follows(df0: pd.DataFrame, attribute_values: List[str], parameter
            df_join = df_join[df_join["@@diffindex%d" % (i - 1)] > 0]

    if enable_timestamp:
-        for i in range(2, len(df_a)):
+        for i in range(1, len(df_a)):
            df_join["@@difftimestamp%d" % (i - 1)] = (
                df_join[timestamp_key + "_%d" % i] - df_join[timestamp_key + '_%d' % (i-1)]).astype(
                'timedelta64[s]')
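The same filter on a dataframe. After the fix, the timestamp-difference loop starts at 1 instead of 2, so the boundary on the first pair is no longer skipped, and `enable_timestamp` switches on automatically when boundaries are passed. A hedged sketch (toy data; parameter names from the Parameters enum above):

```python
import pandas as pd
from pm4py.algo.filtering.pandas.ltl import ltl_checker

df = pd.DataFrame({
    "case:concept:name": ["c1", "c1", "c1"],
    "concept:name": ["A", "B", "C"],
    "time:timestamp": pd.to_datetime(
        ["2022-07-01 08:00", "2022-07-01 09:00", "2022-07-01 11:00"]),
})

# A -> B within 2 hours, then B -> C within 3 hours (bounds in seconds);
# the A -> B bound is now actually checked
filtered = ltl_checker.eventually_follows(
    df, ["A", "B", "C"],
    parameters={ltl_checker.Parameters.TIMESTAMP_DIFF_BOUNDARIES:
                [(0, 7200), (0, 10800)]})
```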

pm4py/algo/transformation/ocel/features/objects/algorithm.py

Lines changed: 1 addition & 1 deletion
@@ -102,8 +102,8 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None):
     enable_object_str_attributes = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_STR_ATTRIBUTES, parameters, enable_all)
     enable_object_num_attributes = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_NUM_ATTRIBUTES, parameters, enable_all)
     enable_object_interaction_graph_ot = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_INTERACTION_GRAPH_OT, parameters, enable_all)
-    enable_work_in_progress = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_WORK_IN_PROGRESS, parameters, enable_all)
     enable_object_lifecycle_unq_act = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_LIFECYCLE_UNQ_ACT, parameters, enable_all)
+    enable_work_in_progress = exec_utils.get_param_value(Parameters.ENABLE_OBJECT_WORK_IN_PROGRESS, parameters, False)
     enable_related_events_features = exec_utils.get_param_value(Parameters.ENABLE_RELATED_EVENTS_FEATURES, parameters, False)
     enable_related_activities_features = exec_utils.get_param_value(Parameters.ENABLE_RELATED_ACTIVITIES_FEATURES, parameters, False)
     enable_obj_con_in_graph_features = exec_utils.get_param_value(Parameters.ENABLE_OBJ_CON_IN_GRAPH_FEATURES, parameters, False)
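The default for the work-in-progress features changes from `enable_all` to `False`, so they now have to be requested explicitly. A hedged sketch (the input path is hypothetical; the `apply` return shape follows the feature modules in this diff):

```python
import pm4py
from pm4py.algo.transformation.ocel.features.objects import algorithm as object_features

ocel = pm4py.read_ocel("example.jsonocel")  # hypothetical input file

# opt in to the work-in-progress features, now disabled by default
data, feature_names = object_features.apply(
    ocel,
    parameters={object_features.Parameters.ENABLE_OBJECT_WORK_IN_PROGRESS: True})
```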

pm4py/algo/transformation/ocel/features/objects/object_degree_centrality.py

Lines changed: 4 additions & 1 deletion
@@ -54,6 +54,9 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None):
     feature_names = ["@@object_degree_centrality"]

     for obj in ordered_objects:
-        data.append([centrality[obj]])
+        if obj in centrality:
+            data.append([centrality[obj]])
+        else:
+            data.append([0])

     return data, feature_names
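Why the guard matters, assuming (as the module name suggests) the centrality dictionary comes from networkx's `degree_centrality` over the object interaction graph: objects that never interact are absent from that graph, so the plain dictionary lookup could raise a KeyError. A minimal sketch of the fixed lookup:

```python
import networkx as nx

g = nx.Graph([("o1", "o2")])           # "o3" never interacts, so it is not a node
centrality = nx.degree_centrality(g)   # {'o1': 1.0, 'o2': 1.0}

ordered_objects = ["o1", "o2", "o3"]
# mirror of the fixed loop: fall back to 0 for objects without an entry
data = [[centrality[o]] if o in centrality else [0] for o in ordered_objects]
print(data)  # [[1.0], [1.0], [0]]
```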

pm4py/algo/transformation/ocel/graphs/object_inheritance_graph.py

Lines changed: 6 additions & 0 deletions
@@ -74,4 +74,10 @@ def apply(ocel: OCEL, parameters: Optional[Dict[Any, Any]] = None) -> Set[Tuple[
             graph.add((o1, o2))
             set_objects.add(o2)

+    graph_it = list(graph)
+    for el in graph_it:
+        if (el[1], el[0]) in graph:
+            graph.remove((el[0], el[1]))
+            graph.remove((el[1], el[0]))
+
     return graph
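The added post-processing makes the inheritance relation antisymmetric: when both orientations of a pair were recorded, neither is kept. A standalone sketch of the same set manipulation on toy edges:

```python
# toy inheritance edges; ("o1", "o2") and ("o2", "o1") contradict each other
graph = {("o1", "o2"), ("o2", "o1"), ("o2", "o3")}

for el in list(graph):                  # iterate over a snapshot while mutating
    if (el[1], el[0]) in graph:
        graph.remove((el[0], el[1]))
        graph.remove((el[1], el[0]))

print(graph)  # {('o2', 'o3')}
```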

pm4py/filtering.py

Lines changed: 2 additions & 1 deletion
@@ -1033,7 +1033,8 @@ def filter_ocel_objects(ocel: OCEL, object_identifiers: Collection[str], positiv
     while level > 1:
         curr = list(object_identifiers)
         for el in curr:
-            object_identifiers = object_identifiers.union(graph[el])
+            for el2 in graph[el]:
+                object_identifiers.add(el2)
         level = level - 1
     from copy import copy
     from pm4py.objects.ocel.util import filtering_utils
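The rewritten loop grows the selected-object set in place instead of rebinding it on every union, expanding one graph level at a time. A self-contained sketch of the expansion step on a toy object graph:

```python
# toy object graph: each object maps to its directly related objects
graph = {"o1": {"o2"}, "o2": {"o3"}, "o3": set()}

object_identifiers = {"o1"}
level = 3
while level > 1:
    for el in list(object_identifiers):  # snapshot, since the set grows
        for el2 in graph[el]:
            object_identifiers.add(el2)
    level = level - 1

print(object_identifiers)  # {'o1', 'o2', 'o3'}
```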

pm4py/meta.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
     along with PM4Py. If not, see <https://www.gnu.org/licenses/>.
 '''
 __name__ = 'pm4py'
-VERSION = '2.2.24'
+VERSION = '2.2.25'
 __version__ = VERSION
 __doc__ = 'Process Mining for Python (PM4Py)'
 __author__ = 'Fraunhofer Institute for Applied Technology'
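After upgrading, the version bump can be verified from the package metadata:

```python
import pm4py

print(pm4py.__version__)  # expected: '2.2.25'
```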
