 '''
 from pm4py.util import xes_constants, pandas_utils, constants
 from pm4py.util.business_hours import soj_time_business_hours_diff
+import numpy as np


 def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_glue="case:concept:name",
                   start_timestamp_key=None, timestamp_key="time:timestamp", perf_aggregation_key="mean",
                   sort_caseid_required=True,
                   sort_timestamp_along_case_id=True, keep_once_per_case=False, window=1,
-                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None):
+                  business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR, target_activity_key=None,
+                  reduce_columns=True):
     """
     Get DFG graph from Pandas dataframe

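Reviewer note: this hunk adds an opt-in `reduce_columns` flag, so callers that need additional columns downstream can switch the internal projection off. A minimal, hypothetical usage sketch (column names are the XES defaults from the signature above; the sample data is invented):

    import pandas as pd

    log = pd.DataFrame({
        "case:concept:name": ["c1", "c1", "c1", "c2", "c2"],
        "concept:name": ["A", "B", "C", "A", "C"],
        "time:timestamp": pd.to_datetime([
            "2024-01-01 10:00", "2024-01-01 11:00", "2024-01-01 12:00",
            "2024-01-02 09:00", "2024-01-02 10:00"]),
    })
    dfg = get_dfg_graph(log)                        # columns are projected away internally
    dfg = get_dfg_graph(log, reduce_columns=False)  # keep every column through the computation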
@@ -65,9 +67,19 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_

     # if not differently specified, set the start timestamp key to the timestamp key
     # to avoid retro-compatibility problems
+    st_eq_ct = start_timestamp_key == timestamp_key
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
         df[start_timestamp_key] = df[timestamp_key]
+        st_eq_ct = True
+
+    # to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly timestamp)
+    # columns
+    if reduce_columns:
+        if measure == "frequency" and not sort_timestamp_along_case_id:
+            df = df[list({case_id_glue, activity_key, target_activity_key})]
+        else:
+            df = df[list({case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key})]

     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
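Reviewer note: the projection now runs before the sort (sorting a narrower frame is cheaper), and `list({...})` deduplicates keys that alias the same column, e.g. when `target_activity_key` has presumably been defaulted to `activity_key` earlier in the function (not shown in this diff). Since a set is unordered, the column order of the reduced frame is arbitrary, which is harmless because every later access is by name:

    # why list({...}) instead of a plain list: selecting with duplicate
    # labels would duplicate columns, while the set collapses them
    case_id_glue = "case:concept:name"
    activity_key = "concept:name"
    target_activity_key = activity_key              # the common default
    cols = list({case_id_glue, activity_key, target_activity_key})
    print(cols)                                     # two unique names, arbitrary order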
@@ -76,18 +88,12 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
     else:
         df = df.sort_values(case_id_glue)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if measure == "frequency":
-        df_reduced = df[{case_id_glue, activity_key, target_activity_key}]
-    else:
-        df_reduced = df[{case_id_glue, activity_key, start_timestamp_key, timestamp_key, target_activity_key}]
     # shift the dataframe by the window size, in order to couple successive rows
-    df_reduced_shifted = df_reduced.shift(-window)
+    df_shifted = df.shift(-window)
     # change the column names of the shifted dataframe
-    df_reduced_shifted.columns = [str(col) + '_2' for col in df_reduced_shifted.columns]
+    df_shifted.columns = [str(col) + '_2' for col in df_shifted.columns]
     # concatenate the two dataframes to get a unique dataframe
-    df_successive_rows = pd.concat([df_reduced, df_reduced_shifted], axis=1)
+    df_successive_rows = pd.concat([df, df_shifted], axis=1)
     # as successive rows in the sorted dataframe may belong to different case IDs, we have to restrict ourselves to
     # successive rows belonging to the same case ID
     df_successive_rows = df_successive_rows[df_successive_rows[case_id_glue] == df_successive_rows[case_id_glue + '_2']]
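Reviewer note: the shift-and-concat trick pairs each event with the event `window` rows below it; the equality filter on the case column then drops pairs that cross case boundaries. A self-contained sketch of the idea, with invented column names:

    import pandas as pd

    df = pd.DataFrame({"case": ["c1", "c1", "c2"], "act": ["A", "B", "C"]})
    shifted = df.shift(-1)                          # window = 1
    shifted.columns = [str(col) + "_2" for col in shifted.columns]
    pairs = pd.concat([df, shifted], axis=1)
    pairs = pairs[pairs["case"] == pairs["case_2"]]
    # -> a single row: A directly followed by B inside case c1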
@@ -99,8 +105,10 @@ def get_dfg_graph(df, measure="frequency", activity_key="concept:name", case_id_
         all_columns = list(all_columns - set([activity_key, target_activity_key + '_2']))

     if measure == "performance" or measure == "both":
-        # in the arc performance calculation, make sure to consider positive or null values
-        df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+        if not st_eq_ct:
+            # in the arc performance calculation, make sure to consider only positive or null values
+            df_successive_rows[start_timestamp_key + '_2'] = df_successive_rows[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+
         # calculate the difference between the timestamps of two successive events
         if business_hours:
             if worktiming is None:
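Reviewer note: the row-wise `max` clamp prevents negative arc durations in interval logs, where a successor may start before its predecessor completes. When `st_eq_ct` holds, the start and complete timestamps carry the same values, the sort already guarantees a non-negative difference, and the clamp would be a no-op, so skipping it saves a full-column operation. What the clamp does on one pair of events:

    import pandas as pd

    pair = pd.DataFrame({"start_2": pd.to_datetime(["2024-01-01 10:00"]),
                         "complete": pd.to_datetime(["2024-01-01 12:00"])})
    # lift the successor's start up to the predecessor's completion,
    # so the computed duration becomes 0 instead of -2 hours
    pair["start_2"] = pair[["start_2", "complete"]].max(axis=1)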
@@ -158,7 +166,8 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
                                 case_id_glue="case:concept:name", activity_key="concept:name",
                                 sort_caseid_required=True,
                                 sort_timestamp_along_case_id=True, reduce_dataframe=True, keep_first_following=True,
-                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR):
+                                business_hours=False, worktiming=None, weekends=None, workcalendar=constants.DEFAULT_BUSINESS_HOURS_WORKCALENDAR,
+                                event_index=constants.DEFAULT_INDEX_KEY):
     """
     Gets the partial order between events (of the same case) in a Pandas dataframe

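Reviewer note: the new `event_index` parameter (defaulting to `constants.DEFAULT_INDEX_KEY`) lets the caller hand over a dataframe that already carries an event-index column, which the function now reuses instead of always recomputing one. A hedged sketch, assuming `get_partial_order_dataframe` is in scope:

    from pm4py.util import constants

    df[constants.DEFAULT_INDEX_KEY] = range(len(df))   # precomputed event order
    po = get_partial_order_dataframe(df)               # the column is reused, not rebuilt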
@@ -191,29 +200,37 @@ def get_partial_order_dataframe(df, start_timestamp_key=None, timestamp_key="tim
     # to avoid retro-compatibility problems
     if start_timestamp_key is None:
         start_timestamp_key = xes_constants.DEFAULT_START_TIMESTAMP_KEY
+
+    if start_timestamp_key not in df:
         df[start_timestamp_key] = df[timestamp_key]

+    # to increase the speed of the approaches, reduce the dataframe to the case, activity (and possibly timestamp)
+    # columns
+    if reduce_dataframe:
+        needed_columns = {case_id_glue, activity_key, start_timestamp_key, timestamp_key}
+        if event_index in df.columns:
+            needed_columns.add(event_index)
+        needed_columns = list(needed_columns)
+        df = df[needed_columns]
+
     # to get rows belonging to same case ID together, we need to sort on case ID
     if sort_caseid_required:
         if sort_timestamp_along_case_id:
             df = df.sort_values([case_id_glue, start_timestamp_key, timestamp_key])
         else:
             df = df.sort_values(case_id_glue)
+    df.reset_index(drop=True, inplace=True)

-    # to increase the speed of the approaches reduce dataframe to case, activity (and possibly complete timestamp)
-    # columns
-    if reduce_dataframe:
-        df = df[[case_id_glue, activity_key, start_timestamp_key, timestamp_key]]
+    if event_index not in df.columns:
+        df[event_index] = df.index

-    df = pandas_utils.insert_index(df)
-    df = df.set_index(case_id_glue)
-    df_copy = df.copy()
+    df.set_index(case_id_glue, inplace=True)

-    df = df.join(df_copy, rsuffix="_2").dropna()
-    df = df[df[constants.DEFAULT_INDEX_KEY] < df[constants.DEFAULT_INDEX_KEY + "_2"]]
-    df[start_timestamp_key + '_2'] = df[[start_timestamp_key + '_2', timestamp_key]].max(axis=1)
+    df = df.join(df, rsuffix="_2")
+    df = df[df[event_index] < df[event_index + "_2"]]
+    df = df[df[timestamp_key] <= df[start_timestamp_key + '_2']]

-    df = df.reset_index()
+    df.reset_index(inplace=True)

     if business_hours:
         if worktiming is None:
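Reviewer note: the rewritten core replaces the old clamp-based computation with a genuine happens-before filter. After the self-join on the case index, a pair (e1, e2) survives only if e1 comes earlier in the event order and e1's completion does not come after e2's start; the previous `.dropna()` after the join is gone as well. A compact sketch of the same three steps, with invented column names:

    import pandas as pd

    df = pd.DataFrame({"case": ["c1", "c1"], "idx": [0, 1],
                       "start": pd.to_datetime(["2024-01-01 10:00", "2024-01-01 11:00"]),
                       "complete": pd.to_datetime(["2024-01-01 10:30", "2024-01-01 11:30"])})
    j = df.set_index("case").join(df.set_index("case"), rsuffix="_2")
    j = j[j["idx"] < j["idx_2"]]                    # e1 occurs before e2 in the log order
    j = j[j["complete"] <= j["start_2"]]            # e1 finishes before (or when) e2 starts
    # -> exactly one ordered pair for case c1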