Skip to content

Commit 8b32573

Browse files
Merge remote-tracking branch 'upstream/hotfixes' into release
2 parents cf04b0a + b02984e commit 8b32573

File tree

1 file changed

+136 -49 lines changed
  • pm4py/algo/discovery/log_skeleton/variants

1 file changed

+136 -49 lines changed

pm4py/algo/discovery/log_skeleton/variants/classic.py

Lines changed: 136 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,11 @@
2626
from pm4py.objects.log.util import xes
2727
from pm4py.util import exec_utils
2828
from pm4py.util import variants_util, pandas_utils
29-
from pm4py.util.constants import PARAMETER_CONSTANT_ACTIVITY_KEY, PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME
29+
from pm4py.util.constants import (
30+
PARAMETER_CONSTANT_ACTIVITY_KEY,
31+
PARAMETER_CONSTANT_CASEID_KEY,
32+
CASE_CONCEPT_NAME,
33+
)
3034
from typing import Optional, Dict, Any, Union
3135
from pm4py.objects.log.obj import EventLog
3236
import pandas as pd
@@ -35,11 +39,19 @@
3539
class Parameters(Enum):
3640
# parameter for the noise threshold
3741
NOISE_THRESHOLD = "noise_threshold"
38-
# considered constraints in conformance checking among: equivalence, always_after, always_before, never_together, directly_follows, activ_freq
42+
# considered constraints in conformance checking among: equivalence,
43+
# always_after, always_before, never_together, directly_follows,
44+
# activ_freq
3945
CONSIDERED_CONSTRAINTS = "considered_constraints"
4046
# default choice for conformance checking
41-
DEFAULT_CONSIDERED_CONSTRAINTS = ["equivalence", "always_after", "always_before", "never_together",
42-
"directly_follows", "activ_freq"]
47+
DEFAULT_CONSIDERED_CONSTRAINTS = [
48+
"equivalence",
49+
"always_after",
50+
"always_before",
51+
"never_together",
52+
"directly_follows",
53+
"activ_freq",
54+
]
4355
CASE_ID_KEY = PARAMETER_CONSTANT_CASEID_KEY
4456
ACTIVITY_KEY = PARAMETER_CONSTANT_ACTIVITY_KEY
4557
PARAMETER_VARIANT_DELIMITER = "variant_delimiter"
@@ -85,7 +97,11 @@ def equivalence(logs_traces, all_activs, noise_threshold=0):
8597
for k in rs:
8698
rs[k] = rs[k] * logs_traces[trace]
8799
ret0 += rs
88-
ret = set(x for x, y in ret0.items() if y >= all_activs[x[0]] * (1.0 - noise_threshold))
100+
ret = set(
101+
x
102+
for x, y in ret0.items()
103+
if y >= all_activs[x[0]] * (1.0 - noise_threshold)
104+
)
89105
return ret
90106

91107

@@ -107,17 +123,30 @@ def always_after(logs_traces, all_activs, noise_threshold=0):
107123
rel
108124
List of relations in the log
109125
"""
110-
ret0 = Counter()
111-
for trace in logs_traces:
112-
rs = Counter(trace_skel.after(list(trace)))
113-
for k in rs:
114-
rs[k] = rs[k] * logs_traces[trace]
115-
ret0 += rs
116-
first_count = Counter()
117-
for x, y in ret0.items():
118-
first_count[x[0]] += y
119-
ret = set(x for x, y in ret0.items() if y >= first_count[x[0]] * (1.0 - noise_threshold))
120-
return ret
126+
# logs_traces: Counter mapping each trace‐tuple → frequency
127+
# First, for each A, count how many traces have A at all.
128+
traces_with_A = Counter()
129+
# For each (trace_variant → freq), check if A appears in that variant.
130+
for trace_variant, freq in logs_traces.items():
131+
unique_activities = set(trace_variant)
132+
for act in unique_activities:
133+
traces_with_A[act] += freq
134+
135+
# Next, for each pair (A,B), count how many traces have B after A at least once.
136+
traces_with_A_then_B = Counter()
137+
for trace_variant, freq in logs_traces.items():
138+
# Build the set of all (A,B) such that B comes after A in this one variant
139+
after_pairs = set(trace_skel.after(list(trace_variant)))
140+
for (A,B) in after_pairs:
141+
traces_with_A_then_B[(A,B)] += freq
142+
143+
# Finally, keep only those (A,B) with
144+
# traces_with_A_then_B[(A,B)] >= traces_with_A[A] * (1 - noise_threshold)
145+
result = set()
146+
for (A,B), count_AB in traces_with_A_then_B.items():
147+
if count_AB >= traces_with_A[A] * (1 - noise_threshold):
148+
result.add((A,B))
149+
return result
121150

122151

123152
def always_before(logs_traces, all_activs, noise_threshold=0):
@@ -138,17 +167,23 @@ def always_before(logs_traces, all_activs, noise_threshold=0):
138167
rel
139168
List of relations in the log
140169
"""
141-
ret0 = Counter()
142-
for trace in logs_traces:
143-
rs = Counter(trace_skel.before(list(trace)))
144-
for k in rs:
145-
rs[k] = rs[k] * logs_traces[trace]
146-
ret0 += rs
147-
first_count = Counter()
148-
for x, y in ret0.items():
149-
first_count[x[0]] += y
150-
ret = set(x for x, y in ret0.items() if y >= first_count[x[0]] * (1.0 - noise_threshold))
151-
return ret
170+
traces_with_B = Counter()
171+
for trace_variant, freq in logs_traces.items():
172+
unique_activities = set(trace_variant)
173+
for act in unique_activities:
174+
traces_with_B[act] += freq
175+
176+
traces_with_A_then_B = Counter()
177+
for trace_variant, freq in logs_traces.items():
178+
before_pairs = set(trace_skel.before(list(trace_variant)))
179+
for (A,B) in before_pairs:
180+
traces_with_A_then_B[(A,B)] += freq
181+
182+
result = set()
183+
for (A,B), count_AB in traces_with_A_then_B.items():
184+
if count_AB >= traces_with_B[B] * (1 - noise_threshold):
185+
result.add((A,B))
186+
return result
152187

153188

154189
def never_together(logs_traces, all_activs, len_log, noise_threshold=0):
@@ -180,7 +215,11 @@ def never_together(logs_traces, all_activs, len_log, noise_threshold=0):
180215
for k in rs:
181216
rs[k] = rs[k] * logs_traces[trace]
182217
ret0 -= rs
183-
ret = set(x for x, y in ret0.items() if y >= all_activs[x[0]] * (1.0 - noise_threshold))
218+
ret = set(
219+
x
220+
for x, y in ret0.items()
221+
if y >= all_activs[x[0]] * (1.0 - noise_threshold)
222+
)
184223
return ret
185224

186225

@@ -208,7 +247,11 @@ def directly_follows(logs_traces, all_activs, noise_threshold=0):
208247
for k in rs:
209248
rs[k] = rs[k] * logs_traces[trace]
210249
ret0 += rs
211-
ret = set(x for x, y in ret0.items() if y >= all_activs[x[0]] * (1.0 - noise_threshold))
250+
ret = set(
251+
x
252+
for x, y in ret0.items()
253+
if y >= all_activs[x[0]] * (1.0 - noise_threshold)
254+
)
212255
return ret
213256

214257

@@ -244,19 +287,26 @@ def activ_freq(logs_traces, all_activs, len_log, noise_threshold=0):
244287
ret0[act] = Counter()
245288
ret0[act][rs[act]] += logs_traces[trace]
246289
for act in ret0:
247-
ret0[act] = sorted(list((x, y) for x, y in ret0[act].items()), key=lambda x: x[1], reverse=True)
290+
ret0[act] = sorted(
291+
list((x, y) for x, y in ret0[act].items()),
292+
key=lambda x: x[1],
293+
reverse=True,
294+
)
248295
added = 0
249296
i = 0
250297
while i < len(ret0[act]):
251298
added += ret0[act][i][1]
252299
if added >= (1.0 - noise_threshold) * len_log:
253-
ret0[act] = ret0[act][:min(i + 1, len(ret0[act]))]
300+
ret0[act] = ret0[act][: min(i + 1, len(ret0[act]))]
254301
i = i + 1
255302
ret[act] = set(x[0] for x in ret0[act])
256303
return ret
257304

258305

259-
def apply(log: Union[EventLog, pd.DataFrame], parameters: Optional[Dict[Union[str, Parameters], Any]] = None) -> Dict[str, Any]:
306+
def apply(
307+
log: Union[EventLog, pd.DataFrame],
308+
parameters: Optional[Dict[Union[str, Parameters], Any]] = None,
309+
) -> Dict[str, Any]:
260310
"""
261311
Discover a log skeleton from an event log
262312
@@ -277,25 +327,50 @@ def apply(log: Union[EventLog, pd.DataFrame], parameters: Optional[Dict[Union[st
277327
if parameters is None:
278328
parameters = {}
279329

280-
activity_key = exec_utils.get_param_value(Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY)
281-
noise_threshold = exec_utils.get_param_value(Parameters.NOISE_THRESHOLD, parameters, 0.0)
330+
activity_key = exec_utils.get_param_value(
331+
Parameters.ACTIVITY_KEY, parameters, xes.DEFAULT_NAME_KEY
332+
)
333+
noise_threshold = exec_utils.get_param_value(
334+
Parameters.NOISE_THRESHOLD, parameters, 0.0
335+
)
282336

283337
if type(log) is EventLog:
284338
logs_traces = Counter([tuple(y[activity_key] for y in x) for x in log])
285339
all_activs = Counter(list(y[activity_key] for x in log for y in x))
286340
elif pandas_utils.check_is_pandas_dataframe(log):
287-
case_id_key = exec_utils.get_param_value(Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME)
341+
case_id_key = exec_utils.get_param_value(
342+
Parameters.CASE_ID_KEY, parameters, CASE_CONCEPT_NAME
343+
)
288344
all_activs = log[activity_key].value_counts().to_dict()
289-
logs_traces = Counter([tuple(x) for x in log.groupby(case_id_key)[activity_key].agg(list).to_dict().values()])
345+
logs_traces = Counter(
346+
[
347+
tuple(x)
348+
for x in log.groupby(case_id_key)[activity_key]
349+
.agg(list)
350+
.to_dict()
351+
.values()
352+
]
353+
)
290354

291355
ret = {}
292-
ret[Outputs.EQUIVALENCE.value] = equivalence(logs_traces, all_activs, noise_threshold=noise_threshold)
293-
ret[Outputs.ALWAYS_AFTER.value] = always_after(logs_traces, all_activs, noise_threshold=noise_threshold)
294-
ret[Outputs.ALWAYS_BEFORE.value] = always_before(logs_traces, all_activs, noise_threshold=noise_threshold)
295-
ret[Outputs.NEVER_TOGETHER.value] = never_together(logs_traces, all_activs, len(log),
296-
noise_threshold=noise_threshold)
297-
ret[Outputs.DIRECTLY_FOLLOWS.value] = directly_follows(logs_traces, all_activs, noise_threshold=noise_threshold)
298-
ret[Outputs.ACTIV_FREQ.value] = activ_freq(logs_traces, all_activs, len(log), noise_threshold=noise_threshold)
356+
ret[Outputs.EQUIVALENCE.value] = equivalence(
357+
logs_traces, all_activs, noise_threshold=noise_threshold
358+
)
359+
ret[Outputs.ALWAYS_AFTER.value] = always_after(
360+
logs_traces, all_activs, noise_threshold=noise_threshold
361+
)
362+
ret[Outputs.ALWAYS_BEFORE.value] = always_before(
363+
logs_traces, all_activs, noise_threshold=noise_threshold
364+
)
365+
ret[Outputs.NEVER_TOGETHER.value] = never_together(
366+
logs_traces, all_activs, len(log), noise_threshold=noise_threshold
367+
)
368+
ret[Outputs.DIRECTLY_FOLLOWS.value] = directly_follows(
369+
logs_traces, all_activs, noise_threshold=noise_threshold
370+
)
371+
ret[Outputs.ACTIV_FREQ.value] = activ_freq(
372+
logs_traces, all_activs, len(log), noise_threshold=noise_threshold
373+
)
299374

300375
return ret
301376

@@ -342,11 +417,23 @@ def prepare_encode(log_skeleton):
342417
log_skeleton
343418
Log skeleton (with lists instead of sets)
344419
"""
345-
log_skeleton[Outputs.EQUIVALENCE.value] = list(log_skeleton[Outputs.EQUIVALENCE.value])
346-
log_skeleton[Outputs.ALWAYS_AFTER.value] = list(log_skeleton[Outputs.ALWAYS_AFTER.value])
347-
log_skeleton[Outputs.ALWAYS_BEFORE.value] = list(log_skeleton[Outputs.ALWAYS_BEFORE.value])
348-
log_skeleton[Outputs.NEVER_TOGETHER.value] = list(log_skeleton[Outputs.NEVER_TOGETHER.value])
349-
log_skeleton[Outputs.DIRECTLY_FOLLOWS.value] = list(log_skeleton[Outputs.DIRECTLY_FOLLOWS.value])
420+
log_skeleton[Outputs.EQUIVALENCE.value] = list(
421+
log_skeleton[Outputs.EQUIVALENCE.value]
422+
)
423+
log_skeleton[Outputs.ALWAYS_AFTER.value] = list(
424+
log_skeleton[Outputs.ALWAYS_AFTER.value]
425+
)
426+
log_skeleton[Outputs.ALWAYS_BEFORE.value] = list(
427+
log_skeleton[Outputs.ALWAYS_BEFORE.value]
428+
)
429+
log_skeleton[Outputs.NEVER_TOGETHER.value] = list(
430+
log_skeleton[Outputs.NEVER_TOGETHER.value]
431+
)
432+
log_skeleton[Outputs.DIRECTLY_FOLLOWS.value] = list(
433+
log_skeleton[Outputs.DIRECTLY_FOLLOWS.value]
434+
)
350435
for act in log_skeleton[Outputs.ACTIV_FREQ.value]:
351-
log_skeleton[Outputs.ACTIV_FREQ.value][act] = list(log_skeleton[Outputs.ACTIV_FREQ.value][act])
436+
log_skeleton[Outputs.ACTIV_FREQ.value][act] = list(
437+
log_skeleton[Outputs.ACTIV_FREQ.value][act]
438+
)
352439
return log_skeleton

0 commit comments

Comments
 (0)