Merge pull request #84 from DES-Lab/gsm-dev

Edi Muškardin · web-flow · commit b23bfe8ef5fc · 2025-03-18T11:04:05.000+01:00
Support more data formats and auto-detect
diff --git a/Examples.py b/Examples.py
@@ -1184,12 +1184,12 @@ def passive_vpa_learning_on_all_benchmark_models():
 
 def gsm_rpni():
     from aalpy import load_automaton_from_file
-    from aalpy.utils.Sampling import get_io_traces, sample_with_length_limits
+    from aalpy.utils.Sampling import get_data_from_input_sequence, sample_with_length_limits
     from aalpy.learning_algs.general_passive.GeneralizedStateMerging import run_GSM
 
     automaton = load_automaton_from_file("DotModels/car_alarm.dot", "moore")
     input_traces = sample_with_length_limits(automaton.get_input_alphabet(), 100, 20, 30)
-    traces = get_io_traces(automaton, input_traces)
+    traces = get_data_from_input_sequence(automaton, input_traces, "io_traces")
 
     learned_model = run_GSM(traces, output_behavior="moore", transition_behavior="deterministic")
     learned_model.visualize()
diff --git a/aalpy/learning_algs/general_passive/GeneralizedStateMerging.py b/aalpy/learning_algs/general_passive/GeneralizedStateMerging.py
@@ -3,7 +3,7 @@
 from typing import Dict, Tuple, Callable, List, Optional
 
 from aalpy.learning_algs.general_passive.GsmNode import GsmNode, OutputBehavior, TransitionBehavior, TransitionInfo, \
-    OutputBehaviorRange, TransitionBehaviorRange, intersection_iterator, NodeOrders, unknown_output
+    OutputBehaviorRange, TransitionBehaviorRange, intersection_iterator, NodeOrders, unknown_output, detect_data_format
 from aalpy.learning_algs.general_passive.ScoreFunctionsGSM import ScoreCalculation, hoeffding_compatibility
 
 
@@ -93,13 +93,17 @@ def compute_local_compatibility(self, a: GsmNode, b: GsmNode):
 
     # TODO: make more generic by adding the option to use a different algorithm than red blue
     #  for selecting potential merge candidates. Maybe using inheritance with abstract `run`.
-    def run(self, data, convert=True, instrumentation: Instrumentation=None, data_format="io_traces"):
+    def run(self, data, convert=True, instrumentation: Instrumentation=None, data_format=None):
         if instrumentation is None:
             instrumentation = Instrumentation()
         instrumentation.reset(self)
 
+        if data_format is None:
+            data_format = detect_data_format(data)
         if data_format == "labeled_sequences" and self.transition_behavior != "deterministic":
             raise ValueError("learning from labeled_sequences is not possible for nondeterministic systems")
+        if data_format == "traces" and self.transition_behavior == "deterministic":
+            print("learning deterministic systems from (output) traces only. this rarely makes sense. is `data_format` set correctly?")
         root = GsmNode.createPTA(data, self.output_behavior, data_format)
 
         root = self.pta_preprocessing(root)
@@ -310,7 +314,7 @@ def run_GSM(data: list, *,
             depth_first=False,
             instrumentation=None,
             convert=True,
-            data_format='io_traces',
+            data_format=None,
             ):
     """
     Performs a state merging algorithm in the red-blue framework on provided data.
diff --git a/aalpy/learning_algs/general_passive/GsmNode.py b/aalpy/learning_algs/general_passive/GsmNode.py
@@ -20,7 +20,7 @@
 TransitionBehaviorRange = ["deterministic", "nondeterministic", "stochastic"]
 
 DataFormat = str
-DataFormatRange = ["io_traces", "labeled_sequences", "tree"]
+DataFormatRange = ["io_traces", "labeled_sequences", "traces", "tree"]
 
 IOPair = Tuple[Any, Any]
 IOTrace = Sequence[IOPair]
@@ -52,6 +52,52 @@ def union_iterator(a: Dict[Key, Val], b: Dict[Key, Val], default: Val = None) ->
         yield key, a_val, b_val
 
 
+# TODO reuse in RPNI
+def detect_data_format(data, check_consistency=False, guess=False):
+    # Detects which data format `data` is given in and returns its name. The different data formats are
+    # - "tree": a tree-shaped automaton provided as a GsmNode
+    # - "io_traces": either
+    #   - Moore traces [[o, (i,o), (i,o), ...], ...]
+    #   - Mealy traces [[(i,o), (i,o), ...], ...]
+    # - "labeled_sequences": [([i, i, ...], o), ...]
+    # - "traces": [[o, o, ...], ...]
+    # check_consistency=True inspects further data points instead of returning at the first unambiguous one.
+    if isinstance(data, GsmNode):
+        if not data.is_tree():
+            raise ValueError("provided automaton is not a tree")
+        return "tree"
+
+    accepted_types = (Tuple, List)
+
+    # mapping data formats to compatibility criteria (each check is applied to a single data point)
+    check_dict = dict(
+        io_traces=lambda obj: len(obj) <= 1 or all(isinstance(o, accepted_types) and len(o) == 2 for o in obj[1:]),  # element 0 is not checked (a Moore trace starts with a bare output)
+        labeled_sequences=lambda obj: len(obj) == 2 and isinstance(obj[0], accepted_types),  # (inputs, label) pair
+    )
+    accept_dict = {k: True for k in check_dict}
+
+    if not isinstance(data, accepted_types):
+        raise ValueError("wrong input format. expected tuple or list.")
+    if len(data) == 0:
+        return "io_traces"  # nothing to inspect: empty data is reported as io_traces
+
+    accepted_formats = list(accept_dict.keys())
+    for data_point in data:
+        if not isinstance(data_point, accepted_types):
+            raise ValueError("wrong input format. expected tuple or list.")
+        for k, check in check_dict.items():
+            accept_dict[k] &= check(data_point)  # a format stays a candidate only while every data point so far passes
+        accepted_formats = [k for k, v in accept_dict.items() if v]
+        if len(accepted_formats) == 1 and not check_consistency:
+            return accepted_formats[0]
+        if len(accepted_formats) == 0:
+            return "traces" # no candidate left: default to traces
+            #raise ValueError("invalid or inconsistent data. no options left")
+    if len(accepted_formats) != 1 and not guess:  # with guess=True the first remaining candidate is returned instead
+        raise ValueError("ambiguous data format. data format needs to be specified explicitly.")
+    return accepted_formats[0]
+
+
 # TODO maybe split this for maintainability (and perfomance?)
 class TransitionInfo:
     __slots__ = ["target", "count", "original_target", "original_count"]
@@ -379,6 +425,8 @@ def add_labeled_sequence(self, example: IOExample):
 
     @staticmethod
     def createPTA(data, output_behavior, data_format=None) -> 'GsmNode':
+        if data_format is None:
+            data_format = detect_data_format(data)
         if data_format not in DataFormatRange:
             raise ValueError(f"invalid data format {data_format}. should be in {DataFormatRange}")
 
@@ -388,12 +436,14 @@ def createPTA(data, output_behavior, data_format=None) -> 'GsmNode':
         if data_format == "labeled_sequences":
             for example in data:
                 root_node.add_labeled_sequence(example)
-        if data_format == "io_traces":
+        if data_format == "io_traces" or data_format == "traces":
             if output_behavior == "moore":
                 initial_output = data[0][0]
                 root_node.prefix_access_pair = (None, initial_output)
                 data = (d[1:] for d in data)
             for trace in data:
+                if data_format == "traces":
+                    trace = (("step", t) for t in trace)
                 root_node.add_trace(trace)
         return root_node
 
diff --git a/aalpy/utils/HelperFunctions.py b/aalpy/utils/HelperFunctions.py
@@ -3,6 +3,8 @@
 from itertools import product
 from collections import defaultdict
 
+from aalpy import Mdp, MarkovChain, McState, MooreMachine, Dfa, DfaState
+
 
 def extend_set(list_to_extend: list, new_elements: list) -> list:
     """
@@ -409,9 +411,7 @@ def product_with_possible_empty_iterable(*iterables, repeat=1):
     return product(*non_empty_iterables, repeat=repeat)
 
 
-def dfa_from_moore(moore_model):
-    from aalpy.automata import Dfa, DfaState
-
+def dfa_from_moore(moore_model: MooreMachine) -> Dfa:
     dfa_state_map = dict()
     # define states
     for moore_state in moore_model.states:
@@ -430,3 +430,20 @@ def dfa_from_moore(moore_model):
 
     initial_state = dfa_state_map[moore_model.initial_state.state_id]
     return Dfa(initial_state, list(dfa_state_map.values()))
+
+def mc_from_mdp(mdp: Mdp, input_symbol=None) -> MarkovChain:
+    """Convert `mdp` to a Markov chain using only its transitions for `input_symbol` (default: the MDP's single input)."""
+    alphabet = mdp.get_input_alphabet()
+    if len(alphabet) != 1 and input_symbol is None:
+        raise ValueError('Cannot convert MDP with several inputs to Markov chain.')
+    # test against None explicitly: `or` would silently replace valid falsy symbols such as 0 or ''
+    input_symbol = alphabet[0] if input_symbol is None else input_symbol
+    # one Markov chain state per MDP state, keeping id and output
+    state_map = {state.state_id: McState(state.state_id, state.output) for state in mdp.states}
+    for state in mdp.states:
+        mdp_transitions = state.transitions.get(input_symbol)
+        if mdp_transitions is None:  # state has no entry for this input: leave its chain state's transitions untouched
+            continue
+        mc_transitions = [(state_map[mdp_target.state_id], prob) for mdp_target, prob in mdp_transitions]
+        state_map[state.state_id].transitions = mc_transitions
+    return MarkovChain(state_map[mdp.initial_state.state_id], list(state_map.values()))
diff --git a/aalpy/utils/Sampling.py b/aalpy/utils/Sampling.py
@@ -18,6 +18,31 @@ def get_io_traces(automaton: Automaton, input_traces: list) -> list:
     return traces
 
 
+def get_labeled_sequences(automaton: Automaton, input_traces: list) -> list:
+    """Pair every input sequence with the last output `automaton` produces when executing it from the initial state."""
+    has_state_outputs = isinstance(automaton, (MooreMachine, Dfa, NDMooreMachine, Mdp, MarkovChain))
+    labeled = []
+    for inputs in input_traces:
+        if len(inputs) > 0:
+            label = automaton.execute_sequence(automaton.initial_state, inputs)[-1]
+        elif has_state_outputs:
+            # the empty sequence is labeled with the output of the initial state
+            label = automaton.initial_state.output
+        else:
+            raise ValueError("tried to get label of empty sequence for Mealy automaton.")
+        labeled.append((inputs, label))
+    return labeled
+
+
+def get_data_from_input_sequence(automaton: Automaton, input_sequence: list, data_format: str = "io_sequences"):
+    """Run the input sequences in `input_sequence` on `automaton` and return the observations in `data_format`."""
+    if data_format in ("io_sequences", "io_traces"):  # "io_traces" is the name the GSM code uses; both are accepted
+        return get_io_traces(automaton, input_sequence)
+    if data_format == "labeled_sequences":
+        return get_labeled_sequences(automaton, input_sequence)
+    raise ValueError(f"invalid data_format {data_format}. must be 'io_sequences', 'io_traces' or 'labeled_sequences'")
+
+
 def support_automaton_arg(require_transform):
     def decorator(f):
         @wraps(f)