Quickfix of RecursiveDialogueSampler

NotBioWaste905 · NotBioWaste905 · commit a4b537fdcdf6 · 2025-01-23T11:45:10.000+03:00
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/dialogue_generation.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/algorithms/dialogue_generation.py
@@ -71,28 +71,28 @@ def invoke(self, graph: BaseGraph, start_node: int = 1, end_node: int = -1, topi
 
         return all_dialogues
 
-    async def ainvoke(self, *args, **kwargs): 
+    async def ainvoke(self, *args, **kwargs):
         return self.invoke(*args, **kwargs)
 
 
 @AlgorithmRegistry.register(input_type=BaseGraph, output_type=Dialogue)
 class DialoguePathSampler(DialogueGenerator):
     def invoke(self, graph: BaseGraph, start_node: int = 1, end_node: int = -1, topic="") -> list[Dialogue]:
         nx_graph = graph.graph
-        
+
         # Find all nodes with no outgoing edges (end nodes)
         end_nodes = [node for node in nx_graph.nodes() if nx_graph.out_degree(node) == 0]
         dialogues = []
         # If no end nodes found, return empty list
         if not end_nodes:
             return []
-            
+
         all_paths = []
         # Get paths from start node to each end node
         for end in end_nodes:
             paths = list(nx.all_simple_paths(nx_graph, source=start_node, target=end))
             all_paths.extend(paths)
-        
+
         for path in all_paths:
             dialogue_turns = []
             # Process each node and edge in the path
@@ -101,59 +101,69 @@ def invoke(self, graph: BaseGraph, start_node: int = 1, end_node: int = -1, topi
                 current_node = path[i]
                 assistant_utterance = random.choice(nx_graph.nodes[current_node]["utterances"])
                 dialogue_turns.append({"text": assistant_utterance, "participant": "assistant"})
-                
+
                 # Add user utterance from edge (if not at last node)
                 if i < len(path) - 1:
                     next_node = path[i + 1]
                     edge_data = nx_graph.edges[current_node, next_node]
-                    user_utterance = (
-                        random.choice(edge_data["utterances"])
-                        if isinstance(edge_data["utterances"], list)
-                        else edge_data["utterances"]
-                    )
+                    user_utterance = random.choice(edge_data["utterances"]) if isinstance(edge_data["utterances"], list) else edge_data["utterances"]
                     dialogue_turns.append({"text": user_utterance, "participant": "user"})
-            
+
             dialogues.append(Dialogue().from_list(dialogue_turns))
-        
+
         return dialogues
-        
+
     async def ainvoke(self, *args, **kwargs):
         return self.invoke(*args, **kwargs)
-    
+
 
 @AlgorithmRegistry.register(input_type=BaseGraph, output_type=Dialogue)
 class RecursiveDialogueSampler(DialogueGenerator):
     def _list_in(self, a: list, b: list) -> bool:
         """Check if sequence a exists within sequence b."""
-        return any(map(lambda x: b[x:x + len(a)] == a, range(len(b) - len(a) + 1)))
-
-    
+        return any(map(lambda x: b[x : x + len(a)] == a, range(len(b) - len(a) + 1)))
 
     def invoke(self, graph: BaseGraph, start_node: int = 1, end_node: int = -1, topic="") -> list[Dialogue]:
         starts = [n for n in graph.graph_dict.get("nodes") if n["is_start"]]
         visitedList = [[]]
+
         def all_paths(graph, start: int, visited: list):
             # print("start: ", start, len(visitedList))
-            if len(visited) < 2 or not self._list_in(visited[-2:]+[start],visited):
+            if len(visited) < 2 or not self._list_in(visited[-2:] + [start], visited):
                 visited.append(start)
                 # print("visited:", visited)
                 for edge in graph.edge_by_source(start):
 
-                # if [start,edge['target']] not in visited:           
-                    all_paths(graph, edge['target'], visited.copy())
+                    # if [start,edge['target']] not in visited:
+                    all_paths(graph, edge["target"], visited.copy())
             visitedList.append(visited)
 
-        all_paths(graph, starts[0]['id'], [])
+        all_paths(graph, starts[0]["id"], [])
         visitedList.sort()
-        final = list(k for k,_ in itertools.groupby(visitedList))[1:]
-        
-        dialogues = []
-        for nodes in final:
-            dialogues.append(Dialogue().from_nodes_ids(graph=graph, node_list=nodes))
+        final = list(k for k, _ in itertools.groupby(visitedList))[1:]
+        sources = list(set([g["source"] for g in graph.graph_dict["edges"]]))
+        ends = [g["id"] for g in graph.graph_dict["nodes"] if g["id"] not in sources]
+        node_paths = [f for f in final if f[-1] in ends]
+        full_paths = []
+        for p in node_paths:
+            # print(p)
+            path = []
+            for idx, s in enumerate(p[:-1]):
+                path.append({"participant": "assistant", "text": graph.node_by_id(s)["utterances"][0]})
+                # path.append({"user": list(set(gr.edge_by_source(s)) & set(gr.edge_by_target(p[idx+1])))[0]['utterances']})
+                sources = graph.edge_by_source(s)
+                targets = graph.edge_by_target(p[idx + 1])
+                # print("SOURCES: ", sources, s)
+                # print("TARGETS: ", targets, p[idx+1])
+                # targets = set([(e['source'],e['target']) for e in gr.edge_by_target(p[idx+1])])
+                edge = [e for e in sources if e in targets][0]
+                path.append(({"participant": "user", "text": edge["utterances"][0]}))
+            path.append({"participant": "assistant", "text": graph.node_by_id(p[-1])["utterances"][0]})
+            full_paths.append(path)
+
+        dialogues = [Dialogue().from_list(i) for i in full_paths]
 
         return dialogues
 
     async def ainvoke(self, *args, **kwargs):
         return self.invoke(*args, **kwargs)
-    
-
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/dialogue.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/dialogue.py
@@ -52,24 +52,15 @@ def from_nodes_ids(cls, graph, node_list, validate: bool = True) -> "Dialogue":
         nodes_attributes = nx.get_node_attributes(graph.graph, "utterances")
         edges_attributes = nx.get_edge_attributes(graph.graph, "utterances")
         for node in range(len(node_list)):
-            utts.append({
-                "participant": "assistant",
-                "text": nodes_attributes[node_list[node]][0]
-                })
+            utts.append({"participant": "assistant", "text": nodes_attributes[node_list[node]][0]})
             if node == len(node_list) - 1:
                 if graph.graph.has_edge(node_list[node], node_list[0]):
-                    utts.append({
-                        "participant": "user",
-                        "text": edges_attributes[(node_list[node], node_list[0])][0]})
+                    utts.append({"participant": "user", "text": edges_attributes[(node_list[node], node_list[0])][0]})
             else:
-                if graph.graph.has_edge(node_list[node], node_list[node+1]):
-                    utts.append({"participant": "user", "text": edges_attributes[(node_list[node], node_list[node+1])][0]})
-            
+                if graph.graph.has_edge(node_list[node], node_list[node + 1]):
+                    utts.append({"participant": "user", "text": edges_attributes[(node_list[node], node_list[node + 1])][0]})
+
         return cls(messages=utts, validate=validate)
-        
-        
-                
-            
 
     def to_list(self) -> List[Dict[str, str]]:
         """Converts Dialogue to a list of message dictionaries."""
@@ -97,10 +88,9 @@ def extend(self, messages: List[Union[DialogueMessage, Dict[str, str]]]) -> None
         new_messages = [msg if isinstance(msg, DialogueMessage) else DialogueMessage(**msg) for msg in messages]
         self.__validate(new_messages)
         self.messages.extend(new_messages)
-    
+
     def __validate(self, messages):
-        """Ensure that messages meets expectations.
-        """
+        """Ensure that messages meet expectations."""
         if not messages:
             return
 
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/graph.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/graph.py
@@ -36,6 +36,10 @@ def edges_by_utterance(self):
     def node_by_id(self):
         raise NotImplementedError
 
+    @abc.abstractmethod
+    def edge_by_target(self):
+        raise NotImplementedError
+
 
 class Graph(BaseGraph):
 
@@ -87,15 +91,18 @@ def visualise(self, *args, **kwargs):
         plt.show()
 
     def nodes_by_utterance(self, utterance: str) -> list[dict]:
-        return [node for node in self.graph_dict['nodes'] if utterance in node['utterances']]
-            
+        return [node for node in self.graph_dict["nodes"] if utterance in node["utterances"]]
+
     def edges_by_utterance(self, utterance: str) -> list[dict]:
-        return [edge for edge in self.graph_dict['edges'] if utterance in edge['utterances']]
-            
+        return [edge for edge in self.graph_dict["edges"] if utterance in edge["utterances"]]
+
     def node_by_id(self, id: int):
-        for node in self.graph_dict['nodes']:
-            if node['id'] == id:
+        for node in self.graph_dict["nodes"]:
+            if node["id"] == id:
                 return node
-    
+
     def edge_by_source(self, source: int):
-        return [edge for edge in self.graph_dict['edges'] if source == edge['source']]
+        return [edge for edge in self.graph_dict["edges"] if source == edge["source"]]
+
+    def edge_by_target(self, target: int):
+        return [edge for edge in self.graph_dict["edges"] if target == edge["target"]]
diff --git a/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/metrics/automatic_metrics.py b/dev_packages/chatsky_llm_autoconfig/chatsky_llm_autoconfig/metrics/automatic_metrics.py
@@ -156,8 +156,8 @@ def all_utterances_present(G: BaseGraph, dialogues: list[Dialogue]) -> bool:
     if graph_utterances.issubset(dialogue_utterances):
         return True
     else:
-        return False
-        # return graph_utterances.difference(dialogue_utterances)
+        # return False
+        return graph_utterances.difference(dialogue_utterances)
 
 
 def all_roles_correct(D1: Dialogue, D2: Dialogue) -> bool:
diff --git a/experiments/2025.01.13_data_check_and_sampler_debugging/sampler.ipynb b/experiments/2025.01.13_data_check_and_sampler_debugging/sampler.ipynb