Skip to content

Commit 376d1b8

Browse files
author
Yuriy Peshkichev
committed
dialog frequency
1 parent 1a0a9ea commit 376d1b8

File tree

11 files changed

+3821
-44
lines changed

11 files changed

+3821
-44
lines changed

dialog2graph/pipelines/core/dialog_sampling.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,8 @@ def remove_duplicated_paths(node_paths: list[list[int]]) -> list[list[int]]:
216216

217217

218218
def get_dialog_triplets(seq: list[list[dict]]) -> set[tuple[str]]:
219-
"""Find all dialog triplets with (source, edge, target) utterances
219+
"""Get all dialog triplets with (source, edge, target) utterances
220+
from sequence of dialogs
220221
221222
Args:
222223
seq: sequence of dialogs

dialog2graph/pipelines/core/graph.py

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from typing import Optional, Any
1313
import matplotlib.pyplot as plt
1414
import abc
15+
import colorsys
1516

1617
from dialog2graph.utils.logger import Logger
1718

@@ -252,7 +253,7 @@ def visualise_short(self, name="", *args, **kwargs):
252253
try:
253254
pos = nx.nx_agraph.pygraphviz_layout(self.graph)
254255
except ImportError as e:
255-
pos = nx.kamada_kawai_layout(self.graph)
256+
pos = nx.spring_layout(self.graph)
256257
logger.warning(
257258
f"{e}.\nInstall pygraphviz from http://pygraphviz.github.io/ .\nFalling back to default layout."
258259
)
@@ -290,29 +291,48 @@ def visualise_interactive(self, *args, **kwargs) -> gv._internal.plotting.data_s
290291

291292
"""
292293
Visualises the graph using interactive visualisation library "gravis".
294+
293295
Returns:
294296
A figure object representing the interactive graph visualization.
295297
"""
296-
graph = self.graph.copy()
297-
298-
node_labels = nx.get_node_attributes(graph, "utterances")
299-
edge_labels = nx.get_edge_attributes(graph, "utterances")
300-
301-
for edge_id in graph.edges:
302-
edge = graph.edges[edge_id]
303-
edge['label'] = len(edge_labels[edge_id])
304-
edge['hover'] = edge_labels[edge_id]
305-
306-
for node_id in graph.nodes:
307-
node = graph.nodes[node_id]
308-
node['label'] = f"{node_id}:{len(node_labels[node_id])}"
309-
node['hover'] = node_labels[node_id]
310-
298+
graph = {"graph": {}}
299+
if "frequency" in self.graph_dict["nodes"][0]:
300+
node_rgb = [colorsys.hsv_to_rgb(node["frequency"]/30, 1.0, 1.0) for node in self.graph_dict["nodes"]]
301+
node_colors = ["#%02x%02x%02x" % tuple([round(255*x) for x in rgb]) for rgb in node_rgb]
302+
node_frequency = [node["frequency"] for node in self.graph_dict["nodes"]]
303+
else:
304+
node_colors = ["#000000"]*len(self.graph_dict["nodes"])
305+
node_frequency = [0]*len(self.graph_dict["nodes"])
306+
if "frequency" in self.graph_dict["edges"][0]:
307+
edge_rgb = [colorsys.hsv_to_rgb(node["frequency"]/30, 1.0, 1.0) for node in self.graph_dict["edges"]]
308+
edge_colors = ["#%02x%02x%02x" % tuple([round(255*x) for x in rgb]) for rgb in edge_rgb]
309+
edge_frequency = [edge["frequency"] for edge in self.graph_dict["edges"]]
310+
else:
311+
edge_colors = ["#000000"]*len(self.graph_dict["edges"])
312+
edge_frequency = [0]*len(self.graph_dict["edges"])
313+
314+
graph["graph"]["nodes"] = {
315+
str(node["id"]): {
316+
"label": f"{node['id']}:{len(node['utterances'])}",
317+
"metadata": {
318+
"hover": f"frequency: {node_frequency[idx]}\n" + '\n'.join([str(i+1)+": "+ node["utterances"][i] for i in range(len(node["utterances"]))]),
319+
"color": node_colors[idx]
320+
}
321+
} for idx, node in enumerate(self.graph_dict["nodes"])
322+
}
323+
graph["graph"]["edges"] = [{"source": str(e["source"]),
324+
"target": str(e["target"]),
325+
"label": len(e["utterances"]),
326+
"metadata": {
327+
"hover": f"frequency: {edge_frequency[idx]}\n" + '\n'.join([str(i+1)+": "+ e["utterances"][i] for i in range(len(e["utterances"]))]),
328+
"color": edge_colors[idx]
329+
}
330+
} for idx, e in enumerate(self.graph_dict["edges"])]
311331
return gv.vis(
312332
graph, show_node_label=True, show_edge_label=True,
313333
node_label_data_source='label',
314334
edge_label_data_source='label', edge_label_size_factor=1.7,
315-
layout_algorithm="hierarchicalRepulsion"
335+
layout_algorithm="hierarchicalRepulsion",
316336
)
317337

318338

dialog2graph/utils/dg_helper.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,19 @@ def connect_nodes(
2828
"""
2929
edges = []
3030
node_store = NodeStore(nodes, utt_sim)
31+
for idx in range(len(nodes)):
32+
nodes[idx]["frequency"] = 0
3133
for dialog in dialogs:
3234
turns = dialog.to_list()
3335
dialog_store = DialogStore(turns, utt_sim)
3436
for node in nodes:
3537
for utt in node["utterances"]:
36-
ids = dialog_store.search_assistant(utt)
38+
ids = dialog_store.search_store(
39+
dialog_store.assistant_store,
40+
dialog_store.assistant_size,
41+
utt
42+
)
43+
node["frequency"] += len(ids)
3744
if ids:
3845
for id, user_utt in zip(ids, dialog_store.get_user_by_id(ids=ids)):
3946
if len(turns) > 2 * (int(id) + 1):
@@ -66,6 +73,7 @@ def connect_nodes(
6673
"utterances"
6774
]
6875
+ [user_utt],
76+
"frequency": 0
6977
}
7078
)
7179
else:
@@ -74,8 +82,17 @@ def connect_nodes(
7482
"source": node["id"],
7583
"target": target,
7684
"utterances": [user_utt],
85+
"frequency": 0,
7786
}
7887
)
88+
for edge in edges:
89+
for utt in edge["utterances"]:
90+
ids = dialog_store.search_store(
91+
dialog_store.user_store,
92+
dialog_store.user_size,
93+
utt
94+
)
95+
edge["frequency"] += len(ids)
7996
return {"edges": edges, "nodes": nodes}
8097

8198

dialog2graph/utils/vector_stores.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,17 @@ class DialogStore:
1717
User and assistant utterances vectorized separately
1818
1919
Attributes:
20-
_assistant_store: store for assistant utterances
21-
_user_store: store for user utterances
22-
_assistant_size: number of assistant utterances
20+
assistant_store: store for assistant utterances
21+
user_store: store for user utterances
22+
assistant_size: number of assistant utterances
23+
user_size: number of user utterances
2324
_score_threshold: similarity threshold
2425
"""
2526

26-
_assistant_store: Chroma
27-
_user_store: Chroma
28-
_assistant_size: int
27+
assistant_store: Chroma
28+
user_store: Chroma
29+
assistant_size: int
30+
user_size: int
2931
_score_threshold: int
3032

3133
def _load_dialog(
@@ -39,10 +41,10 @@ def _load_dialog(
3941
dialog: list of dicts in a form {"participant": "user" or "assistant", "text": text}
4042
embedder: embedding function for vector store
4143
"""
42-
self._assistant_store = Chroma(
44+
self.assistant_store = Chroma(
4345
collection_name=str(uuid.uuid4()), embedding_function=embedder
4446
)
45-
self._user_store = Chroma(
47+
self.user_store = Chroma(
4648
collection_name=str(uuid.uuid4()), embedding_function=embedder
4749
)
4850
assistant_docs = [
@@ -53,11 +55,12 @@ def _load_dialog(
5355
]
5456
user_docs = [
5557
Document(page_content=turn["text"].lower(), id=id, metadata={"id": id})
56-
for id, turn in enumerate(d for d in dialog if d["participant"] == "user")
58+
for id, turn in enumerate([d for d in dialog if d["participant"] == "user"])
5759
]
58-
self._assistant_size = len(assistant_docs)
59-
self._assistant_store.add_documents(documents=assistant_docs)
60-
self._user_store.add_documents(documents=user_docs)
60+
self.assistant_size = len(assistant_docs)
61+
self.user_size = len(user_docs)
62+
self.assistant_store.add_documents(documents=assistant_docs)
63+
self.user_store.add_documents(documents=user_docs)
6164

6265
def __init__(
6366
self,
@@ -75,17 +78,20 @@ def __init__(
7578
self._score_threshold = score_threshold
7679
self._load_dialog(dialog, embedder)
7780

78-
def search_assistant(self, utterance) -> list[str]:
79-
"""Search for utterance over assistant store
81+
def search_store(self, store: Chroma, size: int, utterance: str) -> list[str]:
82+
"""Search for utterance over store
8083
8184
Args:
85+
store: Chroma store
86+
size: size of the store
8287
utterance: utterance to search for
8388
Returns:
84-
list of found documents ids of assistant store
89+
list of found documents ids
8590
"""
86-
docs = self._assistant_store.similarity_search_with_relevance_scores(
91+
92+
docs = store.similarity_search_with_relevance_scores(
8793
utterance.lower(),
88-
k=self._assistant_size,
94+
k=size,
8995
score_threshold=self._score_threshold,
9096
)
9197
res = [d[0].metadata["id"] for d in docs]
@@ -94,6 +100,7 @@ def search_assistant(self, utterance) -> list[str]:
94100

95101
return res
96102

103+
97104
def get_user_by_id(self, ids: list[str]) -> list[str]:
98105
"""Get utterances of user with ids
99106
@@ -102,7 +109,7 @@ def get_user_by_id(self, ids: list[str]) -> list[str]:
102109
Returns:
103110
list of utterances
104111
"""
105-
res = self._user_store.get(ids=ids)["documents"]
112+
res = self.user_store.get(ids=ids)["documents"]
106113
return res
107114

108115

experiments/exp2025_03_20_d2g_pipeline/exp2025_03_20_d2g_pipeline/test_pipeline.ipynb

Lines changed: 3678 additions & 7 deletions
Large diffs are not rendered by default.

scripts/check_metrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ def test_d2g_pipeline(pipeline: BasePipeline) -> bool:
113113
# Parse the raw data
114114
raw_data = PipelineRawDataType(dialogs=dialogs, true_graph=graph)
115115
report = pipeline.invoke(raw_data, enable_evals=True)[1].model_dump()
116+
116117
# Extract the duration and similarity from the report
117118
new_summary.append(
118119
{
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[
2+
{
3+
"graph": "Responding to DMs on Instagram/Facebook.",
4+
"duration": 42.00281095504761,
5+
"similarity": 0.9939578771591187
6+
},
7+
{
8+
"graph": "average",
9+
"duration": 42.00281095504761,
10+
"similarity": 0.9939578771591187
11+
}
12+
]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[
2+
{
3+
"graph": "Responding to DMs on Instagram/Facebook.",
4+
"duration": 19.314972162246704,
5+
"similarity": 0.9247153401374817
6+
},
7+
{
8+
"graph": "average",
9+
"duration": 19.314972162246704,
10+
"similarity": 0.9247153401374817
11+
}
12+
]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[
2+
{
3+
"graph": "Responding to DMs on Instagram/Facebook.",
4+
"duration": 19.48843026161194,
5+
"similarity": 0.9247153401374817
6+
},
7+
{
8+
"graph": "average",
9+
"duration": 19.48843026161194,
10+
"similarity": 0.9247153401374817
11+
}
12+
]
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[
2+
{
3+
"graph": "Responding to DMs on Instagram/Facebook.",
4+
"duration": 21.439910411834717,
5+
"similarity": 0.9247153401374817
6+
},
7+
{
8+
"graph": "average",
9+
"duration": 21.439910411834717,
10+
"similarity": 0.9247153401374817
11+
}
12+
]

0 commit comments

Comments
 (0)