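[bugfix][hop_infos speed up] fix func_timer decorator; speed up get_hop_infos

- func_timer: accept the decorated function as its argument
- get_hop_infos: lower hop_max from 10 to 8, skip already-seen nodes when merging, and only expand from end nodes of paths whose length is >= the current hop
- get_intents: raise early when rootid is None or empty
- text2EKG prompt: restructure into context/goal/style/tone/audience/response sections and add two more examples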

lightislost · lightislost · commit b0845f792567 · 2024-08-20T15:51:54.000+08:00
diff --git a/muagent/base_configs/prompts/simple_prompts.py b/muagent/base_configs/prompts/simple_prompts.py
@@ -207,45 +207,83 @@
 
 
 
-text2EKG_prompt_en = '''你是一个结构化信息抽取的专家，你需要根据输入的文档，抽取其中的关键节点及节点间的连接顺序。请用json结构返回。
-
-json结构定义如下:
+text2EKG_prompt_en = '''# 上下文 #
+给定一个关于某个描述流程或者步骤的输入文本，我需要根据给定的输入文本，得到输入文本中流程或者操作步骤的结构化表示，之后可以用来在其它程序中绘制流程图。
+你是一个结构化信息抽取和总结的专家，你可以根据输入的流程相关描述文本，抽取其中的关键节点及节点间的连接顺序，生成流程或者步骤的结构化json表示。
+#############
+# 目标 #
+我希望你根据输入文本，提供一个输入文本中流程、操作的结构化json表示。可以参考以下步骤思考，但是不要输出每个步骤中间结果，只输出最后的流程图json：
+1. 确定流程图节点: 根据输入文本内容，确定流程图的各个节点。节点可以用如下结构表示：
 {
   "nodes": {
     "节点序号": {
       "type": "节点类型",
       "content": "节点内容"
-    }
-  },
-  "edges": [
-    {
-      "start": "起始节点序号",
-      "end": "终止节点序号"
-    }
-  ]
+    },
+  }
 }
-其中 nodes 用来存放抽取的节点，每个 node 的 key 通过从0开始对递增序列表示，value 是一个字典，包含 type 和 content 两个属性， type 对应下面定义的三种节点类型，content 为抽取的节点内容。
-edges 用来存放节点间的连接顺序，它是一个列表，每个元素是一个字典，包含 start 和 end 两个属性， start 为起始 node 的 节点序号, end 为结束 node 的 节点序号。
-
+其中 nodes 用来存放抽取的节点，每个 node 的 key 通过从0开始对递增序列表示，value 是一个字典，包含 type 和 content 两个属性， type 对应下面定义的四种节点类型，content 为抽取的节点内容。
 节点类型定义如下:
 Schedule:
-  表示整篇输入文档所做的事情，是对整篇输入文档的总结；
+  表示输入文本中流程和操作要完成的事情和任务，是对输入文本的意图的总结；
   第一个节点永远是Schedule节点。
 Task: 
-  表示需要执行的任务。
+  表示该节点需要执行的任务。
 Phenomenon:
   表示依据Task节点的执行结果，得到的事实结论。
   Phenomenon节点只能连接在Task节点之后。
 Analysis:
   表示依据Phenomenon节点的事实进行推断的过程；
   Analysis节点只能连接在Phenomenon节点之后。
 
-以下是一个例子：
-input: 路径：排查网络问题
+2. 连接流程图节点: 根据输入文本内容，确定流程图的各个节点的连接关系。节点之间的连接关系可以用如下结构表示：
+{
+  "edges": [
+    {
+      "start": "起始节点序号",
+      "end": "终止节点序号"
+    }
+  ]
+}
+edges 用来存放节点间的连接顺序，它是一个列表，每个元素是一个字典，包含 start 和 end 两个属性， start 为起始 node 的 节点序号, end 为结束 node 的 节点序号。
+
+3. 生成表示流程图的完整json: 将上面[确定流程图节点]和[连接流程图节点]步骤中的结果放到一个json，检查生成的流程图是否符合给定输入文本的内容，优化流程图的结构，合并相邻同类型节点，返回最终的json。
+#############
+# 风格 #
+流程图节点数尽可能少，保持流程图结构简洁，相邻同类型节点可以合并。流程图节点中的节点内容content要准确、详细、充分。
+#############
+# 语气 #
+专业，技术性
+#############
+# 受众 #
+面向提供流程文本的人员，让他们确信你生成的流程图准确表示了文本中的流程步骤。
+#############
+# 响应 #
+返回json结构定义如下:
+{
+  "nodes": {
+    "节点序号": {
+      "type": "节点类型",
+      "content": "节点内容"
+    }
+  },
+  "edges": [
+    {
+      "start": "起始节点序号",
+      "end": "终止节点序号"
+    }
+  ]
+}
+#############
+# 例子 #
+以下是几个例子：
+
+# 例子1 #
+输入文本:路径：排查网络问题
 1. 通过观察sofagw网关监控发现，BOLT失败数突增
 2. 且失败曲线与退保成功率曲线相关性较高，判定是网络问题。
 
-output: {
+输出:{
   "nodes": {
     "0": {
       "type": "Schedule",
@@ -296,10 +334,122 @@
   ]
 }
 
-请根据上述说明和例子来对以下的输入文档抽取结构化信息:
+# 例子2 #
+输入文本:二、使用模版创建选品集
+STEP1：创建选品集
+注：因为只能选择同类型模版，必须先选择数据类型，才能选择模版创建
+STEP2：按需选择模版后，点击确认
+
+- 我的收藏：个人选择收藏的模版
+- 我的创建：个人创建的模版
+- 模版广场：公开的模版，可以通过名称/创建人搜索到需要的模版并选择使用 
+
+STEP3：按需调整指标模版内的值，完成选品集创建
+
+输出:{
+  "nodes": {
+    "0": {
+      "type": "Schedule",
+      "content": "使用模版创建选品集"
+    },
+    "1": {
+      "type": "Task",
+      "content": "创建选品集\n\n注：因为只能选择同类型模版，必须先选择数据类型，才能选择模版创建"
+    },
+    "2": {
+      "type": "Task",
+      "content": "按需选择模版后，点击确认\n\n - 我的收藏：个人选择收藏的模版 \n\n - 我的创建：个人创建的模版 \n\n - 模版广场：公开的模版，可以通过名称/创建人搜索到需要的模版并选择使用"
+    },
+    "3": {
+      "type": "Task",
+      "content": "按需调整指标模版内的值，完成选品集创建"
+    }
+  },
+  "edges": [
+    {
+      "start": "0",
+      "end": "1"
+    },
+    {
+      "start": "1",
+      "end": "2"
+    },
+    {
+      "start": "2",
+      "end": "3"
+    }
+  ]
+}
+
+# 例子3 #
+输入文本:Step1
+
+- 点击右侧的左右切换箭头，找到自己所在的站点或业务模块;
+Step2
+
+- 查询对应一级场景，若没有所需一级场景则联系 [@小明][@小红]添加：具体操作如下：
+- 邮件模板
+| 项目背景：<br />场景名称：<br />场景描述：<br />数据类型：商家/商品/营销商家/营销商品/权益商家/权益选品（必要的才选）<br />业务管理员：（花名） |
+| --- |
+
+- 发送邮件[@小明][@小红]抄送 [@小白]
+Step3
+
+- 查询对应二级场景，若没有所需二级场景则联系一级场景管理员添加，支持通过搜索二级场景名称和ID快速查询二级场景；
+- 一级管理员为下图蓝色框①所在位置查看
+Step4
+
+- 申请二级场景数据权限，由对应二级场景管理员审批。若二级场景管理员为@小花@小映，按一级场景走申请流程。
+- 二级管理员为下图蓝色框②所在位置查看
+
+输出:{
+  "nodes": {
+    "0": {
+      "type": "Schedule",
+      "content": "场景权限申请"
+    },
+    "1": {
+      "type": "Task",
+      "content": "点击右侧的左右切换箭头，找到自己所在的站点或业务模块"
+    },
+    "2": {
+      "type": "Task",
+      "content": "查询对应一级场景，若没有所需一级场景则联系 [@小明][@小红]添加：具体操作如下：\n 发送邮件给小明和小红，抄送小白，邮件内容包括项目背景，场景名称，场景描述，数据类型和业务管理员"
+    },
+    "3": {
+      "type": "Task",
+      "content": "查询对应二级场景，若没有所需二级场景则联系一级场景管理员添加，支持通过搜索二级场景名称和ID快速查询二级场景"
+    },
+    "4": {
+      "type": "Task",
+      "content": "申请二级场景数据权限，由对应二级场景管理员审批。若二级场景管理员为@小花@小映，按一级场景走申请流程"
+    }
+  },
+  "edges": [
+    {
+      "start": "0",
+      "end": "1"
+    },
+    {
+      "start": "1",
+      "end": "2"
+    },
+    {
+      "start": "2",
+      "end": "3"
+    },
+    {
+      "start": "3",
+      "end": "4"
+    }
+  ]
+}
+#############
+# 开始抽取 #
+请根据上述说明和例子来对以下的输入文本抽取结构化流程json:
 
-input: {text}
+输入文本:{text}
 
-output:'''
+输出:'''
 
 text2EKG_prompt_zh = text2EKG_prompt_en
diff --git a/muagent/db_handler/graph_db_handler/geabase_handler.py b/muagent/db_handler/graph_db_handler/geabase_handler.py
@@ -13,7 +13,7 @@
 from muagent.db_handler.utils import deduplicate_dict
 from muagent.schemas.db import GBConfig
 from muagent.schemas.common import *
-from muagent.utils.common_utils import double_hashing
+from muagent.utils.common_utils import double_hashing, func_timer
 
 
 class GeaBaseHandler(GBHandler):
@@ -142,7 +142,6 @@ def get_current_nodeID(self, attributes: dict, node_type: str) -> int:
     def get_current_edgeID(self, src_id, dst_id, edeg_type:str = None):
         if not isinstance(src_id, int) or not isinstance(dst_id, int):
             result = self.get_current_edge(src_id, dst_id, edeg_type)
-            logger.debug(f"{result}")
             return result.attributes.get("SRCID"), result.attributes.get("DSTID"), result.attributes.get("timestamp")
         else:
             return src_id, dst_id, 1
@@ -225,7 +224,7 @@ def get_hop_infos(self, attributes: dict, node_type: str = None, hop: int = 2, b
         '''
         hop >= 2， 表面需要至少两跳
         '''
-        hop_max = 10
+        hop_max = 8
         # 
         where_str = ' and '.join([f"n0.{k}='{v}'" for k, v in attributes.items()])
         if reverse:
@@ -235,6 +234,7 @@ def get_hop_infos(self, attributes: dict, node_type: str = None, hop: int = 2, b
         last_node_ids, last_node_types = [], []
 
         result = {}
+        iter_index = 0
         while hop > 1:
             if last_node_ids == []:
                 # 
@@ -247,11 +247,13 @@ def get_hop_infos(self, attributes: dict, node_type: str = None, hop: int = 2, b
                     # 
                     _result = self.execute(gql)
                     _result = self.decode_result(_result, gql)
+                    # logger.info(f"p_lens: {len(_result['p'])}")
         
                     result = self.merge_hotinfos(result, _result)
             # 
-            last_node_ids, last_node_types, result = self.deduplicate_paths(result, block_attributes, select_attributes)
+            last_node_ids, last_node_types, result = self.deduplicate_paths(result, block_attributes, select_attributes, hop=min(hop, hop_max)+iter_index*hop_max)
             hop -= hop_max
+            iter_index += 1
 
         nodes = self.convert2GNodes(result.get("n1", []))
         edges = self.convert2GEdges(result.get("e", []))
@@ -274,7 +276,7 @@ def get_hop_paths(self, attributes: dict, node_type: str = None, hop: int = 2, b
         result = self.get_hop_infos(attributes, node_type, hop, block_attributes)
         return result.paths
 
-    def deduplicate_paths(self, result, block_attributes: dict = {}, select_attributes: dict = {}):
+    def deduplicate_paths(self, result, block_attributes: dict = {}, select_attributes: dict = {}, hop:int=None):
         # 获取数据
         n0, n1, e, p = result["n0"], result["n1"], result["e"], result["p"]
         block_node_ids = [
@@ -308,24 +310,26 @@ def deduplicate_paths(self, result, block_attributes: dict = {}, select_attribut
         # 根据保留路径进行合并
         nodeid2type = {i["id"]: i["type"] for i in n0+n1}
         unique_node_ids = [j for i in new_p for j in i]
-        last_node_ids = [i[-1] for i in new_p]
+        last_node_ids = list(set([i[-1] for i in new_p if len(i)>=hop]))
         last_node_types = [nodeid2type[i] for i in last_node_ids]
         new_n0 = deduplicate_dict([i for i in n0 if i["id"] in unique_node_ids])
         new_n1 = deduplicate_dict([i for i in n1 if i["id"] in unique_node_ids])
         new_e = deduplicate_dict([i for i in e if i["start_id"] in unique_node_ids and i["end_id"] in unique_node_ids])
 
         return last_node_ids, last_node_types, {"n0": new_n0, "n1": new_n1, "e": new_e, "p": new_p}
-        
+
     def merge_hotinfos(self, result1, result2) -> Dict:
-        new_n0 = result1["n0"] + result2["n0"]
-        new_n1 = result1["n1"] + result2["n1"]
+        old_n0_sets = set([n["id"] for n in result1["n0"]])
+        old_n1_sets = set([n["id"] for n in result1["n1"]])
+        new_n0 = result1["n0"] + [n for n in result2["n0"] if n["id"] not in old_n0_sets]
+        new_n1 = result1["n1"] + [n for n in result2["n1"] if n["id"] not in old_n1_sets]
         new_e = result1["e"] + result2["e"]
-        new_p = result1["p"] + result2["p"] + [
+        new_p = result1["p"] + [
             p_old_1 + p_old_2[1:] 
             for p_old_1 in result1["p"] 
             for p_old_2 in result2["p"] 
             if p_old_2[0] == p_old_1[-1]
-        ]
+        ] # + result2["p"]
         new_result = {"n0": new_n0, "n1": new_n1, "e": new_e, "p": new_p}
         return new_result
     
diff --git a/muagent/service/ekg_construct/ekg_construct_base.py b/muagent/service/ekg_construct/ekg_construct_base.py
@@ -609,6 +609,9 @@ def returndsl(self, graph_datas_by_path: dict, intents: List[str], ) -> dict:
 
     def get_intents(self, rootid, text: str):
         '''according contents search intents'''
+        if rootid is None or rootid=="":
+            raise Exception(f"rootid={rootid}, it is empty")
+        
         result = self.intention_router.get_intention_by_node_info_nlp(
             root_node_id=rootid,
             query=text,
diff --git a/muagent/utils/common_utils.py b/muagent/utils/common_utils.py
@@ -38,21 +38,21 @@ def datefromatToTimestamp(dt, interval=1000, dateformat=DATE_FORMAT):
     return int(datetime.strptime(dt, dateformat).timestamp()*interval)
 
 
-def func_timer():
+def func_timer(function):
     '''
     用装饰器实现函数计时
     :param function: 需要计时的函数
     :return: None
     '''
     @wraps(function)
     def function_timer(*args, **kwargs):
+        # logger.info('[Function: {name} start...]'.format(name=function.__name__))
         t0 = time.time()
         result = function(*args, **kwargs)
         t1 = time.time()
         logger.info('[Function: {name} finished, spent time: {time:.3f}s]'.format(
             name=function.__name__,
-            time=t1 - t0
-        ))
+            time=t1 - t0))
         return result
     return function_timer