Skip to content

Commit 8fd34b2

Browse files
refactor: abstract run_concurrent & delete semaphore
1 parent 051dc77 commit 8fd34b2

File tree

4 files changed

+110
-96
lines changed

4 files changed

+110
-96
lines changed

graphgen/graphgen.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ async def insert(self):
136136

137137
inserting_chunks = await chunk_documents(
138138
new_docs,
139-
self.chunk_size,
140-
self.chunk_overlap,
139+
self.config["split"]["chunk_size"],
140+
self.config["split"]["chunk_overlap"],
141141
self.tokenizer_instance,
142142
self.progress_bar,
143143
)

graphgen/operators/build_kg/extract_kg.py

Lines changed: 69 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,8 @@
1-
import asyncio
21
import re
32
from collections import defaultdict
43
from typing import List
54

65
import gradio as gr
7-
from tqdm.asyncio import tqdm as tqdm_async
86

97
from graphgen.bases.base_storage import BaseGraphStorage
108
from graphgen.bases.datatypes import Chunk
@@ -17,6 +15,7 @@
1715
handle_single_relationship_extraction,
1816
logger,
1917
pack_history_conversations,
18+
run_concurrent,
2019
split_string_by_multi_markers,
2120
)
2221

@@ -28,115 +27,91 @@ async def extract_kg(
2827
tokenizer_instance: Tokenizer,
2928
chunks: List[Chunk],
3029
progress_bar: gr.Progress = None,
31-
max_concurrent: int = 1000,
3230
):
3331
"""
3432
:param llm_client: Synthesizer LLM model to extract entities and relationships
3533
:param kg_instance
3634
:param tokenizer_instance
3735
:param chunks
3836
:param progress_bar: Gradio progress bar to show the progress of the extraction
39-
:param max_concurrent
4037
:return:
4138
"""
4239

43-
semaphore = asyncio.Semaphore(max_concurrent)
44-
4540
async def _process_single_content(chunk: Chunk, max_loop: int = 3):
46-
async with semaphore:
47-
chunk_id = chunk.id
48-
content = chunk.content
49-
if detect_if_chinese(content):
50-
language = "Chinese"
51-
else:
52-
language = "English"
53-
KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
54-
55-
hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format(
56-
**KG_EXTRACTION_PROMPT["FORMAT"], input_text=content
41+
chunk_id = chunk.id
42+
content = chunk.content
43+
if detect_if_chinese(content):
44+
language = "Chinese"
45+
else:
46+
language = "English"
47+
KG_EXTRACTION_PROMPT["FORMAT"]["language"] = language
48+
49+
hint_prompt = KG_EXTRACTION_PROMPT[language]["TEMPLATE"].format(
50+
**KG_EXTRACTION_PROMPT["FORMAT"], input_text=content
51+
)
52+
53+
final_result = await llm_client.generate_answer(hint_prompt)
54+
logger.info("First result: %s", final_result)
55+
56+
history = pack_history_conversations(hint_prompt, final_result)
57+
for loop_index in range(max_loop):
58+
if_loop_result = await llm_client.generate_answer(
59+
text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history
60+
)
61+
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
62+
if if_loop_result != "yes":
63+
break
64+
65+
glean_result = await llm_client.generate_answer(
66+
text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
5767
)
68+
logger.info("Loop %s glean: %s", loop_index, glean_result)
5869

59-
final_result = await llm_client.generate_answer(hint_prompt)
60-
logger.info("First result: %s", final_result)
61-
62-
history = pack_history_conversations(hint_prompt, final_result)
63-
for loop_index in range(max_loop):
64-
if_loop_result = await llm_client.generate_answer(
65-
text=KG_EXTRACTION_PROMPT[language]["IF_LOOP"], history=history
66-
)
67-
if_loop_result = if_loop_result.strip().strip('"').strip("'").lower()
68-
if if_loop_result != "yes":
69-
break
70-
71-
glean_result = await llm_client.generate_answer(
72-
text=KG_EXTRACTION_PROMPT[language]["CONTINUE"], history=history
73-
)
74-
logger.info("Loop %s glean: %s", loop_index, glean_result)
75-
76-
history += pack_history_conversations(
77-
KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
78-
)
79-
final_result += glean_result
80-
if loop_index == max_loop - 1:
81-
break
82-
83-
records = split_string_by_multi_markers(
84-
final_result,
85-
[
86-
KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
87-
KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
88-
],
70+
history += pack_history_conversations(
71+
KG_EXTRACTION_PROMPT[language]["CONTINUE"], glean_result
72+
)
73+
final_result += glean_result
74+
if loop_index == max_loop - 1:
75+
break
76+
77+
records = split_string_by_multi_markers(
78+
final_result,
79+
[
80+
KG_EXTRACTION_PROMPT["FORMAT"]["record_delimiter"],
81+
KG_EXTRACTION_PROMPT["FORMAT"]["completion_delimiter"],
82+
],
83+
)
84+
85+
nodes = defaultdict(list)
86+
edges = defaultdict(list)
87+
88+
for record in records:
89+
record = re.search(r"\((.*)\)", record)
90+
if record is None:
91+
continue
92+
record = record.group(1) # 提取括号内的内容
93+
record_attributes = split_string_by_multi_markers(
94+
record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
8995
)
9096

91-
nodes = defaultdict(list)
92-
edges = defaultdict(list)
93-
94-
for record in records:
95-
record = re.search(r"\((.*)\)", record)
96-
if record is None:
97-
continue
98-
record = record.group(1) # 提取括号内的内容
99-
record_attributes = split_string_by_multi_markers(
100-
record, [KG_EXTRACTION_PROMPT["FORMAT"]["tuple_delimiter"]]
101-
)
102-
103-
entity = await handle_single_entity_extraction(
104-
record_attributes, chunk_id
105-
)
106-
if entity is not None:
107-
nodes[entity["entity_name"]].append(entity)
108-
continue
109-
relation = await handle_single_relationship_extraction(
110-
record_attributes, chunk_id
111-
)
112-
if relation is not None:
113-
edges[(relation["src_id"], relation["tgt_id"])].append(relation)
114-
return dict(nodes), dict(edges)
115-
116-
results = []
117-
chunk_number = len(chunks)
118-
async for result in tqdm_async(
119-
asyncio.as_completed([_process_single_content(c) for c in chunks]),
120-
total=len(chunks),
97+
entity = await handle_single_entity_extraction(record_attributes, chunk_id)
98+
if entity is not None:
99+
nodes[entity["entity_name"]].append(entity)
100+
continue
101+
relation = await handle_single_relationship_extraction(
102+
record_attributes, chunk_id
103+
)
104+
if relation is not None:
105+
edges[(relation["src_id"], relation["tgt_id"])].append(relation)
106+
return dict(nodes), dict(edges)
107+
108+
results = await run_concurrent(
109+
_process_single_content,
110+
chunks,
121111
desc="[2/4]Extracting entities and relationships from chunks",
122112
unit="chunk",
123-
):
124-
try:
125-
if progress_bar is not None:
126-
progress_bar(
127-
len(results) / chunk_number,
128-
desc="[3/4]Extracting entities and relationships from chunks",
129-
)
130-
results.append(await result)
131-
if progress_bar is not None and len(results) == chunk_number:
132-
progress_bar(
133-
1, desc="[3/4]Extracting entities and relationships from chunks"
134-
)
135-
except Exception as e: # pylint: disable=broad-except
136-
logger.error(
137-
"Error occurred while extracting entities and relationships from chunks: %s",
138-
e,
139-
)
113+
progress_bar=progress_bar,
114+
)
140115

141116
nodes = defaultdict(list)
142117
edges = defaultdict(list)

graphgen/utils/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,4 +13,5 @@
1313
from .help_nltk import NLTKHelper
1414
from .log import logger, parse_log, set_logger
1515
from .loop import create_event_loop
16+
from .run_concurrent import run_concurrent
1617
from .wrap import async_to_sync_method

graphgen/utils/run_concurrent.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
import asyncio
2+
from typing import Awaitable, Callable, List, Optional, TypeVar
3+
4+
import gradio as gr
5+
from tqdm.asyncio import tqdm as tqdm_async
6+
7+
from graphgen.utils.log import logger
8+
9+
T = TypeVar("T")
10+
R = TypeVar("R")
11+
12+
13+
async def run_concurrent(
    coro_fn: Callable[[T], Awaitable[R]],
    items: List[T],
    *,
    desc: str = "processing",
    unit: str = "item",
    progress_bar: Optional[gr.Progress] = None,
) -> List[R]:
    """Run ``coro_fn`` over ``items`` concurrently with a tqdm progress bar.

    Failed tasks are logged and skipped, so the returned list may be shorter
    than ``items``. Successful results are returned in the order ``gather``
    yields them.

    :param coro_fn: async callable applied to each item
    :param items: inputs to process
    :param desc: description shown on the tqdm / Gradio progress bars
    :param unit: tqdm unit label
    :param progress_bar: optional Gradio progress bar, updated once per item
    :return: results of the successful tasks only
    """

    async def _guarded(item):
        # tqdm's gather has no `return_exceptions` switch: a raising task
        # would propagate and abort the whole batch. Capture the exception
        # and return it so per-item failures are tolerated (log-and-skip).
        try:
            return await coro_fn(item)
        except Exception as exc:  # pylint: disable=broad-except
            return exc

    tasks = [asyncio.create_task(_guarded(it)) for it in items]

    results = await tqdm_async.gather(*tasks, desc=desc, unit=unit)

    ok_results: List[R] = []
    total = len(items)
    for idx, res in enumerate(results):
        if isinstance(res, Exception):
            # Not inside an `except` block, so logger.exception would log a
            # bogus "NoneType: None" traceback; log the error message instead.
            logger.error("Task failed: %s", res)
        else:
            ok_results.append(res)
        if progress_bar:
            progress_bar((idx + 1) / total, desc=desc)

    if progress_bar:
        # Ensure the bar reads complete even when `items` is empty.
        progress_bar(1.0, desc=desc)
    return ok_results

0 commit comments

Comments
 (0)