Skip to content

Commit 776cc20

Browse files
fix(webui): refine gradio progress_bar output
1 parent 24d9872 commit 776cc20

File tree

4 files changed

+51
-11
lines changed

4 files changed

+51
-11
lines changed

graphgen/graphgen.py

Lines changed: 22 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
from dataclasses import dataclass
88

99
from tqdm.asyncio import tqdm as tqdm_async
10+
import gradio as gr
1011

1112
from .models import Chunk, JsonKVStorage, OpenAIModel, NetworkXStorage, WikiSearch, Tokenizer, TraverseStrategy
1213
from .models.storage.base_storage import StorageNameSpace
@@ -39,6 +40,9 @@ class GraphGen:
3940
# traverse strategy
4041
traverse_strategy: TraverseStrategy = TraverseStrategy()
4142

43+
# webui
44+
progress_bar: gr.Progress = None
45+
4246
def __post_init__(self):
4347
self.full_docs_storage: JsonKVStorage = JsonKVStorage(
4448
self.working_dir, namespace="full_docs"
@@ -78,6 +82,9 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
7882
logger.warning("All docs are already in the storage")
7983
return {}
8084
logger.info("[New Docs] inserting %d docs", len(new_docs))
85+
86+
cur_index = 1
87+
doc_number = len(new_docs)
8188
for doc_key, doc in tqdm_async(
8289
new_docs.items(), desc="Chunking documents", unit="doc"
8390
):
@@ -89,6 +96,13 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
8996
self.chunk_overlap_size, self.chunk_size)
9097
}
9198
inserting_chunks.update(chunks)
99+
100+
if self.progress_bar is not None:
101+
self.progress_bar(
102+
cur_index / doc_number, f"Chunking {doc_key}"
103+
)
104+
cur_index += 1
105+
92106
_add_chunk_keys = await self.text_chunks_storage.filter_keys(list(inserting_chunks.keys()))
93107
inserting_chunks = {k: v for k, v in inserting_chunks.items() if k in _add_chunk_keys}
94108
elif data_type == "chunked":
@@ -141,7 +155,8 @@ async def async_insert(self, data: Union[List[list], List[dict]], data_type: str
141155
llm_client=self.synthesizer_llm_client,
142156
kg_instance=self.graph_storage,
143157
tokenizer_instance=self.tokenizer_instance,
144-
chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()]
158+
chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()],
159+
progress_bar = self.progress_bar,
145160
)
146161
if not _add_entities_and_relations:
147162
logger.warning("No entities or relations extracted")
@@ -199,16 +214,19 @@ async def async_traverse(self):
199214
self.tokenizer_instance,
200215
self.graph_storage,
201216
self.traverse_strategy,
202-
self.text_chunks_storage)
217+
self.text_chunks_storage,
218+
self.progress_bar)
203219
elif self.traverse_strategy.qa_form == "multi_hop":
204220
results = await traverse_graph_for_multi_hop(self.synthesizer_llm_client,
205221
self.tokenizer_instance,
206222
self.graph_storage,
207223
self.traverse_strategy,
208-
self.text_chunks_storage)
224+
self.text_chunks_storage,
225+
self.progress_bar)
209226
else:
210227
results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance,
211-
self.graph_storage, self.traverse_strategy, self.text_chunks_storage)
228+
self.graph_storage, self.traverse_strategy, self.text_chunks_storage,
229+
self.progress_bar)
212230
await self.qa_storage.upsert(results)
213231
await self.qa_storage.index_done_callback()
214232

graphgen/operators/extract_kg.py

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,9 @@
11
import re
22
import asyncio
3-
43
from typing import List
54
from collections import defaultdict
5+
6+
import gradio as gr
67
from tqdm.asyncio import tqdm as tqdm_async
78
from graphgen.models import Chunk, OpenAIModel, Tokenizer
89
from graphgen.models.storage.base_storage import BaseGraphStorage
@@ -13,18 +14,21 @@
1314
from graphgen.operators.merge_kg import merge_nodes, merge_edges
1415

1516

17+
# pylint: disable=too-many-statements
1618
async def extract_kg(
1719
llm_client: OpenAIModel,
1820
kg_instance: BaseGraphStorage,
1921
tokenizer_instance: Tokenizer,
2022
chunks: List[Chunk],
23+
progress_bar: gr.Progress = None,
2124
max_concurrent: int = 1000
2225
):
2326
"""
2427
:param llm_client: Synthesizer LLM model to extract entities and relationships
2528
:param kg_instance
2629
:param tokenizer_instance
2730
:param chunks
31+
:param progress_bar: Gradio progress bar to show the progress of the extraction
2832
:param max_concurrent
2933
:return:
3034
"""
@@ -98,6 +102,7 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3):
98102
return dict(nodes), dict(edges)
99103

100104
results = []
105+
chunk_number = len(chunks)
101106
for result in tqdm_async(
102107
asyncio.as_completed([_process_single_content(c) for c in chunks]),
103108
total=len(chunks),
@@ -106,6 +111,8 @@ async def _process_single_content(chunk: Chunk, max_loop: int = 3):
106111
):
107112
try:
108113
results.append(await result)
114+
if progress_bar is not None:
115+
progress_bar(len(results) / chunk_number, desc="Extracting entities and relationships from chunks")
109116
except Exception as e: # pylint: disable=broad-except
110117
logger.error("Error occurred while extracting entities and relationships from chunks: %s", e)
111118

graphgen/operators/traverse_graph.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import asyncio
2+
import gradio as gr
23

34
from tqdm.asyncio import tqdm as tqdm_async
45

@@ -167,6 +168,7 @@ async def traverse_graph_by_edge(
167168
graph_storage: NetworkXStorage,
168169
traverse_strategy: TraverseStrategy,
169170
text_chunks_storage: JsonKVStorage,
171+
progress_bar: gr.Progress = None,
170172
max_concurrent: int = 1000
171173
) -> dict:
172174
"""
@@ -177,6 +179,7 @@ async def traverse_graph_by_edge(
177179
:param graph_storage
178180
:param traverse_strategy
179181
:param text_chunks_storage
182+
:param progress_bar
180183
:param max_concurrent
181184
:return: question and answer
182185
"""
@@ -289,11 +292,13 @@ async def _process_single_batch(
289292

290293
for result in tqdm_async(asyncio.as_completed(
291294
[_process_single_batch(batch) for batch in processing_batches]
292-
), total=len(processing_batches), desc="Processing batches"):
295+
), total=len(processing_batches), desc="Generating QAs"):
293296
try:
294297
results.update(await result)
298+
if progress_bar is not None:
299+
progress_bar(len(results) / len(processing_batches), desc="Generating QAs")
295300
except Exception as e: # pylint: disable=broad-except
296-
logger.error("Error occurred while processing batches: %s", e)
301+
logger.error("Error occurred while generating QA: %s", e)
297302

298303
return results
299304

@@ -304,6 +309,7 @@ async def traverse_graph_atomically(
304309
graph_storage: NetworkXStorage,
305310
traverse_strategy: TraverseStrategy,
306311
text_chunks_storage: JsonKVStorage,
312+
progress_bar: gr.Progress = None,
307313
max_concurrent: int = 1000
308314
) -> dict:
309315
"""
@@ -314,6 +320,7 @@ async def traverse_graph_atomically(
314320
:param graph_storage
315321
:param traverse_strategy
316322
:param text_chunks_storage
323+
:param progress_bar
317324
:param max_concurrent
318325
:return: question and answer
319326
"""
@@ -391,12 +398,14 @@ async def _generate_question(
391398
for result in tqdm_async(
392399
asyncio.as_completed([_generate_question(task) for task in tasks]),
393400
total=len(tasks),
394-
desc="Generating questions"
401+
desc="Generating QAs"
395402
):
396403
try:
397404
results.update(await result)
405+
if progress_bar is not None:
406+
progress_bar(len(results) / len(tasks), desc="Generating QAs")
398407
except Exception as e: # pylint: disable=broad-except
399-
logger.error("Error occurred while generating questions: %s", e)
408+
logger.error("Error occurred while generating QA: %s", e)
400409
return results
401410

402411
async def traverse_graph_for_multi_hop(
@@ -405,6 +414,7 @@ async def traverse_graph_for_multi_hop(
405414
graph_storage: NetworkXStorage,
406415
traverse_strategy: TraverseStrategy,
407416
text_chunks_storage: JsonKVStorage,
417+
progress_bar: gr.Progress = None,
408418
max_concurrent: int = 1000
409419
) -> dict:
410420
"""
@@ -415,6 +425,7 @@ async def traverse_graph_for_multi_hop(
415425
:param graph_storage
416426
:param traverse_strategy
417427
:param text_chunks_storage
428+
:param progress_bar
418429
:param max_concurrent
419430
:return: question and answer
420431
"""
@@ -499,10 +510,12 @@ async def _process_single_batch(
499510
for result in tqdm_async(
500511
asyncio.as_completed([_process_single_batch(batch) for batch in processing_batches]),
501512
total=len(processing_batches),
502-
desc="Processing batches"
513+
desc="Generating QAs"
503514
):
504515
try:
505516
results.update(await result)
517+
if progress_bar is not None:
518+
progress_bar(len(results) / len(processing_batches), desc="Generating QAs")
506519
except Exception as e: # pylint: disable=broad-except
507-
logger.error("Error occurred while processing batches: %s", e)
520+
logger.error("Error occurred while generating QA: %s", e)
508521
return results

webui/app.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -100,6 +100,8 @@ def run_graphgen(*arguments: list, progress=gr.Progress()):
100100
graph_gen.clear()
101101
progress(0.2, "Model Initialized")
102102

103+
graph_gen.progress_bar = progress
104+
103105
try:
104106
# Load input data
105107
file = config['input_file']

0 commit comments

Comments (0)