
Commit 49b8a82

[pdf2vqa] One operator per file
1 parent 8427fbd commit 49b8a82

File tree

5 files changed: +140 additions, -129 deletions


dataflow/operators/pdf2vqa/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -1,7 +1,9 @@
 from typing import TYPE_CHECKING
 
 if TYPE_CHECKING:
-    from .generate.pdf2vqa_formatter import MinerU2LLMInputOperator, LLMOutputParser, QA_Merger
+    from .generate.mineru_to_llm_input_operator import MinerU2LLMInputOperator
+    from .generate.llm_output_parser import LLMOutputParser
+    from .generate.qa_merger import QA_Merger
 
 
 else:
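Only the type-checking branch is touched here; the `else:` branch that resolves these names at runtime lies outside the hunk. For orientation, a minimal sketch of the usual lazy-import pattern such an `__init__.py` tends to use. The `_MODULE_MAP` and module-level `__getattr__` below are illustrative assumptions, not this repository's actual code, which may instead route lookups through `OPERATOR_REGISTRY`:

# Sketch of a lazy-loading __init__.py (PEP 562 module __getattr__); illustrative only.
import importlib
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from .generate.mineru_to_llm_input_operator import MinerU2LLMInputOperator
    from .generate.llm_output_parser import LLMOutputParser
    from .generate.qa_merger import QA_Merger
else:
    # Hypothetical mapping from exported name to the submodule that defines it.
    _MODULE_MAP = {
        "MinerU2LLMInputOperator": ".generate.mineru_to_llm_input_operator",
        "LLMOutputParser": ".generate.llm_output_parser",
        "QA_Merger": ".generate.qa_merger",
    }

    def __getattr__(name):
        # Import the operator's module only when the name is first accessed.
        if name in _MODULE_MAP:
            module = importlib.import_module(_MODULE_MAP[name], __package__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")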

dataflow/operators/pdf2vqa/generate/pdf2vqa_formatter.py renamed to dataflow/operators/pdf2vqa/generate/llm_output_parser.py

Lines changed: 0 additions & 126 deletions
@@ -8,73 +8,7 @@
 from dataflow.utils.registry import OPERATOR_REGISTRY
 from dataflow.utils.storage import DataFlowStorage
 from dataflow import get_logger
-from dataflow.utils.pdf2vqa.format_utils import merge_qa_pair, jsonl_to_md
 
-@OPERATOR_REGISTRY.register()
-class MinerU2LLMInputOperator(OperatorABC):
-    def __init__(self):
-        pass
-
-    @staticmethod
-    def get_desc(lang: str = "zh") -> str:
-        if lang == 'zh':
-            return (
-                "MinerU格式转换为LLM输入格式算子。"
-                "将MinerU生成的内容列表JSON文件转换为适合LLM处理的格式,"
-                "包括展平列表项并重新编号。"
-            )
-        else:
-            return (
-                "Convert MinerU format to LLM input format operator."
-                "Transforms the content list JSON file generated by MinerU into a format suitable for LLM processing,"
-                "including flattening list items and re-indexing."
-            )
-
-    def _convert_json(self, input_file, output_file):
-        with open(input_file, 'r') as infile:
-            data = list(json.load(infile))
-
-        new_data = []
-        id = 0
-        for item in data:
-            item['id'] = id
-            item.pop('bbox', None)
-            item.pop('page_idx', None)
-            if item.get('type','') == 'list':
-                if item['sub_type'] == 'text':
-                    for idx, list_item in enumerate(item.get('list_items', [])):
-                        new_item = {
-                            'type': 'text',
-                            'text': list_item,
-                            'id': id + idx,
-                        }
-                        new_data.append(new_item)
-                    id += len(item.get('list_items', []))
-            else:
-                new_data.append(item)
-                id += 1
-
-        with open(output_file, 'w') as outfile:
-            json.dump(new_data, outfile, ensure_ascii=False)
-
-    def run(self, storage: DataFlowStorage,
-            input_markdown_path_key,
-            output_converted_layout_key,
-            ):
-        dataframe = storage.read("dataframe")
-
-        for index, row in dataframe.iterrows():
-            input_json_path = row[input_markdown_path_key].replace('.md', '_content_list.json')
-            converted_path = input_json_path.replace('.json', '_converted.json')
-            self._convert_json(input_json_path, converted_path)
-            dataframe.at[index, output_converted_layout_key] = converted_path
-
-            with open(converted_path, 'r') as infile:
-                data = json.load(infile)
-                assert isinstance(data, list), f"Expected list, got {type(data)} for {input_json_path}"
-
-        storage.write(dataframe)
-
 @OPERATOR_REGISTRY.register()
 class LLMOutputParser(OperatorABC):
     def __init__(self,
@@ -204,64 +138,4 @@ def run(self, storage: DataFlowStorage,
 
             dataframe.loc[idx, output_qalist_path_key] = output_qalist_path
 
-        storage.write(dataframe)
-
-@OPERATOR_REGISTRY.register()
-class QA_Merger(OperatorABC):
-    def __init__(self, output_dir, strict_title_match=False):
-        self.output_dir = output_dir
-        self.strict_title_match = strict_title_match
-
-    @staticmethod
-    def get_desc(lang: str = "zh") -> str:
-        if lang == 'zh':
-            return (
-                "QA对合并算子。"
-                "将问题和答案的QA列表进行合并,生成最终的QA对文件,"
-                "并转换为Markdown格式。"
-            )
-        else:
-            return (
-                "QA pair merging operator."
-                "Merges question and answer QA lists to generate final QA pair files,"
-                "and converts them to Markdown format."
-            )
-
-    def run(self, storage: DataFlowStorage,
-            input_question_qalist_path_key,
-            input_answer_qalist_path_key,
-            input_name_key,
-            output_merged_qalist_path_key,
-            output_merged_md_path_key,
-            output_qa_item_key="qa_item"  # new: column name for the exploded QA items
-            ):
-        dataframe = storage.read("dataframe")
-
-        # initialize the column as object dtype so it can hold list values
-        dataframe[output_qa_item_key] = None
-        dataframe[output_qa_item_key] = dataframe[output_qa_item_key].astype(object)
-
-        for idx, row in dataframe.iterrows():
-            question_qalist_path = row[input_question_qalist_path_key]
-            answer_qalist_path = row[input_answer_qalist_path_key]
-            name = row[input_name_key]
-
-            output_merged_qalist_path = os.path.join(self.output_dir, name, "merged_qa_pairs.jsonl")
-            merge_qa_pair(question_qalist_path, answer_qalist_path, output_merged_qalist_path, strict_title_match=self.strict_title_match)
-
-            output_merged_md_path = os.path.join(self.output_dir, name, "merged_qa_pairs.md")
-            jsonl_to_md(output_merged_qalist_path, output_merged_md_path)
-
-            qa_pairs = []
-            if os.path.exists(output_merged_qalist_path):
-                with open(output_merged_qalist_path, 'r', encoding='utf-8') as f:
-                    qa_pairs = [json.loads(line) for line in f]
-
-            dataframe.at[idx, output_qa_item_key] = qa_pairs
-
-            dataframe.loc[idx, output_merged_qalist_path_key] = output_merged_qalist_path
-            dataframe.loc[idx, output_merged_md_path_key] = output_merged_md_path
-
-        dataframe = dataframe.explode(output_qa_item_key).reset_index(drop=True)
-
         storage.write(dataframe)

dataflow/operators/pdf2vqa/generate/mineru_to_llm_input_operator.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+import json
+from dataflow.core import OperatorABC
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+
+@OPERATOR_REGISTRY.register()
+class MinerU2LLMInputOperator(OperatorABC):
+    def __init__(self):
+        pass
+
+    @staticmethod
+    def get_desc(lang: str = "zh") -> str:
+        if lang == 'zh':
+            return (
+                "MinerU格式转换为LLM输入格式算子。"
+                "将MinerU生成的内容列表JSON文件转换为适合LLM处理的格式,"
+                "包括展平列表项并重新编号。"
+            )
+        else:
+            return (
+                "Convert MinerU format to LLM input format operator."
+                "Transforms the content list JSON file generated by MinerU into a format suitable for LLM processing,"
+                "including flattening list items and re-indexing."
+            )
+
+    def _convert_json(self, input_file, output_file):
+        with open(input_file, 'r') as infile:
+            data = list(json.load(infile))
+
+        new_data = []
+        id = 0
+        for item in data:
+            item['id'] = id
+            item.pop('bbox', None)
+            item.pop('page_idx', None)
+            if item.get('type','') == 'list':
+                if item['sub_type'] == 'text':
+                    for idx, list_item in enumerate(item.get('list_items', [])):
+                        new_item = {
+                            'type': 'text',
+                            'text': list_item,
+                            'id': id + idx,
+                        }
+                        new_data.append(new_item)
+                    id += len(item.get('list_items', []))
+            else:
+                new_data.append(item)
+                id += 1
+
+        with open(output_file, 'w') as outfile:
+            json.dump(new_data, outfile, ensure_ascii=False)
+
+    def run(self, storage: DataFlowStorage,
+            input_markdown_path_key,
+            output_converted_layout_key,
+            ):
+        dataframe = storage.read("dataframe")
+
+        for index, row in dataframe.iterrows():
+            input_json_path = row[input_markdown_path_key].replace('.md', '_content_list.json')
+            converted_path = input_json_path.replace('.json', '_converted.json')
+            self._convert_json(input_json_path, converted_path)
+            dataframe.at[index, output_converted_layout_key] = converted_path
+
+            with open(converted_path, 'r') as infile:
+                data = json.load(infile)
+                assert isinstance(data, list), f"Expected list, got {type(data)} for {input_json_path}"
+
+        storage.write(dataframe)
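
For reference, the flattening performed by `_convert_json` above turns a text-type `list` block from a MinerU content list into flat `text` items with consecutive ids, dropping `bbox` and `page_idx`. A standalone sketch of that step with made-up sample values (the real operator additionally reads from and writes back to JSON files on disk):

def flatten_content_list(data):
    # Mirrors MinerU2LLMInputOperator._convert_json: re-index items, drop layout
    # keys, and expand text-type list blocks into individual text items.
    new_data, next_id = [], 0
    for item in data:
        item["id"] = next_id
        item.pop("bbox", None)
        item.pop("page_idx", None)
        if item.get("type", "") == "list":
            if item.get("sub_type") == "text":
                for offset, text in enumerate(item.get("list_items", [])):
                    new_data.append({"type": "text", "text": text, "id": next_id + offset})
                next_id += len(item.get("list_items", []))
        else:
            new_data.append(item)
            next_id += 1
    return new_data


# Illustrative sample; key names follow the code above, values are made up.
sample = [
    {"type": "text", "text": "Chapter 1", "bbox": [0, 0, 100, 20], "page_idx": 0},
    {"type": "list", "sub_type": "text", "list_items": ["item A", "item B"], "page_idx": 0},
]
print(flatten_content_list(sample))
# [{'type': 'text', 'text': 'Chapter 1', 'id': 0},
#  {'type': 'text', 'text': 'item A', 'id': 1},
#  {'type': 'text', 'text': 'item B', 'id': 2}]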

dataflow/operators/pdf2vqa/generate/qa_merger.py

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+import os
+import json
+from dataflow.core import OperatorABC
+from dataflow.utils.registry import OPERATOR_REGISTRY
+from dataflow.utils.storage import DataFlowStorage
+from dataflow.utils.pdf2vqa.format_utils import merge_qa_pair, jsonl_to_md
+
+@OPERATOR_REGISTRY.register()
+class QA_Merger(OperatorABC):
+    def __init__(self, output_dir, strict_title_match=False):
+        self.output_dir = output_dir
+        self.strict_title_match = strict_title_match
+
+    @staticmethod
+    def get_desc(lang: str = "zh") -> str:
+        if lang == 'zh':
+            return (
+                "QA对合并算子。"
+                "将问题和答案的QA列表进行合并,生成最终的QA对文件,"
+                "并转换为Markdown格式。"
+            )
+        else:
+            return (
+                "QA pair merging operator."
+                "Merges question and answer QA lists to generate final QA pair files,"
+                "and converts them to Markdown format."
+            )
+
+    def run(self, storage: DataFlowStorage,
+            input_question_qalist_path_key,
+            input_answer_qalist_path_key,
+            input_name_key,
+            output_merged_qalist_path_key,
+            output_merged_md_path_key,
+            output_qa_item_key="qa_item"  # new: column name for the exploded QA items
+            ):
+        dataframe = storage.read("dataframe")
+
+        # initialize the column as object dtype so it can hold list values
+        dataframe[output_qa_item_key] = None
+        dataframe[output_qa_item_key] = dataframe[output_qa_item_key].astype(object)
+
+        for idx, row in dataframe.iterrows():
+            question_qalist_path = row[input_question_qalist_path_key]
+            answer_qalist_path = row[input_answer_qalist_path_key]
+            name = row[input_name_key]
+
+            output_merged_qalist_path = os.path.join(self.output_dir, name, "merged_qa_pairs.jsonl")
+            merge_qa_pair(question_qalist_path, answer_qalist_path, output_merged_qalist_path, strict_title_match=self.strict_title_match)
+
+            output_merged_md_path = os.path.join(self.output_dir, name, "merged_qa_pairs.md")
+            jsonl_to_md(output_merged_qalist_path, output_merged_md_path)
+
+            qa_pairs = []
+            if os.path.exists(output_merged_qalist_path):
+                with open(output_merged_qalist_path, 'r', encoding='utf-8') as f:
+                    qa_pairs = [json.loads(line) for line in f]
+
+            dataframe.at[idx, output_qa_item_key] = qa_pairs
+
+            dataframe.loc[idx, output_merged_qalist_path_key] = output_merged_qalist_path
+            dataframe.loc[idx, output_merged_md_path_key] = output_merged_md_path
+
+        dataframe = dataframe.explode(output_qa_item_key).reset_index(drop=True)
+
+        storage.write(dataframe)
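
The final `explode` call above turns each document's list of merged QA dicts into one dataframe row per QA pair. A small illustration of that step; the sample row and QA dicts are made up, and only the default `qa_item` column name comes from the operator:

import pandas as pd

# One input row holding a list of two merged QA pairs (illustrative values).
df = pd.DataFrame({
    "name": ["math1"],
    "qa_item": [[{"question": "Q1?", "answer": "A1"},
                 {"question": "Q2?", "answer": "A2"}]],
})

# Same pattern as the operator's last step: one row per QA pair, index rebuilt.
df = df.explode("qa_item").reset_index(drop=True)
print(len(df), df.loc[0, "qa_item"])
# 2 {'question': 'Q1?', 'answer': 'A1'}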

dataflow/statics/pipelines/api_pipelines/pdf_vqa_extract_pipeline.py

Lines changed: 2 additions & 2 deletions
@@ -8,7 +8,7 @@
 from dataflow.pipeline import PipelineABC
 from dataflow.prompts.pdf2vqa import QAExtractPrompt
 
-class VQA_extract_optimized_pipeline(PipelineABC):
+class PDF_VQA_extract_optimized_pipeline(PipelineABC):
     def __init__(self):
         super().__init__()
         self.storage = FileStorage(
@@ -102,6 +102,6 @@ def forward(self):
 if __name__ == "__main__":
     # each line of the jsonl contains question_pdf_path, answer_pdf_path, name (math1, math2, physics1, chemistry1, ...)
     # if the questions and answers are in the same pdf, set question_pdf_path and answer_pdf_path to the same path; the pipeline switches to interleaved mode automatically
-    pipeline = VQA_extract_optimized_pipeline()
+    pipeline = PDF_VQA_extract_optimized_pipeline()
     pipeline.compile()
     pipeline.forward()
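
As the comments above state, the pipeline expects a JSONL file with one line per document containing `question_pdf_path`, `answer_pdf_path`, and `name`. A sketch of building such a file; the paths and the output filename are placeholders, not values taken from this repository:

import json

rows = [
    # Separate question and answer PDFs (illustrative paths).
    {"question_pdf_path": "data/math1_questions.pdf",
     "answer_pdf_path": "data/math1_answers.pdf",
     "name": "math1"},
    # Same PDF for both -> the pipeline switches to interleaved mode.
    {"question_pdf_path": "data/physics1.pdf",
     "answer_pdf_path": "data/physics1.pdf",
     "name": "physics1"},
]

with open("pdf_vqa_input.jsonl", "w", encoding="utf-8") as f:
    for row in rows:
        f.write(json.dumps(row, ensure_ascii=False) + "\n")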
