MigoXLab
diff --git a/‎dingo/model/llm/vlm_layout_quality.py‎
Lines changed: 124 additions & 0 deletions b/‎dingo/model/llm/vlm_layout_quality.py‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎dingo/model/prompt/prompt_layout_quality.py‎
Lines changed: 124 additions & 0 deletions b/‎dingo/model/prompt/prompt_layout_quality.py‎
Lines changed: 124 additions & 0 deletions
diff --git a/‎docs/document_parsing_quality_guide.md‎
Lines changed: 4 additions & 2 deletions b/‎docs/document_parsing_quality_guide.md‎
Lines changed: 4 additions & 2 deletions
@@ -0,0 +1,124 @@
+import ast
+import base64
+import json
+import os
+from typing import List
+
+from dingo.io import Data
+from dingo.model import Model
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.modelres import ModelRes
+from dingo.model.prompt.prompt_layout_quality import PromptLayoutQuality
+from dingo.utils import log
+
+
+@Model.llm_register("VLMLayoutQuality")
+class VLMLayoutQuality(BaseOpenAI):
+    prompt = PromptLayoutQuality
+
+    @classmethod
+    def _encode_image(cls, image_path: str) -> str:
+        """
+        Encode a local image file to base64 data URL format.
+        If the input is already a URL, return it as is.
+
+        This method follows Python's standard path resolution:
+        - Relative paths are resolved relative to the current working directory
+        - Absolute paths are used as-is
+        - URLs (http://, https://, data:) are passed through unchanged
+
+        Args:
+            image_path: Local file path (absolute or relative) or URL
+
+        Returns:
+            Base64 data URL for local files, or original URL for web resources
+
+        Raises:
+            FileNotFoundError: If a local file path does not exist
+            RuntimeError: If the file cannot be read
+        """
+        # Pass through URLs unchanged
+        if image_path.startswith('data:'):
+            return image_path
+
+        if image_path.startswith(("http://", "https://", 'data:')):
+            return image_path
+
+        # Standard file path handling (relative or absolute)
+        if not os.path.isfile(image_path):
+            raise FileNotFoundError(
+                f"Image file not found: '{image_path}'\n"
+                f"Current working directory: {os.getcwd()}\n"
+                f"Absolute path would be: {os.path.abspath(image_path)}\n"
+                f"Ensure the path is correct relative to your current working directory."
+            )
+
+        try:
+            with open(image_path, "rb") as image_file:
+                base64_image = base64.b64encode(image_file.read()).decode('utf-8')
+                # Determine MIME type from file extension
+                ext = os.path.splitext(image_path)[1].lower()
+                mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else f'image/{ext[1:]}'
+                return f"data:{mime_type};base64,{base64_image}"
+        except Exception as e:
+            raise RuntimeError(
+                f"Failed to read image file '{image_path}': {e}"
+            )
+
+    @classmethod
+    def build_messages(cls, input_data: Data) -> List:
+        if isinstance(input_data.image[0], str):
+            image_base64 = cls._encode_image(input_data.image[0])
+
+        bboxs = eval(input_data.content)
+
+        bbox_line = [
+            f"Bbox{bbox['bbox_id']} Type: {bbox['type']}"
+            for bbox in bboxs
+        ]
+        bbox_info = "\n".join(bbox_line)
+
+        layout_prompt = cls.prompt.content.replace("{{ bbox_typr_list }}", bbox_info)
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": layout_prompt},
+                    {"type": "image_url", "image_url": {"url": image_base64}},
+                ]
+            }
+        ]
+        return messages
+
+    @classmethod
+    def process_response(cls, response: str) -> ModelRes:
+        log.info(response)
+
+        response = response.replace("```json", "")
+        response = response.replace("```", "")
+
+        types = []
+        names = []
+
+        if response:
+            try:
+                result_data = json.loads(response)
+                errors = result_data.get("errors", [])
+
+                for error in errors:
+                    error_type = error.get("error_type", "")
+                    error_location = error.get("error_location", "")
+
+                    if error_type and error_location:
+                        types.append(error_type)
+                        names.append(error_location)
+            except json.JSONDecodeError as e:
+                log.error(f"JSON解析错误: {e}")
+
+        result = ModelRes()
+        result.error_status = False
+        result.type = types
+        result.name = names
+        result.reason = [response]
+
+        return result
@@ -0,0 +1,124 @@
+from dingo.model.model import Model
+from dingo.model.prompt.base import BasePrompt
+
+
+@Model.prompt_register("PromptLayoutQuality", [], ['VLMLayoutQuality'])
+class PromptLayoutQuality(BasePrompt):
+    # Metadata for documentation generation
+    _metric_info = {
+        "category": "Layout Eval Metric",
+        "metric_name": "PromptLayoutQuality",
+        "description": "Evaluate the quality of layout detctection and conversion quality.",
+        "evaluation_results": "",
+    }
+    content = r"""
+    # 角色
+    你是一名严谨细致的布局检测模型专家，你的任务是审查一个布局检测模型的输出结果。由于没有标准的正确答案（Ground Truth），你需要运用你对通用文档结构、排版惯例和逻辑关系的深刻理解，来识别并标记模型预测中的所有错误。
+
+    # 布局类别定义
+    模型能够识别并输出的类别是固定的。在判断“类别错误”时，请以此处定义的类别为准。合法的类别包括：
+    *   **title (标题)**: 独立成行，在视觉上（如字体、字号、加粗）与正文有明显区别的各级标题。
+    *   **text (文本)**: 普通段落文本。每个自然段应对应一个边界框，每一个列表项也对应一个边界框。
+    *   **table (表格)**: 具有清晰行/列结构的数据或文本。结构简单的（如仅有几行几列且无标题）可被视为多个独立的`text`元素。
+    *   **figure (图片)**: 照片、插图、示意图等非统计性图表。
+        *   **分割原则**: 如果图片内部有明显的空白分界线，应将其拆分为多个子图。
+        *   **文本密集型图片**: 若图片主要由文本构成（如无复杂流程的截图），应将其中的文本块标注为`text`。
+    *   **chart (统计图表)**: 柱状图、折线图、饼图等具有数学统计属性的图表。
+    *   **formula (公式)**: 单个独立成行的数学或化学公式，可以包含公式编号。
+    *   **caption (图/表/代码标题)**: 位于图片、图表、表格或代码块上方或下方的标题或说明文字。
+    *   **footnote (图/表/代码注释)**: 位于图片、图表、表格或代码块下方的补充性注释文字。
+    *   **header (页眉)**: 页面顶部区域固定的、重复出现的内容，如章节名。
+    *   **footer (页脚)**: 页面底部区域固定的、重复出现的内容，通常不包含页码。
+    *   **page_number (页码)**: 仅包含页码的元素，通常位于页眉或页脚。
+    *   **page_footnote (页面注释)**: 位于页面底部，对正文某处内容进行补充说明的注释（如脚注¹）。
+    *   **reference (参考文献)**: 参考文献区域的单个条目。
+    *   **code (代码)**: 多行代码块。
+    *   **algorithm (算法块)**: 格式化的算法描述区域。
+    *   **pinyin (拼音)**: 位于汉字上方的拼音标注，按行标注。
+    *   **aside (边栏)**: 页面主内容区域之外的侧边栏文本或图像。
+    *   **other (其他)**: 无法归入以上任何类别的元素。
+
+
+    # 任务
+    请你仔细审查图片上的每一个边界框，并结合其对应的类别信息，根据下方定义的错误类型，找出所有存在的错误。最终，你需要生成一份详细的、结构化的JSON格式错误报告。如果没有任何错误，请返回一个空的错误列表。
+
+    # 错误类型定义
+    在审核时，请重点关注以下几种基于视觉的错误：
+    1.  **检测遗漏错误**:页面上肉眼可见的、有明确意义的独立内容（如文本块、图片、表格等），但模型未能为其生成任何边界框。
+    2.  **检测不准错误**：检测不准确包括检测冗余、检测不完整、检测框重叠。检测冗余表示模型在**没有任何实际内容**的空白区域，或在不应被视为独立元素的装饰性图案/线条上，错误地生成了一个边界框。检测不完整表示元素的边界框过小，未能完整地包裹其全部视觉内容，导致部分内容（如文字笔画、图像边缘）被截断或遗漏在框外。**请注意：只要内容被完整包裹，边界框包含额外的空白区域是可以接受的，不应视为错误。**检测框重叠表示原本互不重叠的检测框重叠在了一起。
+    3.  **类别错误**: 元素的类别（label）与其在图片上呈现的视觉功能不符。结合框内**文本内容、字体大小、粗细、颜色、排版位置（如居中、缩进）、以及它在整个页面布局中的作用**来综合判断。
+    *   **示例**:
+        *   一个框内的文字是“第一章 绪论”，且字体显著大于正文、位置居中，但其`label`被标为`text`（文本），这应是`title`（标题）。
+        *   一个明显是数据图表或照片的区域被错误地标记为`table`（表格）。
+    4.  **阅读顺序错误**:模型输出的元素ID顺序与文档内容的**自然阅读流**不一致。
+    *   **示例**:
+        *   在一个双栏布局的页面上，左栏的段落ID为`[2, 4]`，右栏的段落ID为`[3, 5]`。这导致阅读顺序在两栏之间来回跳跃，而不是先读完左栏再读右栏。
+    5.  **其他错误**:用于标记所有未被上述明确类别覆盖，但明显不符合文档逻辑结构或排版常识的错误。这是一个“兜底”类别，旨在捕获模型预测中各种预料之外的异常情况。
+
+    # 工作流程
+    1.  **全局审阅**: 首先快速浏览整张图片，对页面的整体布局、内容分区（如页眉、页脚、正文区、边栏）有一个大致的了解。
+    2.  **逐项核对**: 按照ID顺序（或按视觉从上到下的顺序），仔细检查图片上的每一个边界框及其标注。
+    3.  **综合判断**: 对于每个框，结合其**框内的视觉内容、标注的类别以及它与周围框体的空间关系**，判断是否存在错误。
+    4.  **记录错误**: 一旦发现错误，根据上述【错误类型定义】，记录下来。
+    5.  **生成报告**: 将所有发现的错误整理成指定格式的JSON报告。
+
+    # 输出格式要求
+    请严格按照以下JSON格式输出你的审核报告。报告的主体是一个名为`error_analysis`的列表，其中每个对象代表一个已识别的错误。
+
+    **请特别注意以下两条规则：**
+    *   **聚合相似错误**: 如果页面上有多个元素犯了**完全相同性质的错误**，请将它们**合并到同一个错误条目**中。将所有相关的`element_ids`都列出，并在`description`中进行概括性描述。
+    *   **允许单个元素的多重错误**: 如果**同一个元素**（例如 `id=1`）同时存在多种类型的错误（例如，既有`Boundary Error`，又有`Classification Error`），你需要为它**创建多个独立的错误条目**，每个条目对应一种错误类型。
+    *   对于“检测遗漏错误”，也应遵循此原则。例如，如果页面同时遗漏了页眉和页脚，你应该只创建一个检测遗漏错误条目，并在description中同时描述这两个被遗漏的元素，而不是创建两个独立的错误条目。
+
+    **输出格式示例**
+    请严格按照以下JSON结构输出完整报告：
+    ```json
+    {
+        "errors": [
+            {
+                "error_id": 1,
+                "error_type": "边界框不准错误",
+                "error_location": "元素1的边界框过小，未能完整包含其文本内容'第一章：系统概述'的全部，文字的下半部分被截断。",
+                "suggestion": "应调整边界框，确保其紧密包裹整个文本区域。"
+            },
+            {
+                "error_ids": 2,
+                "error_type": "元素类别错误",
+                "error_location": "元素1在图片上显示为大号、加粗、居中的文本'第一章：系统概述'，这是一个典型的章节标题，但被错误地标记为'text'。",
+                "suggestion": "应将label修正为'title'"
+            },
+            {
+                "error_id": 3,
+                "error_type": "其他错误",
+                "error_location": "这是一个合并错误。元素10将一个独立的图标题'图3：用户增长曲线'和其下方的图片本身错误地合并到了同一个边界框中。",
+                "suggestion": "应将此元素拆分为两个独立的元素：一个label为'figure_caption'的标题元素，和一个label为'figure'的图片元素。"
+            },
+            {
+                "error_id": 4,
+                "error_type": "检测遗漏错误",
+                "error_location": "页面上有两处明显的检测遗漏：1. 页面右上角的页眉 '财务报表' 未被检测。 2. 页面右下角的页脚 '2021年度报告 307' 未被检测。",
+                "suggestion": "应为页眉和页脚分别添加新的边界框，并将其类别分别标记为 'header' 和 'footer'。"
+            }
+        ]
+    }
+    ```
+
+    *   `error_ids`: (Int)错误问题的编号，从1开始计数，以此类推。
+    *   `error_type`: (String) 从上述【错误类型定义】中选择一个。
+    *   `error_location`: (String) 对错误位置的详细、客观的文字描述，**请结合图片上的视觉特征进行说明**。
+    *   `suggestion`: (String) 针对该错误提出的具体、可操作的修改建议。
+
+     *如果未发现任何错误，请返回：*
+    ```json
+    {
+        "errors": []
+    }
+    ```
+    ---------
+    # 任务开始
+
+    ## 输入信息
+    1.  **布局检测图**: [待提供的原始图像]
+    2.  **元素属性列表**: 以下是模型为当前图片中每个ID预测的类别。请基于此列表和图片进行分析。
+    {{ bbox_typr_list }}
+    """
@@ -134,9 +134,11 @@ if __name__ == '__main__':
 ### JSONL数据格式
 
 ```jsonl
-{"id": "1", "content": "即当 \\(x\\longrightarrow0\\) 时， \\(f(x)\\) 与 \\(6x^{2}\\) 互为等价无穷小量，故 \\(c=6,k=3\\) ，应选A.\n\n# 强化20\n\n【解析】当 \\(x\\longrightarrow0^{2}\\) 时，有\n\n\\(\\alpha^{\\prime}=\\cos x^{2}\\rightarrow1\\) ，即 \\(\\alpha\\!\\sim\\!x\\) （为 \\(x\\) 的1阶无穷小量），\n\n\\(\\beta^{\\prime}\\!=\\!\\tan x\\cdot2x\\!\\sim\\!2x^{2}\\) 即 \\(\\beta\\!\\sim\\!\\frac{2}{3}x^{3}\\) （为 \\(x\\) 的3阶无穷小量），\n\n\\(\\gamma^{\\prime}=\\sin x^{\\frac{3}{7}}\\cdot\\frac{1}{2\\sqrt{x}}-x^{\\frac{3}{7}}\\frac{1}{2\\sqrt{x}}=\\frac{1}{2}x\\) 即 \\(\\gamma\\!\\sim\\!\\frac{1}{4}x^{2}\\) （为 \\(x\\) 的2阶无穷小量），\n\n所以当 \\(x\\longrightarrow0\\) 时无穷小量从低阶到高阶的顺序为 \\(\\alpha,\\gamma,\\beta\\) 故应选B.\n\n# 强化21\n\n【解析】方法一：导数定阶法\n\n由当 \\(x\\longrightarrow0^{2}\\) 时，\n\n\\(\\left[\\int_{0}^{x}\\left(\\mathrm{e}^{t^{2}}-1\\right)\\mathrm{d}t\\right]^{\\prime}=\\mathrm{e}^{x^{2}}-1-x^{2}\\) 故 \\(\\int_{0}^{x}\\left(\\mathrm{e}^{t^{2}}-1\\right)\\mathrm{d}t=\\frac{1}{3}x^{3}\\) 为3阶无穷小量；\\(\\left[\\int_{0}^{x}\\ln\\left(1+\\sqrt{t^{2}}\\right)\\mathrm{d}t\\right]^{\\prime}=\\ln\\left(1+\\sqrt{x^{2}}\\right)\\sim\\sqrt{x^{2}}\\) 故 \\(\\int_{0}^{x}\\ln\\left(1+\\sqrt{t^{2}}\\right)\\mathrm{d}t-\\frac{2}{5}x^{\\frac{2}{3}}\\) 为 \\(\\frac{5}{2}\\) 阶无穷小量；\\(\\left[\\int_{0}^{x}\\sin t^{2}\\mathrm{d}t\\right]^{\\prime}=\\sin(\\sin x)^{2}\\cos x-x^{2}\\) 故 \\(\\int_{0}^{x}\\sin t^{2}\\mathrm{d}t-\\frac{1}{3}x^{3}\\) 为3阶无穷小量；\\(\\left[\\int_{0}^{x}(1-\\cos t)\\,\\mathrm{d}t\\right]^{\\prime}-1-\\cos x-\\frac{1}{2}x^{2}\\) 故 \\(\\int_{0}^{x}(1-\\cos t)\\,\\mathrm{d}t-\\frac{1}{6}x^{3}\\) 为3阶无穷小量；\\(\\left[\\int_{0}^{1-\\cos x}\\sqrt{\\sin^{3}t}\\,\\mathrm{d}t\\right]^{\\prime}=\\sqrt{\\sin^{3}(1-\\cos x)}\\,\\cdot\\,\\sin x-\\frac{1}{2}x^{2}\\right]^{\\frac{3}{2}}\\,\\cdot\\,x=\\frac{1}{2\\sqrt{2}}\\,x^{4}\\) 故 \\(\\int_{0}^{1-\\cos x}\\sqrt{\\sin^{3}t}\\,\\mathrm{d}t\\) 为5阶无穷小量，应选E.\n\n\n\n# 方法二：经验法，见【敲重点】\n\n对于选项A， \\(\\int_{0}^{x}\\left(\\mathrm{e}^{t^{2}}-1\\right)\\mathrm{d}t\\) 为 \\(x\\longrightarrow0^{2}\\) 时的 
\\(n(m+1)=1\\times(2+1)=3\\) 阶无穷小量；对于选项B， \\(\\int_{0}^{x}\\ln\\left(1+\\sqrt{t^{2}}\\right)\\mathrm{d}t\\) 为 \\(x\\longrightarrow0^{2}\\) 时的 \\(n(m+1)=1\\times\\left({\\frac{3}{2}}+1\\right)={\\frac{5}{2}}\\) 阶无穷小量；对于选项C， \\(\\int_{0}^{1\\sim x}\\sin t^{2}\\mathrm{d}t\\) 为 \\(x\\longrightarrow0^{2}\\) 时的 \\(n(m+1)=1\\times(2+1)=3\\) 阶无穷小量；对于选项D， \\(\\int_{0}^{x}\\left(1-\\cos t\\right)\\mathrm{d}t\\) 为 \\(x\\longrightarrow0^{2}\\) 时的 \\(n(m+1)=1\\times(2+1)=3\\) 阶无穷小量，对于选项E， \\(\\int_{0}^{1-\\cos x}\\sqrt{\\sin^{3}t}\\,\\mathrm{d}t\\) 为 \\(x\\longrightarrow0^{2}\\) 时的 \\(n(m+1)=2\\times\\left({\\frac{3}{2}}+1\\right)=5\\) 阶无穷小量，故应选E.", "img": "../../test/data/c6be64e4-1dd4-4bd4-b923-55a63a6de397_page_1.jpg"}
+{"id": "1", "content": "content xxx", "img": "path/to/your/image.jpg"}
 ```
-
+- id：数据id，可以自定义
+- content：待质检的文本
+- img：图片路径
 
 ## 最佳实践
 ### 评估模型