23 changes: 22 additions & 1 deletion dingo/model/prompt/prompt_common.py
@@ -46,7 +46,28 @@ class PromptWordStick(BasePrompt):
Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
Here are the data you need to evaluate:
"""

@Model.prompt_register("CODE_LIST_ISSUE", [])
class PromptCodeListIssue(BasePrompt):
    content = """
### Role
You are a data quality assessment expert, fluent in English, with insight into the concerns of Chinese professionals in this field.
### Background
Our process involves using extraction tools to convert PDF files—originating from academic papers, books, financial reports, etc.—into markdown format. Subsequently, we segment this markdown content into chunks of a fixed length for further processing. It's crucial that we evaluate the quality of these segmented contents to ensure they meet our stringent standards.
### Objective
Your main task is to assess whether this dataset is suitable for training a large language model by evaluating the quality of the segmented markdown content against predefined criteria.
### Quality Criteria
The following criteria define low-quality content:
Code Block Misrecognition: code blocks are mis-recognized as formulas, tables, or other formats.
List Recognition Errors: list numbering is discontinuous or out of sequence.
### Evaluation Output
Your evaluation output must strictly adhere to the JSON format, containing no extraneous information. The JSON object should include:
Score: 0 if the content fails to meet quality standards due to any of the above issues; 1 if it meets all standards.
Type: if the score is 0, the most severe error type present, chosen from ["Code Block Misrecognition", "List Recognition Errors"]; "High Quality" if the score is 1.
Reason: a concise explanation for the score given, detailing the nature of the issue when applicable.
Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
Here are the data you need to evaluate:
"""
@Model.prompt_register("UNREAD_ISSUE", [])
class PromptUnreadIssue(BasePrompt):
    content = """
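For context on the pipeline the prompt's Background describes (PDF extraction to markdown, then fixed-length segmentation), a minimal sketch of the chunking step follows; the 2000-character chunk size is an assumption for illustration, not a value taken from dingo.

def chunk_markdown(text: str, chunk_size: int = 2000) -> list[str]:
    """Split extracted markdown into fixed-length chunks (size is an assumed example)."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]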
9 changes: 4 additions & 5 deletions dingo/model/rule/rule_image.py
@@ -1,3 +1,4 @@
import os
from typing import List

import numpy as np
@@ -92,12 +93,12 @@ class RuleImageRepeat(BaseRule):
    @classmethod
    def eval(cls, input_data: MetaData) -> ModelRes:
        from imagededup.methods import CNN, PHash

        res = ModelRes()
        image_dir = input_data.content
        # Guard against an empty directory before computing the duplicate ratio.
        if len(os.listdir(image_dir)) == 0:
            raise ValueError("The directory is empty, cannot calculate the ratio.")
        phasher = PHash()
        cnn_encoder = CNN()

        phash_encodings = phasher.encode_images(image_dir=image_dir)
        duplicates_phash = phasher.find_duplicates(encoding_map=phash_encodings)
        duplicate_images_phash = set()
@@ -112,10 +113,8 @@ def eval(cls, input_data: MetaData) -> ModelRes:
        res.type = cls.metric_type
        res.name = cls.__name__
        res.reason = [f'{image} -> {duplicates_cnn[image]}' for image in common_duplicates]

        res.reason.append({"duplicate_ratio": len(common_duplicates) / len(os.listdir(image_dir))})
        return res


@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', [])
class RuleImageTextSimilarity(BaseRule):

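To make the consensus logic in RuleImageRepeat concrete: an image counts as a duplicate only when both the perceptual hash and the CNN embedding flag it, and the ratio is taken over every entry in the directory. The standalone sketch below, not part of the diff, reproduces that idea with the same imagededup calls; the function name duplicate_ratio is illustrative.

import os

from imagededup.methods import CNN, PHash

def duplicate_ratio(image_dir: str) -> float:
    """Fraction of images flagged as duplicates by BOTH PHash and CNN (illustrative)."""
    files = os.listdir(image_dir)
    if not files:
        raise ValueError("The directory is empty, cannot calculate the ratio.")
    flagged = []
    for method in (PHash(), CNN()):
        encodings = method.encode_images(image_dir=image_dir)
        duplicates = method.find_duplicates(encoding_map=encodings)
        flagged.append({img for img, matches in duplicates.items() if matches})
    # Consensus: keep only images both methods agree are duplicates.
    return len(flagged[0] & flagged[1]) / len(files)

Requiring agreement between the two methods trades recall for precision, which suits a quality gate where a false duplicate flag is costlier than a miss.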