diff --git a/dingo/model/prompt/prompt_common.py b/dingo/model/prompt/prompt_common.py
index 1f3e6b4f..abcaf179 100644
--- a/dingo/model/prompt/prompt_common.py
+++ b/dingo/model/prompt/prompt_common.py
@@ -46,7 +46,28 @@ class PromptWordStick(BasePrompt):
     Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
     Here are the data you need to evaluate:
     """
-
+@Model.prompt_register("CODE_LIST_ISSUE", [])
+class PromptCodeListIssue(BasePrompt):
+    content = """
+    ### Role
+    You are a data quality assessment expert with fluent English communication skills and insight into the considerations of Chinese professionals in your field.
+    ### Background
+    Our process uses extraction tools to convert PDF files (academic papers, books, financial reports, etc.) into markdown format. We then segment this markdown content into chunks of a fixed length for further processing. It is crucial that we evaluate the quality of these segmented contents to ensure they meet our standards.
+    ### Objective
+    Your main task is to assess whether this dataset is suitable for training a large language model by evaluating the quality of the segmented markdown content against the criteria below.
+    ### Quality Criteria
+    The following issues indicate low-quality content:
+    Code Block Misrecognition: code blocks have been recognized as formulas, tables, or other formats.
+    List Recognition Errors: list numbering is discontinuous or incorrect; numbering must be continuous and in the correct order.
+    ### Evaluation Output
+    Your evaluation output must strictly adhere to the JSON format and contain no extraneous information. The JSON object must include:
+    Score: 0 if the content fails to meet quality standards due to any of the above issues; 1 if it meets all standards.
+    Type: if the score is 0, the most severe type of error present; "High Quality" if the score is 1.
+    The "type" value must be one of the predefined problem types: ["Code Block Misrecognition", "List Recognition Errors"].
+    Reason: A concise explanation for the score given, detailing the nature of the issue when applicable.
+    Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
+    Here are the data you need to evaluate:
+    """
 @Model.prompt_register("UNREAD_ISSUE", [])
 class PromptUnreadIssue(BasePrompt):
     content = """
diff --git a/dingo/model/rule/rule_image.py b/dingo/model/rule/rule_image.py
index 41e08037..8f39145c 100644
--- a/dingo/model/rule/rule_image.py
+++ b/dingo/model/rule/rule_image.py
@@ -1,3 +1,4 @@
+import os
 from typing import List
 
 import numpy as np
@@ -92,12 +93,12 @@ class RuleImageRepeat(BaseRule):
     @classmethod
     def eval(cls, input_data: MetaData) -> ModelRes:
         from imagededup.methods import CNN, PHash
-
         res = ModelRes()
         image_dir = input_data.content
+        if len(os.listdir(image_dir)) == 0:
+            raise ZeroDivisionError("The directory is empty, cannot calculate the ratio.")
         phasher = PHash()
         cnn_encoder = CNN()
-
         phash_encodings = phasher.encode_images(image_dir=image_dir)
         duplicates_phash = phasher.find_duplicates(encoding_map=phash_encodings)
         duplicate_images_phash = set()
@@ -112,10 +113,8 @@ def eval(cls, input_data: MetaData) -> ModelRes:
             res.type = cls.metric_type
             res.name = cls.__name__
             res.reason = [f'{image} -> {duplicates_cnn[image]}' for image in common_duplicates]
-
+            res.reason.append({"duplicate_ratio": len(common_duplicates) / len(os.listdir(image_dir))})
         return res
-
-
 @Model.rule_register('QUALITY_BAD_EFFECTIVENESS', [])
 class RuleImageTextSimilarity(BaseRule):
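
For context, the duplicate-ratio logic added to RuleImageRepeat can be illustrated with a
standalone sketch. It mirrors the imagededup calls visible in the diff; the directory path is
a placeholder, and the aggregation into common_duplicates is an assumption, since that part of
eval() falls outside the hunks shown here.

    import os

    from imagededup.methods import CNN, PHash

    image_dir = "path/to/images"  # placeholder for input_data.content

    # Guard added by the PR: an empty directory would make the ratio's denominator zero.
    if len(os.listdir(image_dir)) == 0:
        raise ZeroDivisionError("The directory is empty, cannot calculate the ratio.")

    phasher = PHash()
    cnn_encoder = CNN()

    # Images flagged as duplicates by perceptual hashing.
    phash_encodings = phasher.encode_images(image_dir=image_dir)
    duplicates_phash = phasher.find_duplicates(encoding_map=phash_encodings)
    duplicate_images_phash = {img for img, dups in duplicates_phash.items() if dups}

    # Images flagged as duplicates by CNN embeddings.
    cnn_encodings = cnn_encoder.encode_images(image_dir=image_dir)
    duplicates_cnn = cnn_encoder.find_duplicates(encoding_map=cnn_encodings)
    duplicate_images_cnn = {img for img, dups in duplicates_cnn.items() if dups}

    # Only images flagged by both methods count toward the ratio; every entry in the
    # directory counts in the denominator, as in the updated rule.
    common_duplicates = duplicate_images_phash & duplicate_images_cnn
    print({"duplicate_ratio": len(common_duplicates) / len(os.listdir(image_dir))})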