23 changes: 22 additions & 1 deletion dingo/model/prompt/prompt_common.py
@@ -46,7 +46,28 @@ class PromptWordStick(BasePrompt):
Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
Here are the data you need to evaluate:
"""

@Model.prompt_register("CODE_LIST_ISSUE", [])
class PromptCodeListIssue(BasePrompt):
    content = """
### Role
You are a data quality assessment expert, fluent in English, with insight into the concerns of Chinese professionals in this field.
### Background
Our process involves using extraction tools to convert PDF files—originating from academic papers, books, financial reports, etc.—into markdown format. Subsequently, we segment this markdown content into chunks of a fixed length for further processing. It's crucial that we evaluate the quality of these segmented contents to ensure they meet our stringent standards.
### Objective
Your main task is to assess whether this dataset is suitable for training a large language model by evaluating the quality of the segmented markdown content against predefined criteria.
### Quality Criteria
The following criteria define low-quality content:
Code Block Misrecognition: code blocks are mis-recognized as formulas, tables, or other formats.
List Recognition Errors: list numbering is discontinuous or out of sequence.
### Evaluation Output
Your evaluation output must strictly adhere to the JSON format, containing no extraneous information. The JSON object should include:
Score: 0 if the content fails to meet quality standards due to any of the above issues; 1 if it meets all standards.
Type: if the score is 0, the most severe error type present, chosen from ["Code Block Misrecognition", "List Recognition Errors"]; "High Quality" if the score is 1.
Reason: a concise explanation for the score given, detailing the nature of the issue when applicable.
Return your answer in JSON format: {"score": 0, "type": "xxx", "reason": "xxx"}.
Here are the data you need to evaluate:
"""
@Model.prompt_register("UNREAD_ISSUE", [])
class PromptUnreadIssue(BasePrompt):
    content = """
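For context on the pipeline the prompt's Background describes (PDF extraction to markdown, then fixed-length segmentation), a minimal sketch of the chunking step follows; the 2000-character chunk size is an assumption for illustration, not a value taken from dingo.

def chunk_markdown(text: str, chunk_size: int = 2000) -> list[str]:
    """Split extracted markdown into fixed-length chunks (size is an assumed example)."""
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]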
9 changes: 4 additions & 5 deletions dingo/model/rule/rule_image.py
@@ -1,3 +1,4 @@
import os
from typing import List

import numpy as np
@@ -92,12 +93,12 @@ class RuleImageRepeat(BaseRule):
    @classmethod
    def eval(cls, input_data: MetaData) -> ModelRes:
        from imagededup.methods import CNN, PHash

        res = ModelRes()
        image_dir = input_data.content
        # Guard against an empty directory before computing the duplicate ratio.
        if len(os.listdir(image_dir)) == 0:
            raise ValueError("The directory is empty, cannot calculate the ratio.")
        phasher = PHash()
        cnn_encoder = CNN()

        phash_encodings = phasher.encode_images(image_dir=image_dir)
        duplicates_phash = phasher.find_duplicates(encoding_map=phash_encodings)
        duplicate_images_phash = set()
@@ -112,10 +113,8 @@ def eval(cls, input_data: MetaData) -> ModelRes:
        res.type = cls.metric_type
        res.name = cls.__name__
        res.reason = [f'{image} -> {duplicates_cnn[image]}' for image in common_duplicates]

        res.reason.append({"duplicate_ratio": len(common_duplicates) / len(os.listdir(image_dir))})
        return res


@Model.rule_register('QUALITY_BAD_EFFECTIVENESS', [])
class RuleImageTextSimilarity(BaseRule):

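To make the consensus logic in RuleImageRepeat concrete: an image counts as a duplicate only when both the perceptual hash and the CNN embedding flag it, and the ratio is taken over every entry in the directory. The standalone sketch below, not part of the diff, reproduces that idea with the same imagededup calls; the function name duplicate_ratio is illustrative.

import os

from imagededup.methods import CNN, PHash

def duplicate_ratio(image_dir: str) -> float:
    """Fraction of images flagged as duplicates by BOTH PHash and CNN (illustrative)."""
    files = os.listdir(image_dir)
    if not files:
        raise ValueError("The directory is empty, cannot calculate the ratio.")
    flagged = []
    for method in (PHash(), CNN()):
        encodings = method.encode_images(image_dir=image_dir)
        duplicates = method.find_duplicates(encoding_map=encodings)
        flagged.append({img for img, matches in duplicates.items() if matches})
    # Consensus: keep only images both methods agree are duplicates.
    return len(flagged[0] & flagged[1]) / len(files)

Requiring agreement between the two methods trades recall for precision, which suits a quality gate where a false duplicate flag is costlier than a miss.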