Skip to content

Commit 7037da2

Browse files
authored
Merge pull request #189 from chaserRen/main
Add xyz audio and format rules
2 parents e934614 + 2b36869 commit 7037da2

File tree

8 files changed

+387
-20
lines changed

8 files changed

+387
-20
lines changed

dingo/model/llm/vlm_document_parsing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class VLMDocumentParsingQuality(BaseOpenAI):
1616
def build_messages(cls, input_data: Data) -> List:
1717
messages = [
1818
{
19-
"role": "user",
19+
"role": "user",
2020
"content": [
2121
{"type": "text", "text": cls.prompt.content},
2222
{"type": "image_url", "image_url": {"url": input_data.img}},

dingo/model/prompt/prompt_document_parsing.py

Lines changed: 18 additions & 17 deletions
Large diffs are not rendered by default.

dingo/model/rule/rule_audio.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
from pathlib import Path
2+
3+
import numpy as np
4+
5+
from dingo.config.input_args import EvaluatorRuleArgs
6+
from dingo.io import Data
7+
from dingo.model.model import Model
8+
from dingo.model.modelres import ModelRes
9+
from dingo.model.rule.base import BaseRule
10+
11+
12+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ],
)
class RuleAudioDuration(BaseRule):
    """check whether the audio duration meets the standard

    NOTE(review): despite its name, this rule's body estimates the audio
    signal-to-noise ratio (SNR) and flags audio below 8 dB; the body looks
    swapped with RuleAudioSnrQuality. The public name is kept for interface
    stability — confirm the intended naming with the authors.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Audio Quality Metrics",
        "quality_dimension": "Audio_EFFECTIVENESS",
        "metric_name": "RuleAudioDuration",
        "description": "Check whether the audio duration meets the standard",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Estimate the SNR of the audio via a Welch PSD and flag low-SNR audio.

        Args:
            input_data: Data whose ``content`` is a path to an audio file
                readable by librosa.

        Returns:
            ModelRes with ``error_status`` set when the SNR cannot be
            computed (zero noise power) or falls below 8 dB.
        """
        # Imported lazily so the rule registry can be loaded without
        # librosa/scipy installed.
        import librosa
        from scipy.signal import welch

        res = ModelRes()

        # Resample to 16 kHz so the analysis rate is consistent across inputs.
        y, sr = librosa.load(input_data.content, sr=16000)
        f_signal, Pxx_signal = welch(y, fs=sr)
        # Treat the lower half of the power spectrum (below the median) as noise.
        noise_threshold = np.percentile(Pxx_signal, 50)
        Pxx_noise = np.where(Pxx_signal <= noise_threshold, Pxx_signal, 0)
        signal_power = np.sum(Pxx_signal)
        noise_power = np.sum(Pxx_noise)

        if noise_power == 0:
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["The audio power is zero. Cannot calculate SNR."]
            # Bug fix: return early. The original fell through and divided
            # by zero in the SNR computation below.
            return res

        snr_dB = round(10 * np.log10(signal_power / noise_power), 2)

        if snr_dB < 8:
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["The audio signal-to-noise ratio is too low."]
        return res
67+
68+
69+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ],
)
class RuleAudioSnrQuality(BaseRule):
    """check whether the audio signal-to-noise ratio meets the standard

    NOTE(review): despite its name, this rule's body measures the audio
    duration from the WAV header and flags audio longer than 10 seconds;
    the body looks swapped with RuleAudioDuration. The public name is kept
    for interface stability — confirm the intended naming with the authors.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Audio Quality Metrics",
        "quality_dimension": "Audio_EFFECTIVENESS",
        "metric_name": "RuleAudioSnrQuality",
        "description": "Check whether the audio signal-to-noise ratio meets the standard",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Read the WAV duration and flag audio longer than 10 seconds.

        Args:
            input_data: Data whose ``content`` is a filesystem path to a
                WAV file. Falsy or non-string content is skipped.

        Returns:
            ModelRes with ``error_status`` set when the duration exceeds 10 s.
        """
        import wave

        res = ModelRes()
        if not input_data.content:
            return res
        if isinstance(input_data.content, str):
            with wave.open(str(Path(input_data.content)), 'r') as w:
                frame_count = w.getnframes()
                sample_rate = w.getframerate()
                duration = frame_count / sample_rate

            # Bug fix: the duration check now lives inside the isinstance
            # branch. The original checked `duration` unconditionally, which
            # raised NameError for truthy non-string content.
            if duration > 10:
                res.error_status = True
                res.type = cls.metric_type
                res.name = cls.__name__
                res.reason = ["The audio duration is too long."]
        return res
115+
116+
117+
if __name__ == "__main__":
    # Quick manual smoke test against a local fixture file.
    sample = Data(data_id="1", content=r"../test/data/audio/test.wav")
    result = RuleAudioDuration.eval(sample)
    print(result)

dingo/model/rule/rule_common.py

Lines changed: 211 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,48 @@ def eval(cls, input_data: Data) -> ModelRes:
140140
return res
141141

142142

143+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleAudioDataFormat(BaseRule):
    """check whether the audio data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleAudioDataFormat",
        "description": "Check whether the audio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required audio-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "id", "audio" and "text".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["id", "audio", "text"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Audio Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
183+
184+
143185
@Model.rule_register("QUALITY_BAD_UNDERSTANDABILITY", ["pretrain"])
144186
class RuleCapitalWords(BaseRule):
145187
"""check whether capital words ratio > 0.2"""
@@ -1125,6 +1167,48 @@ def eval(cls, input_data: Data) -> ModelRes:
11251167
return res
11261168

11271169

1170+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleImageDataFormat(BaseRule):
    """check whether the image data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleImageDataFormat",
        "description": "Check whether the image data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required image-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "img_id" and "image".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["img_id", "image"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Image Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        # Also fixed the docstring, which said "nlp" (copy-paste error).
        return res
1210+
1211+
11281212
@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["pdf_all"])
11291213
class RuleLatexSpecialChar(BaseRule):
11301214
"""check pdf content latex abnormal char."""
@@ -1441,6 +1525,48 @@ def eval(cls, input_data: Data) -> ModelRes:
14411525
return res
14421526

14431527

1528+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleNlpDataFormat(BaseRule):
    """check whether the nlp data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleNlpDataFormat",
        "description": "Check whether the nlp data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required NLP-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "track_id" and "content".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["track_id", "content"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["NLP Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
1568+
1569+
14441570
@Model.rule_register(
14451571
"QUALITY_BAD_FLUENCY",
14461572
[
@@ -1472,7 +1598,7 @@ class RuleNoPunc(BaseRule):
14721598
"paper_title": "RedPajama: an Open Dataset for Training Large Language Models",
14731599
"paper_url": "https://github.com/togethercomputer/RedPajama-Data",
14741600
"paper_authors": "Together Computer, 2023",
1475-
"evaluation_results": "docs/eval/rule/slimpajama_data_evaluated_by_rule.md"
1601+
"evaluation_results": ""
14761602
}
14771603

14781604
dynamic_config = EvaluatorRuleArgs(threshold=112)
@@ -1568,6 +1694,48 @@ def eval(cls, input_data: Data) -> ModelRes:
15681694
return res
15691695

15701696

1697+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleSftDataFormat(BaseRule):
    """check whether the sft data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSftDataFormat",
        "description": "Check whether the sft data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required SFT-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "track_id", "type", "prompt" and "completion".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["track_id", "type", "prompt", "completion"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["SFT Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        # Also fixed the docstring, which said "nlp" (copy-paste error).
        return res
1737+
1738+
15711739
@Model.rule_register(
15721740
"QUALITY_BAD_EFFECTIVENESS",
15731741
[
@@ -1856,6 +2024,48 @@ def eval(cls, input_data: Data) -> ModelRes:
18562024
return res
18572025

18582026

2027+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleVedioDataFormat(BaseRule):
    """check whether the video data format is right

    NOTE(review): "Vedio" is a misspelling of "Video"; the class name and
    runtime strings are kept as-is for interface stability.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleVedioDataFormat",
        "description": "Check whether the vedio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required video-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "id", "video" and "text".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["id", "video", "text"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Vedio Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
2067+
2068+
18592069
@Model.rule_register(
18602070
"QUALITY_BAD_EFFECTIVENESS",
18612071
[

0 commit comments

Comments
 (0)