Skip to content

Commit 7037da2

Browse files
authored
Merge pull request #189 from chaserRen/main
Add xyz audio and format rules
2 parents e934614 + 2b36869 commit 7037da2

File tree

8 files changed

+387
-20
lines changed

8 files changed

+387
-20
lines changed

dingo/model/llm/vlm_document_parsing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class VLMDocumentParsingQuality(BaseOpenAI):
1616
def build_messages(cls, input_data: Data) -> List:
1717
messages = [
1818
{
19-
"role": "user",
19+
"role": "user",
2020
"content": [
2121
{"type": "text", "text": cls.prompt.content},
2222
{"type": "image_url", "image_url": {"url": input_data.img}},

dingo/model/prompt/prompt_document_parsing.py

Lines changed: 18 additions & 17 deletions
Large diffs are not rendered by default.

dingo/model/rule/rule_audio.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
from pathlib import Path
2+
3+
import numpy as np
4+
5+
from dingo.config.input_args import EvaluatorRuleArgs
6+
from dingo.io import Data
7+
from dingo.model.model import Model
8+
from dingo.model.modelres import ModelRes
9+
from dingo.model.rule.base import BaseRule
10+
11+
12+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ],
)
class RuleAudioDuration(BaseRule):
    """check whether the audio duration meets the standard

    NOTE(review): despite its name, this rule's body estimates the audio
    signal-to-noise ratio (SNR) and flags audio below 8 dB; the body looks
    swapped with RuleAudioSnrQuality. The public name is kept for interface
    stability — confirm the intended naming with the authors.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Audio Quality Metrics",
        "quality_dimension": "Audio_EFFECTIVENESS",
        "metric_name": "RuleAudioDuration",
        "description": "Check whether the audio duration meets the standard",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Estimate the SNR of the audio via a Welch PSD and flag low-SNR audio.

        Args:
            input_data: Data whose ``content`` is a path to an audio file
                readable by librosa.

        Returns:
            ModelRes with ``error_status`` set when the SNR cannot be
            computed (zero noise power) or falls below 8 dB.
        """
        # Imported lazily so the rule registry can be loaded without
        # librosa/scipy installed.
        import librosa
        from scipy.signal import welch

        res = ModelRes()

        # Resample to 16 kHz so the analysis rate is consistent across inputs.
        y, sr = librosa.load(input_data.content, sr=16000)
        f_signal, Pxx_signal = welch(y, fs=sr)
        # Treat the lower half of the power spectrum (below the median) as noise.
        noise_threshold = np.percentile(Pxx_signal, 50)
        Pxx_noise = np.where(Pxx_signal <= noise_threshold, Pxx_signal, 0)
        signal_power = np.sum(Pxx_signal)
        noise_power = np.sum(Pxx_noise)

        if noise_power == 0:
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["The audio power is zero. Cannot calculate SNR."]
            # Bug fix: return early. The original fell through and divided
            # by zero in the SNR computation below.
            return res

        snr_dB = round(10 * np.log10(signal_power / noise_power), 2)

        if snr_dB < 8:
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["The audio signal-to-noise ratio is too low."]
        return res
67+
68+
69+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ],
)
class RuleAudioSnrQuality(BaseRule):
    """check whether the audio signal-to-noise ratio meets the standard

    NOTE(review): despite its name, this rule's body measures the audio
    duration from the WAV header and flags audio longer than 10 seconds;
    the body looks swapped with RuleAudioDuration. The public name is kept
    for interface stability — confirm the intended naming with the authors.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Audio Quality Metrics",
        "quality_dimension": "Audio_EFFECTIVENESS",
        "metric_name": "RuleAudioSnrQuality",
        "description": "Check whether the audio signal-to-noise ratio meets the standard",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Read the WAV duration and flag audio longer than 10 seconds.

        Args:
            input_data: Data whose ``content`` is a filesystem path to a
                WAV file. Falsy or non-string content is skipped.

        Returns:
            ModelRes with ``error_status`` set when the duration exceeds 10 s.
        """
        import wave

        res = ModelRes()
        if not input_data.content:
            return res
        if isinstance(input_data.content, str):
            with wave.open(str(Path(input_data.content)), 'r') as w:
                frame_count = w.getnframes()
                sample_rate = w.getframerate()
                duration = frame_count / sample_rate

            # Bug fix: the duration check now lives inside the isinstance
            # branch. The original checked `duration` unconditionally, which
            # raised NameError for truthy non-string content.
            if duration > 10:
                res.error_status = True
                res.type = cls.metric_type
                res.name = cls.__name__
                res.reason = ["The audio duration is too long."]
        return res
115+
116+
117+
if __name__ == "__main__":
    # Quick manual smoke test against a local fixture file.
    sample = Data(data_id="1", content=r"../test/data/audio/test.wav")
    result = RuleAudioDuration.eval(sample)
    print(result)

dingo/model/rule/rule_common.py

Lines changed: 211 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,48 @@ def eval(cls, input_data: Data) -> ModelRes:
140140
return res
141141

142142

143+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleAudioDataFormat(BaseRule):
    """check whether the audio data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleAudioDataFormat",
        "description": "Check whether the audio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required audio-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "id", "audio" and "text".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["id", "audio", "text"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Audio Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
183+
184+
143185
@Model.rule_register("QUALITY_BAD_UNDERSTANDABILITY", ["pretrain"])
144186
class RuleCapitalWords(BaseRule):
145187
"""check whether capital words ratio > 0.2"""
@@ -1125,6 +1167,48 @@ def eval(cls, input_data: Data) -> ModelRes:
11251167
return res
11261168

11271169

1170+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleImageDataFormat(BaseRule):
    """check whether the image data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleImageDataFormat",
        "description": "Check whether the image data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required image-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "img_id" and "image".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["img_id", "image"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Image Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        # Also fixed the docstring, which said "nlp" (copy-paste error).
        return res
1210+
1211+
11281212
@Model.rule_register("QUALITY_BAD_EFFECTIVENESS", ["pdf_all"])
11291213
class RuleLatexSpecialChar(BaseRule):
11301214
"""check pdf content latex abnormal char."""
@@ -1441,6 +1525,48 @@ def eval(cls, input_data: Data) -> ModelRes:
14411525
return res
14421526

14431527

1528+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleNlpDataFormat(BaseRule):
    """check whether the nlp data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleNlpDataFormat",
        "description": "Check whether the nlp data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required NLP-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "track_id" and "content".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["track_id", "content"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["NLP Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
1568+
1569+
14441570
@Model.rule_register(
14451571
"QUALITY_BAD_FLUENCY",
14461572
[
@@ -1472,7 +1598,7 @@ class RuleNoPunc(BaseRule):
14721598
"paper_title": "RedPajama: an Open Dataset for Training Large Language Models",
14731599
"paper_url": "https://github.com/togethercomputer/RedPajama-Data",
14741600
"paper_authors": "Together Computer, 2023",
1475-
"evaluation_results": "docs/eval/rule/slimpajama_data_evaluated_by_rule.md"
1601+
"evaluation_results": ""
14761602
}
14771603

14781604
dynamic_config = EvaluatorRuleArgs(threshold=112)
@@ -1568,6 +1694,48 @@ def eval(cls, input_data: Data) -> ModelRes:
15681694
return res
15691695

15701696

1697+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleSftDataFormat(BaseRule):
    """check whether the sft data format is right"""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSftDataFormat",
        "description": "Check whether the sft data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required SFT-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "track_id", "type", "prompt" and "completion".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["track_id", "type", "prompt", "completion"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["SFT Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        # Also fixed the docstring, which said "nlp" (copy-paste error).
        return res
1737+
1738+
15711739
@Model.rule_register(
15721740
"QUALITY_BAD_EFFECTIVENESS",
15731741
[
@@ -1856,6 +2024,48 @@ def eval(cls, input_data: Data) -> ModelRes:
18562024
return res
18572025

18582026

2027+
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleVedioDataFormat(BaseRule):
    """check whether the video data format is right

    NOTE(review): "Vedio" is a misspelling of "Video"; the class name and
    runtime strings are kept as-is for interface stability.
    """

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleVedioDataFormat",
        "description": "Check whether the vedio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Check that ``raw_data`` contains the required video-sample keys.

        Args:
            input_data: Data whose ``raw_data`` mapping must contain the
                keys "id", "video" and "text".

        Returns:
            ModelRes with ``error_status`` set when any key is missing.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        key_list = ["id", "video", "text"]
        if not all(key in raw_data for key in key_list):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Vedio Data format error"]
        # Bug fix: the original fell off the end of the function on the
        # error path and implicitly returned None instead of a ModelRes.
        return res
2067+
2068+
18592069
@Model.rule_register(
18602070
"QUALITY_BAD_EFFECTIVENESS",
18612071
[

0 commit comments

Comments
 (0)