@@ -140,6 +140,48 @@ def eval(cls, input_data: Data) -> ModelRes:
140140 return res
141141
142142
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleAudioDataFormat(BaseRule):
    """Check whether the audio data format is right."""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleAudioDataFormat",
        "description": "Check whether the audio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Validate that an audio sample carries all required keys.

        Args:
            input_data: sample whose ``raw_data`` mapping is checked.

        Returns:
            ModelRes with ``error_status`` set (plus type/name/reason)
            when any of ``id``, ``audio`` or ``text`` is missing from
            ``raw_data``; an untouched ModelRes otherwise.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        required_keys = ["id", "audio", "text"]
        if not all(key in raw_data for key in required_keys):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Audio Data format error"]
        # BUG FIX: the error branch previously fell through without a
        # return statement, so eval() yielded None instead of the ModelRes.
        return res
183+
184+
143185@Model .rule_register ("QUALITY_BAD_UNDERSTANDABILITY" , ["pretrain" ])
144186class RuleCapitalWords (BaseRule ):
145187 """check whether capital words ratio > 0.2"""
@@ -1125,6 +1167,48 @@ def eval(cls, input_data: Data) -> ModelRes:
11251167 return res
11261168
11271169
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleImageDataFormat(BaseRule):
    """Check whether the image data format is right."""
    # NOTE: the original docstring said "nlp data format" — a copy-paste
    # slip; this rule validates image samples per its metadata below.

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleImageDataFormat",
        "description": "Check whether the image data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Validate that an image sample carries all required keys.

        Args:
            input_data: sample whose ``raw_data`` mapping is checked.

        Returns:
            ModelRes with ``error_status`` set (plus type/name/reason)
            when either ``img_id`` or ``image`` is missing from
            ``raw_data``; an untouched ModelRes otherwise.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        required_keys = ["img_id", "image"]
        if not all(key in raw_data for key in required_keys):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Image Data format error"]
        # BUG FIX: the error branch previously fell through without a
        # return statement, so eval() yielded None instead of the ModelRes.
        return res
1210+
1211+
11281212@Model .rule_register ("QUALITY_BAD_EFFECTIVENESS" , ["pdf_all" ])
11291213class RuleLatexSpecialChar (BaseRule ):
11301214 """check pdf content latex abnormal char."""
@@ -1441,6 +1525,48 @@ def eval(cls, input_data: Data) -> ModelRes:
14411525 return res
14421526
14431527
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleNlpDataFormat(BaseRule):
    """Check whether the nlp data format is right."""

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleNlpDataFormat",
        "description": "Check whether the nlp data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Validate that an NLP sample carries all required keys.

        Args:
            input_data: sample whose ``raw_data`` mapping is checked.

        Returns:
            ModelRes with ``error_status`` set (plus type/name/reason)
            when either ``track_id`` or ``content`` is missing from
            ``raw_data``; an untouched ModelRes otherwise.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        required_keys = ["track_id", "content"]
        if not all(key in raw_data for key in required_keys):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["NLP Data format error"]
        # BUG FIX: the error branch previously fell through without a
        # return statement, so eval() yielded None instead of the ModelRes.
        return res
1568+
1569+
14441570@Model .rule_register (
14451571 "QUALITY_BAD_FLUENCY" ,
14461572 [
@@ -1472,7 +1598,7 @@ class RuleNoPunc(BaseRule):
14721598 "paper_title" : "RedPajama: an Open Dataset for Training Large Language Models" ,
14731599 "paper_url" : "https://github.com/togethercomputer/RedPajama-Data" ,
14741600 "paper_authors" : "Together Computer, 2023" ,
1475- "evaluation_results" : "docs/eval/rule/slimpajama_data_evaluated_by_rule.md "
1601+ "evaluation_results" : ""
14761602 }
14771603
14781604 dynamic_config = EvaluatorRuleArgs (threshold = 112 )
@@ -1568,6 +1694,48 @@ def eval(cls, input_data: Data) -> ModelRes:
15681694 return res
15691695
15701696
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleSftDataFormat(BaseRule):
    """Check whether the sft data format is right."""
    # NOTE: the original docstring said "nlp data format" — a copy-paste
    # slip; this rule validates SFT samples per its metadata below.

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleSftDataFormat",
        "description": "Check whether the sft data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Validate that an SFT sample carries all required keys.

        Args:
            input_data: sample whose ``raw_data`` mapping is checked.

        Returns:
            ModelRes with ``error_status`` set (plus type/name/reason)
            when any of ``track_id``, ``type``, ``prompt`` or
            ``completion`` is missing from ``raw_data``; an untouched
            ModelRes otherwise.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        required_keys = ["track_id", "type", "prompt", "completion"]
        if not all(key in raw_data for key in required_keys):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["SFT Data format error"]
        # BUG FIX: the error branch previously fell through without a
        # return statement, so eval() yielded None instead of the ModelRes.
        return res
1737+
1738+
15711739@Model .rule_register (
15721740 "QUALITY_BAD_EFFECTIVENESS" ,
15731741 [
@@ -1856,6 +2024,48 @@ def eval(cls, input_data: Data) -> ModelRes:
18562024 return res
18572025
18582026
@Model.rule_register(
    "QUALITY_BAD_EFFECTIVENESS",
    [
        "multi_lan_ar",
        "multi_lan_ko",
        "multi_lan_ru",
        "multi_lan_th",
        "multi_lan_vi",
        "multi_lan_cs",
        "multi_lan_hu",
        "multi_lan_sr",
    ]
)
class RuleVedioDataFormat(BaseRule):
    """Check whether the video data format is right."""
    # NOTE: "Vedio" is a misspelling of "video". The class name and the
    # runtime strings below are kept as-is for backward compatibility
    # (registrations and metric lookups reference them by name).

    # Metadata for documentation generation
    _metric_info = {
        "category": "Rule-Based TEXT Quality Metrics",
        "quality_dimension": "EFFECTIVENESS",
        "metric_name": "RuleVedioDataFormat",
        "description": "Check whether the vedio data format is right",
        "evaluation_results": ""
    }

    dynamic_config = EvaluatorRuleArgs()

    @classmethod
    def eval(cls, input_data: Data) -> ModelRes:
        """Validate that a video sample carries all required keys.

        Args:
            input_data: sample whose ``raw_data`` mapping is checked.

        Returns:
            ModelRes with ``error_status`` set (plus type/name/reason)
            when any of ``id``, ``video`` or ``text`` is missing from
            ``raw_data``; an untouched ModelRes otherwise.
        """
        res = ModelRes()

        raw_data = input_data.raw_data
        required_keys = ["id", "video", "text"]
        if not all(key in raw_data for key in required_keys):
            res.error_status = True
            res.type = cls.metric_type
            res.name = cls.__name__
            res.reason = ["Vedio Data format error"]
        # BUG FIX: the error branch previously fell through without a
        # return statement, so eval() yielded None instead of the ModelRes.
        return res
2067+
2068+
18592069@Model .rule_register (
18602070 "QUALITY_BAD_EFFECTIVENESS" ,
18612071 [
0 commit comments