@@ -110,7 +110,7 @@ def evaluate_task_difficulty():
110110 "executor" : {
111111 "max_workers" : 5 ,
112112 "result_save" : {
113- "bad" : False , # 难度评估通常不需要保存"bad"
113+ "bad" : True ,
114114 "good" : True , # 保存所有评估结果
115115 "all_labels" : True
116116 }
@@ -172,7 +172,7 @@ def evaluate_both():
172172
173173 input_data = {
174174 "task_name" : "comprehensive_instruction_evaluation" ,
175- "input_path" : "test/data/instructions.jsonl" ,
175+ "input_path" : str ( Path ( "test/data/instructions.jsonl" )) ,
176176 "output_path" : "outputs/instruction_comprehensive/" ,
177177 "dataset" : {
178178 "source" : "local" ,
@@ -246,101 +246,6 @@ def evaluate_both():
246246 return summary
247247
248248
def analyze_difficulty_distribution():
    """Analyze the task-difficulty distribution (used for dataset balancing).

    Runs the ``LLMTaskDifficulty`` evaluator over the local instruction
    dataset, buckets each score into four difficulty bands, prints the
    distribution together with dataset-balancing suggestions, and returns
    the executor summary.

    Returns:
        The summary object produced by ``executor.execute()``.
    """
    print("=" * 80)
    print(" 任务难度分布分析")
    print("=" * 80 + "\n")

    input_data = {
        "task_name": "difficulty_distribution_analysis",
        "input_path": "test/data/instructions.jsonl",
        "output_path": "outputs/difficulty_distribution/",
        "dataset": {
            "source": "local",
            "format": "jsonl"
        },
        "executor": {
            "max_workers": 10,
            "result_save": {
                "bad": False,
                "good": True,
                "all_labels": True
            }
        },
        "evaluator": [
            {
                "fields": {"content": "instruction"},
                "evals": [
                    {
                        "name": "LLMTaskDifficulty",
                        "config": {
                            "model": OPENAI_MODEL,
                            "key": OPENAI_API_KEY,
                            "api_url": OPENAI_BASE_URL
                        }
                    }
                ]
            }
        ]
    }

    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    summary = executor.execute()

    # Collect results that passed evaluation.
    good_list = executor.get_good_info_list()

    # Bucket each LLMTaskDifficulty score into one of four bands.
    difficulty_counts = {
        "Easy (0-3)": 0,
        "Moderate (4-6)": 0,
        "Hard (7-8)": 0,
        "Expert (9-10)": 0,
    }

    total_score = 0
    for item in good_list:
        # NOTE(review): assumes each item carries an 'eval_details' mapping of
        # field -> list of per-metric dicts — confirm against Executor output.
        eval_details = item.get('eval_details', {})
        for field, details in eval_details.items():
            for detail in details:
                if detail.get('metric') == 'LLMTaskDifficulty':
                    score = detail.get('score', 0)
                    total_score += score
                    if score <= 3:
                        difficulty_counts["Easy (0-3)"] += 1
                    elif score <= 6:
                        difficulty_counts["Moderate (4-6)"] += 1
                    elif score <= 8:
                        difficulty_counts["Hard (7-8)"] += 1
                    else:
                        difficulty_counts["Expert (9-10)"] += 1

    print("\n" + "=" * 80)
    print(" 难度分布分析")
    print("=" * 80)
    print(f"总数: {len(good_list)}")
    if good_list:
        print(f"平均难度: {total_score / len(good_list):.2f}/10")
    print("\n难度级别分布:")
    for level, count in difficulty_counts.items():
        percentage = (count / len(good_list) * 100) if good_list else 0
        print(f"  {level}: {count} ({percentage:.1f}%)")

    # BUG FIX: the original divided by len(good_list) unconditionally below,
    # raising ZeroDivisionError whenever no results passed evaluation. The
    # suggestions are meaningless for an empty result set, so guard them.
    if good_list:
        total = len(good_list)
        print("\n💡 数据集平衡建议:")
        # Target distribution: Easy 20%, Moderate 50%, Hard 25%, Expert 5%.
        if difficulty_counts["Easy (0-3)"] / total > 0.3:
            print("  ⚠️ 简单任务过多,考虑增加难度或过滤部分简单任务")
        if difficulty_counts["Moderate (4-6)"] / total < 0.3:
            print("  ⚠️ 中等难度任务不足,这是 SFT 的核心部分")
        if difficulty_counts["Hard (7-8)"] / total > 0.4:
            print("  ⚠️ 困难任务过多,可能影响训练效率")

    return summary
343-
344249if __name__ == "__main__" :
345250 import sys
346251
@@ -361,8 +266,6 @@ def analyze_difficulty_distribution():
361266 evaluate_instruction_clarity ()
362267 elif mode == "difficulty" :
363268 evaluate_task_difficulty ()
364- elif mode == "distribution" :
365- analyze_difficulty_distribution ()
366269 else :
367270 evaluate_both ()
368271
0 commit comments