diff --git a/app_gradio/app.py b/app_gradio/app.py index 1294d69d..4e5762eb 100644 --- a/app_gradio/app.py +++ b/app_gradio/app.py @@ -14,17 +14,17 @@ def dingo_demo( uploaded_file, - dataset_source, data_format, input_path, max_workers, batch_size, - column_id, column_prompt, column_content, column_image, - rule_list, prompt_list, scene_list, - model, key, api_url + dataset_source, data_format, remove_output, input_path, max_workers, batch_size, + fields_data, + rule_list, llm_list, + # rule_config_data, + llm_config_data ): if not data_format: raise gr.Error('ValueError: data_format can not be empty, please input.') - # if not column_content: - # raise gr.Error('ValueError: column_content can not be empty, please input.') - if not rule_list and not prompt_list: - raise gr.Error('ValueError: rule_list and prompt_list can not be empty at the same time.') + + if not rule_list and not llm_list: + raise gr.Error('ValueError: rule_list and llm_list can not be empty at the same time.') # Handle input path based on dataset source if dataset_source == "hugging_face": @@ -47,42 +47,127 @@ def dingo_demo( raise gr.Error('Please input value > 0 in batch_size.') try: + # Parse fields from dataframe + fields = {} + if fields_data is not None and len(fields_data) > 0: + for row in fields_data.values.tolist(): + if len(row) >= 2 and row[0] and row[1]: # Both key and value are not empty + fields[row[0]] = row[1] + + # Parse rule configs from dataframe + rule_configs = {} + # if rule_config_data is not None and len(rule_config_data) > 0: + # for row in rule_config_data.values.tolist(): + # if len(row) >= 6 and row[0]: # Rule name exists + # rule_name = row[0] + # config = {} + # + # # threshold + # if row[1] is not None and str(row[1]).strip(): + # try: + # config['threshold'] = float(row[1]) + # except: + # pass + # + # # pattern + # if row[2] and str(row[2]).strip(): + # config['pattern'] = str(row[2]) + # + # # key_list + # if row[3] and str(row[3]).strip(): + # try: + # val = str(row[3]) + # config['key_list'] = json.loads(val) if val.startswith('[') else [k.strip() for k in val.split(',') if k.strip()] + # except: + # config['key_list'] = [k.strip() for k in str(row[3]).split(',') if k.strip()] + # + # # refer_path + # if row[4] and str(row[4]).strip(): + # try: + # val = str(row[4]) + # config['refer_path'] = json.loads(val) if val.startswith('[') else [p.strip() for p in val.split(',') if p.strip()] + # except: + # config['refer_path'] = [p.strip() for p in str(row[4]).split(',') if p.strip()] + # + # # parameters + # if row[5] and str(row[5]).strip(): + # try: + # config['parameters'] = json.loads(str(row[5])) + # except: + # pass + # + # if config: + # rule_configs[rule_name] = config + + # Parse llm configs from dataframe + llm_configs = {} + if llm_config_data is not None and len(llm_config_data) > 0: + for row in llm_config_data.values.tolist(): + if len(row) >= 5 and row[0]: # LLM name exists + llm_name = row[0] + config = {} + + # model + if row[1] and str(row[1]).strip(): + config['model'] = str(row[1]) + + # key + if row[2] and str(row[2]).strip(): + config['key'] = str(row[2]) + + # api_url + if row[3] and str(row[3]).strip(): + config['api_url'] = str(row[3]) + + # parameters + if row[4] and str(row[4]).strip(): + try: + config['parameters'] = json.loads(str(row[4])) + except json.JSONDecodeError as e: + raise gr.Error(f"Invalid JSON in 'parameters' for LLM '{llm_name}': {e}") + + if config: + llm_configs[llm_name] = config + + # Build evals array + evals = [] + + # Add rule evaluators 
and their configurations + for rule in rule_list: + eval_item = {"name": rule} + if rule in rule_configs: + eval_item["config"] = rule_configs[rule] + evals.append(eval_item) + + # Add LLM evaluators and their configurations + for llm in llm_list: + eval_item = {"name": llm} + if llm in llm_configs: + eval_item["config"] = llm_configs[llm] + evals.append(eval_item) + input_data = { "input_path": final_input_path, "output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path), "dataset": { "source": dataset_source, "format": data_format, - "field": {} }, "executor": { - "rule_list": rule_list, - "prompt_list": prompt_list, "result_save": { "bad": True, - "raw": True + "good": True }, "max_workers": max_workers, "batch_size": batch_size, }, - "evaluator": { - "llm_config": { - scene_list: { - "model": model, - "key": key, - "api_url": api_url, - } + "evaluator": [ + { + "fields": fields, + "evals": evals } - } + ] } - if column_id: - input_data['dataset']['field']['id'] = column_id - if column_prompt: - input_data['dataset']['field']['prompt'] = column_prompt - if column_content: - input_data['dataset']['field']['content'] = column_content - if column_image: - input_data['dataset']['field']['image'] = column_image # print(input_data) # exit(0) @@ -95,24 +180,26 @@ def dingo_demo( for item in detail: new_detail.append(item) if summary['output_path']: - shutil.rmtree(summary['output_path']) + if remove_output == "true": + shutil.rmtree(summary['output_path']) + summary['output_path'] = "" - # 返回两个值:概要信息和详细信息 + # Return summary and detail information return json.dumps(summary, indent=4), new_detail except Exception as e: raise gr.Error(str(e)) def update_input_components(dataset_source): - # 根据数据源的不同,返回不同的输入组件 + # Return different input components based on data source if dataset_source == "hugging_face": - # 如果数据源是huggingface,返回一个可见的文本框和一个不可见的文件组件 + # If data source is huggingface, return a visible textbox and an invisible file component return [ gr.Textbox(visible=True), gr.File(visible=False), ] else: # local - # 如果数据源是本地,返回一个不可见的文本框和一个可见的文件组件 + # If data source is local, return an invisible textbox and a visible file component return [ gr.Textbox(visible=False), gr.File(visible=True), @@ -127,68 +214,61 @@ def update_rule_list(rule_type_mapping, rule_type): ) -def update_prompt_list(scene_prompt_mapping, scene): - """根据选择的场景更新可用的prompt列表,并清空所有勾选""" - return gr.CheckboxGroup( - choices=scene_prompt_mapping.get(scene, []), - value=[], # 清空所有勾选 - label="prompt_list" - ) +# Generate configuration dataframes based on selected evaluators +# def generate_rule_config_dataframe(rule_list): +# """Generate rule configuration dataframe based on selected rules""" +# if not rule_list: +# return gr.update(value=[], visible=False) +# +# # Create rows for each rule +# rows = [] +# for rule in rule_list: +# rows.append([rule, None, "", "", "", ""]) +# +# return gr.update(value=rows, visible=True) -# prompt_list变化时,动态控制model、key、api_url的显示 -def toggle_llm_fields(prompt_values): - visible = bool(prompt_values) - return ( - gr.update(visible=visible), - gr.update(visible=visible), - gr.update(visible=visible) - ) +def generate_llm_config_dataframe(llm_list): + """Generate LLM configuration dataframe based on selected LLMs""" + if not llm_list: + return gr.update(value=[], visible=False) + + # Create rows for each LLM + rows = [] + for llm in llm_list: + rows.append([llm, "deepseek-chat", "your-api-key", "https://api.deepseek.com/v1", ""]) + return gr.update(value=rows, visible=True) 
-# 控制column_id、column_prompt、column_content、column_image的显示 -def update_column_fields(rule_list, prompt_list): + +def suggest_fields_dataframe(rule_list, llm_list): + """Suggest required field mappings based on selected evaluators""" + suggested_fields = set() + + # Fields required by rule evaluators rule_type_mapping = get_rule_type_mapping() - scene_prompt_mapping = get_scene_prompt_mapping() data_column_mapping = get_data_column_mapping() - status_mapping = { - 'id': False, - 'prompt': False, - 'content': False, - 'image': False, - } - res = ( - gr.update(visible=status_mapping['id']), - gr.update(visible=status_mapping['prompt']), - gr.update(visible=status_mapping['content']), - gr.update(visible=status_mapping['image']) - ) - if not rule_list and not prompt_list: - return res + for rule in rule_list: + # Find which type this rule belongs to + for rule_type, rules in rule_type_mapping.items(): + if rule in rules: + if rule_type in data_column_mapping: + suggested_fields.update(data_column_mapping[rule_type]) + break - key_list = [] - key_list += get_key_by_mapping(rule_type_mapping, rule_list) - key_list += get_key_by_mapping(scene_prompt_mapping, prompt_list) + # Fields required by LLM evaluators + llm_column_mapping = get_llm_column_mapping() + for llm in llm_list: + if llm in llm_column_mapping: + suggested_fields.update(llm_column_mapping[llm]) - data_column = [] - for key in key_list: - if not data_column: - data_column = data_column_mapping[key] - else: - new_data_column = data_column_mapping[key] - if data_column != new_data_column: - raise gr.Error(f'ConflictError: {key} need data type is different from other.') - - for c in data_column: - status_mapping[c] = True - res = ( - gr.update(visible=status_mapping['id']), - gr.update(visible=status_mapping['prompt']), - gr.update(visible=status_mapping['content']), - gr.update(visible=status_mapping['image']) - ) - return res + # Generate suggested fields rows + rows = [] + for field in sorted(suggested_fields): + rows.append([field, field]) + + return gr.update(value=rows if rows else [["content", "content"]]) def get_rule_type_mapping(): @@ -208,50 +288,32 @@ def get_rule_type_mapping(): return process_map -def get_scene_prompt_mapping(): - origin_map = Model.get_scenario_prompt_map() - process_map = {'LLMTextQualityModelBase': [], 'LLMTextQualityPromptBase': []} # can adjust the order - for k, v in origin_map.items(): - for p in v: - if k not in process_map: - process_map[k] = [] - process_map[k].append(p.__name__) - # print(process_map) - - return process_map - - -def get_key_by_mapping(map_dict: dict, value_list: list): - key_list = [] - for k, v in map_dict.items(): - if bool(set(v) & set(value_list)): - key_list.append(k) - - return key_list +def get_llm_list(): + """Get LLM list from Model.llm_name_map""" + llm_name_map = Model.get_llm_name_map() + return list(llm_name_map.keys()) + + +def get_llm_column_mapping(): + """Get column mapping required by each LLM""" + # Define columns required by each LLM based on actual needs + # Can be dynamically obtained from Model information, using default configuration for now + llm_list = get_llm_list() + mapping = {} + for llm_name in llm_list: + # Specify different field requirements based on specific LLM type + if 'VLM' in llm_name or 'Image' in llm_name: + mapping[llm_name] = ['content', 'image'] + elif 'Relevant' in llm_name: + mapping[llm_name] = ['prompt', 'content'] + else: + mapping[llm_name] = ['content'] + return mapping def get_data_column_mapping(): return { - # llm - 
'LLMTextQualityPromptBase': ['content'], - 'LLMTextQualityModelBase': ['content'], - 'LLMSecurityPolitics': ['content'], - 'LLMSecurityProhibition': ['content'], - 'LLMText3HHarmless': ['content'], - 'LLMText3HHelpful': ['content'], - 'LLMText3HHonest': ['content'], - 'LLMClassifyTopic': ['content'], - 'LLMClassifyQR': ['content'], - 'LLMDatamanAssessment': ['content'], - 'VLMImageRelevant': ['prompt', 'content'], - - # rule - # 'QUALITY_BAD_COMPLETENESS': ['content'], - # 'QUALITY_BAD_EFFECTIVENESS': ['content'], - # 'QUALITY_BAD_FLUENCY': ['content'], - # 'QUALITY_BAD_RELEVANCE': ['content'], - # 'QUALITY_BAD_SIMILARITY': ['content'], - # 'QUALITY_BAD_UNDERSTANDABILITY': ['content'], + # Rule mapping 'Rule-Based TEXT Quality Metrics': ['content'], 'QUALITY_BAD_SECURITY': ['content'], 'QUALITY_BAD_IMG_EFFECTIVENESS': ['image'], @@ -264,8 +326,7 @@ def get_data_column_mapping(): rule_type_mapping = get_rule_type_mapping() rule_type_options = list(rule_type_mapping.keys()) - scene_prompt_mapping = get_scene_prompt_mapping() - scene_options = list(scene_prompt_mapping.keys()) + llm_options = get_llm_list() current_dir = Path(__file__).parent with open(os.path.join(current_dir, 'header.html'), "r") as file: @@ -291,10 +352,16 @@ def get_data_column_mapping(): visible=False ) - data_format = gr.Dropdown( - ["jsonl", "json", "plaintext", "listjson","image"], - label="data_format" - ) + with gr.Row(): + data_format = gr.Dropdown( + ["jsonl", "json", "plaintext", "listjson","image"], + label="data_format" + ) + remove_output = gr.Dropdown( + ["true", "false"], + value="true", + label="remove_output" + ) with gr.Row(): max_workers = gr.Number( value=1, @@ -313,84 +380,70 @@ def get_data_column_mapping(): rule_type = gr.Dropdown( choices=rule_type_options, value=rule_type_options[0], - label="rule_type", + label="Rule Type", interactive=True ) rule_list = gr.CheckboxGroup( choices=rule_type_mapping.get(rule_type_options[0], []), - label="rule_list" + label="Rule List" ) - # 添加场景选择下拉框 - scene_list = gr.Dropdown( - choices=scene_options, - value=scene_options[0], - label="scenario_list", - interactive=True + # LLM evaluator list + llm_list = gr.CheckboxGroup( + choices=llm_options, + label="LLM List" ) - prompt_list = gr.CheckboxGroup( - choices=scene_prompt_mapping.get(scene_options[0], []), - label="prompt_list" - ) - # LLM模型名 - model = gr.Textbox( - placeholder="If want to use llm, please input model, such as: deepseek-chat", - label="model", - visible=False - ) - # LLM API KEY - key = gr.Textbox( - placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx", - label="API KEY", - visible=False + + gr.Markdown("### EvalPipline Configuration") + gr.Markdown("Configure field mappings and evaluator parameters based on selected evaluators ([Examples](https://github.com/MigoXLab/dingo/tree/main/examples))") + + # Field mapping configuration + gr.Markdown("**EvalPipline.fields** - Field Mapping") + fields_dataframe = gr.Dataframe( + value=[["content", "content"]], + headers=["Field Key", "Dataset Column"], + datatype=["str", "str"], + col_count=(2, "fixed"), + row_count=(1, "dynamic"), + label="Field Mappings (add/remove rows as needed)", + interactive=True ) - # LLM API URL - api_url = gr.Textbox( - placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1", - label="API URL", + + # Rule configuration + # gr.Markdown("**Rule Config** - EvalPiplineConfig.config for Rules") + # rule_config_dataframe = gr.Dataframe( + # value=[], + # 
headers=["Rule Name", "threshold", "pattern", "key_list", "refer_path", "parameters"], + # datatype=["str", "number", "str", "str", "str", "str"], + # col_count=(6, "fixed"), + # row_count=(0, "dynamic"), + # label="Rule Configurations (auto-generated based on rule_list selection)", + # interactive=True, + # visible=False + # ) + + # LLM configuration + gr.Markdown("**LLM Config** - EvalPiplineConfig.config for LLMs") + llm_config_dataframe = gr.Dataframe( + value=[], + headers=["LLM Name", "model", "key", "api_url", "parameters"], + datatype=["str", "str", "str", "str", "str"], + col_count=(5, "fixed"), + row_count=(0, "dynamic"), + label="LLM Configurations (auto-generated based on llm_list selection)", + interactive=True, visible=False ) - with gr.Row(): - # 字段映射说明文本,带示例链接 - with gr.Column(): - gr.Markdown( - "Please input the column name of dataset in the input boxes below ( [examples](https://github.com/MigoXLab/dingo/tree/main/examples) )") - - column_id = gr.Textbox( - value="", - placeholder="Column name of id in the input file. If exists multiple levels, use '.' separate", - label="column_id", - visible=False - ) - column_prompt = gr.Textbox( - value="", - placeholder="Column name of prompt in the input file. If exists multiple levels, use '.' separate", - label="column_prompt", - visible=False - ) - column_content = gr.Textbox( - value="content", - placeholder="Column name of content in the input file. If exists multiple levels, use '.' separate", - label="column_content", - visible=False - ) - column_image = gr.Textbox( - value="", - placeholder="Column name of image in the input file. If exists multiple levels, use '.' separate", - label="column_image", - visible=False - ) - with gr.Row(): submit_single = gr.Button(value="Submit", interactive=True, variant="primary") with gr.Column(): - # 修改输出组件部分,使用Tabs + # Output component section, using Tabs with gr.Tabs(): with gr.Tab("Result Summary"): - summary_output = gr.JSON(label="summary", max_height=800) + summary_output = gr.JSON(label="Summary", max_height=800) with gr.Tab("Result Detail"): - detail_output = gr.JSON(label="detail", max_height=800) # 使用JSON组件来更好地展示结构化数据 + detail_output = gr.JSON(label="Detail", max_height=800) # Use JSON component for better structured data display dataset_source.change( fn=update_input_components, @@ -404,38 +457,40 @@ def get_data_column_mapping(): outputs=rule_list ) - # 场景变化时更新prompt列表 - scene_list.change( - fn=partial(update_prompt_list, scene_prompt_mapping), - inputs=scene_list, - outputs=prompt_list - ) - - prompt_list.change( - fn=toggle_llm_fields, - inputs=prompt_list, - outputs=[model, key, api_url] + # Auto-generate configuration dataframes when rule_list changes + # rule_list.change( + # fn=generate_rule_config_dataframe, + # inputs=rule_list, + # outputs=rule_config_dataframe + # ) + + # Auto-generate configuration dataframes when llm_list changes + llm_list.change( + fn=generate_llm_config_dataframe, + inputs=llm_list, + outputs=llm_config_dataframe ) - # column字段显示控制 - for comp in [rule_list, prompt_list]: + # Suggest field mappings when evaluators change + for comp in [rule_list, llm_list]: comp.change( - fn=update_column_fields, - inputs=[rule_list, prompt_list], - outputs=[column_id, column_prompt, column_content, column_image] + fn=suggest_fields_dataframe, + inputs=[rule_list, llm_list], + outputs=fields_dataframe ) submit_single.click( fn=dingo_demo, inputs=[ uploaded_file, - dataset_source, data_format, input_path, max_workers, batch_size, - column_id, column_prompt, 
column_content, column_image, - rule_list, prompt_list, scene_list, - model, key, api_url + dataset_source, data_format, remove_output, input_path, max_workers, batch_size, + fields_dataframe, + rule_list, llm_list, + # rule_config_dataframe, + llm_config_dataframe ], - outputs=[summary_output, detail_output] # 修改输出为两个组件 + outputs=[summary_output, detail_output] ) - # 启动界面 + # Launch interface demo.launch(share=True) diff --git a/dingo/model/llm/base.py b/dingo/model/llm/base.py index 778f7f1f..440193e2 100644 --- a/dingo/model/llm/base.py +++ b/dingo/model/llm/base.py @@ -9,7 +9,7 @@ class BaseLLM: client = None prompt: str | List = None - dynamic_config: EvaluatorLLMArgs + dynamic_config: EvaluatorLLMArgs = EvaluatorLLMArgs() @classmethod def eval(cls, input_data: Data) -> EvalDetail: diff --git a/dingo/model/llm/rag/llm_rag_context_precision.py b/dingo/model/llm/rag/llm_rag_context_precision.py index e9cefb5a..94a2a84f 100644 --- a/dingo/model/llm/rag/llm_rag_context_precision.py +++ b/dingo/model/llm/rag/llm_rag_context_precision.py @@ -236,9 +236,9 @@ def process_response(cls, responses: List[str]) -> EvalDetail: context_verdicts.append(verdict) all_verdicts.append(verdict) - all_reasons.append(f"上下文{i+1}: {'相关' if verdict else '不相关'}\n理由: {reason}") + all_reasons.append(f"上下文{i + 1}: {'相关' if verdict else '不相关'}\n理由: {reason}") except json.JSONDecodeError: - raise ConvertJsonError(f"Convert to JSON format failed for response {i+1}: {response}") + raise ConvertJsonError(f"Convert to JSON format failed for response {i + 1}: {response}") # 计算平均精度 avg_precision = cls._calculate_average_precision(context_verdicts) @@ -304,7 +304,7 @@ def eval(cls, input_data: Data) -> EvalDetail: # } res.status = True res.label = ["QUALITY_BAD.REQUEST_FAILED"] - res.reason = [f"为上下文{item['context_index']+1}发送请求失败"] + res.reason = [f"为上下文{item['context_index'] + 1}发送请求失败"] return res responses.append(response) diff --git a/dingo/model/llm/rag/llm_rag_context_recall.py b/dingo/model/llm/rag/llm_rag_context_recall.py index ee27cad7..2a37fb5c 100644 --- a/dingo/model/llm/rag/llm_rag_context_recall.py +++ b/dingo/model/llm/rag/llm_rag_context_recall.py @@ -202,7 +202,7 @@ def process_response(cls, response: str) -> EvalDetail: reason = item.get("reason", "") status_text = "可归因于上下文" if is_attributed else "不可归因于上下文" - all_reasons.append(f"陈述{i+1}: {statement}\n状态: {status_text}\n理由: {reason}") + all_reasons.append(f"陈述{i + 1}: {statement}\n状态: {status_text}\n理由: {reason}") # 构建完整的reason文本 reason_text = "\n\n".join(all_reasons) diff --git a/dingo/model/model.py b/dingo/model/model.py index ee3fa25b..faa1c1c9 100644 --- a/dingo/model/model.py +++ b/dingo/model/model.py @@ -154,21 +154,23 @@ def load_model(cls): cls.module_loaded = True @classmethod - def set_config_rule(self, rule: BaseRule, rule_config: EvaluatorRuleArgs): + def set_config_rule(cls, rule: BaseRule, rule_config: EvaluatorRuleArgs): if not rule_config: return config_default = getattr(rule, 'dynamic_config') - for k, v in rule_config: + # Iterate over rule_config fields using Pydantic's model_dump() + for k, v in rule_config.model_dump().items(): if v is not None: setattr(config_default, k, v) setattr(rule, 'dynamic_config', config_default) @classmethod - def set_config_llm(self, llm: BaseLLM, llm_config: EvaluatorLLMArgs): + def set_config_llm(cls, llm: BaseLLM, llm_config: EvaluatorLLMArgs): if not llm_config: return config_default = getattr(llm, 'dynamic_config') - for k, v in llm_config: + # Iterate over llm_config fields using Pydantic's 
model_dump() + for k, v in llm_config.model_dump().items(): if v is not None: setattr(config_default, k, v) setattr(llm, 'dynamic_config', config_default) diff --git a/dingo/model/rule/base.py b/dingo/model/rule/base.py index ff6dded6..e8346bf2 100644 --- a/dingo/model/rule/base.py +++ b/dingo/model/rule/base.py @@ -6,9 +6,9 @@ class BaseRule: - metric_type: str # This will be set by the decorator - group: List[str] # This will be set by the decorator - dynamic_config: EvaluatorRuleArgs + metric_type: str = '' # This will be set by the decorator + group: List[str] = [] # This will be set by the decorator + dynamic_config: EvaluatorRuleArgs = EvaluatorRuleArgs() # Default config, can be overridden by subclasses @classmethod def eval(cls, input_data: Data) -> EvalDetail:
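
The revised `dingo_demo` assembles its config in the new per-evaluator `fields`/`evals` shape instead of the old `rule_list`/`prompt_list`/`llm_config` layout. Below is a minimal sketch of the resulting `input_data`; the evaluator names, field mapping, paths, and credential values are illustrative placeholders, not values taken from this patch:

    input_data = {
        "input_path": "data/demo.jsonl",      # placeholder path
        "output_path": "data",                # dirname of the input for local files, "" for hugging_face
        "dataset": {
            "source": "local",                # "hugging_face" or "local"
            "format": "jsonl",
        },
        "executor": {
            "result_save": {"bad": True, "good": True},
            "max_workers": 1,
            "batch_size": 1,
        },
        "evaluator": [
            {
                # fields: one entry per row of the fields dataframe (field key -> dataset column)
                "fields": {"content": "content"},
                # evals: selected rules plus selected LLMs, each optionally carrying a config
                "evals": [
                    {"name": "SOME_RULE"},            # hypothetical rule name from the Rule List
                    {"name": "SomeLLMEvaluator",      # hypothetical LLM name from the LLM List
                     "config": {"model": "deepseek-chat",
                                "key": "your-api-key",
                                "api_url": "https://api.deepseek.com/v1"}},
                ],
            }
        ],
    }

Rule entries only gain a "config" key when a matching row exists in the (currently commented-out) rule config dataframe; LLM entries pick up model/key/api_url and optional JSON "parameters" from the LLM config dataframe.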