diff --git a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
index 01b92b7ef..217c3d278 100644
--- a/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
+++ b/hugegraph-llm/src/hugegraph_llm/config/prompt_config.py
@@ -386,4 +386,25 @@ class PromptConfig(BasePromptConfig):
     doc_input_text_CN: str = """介绍一下Sarah,她是一位30岁的律师,还有她的室友James,他们从2010年开始一起合租。James是一名记者,
 职业道路也很出色。另外,Sarah拥有一个个人网站www.sarahsplace.com,而James也经营着自己的网页,不过这里没有提到具体的网址。这两个人,
 Sarah和James,不仅建立起了深厚的室友情谊,还各自在网络上开辟了自己的一片天地,展示着他们各自丰富多彩的兴趣和经历。
+"""
+
+    review_prompt: str = """
+## 评审任务
+请根据以下标准答案对模型的回答进行专业评估:
+
+## 评估要求
+1. 从准确性(与标准答案一致性)、相关性(与问题相关度)、完整性(信息完整度)三个维度进行1-5分评分
+2. 计算综合评分(三个维度平均分,保留1位小数)
+3. 提供简明扼要的改进建议
+4. 使用JSON格式返回以下字段(返回内容一定要被```json ```所包围):
+  - accuracy_score (int)
+  - relevance_score (int)
+  - completeness_score (int)
+  - overall_score (float)
+  - comment (str)
+
+## 标准答案
+{standard_answer}
+
+## 待评审回答
 """
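A minimal illustrative sketch (not part of this patch) of the round trip the new review_prompt defines: the reviewer model is asked to wrap its scores in a ```json fence, and judge() in other_tool_utils.py below extracts the payload with the same regex. The sample response is invented; only the field names come from the prompt.

import json
import re

# Invented reviewer reply following the prompt's contract (scores 1-5, overall = mean kept to 1 decimal).
sample_response = """```json
{
    "accuracy_score": 4,
    "relevance_score": 5,
    "completeness_score": 4,
    "overall_score": 4.3,
    "comment": "Mostly consistent with the standard answer; one detail is missing."
}
```"""

# Same extraction strategy as judge(): prefer the fenced block, then parse it as JSON.
match = re.search(r"```json\n(.*?)\n```", sample_response, re.DOTALL)
payload = match.group(1).strip() if match else sample_response
review = json.loads(payload)
print(review["overall_score"])  # 4.3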
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
index cb3677709..acc2f120d 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/configs_block.py
@@ -219,7 +219,7 @@ def apply_llm_config(current_llm_config, arg1, arg2, arg3, arg4, origin_call=None
         data = {
             "model": arg3,
             "temperature": 0.01,
-            "messages": [{"role": "user", "content": "test"}],
+            "messages": [{"role": "user", "content": "hello"}],
         }
         headers = {"Authorization": f"Bearer {arg1}"}
         status_code = test_api_connection(test_url, method="POST", headers=headers, body=data, origin_call=origin_call)
diff --git a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/other_block.py b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/other_block.py
index da10f50f4..08b3ed27f 100644
--- a/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/other_block.py
+++ b/hugegraph-llm/src/hugegraph_llm/demo/rag_demo/other_block.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import os
 import asyncio
 from contextlib import asynccontextmanager
 
@@ -24,8 +25,10 @@
 from fastapi import FastAPI
 
 from hugegraph_llm.utils.hugegraph_utils import init_hg_test_data, run_gremlin_query, backup_data
+from hugegraph_llm.utils.other_tool_utils import auto_test_llms
 from hugegraph_llm.utils.log import log
 from hugegraph_llm.demo.rag_demo.vector_graph_block import timely_update_vid_embedding
+from hugegraph_llm.config import llm_settings, resource_path
 
 
 def create_other_block():
@@ -42,6 +45,40 @@
         out = gr.Textbox(label="Backup Graph Manually (Auto backup at 1:00 AM everyday)", show_copy_button=True)
     btn = gr.Button("Backup Graph Data")
     btn.click(fn=backup_data, inputs=inp, outputs=out)  # pylint: disable=no-member
+    # auto test llm
+    with gr.Accordion("Evaluation Model Settings (only support openai)", open=True):
+        with gr.Row():
+            review_model_name = gr.Textbox(label="Model Name", value="ernie-4.5-8k-preview", interactive=True)
+            review_max_tokens = gr.Textbox(label="Max Tokens", value=2048)
+            key = gr.Textbox(value=getattr(llm_settings, "openai_chat_api_key"), label="API Key")
+            base = gr.Textbox(value=getattr(llm_settings, "openai_chat_api_base"), label="API Base")
+    with gr.Row():
+        with gr.Column():
+            with gr.Tab("file") as tab_upload_file:  # pylint: disable=W0612
+                inp1_file = gr.File(
+                    value=os.path.join(resource_path, "demo", "llm_review.yaml"),
+                    label="yaml file",
+                    file_count="single",
+                )
+            with gr.Tab("text") as tab_upload_text:  # pylint: disable=W0612
+                inp1 = gr.Textbox(
+                    value="openai, model_name, api_key, api_base, max_tokens\n" \
+                          "qianfan_wenxin, model_name, api_key, secret_key\n" \
+                          "ollama/local, model_name, host, port, max_tokens\n" \
+                          "litellm, model_name, api_key, api_base, max_tokens\n",
+                    label="LLMs Config (every line represents a different LLM)",
+                    show_copy_button=True, lines=6
+                )
+        with gr.Row():
+            inp2 = gr.Textbox(value="hello, how are you?", label="Prompt", show_copy_button=True, lines=8)
+            inp3 = gr.Textbox(value="I am fine, thank you", label="Standard Answer", show_copy_button=True, lines=8)
+    out = gr.Code(label="Output", language="json", elem_classes="code-container-show")
+    btn = gr.Button("Run LLM Test")
+    btn.click(  # pylint: disable=no-member
+        fn=auto_test_llms,
+        inputs=[inp1, inp1_file, inp2, inp3, review_model_name, review_max_tokens, key, base],
+        outputs=out
+    )
     with gr.Accordion("Init HugeGraph test data (🚧)", open=False):
         with gr.Row():
             inp = []
@@ -49,7 +86,6 @@
             out = gr.Textbox(label="Output Info", show_copy_button=True)
         btn = gr.Button("(BETA) Init HugeGraph test data (🚧)")
         btn.click(fn=init_hg_test_data, inputs=inp, outputs=out)  # pylint: disable=no-member
-
     @asynccontextmanager
     async def lifespan(app: FastAPI):  # pylint: disable=W0621
         log.info("Starting background scheduler...")
diff --git a/hugegraph-llm/src/hugegraph_llm/resources/demo/llm_review.yaml b/hugegraph-llm/src/hugegraph_llm/resources/demo/llm_review.yaml
new file mode 100644
index 000000000..f06ecadfb
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/resources/demo/llm_review.yaml
@@ -0,0 +1,11 @@
+- type: openai
+  model_name: ernie-4.5-8k-preview
+  api_key:
+  api_base:
+  max_tokens: 2048
+
+- type: openai
+  model_name: gpt-4.1-mini
+  api_key:
+  api_base:
+  max_tokens: 4096
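As a usage sketch (illustrative, not part of the change), the demo file above can be fed straight to the new YAML parser; entries whose api_key/api_base are left blank still parse (the fields come back as None), but the resulting clients only work once real values are filled in.

import os

from hugegraph_llm.config import resource_path
from hugegraph_llm.utils.other_tool_utils import parse_llm_configurations_from_yaml

# Load the bundled demo config added by this change and list the candidate models.
yaml_path = os.path.join(resource_path, "demo", "llm_review.yaml")
configs = parse_llm_configurations_from_yaml(yaml_path)
print([(c["type"], c["model_name"]) for c in configs])
# [('openai', 'ernie-4.5-8k-preview'), ('openai', 'gpt-4.1-mini')]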
diff --git a/hugegraph-llm/src/hugegraph_llm/utils/other_tool_utils.py b/hugegraph-llm/src/hugegraph_llm/utils/other_tool_utils.py
new file mode 100644
index 000000000..659f62c9d
--- /dev/null
+++ b/hugegraph-llm/src/hugegraph_llm/utils/other_tool_utils.py
@@ -0,0 +1,236 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import time
+import json
+import re
+import gradio as gr
+import yaml
+
+from hugegraph_llm.config import PromptConfig
+from hugegraph_llm.utils.log import log
+from hugegraph_llm.models.llms.ollama import OllamaClient
+from hugegraph_llm.models.llms.openai import OpenAIClient
+from hugegraph_llm.models.llms.qianfan import QianfanClient
+from hugegraph_llm.models.llms.litellm import LiteLLMClient
+
+
+def judge(answers, standard_answer, review_model_name, review_max_tokens, key, base):
+    try:
+        review_client = OpenAIClient(
+            api_key=key,
+            api_base=base,
+            model_name=review_model_name,
+            max_tokens=int(review_max_tokens)
+        )
+        # Build one review request containing the standard answer plus every model's answer.
+        review_prompt = PromptConfig.review_prompt.format(standard_answer=standard_answer)
+        for model_name, answer in answers.items():
+            review_prompt += f"### {model_name}:\n{answer.strip()}\n\n"
+        log.debug("Review_prompt: %s", review_prompt)
+        response = review_client.generate(prompt=review_prompt)
+        log.debug("orig_review_response: %s", response)
+        # Prefer the fenced ```json block the prompt asks for; fall back to the raw response.
+        match = re.search(r'```json\n(.*?)\n```', response, re.DOTALL)
+        if match:
+            response = match.group(1).strip()
+        reviews = json.loads(response)
+        return reviews
+    except Exception as e:  # pylint: disable=W0718
+        log.error("Review failed: %s", str(e))
+        reviews = {"error": f"Review error: {str(e)}"}
+        return reviews
+
+
+def parse_llm_configurations(config_text: str):
+    configs = []
+    lines = config_text.strip().split("\n")
+    for i, line in enumerate(lines, 1):
+        if not line.strip():
+            continue
+        fields = [x.strip() for x in line.split(",")]
+        llm_type = fields[0]
+        try:
+            if llm_type == "openai":
+                # openai, model_name, api_key, api_base, max_tokens
+                model_name, api_key, api_base, max_tokens = fields[1:5]
+                configs.append({
+                    "type": "openai",
+                    "model_name": model_name,
+                    "api_key": api_key,
+                    "api_base": api_base,
+                    "max_tokens": int(max_tokens),
+                })
+            elif llm_type == "qianfan_wenxin":
+                # qianfan_wenxin, model_name, api_key, secret_key
+                model_name, api_key, secret_key = fields[1:4]
+                configs.append({
+                    "type": "qianfan_wenxin",
+                    "model_name": model_name,
+                    "api_key": api_key,
+                    "secret_key": secret_key,
+                })
+            elif llm_type == "ollama/local":
+                # ollama/local, model_name, host, port, max_tokens
+                model_name, host, port, max_tokens = fields[1:5]
+                configs.append({
+                    "type": "ollama/local",
+                    "model_name": model_name,
+                    "host": host,
+                    "port": int(port),
+                    "max_tokens": int(max_tokens),
+                })
+            elif llm_type == "litellm":
+                # litellm, model_name, api_key, api_base, max_tokens
+                model_name, api_key, api_base, max_tokens = fields[1:5]
+                configs.append({
+                    "type": "litellm",
+                    "model_name": model_name,
+                    "api_key": api_key,
+                    "api_base": api_base,
+                    "max_tokens": int(max_tokens),
+                })
+            else:
+                raise ValueError(f"Unsupported llm type '{llm_type}' in line {i}")
+        except Exception as e:
+            raise ValueError(f"Error parsing line {i}: {line}\nDetails: {e}") from e
+    return configs
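To make the comma-separated text format concrete, a small illustrative call follows; the model names, key and host are placeholders, not values from this change.

from hugegraph_llm.utils.other_tool_utils import parse_llm_configurations

# Two candidate LLMs in the same format as the "text" tab default in other_block.py.
config_text = (
    "openai, gpt-4.1-mini, sk-placeholder, https://api.openai.com/v1, 4096\n"
    "ollama/local, some-local-model, 127.0.0.1, 11434, 2048"
)
configs = parse_llm_configurations(config_text)
print(configs[0]["type"], configs[1]["port"])  # openai 11434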
in line {i}") + except Exception as e: + raise ValueError(f"Error parsing line {i}: {line}\nDetails: {e}") from e + return configs + +def parse_llm_configurations_from_yaml(yaml_file_path: str): + configs = [] + with open(yaml_file_path, "r", encoding="utf-8") as f: + raw_configs = yaml.safe_load(f) + if not isinstance(raw_configs, list): + raise ValueError("YAML 文件内容必须是一个 LLM 配置列表。") + for i, config in enumerate(raw_configs, 1): + try: + llm_type = config.get("type") + if llm_type == "openai": + configs.append({ + "type": "openai", + "model_name": config["model_name"], + "api_key": config["api_key"], + "api_base": config["api_base"], + "max_tokens": int(config["max_tokens"]), + }) + elif llm_type == "qianfan_wenxin": + configs.append({ + "type": "qianfan_wenxin", + "model_name": config["model_name"], + "api_key": config["api_key"], + "secret_key": config["secret_key"], + }) + elif llm_type == "ollama/local": + configs.append({ + "type": "ollama/local", + "model_name": config["model_name"], + "host": config["host"], + "port": int(config["port"]), + "max_tokens": int(config["max_tokens"]), + }) + elif llm_type == "litellm": + configs.append({ + "type": "litellm", + "model_name": config["model_name"], + "api_key": config["api_key"], + "api_base": config["api_base"], + "max_tokens": int(config["max_tokens"]), + }) + else: + raise ValueError(f"不支持的 llm type '{llm_type}',在配置第 {i} 项") + except Exception as e: + raise ValueError(f"解析配置第 {i} 项失败: {e}") from e + + return configs + + +def auto_test_llms( + llm_configs, + llm_configs_file, + prompt, + standard_answer, + review_model_name, + review_max_tokens, + key, + base, + fmt=True + ): + configs = None + if llm_configs_file and llm_configs: + raise gr.Error("Please only choose one between file and text.") + if llm_configs: + configs = parse_llm_configurations(llm_configs) + elif llm_configs_file: + configs = parse_llm_configurations_from_yaml(llm_configs_file) + log.debug("LLM_configs: %s", configs) + answers = {} + for config in configs: + output = None + time_start = time.perf_counter() + try: + if config["type"] == "openai": + client = OpenAIClient( + api_key=config["api_key"], + api_base=config["api_base"], + model_name=config["model_name"], + max_tokens=config["max_tokens"], + ) + output = client.generate(prompt=prompt) + elif config["type"] == "qianfan_wenxin": + client = QianfanClient( + model_name=config["model_name"], + api_key=config["api_key"], + secret_key=config["secret_key"] + ) + output = client.generate(prompt=prompt) + elif config["type"] == "ollama/local": + client = OllamaClient( + model_name=config["model_name"], + host=config["host"], + port=config["port"], + ) + output = client.generate(prompt=prompt) + elif config["type"] == "litellm": + client = LiteLLMClient( + api_key=config["api_key"], + api_base=config["api_base"], + model_name=config["model_name"], + max_tokens=config["max_tokens"], + ) + output = client.generate(prompt=prompt) + except Exception as e: # pylint: disable=broad-except + log.error("Generate failed for %s: %s", config["model_name"], e) + output = f"[ERROR] {e}" + time_end = time.perf_counter() + latency = time_end - time_start + answers[config["model_name"]] = { + "answer": output, + "latency": f"{round(latency, 2)}s" + } + reviews = judge( + {k: v["answer"] for k, v in answers.items()}, + standard_answer, + review_model_name, + review_max_tokens, + key, + base + ) + log.debug("reviews: %s", reviews) + result = {} + reviews_dict = {item["model"]: item for item in reviews} if isinstance(reviews, list) else 
+    for model_name, infos in answers.items():
+        result[model_name] = {
+            "answer": infos["answer"],
+            "latency": infos["latency"],
+            "review": reviews_dict.get(model_name, {})
+        }
+    return json.dumps(result, indent=4, ensure_ascii=False) if fmt else reviews
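Finally, a rough end-to-end sketch of driving the new helper without the Gradio UI; every key, base URL and model name below is a placeholder, and the reviewer endpoint is assumed to be OpenAI-compatible, as the new accordion label states.

from hugegraph_llm.utils.other_tool_utils import auto_test_llms

# One candidate LLM per line, same comma-separated format as the "text" tab.
llm_configs = "openai, gpt-4.1-mini, sk-candidate-key, https://api.openai.com/v1, 4096"

result_json = auto_test_llms(
    llm_configs=llm_configs,
    llm_configs_file=None,              # pass either the text config or a YAML file, not both
    prompt="hello, how are you?",
    standard_answer="I am fine, thank you",
    review_model_name="gpt-4.1-mini",   # placeholder reviewer model
    review_max_tokens="2048",
    key="sk-reviewer-key",              # placeholder
    base="https://api.openai.com/v1",
)
# A JSON string keyed by model name, e.g.
# {"gpt-4.1-mini": {"answer": "...", "latency": "1.02s", "review": {...}}}
print(result_json)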