Commit 21d347a

Merge pull request #186 from e06084/dev
fix: fix Hallucination eval
2 parents: 1ed74b2 + 49b5bbe

5 files changed: +55 additions, -13 deletions

app_gradio/app.py

Lines changed: 1 addition & 1 deletion

@@ -438,4 +438,4 @@ def get_data_column_mapping():
)

# Launch the interface
-demo.launch()
+demo.launch(server_port=7861, share=True)
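
For context, a minimal standalone sketch of what the new launch call does; the toy Blocks app below is hypothetical and not taken from app_gradio/app.py.

import gradio as gr

# Hypothetical placeholder UI; the real app builds a full evaluation interface.
with gr.Blocks() as demo:
    gr.Markdown("Dingo evaluation demo placeholder")

if __name__ == "__main__":
    # server_port pins the app to 7861 instead of Gradio's default (7860);
    # share=True asks Gradio to create a temporary public share URL.
    demo.launch(server_port=7861, share=True)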

dingo/model/llm/llm_hallucination.py

Lines changed: 1 addition & 1 deletion

@@ -58,7 +58,7 @@ def build_messages(cls, input_data: Data) -> List:
# Format contexts for display
contexts_str = json.dumps(contexts, ensure_ascii=False, indent=2)

-prompt_content = cls.prompt.content % (question, response, contexts_str)
+prompt_content = cls.prompt.content.format(question, response, contexts_str)

messages = [{"role": "user", "content": prompt_content}]
return messages
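
The change above swaps %-style interpolation for str.format, so the prompt's placeholders change from %s to {} to match (see the next file). A quick sketch with a stand-in template, not the real prompt text:

# Stand-in template (hypothetical), showing the two call styles are equivalent
# once the placeholders are switched from %s to {}.
template_old = "Question: %s\nResponse: %s\nContexts: %s"
template_new = "Question: {}\nResponse: {}\nContexts: {}"

question, response, contexts_str = "Who won?", "Einstein.", '["context A"]'

assert template_old % (question, response, contexts_str) == \
       template_new.format(question, response, contexts_str)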

dingo/model/prompt/prompt_hallucination.py

Lines changed: 9 additions & 9 deletions

@@ -31,26 +31,26 @@ class PromptHallucination(BasePrompt):
Example actual output: "Einstein won the Nobel Prize in 1969 for his discovery of the photoelectric effect."

Example:
-{
+{{
    "verdicts": [
-       {
+       {{
            "verdict": "yes",
            "reason": "The actual output agrees with the provided context which states that Einstein won the Nobel Prize for his discovery of the photoelectric effect."
-       },
-       {
+       }},
+       {{
            "verdict": "no",
            "reason": "The actual output contradicts the provided context which states that Einstein won the Nobel Prize in 1968, not 1969."
-       }
+       }}
    ]
-}
+}}

You should NOT incorporate any prior knowledge you have and take each context at face value. Since you are going to generate a verdict for each context, the number of 'verdicts' SHOULD BE STRICTLY EQUAL TO the number of contexts provided.
You should FORGIVE cases where the actual output is lacking in detail, you should ONLY provide a 'no' answer if IT IS A CONTRADICTION.

**Input Data:**
-Question/Prompt: %s
-Response: %s
-Contexts: %s
+Question/Prompt: {}
+Response: {}
+Contexts: {}

Please evaluate the response against each context and return the verdicts in JSON format:
"""

examples/3h/3h_eval.py

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
import os

from dingo.config import InputArgs
from dingo.exec import Executor

if __name__ == '__main__':
    OPENAI_MODEL = 'deepseek-chat'
    OPENAI_URL = 'https://api.deepseek.com/v1'
    OPENAI_KEY = os.getenv("OPENAI_KEY")

    input_data = {
        "input_path": "/Users/chupei/code/dingo/test/data/test_3h_jsonl.jsonl",
        "dataset": {
            "source": "local",
            "format": "jsonl",
            "field": {
                "prompt": "input",
                "content": "response",
                "context": "response"
            }
        },
        "executor": {
            "prompt_list": ["PromptTextHarmless", "PromptTextHelpful", "PromptTextHonest"],
            "result_save": {
                "bad": True,
                "good": True
            }
        },
        "evaluator": {
            "llm_config": {
                "LLMText3HHarmless": {
                    "model": OPENAI_MODEL,
                    "key": OPENAI_KEY,
                    "api_url": OPENAI_URL,
                }
            }
        }
    }
    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)
    result = executor.execute()
    print(result)
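
The example reads its key from the OPENAI_KEY environment variable; if it is unset, os.getenv returns None and the request would fail later. A small optional guard (not part of the file) that fails fast:

import os

# Hypothetical guard, not part of examples/3h/3h_eval.py.
if not os.getenv("OPENAI_KEY"):
    raise SystemExit("Set OPENAI_KEY before running examples/3h/3h_eval.py, "
                     "e.g. export OPENAI_KEY=<your key> in your shell.")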

examples/hallucination/dataset_hallucination_evaluation.py

Lines changed: 2 additions & 2 deletions

@@ -46,8 +46,8 @@ def evaluate_hallucination_jsonl_dataset():
        "llm_config": {
            "LLMHallucination": {
                "model": "deepseek-chat",
-                "key": "YOUR_API_KEY",
-                "api_url": "https://api.deepseek.com"
+                "key": "Your API Key",
+                "api_url": "https://api.deepseek.com/v1"
            }
        }
    }
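
For reference, a sketch of the corrected DeepSeek settings in context. The llm_config block comes from this diff; the enclosing "evaluator" key mirrors examples/3h/3h_eval.py above and is an assumption here, since the surrounding lines of this file are not shown.

import os

# Sketch only: the real file builds a fuller config around this block.
evaluator_config = {
    "evaluator": {
        "llm_config": {
            "LLMHallucination": {
                "model": "deepseek-chat",
                "key": os.getenv("OPENAI_KEY", "Your API Key"),
                # The /v1 suffix targets DeepSeek's OpenAI-compatible endpoint.
                "api_url": "https://api.deepseek.com/v1",
            }
        }
    }
}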
