Skip to content

Commit 45bab28

Browse files
authored
Merge pull request #221 from seancoding-day/dev
feat: 1. add VLM explanation; 2. update image format in messages; 3. update image dataset
2 parents daa8ce5 + 1a3afb6 commit 45bab28

File tree

4 files changed

+94
-22
lines changed

4 files changed

+94
-22
lines changed

dingo/model/llm/vlm_image_relevant.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import base64
2+
import os
13
from typing import List
24

35
from dingo.io import Data
@@ -10,15 +12,65 @@
1012
class VLMImageRelevant(BaseOpenAI):
1113
prompt = PromptImageRelevant
1214

15+
@classmethod
16+
def _encode_image(cls, image_path: str) -> str:
17+
"""
18+
Encode a local image file to base64 data URL format.
19+
If the input is already a URL, return it as is.
20+
21+
This method follows Python's standard path resolution:
22+
- Relative paths are resolved relative to the current working directory
23+
- Absolute paths are used as-is
24+
- URLs (http://, https://, data:) are passed through unchanged
25+
26+
Args:
27+
image_path: Local file path (absolute or relative) or URL
28+
29+
Returns:
30+
Base64 data URL for local files, or original URL for web resources
31+
32+
Raises:
33+
FileNotFoundError: If a local file path does not exist
34+
RuntimeError: If the file cannot be read
35+
"""
36+
# Pass through URLs unchanged
37+
if image_path.startswith(('http://', 'https://', 'data:')):
38+
return image_path
39+
40+
# Standard file path handling (relative or absolute)
41+
if not os.path.isfile(image_path):
42+
raise FileNotFoundError(
43+
f"Image file not found: '{image_path}'\n"
44+
f"Current working directory: {os.getcwd()}\n"
45+
f"Absolute path would be: {os.path.abspath(image_path)}\n"
46+
f"Ensure the path is correct relative to your current working directory."
47+
)
48+
49+
try:
50+
with open(image_path, "rb") as image_file:
51+
base64_image = base64.b64encode(image_file.read()).decode('utf-8')
52+
# Determine MIME type from file extension
53+
ext = os.path.splitext(image_path)[1].lower()
54+
mime_type = 'image/jpeg' if ext in ['.jpg', '.jpeg'] else f'image/{ext[1:]}'
55+
return f"data:{mime_type};base64,{base64_image}"
56+
except Exception as e:
57+
raise RuntimeError(
58+
f"Failed to read image file '{image_path}': {e}"
59+
)
60+
1361
@classmethod
1462
def build_messages(cls, input_data: Data) -> List:
63+
# Encode images if they are local file paths
64+
image_url_1 = cls._encode_image(input_data.prompt)
65+
image_url_2 = cls._encode_image(input_data.content)
66+
1567
messages = [
1668
{
1769
"role": "user",
1870
"content": [
1971
{"type": "text", "text": cls.prompt.content},
20-
{"type": "image_url", "image_url": {"url": input_data.prompt}},
21-
{"type": "image_url", "image_url": {"url": input_data.content}},
72+
{"type": "image_url", "image_url": {"url": image_url_1}},
73+
{"type": "image_url", "image_url": {"url": image_url_2}},
2274
],
2375
}
2476
]

dingo/model/prompt/prompt_image_relevant.py

Lines changed: 31 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,16 +9,37 @@ class PromptImageRelevant(BasePrompt):
99
_metric_info = {
1010
"category": "Multimodality Assessment Metrics",
1111
"metric_name": "PromptImageRelevant",
12-
"description": "Evaluates if an image matches reference image in terms of face count, feature details, and visual elements",
12+
"description": "Evaluates image consistency and relevance through comprehensive analysis of content, semantics, visual quality, and detail fidelity",
1313
"evaluation_results": ""
1414
}
1515

16-
content = """
17-
作为一款专业的图片检测AI工具,请结合第一张图评估第二张图片是否符合标准。请先分析第一张图片,包括背景信息、人脸数量、以及每个人物的脸部和手部特征。
18-
然后根据以下标准对第二张图片进行评分:\n
19-
1. 图片中的人脸数量是否与第一张图片一致;\n
20-
2. 每个人物的脸部和手部是否变形;\n
21-
3. 如果第一张图片中有国旗标志,则判断第二张图片中的国旗标志颜色和形状是否一致。\n
22-
只要存在一处不符合,即不通过。评分0表示不通过,1表示通过。\n
23-
请只输出评分和理由,输出格式为json,模版为{"score": xxx, "reason": "xxx"}。\n
24-
"""
16+
content = """你是一个专业的图像对比分析系统。请对比分析两张图片的一致性和相关性。
17+
18+
【分析步骤】
19+
1. 第一张图片分析
20+
仔细观察并记录第一张图片的核心内容:
21+
- 主要对象(人物、物体、场景)
22+
- 视觉元素(颜色、构图、风格)
23+
- 关键细节(文字、标识、特征)
24+
- 语义信息(主题、意图、情境)
25+
26+
2. 第二张图片评估
27+
基于第一张图片,从以下维度评估第二张图片:
28+
- 内容一致性:主要对象和场景元素是否保持一致
29+
- 语义相关性:主题意图和信息传达是否相符
30+
- 视觉质量:图像清晰度、完整性、是否存在明显缺陷
31+
- 细节保真度:重要特征、比例、空间关系是否准确
32+
33+
3. 综合评分
34+
评分标准:
35+
- 分数1:图片整体一致且相关,无明显问题
36+
- 分数0:存在以下任一情况
37+
* 主要内容不一致或缺失
38+
* 语义偏离或不相关
39+
* 存在明显的质量缺陷
40+
* 关键细节错误或失真
41+
42+
【输出要求】
43+
请进行逐步分析后,输出最终评分和简要原因。
44+
输出格式必须为JSON:{"score": 评分, "reason": "原因说明"}
45+
"""

examples/image/sdk_image_relevant.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,9 @@ def image_relevant():
2424
},
2525
"evaluator": {
2626
"llm_config": {
27+
# IMPORTANT: VLMImageRelevant requires a vision-language model (VLM)
2728
"VLMImageRelevant": {
29+
"model": "", # e.g. qwen3-vl, gpt-4o, doubao-seed-vision
2830
"key": "",
2931
"api_url": "",
3032
}

test/data/test_img_jsonl.jsonl

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
1-
{"id": "1", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new1.jpg"}
2-
{"id": "2", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new2.jpg"}
3-
{"id": "3", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new3.jpg"}
4-
{"id": "4", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new4.jpg"}
5-
{"id": "5", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new5.jpg"}
6-
{"id": "6", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new6.jpg"}
7-
{"id": "7", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new7.jpg"}
8-
{"id": "8", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new8.jpg"}
9-
{"id": "9", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new9.jpg"}
10-
{"id": "10", "url_1": "../test/data/img_builtin/origin.jpg", "url_2": "../test/data/img_builtin/new10.jpg"}
1+
{"id": "1", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new1.jpeg"}
2+
{"id": "2", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new2.jpeg"}
3+
{"id": "3", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new3.jpeg"}
4+
{"id": "4", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new4.jpeg"}
5+
{"id": "5", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new5.jpeg"}
6+
{"id": "6", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new6.jpeg"}
7+
{"id": "7", "url_1": "../../test/data/img_builtin/origin.jpg", "url_2": "../../test/data/img_builtin/new7.jpeg"}

0 commit comments

Comments
 (0)