MMSearch/eval_end2end.py at main · CaraJ7/MMSearch · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
'''
RUNNING THIS FILE WILL COMPLETE THE REQUERY TASK AT THE SAMETIME.
'''

import os
import json
import datetime
import datasets
from tqdm import tqdm

from utils.logging_utils import setup_logging
import logging

setup_logging()
logger = logging.getLogger(__name__)
import argparse

from utils.utils import *
from prompts.prompt import *
from prompts.prompt_w_imagesearch import *
from utils.prompt_utils import *
from models.load import load_model
from score.req_score import get_requery_score
from score.f1_score import get_f1_score
from utils.image_utils import pil_image_to_bytes
from retrieve_content.retriever import Content_Retriever
from constants import FULLPAGE_SPLIT_DICT
from score.result_summary import get_result_summary
import random


def parse_args():
    argparser = argparse.ArgumentParser()
    argparser.add_argument("--model_type", default='Llava', type=str, help='the number of results from search engine')
    argparser.add_argument("--model_path", default='lmms-lab/llava-onevision-qwen2-7b-ov', type=str, help='the number of results from search engine')
    argparser.add_argument("--world-size", type=int, default=1)
    argparser.add_argument("--rank", type=int, default=0)
    argparser.add_argument("--brief_result_num", default=8, type=int)
    argparser.add_argument("--fullpage_num", default=1, type=int)
    argparser.add_argument("--save_path", default='output/end2end/debug', type=str)
    argparser.add_argument("--save_middle_results", type=str, default='')
    argparser.add_argument("--generation_args_path", type=str, default='customs/generation_args.json', help='LMM generation parameters, should be a json')
    argparser.add_argument("--verbose", action='store_true', default=False,)
    return argparser.parse_args()

args = parse_args()

sample_save_path = os.path.join(args.save_path, 'samples')
os.makedirs(sample_save_path, exist_ok=True)

# load model
model = load_model(args)
# load content retriever
content_retriever = Content_Retriever()

# load data
anno = datasets.load_dataset('CaraJ/MMSearch', name='end2end', split='end2end')
# calculate start and end for each rank
bin = len(anno) // args.world_size
rank_start = bin * args.rank
rank_end = (args.rank+1)*bin if args.rank != args.world_size - 1 else len(anno)

brief_result_num = args.brief_result_num
fullpage_num = args.fullpage_num

# setup dir
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
# Create a temporary directory with the timestamp
random.seed(args.rank)
if args.save_middle_results == '':
    temp_dir = f"temp_files/{timestamp}_{random.randint(1, 1000)}"
else:
    temp_dir = args.save_middle_results
os.makedirs(temp_dir, exist_ok=True)

'''
pipeline:
1. stage1: GPT requery
2. stage2: GPT rerank
3. stage3: GPT summarization
'''

result_list = []
for data_index, inst in tqdm(enumerate(anno)):
    # only run the instance for current rank
    if data_index < rank_start or data_index >= rank_end:
        continue

    # if this sample already exists, load the instance and continue
    if os.path.exists(os.path.join(sample_save_path, f"{inst['sample_id']}.json")):
        result_list.append(json.load(open(os.path.join(sample_save_path, f"{inst['sample_id']}.json"))))
        continue

    # set up image dir
    screenshot_dir = f"{temp_dir}/{data_index}"
    os.makedirs(screenshot_dir, exist_ok=True)

    # prepare query information
    if inst['query_image'] is None:
        query_has_image = False
        prompt_template_dict = text_query_dict
    else: # query with image
        query_has_image = True
        prompt_template_dict = image_search_text_query_dict

    query = inst['query']

    logger.info('************************************* Stage1 *************************************')
    # input: query information
    # output: requery
    # then search the text
    stage1_screenshot_dir = os.path.join(screenshot_dir, 'stage1')
    prompt_template = prompt_template_dict['stage1']
    if not query_has_image:
        image_files = []
        text_query = prompt_template.format(question=query)
    else:
        image_files = [
            pil_image_to_bytes(inst['query_image']),
            pil_image_to_bytes(inst['image_search_result'])
        ]
        text_query = prompt_template.format(
            question=DEFAULT_IMAGE_TOKEN + query,
            image_search_result=DEFAULT_IMAGE_TOKEN
        )

    requery = model.infer(
        image_files=image_files,
        text_query=text_query
    )
    if args.verbose:
        logger.info(f"Requery: {requery}")
        logger.info(f"Query ID: {inst['sample_id']}")

    result_brief = search_text_brief_result(
        query=requery,
        max_result_num=brief_result_num,
        screenshot_dir=stage1_screenshot_dir # relative path
    ) # [{'title', 'text','screenshot_path', 'url'}]

    if result_brief is None:
        logger.info("Duckduckgo returns None, skip this question")
        continue

    ### stage2: rerank
    logger.info('************************************* Stage2 *************************************')
    # input: query information
    # output: requery
    # then search the text
    website_information, input_image_list = get_website_information(result_brief)

    prompt_template = prompt_template_dict['stage2']
    if not query_has_image:
        image_files = input_image_list
        text_query = prompt_template.format(
            brief_result_num=brief_result_num,
            rerank_num=fullpage_num,
            question=query,
            website_information=website_information,
            incontext_example=get_rerank_incontext_example(fullpage_num)
        )
    else:
        image_files = [
            pil_image_to_bytes(inst['query_image']),
            pil_image_to_bytes(inst['image_search_result']),
            *input_image_list
        ]
        text_query = prompt_template.format(
            brief_result_num=brief_result_num,
            rerank_num=fullpage_num,
            question=DEFAULT_IMAGE_TOKEN+query,
            image_search_result=DEFAULT_IMAGE_TOKEN,
            website_information=website_information,
            incontext_example=get_rerank_incontext_example(fullpage_num)
        )

    rerank = model.infer(
        image_files=image_files,
        text_query=text_query
    )
    if args.verbose:
        logger.info(f"Rerank: {rerank}")

    ### stage3: gpt summarize
    logger.info('************************************* Stage3 *************************************')
    stage3_screenshot_dir = os.path.join(screenshot_dir, 'stage3')

    selected_index, _ = postprocess_rerank(rerank, fullpage_num)
    selected_website = [result_brief[i] for i in selected_index]
    result_full = search_url_full_result(
        urls=[web['url'] for web in selected_website],
        screenshot_dir=stage3_screenshot_dir # relative path
    ) # [{'content', 'screenshot_fullpage_path'}]

    if args.verbose:
        logger.info(f'Selected index: {selected_index}')
        logger.info(selected_website)

    # add title and snippet
    for full_idx, brief_idx in enumerate(selected_index):
        result_full[full_idx]['title'] = result_brief[brief_idx]['title']
        result_full[full_idx]['snippet'] = result_brief[brief_idx]['snippet']

    # conduct content retrieval
    for idx, inst_full in enumerate(result_full):
        if inst_full['content'] is None: # in case cannot get web content
            inst_full['content'] = ''
        if inst_full['content'].strip() != '': # some web do not contain language content
            result_full[idx]['content'] = content_retriever.get_retrieved_content(requery, inst_full['content'])

    website_full_information, input_image_list = get_full_website_information(
        result_full=result_full,
        image_dir=stage3_screenshot_dir,
        fullpage_split_dict=FULLPAGE_SPLIT_DICT
    )

    # text_query and input_image_list
    prompt_template = prompt_template_dict['stage3']
    if not query_has_image:
        image_files = input_image_list
        text_query = prompt_template.format(
            rerank_num=fullpage_num,
            website_information=website_full_information,
            question=query,
        )
    else:
        image_files = [
            *input_image_list,
            pil_image_to_bytes(inst['image_search_result']),
            pil_image_to_bytes(inst['query_image'])
        ]
        # assume only 1 image in the query
        text_query = prompt_template.format(
            rerank_num=args.fullpage_num,
            website_information=website_full_information,
            image_search_result=DEFAULT_IMAGE_TOKEN,
            question=DEFAULT_IMAGE_TOKEN + query
        )

    prediction = model.infer(
        image_files=image_files,
        text_query=text_query
    )
    if args.verbose:
        logger.info(f'Summarize: {prediction}')

    # calculate the score
    ## requery
    gt_requery = inst['gt_requery']
    req_score = get_requery_score(requery, gt_requery)
    ## end2end
    gt_answer = inst['gt_answer']
    f1_score = get_f1_score(prediction, gt_answer)
    for gt_alternative_answer in inst['alternative_gt_answers']:
        alternative_f1_score = get_f1_score(prediction, gt_alternative_answer)
        if alternative_f1_score > f1_score:
            f1_score = alternative_f1_score

    save_inst = dict(
        sample_id=inst['sample_id'],
        query=inst['query'],
        requery=requery,
        gt_requery=inst['gt_requery'],
        req_score=req_score['score'],
        req_score_dict=req_score,
        prediction=prediction,
        gt_answer=gt_answer,
        f1_score=f1_score,
        result_brief=result_brief,
        rerank=rerank,
        fullpage_url = [web['url'] for web in selected_website],
        result_full=result_full,
        area=inst['area'],
        subfield=inst['subfield'],
    )

    json.dump(save_inst, open(os.path.join(sample_save_path, f"{inst['sample_id']}.json"), 'w'), indent=4)
    result_list.append(save_inst)

result_summary = get_result_summary(anno, result_list, summary_key=['req_score', 'f1_score'])
logger.info(f"Total length: {result_summary['f1_score']['total_dict']['total_length']}")
logger.info(f"Average F1 Score: {result_summary['f1_score']['total_dict']['average']}")
json.dump(
    result_summary,
    open(os.path.join(args.save_path, f"result_summary_end2end.json"), 'w'),
    indent=4
)