@@ -144,19 +144,18 @@ Expected results:
144144
145145### Fine-tuned Image-Text Retrieval
146146
147+ #### Flickr30K
148+
147149<table >
148150 <tr align =center >
149151 <td rowspan="3" align=center><b>model</b></td>
150152 <td colspan="6" align=center><b>Flickr30K</b></td>
151- <td colspan="6" align=center><b>Flickr30K-CN</b></td>
152153 <td rowspan="3" align=center><b>avg</b></td>
153154
154155</tr >
155156 <tr align =center >
156157 <td colspan="3" align=center><b>image-to-text</b></td>
157158 <td colspan="3" align=center><b>text-to-image</b></td>
158- <td colspan="3" align=center><b>image-to-text</b></td>
159- <td colspan="3" align=center><b>text-to-image</b></td>
160159 </tr >
161160 <tr >
162161 <td>R@1</td>
@@ -165,12 +164,6 @@ Expected results:
165164 <td>R@1</td>
166165 <td>R@5</td>
167166 <td>R@10</td>
168- <td>R@1</td>
169- <td>R@5</td>
170- <td>R@10</td>
171- <td>R@1</td>
172- <td>R@5</td>
173- <td>R@10</td>
174167 </tr >
175168
176169<tr align =center >
@@ -181,13 +174,7 @@ Expected results:
181174 <td>88.5</td>
182175 <td>98.4</td>
183176 <td>99.2</td>
184- <td>96.5</td>
185- <td>99.9</td>
186- <td>100.0</td>
187- <td>85.2</td>
188- <td>97.0</td>
189- <td>98.5</td>
190- <td>96.7</td>
177+ <td>97.2</td>
191178 </tr >
192179<tr align =center >
193180 <td>InternVL-G-FT</td>
@@ -197,13 +184,7 @@ Expected results:
197184 <td>89.6</td>
198185 <td>98.6</td>
199186 <td>99.2</td>
200- <td>96.9</td>
201- <td>99.9</td>
202- <td>100.0</td>
203- <td>85.9</td>
204- <td>97.1</td>
205- <td>98.7</td>
206- <td>97.0</td>
187+ <td>97.6</td>
207188 </tr >
208189
209190</table >
@@ -230,43 +211,88 @@ Expected results:
230211</details >
231212
232213<details >
233- <summary >[InternVL-C -FT] Flickr30K-CN </summary >
214+ <summary>[InternVL-G-FT] Flickr30K</summary>
234215
235216``` bash
236217cd ../clip_benchmark/
237- CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language " cn " --task " zeroshot_retrieval" \
238- --dataset " flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
239- --pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 / --output result_ft.json
218+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "en" --task "zeroshot_retrieval" \
219+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
220+ --pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10/ --output result_ft.json
240221```
241222
242223Expected results:
243224
244225```
245- {"dataset": "flickr30k", "model": "internvl_c_retrieval_hf ", "pretrained": "./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10 ", "task": "zeroshot_retrieval",
246- "metrics": {"image_retrieval_recall@1": 0.8521999716758728 , "text_retrieval_recall@1": 0.9649999737739563 ,
247- "image_retrieval_recall@5": 0.9697999954223633 , "text_retrieval_recall@5": 0.9990000128746033 ,
248- "image_retrieval_recall@10": 0.9854000210762024 , "text_retrieval_recall@10": 1.0}, "language": "cn "}
226+ {"dataset": "flickr30k", "model": "internvl_g_retrieval_hf", "pretrained": "./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10", "task": "zeroshot_retrieval",
227+ "metrics": {"image_retrieval_recall@1": 0.895799994468689, "text_retrieval_recall@1": 0.9789999723434448,
228+ "image_retrieval_recall@5": 0.9861999750137329, "text_retrieval_recall@5": 1.0,
229+ "image_retrieval_recall@10": 0.9922000169754028, "text_retrieval_recall@10": 1.0}, "language": "en"}
249230```
250231
251232</details >
252233
234+ #### Flickr30K-CN
235+
236+ <table >
237+ <tr align =center >
238+ <td rowspan="3" align=center><b>model</b></td>
239+ <td colspan="6" align=center><b>Flickr30K-CN</b></td>
240+ <td rowspan="3" align=center><b>avg</b></td>
241+
242+ </tr >
243+ <tr align =center >
244+ <td colspan="3" align=center><b>image-to-text</b></td>
245+ <td colspan="3" align=center><b>text-to-image</b></td>
246+ </tr >
247+ <tr >
248+ <td>R@1</td>
249+ <td>R@5</td>
250+ <td>R@10</td>
251+ <td>R@1</td>
252+ <td>R@5</td>
253+ <td>R@10</td>
254+ </tr >
255+
256+ <tr align =center >
257+ <td>InternVL-C-FT</td>
258+ <td>96.5</td>
259+ <td>99.9</td>
260+ <td>100.0</td>
261+ <td>85.2</td>
262+ <td>97.0</td>
263+ <td>98.5</td>
264+ <td>96.2</td>
265+ </tr >
266+ <tr align =center >
267+ <td>InternVL-G-FT</td>
268+ <td>96.9</td>
269+ <td>99.9</td>
270+ <td>100.0</td>
271+ <td>85.9</td>
272+ <td>97.1</td>
273+ <td>98.7</td>
274+ <td>96.4</td>
275+ </tr >
276+
277+ </table >
278+
253279<details >
254- <summary >[InternVL-G -FT] Flickr30K</summary >
280+ <summary>[InternVL-C-FT] Flickr30K-CN</summary>
255281
256282``` bash
257283cd ../clip_benchmark/
258- CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language " en " --task " zeroshot_retrieval" \
259- --dataset " flickr30k" --dataset_root ./data/flickr30k --model internvl_g_retrieval_hf \
260- --pretrained ./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 / --output result_ft.json
284+ CUDA_VISIBLE_DEVICES=0 python3 clip_benchmark/cli.py eval --model_type internvl --language "cn" --task "zeroshot_retrieval" \
285+ --dataset "flickr30k" --dataset_root ./data/flickr30k --model internvl_c_retrieval_hf \
286+ --pretrained ./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10/ --output result_ft.json
261287```
262288
263289Expected results:
264290
265291```
266- {"dataset": "flickr30k", "model": "internvl_g_retrieval_hf ", "pretrained": "./work_dirs/internvl_stage2_finetune_flickr_364_bs1024_ep10 ", "task": "zeroshot_retrieval",
267- "metrics": {"image_retrieval_recall@1": 0.895799994468689 , "text_retrieval_recall@1": 0.9789999723434448 ,
268- "image_retrieval_recall@5": 0.9861999750137329 , "text_retrieval_recall@5": 1.0 ,
269- "image_retrieval_recall@10": 0.9922000169754028 , "text_retrieval_recall@10": 1.0}, "language": "en "}
292+ {"dataset": "flickr30k", "model": "internvl_c_retrieval_hf", "pretrained": "./work_dirs/internvl_stage2_finetune_flickrcn_364_bs1024_ep10", "task": "zeroshot_retrieval",
293+ "metrics": {"image_retrieval_recall@1": 0.8521999716758728, "text_retrieval_recall@1": 0.9649999737739563,
294+ "image_retrieval_recall@5": 0.9697999954223633, "text_retrieval_recall@5": 0.9990000128746033,
295+ "image_retrieval_recall@10": 0.9854000210762024, "text_retrieval_recall@10": 1.0}, "language": "cn"}
270296```
271297
272298</details >
0 commit comments