Skip to content

Commit e4ddad9

Browse files
committed
[Fix] Qwen2VL batchsize>1 visual alignment
1 parent 69e5287 commit e4ddad9

File tree

1 file changed

+15
-8
lines changed

1 file changed

+15
-8
lines changed

lmms_eval/models/simple/qwen2_vl.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,16 @@ def _collate(x):
188188
# Import utils here if flatten is moved
189189
import lmms_eval.utils as utils
190190

191+
def _ensure_list(v):
192+
if v is None:
193+
return []
194+
if isinstance(v, list):
195+
# incase [[img]]
196+
if len(v) == 1 and isinstance(v[0], list):
197+
return v[0]
198+
return v
199+
return [v]
200+
191201
pbar = tqdm(total=len(requests), disable=(self.rank != 0), desc="Model Responding")
192202
# we group requests by their generation_kwargs,
193203
# so that we don't try to execute e.g. greedy sampling and temp=0.8 sampling
@@ -202,13 +212,10 @@ def _collate(x):
202212
# TODO: Clarify the behavior of doc_to_visual for documents without visual info.
203213
# The current logic might incorrectly discard all visuals if one doc lacks them.
204214
# Ensure flatten is appropriate here based on doc_to_visual's return type.
205-
visual_list = [doc_to_visual[0](self.task_dict[task][split][ids]) for ids in doc_id]
206-
if None in visual_list: # This check might need refinement
207-
# If a mix of visual/non-visual is possible, this needs careful handling
208-
# Currently sets all visuals to empty if any doc returns None
209-
visual_list = []
210-
else:
211-
visual_list = self.flatten(visual_list) # Assumes doc_to_visual returns list of lists
215+
visuals_per_doc = []
216+
for fn, ids in zip(doc_to_visual, doc_id):
217+
v = fn(self.task_dict[task][split][ids])
218+
visuals_per_doc.append(_ensure_list(v))
212219

213220
gen_kwargs = all_gen_kwargs[0] if all_gen_kwargs else {}
214221

@@ -249,7 +256,7 @@ def _collate(x):
249256
# Needs careful review based on doc_to_visual output structure
250257
# For simplicity, assuming visual_list contains all visuals for the batch for now
251258
# A more robust approach might map visuals back to their original context index.
252-
relevant_visuals = visual_list # Placeholder: needs logic to get visuals for context 'i'
259+
relevant_visuals = visuals_per_doc[i] # Placeholder: needs logic to get visuals for context 'i'
253260

254261
for visual in relevant_visuals:
255262
if isinstance(visual, str) and visual.endswith((".mp4", ".avi", ".mov")): # Video file

0 commit comments

Comments
 (0)