
Commit 3d9ccbf

Update sentencepiece dependency and add new parameters to mathvista_t… (#716)
* Update sentencepiece dependency and add new parameters to mathvista_test.yaml

* Refactor imports and clean up code in VideoMathQA task files
  - Rearranged and organized import statements in `cot_postprocess.py` and `cot_step_evaluation.py` for better readability.
  - Removed unnecessary blank lines in `utils.py` and improved import order.
  - Ensured consistent formatting across the files.
1 parent d438332 commit 3d9ccbf

5 files changed (+25, -22 lines)

lmms_eval/tasks/mathvista/mathvista_test.yaml

Lines changed: 4 additions & 1 deletion
@@ -21,4 +21,7 @@ metric_list:
 
 lmms_eval_specific_kwargs:
   default:
-    shot_type: "reason-first" # can be "reason-first", "solution", "step-by-step"
+    shot_type: "reason-first" # can be "reason-first", "solution", "step-by-step"
+    shot: 0
+    use_caption: False
+    use_ocr: False
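
The new keys extend the task's per-prompt kwargs. A minimal sketch of how a prompt builder might read them, assuming the usual lmms_eval pattern of passing lmms_eval_specific_kwargs into a doc_to_text-style function; the function body and doc field names below (caption, ocr, question) are illustrative assumptions, not the actual mathvista implementation:

# Illustrative sketch only: not the actual lmms_eval/tasks/mathvista code.
# Shows how the new shot / use_caption / use_ocr keys could be consumed.
def mathvista_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    kwargs = lmms_eval_specific_kwargs or {}
    shot_type = kwargs.get("shot_type", "reason-first")  # "reason-first", "solution", or "step-by-step"
    use_caption = kwargs.get("use_caption", False)       # include an image caption if the doc provides one
    use_ocr = kwargs.get("use_ocr", False)                # include OCR text if the doc provides it
    # kwargs.get("shot", 0) would control how many few-shot exemplars to prepend;
    # exemplar selection is omitted from this sketch.

    parts = []
    if use_caption and doc.get("caption"):
        parts.append(f"Image description: {doc['caption']}")
    if use_ocr and doc.get("ocr"):
        parts.append(f"OCR text: {doc['ocr']}")
    parts.append(doc["question"])
    if shot_type == "reason-first":
        parts.append("First reason step by step, then state the final answer.")
    return "\n".join(parts)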

lmms_eval/tasks/videomathqa/cot_postprocess.py

Lines changed: 11 additions & 9 deletions
@@ -1,17 +1,19 @@
+import argparse
+import json
 import os
+import random
 import re
 import sys
-import json
-import random
-import argparse
+
 from tqdm import tqdm
-from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
-from videomathqa.utils import (extract_characters_regex,
-                               videomathqa_process_results,
-                               videomathqa_mcq_aggregate_results,
-                               videomathqa_multi_binary_aggregate_results)
-
+from videomathqa.utils import (
+    extract_characters_regex,
+    videomathqa_mcq_aggregate_results,
+    videomathqa_multi_binary_aggregate_results,
+    videomathqa_process_results,
+)
+from vllm import LLM, SamplingParams
 
 mcq_prompt = (
     "Given the original multiple-choice options and a model-generated answer containing reasoning and a final answer, identify the option that best matches the final answer and return only the corresponding letter (A, B, C, D, or E)."

lmms_eval/tasks/videomathqa/cot_step_evaluation.py

Lines changed: 4 additions & 4 deletions
@@ -1,11 +1,12 @@
-import os
+import argparse
 import ast
 import json
-import argparse
+import os
+
 import pandas as pd
 from tqdm import tqdm
-from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
+from vllm import LLM, SamplingParams
 
 system_prompt = """
 You are a intelligent assistant for grading math question solutions. You will be given:
@@ -110,7 +111,6 @@ def compute_score(gt_data, res_data, res_file, tokenizer, llm, sampling_params,
     batch = []
     scored_samples = []
     for sample in tqdm(gt_data, desc="Assigning scores with Qwen3"):
-
         qid = sample["question_id"]
         matched = [res for res in res_data if res["doc"]["question_id"] == qid]
         if not matched:

lmms_eval/tasks/videomathqa/utils.py

Lines changed: 5 additions & 7 deletions
@@ -1,13 +1,13 @@
 import os
 import re
-import cv2
 import sys
-import yaml
-import numpy as np
-
+from collections import defaultdict
 from pathlib import Path
 from typing import List
-from collections import defaultdict
+
+import cv2
+import numpy as np
+import yaml
 from loguru import logger as eval_logger
 
 VIDEO_LENGTH = ["short", "medium", "long"]
@@ -28,7 +28,6 @@ def decode_video(video_path: str) -> List[np.ndarray]:
 
 
 def load_video(video_path, max_frames, annot_sample_rate=1):
-
     def uniform_sample(m, n):
         assert n <= m
         stride = (m - 1) / (n - 1) if n > 1 else 0  # Calculate the stride
@@ -298,7 +297,6 @@ def videomathqa_mcq_aggregate_results(results):
 
 
 def videomathqa_multi_binary_aggregate_results(results):
-
     grouped = defaultdict(list)
     for result in results:
         grouped[result["question_id"]].append(result)
pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ dependencies = [
     "av",
     "hf_transfer",
     "nltk",
-    "sentencepiece==0.1.99",
+    "sentencepiece",
     "yt-dlp",
     "pycocoevalcap",
     "tqdm-multiprocess",
