From c50d407abd30fc965690332c28ee9e5fb5faf754 Mon Sep 17 00:00:00 2001 From: dany <973031439@qq.com> Date: Fri, 18 Jul 2025 11:56:04 +0800 Subject: [PATCH] feat: add mmlongbenchdoc test script --- MMLongBench-Doc/README.md | 113 ++++++++ MMLongBench-Doc/asset/top_figure.png | Bin 0 -> 1983216 bytes MMLongBench-Doc/eval/__init__.py | 0 MMLongBench-Doc/eval/eval_score.py | 260 ++++++++++++++++++ MMLongBench-Doc/eval/extract_answer.py | 30 ++ .../eval/prompt_for_answer_extraction.md | 35 +++ MMLongBench-Doc/models/__init__.py | 0 MMLongBench-Doc/models/internlm_xc2_4khd.py | 125 +++++++++ MMLongBench-Doc/models/internvl_chat.py | 139 ++++++++++ MMLongBench-Doc/models/minicpm_llama3.py | 56 ++++ MMLongBench-Doc/run_api.py | 242 ++++++++++++++++ 11 files changed, 1000 insertions(+) create mode 100644 MMLongBench-Doc/README.md create mode 100644 MMLongBench-Doc/asset/top_figure.png create mode 100644 MMLongBench-Doc/eval/__init__.py create mode 100644 MMLongBench-Doc/eval/eval_score.py create mode 100644 MMLongBench-Doc/eval/extract_answer.py create mode 100644 MMLongBench-Doc/eval/prompt_for_answer_extraction.md create mode 100644 MMLongBench-Doc/models/__init__.py create mode 100644 MMLongBench-Doc/models/internlm_xc2_4khd.py create mode 100644 MMLongBench-Doc/models/internvl_chat.py create mode 100644 MMLongBench-Doc/models/minicpm_llama3.py create mode 100644 MMLongBench-Doc/run_api.py diff --git a/MMLongBench-Doc/README.md b/MMLongBench-Doc/README.md new file mode 100644 index 000000000..46e9b4be7 --- /dev/null +++ b/MMLongBench-Doc/README.md @@ -0,0 +1,113 @@ +
+
+ Yubo Ma + · + Yuhang Zang + · + Liangyu Chen + · + Meiqi Chen + · + Yizhu Jiao + · + Xinze Li + · + Xinyuan Lu + · + Ziyu Liu + · + Yan Ma + · + Xiaoyi Dong + · + Pan Zhang + · + Liangming Pan + · + Yu-Gang Jiang + · + Jiaqi Wang + · + Yixin Cao + · + Aixin Sun
+ + 📖Paper |🏠Homepage|🤗Huggingface + ++
+The automatic understanding of lengthy documents (Long-context Document Understanding; DU) is a long-standing task with urgent, practical needs. Although many LVLMs now claim (and show promising cases of) capabilities in long-context DU, a unified and quantitative evaluation of existing models has been missing due to the absence of a related benchmark.
+To bridge this gap, we construct MMLongBench-Doc, which comprises 135 documents and 1,091 questions (each accompanied by a short, deterministic reference answer and detailed meta information). The documents have an average of 47.5 pages and 21,214 tokens, cover 7 diverse domains, and are PDF-formatted with rich layouts and multi-modal components. The questions are either curated from existing datasets or newly annotated by expert-level annotators. Towards a comprehensive evaluation, the questions cover different sources such as text, tables, charts, and images, and different locations (page indices) within the documents. Notably, 33.0% of the questions are cross-page questions that necessitate comprehension and reasoning over evidence across multiple pages. And 22.5% of the questions are designed to be unanswerable, to reduce shortcuts in this benchmark and to detect LVLMs' hallucinations.
+
+
+k6PpKPM^9WRnml
zT)#u;Sy_HIJ2 gD1SjFy`P$_xWmP}i+su3zviFt8px0q;Y9k5VFZSr`Z-2O
zbTOvHE8;3DHyv&?u1>*Ar FG6lc9LLL9bn~
z^h#tY{^`Go1Ky5s<^YL#|BSX^wf-x1t_wIc|oqX;fh{s;!}mjsFVi3Zha^OTz(32(ooB@_D@W4>oQgl|}6dj>?NppEp+X
zvXL@QJp;B|5OkEtLu*;Q>I|`B
zz;uGM