JianGuanTHU committed on
Commit
41c6214
·
1 Parent(s): d30eb2d
Files changed (1) hide show
  1. vilabench.csv +21 -20
vilabench.csv CHANGED
@@ -1,21 +1,21 @@
1
- Benchmark,URL,year,cognitive levels,category,domain,task,image,multiple image,video,score,use,,,,,,,,,,,,,,,,,
2
  RefCOCO,https://aclanthology.org/D14-1086.pdf,2014,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 91.3, ""Gemini2.5-Pro"": 74.6, ""Qwen2.5VL-72B"": 90.3}","seed1.5vl thinking 91.3, Gemini2.5-pro thinking 74.6, Qwen2.5vl-72B 90.3",,,,,,,,,,,,,,,,,
3
  VQA-v2,https://openaccess.thecvf.com/content_cvpr_2017/papers/Goyal_Making_the_v_CVPR_2017_paper.pdf,2017,Understanding,General,natural,perception,1,,,"{""Gemini-Ultra"": 77.8, ""Gemini1.5-Pro"": 80.2}","Gemini-ultra 77.8, Gemini1.5-pro 80.2",,,,,,,,,,,,,,,,,
4
  FSC-147 (mean absolute error↓),https://arxiv.org/pdf/2104.08391,2021,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 17.9, ""Gemini2.5-Pro"": 24.5, ""OpenAI GPT4o"": 46.8, ""Qwen2.5VL-72B"": 28.6}","seed1.5vl thinking 17.9, Gemini2.5-pro thinking 24.5, OpenAI GPT4o 46.8, Qwen2.5vl-72B 28.6",,,,,,,,,,,,,,,,,
5
  CountBench,https://arxiv.org/pdf/2302.12066,2023,Understanding,General,"natural, synthetic",grounding,1,,,"{""Seed1.5VL"": 93.7, ""Gemini2.5-Pro"": 91.0, ""OpenAI GPT4o"": 85.7, ""Qwen2.5VL-72B"": 93.6}","seed1.5vl thinking 93.7, Gemini2.5-pro thinking 91.0, OpenAI GPT4o 85.7, Qwen2.5vl-72B 93.6",,,,,,,,,,,,,,,,,
6
- POPE,https://arxiv.org/pdf/2305.10355,2023,Understanding,General,natural,grounding,1,,,"{""InternVL3-78B"": 90.3}",InternVL3 78B 90.3,,,,,,,,,,,,,,,,,
7
  HallusionBench,https://arxiv.org/pdf/2310.14566,2023,Understanding,General,"natural, synthetic",perception,1,1,1,"{""Seed1.5VL"": 60.3, ""Gemini2.5-Pro"": 63.7, ""OpenAI GPT4o"": 56.2, ""Qwen2.5VL-72B"": 55.2, ""InternVL3-78B"": 59.1}","seed1.5vl thinking 60.3, Gemini2.5-pro thinking 63.7, OpenAI GPT4o 56.2, Qwen2.5vl-72B 55.2, InternVL3 78B 59.1",,,,,,,,,,,,,,,,,
8
  V* bench,https://arxiv.org/pdf/2312.14135,2023,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 71.7, ""OpenAI o3 high"": 95.7, ""Seed1.5VL"": 89.0, ""Gemini2.5-Pro"": 79.1, ""OpenAI GPT4o"": 73.9, ""Qwen2.5VL-72B"": 86.4}","Gemini1.5-pro 71.7, OpenAI o3 high 95.7, seed1.5vl thinking 89.0, Gemini2.5-pro thinking 79.1, OpenAI GPT4o 73.9, Qwen2.5vl-72B 86.4",,,,,,,,,,,,,,,,,
9
  MMVP,https://arxiv.org/pdf/2401.06209,2024,Understanding,General,"natural, synthetic",perception,1,,,"{""Seed1.5VL"": 70.7, ""Gemini2.5-Pro"": 70.7, ""OpenAI GPT4o"": 70.7, ""Qwen2.5VL-72B"": 66.7}","seed1.5vl thinking 70.7, Gemini2.5-pro thinking 70.7, OpenAI GPT4o 70.7, Qwen2.5vl-72B 66.7",,,,,,,,,,,,,,,,,
10
  CRPE,https://arxiv.org/pdf/2402.19474,2024,Understanding,General,natural,relation understanding,1,,,"{""Qwen2.5VL-72B"": 79.2, ""InternVL2.5-78B"": 78.8}","Qwen2.5vl-72B 79.2, InternVL2.5 78B 78.8",,,,,,,,,,,,,,,,,
11
- Vibe-Eval (Reka),https://arxiv.org/pdf/2405.02287,2024,Understanding,General,"natural, synthetic",perception,1,,,"{""Gemini2.5-Pro"": 67.2}",Gemini2.5-pro thinking 67.2,,,,,,,,,,,,,,,,,
12
  MuirBench,https://arxiv.org/pdf/2406.09411,2024,Understanding,General,natural,"counting, attribute similarity, image-text matching, visual retrieval, geographic understanding, scene understanding, cartoon understanding, diagram understanding",,1,,"{""Qwen2.5VL-72B"": 70.7, ""InternVL2.5-78B"": 63.5}","Qwen2.5vl-72B 70.7, InternVL2.5 78B 63.5",,,,,,,,,,,,,,,,,
13
- WildVision,https://arxiv.org/pdf/2406.11069,2024,Understanding,General,"natural, synthetic",alignment with human preference,1,,,"{""InternVL3-78B"": 73.6}",InternVL3 78B 73.6,,,,,,,,,,,,,,,,,
14
  VLMs are blind,https://arxiv.org/pdf/2407.06581,2024,Understanding,General,synthetic,perception,1,,,"{""OpenAI o3 high"": 90.1, ""Seed1.5VL"": 92.1, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 50.4, ""Qwen2.5VL-72B"": 69.0}","OpenAI o3 high 90.1, seed1.5vl thinking 92.1, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 50.4, Qwen2.5vl-72B 69.0",,,,,,,,,,,,,,,,,
15
  Realworld QA,https://huggingface.co/datasets/xai-org/RealworldQA,2024,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 70.4, ""Seed1.5VL"": 78.4, ""Gemini2.5-Pro"": 78.0, ""OpenAI GPT4o"": 76.2, ""Qwen2.5VL-72B"": 75.7, ""InternVL3-78B"": 78.0}","Gemini1.5-pro 70.4, seed1.5vl thinking 78.4, Gemini2.5-pro thinking 78.0, OpenAI GPT4o 76.2, Qwen2.5vl-72B 75.7, InternVL3 78B 78.0",,,,,,,,,,,,,,,,,
16
  MME-Realworld,https://arxiv.org/pdf/2408.13257,2024,Understanding,General,"video monitoring, OCR, autonomous driving, diagram tables",perception,1,,,"{""Qwen2.5VL-72B"": 63.2, ""InternVL3-78B"": 65.4}","Qwen2.5vl-72B 63.2, InternVL3 78B 65.4",,,,,,,,,,,,,,,,,
17
- MMIU,https://arxiv.org/pdf/2408.02718,2024,Understanding,General,"natural, synthetic","semantic understanding, spatial understanding, temporal understanding",,1,,"{""InternVL3-78B"": 60.4}",InternVL3-78B 60.4,,,,,,,,,,,,,,,,,
18
- R-Bench,https://arxiv.org/pdf/2410.05474,2024,Understanding,General,natural,robustness to corruption,1,,,"{""InternVL3-78B"": 77.4}",InternVL3 78B 77.4,,,,,,,,,,,,,,,,,
19
  SimpleVQA,https://arxiv.org/pdf/2502.13059,2025,Understanding,General,natural,knowledge memorization,1,,,"{""Seed1.5VL"": 63.4, ""Gemini2.5-Pro"": 62.0, ""OpenAI GPT4o"": 52.4, ""Qwen2.5VL-72B"": 52.4}","seed1.5vl thinking 63.4, Gemini2.5-pro thinking 62.0, OpenAI GPT4o 52.4, Qwen2.5vl-72B 52.4",,,,,,,,,,,,,,,,,
20
  MMMU,https://arxiv.org/pdf/2311.16502,2023,Reasoning,General,multi-discipline,QA,1,,,"{""Gemini-Ultra"": 59.4, ""Gemini1.5-Pro"": 62.2, ""Gemini2.5-Pro"": 82.0, ""OpenAI o3 high"": 82.9, ""Claude4-Opus"": 76.5, ""Grok3-beta"": 76.0, ""Seed1.5VL"": 77.9, ""Qwen2.5VL-72B"": 70.2, ""InternVL3-78B"": 72.2}","Gemini-ultra 59.4, Gemini1.5-pro 62.2, Gemini2.5-pro thinking 82.0, OpenAI o3 high 82.9, Claude 4 Opus 76.5, Grok3 beta 76.0, seed1.5vl thinking 77.9, Qwen2.5vl-72B 70.2, InternVL3 78B 72.2",,,,,,,,,,,,,,,,,
21
  ZeroBench,https://arxiv.org/pdf/2502.09696,2025,Reasoning,General,"natural, synthetic",QA,1,,,"{""Gemini2.5-Pro"": 4.5, ""Seed1.5VL"": 2.0, ""OpenAI GPT4o"": 0.0, ""Qwen2.5VL-72B"": 0.0}","Gemini2.5-pro thinking 4.5, seed1.5vl thinking 2.0, OpenAI GPT4o 0.0, Qwen2.5vl-72B 0.0",,,,,,,,,,,,,,,,,
@@ -23,9 +23,9 @@ MMBench,https://arxiv.org/pdf/2307.06281,2023,Comprehensive,General,"natural, sy
23
  MM-VET,https://arxiv.org/pdf/2308.02490,2023,Comprehensive,General,"natural, synthetic","OCR, math, recognition, spatial understanding, knowledge memorization",1,,,"{""Qwen2.5VL-72B"": 76.2, ""InternVL2.5-78B"": 72.3}","Qwen2.5vl-72B 76.2, InternVL2.5 78B 72.3",,,,,,,,,,,,,,,,,
24
  MMStar,https://arxiv.org/pdf/2403.20330,2024,Comprehensive,General,multi-discipline,QA,1,,,"{""Seed1.5VL"": 77.8, ""Gemini2.5-Pro"": 77.5, ""OpenAI GPT4o"": 65.1, ""Qwen2.5VL-72B"": 70.8, ""InternVL3-78B"": 72.5}","seed1.5vl thinking 77.8, Gemini2.5-pro thinking 77.5, OpenAI GPT4o 65.1, Qwen2.5vl-72B 70.8, InternVL3 78B 72.5",,,,,,,,,,,,,,,,,
25
  Blink,https://arxiv.org/pdf/2404.12390,2024,Comprehensive,General,"natural, synthetic","grounding, low-level pattern matching (e.g., visual correspondence) to mid-level spatial reasoning (e.g., relative depth), and up to high-level visual understanding (e.g., visual similarity)",1,1,,"{""Gemini1.5-Pro"": 61.4, ""Seed1.5VL"": 72.1, ""Gemini2.5-Pro"": 70.6, ""OpenAI GPT4o"": 65.9, ""Qwen2.5VL-72B"": 64.4, ""InternVL3-78B"": 66.3}","Gemini1.5-pro 61.4, seed1.5vl thinking 72.1, Gemini2.5-pro thinking 70.6, OpenAI GPT4o 65.9, Qwen2.5vl-72B 64.4, InternVL3-78B 66.3",,,,,,,,,,,,,,,,,
26
- MMT-Bench,https://arxiv.org/pdf/2404.16006,2024,Comprehensive,General,"natural, synthetic",162 tasks,,1,,"{""InternVL3-78B"": 73.2}",InternVL3-78B 73.2,,,,,,,,,,,,,,,,,
27
- Mantis-Eval,https://arxiv.org/pdf/2405.01483,2024,Comprehensive,General,natural,"Co-reference, Comparison, Reasoning, Temporal understanding",,1,,"{""InternVL3-78B"": 79.3}",InternVL3-78B 79.3,,,,,,,,,,,,,,,,,
28
- MIRB ,https://arxiv.org/pdf/2406.12742,2024,Comprehensive,General,"natural, synthetic","perception, reasoning, knowledge, multi-hop",,1,,"{""InternVL3-78B"": 64.3}",InternVL3-78B 64.3,,,,,,,,,,,,,,,,,
29
  MegaBench,https://arxiv.org/pdf/2410.10563,2024,Comprehensive,General,"natural, synthetic","mathematics, information extraction, planning, coding, perception, metrics, scene, knowledge",1,1,,"{""Qwen2.5VL-72B"": 46.8, ""InternVL2.5-78B"": 45.6}","Qwen2.5vl-72B 46.8, InternVL2.5 78B 45.6",,,,,,,,,,,,,,,,,
30
  Mathvista,https://arxiv.org/pdf/2310.02255,2023,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Gemini-Ultra"": 53.0, ""Gemini1.5-Pro"": 63.9, ""OpenAI o3 high"": 86.8, ""Seed1.5VL"": 85.6, ""Gemini2.5-Pro"": 82.7, ""OpenAI GPT4o"": 63.8, ""Qwen2.5VL-72B"": 74.8, ""InternVL3-78B"": 80.5, ""MiMo-VL-7B"": 81.5}","Gemini-ultra 53.0, Gemini1.5-pro 63.9, OpenAI o3 high 86.8, seed1.5vl thinking 85.6, Gemini2.5-pro thinking 82.7, OpenAI GPT4o 63.8, Qwen2.5vl-72B 74.8, InternVL3 78B 80.5, MiMo-VL 7B 81.5",,,,,,,,,,,,,,,,,
31
  OlympiadBench,https://arxiv.org/pdf/2402.14008,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Seed1.5VL"": 65.0, ""Gemini2.5-Pro"": 69.8, ""OpenAI GPT4o"": 25.9, ""Qwen2.5VL-72B"": 35.9, ""MiMo-VL-7B"": 59.4}","seed1.5vl thinking 65.0, Gemini2.5-pro thinking 69.8, OpenAI GPT4o 25.9, Qwen2.5vl-72B 35.9, MiMo-VL 7B 59.4",,,,,,,,,,,,,,,,,
@@ -43,24 +43,24 @@ TextVQA,https://arxiv.org/abs/1904.08920,2019,Reasoning,OCR & Chart& Document,OC
43
  DocVQA,https://arxiv.org/pdf/2007.00398,2020,Reasoning,OCR & Chart& Document,document,QA,1,,,"{""Gemini-Ultra"": 90.9, ""Gemini1.5-Pro"": 93.1, ""Seed1.5VL"": 96.9, ""Gemini2.5-Pro"": 94.0, ""OpenAI GPT4o"": 66.2, ""Qwen2.5VL-72B"": 96.4, ""InternVL3-78B"": 95.4}","Gemini-ultra 90.9, Gemini1.5-pro 93.1, seed1.5vl thinking 96.9, Gemini2.5-pro thinking 94.0, OpenAI GPT4o 66.2, Qwen2.5vl-72B 96.4, InternVL3 78B 95.4",,,,,,,,,,,,,,,,,
44
  InfographicVQA ,https://arxiv.org/pdf/2104.12756,2021,Reasoning,OCR & Chart& Document,infographic,QA,1,,,"{""Gemini-Ultra"": 80.3, ""Gemini1.5-Pro"": 81.0, ""Seed1.5VL"": 91.2, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 79.2, ""Qwen2.5VL-72B"": 87.3, ""InternVL3-78B"": 85.2}","Gemini-ultra 80.3, Gemini1.5-pro 81.0, seed1.5vl thinking 91.2, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 79.2, Qwen2.5vl-72B 87.3, InternVL3 78B 85.2",,,,,,,,,,,,,,,,,
45
  ChartQA ,https://arxiv.org/pdf/2203.10244,2022,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""Gemini-Ultra"": 80.8, ""Gemini1.5-Pro"": 87.2, ""Seed1.5VL"": 89.1, ""Gemini2.5-Pro"": 83.3, ""OpenAI GPT4o"": 86.7, ""Qwen2.5VL-72B"": 89.5, ""InternVL3-78B"": 89.7}","Gemini-ultra 80.8, Gemini1.5-pro 87.2, seed1.5vl thinking 89.1, Gemini2.5-pro thinking 83.3, OpenAI GPT4o 86.7, Qwen2.5vl-72B 89.5, InternVL3 78B 89.7",,,,,,,,,,,,,,,,,
46
- TAT-DQA,https://arxiv.org/pdf/2207.11871,2022,Reasoning,OCR & Chart& Document,document,QA,1,,,"{""Gemini1.5-Pro"": 37.8}",Gemini1.5-pro 37.8,,,,,,,,,,,,,,,,,
47
- DUDE ,https://arxiv.org/pdf/2305.08455,2023,Reasoning,OCR & Chart& Document,document,QA,,1,,"{""Gemini1.5-Pro"": 46.0}",Gemini1.5-pro 46.0,,,,,,,,,,,,,,,,,
48
  SEED-Bench-2-Plus,https://arxiv.org/pdf/2404.16790,2024,Reasoning,OCR & Chart& Document,"chart, web page, map",QA,1,,,"{""Qwen2.5VL-72B"": 73.0, ""InternVL2.5-78B"": 71.3, ""Gemini1.5-Pro"": 70.8}","Qwen2.5vl-72B 73.0, InternVL2.5 78B 71.3, Gemini 1.5 pro 70.8",,,,,,,,,,,,,,,,,
49
  CharXiv reasoning/description,https://arxiv.org/pdf/2406.18521,2024,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""OpenAI o3 high"": {""score1"": 78.6, ""score2"": 95.0}, ""Seed1.5VL"": {""score1"": 60.2, ""score2"": 92.6}, ""Gemini2.5-Pro"": {""score1"": 69.9, ""score2"": 94.4}, ""OpenAI GPT4o"": {""score1"": 52.0, ""score2"": 86.5}, ""Qwen2.5VL-72B"": {""score1"": 49.7, ""score2"": 87.4}}","OpenAI o3 high 78.6/95.0, seed1.5vl thinking 60.2/92.6, Gemini2.5-pro thinking 69.9/94.4, OpenAI GPT4o 52.0/86.5, Qwen2.5vl-72B 49.7/87.4",,,,,,,,,,,,,,,,,
50
  MMLongBench-DOC,https://arxiv.org/pdf/2407.01523,2024,Reasoning,OCR & Chart& Document,long document,QA,,1,,"{""Kimi-VL-A3B-Thinking-2506"": 42.1, ""OpenAI GPT4o"": 42.8, ""Qwen2.5VL-72B"": 38.8}",,,,,,,,,,,,,,,,,,
51
  VisualWebBench ,https://arxiv.org/pdf/2404.05955,2024,Comprehensive,OCR & Chart& Document,web page,grounding,1,,,"{""Seed1.5VL"": 87.3, ""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4o"": 80.2, ""Qwen2.5VL-72B"": 82.3}","seed1.5vl thinking 87.3, Gemini2.5-pro thinking 87.3, OpenAI GPT4o 80.2, Qwen2.5vl-72B 82.3",,,,,,,,,,,,,,,,,
52
  QVHighlights,https://arxiv.org/pdf/2107.09609,2021,Understanding,Short Video,natural,moment retrieval,,,1,"{""Gemini2.5-Pro"": 75.0, ""OpenAI GPT4.1"": 71.4}","Gemini2.5-pro thinking 75.0, OpenAI GPT4.1 71.4",,,,,,,,,,,,,,,,,
53
- TACoS ,https://arxiv.org/pdf/1403.6173,2014,Understanding,Short Video,natural,grounding,,,1,"{""Seed1.5VL"": 49.6}",Seed1.5-VL thinking 49.6,,,,,,,,,,,,,,,,,
54
  Charades-STA,https://arxiv.org/pdf/1705.02101,2017,Understanding,Short Video,natural,grounding,,,1,"{""Seed1.5VL"": 64.0, ""Qwen2.5VL-72B"": 50.9}","Seed1.5-VL thinking 64.0, Qwen2.5-VL 72B 50.9",,,,,,,,,,,,,,,,,50.9
55
  YouCook2 ,https://arxiv.org/pdf/1703.09788,2017,Understanding,Short Video,cooking,perception,,,1,"{""Gemini-Ultra"": 135.4, ""Gemini1.5-Pro"": 106.5, ""Gemini2.5-Pro"": 188.3, ""OpenAI GPT4.1"": 127.6}","Gemini-ultra 135.4, Gemini1.5-pro 106.5, Gemini2.5-pro thinking 188.3, OpenAI GPT4.1 127.6",,,,,,,,,,,,,,,,,
56
  VATEX ,https://arxiv.org/pdf/1904.03493,2019,Understanding,Short Video,natural,perception,,,1,"{""Gemini-Ultra"": {""en"": 62.7, ""zh"": 51.3}, ""Gemini1.5-Pro"": {""en"": 64.6, ""zh"": 55.3}, ""Gemini2.5-Pro"": {""en"": 71.3, ""zh"": 59.7}, ""OpenAI GPT4.1"": {""en"": 64.1, ""zh"": 48.7}}","Gemini-ultra 62.7/51.3 (En/Zh), Gemini1.5-pro 64.6/55.3 (En/Zh), Gemini2.5-pro thinking 71.3/59.7 (En/Zh), OpenAI GPT4.1 64.1/48.7 (En/Zh)",,,,,,,,,,,,,,,,,
57
  EgoSchema,https://arxiv.org/pdf/2308.09126,2023,Understanding,Short Video,natural,perception,,,1,"{""Gemini1.5-Pro"": 72.2, ""Qwen2.5VL-72B"": 76.2}","Gemini1.5-pro 72.2, Qwen2.5-VL 72B 76.2",,,,,,,,,,,,,,,,,
58
  TemporalBench,https://arxiv.org/pdf/2410.10818,2024,Understanding,Short Video,natural,captioning,,,1,"{""Seed1.5VL"": 79.8, ""OpenAI GPT4o"": 73.3}","Seed1.5-VL thinking 79.8, OpenAI GPT4o 73.3",,,,,,,,,,,,,,,,,
59
- Dream-1K,https://arxiv.org/pdf/2407.00634,2024,Understanding,Short Video,natural,captioning,,,1,"{""Seed1.5VL"": 43.9}",Seed1.5-VL thinking 43.9,,,,,,,,,,,,,,,,,
60
- MotionBench,https://arxiv.org/pdf/2501.02955,2025,Understanding,Short Video,natural,motion understanding,,,1,"{""Seed1.5VL"": 68.4}",Seed1.5-VL thinking 68.4,,,,,,,,,,,,,,,,,
61
  MVBench,https://arxiv.org/pdf/2311.17005,2023,Understanding,Short Video,natural,"temporal understanding, spatial understanding",,,1,"{""Seed1.5VL"": 74.4, ""InternVL2.5-78B"": 76.4, ""Qwen2.5VL-72B"": 70.4}","Seed1.5-VL thinking 74.4, InternVL-2.5 76.4, Qwen2.5-VL 72B 70.4",,,,,,,,,,,,,,,,,70.4
62
  ActivityNet-QA,https://arxiv.org/pdf/1906.02467,2019,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 52.2, ""Gemini1.5-Pro"": 57.5, ""Gemini2.5-Pro"": 66.6, ""OpenAI GPT4.1"": 60.4}","Gemini-ultra 52.2, Gemini1.5-pro 57.5, Gemini2.5-pro thinking 66.6, OpenAI GPT4.1 60.4",,,,,,,,,,,,,,,,,
63
- NextQA ,https://arxiv.org/pdf/2105.08276,2021,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 29.9}",Gemini-ultra 29.9,,,,,,,,,,,,,,,,,
64
  Perception Test MCQA,https://proceedings.neurips.cc/paper_files/paper/2023/file/8540fba4abdc7f9f7a7b1cc6cd60e409-Paper-Datasets_and_Benchmarks.pdf,2023,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 54.7, ""Gemini2.5-Pro"": 78.4, ""OpenAI GPT4.1"": 64.8, ""Qwen2.5VL-72B"": 73.2}","Gemini-ultra 54.7, Gemini2.5-pro thinking 78.4, OpenAI GPT4.1 64.8, Qwen2.5-VL 72B 73.2",,,,,,,,,,,,,,,,,
65
  MMVU,https://arxiv.org/pdf/2501.12380,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 75.8, ""Seed1.5VL"": 70.1, ""Qwen2.5VL-72B"": 62.9}","Gemini2.5-pro thinking 75.8, seed1.5vl thinking 70.1, Qwen2.5-VL 72B 62.9",,,,,,,,,,,,,,,,,
66
  VideoMMMU,https://arxiv.org/pdf/2501.13826,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 83.6, ""OpenAI GPT4.1"": 60.9, ""Seed1.5VL"": 81.4, ""Kimi k1.6"": 76.7, ""Qwen2.5VL-72B"": 60.2}","Gemini2.5-pro thinking 83.6, OpenAI GPT4.1 60.9, seed1.5vl thinking 81.4, Kimi k1.6 76.7, Qwen2.5-VL 72B 60.2",,,,,,,,,,,,,,,,,
@@ -69,7 +69,7 @@ TempCompass ,https://arxiv.org/pdf/2403.00476,2024,Reasoning,Short Video,"natura
69
  TVBench,https://arxiv.org/pdf/2410.07752,2024,Reasoning,Short Video,"natural, synthetic",temporal reasoning,,,1,"{""Seed1.5VL"": 63.6, ""Gemini2.5-Pro"": 62.6}","Seed1.5-VL thinking 63.6, Gemini2.5-pro thinking 62.6",,,,,,,,,,,,,,,,,
70
  TOMATO ,https://arxiv.org/pdf/2410.23266,2024,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Seed1.5VL"": 44.7, ""Gemini2.5-Pro"": 46.9}","Seed1.5-VL thinking 44.7, Gemini2.5-pro thinking 46.9",,,,,,,,,,,,,,,,,
71
  EgoTempo,https://arxiv.org/pdf/2503.13646v1,2025,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Gemini2.5-Pro"": 44.3, ""OpenAI GPT4.1"": 40.3}","Gemini2.5-pro thinking 44.3, OpenAI GPT4.1 40.3",,,,,,,,,,,,,,,,,
72
- MMBench-Video,https://arxiv.org/pdf/2406.14515,2024,Comprehensive,Short Video,"natural, synthetic","temporal reasoning, commonsense reasoning, attribute reasoning, logic reasoning, relation reasoning, perception",,,1,"{""Qwen2.5VL-72B"": 2.02}",Qwen2.5vl-72B 2.02,,,,,,,,,,,,,,,,,
73
  1H-VideoQA,https://arxiv.org/pdf/2403.05530,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 81.0, ""OpenAI GPT4.1"": 56.8}","Gemini2.5-pro thinking 81.0, OpenAI GPT4.1 56.8",,,,,,,,,,,,,,,,,
74
  LVBench,https://arxiv.org/pdf/2406.08035,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 78.7, ""OpenAI GPT4.1"": 63.4, ""Seed1.5VL"": 64.6, ""Qwen2.5VL-72B"": 47.3}","Gemini2.5-pro thinking 78.7, OpenAI GPT4.1 63.4, Seed1.5-VL thinking 64.6, Qwen2.5-VL 72B 47.3",,,,,,,,,,,,,,,,,
75
  VideoMME w/o subtitle,https://arxiv.org/abs/2405.21075,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4.1"": 72.0, ""Seed1.5VL"": 77.9}","Gemini2.5-pro thinking 84.3, OpenAI GPT4.1 72.0, Seed1.5-vl thinking 77.9",,,,,,,,,,,,,,,,,
@@ -78,12 +78,13 @@ LongVideoBench ,https://arxiv.org/pdf/2407.15754,2024,Comprehensive,Long Video,n
78
  Neptune,https://arxiv.org/pdf/2412.09582,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4.1"": 85.2}","Gemini2.5-pro thinking 87.3, OpenAI GPT4.1 85.2",,,,,,,,,,,,,,,,,
79
  StreamBench ,https://arxiv.org/pdf/2501.13468,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.8, ""OpenAI GPT4o"": 68.7}","Seed1.5-VL thinking 72.8, OpenAI GPT4o 68.7",,,,,,,,,,,,,,,,,
80
  OVO-Bench,https://arxiv.org/pdf/2501.05510,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.3, ""Gemini1.5-Pro"": 67.7}","Seed1.5-VL thinking 72.3, Gemini1.5-pro 67.7",,,,,,,,,,,,,,,,,
81
- OVBench ,https://arxiv.org/pdf/2501.00584,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 60.0}",Seed1.5-VL thinking 60.0,,,,,,,,,,,,,,,,,
82
  NYU-Depth V2 (absolute relative error↓),https://link.springer.com/chapter/10.1007/978-3-642-33715-4_54,2012,Understanding,Spatial & Embodied Reasoning,indoor scene,depth estimation,1,,,"{""Seed1.5VL"": 13.6, ""Gemini2.5-Pro"": 27.5, ""OpenAI GPT4o"": 73.8, ""Qwen2.5VL-72B"": 35.5}","seed1.5vl thinking 13.6, Gemini2.5-pro thinking 27.5, OpenAI GPT4o 73.8, Qwen2.5vl-72B 35.5",,,,,,,,,,,,,,,,,
83
  DA-2K,https://arxiv.org/pdf/2406.09414v1,2024,Understanding,Spatial & Embodied Reasoning,natural,depth estimation,1,,,"{""Seed1.5VL"": 91.7, ""Gemini2.5-Pro"": 73.0, ""OpenAI GPT4o"": 66.9, ""Qwen2.5VL-72B"": 69.6}","seed1.5vl thinking 91.7, Gemini2.5-pro thinking 73.0, OpenAI GPT4o 66.9, Qwen2.5vl-72B 69.6",,,,,,,,,,,,,,,,,
84
- OpenEQA ,https://openaccess.thecvf.com/content/CVPR2024/papers/Majumdar_OpenEQA_Embodied_Question_Answering_in_the_Era_of_Foundation_Models_CVPR_2024_paper.pdf,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,embodied reasoning,,,1,"{""Gemini-Ultra"": 57.9}",Gemini-ultra 57.9,,,,,,,,,,,,,,,,,
85
  VSI-Bench,https://arxiv.org/abs/2412.14171,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,"object count, object size, relative distance, absolute distance, appearance order, room size, relative direction, route plan",,,1,"{""InternVL3-78B"": 48.4, ""Gemini1.5-Pro"": 45.4, ""OpenAI GPT4o"": 34.0}","InternVL3-78B 48.4, Gemini-1.5 Pro 45.4, OpenAI GPT 4o 34.0",,,,,,,,,,,,,,,,,
86
  All-Angles Bench,https://arxiv.org/pdf/2504.15280,2025,Reasoning,Spatial & Embodied Reasoning,"indoor scene, residential area, industrial space",spatial reasoning,,1,,"{""Seed1.5VL"": 58.6, ""Gemini2.5-Pro"": 53.4, ""OpenAI GPT4o"": 49.1, ""Qwen2.5VL-72B"": 55.7}","seed1.5vl thinking 58.6, Gemini2.5-pro thinking 53.4, OpenAI GPT4o 49.1, Qwen2.5vl-72B 55.7",,,,,,,,,,,,,,,,,
 
87
  ScreenSpot-V2,https://arxiv.org/pdf/2410.23218,2024,Understanding,Agent,GUI,grounding,,1,,"{""Seed1.5VL"": 95.2, ""OpenAI CUA"": 87.9, ""Claude3.7-Sonnet"": 87.6, ""Kimi-VL-A3B"": 92.8}","Seed1.5-VL thinking 95.2, OpenAI CUA 87.9, Claude 3.7 Sonnet 87.6, Kimi VL-A3B 92.8",,,,,,,,,,,,,,,,,
88
  ScreenSpot-Pro,https://arxiv.org/pdf/2504.07981v1,2025,Understanding,Agent,GUI,grounding,1,,,"{""Seed1.5VL"": 60.9, ""OpenAI CUA"": 23.4, ""Claude3.7-Sonnet"": 27.7, ""Kimi-VL-A3B"": 34.5, ""Qwen2.5VL-72B"": 43.6}","Seed1.5-VL thinking 60.9, OpenAI CUA 23.4, Claude 3.7 Sonnet 27.7, Kimi VL-A3B 34.5, Qwen2.5vl 72B 43.6",,,,,,,,,,,,,,,,,
89
  OSWorld ,https://arxiv.org/pdf/2404.07972,2024,Reasoning,Agent,GUI,computer use,,1,,"{""Seed1.5VL"": 36.1, ""OpenAI CUA"": 38.1, ""Claude3.7-Sonnet"": 28.0, ""Kimi-VL-A3B"": 8.2, ""Qwen2.5VL-72B"": 8.8}","Seed1.5-VL thinking 36.1, OpenAI CUA 38.1, Claude 3.7 Sonnet 28.0, Kimi VL-A3B 8.2, Qwen2.5vl 72B 8.8",,,,,,,,,,,,,,,,,
@@ -91,5 +92,5 @@ Windows Agent Arena,https://arxiv.org/pdf/2409.08264,2024,Reasoning,Agent,GUI,co
91
  WebVoyager ,https://arxiv.org/pdf/2401.13919,2024,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 87.2, ""OpenAI CUA"": 87.0, ""Claude3.7-Sonnet"": 84.1}","Seed1.5-VL thinking 87.2, OpenAI CUA 87.0, Claude 3.7 Sonnet 84.1",,,,,,,,,,,,,,,,,
92
  Online-Mind2Web,https://arxiv.org/pdf/2504.01382,2025,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 76.4, ""OpenAI CUA"": 71.0, ""Claude3.7-Sonnet"": 62.9}","Seed1.5-VL thinking 76.4, OpenAI CUA 71.0, Claude 3.7 Sonnet 62.9",,,,,,,,,,,,,,,,,
93
  Android World,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Seed1.5VL"": 62.1, ""Qwen2.5VL-72B"": 35.0}","Seed1.5-VL thinking 62.1, Qwen2.5vl 72B 35.0",,,,,,,,,,,,,,,,,
94
- MobileMiniWob++,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Qwen2.5VL-72B"": 68.0}",Qwen2.5vl 72B 68.0,,,,,,,,,,,,,,,,,
95
- Android Control,https://arxiv.org/pdf/2406.03679,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Qwen2.5VL-72B"": {""high"": 67.4, ""low"": 93.7}}",Qwen2.5vl 72B 67.4/93.7 (high/low),,,,,,,,,,,,,,,,,
 
1
+ Benchmark,URL,year,cognitive levels,category,domain,task,image,multiple image,video,score,use,,,,,,,,,,,,,,,,,
2
  RefCOCO,https://aclanthology.org/D14-1086.pdf,2014,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 91.3, ""Gemini2.5-Pro"": 74.6, ""Qwen2.5VL-72B"": 90.3}","seed1.5vl thinking 91.3, Gemini2.5-pro thinking 74.6, Qwen2.5vl-72B 90.3",,,,,,,,,,,,,,,,,
3
  VQA-v2,https://openaccess.thecvf.com/content_cvpr_2017/papers/Goyal_Making_the_v_CVPR_2017_paper.pdf,2017,Understanding,General,natural,perception,1,,,"{""Gemini-Ultra"": 77.8, ""Gemini1.5-Pro"": 80.2}","Gemini-ultra 77.8, Gemini1.5-pro 80.2",,,,,,,,,,,,,,,,,
4
  FSC-147 (mean absolute error↓),https://arxiv.org/pdf/2104.08391,2021,Understanding,General,natural,grounding,1,,,"{""Seed1.5VL"": 17.9, ""Gemini2.5-Pro"": 24.5, ""OpenAI GPT4o"": 46.8, ""Qwen2.5VL-72B"": 28.6}","seed1.5vl thinking 17.9, Gemini2.5-pro thinking 24.5, OpenAI GPT4o 46.8, Qwen2.5vl-72B 28.6",,,,,,,,,,,,,,,,,
5
  CountBench,https://arxiv.org/pdf/2302.12066,2023,Understanding,General,"natural, synthetic",grounding,1,,,"{""Seed1.5VL"": 93.7, ""Gemini2.5-Pro"": 91.0, ""OpenAI GPT4o"": 85.7, ""Qwen2.5VL-72B"": 93.6}","seed1.5vl thinking 93.7, Gemini2.5-pro thinking 91.0, OpenAI GPT4o 85.7, Qwen2.5vl-72B 93.6",,,,,,,,,,,,,,,,,
6
+ POPE,https://arxiv.org/pdf/2305.10355,2023,Understanding,General,natural,grounding,1,,,{"InternVL3-78B": 90.3},InternVL3 78B 90.3,,,,,,,,,,,,,,,,,
7
  HallusionBench,https://arxiv.org/pdf/2310.14566,2023,Understanding,General,"natural, synthetic",perception,1,1,1,"{""Seed1.5VL"": 60.3, ""Gemini2.5-Pro"": 63.7, ""OpenAI GPT4o"": 56.2, ""Qwen2.5VL-72B"": 55.2, ""InternVL3-78B"": 59.1}","seed1.5vl thinking 60.3, Gemini2.5-pro thinking 63.7, OpenAI GPT4o 56.2, Qwen2.5vl-72B 55.2, InternVL3 78B 59.1",,,,,,,,,,,,,,,,,
8
  V* bench,https://arxiv.org/pdf/2312.14135,2023,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 71.7, ""OpenAI o3 high"": 95.7, ""Seed1.5VL"": 89.0, ""Gemini2.5-Pro"": 79.1, ""OpenAI GPT4o"": 73.9, ""Qwen2.5VL-72B"": 86.4}","Gemini1.5-pro 71.7, OpenAI o3 high 95.7, seed1.5vl thinking 89.0, Gemini2.5-pro thinking 79.1, OpenAI GPT4o 73.9, Qwen2.5vl-72B 86.4",,,,,,,,,,,,,,,,,
9
  MMVP,https://arxiv.org/pdf/2401.06209,2024,Understanding,General,"natural, synthetic",perception,1,,,"{""Seed1.5VL"": 70.7, ""Gemini2.5-Pro"": 70.7, ""OpenAI GPT4o"": 70.7, ""Qwen2.5VL-72B"": 66.7}","seed1.5vl thinking 70.7, Gemini2.5-pro thinking 70.7, OpenAI GPT4o 70.7, Qwen2.5vl-72B 66.7",,,,,,,,,,,,,,,,,
10
  CRPE,https://arxiv.org/pdf/2402.19474,2024,Understanding,General,natural,relation understanding,1,,,"{""Qwen2.5VL-72B"": 79.2, ""InternVL2.5-78B"": 78.8}","Qwen2.5vl-72B 79.2, InternVL2.5 78B 78.8",,,,,,,,,,,,,,,,,
11
+ Vibe-Eval (Reka),https://arxiv.org/pdf/2405.02287,2024,Understanding,General,"natural, synthetic",perception,1,,,{"Gemini2.5-Pro": 67.2},Gemini2.5-pro thinking 67.2,,,,,,,,,,,,,,,,,
12
  MuirBench,https://arxiv.org/pdf/2406.09411,2024,Understanding,General,natural,"counting, attribute similarity, image-text matching, visual retrieval, geographic understanding, scene understanding, cartoon understanding, diagram understanding",,1,,"{""Qwen2.5VL-72B"": 70.7, ""InternVL2.5-78B"": 63.5}","Qwen2.5vl-72B 70.7, InternVL2.5 78B 63.5",,,,,,,,,,,,,,,,,
13
+ WildVision,https://arxiv.org/pdf/2406.11069,2024,Understanding,General,"natural, synthetic",alignment with human preference,1,,,{"InternVL3-78B": 73.6},InternVL3 78B 73.6,,,,,,,,,,,,,,,,,
14
  VLMs are blind,https://arxiv.org/pdf/2407.06581,2024,Understanding,General,synthetic,perception,1,,,"{""OpenAI o3 high"": 90.1, ""Seed1.5VL"": 92.1, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 50.4, ""Qwen2.5VL-72B"": 69.0}","OpenAI o3 high 90.1, seed1.5vl thinking 92.1, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 50.4, Qwen2.5vl-72B 69.0",,,,,,,,,,,,,,,,,
15
  Realworld QA,https://huggingface.co/datasets/xai-org/RealworldQA,2024,Understanding,General,natural,perception,1,,,"{""Gemini1.5-Pro"": 70.4, ""Seed1.5VL"": 78.4, ""Gemini2.5-Pro"": 78.0, ""OpenAI GPT4o"": 76.2, ""Qwen2.5VL-72B"": 75.7, ""InternVL3-78B"": 78.0}","Gemini1.5-pro 70.4, seed1.5vl thinking 78.4, Gemini2.5-pro thinking 78.0, OpenAI GPT4o 76.2, Qwen2.5vl-72B 75.7, InternVL3 78B 78.0",,,,,,,,,,,,,,,,,
16
  MME-Realworld,https://arxiv.org/pdf/2408.13257,2024,Understanding,General,"video monitoring, OCR, autonomous driving, diagram tables",perception,1,,,"{""Qwen2.5VL-72B"": 63.2, ""InternVL3-78B"": 65.4}","Qwen2.5vl-72B 63.2, InternVL3 78B 65.4",,,,,,,,,,,,,,,,,
17
+ MMIU,https://arxiv.org/pdf/2408.02718,2024,Understanding,General,"natural, synthetic","semantic understanding, spatial understanding, temporal understanding",,1,,{"InternVL3-78B": 60.4},InternVL3-78B 60.4,,,,,,,,,,,,,,,,,
18
+ R-Bench,https://arxiv.org/pdf/2410.05474,2024,Understanding,General,natural,robustness to corruption,1,,,{"InternVL3-78B": 77.4},InternVL3 78B 77.4,,,,,,,,,,,,,,,,,
19
  SimpleVQA,https://arxiv.org/pdf/2502.13059,2025,Understanding,General,natural,knowledge memorization,1,,,"{""Seed1.5VL"": 63.4, ""Gemini2.5-Pro"": 62.0, ""OpenAI GPT4o"": 52.4, ""Qwen2.5VL-72B"": 52.4}","seed1.5vl thinking 63.4, Gemini2.5-pro thinking 62.0, OpenAI GPT4o 52.4, Qwen2.5vl-72B 52.4",,,,,,,,,,,,,,,,,
20
  MMMU,https://arxiv.org/pdf/2311.16502,2023,Reasoning,General,multi-discipline,QA,1,,,"{""Gemini-Ultra"": 59.4, ""Gemini1.5-Pro"": 62.2, ""Gemini2.5-Pro"": 82.0, ""OpenAI o3 high"": 82.9, ""Claude4-Opus"": 76.5, ""Grok3-beta"": 76.0, ""Seed1.5VL"": 77.9, ""Qwen2.5VL-72B"": 70.2, ""InternVL3-78B"": 72.2}","Gemini-ultra 59.4, Gemini1.5-pro 62.2, Gemini2.5-pro thinking 82.0, OpenAI o3 high 82.9, Claude 4 Opus 76.5, Grok3 beta 76.0, seed1.5vl thinking 77.9, Qwen2.5vl-72B 70.2, InternVL3 78B 72.2",,,,,,,,,,,,,,,,,
21
  ZeroBench,https://arxiv.org/pdf/2502.09696,2025,Reasoning,General,"natural, synthetic",QA,1,,,"{""Gemini2.5-Pro"": 4.5, ""Seed1.5VL"": 2.0, ""OpenAI GPT4o"": 0.0, ""Qwen2.5VL-72B"": 0.0}","Gemini2.5-pro thinking 4.5, seed1.5vl thinking 2.0, OpenAI GPT4o 0.0, Qwen2.5vl-72B 0.0",,,,,,,,,,,,,,,,,
 
23
  MM-VET,https://arxiv.org/pdf/2308.02490,2023,Comprehensive,General,"natural, synthetic","OCR, math, recognition, spatial understanding, knowledge memorization",1,,,"{""Qwen2.5VL-72B"": 76.2, ""InternVL2.5-78B"": 72.3}","Qwen2.5vl-72B 76.2, InternVL2.5 78B 72.3",,,,,,,,,,,,,,,,,
24
  MMStar,https://arxiv.org/pdf/2403.20330,2024,Comprehensive,General,multi-discipline,QA,1,,,"{""Seed1.5VL"": 77.8, ""Gemini2.5-Pro"": 77.5, ""OpenAI GPT4o"": 65.1, ""Qwen2.5VL-72B"": 70.8, ""InternVL3-78B"": 72.5}","seed1.5vl thinking 77.8, Gemini2.5-pro thinking 77.5, OpenAI GPT4o 65.1, Qwen2.5vl-72B 70.8, InternVL3 78B 72.5",,,,,,,,,,,,,,,,,
25
  Blink,https://arxiv.org/pdf/2404.12390,2024,Comprehensive,General,"natural, synthetic","grounding, low-level pattern matching (e.g., visual correspondence) to mid-level spatial reasoning (e.g., relative depth), and up to high-level visual understanding (e.g., visual similarity)",1,1,,"{""Gemini1.5-Pro"": 61.4, ""Seed1.5VL"": 72.1, ""Gemini2.5-Pro"": 70.6, ""OpenAI GPT4o"": 65.9, ""Qwen2.5VL-72B"": 64.4, ""InternVL3-78B"": 66.3}","Gemini1.5-pro 61.4, seed1.5vl thinking 72.1, Gemini2.5-pro thinking 70.6, OpenAI GPT4o 65.9, Qwen2.5vl-72B 64.4, InternVL3-78B 66.3",,,,,,,,,,,,,,,,,
26
+ MMT-Bench,https://arxiv.org/pdf/2404.16006,2024,Comprehensive,General,"natural, synthetic",162 tasks,,1,,{"InternVL3-78B": 73.2},InternVL3-78B 73.2,,,,,,,,,,,,,,,,,
27
+ Mantis-Eval,https://arxiv.org/pdf/2405.01483,2024,Comprehensive,General,natural,"Co-reference, Comparison, Reasoning, Temporal understanding",,1,,{"InternVL3-78B": 79.3},InternVL3-78B 79.3,,,,,,,,,,,,,,,,,
28
+ MIRB ,https://arxiv.org/pdf/2406.12742,2024,Comprehensive,General,"natural, synthetic","perception, reasoning, knowledge, multi-hop",,1,,{"InternVL3-78B": 64.3},InternVL3-78B 64.3,,,,,,,,,,,,,,,,,
29
  MegaBench,https://arxiv.org/pdf/2410.10563,2024,Comprehensive,General,"natural, synthetic","mathematics, information extraction, planning, coding, perception, metrics, scene, knowledge",1,1,,"{""Qwen2.5VL-72B"": 46.8, ""InternVL2.5-78B"": 45.6}","Qwen2.5vl-72B 46.8, InternVL2.5 78B 45.6",,,,,,,,,,,,,,,,,
30
  Mathvista,https://arxiv.org/pdf/2310.02255,2023,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Gemini-Ultra"": 53.0, ""Gemini1.5-Pro"": 63.9, ""OpenAI o3 high"": 86.8, ""Seed1.5VL"": 85.6, ""Gemini2.5-Pro"": 82.7, ""OpenAI GPT4o"": 63.8, ""Qwen2.5VL-72B"": 74.8, ""InternVL3-78B"": 80.5, ""MiMo-VL-7B"": 81.5}","Gemini-ultra 53.0, Gemini1.5-pro 63.9, OpenAI o3 high 86.8, seed1.5vl thinking 85.6, Gemini2.5-pro thinking 82.7, OpenAI GPT4o 63.8, Qwen2.5vl-72B 74.8, InternVL3 78B 80.5, MiMo-VL 7B 81.5",,,,,,,,,,,,,,,,,
31
  OlympiadBench,https://arxiv.org/pdf/2402.14008,2024,Reasoning,Math & Logic,math,problem-solving,1,,,"{""Seed1.5VL"": 65.0, ""Gemini2.5-Pro"": 69.8, ""OpenAI GPT4o"": 25.9, ""Qwen2.5VL-72B"": 35.9, ""MiMo-VL-7B"": 59.4}","seed1.5vl thinking 65.0, Gemini2.5-pro thinking 69.8, OpenAI GPT4o 25.9, Qwen2.5vl-72B 35.9, MiMo-VL 7B 59.4",,,,,,,,,,,,,,,,,
 
43
  DocVQA,https://arxiv.org/pdf/2007.00398,2020,Reasoning,OCR & Chart& Document,document,QA,1,,,"{""Gemini-Ultra"": 90.9, ""Gemini1.5-Pro"": 93.1, ""Seed1.5VL"": 96.9, ""Gemini2.5-Pro"": 94.0, ""OpenAI GPT4o"": 66.2, ""Qwen2.5VL-72B"": 96.4, ""InternVL3-78B"": 95.4}","Gemini-ultra 90.9, Gemini1.5-pro 93.1, seed1.5vl thinking 96.9, Gemini2.5-pro thinking 94.0, OpenAI GPT4o 66.2, Qwen2.5vl-72B 96.4, InternVL3 78B 95.4",,,,,,,,,,,,,,,,,
44
  InfographicVQA ,https://arxiv.org/pdf/2104.12756,2021,Reasoning,OCR & Chart& Document,infographic,QA,1,,,"{""Gemini-Ultra"": 80.3, ""Gemini1.5-Pro"": 81.0, ""Seed1.5VL"": 91.2, ""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4o"": 79.2, ""Qwen2.5VL-72B"": 87.3, ""InternVL3-78B"": 85.2}","Gemini-ultra 80.3, Gemini1.5-pro 81.0, seed1.5vl thinking 91.2, Gemini2.5-pro thinking 84.3, OpenAI GPT4o 79.2, Qwen2.5vl-72B 87.3, InternVL3 78B 85.2",,,,,,,,,,,,,,,,,
45
  ChartQA ,https://arxiv.org/pdf/2203.10244,2022,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""Gemini-Ultra"": 80.8, ""Gemini1.5-Pro"": 87.2, ""Seed1.5VL"": 89.1, ""Gemini2.5-Pro"": 83.3, ""OpenAI GPT4o"": 86.7, ""Qwen2.5VL-72B"": 89.5, ""InternVL3-78B"": 89.7}","Gemini-ultra 80.8, Gemini1.5-pro 87.2, seed1.5vl thinking 89.1, Gemini2.5-pro thinking 83.3, OpenAI GPT4o 86.7, Qwen2.5vl-72B 89.5, InternVL3 78B 89.7",,,,,,,,,,,,,,,,,
46
+ TAT-DQA,https://arxiv.org/pdf/2207.11871,2022,Reasoning,OCR & Chart& Document,document,QA,1,,,{"Gemini1.5-Pro": 37.8},Gemini1.5-pro 37.8,,,,,,,,,,,,,,,,,
47
+ DUDE ,https://arxiv.org/pdf/2305.08455,2023,Reasoning,OCR & Chart& Document,document,QA,,1,,{"Gemini1.5-Pro": 46.0},Gemini1.5-pro 46.0,,,,,,,,,,,,,,,,,
48
  SEED-Bench-2-Plus,https://arxiv.org/pdf/2404.16790,2024,Reasoning,OCR & Chart& Document,"chart, web page, map",QA,1,,,"{""Qwen2.5VL-72B"": 73.0, ""InternVL2.5-78B"": 71.3, ""Gemini1.5-Pro"": 70.8}","Qwen2.5vl-72B 73.0, InternVL2.5 78B 71.3, Gemini 1.5 pro 70.8",,,,,,,,,,,,,,,,,
49
  CharXiv reasoning/description,https://arxiv.org/pdf/2406.18521,2024,Reasoning,OCR & Chart& Document,chart,QA,1,,,"{""OpenAI o3 high"": {""score1"": 78.6, ""score2"": 95.0}, ""Seed1.5VL"": {""score1"": 60.2, ""score2"": 92.6}, ""Gemini2.5-Pro"": {""score1"": 69.9, ""score2"": 94.4}, ""OpenAI GPT4o"": {""score1"": 52.0, ""score2"": 86.5}, ""Qwen2.5VL-72B"": {""score1"": 49.7, ""score2"": 87.4}}","OpenAI o3 high 78.6/95.0, seed1.5vl thinking 60.2/92.6, Gemini2.5-pro thinking 69.9/94.4, OpenAI GPT4o 52.0/86.5, Qwen2.5vl-72B 49.7/87.4",,,,,,,,,,,,,,,,,
50
  MMLongBench-DOC,https://arxiv.org/pdf/2407.01523,2024,Reasoning,OCR & Chart& Document,long document,QA,,1,,"{""Kimi-VL-A3B-Thinking-2506"": 42.1, ""OpenAI GPT4o"": 42.8, ""Qwen2.5VL-72B"": 38.8}","Kimi-VL-A3B-Thinking-2506 42.1, OpenAI GPT4o 42.8, Qwen2.5vl-72B 38.8",,,,,,,,,,,,,,,,,
51
  VisualWebBench ,https://arxiv.org/pdf/2404.05955,2024,Comprehensive,OCR & Chart& Document,web page,grounding,1,,,"{""Seed1.5VL"": 87.3, ""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4o"": 80.2, ""Qwen2.5VL-72B"": 82.3}","seed1.5vl thinking 87.3, Gemini2.5-pro thinking 87.3, OpenAI GPT4o 80.2, Qwen2.5vl-72B 82.3",,,,,,,,,,,,,,,,,
52
  QVHighlights,https://arxiv.org/pdf/2107.09609,2021,Understanding,Short Video,natural,moment retrieval,,,1,"{""Gemini2.5-Pro"": 75.0, ""OpenAI GPT4.1"": 71.4}","Gemini2.5-pro thinking 75.0, OpenAI GPT4.1 71.4",,,,,,,,,,,,,,,,,
53
+ TACoS ,https://arxiv.org/pdf/1403.6173,2014,Understanding,Short Video,natural,grounding,,,1,{"Seed1.5VL": 49.6},Seed1.5-VL thinking 49.6,,,,,,,,,,,,,,,,,
54
  Charades-STA,https://arxiv.org/pdf/1705.02101,2017,Understanding,Short Video,natural,grounding,,,1,"{""Seed1.5VL"": 64.0, ""Qwen2.5VL-72B"": 50.9}","Seed1.5-VL thinking 64.0, Qwen2.5-VL 72B 50.9",,,,,,,,,,,,,,,,,
55
  YouCook2 ,https://arxiv.org/pdf/1703.09788,2017,Understanding,Short Video,cooking,perception,,,1,"{""Gemini-Ultra"": 135.4, ""Gemini1.5-Pro"": 106.5, ""Gemini2.5-Pro"": 188.3, ""OpenAI GPT4.1"": 127.6}","Gemini-ultra 135.4, Gemini1.5-pro 106.5, Gemini2.5-pro thinking 188.3, OpenAI GPT4.1 127.6",,,,,,,,,,,,,,,,,
56
  VATEX ,https://arxiv.org/pdf/1904.03493,2019,Understanding,Short Video,natural,perception,,,1,"{""Gemini-Ultra"": {""en"": 62.7, ""zh"": 51.3}, ""Gemini1.5-Pro"": {""en"": 64.6, ""zh"": 55.3}, ""Gemini2.5-Pro"": {""en"": 71.3, ""zh"": 59.7}, ""OpenAI GPT4.1"": {""en"": 64.1, ""zh"": 48.7}}","Gemini-ultra 62.7/51.3 (En/Zh), Gemini1.5-pro 64.6/55.3 (En/Zh), Gemini2.5-pro thinking 71.3/59.7 (En/Zh), OpenAI GPT4.1 64.1/48.7 (En/Zh)",,,,,,,,,,,,,,,,,
57
  EgoSchema,https://arxiv.org/pdf/2308.09126,2023,Understanding,Short Video,natural,perception,,,1,"{""Gemini1.5-Pro"": 72.2, ""Qwen2.5VL-72B"": 76.2}","Gemini1.5-pro 72.2, Qwen2.5-VL 72B 76.2",,,,,,,,,,,,,,,,,
58
  TemporalBench,https://arxiv.org/pdf/2410.10818,2024,Understanding,Short Video,natural,captioning,,,1,"{""Seed1.5VL"": 79.8, ""OpenAI GPT4o"": 73.3}","Seed1.5-VL thinking 79.8, OpenAI GPT4o 73.3",,,,,,,,,,,,,,,,,
59
+ Dream-1K,https://arxiv.org/pdf/2407.00634,2024,Understanding,Short Video,natural,captioning,,,1,{"Seed1.5VL": 43.9},Seed1.5-VL thinking 43.9,,,,,,,,,,,,,,,,,
60
+ MotionBench,https://arxiv.org/pdf/2501.02955,2025,Understanding,Short Video,natural,motion understanding,,,1,{"Seed1.5VL": 68.4},Seed1.5-VL thinking 68.4,,,,,,,,,,,,,,,,,
61
  MVBench,https://arxiv.org/pdf/2311.17005,2023,Understanding,Short Video,natural,"temporal understanding, spatial understanding",,,1,"{""Seed1.5VL"": 74.4, ""InternVL2.5-78B"": 76.4, ""Qwen2.5VL-72B"": 70.4}","Seed1.5-VL thinking 74.4, InternVL2.5 78B 76.4, Qwen2.5-VL 72B 70.4",,,,,,,,,,,,,,,,,
62
  ActivityNet-QA,https://arxiv.org/pdf/1906.02467,2019,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 52.2, ""Gemini1.5-Pro"": 57.5, ""Gemini2.5-Pro"": 66.6, ""OpenAI GPT4.1"": 60.4}","Gemini-ultra 52.2, Gemini1.5-pro 57.5, Gemini2.5-pro thinking 66.6, OpenAI GPT4.1 60.4",,,,,,,,,,,,,,,,,
63
+ NextQA ,https://arxiv.org/pdf/2105.08276,2021,Reasoning,Short Video,natural,QA,,,1,{"Gemini-Ultra": 29.9},Gemini-ultra 29.9,,,,,,,,,,,,,,,,,
64
  Perception Test MCQA,https://proceedings.neurips.cc/paper_files/paper/2023/file/8540fba4abdc7f9f7a7b1cc6cd60e409-Paper-Datasets_and_Benchmarks.pdf,2023,Reasoning,Short Video,natural,QA,,,1,"{""Gemini-Ultra"": 54.7, ""Gemini2.5-Pro"": 78.4, ""OpenAI GPT4.1"": 64.8, ""Qwen2.5VL-72B"": 73.2}","Gemini-ultra 54.7, Gemini2.5-pro thinking 78.4, OpenAI GPT4.1 64.8, Qwen2.5-VL 72B 73.2",,,,,,,,,,,,,,,,,
65
  MMVU,https://arxiv.org/pdf/2501.12380,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 75.8, ""Seed1.5VL"": 70.1, ""Qwen2.5VL-72B"": 62.9}","Gemini2.5-pro thinking 75.8, seed1.5vl thinking 70.1, Qwen2.5-VL 72B 62.9",,,,,,,,,,,,,,,,,
66
  VideoMMMU,https://arxiv.org/pdf/2501.13826,2025,Reasoning,Short Video,multi-discipline,QA,,,1,"{""Gemini2.5-Pro"": 83.6, ""OpenAI GPT4.1"": 60.9, ""Seed1.5VL"": 81.4, ""Kimi k1.6"": 76.7, ""Qwen2.5VL-72B"": 60.2}","Gemini2.5-pro thinking 83.6, OpenAI GPT4.1 60.9, seed1.5vl thinking 81.4, Kimi k1.6 76.7, Qwen2.5-VL 72B 60.2",,,,,,,,,,,,,,,,,
 
69
  TVBench,https://arxiv.org/pdf/2410.07752,2024,Reasoning,Short Video,"natural, synthetic",temporal reasoning,,,1,"{""Seed1.5VL"": 63.6, ""Gemini2.5-Pro"": 62.6}","Seed1.5-VL thinking 63.6, Gemini2.5-pro thinking 62.6",,,,,,,,,,,,,,,,,
70
  TOMATO ,https://arxiv.org/pdf/2410.23266,2024,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Seed1.5VL"": 44.7, ""Gemini2.5-Pro"": 46.9}","Seed1.5-VL thinking 44.7, Gemini2.5-pro thinking 46.9",,,,,,,,,,,,,,,,,
71
  EgoTempo,https://arxiv.org/pdf/2503.13646v1,2025,Reasoning,Short Video,natural,temporal reasoning,,,1,"{""Gemini2.5-Pro"": 44.3, ""OpenAI GPT4.1"": 40.3}","Gemini2.5-pro thinking 44.3, OpenAI GPT4.1 40.3",,,,,,,,,,,,,,,,,
72
+ MMBench-Video,https://arxiv.org/pdf/2406.14515,2024,Comprehensive,Short Video,"natural, synthetic","temporal reasoning, commonsense reasoning, attribute reasoning, logic reasoning, relation reasoning, perception",,,1,{"Qwen2.5VL-72B": 2.02},Qwen2.5vl-72B 2.02,,,,,,,,,,,,,,,,,
73
  1H-VideoQA,https://arxiv.org/pdf/2403.05530,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 81.0, ""OpenAI GPT4.1"": 56.8}","Gemini2.5-pro thinking 81.0, OpenAI GPT4.1 56.8",,,,,,,,,,,,,,,,,
74
  LVBench,https://arxiv.org/pdf/2406.08035,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 78.7, ""OpenAI GPT4.1"": 63.4, ""Seed1.5VL"": 64.6, ""Qwen2.5VL-72B"": 47.3}","Gemini2.5-pro thinking 78.7, OpenAI GPT4.1 63.4, Seed1.5-VL thinking 64.6, Qwen2.5-VL 72B 47.3",,,,,,,,,,,,,,,,,
75
  VideoMME w/o subtitle,https://arxiv.org/abs/2405.21075,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 84.3, ""OpenAI GPT4.1"": 72.0, ""Seed1.5VL"": 77.9}","Gemini2.5-pro thinking 84.3, OpenAI GPT4.1 72.0, Seed1.5-vl thinking 77.9",,,,,,,,,,,,,,,,,
 
78
  Neptune,https://arxiv.org/pdf/2412.09582,2024,Comprehensive,Long Video,natural,QA,,,1,"{""Gemini2.5-Pro"": 87.3, ""OpenAI GPT4.1"": 85.2}","Gemini2.5-pro thinking 87.3, OpenAI GPT4.1 85.2",,,,,,,,,,,,,,,,,
79
  StreamBench ,https://arxiv.org/pdf/2501.13468,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.8, ""OpenAI GPT4o"": 68.7}","Seed1.5-VL thinking 72.8, OpenAI GPT4o 68.7",,,,,,,,,,,,,,,,,
80
  OVO-Bench,https://arxiv.org/pdf/2501.05510,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,"{""Seed1.5VL"": 72.3, ""Gemini1.5-Pro"": 67.7}","Seed1.5-VL thinking 72.3, Gemini1.5-pro 67.7",,,,,,,,,,,,,,,,,
81
+ OVBench ,https://arxiv.org/pdf/2501.00584,2025,Reasoning,Streaming Video,natural,streaming reasoning,,,1,{"Seed1.5VL": 60.0},Seed1.5-VL thinking 60.0,,,,,,,,,,,,,,,,,
82
  NYU-Depth V2 (absolute relative error↓),https://link.springer.com/chapter/10.1007/978-3-642-33715-4_54,2012,Understanding,Spatial & Embodied Reasoning,indoor scene,depth estimation,1,,,"{""Seed1.5VL"": 13.6, ""Gemini2.5-Pro"": 27.5, ""OpenAI GPT4o"": 73.8, ""Qwen2.5VL-72B"": 35.5}","seed1.5vl thinking 13.6, Gemini2.5-pro thinking 27.5, OpenAI GPT4o 73.8, Qwen2.5vl-72B 35.5",,,,,,,,,,,,,,,,,
83
  DA-2K,https://arxiv.org/pdf/2406.09414v1,2024,Understanding,Spatial & Embodied Reasoning,natural,depth estimation,1,,,"{""Seed1.5VL"": 91.7, ""Gemini2.5-Pro"": 73.0, ""OpenAI GPT4o"": 66.9, ""Qwen2.5VL-72B"": 69.6}","seed1.5vl thinking 91.7, Gemini2.5-pro thinking 73.0, OpenAI GPT4o 66.9, Qwen2.5vl-72B 69.6",,,,,,,,,,,,,,,,,
84
+ OpenEQA ,https://openaccess.thecvf.com/content/CVPR2024/papers/Majumdar_OpenEQA_Embodied_Question_Answering_in_the_Era_of_Foundation_Models_CVPR_2024_paper.pdf,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,embodied reasoning,,,1,{"Gemini-Ultra": 57.9},Gemini-ultra 57.9,,,,,,,,,,,,,,,,,
85
  VSI-Bench,https://arxiv.org/abs/2412.14171,2024,Reasoning,Spatial & Embodied Reasoning,indoor scene,"object count, object size, relative distance, absolute distance, appearance order, room size, relative direction, route plan",,,1,"{""InternVL3-78B"": 48.4, ""Gemini1.5-Pro"": 45.4, ""OpenAI GPT4o"": 34.0}","InternVL3-78B 48.4, Gemini-1.5 Pro 45.4, OpenAI GPT 4o 34.0",,,,,,,,,,,,,,,,,
86
  All-Angles Bench,https://arxiv.org/pdf/2504.15280,2025,Reasoning,Spatial & Embodied Reasoning,"indoor scene, residential area, industrial space",spatial reasoning,,1,,"{""Seed1.5VL"": 58.6, ""Gemini2.5-Pro"": 53.4, ""OpenAI GPT4o"": 49.1, ""Qwen2.5VL-72B"": 55.7}","seed1.5vl thinking 58.6, Gemini2.5-pro thinking 53.4, OpenAI GPT4o 49.1, Qwen2.5vl-72B 55.7",,,,,,,,,,,,,,,,,
87
+ ERQA,https://storage.googleapis.com/deepmind-media/gemini-robotics/gemini_robotics_report.pdf,2025,Reasoning,Spatial & Embodied Reasoning,"indoor scene, residential area",spatial reasoning,1,1,,"{""Gemini2.0-Pro Experimental"": 54.8, ""OpenAI GPT4o"": 47.0, ""Qwen2.5VL-72B"": 44.8, ""GLM-4.5V"": 50.0}","Gemini2.0-pro experimental 54.8, OpenAI GPT4o 47.0, Qwen2.5vl-72B 44.8, GLM-4.5V 50.0",,,,,,,,,,,,,,,,,
88
  ScreenSpot-V2,https://arxiv.org/pdf/2410.23218,2024,Understanding,Agent,GUI,grounding,,1,,"{""Seed1.5VL"": 95.2, ""OpenAI CUA"": 87.9, ""Claude3.7-Sonnet"": 87.6, ""Kimi-VL-A3B"": 92.8}","Seed1.5-VL thinking 95.2, OpenAI CUA 87.9, Claude 3.7 Sonnet 87.6, Kimi VL-A3B 92.8",,,,,,,,,,,,,,,,,
89
  ScreenSpot-Pro,https://arxiv.org/pdf/2504.07981v1,2025,Understanding,Agent,GUI,grounding,1,,,"{""Seed1.5VL"": 60.9, ""OpenAI CUA"": 23.4, ""Claude3.7-Sonnet"": 27.7, ""Kimi-VL-A3B"": 34.5, ""Qwen2.5VL-72B"": 43.6}","Seed1.5-VL thinking 60.9, OpenAI CUA 23.4, Claude 3.7 Sonnet 27.7, Kimi VL-A3B 34.5, Qwen2.5vl 72B 43.6",,,,,,,,,,,,,,,,,
90
  OSWorld ,https://arxiv.org/pdf/2404.07972,2024,Reasoning,Agent,GUI,computer use,,1,,"{""Seed1.5VL"": 36.1, ""OpenAI CUA"": 38.1, ""Claude3.7-Sonnet"": 28.0, ""Kimi-VL-A3B"": 8.2, ""Qwen2.5VL-72B"": 8.8}","Seed1.5-VL thinking 36.1, OpenAI CUA 38.1, Claude 3.7 Sonnet 28.0, Kimi VL-A3B 8.2, Qwen2.5vl 72B 8.8",,,,,,,,,,,,,,,,,
 
92
  WebVoyager ,https://arxiv.org/pdf/2401.13919,2024,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 87.2, ""OpenAI CUA"": 87.0, ""Claude3.7-Sonnet"": 84.1}","Seed1.5-VL thinking 87.2, OpenAI CUA 87.0, Claude 3.7 Sonnet 84.1",,,,,,,,,,,,,,,,,
93
  Online-Mind2Web,https://arxiv.org/pdf/2504.01382,2025,Reasoning,Agent,GUI,browser use,,1,,"{""Seed1.5VL"": 76.4, ""OpenAI CUA"": 71.0, ""Claude3.7-Sonnet"": 62.9}","Seed1.5-VL thinking 76.4, OpenAI CUA 71.0, Claude 3.7 Sonnet 62.9",,,,,,,,,,,,,,,,,
94
  Android World,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Seed1.5VL"": 62.1, ""Qwen2.5VL-72B"": 35.0}","Seed1.5-VL thinking 62.1, Qwen2.5vl 72B 35.0",,,,,,,,,,,,,,,,,
95
+ MobileMiniWob++,https://arxiv.org/pdf/2405.14573v2,2024,Reasoning,Agent,GUI,phone use,,1,,{"Qwen2.5VL-72B": 68.0},Qwen2.5vl 72B 68.0,,,,,,,,,,,,,,,,,
96
+ Android Control,https://arxiv.org/pdf/2406.03679,2024,Reasoning,Agent,GUI,phone use,,1,,"{""Qwen2.5VL-72B"": {""high"": 67.4, ""low"": 93.7}}",Qwen2.5vl 72B 67.4/93.7 (high/low),,,,,,,,,,,,,,,,,