import json
import os
import pandas as pd
from utils import create_hyperlinked_names

def sum_lst(lst):
    """Reduce a non-empty list with `+`: concatenates a list of lists, or sums a list of numbers."""
    assert isinstance(lst, list) and lst, f"Input should be a non-empty list, got {type(lst)}"
    total = lst[0]
    for item in lst[1:]:
        assert isinstance(item, (list, int, float)), f"Expected types are list and numbers, got {type(item)}"
        total += item
    return total
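
# Illustrative examples (hypothetical inputs, not used by this module):
#   sum_lst([['a', 'b'], ['c']]) -> ['a', 'b', 'c']   # list concatenation
#   sum_lst([1, 2.5, 0.5])       -> 4.0               # numeric sum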

SCORE_BASE_DIR = "Scores"
META_DATA = ["model_name", "model_size", "url"]
DATASETS = {
    "image": {
        "I-CLS": ['VOC2007', 'N24News', 'SUN397', 'ObjectNet', 'Country211', 'Place365', 'ImageNet-1K', 'HatefulMemes', 'ImageNet-A', 'ImageNet-R'], 
        "I-QA": ['OK-VQA', 'A-OKVQA', 'DocVQA', 'InfographicsVQA', 'ChartQA', 'Visual7W-Pointing', 'ScienceQA', 'GQA', 'TextVQA', 'VizWiz'], 
        "I-RET": ['VisDial', 'CIRR', 'VisualNews_t2i', 'VisualNews_i2t', 'MSCOCO_t2i', 'MSCOCO_i2t', 'NIGHTS', 'WebQA', 'FashionIQ', 'Wiki-SS-NQ', 'OVEN', 'EDIS'],
        "I-VG": ['MSCOCO', 'RefCOCO', 'RefCOCO-Matching', 'Visual7W']
        }, 
    "visdoc": {
        "VisDoc": ['ViDoRe_arxivqa', 'ViDoRe_docvqa', 'ViDoRe_infovqa', 'ViDoRe_tabfquad', 'ViDoRe_tatdqa', 'ViDoRe_shiftproject', 'ViDoRe_syntheticDocQA_artificial_intelligence', 'ViDoRe_syntheticDocQA_energy', 'ViDoRe_syntheticDocQA_government_reports', 'ViDoRe_syntheticDocQA_healthcare_industry', 'VisRAG_ArxivQA', 'VisRAG_ChartQA', 'VisRAG_MP-DocVQA', 'VisRAG_SlideVQA', 'VisRAG_InfoVQA', 'VisRAG_PlotQA', 'ViDoSeek-page', 'ViDoSeek-doc', 'MMLongBench-page', 'MMLongBench-doc']
        }, 
    "video": {
        "V-CLS": ['K700', 'UCF101', 'HMDB51', 'SmthSmthV2', 'Breakfast'], 
        "V-QA": ['Video-MME', 'MVBench', 'NExTQA', 'EgoSchema'], 
        "V-RET": ['MSR-VTT', 'MSVD', 'DiDeMo', 'VATEX', 'YouCook2'], 
        "V-MRET": ['QVHighlight', 'Charades-STA', 'MomentSeeker', 'ActivityNetQA']
        }
}
ALL_DATASETS_SPLITS = {k: sum_lst(list(v.values())) for k, v in DATASETS.items()}  # modality -> flat list of its dataset names
ALL_DATASETS = sum_lst(list(ALL_DATASETS_SPLITS.values()))  # flat list of every dataset name across modalities
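# For illustration, the flattened structures look like:
#   ALL_DATASETS_SPLITS['video'] -> ['K700', 'UCF101', ..., 'ActivityNetQA']
#   ALL_DATASETS                 -> every image, visdoc, and video dataset name in one list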
MODALITIES = list(DATASETS.keys())
# Maps dataset name -> metric key; any dataset not listed falls back to '__default__'.
SPECIAL_METRICS = {
    '__default__': 'hit@1',
}

BASE_COLS = ['Rank', 'Models', 'Model Size(B)']
TASKS = ["Overall", "Image-Overall", "I-CLS", "I-QA", "I-RET", "I-VG", "VisDoc", "Video-Overall", "V-CLS", "V-QA", "V-RET", "V-MRET"]
COLUMN_NAMES = BASE_COLS + TASKS

DATA_TITLE_TYPE = ['number', 'markdown', 'str', 'markdown'] + \
                    ['number'] * len(TASKS)

TABLE_INTRODUCTION = """"""

LEADERBOARD_INFO = """
## Dataset Summary
"""

CITATION_BUTTON_TEXT = r""""""

def load_single_json(file_path):
    """Load one JSON file and return its parsed contents."""
    with open(file_path, 'r') as file:
        data = json.load(file)
    return data

def load_data(base_dir=SCORE_BASE_DIR):
    """Collect every '*-scores_report.json' report under base_dir into a list of dicts."""
    all_data = []
    for file_name in os.listdir(base_dir):
        if file_name.endswith('-scores_report.json'):
            file_path = os.path.join(base_dir, file_name)
            data = load_single_json(file_path)
            all_data.append(data)
    return all_data
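
# Each score report is expected to look roughly like this (a sketch inferred from
# generate_model_row and calculate_score below, not a formal schema):
# {
#     "metadata": {"model_name": "...", "model_size": "...", "url": "..."},
#     "metrics": {"image": {"VOC2007": 0.8, ...}, "visdoc": {...}, "video": {...}}
# }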

def calculate_score(raw_scores):
    """Calculate average scores from the raw scores: one per sub-task, one per modality, and one overall.
    Algorithm summary:
    1. Average each sub-task over its datasets; missing or "FILE_N/A" entries count as 0.0.
    2. Average each modality over all of its datasets (weighted by dataset count, not a mean of sub-task means).
    3. Average 'Overall' over every dataset across all modalities.
    """
    def get_avg(sum_score, leng):
        return sum_score / leng if leng > 0 else 0.0
    
    avg_scores = {}
    overall_scores_summary = {}  # Maps each modality to its running (score sum, dataset count)
    for modality, datasets_list in DATASETS.items(): # Ex.: ('image', {'I-CLS': [...], 'I-QA': [...]})
        overall_scores_summary[modality] = (0.0, 0)  # Initialize the running sum and count for this modality
        for sub_task, datasets in datasets_list.items(): # Ex.: ('I-CLS', ['VOC2007', 'N24News', ...])
            sub_task_sum_score, sub_task_datasets_len = 0.0, len(datasets)
            for dataset in datasets: # Ex.: 'VOC2007'
                score = raw_scores.get(modality, {}).get(dataset, 0.0)
                score = 0.0 if score == "FILE_N/A" else score  # Treat a missing result file as a zero score
                metric = SPECIAL_METRICS.get(dataset, SPECIAL_METRICS['__default__'])
                if isinstance(score, dict):  # Per-metric dict: select the relevant metric
                    score = score.get(metric, 0.0)
                sub_task_sum_score += score

            sub_task_overall = get_avg(sub_task_sum_score, sub_task_datasets_len)
            avg_scores[sub_task] = sub_task_overall

            # Accumulate the running score sum and dataset count for each modality
            modality_sum_score, modality_datasets_len = overall_scores_summary[modality]
            modality_sum_score += sub_task_sum_score
            modality_datasets_len += sub_task_datasets_len
            overall_scores_summary[modality] = (modality_sum_score, modality_datasets_len)

    all_datasets_sum_score, all_datasets_len = 0.0, 0
    for modality, (modality_sum_score, modality_datasets_len) in overall_scores_summary.items():
        name = f"{modality.capitalize()}-Overall"
        avg_scores[name] = get_avg(modality_sum_score, modality_datasets_len)
        # Accumulate the running score sum and dataset count across all datasets
        all_datasets_sum_score += modality_sum_score
        all_datasets_len += modality_datasets_len
    avg_scores['Overall'] = get_avg(all_datasets_sum_score, all_datasets_len)
    return avg_scores
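
# Minimal usage sketch with hypothetical scores (values are made up):
#   calculate_score({'image': {'VOC2007': 0.8, 'OK-VQA': {'hit@1': 0.6}}})
# yields averages for every sub-task plus 'Image-Overall', 'Visdoc-Overall',
# 'Video-Overall', and 'Overall'; datasets absent from the input count as 0.0.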

def generate_model_row(data):
    """Build one leaderboard row (metadata columns plus averaged scores) from a score report."""
    metadata = data['metadata']
    row = {
        'Models': metadata.get('model_name', None), 
        'Model Size(B)': metadata.get('model_size', None),
        'URL': metadata.get('url', None)
    }
    scores = calculate_score(data['metrics'])
    row.update(scores)
    return row

def get_df():
    """Load all score reports and build the leaderboard DataFrame, ranked by 'Overall' score."""
    all_data = load_data()
    rows = [generate_model_row(data) for data in all_data]
    df = pd.DataFrame(rows)
    df = df.sort_values(by='Overall', ascending=False).reset_index(drop=True)
    df['Rank'] = range(1, len(df) + 1)
    df = create_hyperlinked_names(df)

    return df
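

# Minimal usage sketch (assumes the Scores/ directory and the local utils module
# are available at runtime):
if __name__ == "__main__":
    leaderboard = get_df()
    print(leaderboard.head())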