Commit b54190c
Parent: 7a82a99

group results by max eval length

Files changed:
- app.py (+4, -2)
- draw_utils.py (+11, -4)
app.py
CHANGED
@@ -11,11 +11,13 @@ def draw_leaderboard():
     df = load_results()
 
     tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
-    columns = ["model_name", "
+    columns = ["model_name", "<=32k", "<=128k"] + LENGTHS
 
     st.title("🪡 BABILong Leaderboard")
     st.markdown(PAGE_INFO)
-    st.subheader("
+    st.subheader("Evaluation results:")
+    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
+    st.markdown('All models predictions: [BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)')
     search_term = st.text_input("Search models:", "")
 
     tabs = st.tabs([str(task) for task in tasks])
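For context, the new columns list and the per-task tabs are consumed later in draw_leaderboard(), which is not shown in this diff. A minimal hypothetical sketch of such a continuation (an assumption, not the author's code) could look like this:

# Hypothetical continuation of draw_leaderboard() (not part of this commit):
# filter rows by the search box, then show the selected columns in each task tab.
for tab, task in zip(tabs, tasks):
    with tab:
        view = df if not search_term else df[df['model_name'].str.contains(search_term, case=False)]
        st.dataframe(view[[c for c in columns if c in view.columns]])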
draw_utils.py
CHANGED
@@ -13,7 +13,7 @@ PAGE_MARKDOWN = """
     </style>
 """
 
-PAGE_INFO = """[](https://huggingface.co/datasets/
+PAGE_INFO = """[](https://huggingface.co/datasets/RMT-team/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/RMT-team/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
 
 LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M']
 LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
@@ -28,9 +28,16 @@ def load_results():
 
     res = pd.concat([old_results, new_results])
     res.replace(-1, np.nan, inplace=True)
-    res['
-    res['
-
+    res['<=32k'] = res[LENGTHS_32k].mean(axis=1)
+    res['<=128k'] = res[LENGTHS_128k].mean(axis=1)
+
+    # Calculate the maximum length with non-NaN values for each model
+    res['max_eval_length_idx'] = res.apply(
+        lambda row: max([LENGTHS.index(col) for col in LENGTHS if not pd.isna(row[col])], default=-1), axis=1)
+    res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])
+
+    # Sort first by max length (descending) and then by average score (descending)
+    res.sort_values(['max_eval_length_idx', '<=128k'], ascending=[False, False], inplace=True)
 
     return res
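To see what the new load_results() logic does on its own, here is a minimal, runnable sketch applied to a toy results frame. LENGTHS_128k is not shown in this diff, so it is assumed below to be the prefix of LENGTHS up to '128k'.

import numpy as np
import pandas as pd

LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M']
LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
LENGTHS_128k = LENGTHS[:LENGTHS.index('128k') + 1]  # assumed definition, not shown in the diff

# Toy results: model_a evaluated up to 128k, model_b only up to 32k (NaN = not evaluated).
res = pd.DataFrame(
    [[0.90, 0.80, 0.70, 0.60, 0.50, 0.50, 0.40, 0.40, 0.30, np.nan, np.nan, np.nan],
     [0.95, 0.90, 0.85, 0.80, 0.70, 0.60, 0.50, np.nan, np.nan, np.nan, np.nan, np.nan]],
    columns=LENGTHS, index=['model_a', 'model_b'])

# Same steps as the added code in load_results():
res['<=32k'] = res[LENGTHS_32k].mean(axis=1)
res['<=128k'] = res[LENGTHS_128k].mean(axis=1)

# Index of the longest context length with a non-NaN score for each model
res['max_eval_length_idx'] = res.apply(
    lambda row: max([LENGTHS.index(col) for col in LENGTHS if not pd.isna(row[col])], default=-1), axis=1)
res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])

# Longest-evaluated models first, ties broken by the <=128k average
res.sort_values(['max_eval_length_idx', '<=128k'], ascending=[False, False], inplace=True)
print(res[['max_eval_length', '<=32k', '<=128k']])

With this ordering, model_a (evaluated up to 128k) is listed before model_b (evaluated only up to 32k) even though model_b has the higher average over the lengths it was actually run on; within the same max eval length, rows fall back to the <=128k average. This is the grouping referred to in the commit message.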