yurakuratov committed
Commit b54190c · Parent: 7a82a99

group results by max eval length

Files changed (2):
  1. app.py +4 -2
  2. draw_utils.py +11 -4
app.py CHANGED
@@ -11,11 +11,13 @@ def draw_leaderboard():
     df = load_results()
 
     tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
-    columns = ["model_name", "avg(32k)", "avg(128k)"] + LENGTHS
+    columns = ["model_name", "<=32k", "<=128k"] + LENGTHS
 
     st.title("🔎📚🪡📚❓ BABILong Leaderboard 🏆")
     st.markdown(PAGE_INFO)
-    st.subheader("Average Accuracy")
+    st.subheader("Evaluation results:")
+    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
+    st.markdown('All models predictions: [BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)')
     search_term = st.text_input("Search models:", "")
 
     tabs = st.tabs([str(task) for task in tasks])
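
The hunk ends before the tabs are filled in. As a rough sketch of how the renamed columns, the search box, and the per-task tabs plausibly fit together downstream (the rendering code is outside this diff, so the per-tab filtering and the st.dataframe call below are assumptions, not the app's actual code):

for tab, task in zip(tabs, tasks):
    with tab:
        shown = df  # hypothetical: the real app presumably selects this task's scores here
        if search_term:
            shown = shown[shown['model_name'].str.contains(search_term, case=False)]
        st.dataframe(shown[columns])  # model_name, <=32k, <=128k, then one column per length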
draw_utils.py CHANGED
@@ -13,7 +13,7 @@ PAGE_MARKDOWN = """
 </style>
 """
 
-PAGE_INFO = """[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-lg.svg)](https://huggingface.co/datasets/booydar/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/booydar/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
+PAGE_INFO = """[![Dataset on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/dataset-on-hf-lg.svg)](https://huggingface.co/datasets/RMT-team/babilong) | [GitHub](https://github.com/booydar/babilong) | [Paper](https://arxiv.org/abs/2406.10149) | [HF Dataset](https://huggingface.co/datasets/RMT-team/babilong) | [HF Dataset 1k samples per task](https://huggingface.co/datasets/RMT-team/babilong-1k-samples) |"""
 
 LENGTHS = ['0k', '1k', '2k', '4k', '8k', '16k', '32k', '64k', '128k', '512k', '1M', '2M']
 LENGTHS_32k = ['0k', '1k', '2k', '4k', '8k', '16k', '32k']
@@ -28,9 +28,16 @@ def load_results():
 
     res = pd.concat([old_results, new_results])
     res.replace(-1, np.nan, inplace=True)
-    res['avg(32k)'] = res[LENGTHS_32k].mean(axis=1)
-    res['avg(128k)'] = res[LENGTHS_128k].mean(axis=1)
-    res.sort_values(['avg(128k)'], ascending=False, inplace=True)
+    res['<=32k'] = res[LENGTHS_32k].mean(axis=1)
+    res['<=128k'] = res[LENGTHS_128k].mean(axis=1)
+
+    # Calculate the maximum length with non-NaN values for each model
+    res['max_eval_length_idx'] = res.apply(
+        lambda row: max([LENGTHS.index(col) for col in LENGTHS if not pd.isna(row[col])], default=-1), axis=1)
+    res['max_eval_length'] = res['max_eval_length_idx'].apply(lambda x: LENGTHS[x])
+
+    # Sort first by max length (descending) and then by average score (descending)
+    res.sort_values(['max_eval_length_idx', '<=128k'], ascending=[False, False], inplace=True)
 
     return res
43