natolambert commited on
Commit
f460af4
Β·
1 Parent(s): 96e55d5

init v1 port

Browse files
Files changed (3) hide show
  1. app.py +93 -95
  2. leaderboard/final-rbv1-data.csv +182 -0
  3. leaderboard/retired-app.py +462 -0
app.py CHANGED
@@ -5,24 +5,25 @@ import gradio as gr
5
  import numpy as np
6
  from datasets import load_dataset
7
  from huggingface_hub import HfApi, snapshot_download
 
8
 
9
  from leaderboard.constants import example_counts, length_categories, subset_mapping
10
  from leaderboard.css import custom_css
11
  from leaderboard.md import *
12
  from leaderboard.utils import load_all_data
13
 
 
 
 
14
  api = HfApi()
15
 
16
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
17
  evals_repo = "allenai/reward-bench-v2-results"
18
 
19
  eval_set_repo = "allenai/reward-bench-v2-v0"
20
- repo_dir_rewardbench = "./evals/rewardbench/"
21
-
22
-
23
- def restart_space():
24
- api.restart_space(repo_id="allenai/reward-bench-v2", token=COLLAB_TOKEN)
25
 
 
26
 
27
  print("Pulling evaluation results")
28
  repo = snapshot_download(
@@ -35,6 +36,9 @@ repo = snapshot_download(
35
  repo_type="dataset",
36
  )
37
 
 
 
 
38
 
39
  def avg_over_rewardbench_v2(dataframe_core):
40
  domain_cols = ["chat", "factuality", "safety", "math", "precise if", "ties"]
@@ -56,7 +60,6 @@ def avg_over_rewardbench_v2(dataframe_core):
56
 
57
  return new_df
58
 
59
-
60
  def avg_over_rewardbench(dataframe_core, dataframe_prefs):
61
  """
62
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
@@ -125,55 +128,6 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
125
  new_df = new_df[keep_columns]
126
  return new_df
127
 
128
-
129
- def expand_subsets(dataframe):
130
- # TODO need to modify data/ script to do this
131
- pass
132
-
133
-
134
- def length_bias_check(dataframe):
135
- """
136
- Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
137
- Then, take the average of the three buckets as "average"
138
- """
139
- new_df = dataframe.copy()
140
- existing_subsets = new_df.columns[3:] # model, model_type, average
141
- final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
142
- # new data is empty list dict for each final subset
143
- new_data = {s: [] for s in final_subsets}
144
-
145
- # now, subsets correspond to those with True, Nuetral, and False length bias
146
- # check if length_categories[subset] == "True" or "False" or "Neutral"
147
- for subset in existing_subsets:
148
- subset_data = new_df[subset].values
149
- subset_length = length_categories[subset]
150
- # route to the correct bucket
151
- if subset_length == "True":
152
- new_data["Length Bias"].append(subset_data)
153
- elif subset_length == "Neutral":
154
- new_data["Neutral"].append(subset_data)
155
- elif subset_length == "False":
156
- new_data["Terse Bias"].append(subset_data)
157
-
158
- # take average of new_data and add to new_df (removing other columns than model)
159
- for subset in final_subsets:
160
- new_df[subset] = np.nanmean(new_data[subset], axis=0)
161
- keep_columns = ["model"] + final_subsets
162
- new_df = new_df[keep_columns]
163
- # recompute average
164
- # new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
165
-
166
- return new_df
167
-
168
-
169
- rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
170
- # rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
171
- # prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
172
- # prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
173
-
174
- rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by="average", ascending=False)
175
-
176
-
177
  def prep_df(df):
178
  # add column to 0th entry with count (column name itself empty)
179
  df.insert(0, "", range(1, 1 + len(df)))
@@ -191,24 +145,28 @@ def prep_df(df):
191
 
192
  return df
193
 
 
 
194
 
195
  # add count column to all dataframes
196
  rewardbench_data = prep_df(rewardbench_data)
197
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
198
- # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
199
-
200
- # rewardbench_data_length = prep_df(rewardbench_data_length)
201
- # prefs_data = prep_df(prefs_data)
202
 
203
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
204
  col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
205
- # cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
206
- # col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
207
- ## col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
 
 
 
 
208
 
209
  # for showing random samples
210
  eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="test")
211
-
 
 
212
 
213
  def random_sample(r: gr.Request, subset):
214
  if subset is None or subset == []:
@@ -225,8 +183,21 @@ def random_sample(r: gr.Request, subset):
225
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
226
  return markdown_text
227
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
- subsets = eval_set.unique("subset")
 
230
 
231
  color_map = {
232
  "Generative": "#7497db",
@@ -235,7 +206,6 @@ color_map = {
235
  "DPO": "#75809c",
236
  }
237
 
238
-
239
  def color_model_type_column(df, color_map):
240
  """
241
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
@@ -260,7 +230,6 @@ def color_model_type_column(df, color_map):
260
 
261
  return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
262
 
263
-
264
  def regex_table(dataframe, regex, filter_button, style=True):
265
  """
266
  Takes a model name as a regex, then returns only the rows that has that in it.
@@ -321,7 +290,6 @@ def regex_table(dataframe, regex, filter_button, style=True):
321
 
322
  return data
323
 
324
-
325
  # import ipdb; ipdb.set_trace()
326
 
327
  total_models = len(
@@ -334,6 +302,10 @@ assets = Path("leaderboard").resolve() # absolute dir with the image
334
  # Using a string for a predefined color
335
  theme = gr.themes.Default(primary_hue="blue")
336
 
 
 
 
 
337
  with gr.Blocks(theme=theme, css=custom_css) as app:
338
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
339
  with gr.Row():
@@ -396,32 +368,6 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
396
  max_height=800, # 800 px β‰ˆ ~25 rows on default row-height
397
  )
398
 
399
- # removed because the data does not have sub-domains
400
- # with gr.TabItem("Detailed"):
401
- # with gr.Row():
402
- # search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
403
- # model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
404
- # value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
405
- # label="Model Types",
406
- # show_label=False,
407
- # # info="Which model types to include."
408
- # )
409
- # with gr.Row():
410
- # # ref data
411
- # rewardbench_table_detailed_hidden = gr.Dataframe(
412
- # rewardbench_data.values,
413
- # datatype=col_types_rewardbench,
414
- # headers=rewardbench_data.columns.tolist(),
415
- # visible=False,
416
- # )
417
- # rewardbench_table_detailed = gr.Dataframe(
418
- # regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
419
- # datatype=col_types_rewardbench,
420
- # headers=rewardbench_data.columns.tolist(),
421
- # elem_id="rewardbench_dataframe",
422
- # # height=1000,
423
- # )
424
-
425
  with gr.TabItem("About"):
426
  with gr.Row():
427
  gr.Markdown(ABOUT_TEXT_V2)
@@ -431,15 +377,67 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
431
  # loads one sample
432
  gr.Markdown("""## Random Dataset Sample Viewer""")
433
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
434
- button = gr.Button("Show Random Sample")
435
 
436
  with gr.Row():
437
  sample_display = gr.Markdown("{sampled data loads here}")
438
 
439
- button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
440
  with gr.TabItem("RewardBench", scale=1.5):
441
  with gr.Row():
442
  gr.Markdown(CAPTION_V1.format(str(total_models)))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
443
 
444
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
445
  # search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
 
5
  import numpy as np
6
  from datasets import load_dataset
7
  from huggingface_hub import HfApi, snapshot_download
8
+ import pandas as pd
9
 
10
  from leaderboard.constants import example_counts, length_categories, subset_mapping
11
  from leaderboard.css import custom_css
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
15
+ #######################################################
16
+ # Setup #
17
+ #######################################################
18
  api = HfApi()
19
 
20
  COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
21
  evals_repo = "allenai/reward-bench-v2-results"
22
 
23
  eval_set_repo = "allenai/reward-bench-v2-v0"
24
+ eval_set_repo_v1 = "allenai/reward-bench"
 
 
 
 
25
 
26
+ repo_dir_rewardbench = "./evals/rewardbench/"
27
 
28
  print("Pulling evaluation results")
29
  repo = snapshot_download(
 
36
  repo_type="dataset",
37
  )
38
 
39
+ ###########################################
40
+ # Load Data #
41
+ ###########################################
42
 
43
  def avg_over_rewardbench_v2(dataframe_core):
44
  domain_cols = ["chat", "factuality", "safety", "math", "precise if", "ties"]
 
60
 
61
  return new_df
62
 
 
63
  def avg_over_rewardbench(dataframe_core, dataframe_prefs):
64
  """
65
  Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
 
128
  new_df = new_df[keep_columns]
129
  return new_df
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  def prep_df(df):
132
  # add column to 0th entry with count (column name itself empty)
133
  df.insert(0, "", range(1, 1 + len(df)))
 
145
 
146
  return df
147
 
148
+ rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
149
+ rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by="average", ascending=False)
150
 
151
  # add count column to all dataframes
152
  rewardbench_data = prep_df(rewardbench_data)
153
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 
 
 
 
154
 
155
  col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
156
  col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
157
+
158
+ # get v1 data
159
+ rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
160
+
161
+ ###########################################
162
+ # Leaderboard Helpers & Setting #
163
+ ###########################################
164
 
165
  # for showing random samples
166
  eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="test")
167
+ eval_set_v1 = load_dataset(eval_set_repo_v1, use_auth_token=COLLAB_TOKEN, split="filtered")
168
+ subsets = eval_set.unique("subset")
169
+ subsets_v1 = eval_set_v1.unique("subset")
170
 
171
  def random_sample(r: gr.Request, subset):
172
  if subset is None or subset == []:
 
183
  markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
184
  return markdown_text
185
 
186
+ # Duplicating because they use global variables with gradio setup
187
+ def random_sample_v1(r: gr.Request, subset):
188
+ if subset is None or subset == []:
189
+ sample_index = np.random.randint(0, len(eval_set) - 1)
190
+ sample = eval_set[sample_index]
191
+ else: # filter by subsets (can be list)
192
+ if isinstance(subset, str):
193
+ subset = [subset]
194
+ # filter down dataset to only include the subset(s)
195
+ eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
196
+ sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
197
+ sample = eval_set_filtered[sample_index]
198
 
199
+ markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
200
+ return markdown_text
201
 
202
  color_map = {
203
  "Generative": "#7497db",
 
206
  "DPO": "#75809c",
207
  }
208
 
 
209
  def color_model_type_column(df, color_map):
210
  """
211
  Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
 
230
 
231
  return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
232
 
 
233
  def regex_table(dataframe, regex, filter_button, style=True):
234
  """
235
  Takes a model name as a regex, then returns only the rows that has that in it.
 
290
 
291
  return data
292
 
 
293
  # import ipdb; ipdb.set_trace()
294
 
295
  total_models = len(
 
302
  # Using a string for a predefined color
303
  theme = gr.themes.Default(primary_hue="blue")
304
 
305
+ #############################################
306
+ # Gradio App #
307
+ #############################################
308
+
309
  with gr.Blocks(theme=theme, css=custom_css) as app:
310
  # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
311
  with gr.Row():
 
368
  max_height=800, # 800 px β‰ˆ ~25 rows on default row-height
369
  )
370
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
371
  with gr.TabItem("About"):
372
  with gr.Row():
373
  gr.Markdown(ABOUT_TEXT_V2)
 
377
  # loads one sample
378
  gr.Markdown("""## Random Dataset Sample Viewer""")
379
  subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
380
+ button_data = gr.Button("Show Random Sample")
381
 
382
  with gr.Row():
383
  sample_display = gr.Markdown("{sampled data loads here}")
384
 
385
+ button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
386
  with gr.TabItem("RewardBench", scale=1.5):
387
  with gr.Row():
388
  gr.Markdown(CAPTION_V1.format(str(total_models)))
389
+ with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
390
+ with gr.TabItem("Leaderboard"):
391
+ pass
392
+ # with gr.Row():
393
+ # search_1 = gr.Textbox(
394
+ # label="Model Search (delimit with , )",
395
+ # placeholder="Model Search (delimit with , )",
396
+ # show_label=False,
397
+ # )
398
+ # model_types_1 = gr.CheckboxGroup(
399
+ # ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
400
+ # value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
401
+ # label="Model Types",
402
+ # show_label=False,
403
+ # # info="Which model types to include.",
404
+ # )
405
+ # with gr.Row():
406
+ # # reference data
407
+ # rewardbench_table_hidden = gr.Dataframe(
408
+ # rewardbench_data_avg.values,
409
+ # datatype=col_types_rewardbench_avg,
410
+ # headers=rewardbench_data_avg.columns.tolist(),
411
+ # visible=False,
412
+ # )
413
+ # rewardbench_table = gr.Dataframe(
414
+ # regex_table(
415
+ # rewardbench_data_avg.copy(),
416
+ # "",
417
+ # ["Seq. Classifiers", "Custom Classifiers", "Generative"],
418
+ # ),
419
+ # datatype=col_types_rewardbench_avg,
420
+ # headers=rewardbench_data_avg.columns.tolist(),
421
+ # elem_id="rewardbench_dataframe_avg",
422
+ # max_height=800, # 800 px β‰ˆ ~25 rows on default row-height
423
+ # )
424
+ with gr.TabItem("About"):
425
+ with gr.Row():
426
+ gr.Markdown(ABOUT_TEXT_V1)
427
+
428
+ with gr.TabItem("Dataset Viewer"):
429
+ with gr.Row():
430
+ # loads one sample
431
+ gr.Markdown("""## Random Dataset Sample Viewer""")
432
+ subset_selector_v1 = gr.Dropdown(subsets_v1, label="Subset", value=None, multiselect=True)
433
+ button_data_v1 = gr.Button("Show Random Sample")
434
+
435
+ with gr.Row():
436
+ sample_display = gr.Markdown("{sampled data loads here}")
437
+
438
+ button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display])
439
+
440
+
441
 
442
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
443
  # search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
leaderboard/final-rbv1-data.csv ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,Model,Model Type,Score,Chat,Chat Hard,Safety,Reasoning,Prior Sets (0.5 weight)
2
+ 1,"<a target=""_blank"" href=""https://huggingface.co/infly/INF-ORM-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">infly/INF-ORM-Llama3.1-70B</a>",Seq. Classifier,95.10529562974679,96.64804469273743,91.00877192982456,93.64864864864865,99.1157172477765,
3
+ 2,"<a target=""_blank"" href=""https://huggingface.co/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1</a>",Seq. Classifier,94.99413134933042,96.36871508379889,90.78947368421052,93.78378378378379,99.03455284552845,
4
+ 3,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Gemma-2-27B</a>",Seq. Classifier,94.43611331484493,96.64804469273743,90.13157894736842,92.70270270270271,98.26212691657118,
5
+ 4,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B-v0.2</a>",Seq. Classifier,94.26093621016115,96.08938547486034,89.91228070175438,92.97297297297297,98.0691056910569,
6
+ 5,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama-3.1-Nemotron-70B-Reward</a> *",Custom Classifier,94.10897209520822,97.48603351955308,85.74561403508773,95.13513513513513,98.0691056910569,
7
+ 6,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B</a> ⚠️",Seq. Classifier,93.80116450605776,95.81005586592178,91.44736842105263,91.89189189189189,96.05534184536477,
8
+ 7,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-Llama3.1-70B</a> * ⚠️",Generative,93.48032435319458,94.1340782122905,90.13157894736842,93.24324324324324,96.41239700987613,
9
+ 8,"<a target=""_blank"" href=""https://huggingface.co/meta-metrics/MetaMetrics-RM-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-metrics/MetaMetrics-RM-v1.0</a>",Custom Classifier,93.42462545063005,98.32402234636872,86.40350877192982,90.8108108108108,98.16015987341082,
10
+ 9,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-70B</a> ⚠️",Generative,93.30801781900792,96.64804469273743,87.93859649122807,93.10810810810811,95.5373219839581,
11
+ 10,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B-v2</a>",Seq. Classifier,93.13653373860271,96.36871508379889,86.84210526315789,92.56756756756756,96.76774703988652,
12
+ 11,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B-v0.2</a>",Seq. Classifier,93.12997963530022,94.6927374301676,88.37719298245614,92.70270270270271,96.7472854258744,
13
+ 12,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B</a> ⚠️",Seq. Classifier,93.05891420009982,94.41340782122904,89.69298245614036,92.29729729729729,95.83196922573254,
14
+ 13,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3.1-8B</a> ⚠️",Seq. Classifier,92.93773298857982,95.53072625698324,88.15789473684211,91.08108108108108,96.98122987941288,
15
+ 14,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-70B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-70B-Judge-r</a> *",Generative,92.71833683150776,96.92737430167598,84.75877192982456,91.62162162162163,97.56557947290882,
16
+ 15,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-32B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-32B</a>",Generative,92.66088172895866,96.64804469273743,83.33333333333333,91.89189189189189,98.77025699787198,
17
+ 16,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B</a> ⚠️",Seq. Classifier,92.52495013691698,95.81005586592178,87.28070175438596,90.8108108108108,96.19823211654936,
18
+ 17,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1</a>",Generative,92.41086740661206,97.76536312849161,83.99122807017544,92.16216216216216,95.72471626561904,
19
+ 18,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Llama-3.1-8B</a> ⚠️",Custom Classifier,92.23713029788581,93.29608938547486,88.59649122807018,91.08108108108108,95.97485949691712,
20
+ 19,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Nemotron-4-340B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Nemotron-4-340B-Reward</a> *",Custom Classifier,91.9958677606516,95.81005586592178,87.06140350877193,91.48648648648648,93.6255251814263,
21
+ 20,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Llama3-8B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Llama3-8B-rewardmodel-ft</a> ⚠️",Seq. Classifier,91.53526049213252,95.53072625698324,86.1842105263158,90.8108108108108,93.61529437442026,
22
+ 21,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3-8B</a> ⚠️",Seq. Classifier,91.0990919512119,95.81005586592178,81.14035087719299,89.86486486486487,97.581096196868,
23
+ 22,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-OffsetBias-12B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-OffsetBias-12B</a> *",Generative,91.04924182882311,91.89944134078212,86.62280701754386,92.02702702702703,93.64769192993944,
24
+ 23,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3.2-3B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3.2-3B-rewardmodel-ft</a>",Seq. Classifier,90.92295892363056,91.62011173184358,84.86842105263158,92.70270270270271,94.50060020734435,
25
+ 24,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-nemo-12B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-nemo-12B-Judge-r</a> *",Generative,90.26551100385808,97.20670391061452,82.23684210526316,86.48648648648648,95.13201151306815,
26
+ 25,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-20b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-20b-reward</a>",Seq. Classifier,90.15948083664846,98.88268156424581,76.53508771929825,89.45945945945945,95.76069460359032,
27
+ 26,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-VL-Reward-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-VL-Reward-7B</a>",Seq. Classifier,90.07022246172819,89.94413407821229,87.5,91.08108108108108,91.75567468761938,
28
+ 27,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-evaluator-llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-evaluator-llama3.1-70B</a> *",Generative,90.01358317701886,96.92737430167598,85.08771929824562,89.5945945945946,88.44464451355923,
29
+ 28,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3-8B</a>",Seq. Classifier,89.90981543420907,96.92737430167598,78.7280701754386,88.24324324324324,95.74057401647842,
30
+ 29,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-RM-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-RM-8B</a>",Seq. Classifier,89.41975692993036,97.20670391061452,81.79824561403508,86.75675675675676,91.91732143831506,
31
+ 30,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1-Mini-Llama-3.1-8B</a>",Generative,89.12784912886812,93.57541899441341,79.3859649122807,89.25675675675676,94.29325585202162,
32
+ 31,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-8B</a>",Generative,88.95511699074142,93.57541899441341,81.35964912280701,91.08108108108108,89.80431876466416,
33
+ 32,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama3-70B-SteerLM-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama3-70B-SteerLM-RM</a> *",Custom Classifier,88.76963582088416,91.34078212290503,80.26315789473684,92.83783783783784,90.63676542805698,
34
+ 33,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-8B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-8B-Judge-r</a> *",Generative,88.65372403487248,95.53072625698324,77.74122807017544,86.21621621621621,95.12672559611501,
35
+ 34,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-Llama-3-70B</a> *",Generative,88.62795600264494,96.92737430167598,83.99122807017544,91.08108108108108,82.5121405576472,
36
+ 35,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/ArmoRM-Llama3-8B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/ArmoRM-Llama3-8B-v0.1</a>",Custom Classifier,88.60367185781917,96.92737430167598,76.75438596491227,90.54054054054055,97.34715174332952,74.29414161945574
37
+ 36,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-gemma2-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-gemma2-2B-rewardmodel-ft</a>",Seq. Classifier,88.39250002515702,93.01675977653632,77.19298245614036,92.16216216216216,91.19809570578929,
38
+ 37,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0514"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0514</a> *",Generative,88.20069001791948,92.31843575418995,80.59210526315789,87.9054054054054,91.98681364892467,
39
+ 38,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-7B</a>",Generative,88.19099980224239,92.17877094972067,76.53508771929825,87.97297297297297,96.07716756697768,
40
+ 39,"<a target=""_blank"" href=""https://huggingface.co/Cohere May 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere May 2024</a> *",Custom Classifier,88.16038708182192,96.36871508379889,71.2719298245614,92.29729729729729,97.68272221312816,78.20215489882585
41
+ 40,"<a target=""_blank"" href=""https://huggingface.co/google/flame-1.0-24B-july-2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/flame-1.0-24B-july-2024</a> *",Generative,87.80801832232187,92.17877094972067,75.65789473684211,89.5945945945946,93.80081300813008,
42
+ 41,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-7b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-7b-reward</a>",Seq. Classifier,87.59316719911449,99.16201117318435,69.51754385964912,87.16216216216216,94.53095160146232,
43
+ 42,"<a target=""_blank"" href=""https://huggingface.co/ZiyiYe/Con-J-Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ZiyiYe/Con-J-Qwen2-7B</a> ⚠️",Generative,87.12028871485069,91.89944134078212,80.26315789473684,88.24324324324324,88.0753123806406,
44
+ 43,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0924"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0924</a>",Generative,86.78430992050927,94.1340782122905,76.97368421052632,85.8108108108108,90.21866644840945,
45
+ 44,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-08-06"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-08-06</a>",Generative,86.72554986675267,96.08938547486034,76.09649122807018,88.10810810810811,86.60821465597208,
46
+ 45,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/pair-preference-model-LLaMA3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/pair-preference-model-LLaMA3-8B</a>",Custom Classifier,85.74792972712865,98.32402234636872,65.78947368421052,89.72972972972973,94.73420363398264,74.57650875557454
47
+ 46,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-sftreg</a>",Seq. Classifier,85.42084389305319,98.60335195530726,67.76315789473684,89.1891891891892,92.29347410923774,73.08924874053665
48
+ 47,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-32B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-32B-Instruct</a>",Generative,85.22047081369766,98.04469273743017,65.13157894736842,85.27027027027027,92.43534129972173,
49
+ 48,"<a target=""_blank"" href=""https://huggingface.co/Cohere March 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere March 2024</a> *",Custom Classifier,85.10802881361649,94.6927374301676,65.13157894736842,87.70270270270271,98.17073170731707,74.57675774743672
50
+ 49,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-distill"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-distill</a>",Seq. Classifier,84.63918882385776,98.32402234636872,68.42105263157895,86.75675675675676,91.3273449009658,72.09434614337957
51
+ 50,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-rewardmodel-ft</a> ⚠️",Seq. Classifier,84.46827345209587,89.3854748603352,75.21929824561404,84.45945945945945,88.80886124297484,
52
+ 51,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-0125-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-0125-preview</a>",Generative,84.33564801010327,95.25139664804469,74.34210526315789,87.56756756756756,86.9236645386588,70.85136405607162
53
+ 52,"<a target=""_blank"" href=""https://huggingface.co/mattshumer/Reflection-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mattshumer/Reflection-70B</a>",Generative,84.22327632009588,97.48603351955308,70.6140350877193,83.17567567567568,85.61736099743548,
54
+ 53,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-5-sonnet-20240620"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-5-sonnet-20240620</a>",Generative,84.17242041164789,96.36871508379889,74.01315789473684,81.62162162162163,84.68618704643423,
55
+ 54,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo</a>",Generative,84.12067803631126,97.20670391061452,74.56140350877193,77.56756756756756,87.14703715829104,
56
+ 55,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-14B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-14B-Instruct</a>",Generative,84.09022697921793,97.48603351955308,62.280701754385966,83.91891891891892,92.67525372401374,
57
+ 56,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct</a>",Generative,84.05217990917473,97.20670391061452,70.17543859649123,82.83783783783784,85.98873929175534,
58
+ 57,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-8B</a>",Generative,83.96777752436938,92.45810055865921,80.26315789473684,86.75675675675676,76.39309488732471,
59
+ 58,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-turbo-2024-04-09"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-turbo-2024-04-09</a>",Generative,83.95011678629895,95.25139664804469,75.43859649122807,87.56756756756756,82.70345664866045,73.629016365689
60
+ 59,"<a target=""_blank"" href=""https://huggingface.co/sfairXC/FsfairX-LLaMA3-RM-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">sfairXC/FsfairX-LLaMA3-RM-v0.1</a>",Seq. Classifier,83.38339965331156,99.44134078212291,65.13157894736842,86.75675675675676,86.43633709827031,74.91856971076719
61
+ 60,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-05-13"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-05-13</a>",Generative,83.2681071132992,96.64804469273743,70.39473684210526,86.48648648648648,84.86965951874285,72.61510893954863
62
+ 61,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-7B-Instruct</a>",Generative,83.16709323590604,97.76536312849161,60.96491228070175,84.45945945945945,89.47863807497134,
63
+ 62,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-1_8b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-1_8b-reward</a>",Seq. Classifier,82.16733515408055,93.57541899441341,66.2280701754386,81.62162162162163,87.24422982484859,
64
+ 63,"<a target=""_blank"" href=""https://huggingface.co/CIR-AMS/BTRM_Qwen2_7b_0613"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CIR-AMS/BTRM_Qwen2_7b_0613</a>",Seq. Classifier,81.72269085246006,97.48603351955308,57.23684210526316,90.13513513513513,87.74894963714738,70.2902968779431
65
+ 64,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-RM-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-RM-7b</a>",Seq. Classifier,81.58895090730017,98.04469273743017,65.5701754385965,81.35135135135135,86.3251623288045,71.71779445333651
66
+ 65,"<a target=""_blank"" href=""https://huggingface.co/Nexusflow/Starling-RM-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Nexusflow/Starling-RM-34B</a>",Seq. Classifier,81.33351263768401,96.92737430167598,57.23684210526316,87.70270270270271,88.45078299776287,71.36620952434669
67
+ 66,"<a target=""_blank"" href=""https://huggingface.co/google/gemma-2-27b-it"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemma-2-27b-it</a>",Generative,80.89669003773389,94.83240223463687,59.10087719298246,86.35135135135135,83.30212937196487,
68
+ 67,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-flash-001"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-flash-001</a>",Generative,80.5391103484727,92.17877094972067,63.48684210526316,86.95945945945945,85.1162219675888,69.36940417219024
69
+ 68,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-ft</a> ⚠️",Seq. Classifier,80.47843057507436,77.93296089385476,74.78070175438596,85.27027027027027,83.92978938178643,
70
+ 69,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-preference-mix-rm</a>",Seq. Classifier,80.26558812003782,93.57541899441341,68.20175438596492,77.29729729729729,88.50261908659355,67.23611355180205
71
+ 70,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-opus-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-opus-20240229</a>",Generative,80.0759036376447,94.6927374301676,60.30701754385965,86.62162162162163,78.68223795492989,
72
+ 71,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-mini-2024-07-18"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-mini-2024-07-18</a>",Generative,80.06759386119498,94.97206703910615,60.74561403508772,80.8108108108108,83.7418835597752,
73
+ 72,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Mistral-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Mistral-7B</a>",Seq. Classifier,79.8233742639417,96.64804469273743,60.526315789473685,87.02702702702703,77.35615485349484,75.29528365000934
74
+ 73,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Hermes-3-Llama-3.1-70B</a>",Generative,78.47084260833167,96.22905027932961,56.68859649122807,82.29729729729729,78.6684263654717,
75
+ 74,"<a target=""_blank"" href=""https://huggingface.co/hendrydong/Mistral-RM-for-RAFT-GSHF-v0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">hendrydong/Mistral-RM-for-RAFT-GSHF-v0</a>",Seq. Classifier,78.46503174091394,98.32402234636872,57.89473684210526,85.0,74.33602062530693,75.07572604066365
76
+ 75,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo</a>",Generative,78.08002309698713,87.56983240223464,66.8859649122807,75.06756756756756,82.79672750586566,
77
+ 76,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/reward-model-Mistral-7B-instruct-Unified-Feedback"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/reward-model-Mistral-7B-instruct-Unifie...</a>",Seq. Classifier,76.61192139206588,97.76536312849161,50.6578947368421,85.27027027027027,73.88893435914224,74.3423675391006
78
+ 77,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3</a>",DPO,76.52088102568138,97.20670391061452,63.37719298245614,76.35135135135135,72.84129972172205,69.13483329884433
79
+ 78,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-12b-chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-12b-chat</a>",DPO,76.41872322421631,96.64804469273743,55.48245614035088,78.10810810810811,89.44862770775359,48.39403572004667
80
+ 79,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-70B-Instruct</a>",Generative,76.26515082171642,97.62569832402235,58.88157894736842,72.97297297297297,78.53644895509358,70.3529589965331
81
+ 80,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-70b</a>",DPO,76.20735542607979,97.48603351955308,60.526315789473685,84.45945945945945,74.07206580455066,52.778449688644265
82
+ 81,"<a target=""_blank"" href=""https://huggingface.co/gemini-1.5-flash-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">gemini-1.5-flash-8b</a>",Generative,76.00524043227317,94.41340782122904,59.86842105263158,73.98648648648648,75.75264636874557,
83
+ 82,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b</a>",DPO,75.8660587247668,96.92737430167598,61.40350877192982,71.35135135135135,77.55872483221475,68.31261000855747
84
+ 83,"<a target=""_blank"" href=""https://huggingface.co/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-20240229_meta-llama/Llama-3-70b-chat-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...</a>",Generative,75.77705517745792,95.25139664804469,54.05701754385965,80.33783783783784,73.46196868008948,
85
+ 84,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-70b</a>",DPO,74.9612075859509,96.36871508379889,57.45614035087719,74.86486486486487,80.2023653625798,56.86669694931664
86
+ 85,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mistral-7B-DPO</a>",DPO,74.80880493527766,92.17877094972067,60.526315789473685,82.43243243243244,73.75184154526109,55.500522983723165
87
+ 86,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-sonnet-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-sonnet-20240229</a>",Generative,74.57545943180953,93.43575418994413,56.578947368421055,81.6891891891892,69.07005374583947,69.63124589949818
88
+ 87,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai/Mixtral-8x7B-Instruct-v0.1</a>",DPO,74.54632435829336,94.97206703910615,64.03508771929825,72.56756756756756,78.71855731980139,50.330359933093675
89
+ 88,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-8x7b-v2.0</a>",Generative,74.5095375782243,93.01675977653632,47.14912280701754,80.47297297297297,77.39929475637038,
90
+ 89,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-sftreg</a>",Seq. Classifier,74.50927082674883,95.53072625698324,48.68421052631579,79.32432432432432,76.83949909968898,69.82591702611495
91
+ 90,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Gemma-2B</a>",Custom Classifier,74.49128373533642,71.50837988826815,69.73684210526316,81.21621621621621,75.50369673159818,
92
+ 91,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-boost-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-boost-DPO-preview</a>",DPO,74.47914014376505,91.06145251396649,60.96491228070175,71.35135135135135,83.94718175369673,55.6624654944527
93
+ 92,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-uf-rm</a>",Seq. Classifier,73.98314832639727,86.59217877094972,71.71052631578948,70.13513513513513,75.70046925301467,57.571715987797305
94
+ 93,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-alpha</a>",DPO,73.92192687696839,91.62011173184358,62.5,76.62162162162163,75.13982102908277,53.534233127619544
95
+ 94,"<a target=""_blank"" href=""https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">upstage/SOLAR-10.7B-Instruct-v1.0</a>",DPO,73.91132026830088,81.56424581005587,68.64035087719299,85.13513513513513,72.51596005892944,49.49049865208112
96
+ 95,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-13b</a>",DPO,73.68126195691116,95.81005586592178,58.333333333333336,79.45945945945945,73.22972936105201,49.46620157266727
97
+ 96,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-1.5B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-1.5B-Instruct</a>",Generative,73.44238723104029,96.36871508379889,49.23245614035088,78.17567567567568,69.99270202433568,
98
+ 97,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-8b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-8b-uf-mean-rm</a>",Seq. Classifier,73.41574916848018,95.25139664804469,59.21052631578947,61.62162162162162,82.1155262727124,64.3436007999852
99
+ 98,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/starchat2-15b-v0.1</a>",DPO,73.22060109644468,93.85474860335195,55.48245614035088,70.94594594594595,81.58522944289845,55.248649602907626
100
+ 99,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-baseline"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-baseline</a>",Seq. Classifier,72.89758740021966,94.1340782122905,46.92982456140351,78.64864864864865,73.84050853931359,68.97216667866445
101
+ 100,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-haiku-20240307"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-haiku-20240307</a>",Generative,72.89194286431167,92.73743016759776,51.973684210526315,79.52702702702703,70.60194658154636,66.34730980541012
102
+ 101,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-beta</a>",DPO,72.80507814531524,95.25139664804469,62.719298245614034,65.67567567567568,77.89497735581382,52.16300745754066
103
+ 102,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-8b</a>",DPO,72.74751270450155,95.25139664804469,53.50877192982456,66.48648648648648,86.63038140448519,50.973541402832126
104
+ 103,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-DPO-preview</a>",DPO,72.47264404067178,89.3854748603352,57.675438596491226,63.78378378378378,88.54320128771758,53.477999309390405
105
+ 104,"<a target=""_blank"" href=""https://huggingface.co/jondurbin/bagel-dpo-34b-v0.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">jondurbin/bagel-dpo-34b-v0.5</a>",DPO,72.15167952196515,93.85474860335195,55.04385964912281,64.45945945945945,88.8907076990233,44.867564875771365
106
+ 105,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-7b</a>",DPO,72.11611434356087,97.48603351955308,56.14035087719298,75.27027027027027,71.75717520598025,47.737369346054734
107
+ 106,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-7b-v2.0</a>",Generative,72.04295178846496,85.47486033519553,49.12280701754386,77.0945945945946,76.4795452065259,
108
+ 107,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-zephyr-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-zephyr-3b</a>",DPO,71.45809212918405,86.31284916201118,60.08771929824562,74.05405405405405,75.73184372783325,50.74989667836822
109
+ 108,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO</a>",DPO,71.38329552978793,91.62011173184358,60.526315789473685,81.48648648648648,61.26104927156654,52.66173320935087
110
+ 109,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json</a>",Seq. Classifier,71.27478404602779,93.57541899441341,40.78947368421053,79.45945945945945,,
111
+ 110,"<a target=""_blank"" href=""https://huggingface.co/berkeley-nest/Starling-RM-7B-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">berkeley-nest/Starling-RM-7B-alpha</a>",Seq. Classifier,71.13020256724107,98.04469273743017,45.6140350877193,84.45945945945945,57.998444917335085,67.93855870128164
112
+ 111,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-380k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.58403596186601,95.25139664804469,39.473684210526315,77.02702702702703,,
113
+ 112,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI/c4ai-command-r-plus</a>",Generative,70.56998248762835,95.11173184357541,57.56578947368421,59.86486486486486,70.40312789872866,69.23881422694875
114
+ 113,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2660k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.19339171573809,94.97206703910615,37.5,78.10810810810811,,
115
+ 114,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-70b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-70b-uf-mean-rm</a>",Seq. Classifier,70.19307792664753,86.31284916201118,56.14035087719298,60.945945945945944,82.68367708844875,59.57205519263016
116
+ 115,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3420k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.07936854820123,93.85474860335195,38.81578947368421,77.56756756756756,,
117
+ 116,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.03734328271229,94.1340782122905,38.81578947368421,77.16216216216216,,
118
+ 117,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B</a>",Seq. Classifier,69.66957334431098,96.92737430167598,49.780701754385966,57.83783783783784,73.62395645768537,70.68641939562845
119
+ 118,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3040k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.44952818151877,93.85474860335195,37.06140350877193,77.43243243243244,,
120
+ 119,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1900k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.2421964746281,94.41340782122904,35.74561403508772,77.56756756756756,,
121
+ 120,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B-4096"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B-4096</a>",Seq. Classifier,69.22303170109127,94.97206703910615,50.219298245614034,56.08108108108108,75.10912860806461,70.24413536208964
122
+ 121,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-760k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.04502561956252,94.41340782122904,35.96491228070175,76.75675675675676,,
123
+ 122,"<a target=""_blank"" href=""https://huggingface.co/openbmb/UltraRM-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/UltraRM-13b</a>",Seq. Classifier,69.02867919901104,96.36871508379889,55.48245614035088,59.86486486486486,62.44270748076608,72.94062565153789
124
+ 123,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5</a>",Seq. Classifier,69.00517292135855,88.54748603351955,48.68421052631579,63.108108108108105,77.51882468489114,65.32929758655776
125
+ 124,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-7b-kto"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-7b-kto</a>",DPO,68.99912142883106,95.25139664804469,53.728070175438596,60.54054054054054,74.67261417580619,52.606849779819356
126
+ 125,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2280k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.95403268602327,93.85474860335195,37.06140350877193,75.94594594594595,,
127
+ 126,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-14B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-14B-Chat</a>",DPO,68.64045386840729,57.262569832402235,70.17543859649123,71.21621621621621,89.61129753914987,41.23304044714641
128
+ 127,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1140k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.08398077583611,93.01675977653632,35.96491228070175,75.27027027027027,,
129
+ 128,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/LLaMA3-iterative-DPO-final"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/LLaMA3-iterative-DPO-final</a>",DPO,67.82774529803461,83.79888268156425,59.21052631578947,78.64864864864865,61.60650952147105,43.920573347364794
130
+ 129,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-gemma-v0.1</a>",DPO,67.57835885153328,95.81005586592178,49.56140350877193,58.24324324324324,74.63476018988378,51.70630404815817
131
+ 130,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized.json</a>",Seq. Classifier,67.55772237983352,91.34078212290503,39.03508771929825,72.29729729729729,,
132
+ 131,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-7B-Chat</a>",DPO,67.50138253417825,53.63128491620112,69.07894736842105,69.1891891891892,90.41475691602555,42.884086027930344
133
+ 132,"<a target=""_blank"" href=""https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/MiniCPM-2B-dpo-fp32</a>",DPO,67.304776500488,89.10614525139665,49.3421052631579,57.2972972972973,82.33378348884159,49.58432590300511
134
+ 133,"<a target=""_blank"" href=""https://huggingface.co/mightbe/Better-PairRM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mightbe/Better-PairRM</a>",Custom Classifier,67.29754324103595,95.53072625698324,39.25438596491228,82.02702702702703,49.826076280897034,72.40145810968448
135
+ 134,"<a target=""_blank"" href=""https://huggingface.co/allenai/OLMo-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/OLMo-7B-Instruct</a>",DPO,67.27282652187517,89.66480446927375,50.6578947368421,64.86486486486487,71.6763518306324,51.72760689365022
136
+ 135,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-72B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-72B-Chat</a>",DPO,67.23151527906012,62.29050279329609,66.00877192982456,67.56756756756756,85.54352867354177,42.26289558308108
137
+ 136,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0.json</a>",Seq. Classifier,66.54559072450868,93.29608938547486,45.39473684210526,60.945945945945944,,
138
+ 137,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-MoE-A2.7B-Chat</a>",DPO,66.4408456376338,72.90502793296089,63.1578947368421,62.83783783783784,77.40082937742129,45.364430968579995
139
+ 138,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/RewardModel-Mistral-7B-for-DPA-v1</a>",Seq. Classifier,66.33145463112653,87.98882681564245,49.780701754385966,70.67567567567568,59.70835379494734,60.675975598835954
140
+ 139,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-zephyr-1_6b</a>",DPO,65.73535970393974,96.64804469273743,46.71052631578947,60.270270270270274,67.84218639166257,48.67618199453821
141
+ 140,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo</a>",Generative,65.65164437199641,80.72625698324022,49.780701754385966,63.986486486486484,68.11313226387297,
142
+ 141,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-2B</a>",Seq. Classifier,65.48909618129333,94.41340782122904,40.78947368421053,49.86486486486486,76.37399738091341,66.51837812920436
143
+ 142,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-3.5-turbo-0125"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-3.5-turbo-0125</a>",Generative,65.34011575979856,92.17877094972067,44.51754385964912,65.47297297297297,59.12315163420091,65.4761630050997
144
+ 143,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-preference-mix-rm</a>",Seq. Classifier,65.15941759094567,77.37430167597765,59.21052631578947,84.86486486486487,41.37508866699405,60.785195271258935
145
+ 144,"<a target=""_blank"" href=""https://huggingface.co/wenbopan/Faro-Yi-9B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">wenbopan/Faro-Yi-9B-DPO</a>",DPO,64.61094996096162,92.17877094972067,53.07017543859649,55.13513513513514,58.392672013968465,63.945042573813076
146
+ 145,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-8B-Instruct</a>",Generative,64.49786646478918,85.47486033519553,41.55701754385965,67.97297297297297,64.82341627107546,60.82426393689548
147
+ 146,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-ultrafeedback-60k.jsonl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-ultrafeedback-60k.jsonl</a>",Seq. Classifier,64.3955076805709,94.41340782122904,45.39473684210526,53.37837837837838,,
148
+ 147,"<a target=""_blank"" href=""https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-7B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">IDEA-CCNL/Ziya-LLaMA-7B-Reward</a>",Seq. Classifier,63.784551529691385,86.87150837988827,46.05263157894737,64.05405405405405,57.74540295738528,64.61376982667257
149
+ 148,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-reward</a>",Seq. Classifier,63.66172878401215,89.94413407821229,36.40350877192982,60.4054054054054,68.87004146887108,61.70937960727216
150
+ 149,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stable-code-instruct-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stable-code-instruct-3b</a>",DPO,62.1618132126384,57.82122905027933,58.55263157894737,65.54054054054055,75.28271130026737,45.06209397367635
151
+ 150,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1</a>",Seq. Classifier,61.501047673154666,92.45810055865921,37.280701754385966,54.45945945945946,58.55022644186174,68.01245262965921
152
+ 151,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/reward-model-deberta-v3-large-v2</a>",Seq. Classifier,61.25988488574668,89.3854748603352,45.175438596491226,73.37837837837837,38.54968079882141,58.361018703667625
153
+ 152,"<a target=""_blank"" href=""https://huggingface.co/llm-blender/PairRM-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">llm-blender/PairRM-hf</a>",Custom Classifier,60.868838250756006,90.22346368715084,52.19298245614035,47.7027027027027,48.983739837398375,69.61376689001952
154
+ 153,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-cost</a>",Seq. Classifier,59.56778097839703,57.262569832402235,45.6140350877193,76.08108108108108,62.111570360670044,53.97151608182796
155
+ 154,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama13b</a>",DPO,59.52205456101889,84.07821229050279,37.719298245614034,46.486486486486484,70.76683308779397,57.5968308283755
156
+ 155,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama30b</a>",DPO,59.00687538053444,84.35754189944134,40.57017543859649,60.54054054054054,50.75435150324658,58.616659661160035
157
+ 156,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-1.8B-Chat</a>",DPO,58.89567615638699,56.14525139664804,60.30701754385965,48.37837837837838,77.93283134173623,44.53412808623833
158
+ 157,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-7b-nectar-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-7b-nectar-3.8m.json</a>",Seq. Classifier,58.426789771247286,86.31284916201118,26.535087719298247,62.432432432432435,,
159
+ 158,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-cost</a>",Seq. Classifier,57.97567401900532,61.73184357541899,42.324561403508774,73.51351351351352,54.82109728815409,56.999034609857176
160
+ 159,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama30b</a>",DPO,56.18285201407361,69.27374301675978,44.73684210526316,62.83783783783784,47.449118786489876,57.0505846339612
161
+ 160,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia1-4b</a>",DPO,55.809930200702766,68.43575418994413,37.93859649122807,52.567567567567565,64.47488677906914,55.455761750707126
162
+ 161,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia6-9b</a>",DPO,55.6117865296703,77.6536312849162,36.18421052631579,53.648648648648646,54.153707644459004,57.22568255835343
163
+ 162,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia2-8b</a>",DPO,54.96592159422631,75.69832402234637,34.21052631578947,47.432432432432435,62.1572679652971,55.69619287630597
164
+ 163,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-4B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-4B-Chat</a>",DPO,54.77003940637828,38.8268156424581,62.719298245614034,55.67567567567568,66.89344955530092,44.69987641930703
165
+ 164,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama13b</a>",DPO,53.99846978252061,71.22905027932961,42.98245614035088,56.486486486486484,44.013272766955865,56.56369669643977
166
+ 165,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama7b</a>",DPO,53.883046644273705,55.865921787709496,43.64035087719298,45.67567567567568,69.41432040159329,55.754882314120465
167
+ 166,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama7b</a>",DPO,53.036829672694374,57.82122905027933,44.51754385964912,52.027027027027025,56.58147814699623,55.43691088634592
168
+ 167,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-0.5B-Chat</a>",DPO,52.982802188122534,35.47486033519553,62.93859649122807,57.027027027027025,59.83862607082447,46.28699984455265
169
+ 168,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia2-8b</a>",DPO,52.857927047782155,80.72625698324022,33.55263157894737,44.729729729729726,51.34671522889725,55.0106763884103
170
+ 169,"<a target=""_blank"" href=""https://huggingface.co/my_model/"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">my_model/</a>",Seq. Classifier,52.672491797862534,45.53072625698324,55.921052631578945,43.91891891891892,65.319269383969,
171
+ 170,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia6-9b</a>",DPO,52.6326255248281,74.86033519553072,34.21052631578947,51.75675675675676,48.470153325694326,55.09808653591037
172
+ 171,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-nectar-180k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-nectar-180k.json</a>",Seq. Classifier,52.34906620822528,88.26815642458101,28.50877192982456,40.270270270270274,,
173
+ 172,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia1-4b</a>",DPO,52.334628884533196,63.96648044692738,37.280701754385966,50.4054054054054,56.71652479947619,54.27343514840888
174
+ 173,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-xl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-xl</a>",Custom Classifier,51.34535042343637,85.47486033519553,36.8421052631579,37.83783783783784,38.41156490423965,64.97541713006551
175
+ 174,"<a target=""_blank"" href=""https://huggingface.co/SultanR/SmolTulu-1.7b-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SultanR/SmolTulu-1.7b-RM</a>",Seq. Classifier,50.93872947030961,74.30167597765363,44.078947368421055,57.16216216216216,28.212132373001584,
176
+ 175,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia12-0b</a>",DPO,50.52988550561952,74.86033519553072,36.18421052631579,47.567567567567565,41.27175751623288,55.001227939281776
177
+ 176,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/hh_rlhf_rm_open_llama_3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/hh_rlhf_rm_open_llama_3b</a>",Seq. Classifier,50.274817067272814,81.84357541899442,37.280701754385966,41.486486486486484,32.80815190702243,65.63552247167672
178
+ 177,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia12-0b</a>",DPO,50.08791349970499,66.75977653631286,36.40350877192982,54.32432432432432,41.39384514650516,53.02831193920059
179
+ 178,"<a target=""_blank"" href=""https://huggingface.co/random"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">random</a>",,50.0,50.0,50.0,50.0,50.0,50.0
180
+ 179,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-large"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-large</a>",Custom Classifier,49.62050475651485,85.75418994413408,33.1140350877193,37.432432432432435,35.62673923719103,62.72974940567991
181
+ 180,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-uf-rm</a>",Seq. Classifier,48.05551076423311,39.385474860335194,42.324561403508774,55.54054054054054,47.36897746494243,63.26048833944414
182
+ 181,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-reward</a>",Seq. Classifier,47.26664990676508,81.84357541899442,28.728070175438596,37.567567567567565,34.596155944780925,59.929110947322734
leaderboard/retired-app.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import os
3
+ from huggingface_hub import HfApi, snapshot_download
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from datasets import load_dataset
6
+ from src.utils import load_all_data
7
+ from src.md import ABOUT_TEXT, TOP_TEXT
8
+ from src.plt import plot_avg_correlation
9
+ from src.constants import subset_mapping, length_categories, example_counts
10
+ from src.css import custom_css
11
+ import numpy as np
12
+
13
api = HfApi()

# Token with read access to the (private) results and eval-set repos,
# and restart rights on the Space.
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")

# Dataset repo holding the aggregated evaluation results.
evals_repo = "allenai/reward-bench-results"

# Dataset repo holding the raw evaluation prompts (for the random-sample tab).
eval_set_repo = "allenai/reward-bench"

# Local directory the results snapshot is downloaded into.
repo_dir_rewardbench = "./evals/rewardbench/"
20
+
21
def restart_space():
    """Restart the leaderboard Space so it re-pulls fresh evaluation results."""
    api.restart_space(token=COLLAB_TOKEN, repo_id="allenai/reward-bench")
23
+
24
print("Pulling evaluation results")
# Download only the aggregated score files; the bulky per-prompt score
# directories are excluded via ignore_patterns.
repo = snapshot_download(
    local_dir=repo_dir_rewardbench,
    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)
34
+
35
+
36
def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    """
    Collapse per-subset scores into the leaderboard sections plus an overall average.

    Core sections (Chat, Chat Hard, Safety, Reasoning, per `subset_mapping`) are
    prompt-count weighted means of their subsets using `example_counts`.
    The "Prior Sets (0.5 weight)" column is the plain mean over the legacy
    preference test sets (anthropic_helpful, anthropic_hhh, shp, summarize) from
    `dataframe_prefs`, NaN for models without prior-set results.
    The overall "average" weights core sections 2 each and prior sets 1,
    masking NaNs out of the weighted mean.

    Returns:
        DataFrame with columns: model, model_type, average, the core sections,
        and "Prior Sets (0.5 weight)".
    """
    core = dataframe_core.copy()
    prefs = dataframe_prefs.copy()

    # Weighted mean over each section's subsets (weights = number of prompts).
    for section, members in subset_mapping.items():
        member_cols = [c for c in core.columns if c in members]
        member_weights = [example_counts[c] for c in member_cols]
        core[section] = np.average(core[member_cols].values, axis=1, weights=member_weights)

    section_cols = list(subset_mapping.keys())
    core = core[["model", "model_type"] + section_cols]

    # Prior-set score: unweighted mean across the four legacy preference sets.
    pref_cols = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
    prefs["Prior Sets (0.5 weight)"] = np.nanmean(prefs[pref_cols].values, axis=1)

    # Align prior-set scores to the core table by model name (NaN when absent).
    # drop_duplicates keeps the first occurrence, matching the old row-wise lookup.
    prior_by_model = prefs.drop_duplicates(subset="model").set_index("model")["Prior Sets (0.5 weight)"]
    core["Prior Sets (0.5 weight)"] = core["model"].map(prior_by_model)

    # Overall average: core sections weigh 2 each, prior sets 1; NaNs masked.
    score_cols = section_cols + ["Prior Sets (0.5 weight)"]
    scores = core[score_cols].values
    masked = np.ma.masked_array(scores, np.isnan(scores))
    core["average"] = np.ma.average(masked, axis=1, weights=[2, 2, 2, 2, 1]).filled(np.nan)

    # average goes third, before the per-section columns
    return core[["model", "model_type", "average"] + score_cols]
97
+
98
def expand_subsets(dataframe):
    """Placeholder for per-subset expansion of the preference-set results."""
    # TODO: the data/ script must be updated before this can be implemented.
    pass
101
+
102
+
103
def length_bias_check(dataframe):
    """
    Regroup per-subset accuracies into three length-bias buckets.

    Every subset column (everything after model, model_type, average) is routed
    via `length_categories` into "Length Bias" (label "True"), "Neutral", or
    "Terse Bias" (label "False"); each bucket value is the plain mean over its
    subsets' scores.

    Returns:
        DataFrame with columns: model plus the three bucket columns.
    """
    df = dataframe.copy()
    buckets = {"Length Bias": [], "Neutral": [], "Terse Bias": []}
    # length_categories label -> display bucket
    label_to_bucket = {"True": "Length Bias", "Neutral": "Neutral", "False": "Terse Bias"}

    # first three columns are model, model_type, average; the rest are subsets
    for subset in df.columns[3:]:
        bucket = label_to_bucket.get(length_categories[subset])
        if bucket is not None:
            buckets[bucket].append(df[subset].values)

    # average the collected subset columns per bucket
    for bucket, columns in buckets.items():
        df[bucket] = np.nanmean(columns, axis=0)

    return df[["model"] + list(buckets.keys())]
136
+
137
+
138
+
139
# Load raw per-subset scores, derive length-bias buckets, and load prior-set scores.
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)

# Main leaderboard table: section averages + overall score.
rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
145
+
146
def prep_df(df):
    """
    Prepare a leaderboard table for display.

    Inserts an unnamed leading column with the 1-based rank, renames
    model/model_type/average to display names, and collapses any model type
    containing "generative" into the single label "Generative".

    Note: the rank column is inserted on the caller's dataframe in place.
    """
    df.insert(0, '', range(1, 1 + len(df)))

    pretty_names = {"model": "Model", "model_type": "Model Type", "average": "Average"}
    df = df.rename(columns=pretty_names)

    if "Model Type" in df.columns:
        # unify e.g. "generative rm" variants under one display label
        is_generative = df["Model Type"].str.contains("generative", case=False, na=False)
        df.loc[is_generative, "Model Type"] = "Generative"

    return df
162
+
163
# Add rank column and display names to every table shown in the UI.
rewardbench_data = prep_df(rewardbench_data)
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})

# snapshot the averaged leaderboard to disk
rewardbench_data_avg.to_csv("rewardbench_data_avg.csv", index=False)

rewardbench_data_length = prep_df(rewardbench_data_length)
prefs_data = prep_df(prefs_data)

# Gradio datatypes per column: rank, markdown model link, model type, then numeric scores.
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)

# for showing random samples
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
182
def random_sample(r: gr.Request, subset):
    """
    Return one random evaluation example rendered as markdown.

    Args:
        r: gradio request object (unused; required by the event signature).
        subset: None/[] to sample from the whole eval set, or a subset name
            (str) or list of subset names to restrict sampling to.

    Returns:
        Markdown string with one "**key**: value" section per sample field.
    """
    if subset is None or subset == []:
        pool = eval_set
    else:
        # normalize a single subset name to a list, then filter the dataset
        wanted = [subset] if isinstance(subset, str) else subset
        pool = eval_set.filter(lambda x: x["subset"] in wanted)

    # Bug fix: randint's high bound is exclusive, so the previous
    # len(pool) - 1 could never select the final example. Use len(pool).
    sample = pool[np.random.randint(0, len(pool))]

    return "\n\n".join(f"**{key}**:\n\n{value}" for key, value in sample.items())
196
+
197
# all subset names available in the eval set (for the sample-filter dropdown)
subsets = eval_set.unique("subset")

# Background colors used for the "Model Type" column in the tables.
color_map = {
    "Generative": "#7497db",
    "Custom Classifier": "#E8ECF2",
    "Seq. Classifier": "#ffcd75",
    "DPO": "#75809c",
}
205
+ def color_model_type_column(df, color_map):
206
+ """
207
+ Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
208
+
209
+ Parameters:
210
+ df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
211
+ color_map (dict): A dictionary mapping model types to colors.
212
+
213
+ Returns:
214
+ pd.Styler: The styled DataFrame.
215
+ """
216
+ # Function to apply color based on the model type
217
+ def apply_color(val):
218
+ color = color_map.get(val, "default") # Default color if not specified in color_map
219
+ return f'background-color: {color}'
220
+
221
+ # Format for different columns
222
+ format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
223
+ format_dict['Average'] = "{:.2f}"
224
+ format_dict[''] = "{:d}"
225
+
226
+ return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
227
+
228
def regex_table(dataframe, regex, filter_button, style=True):
    """
    Filter and re-rank the leaderboard by model-name regex and model type.

    Args:
        dataframe: prepped leaderboard table ('', Model, Model Type, scores...).
        regex: comma-separated regex patterns; a row is kept when any pattern
            matches its model name (case-insensitive).
        filter_button: list (or str) of model-type labels to keep. When
            "Prior Sets" is absent, the prior-sets column is dropped and Score
            is recomputed as the plain mean of the four core sections.
        style: when True, return a styled table via color_model_type_column.

    Returns:
        Filtered, re-ranked DataFrame (or Styler when style=True).
    """
    # Split the comma-separated patterns and join them into one alternation.
    regex_list = [x.strip() for x in regex.split(",")]
    combined_regex = '|'.join(regex_list)

    # remove internal ai2 training-run rows
    dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]

    update_scores = False
    if isinstance(filter_button, (list, str)):
        if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
            # drop prior sets from display and recompute Score without them below
            update_scores = True
            dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
        if "Seq. Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
        if "DPO" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
        if "Custom Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
        if "Generative" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]

    # Keep rows whose model name matches any pattern. Copy explicitly: the
    # original assigned into this slice, triggering SettingWithCopyWarning
    # and risking silent no-op writes on a view.
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()

    if update_scores:
        # without prior sets, the score is the unweighted mean of the core sections
        data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4

    data = data.sort_values(by='Score', ascending=False)
    data.reset_index(drop=True, inplace=True)

    # re-rank after filtering
    data[''] = np.arange(1, 1 + len(data))

    # Score to 2 decimals, Average to 1, every other numeric column to 1.
    if "Score" in data.columns:
        data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    for col in data.columns:
        if col not in ["", "Model", "Model Type", "Score", "Average"]:
            # empty strings stand in for missing scores; make them NaN first
            data[col] = data[col].replace('', np.nan)
            data[col] = np.round(np.array(data[col].values).astype(float), 1)

    if style:
        data = color_model_type_column(data, color_map)

    return data
287
+
288
# Count of public models across every model type (prior sets excluded).
total_models = len(
    regex_table(
        rewardbench_data_avg.copy(),
        "",
        ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
        style=False,
    ).values
)
291
+
292
# Gradio UI layout: a header row (title text + logo), then tabs for
#   1) the averaged RewardBench leaderboard,
#   2) the per-subset detailed table,
#   3) prior preference test sets,
#   4) an About page, and
#   5) a random dataset-sample viewer.
# Each visible table is paired with a hidden, unfiltered Dataframe that serves
# as the source the search boxes / model-type checkboxes re-filter through
# regex_table (wired up at the bottom of this block).
with gr.Blocks(css=custom_css) as app:
    # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
    with gr.Row():
        with gr.Column(scale=6):
            # Header markdown includes the live model count computed above.
            gr.Markdown(TOP_TEXT.format(str(total_models)))
        with gr.Column(scale=4):
            # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
            # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
            # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
            # Logo is served from src/, which app.launch() whitelists via allowed_paths.
            gr.Markdown("""
![](file/src/logo.png)
""")
    # NOTE(review): tab/accordion labels below contain what looks like mojibake
    # ("πŸ†", "πŸ”") — presumably UTF-8 emoji decoded as latin-1; confirm the
    # intended emoji before shipping. Left byte-for-byte since they are
    # runtime strings.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("πŸ† RewardBench Leaderboard"):
            with gr.Row():
                # Comma-delimited regex search over model names.
                search_1 = gr.Textbox(label="Model Search (delimit with , )",
                                      placeholder="Model Search (delimit with , )",
                                      show_label=False)
                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                # reference data: hidden unfiltered copy that the change
                # handlers below re-filter from.
                rewardbench_table_hidden = gr.Dataframe(
                    rewardbench_data_avg.values,
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    visible=False,
                )
                rewardbench_table = gr.Dataframe(
                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]),
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    elem_id="rewardbench_dataframe_avg",
                    height=1000,
                )

        with gr.TabItem("πŸ” RewardBench - Detailed"):
            with gr.Row():
                search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include."
                                                 )
            with gr.Row():
                # ref data: hidden unfiltered copy (same pattern as above).
                rewardbench_table_detailed_hidden = gr.Dataframe(
                    rewardbench_data.values,
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    visible=False,
                )
                rewardbench_table_detailed = gr.Dataframe(
                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    elem_id="rewardbench_dataframe",
                    height=1000,
                )
        # with gr.TabItem("rewardbench Eval Set - Length Bias"):
        #     with gr.Row():
        #         # backup
        #         rewardbench_table_len_hidden = gr.Dataframe(
        #             rewardbench_data_length.values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             visible=False,
        #         )
        #         rewardbench_table_len = gr.Dataframe(
        #             regex_table(rewardbench_data_length.copy(), "", False).values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             elem_id="rewardbench_dataframe_length",
        #             height=1000,
        #         )
        with gr.TabItem("Prior Test Sets"):
            with gr.Row():
                search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                # backup: hidden unfiltered copy for re-filtering.
                pref_sets_table_hidden = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    visible=False,
                )
                pref_sets_table = gr.Dataframe(
                    regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]),
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )

        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("""## Random Dataset Sample Viewer
Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            # random_sample is defined elsewhere in this file; it renders one
            # sample from the selected subset(s) as markdown.
            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
    # removed plot because not pretty enough
    # with gr.TabItem("Model Correlation"):
    #     with gr.Row():
    #         plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
    #         gr.Plot(plot)

    # Wire every search box and model-type filter to regex_table; the hidden
    # tables are the inputs so filters always apply to the full data, not to
    # an already-filtered view.
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
433
+
434
+ with gr.Row():
435
+ with gr.Accordion("πŸ“š Citation", open=False):
436
+ citation_button = gr.Textbox(
437
+ value=r"""@misc{RewardBench,
438
+ title={RewardBench: Evaluating Reward Models for Language Modeling},
439
+ author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
440
+ year={2024},
441
+ howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}
442
+ }""",
443
+ lines=7,
444
+ label="Copy the following to cite these results.",
445
+ elem_id="citation-button",
446
+ show_copy_button=True,
447
+ )
448
# Load data when app starts, TODO make this used somewhere...
# def load_data_on_start():
#     data_rewardbench = load_all_data(repo_dir_rewardbench)
#     rewardbench_table.update(data_rewardbench)

#     data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
#     rewardbench_table.update(data_rewardbench_avg)

#     data_prefs = load_all_data(repo_dir_prefs)
#     pref_sets_table.update(data_prefs)

# Periodically restart the Space so the snapshot_download at module import
# re-pulls freshly pushed eval results (restart_space is defined near the top
# of this file).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restarted every 3h
scheduler.start()
# allowed_paths lets Gradio serve the logo image referenced in the header markdown.
app.launch(allowed_paths=['src/'])  # had .queue() before launch before... not sure if that's necessary