Spaces:
Running
Running
Commit
Β·
f460af4
1
Parent(s):
96e55d5
init v1 port
Browse files- app.py +93 -95
- leaderboard/final-rbv1-data.csv +182 -0
- leaderboard/retired-app.py +462 -0
app.py
CHANGED
@@ -5,24 +5,25 @@ import gradio as gr
|
|
5 |
import numpy as np
|
6 |
from datasets import load_dataset
|
7 |
from huggingface_hub import HfApi, snapshot_download
|
|
|
8 |
|
9 |
from leaderboard.constants import example_counts, length_categories, subset_mapping
|
10 |
from leaderboard.css import custom_css
|
11 |
from leaderboard.md import *
|
12 |
from leaderboard.utils import load_all_data
|
13 |
|
|
|
|
|
|
|
14 |
api = HfApi()
|
15 |
|
16 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
17 |
evals_repo = "allenai/reward-bench-v2-results"
|
18 |
|
19 |
eval_set_repo = "allenai/reward-bench-v2-v0"
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
def restart_space():
|
24 |
-
api.restart_space(repo_id="allenai/reward-bench-v2", token=COLLAB_TOKEN)
|
25 |
|
|
|
26 |
|
27 |
print("Pulling evaluation results")
|
28 |
repo = snapshot_download(
|
@@ -35,6 +36,9 @@ repo = snapshot_download(
|
|
35 |
repo_type="dataset",
|
36 |
)
|
37 |
|
|
|
|
|
|
|
38 |
|
39 |
def avg_over_rewardbench_v2(dataframe_core):
|
40 |
domain_cols = ["chat", "factuality", "safety", "math", "precise if", "ties"]
|
@@ -56,7 +60,6 @@ def avg_over_rewardbench_v2(dataframe_core):
|
|
56 |
|
57 |
return new_df
|
58 |
|
59 |
-
|
60 |
def avg_over_rewardbench(dataframe_core, dataframe_prefs):
|
61 |
"""
|
62 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
@@ -125,55 +128,6 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
|
|
125 |
new_df = new_df[keep_columns]
|
126 |
return new_df
|
127 |
|
128 |
-
|
129 |
-
def expand_subsets(dataframe):
|
130 |
-
# TODO need to modify data/ script to do this
|
131 |
-
pass
|
132 |
-
|
133 |
-
|
134 |
-
def length_bias_check(dataframe):
|
135 |
-
"""
|
136 |
-
Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
|
137 |
-
Then, take the average of the three buckets as "average"
|
138 |
-
"""
|
139 |
-
new_df = dataframe.copy()
|
140 |
-
existing_subsets = new_df.columns[3:] # model, model_type, average
|
141 |
-
final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
|
142 |
-
# new data is empty list dict for each final subset
|
143 |
-
new_data = {s: [] for s in final_subsets}
|
144 |
-
|
145 |
-
# now, subsets correspond to those with True, Nuetral, and False length bias
|
146 |
-
# check if length_categories[subset] == "True" or "False" or "Neutral"
|
147 |
-
for subset in existing_subsets:
|
148 |
-
subset_data = new_df[subset].values
|
149 |
-
subset_length = length_categories[subset]
|
150 |
-
# route to the correct bucket
|
151 |
-
if subset_length == "True":
|
152 |
-
new_data["Length Bias"].append(subset_data)
|
153 |
-
elif subset_length == "Neutral":
|
154 |
-
new_data["Neutral"].append(subset_data)
|
155 |
-
elif subset_length == "False":
|
156 |
-
new_data["Terse Bias"].append(subset_data)
|
157 |
-
|
158 |
-
# take average of new_data and add to new_df (removing other columns than model)
|
159 |
-
for subset in final_subsets:
|
160 |
-
new_df[subset] = np.nanmean(new_data[subset], axis=0)
|
161 |
-
keep_columns = ["model"] + final_subsets
|
162 |
-
new_df = new_df[keep_columns]
|
163 |
-
# recompute average
|
164 |
-
# new_df["average"] = np.round(np.nanmean(new_df[final_subsets].values, axis=1), 2)
|
165 |
-
|
166 |
-
return new_df
|
167 |
-
|
168 |
-
|
169 |
-
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
|
170 |
-
# rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
|
171 |
-
# prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
|
172 |
-
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)
|
173 |
-
|
174 |
-
rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by="average", ascending=False)
|
175 |
-
|
176 |
-
|
177 |
def prep_df(df):
|
178 |
# add column to 0th entry with count (column name itself empty)
|
179 |
df.insert(0, "", range(1, 1 + len(df)))
|
@@ -191,24 +145,28 @@ def prep_df(df):
|
|
191 |
|
192 |
return df
|
193 |
|
|
|
|
|
194 |
|
195 |
# add count column to all dataframes
|
196 |
rewardbench_data = prep_df(rewardbench_data)
|
197 |
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
|
198 |
-
# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
|
199 |
-
|
200 |
-
# rewardbench_data_length = prep_df(rewardbench_data_length)
|
201 |
-
# prefs_data = prep_df(prefs_data)
|
202 |
|
203 |
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
|
204 |
col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
|
205 |
-
|
206 |
-
#
|
207 |
-
|
|
|
|
|
|
|
|
|
208 |
|
209 |
# for showing random samples
|
210 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="test")
|
211 |
-
|
|
|
|
|
212 |
|
213 |
def random_sample(r: gr.Request, subset):
|
214 |
if subset is None or subset == []:
|
@@ -225,8 +183,21 @@ def random_sample(r: gr.Request, subset):
|
|
225 |
markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
|
226 |
return markdown_text
|
227 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
-
|
|
|
230 |
|
231 |
color_map = {
|
232 |
"Generative": "#7497db",
|
@@ -235,7 +206,6 @@ color_map = {
|
|
235 |
"DPO": "#75809c",
|
236 |
}
|
237 |
|
238 |
-
|
239 |
def color_model_type_column(df, color_map):
|
240 |
"""
|
241 |
Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
|
@@ -260,7 +230,6 @@ def color_model_type_column(df, color_map):
|
|
260 |
|
261 |
return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
|
262 |
|
263 |
-
|
264 |
def regex_table(dataframe, regex, filter_button, style=True):
|
265 |
"""
|
266 |
Takes a model name as a regex, then returns only the rows that has that in it.
|
@@ -321,7 +290,6 @@ def regex_table(dataframe, regex, filter_button, style=True):
|
|
321 |
|
322 |
return data
|
323 |
|
324 |
-
|
325 |
# import ipdb; ipdb.set_trace()
|
326 |
|
327 |
total_models = len(
|
@@ -334,6 +302,10 @@ assets = Path("leaderboard").resolve() # absolute dir with the image
|
|
334 |
# Using a string for a predefined color
|
335 |
theme = gr.themes.Default(primary_hue="blue")
|
336 |
|
|
|
|
|
|
|
|
|
337 |
with gr.Blocks(theme=theme, css=custom_css) as app:
|
338 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
339 |
with gr.Row():
|
@@ -396,32 +368,6 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
|
|
396 |
max_height=800, # 800 px β ~25 rows on default row-height
|
397 |
)
|
398 |
|
399 |
-
# removed because the data does not have sub-domains
|
400 |
-
# with gr.TabItem("Detailed"):
|
401 |
-
# with gr.Row():
|
402 |
-
# search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
|
403 |
-
# model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
|
404 |
-
# value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
|
405 |
-
# label="Model Types",
|
406 |
-
# show_label=False,
|
407 |
-
# # info="Which model types to include."
|
408 |
-
# )
|
409 |
-
# with gr.Row():
|
410 |
-
# # ref data
|
411 |
-
# rewardbench_table_detailed_hidden = gr.Dataframe(
|
412 |
-
# rewardbench_data.values,
|
413 |
-
# datatype=col_types_rewardbench,
|
414 |
-
# headers=rewardbench_data.columns.tolist(),
|
415 |
-
# visible=False,
|
416 |
-
# )
|
417 |
-
# rewardbench_table_detailed = gr.Dataframe(
|
418 |
-
# regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
|
419 |
-
# datatype=col_types_rewardbench,
|
420 |
-
# headers=rewardbench_data.columns.tolist(),
|
421 |
-
# elem_id="rewardbench_dataframe",
|
422 |
-
# # height=1000,
|
423 |
-
# )
|
424 |
-
|
425 |
with gr.TabItem("About"):
|
426 |
with gr.Row():
|
427 |
gr.Markdown(ABOUT_TEXT_V2)
|
@@ -431,15 +377,67 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
|
|
431 |
# loads one sample
|
432 |
gr.Markdown("""## Random Dataset Sample Viewer""")
|
433 |
subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
|
434 |
-
|
435 |
|
436 |
with gr.Row():
|
437 |
sample_display = gr.Markdown("{sampled data loads here}")
|
438 |
|
439 |
-
|
440 |
with gr.TabItem("RewardBench", scale=1.5):
|
441 |
with gr.Row():
|
442 |
gr.Markdown(CAPTION_V1.format(str(total_models)))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
443 |
|
444 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
445 |
# search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
|
|
5 |
import numpy as np
|
6 |
from datasets import load_dataset
|
7 |
from huggingface_hub import HfApi, snapshot_download
|
8 |
+
import pandas as pd
|
9 |
|
10 |
from leaderboard.constants import example_counts, length_categories, subset_mapping
|
11 |
from leaderboard.css import custom_css
|
12 |
from leaderboard.md import *
|
13 |
from leaderboard.utils import load_all_data
|
14 |
|
15 |
+
#######################################################
|
16 |
+
# Setup #
|
17 |
+
#######################################################
|
18 |
api = HfApi()
|
19 |
|
20 |
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
|
21 |
evals_repo = "allenai/reward-bench-v2-results"
|
22 |
|
23 |
eval_set_repo = "allenai/reward-bench-v2-v0"
|
24 |
+
eval_set_repo_v1 = "allenai/reward-bench"
|
|
|
|
|
|
|
|
|
25 |
|
26 |
+
repo_dir_rewardbench = "./evals/rewardbench/"
|
27 |
|
28 |
print("Pulling evaluation results")
|
29 |
repo = snapshot_download(
|
|
|
36 |
repo_type="dataset",
|
37 |
)
|
38 |
|
39 |
+
###########################################
|
40 |
+
# Load Data #
|
41 |
+
###########################################
|
42 |
|
43 |
def avg_over_rewardbench_v2(dataframe_core):
|
44 |
domain_cols = ["chat", "factuality", "safety", "math", "precise if", "ties"]
|
|
|
60 |
|
61 |
return new_df
|
62 |
|
|
|
63 |
def avg_over_rewardbench(dataframe_core, dataframe_prefs):
|
64 |
"""
|
65 |
Averages over the subsets alpacaeval, mt-bench, llmbar, refusals, hep and returns dataframe with only these columns.
|
|
|
128 |
new_df = new_df[keep_columns]
|
129 |
return new_df
|
130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
131 |
def prep_df(df):
|
132 |
# add column to 0th entry with count (column name itself empty)
|
133 |
df.insert(0, "", range(1, 1 + len(df)))
|
|
|
145 |
|
146 |
return df
|
147 |
|
148 |
+
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by="average", ascending=False)
|
149 |
+
rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by="average", ascending=False)
|
150 |
|
151 |
# add count column to all dataframes
|
152 |
rewardbench_data = prep_df(rewardbench_data)
|
153 |
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
|
|
|
|
|
|
|
|
|
154 |
|
155 |
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
|
156 |
col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
|
157 |
+
|
158 |
+
# get v1 data
|
159 |
+
rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
|
160 |
+
|
161 |
+
###########################################
|
162 |
+
# Leaderboard Helpers & Setting #
|
163 |
+
###########################################
|
164 |
|
165 |
# for showing random samples
|
166 |
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="test")
|
167 |
+
eval_set_v1 = load_dataset(eval_set_repo_v1, use_auth_token=COLLAB_TOKEN, split="filtered")
|
168 |
+
subsets = eval_set.unique("subset")
|
169 |
+
subsets_v1 = eval_set_v1.unique("subset")
|
170 |
|
171 |
def random_sample(r: gr.Request, subset):
|
172 |
if subset is None or subset == []:
|
|
|
183 |
markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
|
184 |
return markdown_text
|
185 |
|
186 |
+
# Duplicating because they use global variables with gradio setup
|
187 |
+
def random_sample_v1(r: gr.Request, subset):
|
188 |
+
if subset is None or subset == []:
|
189 |
+
sample_index = np.random.randint(0, len(eval_set) - 1)
|
190 |
+
sample = eval_set[sample_index]
|
191 |
+
else: # filter by subsets (can be list)
|
192 |
+
if isinstance(subset, str):
|
193 |
+
subset = [subset]
|
194 |
+
# filter down dataset to only include the subset(s)
|
195 |
+
eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
|
196 |
+
sample_index = np.random.randint(0, len(eval_set_filtered) - 1)
|
197 |
+
sample = eval_set_filtered[sample_index]
|
198 |
|
199 |
+
markdown_text = "\n\n".join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
|
200 |
+
return markdown_text
|
201 |
|
202 |
color_map = {
|
203 |
"Generative": "#7497db",
|
|
|
206 |
"DPO": "#75809c",
|
207 |
}
|
208 |
|
|
|
209 |
def color_model_type_column(df, color_map):
|
210 |
"""
|
211 |
Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
|
|
|
230 |
|
231 |
return df.style.applymap(apply_color, subset=["Model Type"]).format(format_dict, na_rep="")
|
232 |
|
|
|
233 |
def regex_table(dataframe, regex, filter_button, style=True):
|
234 |
"""
|
235 |
Takes a model name as a regex, then returns only the rows that has that in it.
|
|
|
290 |
|
291 |
return data
|
292 |
|
|
|
293 |
# import ipdb; ipdb.set_trace()
|
294 |
|
295 |
total_models = len(
|
|
|
302 |
# Using a string for a predefined color
|
303 |
theme = gr.themes.Default(primary_hue="blue")
|
304 |
|
305 |
+
#############################################
|
306 |
+
# Gradio App #
|
307 |
+
#############################################
|
308 |
+
|
309 |
with gr.Blocks(theme=theme, css=custom_css) as app:
|
310 |
# create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
|
311 |
with gr.Row():
|
|
|
368 |
max_height=800, # 800 px β ~25 rows on default row-height
|
369 |
)
|
370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
371 |
with gr.TabItem("About"):
|
372 |
with gr.Row():
|
373 |
gr.Markdown(ABOUT_TEXT_V2)
|
|
|
377 |
# loads one sample
|
378 |
gr.Markdown("""## Random Dataset Sample Viewer""")
|
379 |
subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
|
380 |
+
button_data = gr.Button("Show Random Sample")
|
381 |
|
382 |
with gr.Row():
|
383 |
sample_display = gr.Markdown("{sampled data loads here}")
|
384 |
|
385 |
+
button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
|
386 |
with gr.TabItem("RewardBench", scale=1.5):
|
387 |
with gr.Row():
|
388 |
gr.Markdown(CAPTION_V1.format(str(total_models)))
|
389 |
+
with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
|
390 |
+
with gr.TabItem("Leaderboard"):
|
391 |
+
pass
|
392 |
+
# with gr.Row():
|
393 |
+
# search_1 = gr.Textbox(
|
394 |
+
# label="Model Search (delimit with , )",
|
395 |
+
# placeholder="Model Search (delimit with , )",
|
396 |
+
# show_label=False,
|
397 |
+
# )
|
398 |
+
# model_types_1 = gr.CheckboxGroup(
|
399 |
+
# ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
|
400 |
+
# value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
|
401 |
+
# label="Model Types",
|
402 |
+
# show_label=False,
|
403 |
+
# # info="Which model types to include.",
|
404 |
+
# )
|
405 |
+
# with gr.Row():
|
406 |
+
# # reference data
|
407 |
+
# rewardbench_table_hidden = gr.Dataframe(
|
408 |
+
# rewardbench_data_avg.values,
|
409 |
+
# datatype=col_types_rewardbench_avg,
|
410 |
+
# headers=rewardbench_data_avg.columns.tolist(),
|
411 |
+
# visible=False,
|
412 |
+
# )
|
413 |
+
# rewardbench_table = gr.Dataframe(
|
414 |
+
# regex_table(
|
415 |
+
# rewardbench_data_avg.copy(),
|
416 |
+
# "",
|
417 |
+
# ["Seq. Classifiers", "Custom Classifiers", "Generative"],
|
418 |
+
# ),
|
419 |
+
# datatype=col_types_rewardbench_avg,
|
420 |
+
# headers=rewardbench_data_avg.columns.tolist(),
|
421 |
+
# elem_id="rewardbench_dataframe_avg",
|
422 |
+
# max_height=800, # 800 px β ~25 rows on default row-height
|
423 |
+
# )
|
424 |
+
with gr.TabItem("About"):
|
425 |
+
with gr.Row():
|
426 |
+
gr.Markdown(ABOUT_TEXT_V1)
|
427 |
+
|
428 |
+
with gr.TabItem("Dataset Viewer"):
|
429 |
+
with gr.Row():
|
430 |
+
# loads one sample
|
431 |
+
gr.Markdown("""## Random Dataset Sample Viewer""")
|
432 |
+
subset_selector_v1 = gr.Dropdown(subsets_v1, label="Subset", value=None, multiselect=True)
|
433 |
+
button_data_v1 = gr.Button("Show Random Sample")
|
434 |
+
|
435 |
+
with gr.Row():
|
436 |
+
sample_display = gr.Markdown("{sampled data loads here}")
|
437 |
+
|
438 |
+
button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display])
|
439 |
+
|
440 |
+
|
441 |
|
442 |
search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
|
443 |
# search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
|
leaderboard/final-rbv1-data.csv
ADDED
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,Model,Model Type,Score,Chat,Chat Hard,Safety,Reasoning,Prior Sets (0.5 weight)
|
2 |
+
1,"<a target=""_blank"" href=""https://huggingface.co/infly/INF-ORM-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">infly/INF-ORM-Llama3.1-70B</a>",Seq. Classifier,95.10529562974679,96.64804469273743,91.00877192982456,93.64864864864865,99.1157172477765,
|
3 |
+
2,"<a target=""_blank"" href=""https://huggingface.co/ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ShikaiChen/LDL-Reward-Gemma-2-27B-v0.1</a>",Seq. Classifier,94.99413134933042,96.36871508379889,90.78947368421052,93.78378378378379,99.03455284552845,
|
4 |
+
3,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Gemma-2-27B</a>",Seq. Classifier,94.43611331484493,96.64804469273743,90.13157894736842,92.70270270270271,98.26212691657118,
|
5 |
+
4,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B-v0.2</a>",Seq. Classifier,94.26093621016115,96.08938547486034,89.91228070175438,92.97297297297297,98.0691056910569,
|
6 |
+
5,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama-3.1-Nemotron-70B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama-3.1-Nemotron-70B-Reward</a> *",Custom Classifier,94.10897209520822,97.48603351955308,85.74561403508773,95.13513513513513,98.0691056910569,
|
7 |
+
6,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Gemma-2-27B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Gemma-2-27B</a> β οΈ",Seq. Classifier,93.80116450605776,95.81005586592178,91.44736842105263,91.89189189189189,96.05534184536477,
|
8 |
+
7,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-Llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-Llama3.1-70B</a> * β οΈ",Generative,93.48032435319458,94.1340782122905,90.13157894736842,93.24324324324324,96.41239700987613,
|
9 |
+
8,"<a target=""_blank"" href=""https://huggingface.co/meta-metrics/MetaMetrics-RM-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-metrics/MetaMetrics-RM-v1.0</a>",Custom Classifier,93.42462545063005,98.32402234636872,86.40350877192982,90.8108108108108,98.16015987341082,
|
10 |
+
9,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-70B</a> β οΈ",Generative,93.30801781900792,96.64804469273743,87.93859649122807,93.10810810810811,95.5373219839581,
|
11 |
+
10,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B-v2</a>",Seq. Classifier,93.13653373860271,96.36871508379889,86.84210526315789,92.56756756756756,96.76774703988652,
|
12 |
+
11,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B-v0.2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B-v0.2</a>",Seq. Classifier,93.12997963530022,94.6927374301676,88.37719298245614,92.70270270270271,96.7472854258744,
|
13 |
+
12,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3.1-8B</a> β οΈ",Seq. Classifier,93.05891420009982,94.41340782122904,89.69298245614036,92.29729729729729,95.83196922573254,
|
14 |
+
13,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3.1-8B</a> β οΈ",Seq. Classifier,92.93773298857982,95.53072625698324,88.15789473684211,91.08108108108108,96.98122987941288,
|
15 |
+
14,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-70B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-70B-Judge-r</a> *",Generative,92.71833683150776,96.92737430167598,84.75877192982456,91.62162162162163,97.56557947290882,
|
16 |
+
15,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-32B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-32B</a>",Generative,92.66088172895866,96.64804469273743,83.33333333333333,91.89189189189189,98.77025699787198,
|
17 |
+
16,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Reward-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Reward-Llama-3.1-8B</a> β οΈ",Seq. Classifier,92.52495013691698,95.81005586592178,87.28070175438596,90.8108108108108,96.19823211654936,
|
18 |
+
17,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1</a>",Generative,92.41086740661206,97.76536312849161,83.99122807017544,92.16216216216216,95.72471626561904,
|
19 |
+
18,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Llama-3.1-8B</a> β οΈ",Custom Classifier,92.23713029788581,93.29608938547486,88.59649122807018,91.08108108108108,95.97485949691712,
|
20 |
+
19,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Nemotron-4-340B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Nemotron-4-340B-Reward</a> *",Custom Classifier,91.9958677606516,95.81005586592178,87.06140350877193,91.48648648648648,93.6255251814263,
|
21 |
+
20,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Llama3-8B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Llama3-8B-rewardmodel-ft</a> β οΈ",Seq. Classifier,91.53526049213252,95.53072625698324,86.1842105263158,90.8108108108108,93.61529437442026,
|
22 |
+
21,"<a target=""_blank"" href=""https://huggingface.co/nicolinho/QRM-Llama3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nicolinho/QRM-Llama3-8B</a> β οΈ",Seq. Classifier,91.0990919512119,95.81005586592178,81.14035087719299,89.86486486486487,97.581096196868,
|
23 |
+
22,"<a target=""_blank"" href=""https://huggingface.co/SF-Foundation/TextEval-OffsetBias-12B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SF-Foundation/TextEval-OffsetBias-12B</a> *",Generative,91.04924182882311,91.89944134078212,86.62280701754386,92.02702702702703,93.64769192993944,
|
24 |
+
23,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3.2-3B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3.2-3B-rewardmodel-ft</a>",Seq. Classifier,90.92295892363056,91.62011173184358,84.86842105263158,92.70270270270271,94.50060020734435,
|
25 |
+
24,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-nemo-12B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-nemo-12B-Judge-r</a> *",Generative,90.26551100385808,97.20670391061452,82.23684210526316,86.48648648648648,95.13201151306815,
|
26 |
+
25,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-20b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-20b-reward</a>",Seq. Classifier,90.15948083664846,98.88268156424581,76.53508771929825,89.45945945945945,95.76069460359032,
|
27 |
+
26,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-VL-Reward-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-VL-Reward-7B</a>",Seq. Classifier,90.07022246172819,89.94413407821229,87.5,91.08108108108108,91.75567468761938,
|
28 |
+
27,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-evaluator-llama3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-evaluator-llama3.1-70B</a> *",Generative,90.01358317701886,96.92737430167598,85.08771929824562,89.5945945945946,88.44464451355923,
|
29 |
+
28,"<a target=""_blank"" href=""https://huggingface.co/LxzGordon/URM-LLaMa-3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">LxzGordon/URM-LLaMa-3-8B</a>",Seq. Classifier,89.90981543420907,96.92737430167598,78.7280701754386,88.24324324324324,95.74057401647842,
|
30 |
+
29,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-RM-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-RM-8B</a>",Seq. Classifier,89.41975692993036,97.20670391061452,81.79824561403508,86.75675675675676,91.91732143831506,
|
31 |
+
30,"<a target=""_blank"" href=""https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">AtlaAI/Selene-1-Mini-Llama-3.1-8B</a>",Generative,89.12784912886812,93.57541899441341,79.3859649122807,89.25675675675676,94.29325585202162,
|
32 |
+
31,"<a target=""_blank"" href=""https://huggingface.co/Skywork/Skywork-Critic-Llama-3.1-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Skywork/Skywork-Critic-Llama-3.1-8B</a>",Generative,88.95511699074142,93.57541899441341,81.35964912280701,91.08108108108108,89.80431876466416,
|
33 |
+
32,"<a target=""_blank"" href=""https://huggingface.co/nvidia/Llama3-70B-SteerLM-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">nvidia/Llama3-70B-SteerLM-RM</a> *",Custom Classifier,88.76963582088416,91.34078212290503,80.26315789473684,92.83783783783784,90.63676542805698,
|
34 |
+
33,"<a target=""_blank"" href=""https://huggingface.co/Salesforce/SFR-LLaMa-3.1-8B-Judge-r"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Salesforce/SFR-LLaMa-3.1-8B-Judge-r</a> *",Generative,88.65372403487248,95.53072625698324,77.74122807017544,86.21621621621621,95.12672559611501,
|
35 |
+
34,"<a target=""_blank"" href=""https://huggingface.co/facebook/Self-taught-Llama-3-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">facebook/Self-taught-Llama-3-70B</a> *",Generative,88.62795600264494,96.92737430167598,83.99122807017544,91.08108108108108,82.5121405576472,
|
36 |
+
35,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/ArmoRM-Llama3-8B-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/ArmoRM-Llama3-8B-v0.1</a>",Custom Classifier,88.60367185781917,96.92737430167598,76.75438596491227,90.54054054054055,97.34715174332952,74.29414161945574
|
37 |
+
36,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-gemma2-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-gemma2-2B-rewardmodel-ft</a>",Seq. Classifier,88.39250002515702,93.01675977653632,77.19298245614036,92.16216216216216,91.19809570578929,
|
38 |
+
37,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0514"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0514</a> *",Generative,88.20069001791948,92.31843575418995,80.59210526315789,87.9054054054054,91.98681364892467,
|
39 |
+
38,"<a target=""_blank"" href=""https://huggingface.co/R-I-S-E/RISE-Judge-Qwen2.5-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">R-I-S-E/RISE-Judge-Qwen2.5-7B</a>",Generative,88.19099980224239,92.17877094972067,76.53508771929825,87.97297297297297,96.07716756697768,
|
40 |
+
39,"<a target=""_blank"" href=""https://huggingface.co/Cohere May 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere May 2024</a> *",Custom Classifier,88.16038708182192,96.36871508379889,71.2719298245614,92.29729729729729,97.68272221312816,78.20215489882585
|
41 |
+
40,"<a target=""_blank"" href=""https://huggingface.co/google/flame-1.0-24B-july-2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/flame-1.0-24B-july-2024</a> *",Generative,87.80801832232187,92.17877094972067,75.65789473684211,89.5945945945946,93.80081300813008,
|
42 |
+
41,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-7b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-7b-reward</a>",Seq. Classifier,87.59316719911449,99.16201117318435,69.51754385964912,87.16216216216216,94.53095160146232,
|
43 |
+
42,"<a target=""_blank"" href=""https://huggingface.co/ZiyiYe/Con-J-Qwen2-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ZiyiYe/Con-J-Qwen2-7B</a> β οΈ",Generative,87.12028871485069,91.89944134078212,80.26315789473684,88.24324324324324,88.0753123806406,
|
44 |
+
43,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-pro-0924"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-pro-0924</a>",Generative,86.78430992050927,94.1340782122905,76.97368421052632,85.8108108108108,90.21866644840945,
|
45 |
+
44,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-08-06"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-08-06</a>",Generative,86.72554986675267,96.08938547486034,76.09649122807018,88.10810810810811,86.60821465597208,
|
46 |
+
45,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/pair-preference-model-LLaMA3-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/pair-preference-model-LLaMA3-8B</a>",Custom Classifier,85.74792972712865,98.32402234636872,65.78947368421052,89.72972972972973,94.73420363398264,74.57650875557454
|
47 |
+
46,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-sftreg</a>",Seq. Classifier,85.42084389305319,98.60335195530726,67.76315789473684,89.1891891891892,92.29347410923774,73.08924874053665
|
48 |
+
47,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-32B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-32B-Instruct</a>",Generative,85.22047081369766,98.04469273743017,65.13157894736842,85.27027027027027,92.43534129972173,
|
49 |
+
48,"<a target=""_blank"" href=""https://huggingface.co/Cohere March 2024"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Cohere March 2024</a> *",Custom Classifier,85.10802881361649,94.6927374301676,65.13157894736842,87.70270270270271,98.17073170731707,74.57675774743672
|
50 |
+
49,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-llama3-8B-distill"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-llama3-8B-distill</a>",Seq. Classifier,84.63918882385776,98.32402234636872,68.42105263157895,86.75675675675676,91.3273449009658,72.09434614337957
|
51 |
+
50,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-rewardmodel-ft</a> β οΈ",Seq. Classifier,84.46827345209587,89.3854748603352,75.21929824561404,84.45945945945945,88.80886124297484,
|
52 |
+
51,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-0125-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-0125-preview</a>",Generative,84.33564801010327,95.25139664804469,74.34210526315789,87.56756756756756,86.9236645386588,70.85136405607162
|
53 |
+
52,"<a target=""_blank"" href=""https://huggingface.co/mattshumer/Reflection-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mattshumer/Reflection-70B</a>",Generative,84.22327632009588,97.48603351955308,70.6140350877193,83.17567567567568,85.61736099743548,
|
54 |
+
53,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-5-sonnet-20240620"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-5-sonnet-20240620</a>",Generative,84.17242041164789,96.36871508379889,74.01315789473684,81.62162162162163,84.68618704643423,
|
55 |
+
54,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo</a>",Generative,84.12067803631126,97.20670391061452,74.56140350877193,77.56756756756756,87.14703715829104,
|
56 |
+
55,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-14B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-14B-Instruct</a>",Generative,84.09022697921793,97.48603351955308,62.280701754385966,83.91891891891892,92.67525372401374,
|
57 |
+
56,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct</a>",Generative,84.05217990917473,97.20670391061452,70.17543859649123,82.83783783783784,85.98873929175534,
|
58 |
+
57,"<a target=""_blank"" href=""https://huggingface.co/NCSOFT/Llama-3-OffsetBias-8B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NCSOFT/Llama-3-OffsetBias-8B</a>",Generative,83.96777752436938,92.45810055865921,80.26315789473684,86.75675675675676,76.39309488732471,
|
59 |
+
58,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4-turbo-2024-04-09"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4-turbo-2024-04-09</a>",Generative,83.95011678629895,95.25139664804469,75.43859649122807,87.56756756756756,82.70345664866045,73.629016365689
|
60 |
+
59,"<a target=""_blank"" href=""https://huggingface.co/sfairXC/FsfairX-LLaMA3-RM-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">sfairXC/FsfairX-LLaMA3-RM-v0.1</a>",Seq. Classifier,83.38339965331156,99.44134078212291,65.13157894736842,86.75675675675676,86.43633709827031,74.91856971076719
|
61 |
+
60,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-2024-05-13"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-2024-05-13</a>",Generative,83.2681071132992,96.64804469273743,70.39473684210526,86.48648648648648,84.86965951874285,72.61510893954863
|
62 |
+
61,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-7B-Instruct</a>",Generative,83.16709323590604,97.76536312849161,60.96491228070175,84.45945945945945,89.47863807497134,
|
63 |
+
62,"<a target=""_blank"" href=""https://huggingface.co/internlm/internlm2-1_8b-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">internlm/internlm2-1_8b-reward</a>",Seq. Classifier,82.16733515408055,93.57541899441341,66.2280701754386,81.62162162162163,87.24422982484859,
|
64 |
+
63,"<a target=""_blank"" href=""https://huggingface.co/CIR-AMS/BTRM_Qwen2_7b_0613"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CIR-AMS/BTRM_Qwen2_7b_0613</a>",Seq. Classifier,81.72269085246006,97.48603351955308,57.23684210526316,90.13513513513513,87.74894963714738,70.2902968779431
|
65 |
+
64,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-RM-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-RM-7b</a>",Seq. Classifier,81.58895090730017,98.04469273743017,65.5701754385965,81.35135135135135,86.3251623288045,71.71779445333651
|
66 |
+
65,"<a target=""_blank"" href=""https://huggingface.co/Nexusflow/Starling-RM-34B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Nexusflow/Starling-RM-34B</a>",Seq. Classifier,81.33351263768401,96.92737430167598,57.23684210526316,87.70270270270271,88.45078299776287,71.36620952434669
|
67 |
+
66,"<a target=""_blank"" href=""https://huggingface.co/google/gemma-2-27b-it"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemma-2-27b-it</a>",Generative,80.89669003773389,94.83240223463687,59.10087719298246,86.35135135135135,83.30212937196487,
|
68 |
+
67,"<a target=""_blank"" href=""https://huggingface.co/google/gemini-1.5-flash-001"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">google/gemini-1.5-flash-001</a>",Generative,80.5391103484727,92.17877094972067,63.48684210526316,86.95945945945945,85.1162219675888,69.36940417219024
|
69 |
+
68,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-ft"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-ft</a> β οΈ",Seq. Classifier,80.47843057507436,77.93296089385476,74.78070175438596,85.27027027027027,83.92978938178643,
|
70 |
+
69,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-preference-mix-rm</a>",Seq. Classifier,80.26558812003782,93.57541899441341,68.20175438596492,77.29729729729729,88.50261908659355,67.23611355180205
|
71 |
+
70,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-opus-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-opus-20240229</a>",Generative,80.0759036376447,94.6927374301676,60.30701754385965,86.62162162162163,78.68223795492989,
|
72 |
+
71,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-4o-mini-2024-07-18"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-4o-mini-2024-07-18</a>",Generative,80.06759386119498,94.97206703910615,60.74561403508772,80.8108108108108,83.7418835597752,
|
73 |
+
72,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Mistral-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Mistral-7B</a>",Seq. Classifier,79.8233742639417,96.64804469273743,60.526315789473685,87.02702702702703,77.35615485349484,75.29528365000934
|
74 |
+
73,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Hermes-3-Llama-3.1-70B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Hermes-3-Llama-3.1-70B</a>",Generative,78.47084260833167,96.22905027932961,56.68859649122807,82.29729729729729,78.6684263654717,
|
75 |
+
74,"<a target=""_blank"" href=""https://huggingface.co/hendrydong/Mistral-RM-for-RAFT-GSHF-v0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">hendrydong/Mistral-RM-for-RAFT-GSHF-v0</a>",Seq. Classifier,78.46503174091394,98.32402234636872,57.89473684210526,85.0,74.33602062530693,75.07572604066365
|
76 |
+
75,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo</a>",Generative,78.08002309698713,87.56983240223464,66.8859649122807,75.06756756756756,82.79672750586566,
|
77 |
+
76,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/reward-model-Mistral-7B-instruct-Unified-Feedback"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/reward-model-Mistral-7B-instruct-Unifie...</a>",Seq. Classifier,76.61192139206588,97.76536312849161,50.6578947368421,85.27027027027027,73.88893435914224,74.3423675391006
|
78 |
+
77,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b_gamma1.1_epoch3</a>",DPO,76.52088102568138,97.20670391061452,63.37719298245614,76.35135135135135,72.84129972172205,69.13483329884433
|
79 |
+
78,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-12b-chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-12b-chat</a>",DPO,76.41872322421631,96.64804469273743,55.48245614035088,78.10810810810811,89.44862770775359,48.39403572004667
|
80 |
+
79,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-70B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-70B-Instruct</a>",Generative,76.26515082171642,97.62569832402235,58.88157894736842,72.97297297297297,78.53644895509358,70.3529589965331
|
81 |
+
80,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-70b</a>",DPO,76.20735542607979,97.48603351955308,60.526315789473685,84.45945945945945,74.07206580455066,52.778449688644265
|
82 |
+
81,"<a target=""_blank"" href=""https://huggingface.co/gemini-1.5-flash-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">gemini-1.5-flash-8b</a>",Generative,76.00524043227317,94.41340782122904,59.86842105263158,73.98648648648648,75.75264636874557,
|
83 |
+
82,"<a target=""_blank"" href=""https://huggingface.co/Ahjeong/MMPO_Gemma_7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ahjeong/MMPO_Gemma_7b</a>",DPO,75.8660587247668,96.92737430167598,61.40350877192982,71.35135135135135,77.55872483221475,68.31261000855747
|
84 |
+
83,"<a target=""_blank"" href=""https://huggingface.co/PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-20240229_meta-llama/Llama-3-70b-chat-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PoLL/gpt-3.5-turbo-0125_claude-3-sonnet-2024022...</a>",Generative,75.77705517745792,95.25139664804469,54.05701754385965,80.33783783783784,73.46196868008948,
|
85 |
+
84,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-70b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-70b</a>",DPO,74.9612075859509,96.36871508379889,57.45614035087719,74.86486486486487,80.2023653625798,56.86669694931664
|
86 |
+
85,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mistral-7B-DPO</a>",DPO,74.80880493527766,92.17877094972067,60.526315789473685,82.43243243243244,73.75184154526109,55.500522983723165
|
87 |
+
86,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-sonnet-20240229"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-sonnet-20240229</a>",Generative,74.57545943180953,93.43575418994413,56.578947368421055,81.6891891891892,69.07005374583947,69.63124589949818
|
88 |
+
87,"<a target=""_blank"" href=""https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mistralai/Mixtral-8x7B-Instruct-v0.1</a>",DPO,74.54632435829336,94.97206703910615,64.03508771929825,72.56756756756756,78.71855731980139,50.330359933093675
|
89 |
+
88,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-8x7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-8x7b-v2.0</a>",Generative,74.5095375782243,93.01675977653632,47.14912280701754,80.47297297297297,77.39929475637038,
|
90 |
+
89,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/GRM-Gemma-2B-sftreg"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/GRM-Gemma-2B-sftreg</a>",Seq. Classifier,74.50927082674883,95.53072625698324,48.68421052631579,79.32432432432432,76.83949909968898,69.82591702611495
|
91 |
+
90,"<a target=""_blank"" href=""https://huggingface.co/general-preference/GPM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">general-preference/GPM-Gemma-2B</a>",Custom Classifier,74.49128373533642,71.50837988826815,69.73684210526316,81.21621621621621,75.50369673159818,
|
92 |
+
91,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-boost-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-boost-DPO-preview</a>",DPO,74.47914014376505,91.06145251396649,60.96491228070175,71.35135135135135,83.94718175369673,55.6624654944527
|
93 |
+
92,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-uf-rm</a>",Seq. Classifier,73.98314832639727,86.59217877094972,71.71052631578948,70.13513513513513,75.70046925301467,57.571715987797305
|
94 |
+
93,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-alpha</a>",DPO,73.92192687696839,91.62011173184358,62.5,76.62162162162163,75.13982102908277,53.534233127619544
|
95 |
+
94,"<a target=""_blank"" href=""https://huggingface.co/upstage/SOLAR-10.7B-Instruct-v1.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">upstage/SOLAR-10.7B-Instruct-v1.0</a>",DPO,73.91132026830088,81.56424581005587,68.64035087719299,85.13513513513513,72.51596005892944,49.49049865208112
|
96 |
+
95,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-13b</a>",DPO,73.68126195691116,95.81005586592178,58.333333333333336,79.45945945945945,73.22972936105201,49.46620157266727
|
97 |
+
96,"<a target=""_blank"" href=""https://huggingface.co/opencompass/CompassJudger-1-1.5B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">opencompass/CompassJudger-1-1.5B-Instruct</a>",Generative,73.44238723104029,96.36871508379889,49.23245614035088,78.17567567567568,69.99270202433568,
|
98 |
+
97,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-8b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-8b-uf-mean-rm</a>",Seq. Classifier,73.41574916848018,95.25139664804469,59.21052631578947,61.62162162162162,82.1155262727124,64.3436007999852
|
99 |
+
98,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/starchat2-15b-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/starchat2-15b-v0.1</a>",DPO,73.22060109644468,93.85474860335195,55.48245614035088,70.94594594594595,81.58522944289845,55.248649602907626
|
100 |
+
99,"<a target=""_blank"" href=""https://huggingface.co/Ray2333/Gemma-2B-rewardmodel-baseline"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Ray2333/Gemma-2B-rewardmodel-baseline</a>",Seq. Classifier,72.89758740021966,94.1340782122905,46.92982456140351,78.64864864864865,73.84050853931359,68.97216667866445
|
101 |
+
100,"<a target=""_blank"" href=""https://huggingface.co/Anthropic/claude-3-haiku-20240307"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Anthropic/claude-3-haiku-20240307</a>",Generative,72.89194286431167,92.73743016759776,51.973684210526315,79.52702702702703,70.60194658154636,66.34730980541012
|
102 |
+
101,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-beta"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-beta</a>",DPO,72.80507814531524,95.25139664804469,62.719298245614034,65.67567567567568,77.89497735581382,52.16300745754066
|
103 |
+
102,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-dpo-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-dpo-8b</a>",DPO,72.74751270450155,95.25139664804469,53.50877192982456,66.48648648648648,86.63038140448519,50.973541402832126
|
104 |
+
103,"<a target=""_blank"" href=""https://huggingface.co/0-hero/Matter-0.1-7B-DPO-preview"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">0-hero/Matter-0.1-7B-DPO-preview</a>",DPO,72.47264404067178,89.3854748603352,57.675438596491226,63.78378378378378,88.54320128771758,53.477999309390405
|
105 |
+
104,"<a target=""_blank"" href=""https://huggingface.co/jondurbin/bagel-dpo-34b-v0.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">jondurbin/bagel-dpo-34b-v0.5</a>",DPO,72.15167952196515,93.85474860335195,55.04385964912281,64.45945945945945,88.8907076990233,44.867564875771365
|
106 |
+
105,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-2-dpo-7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-2-dpo-7b</a>",DPO,72.11611434356087,97.48603351955308,56.14035087719298,75.27027027027027,71.75717520598025,47.737369346054734
|
107 |
+
106,"<a target=""_blank"" href=""https://huggingface.co/prometheus-eval/prometheus-7b-v2.0"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">prometheus-eval/prometheus-7b-v2.0</a>",Generative,72.04295178846496,85.47486033519553,49.12280701754386,77.0945945945946,76.4795452065259,
|
108 |
+
107,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-zephyr-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-zephyr-3b</a>",DPO,71.45809212918405,86.31284916201118,60.08771929824562,74.05405405405405,75.73184372783325,50.74989667836822
|
109 |
+
108,"<a target=""_blank"" href=""https://huggingface.co/NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO</a>",DPO,71.38329552978793,91.62011173184358,60.526315789473685,81.48648648648648,61.26104927156654,52.66173320935087
|
110 |
+
109,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-700k.json</a>",Seq. Classifier,71.27478404602779,93.57541899441341,40.78947368421053,79.45945945945945,,
|
111 |
+
110,"<a target=""_blank"" href=""https://huggingface.co/berkeley-nest/Starling-RM-7B-alpha"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">berkeley-nest/Starling-RM-7B-alpha</a>",Seq. Classifier,71.13020256724107,98.04469273743017,45.6140350877193,84.45945945945945,57.998444917335085,67.93855870128164
|
112 |
+
111,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-380k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.58403596186601,95.25139664804469,39.473684210526315,77.02702702702703,,
|
113 |
+
112,"<a target=""_blank"" href=""https://huggingface.co/CohereForAI/c4ai-command-r-plus"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">CohereForAI/c4ai-command-r-plus</a>",Generative,70.56998248762835,95.11173184357541,57.56578947368421,59.86486486486486,70.40312789872866,69.23881422694875
|
114 |
+
113,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2660k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.19339171573809,94.97206703910615,37.5,78.10810810810811,,
|
115 |
+
114,"<a target=""_blank"" href=""https://huggingface.co/allenai/llama-3-tulu-2-70b-uf-mean-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/llama-3-tulu-2-70b-uf-mean-rm</a>",Seq. Classifier,70.19307792664753,86.31284916201118,56.14035087719298,60.945945945945944,82.68367708844875,59.57205519263016
|
116 |
+
115,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3420k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.07936854820123,93.85474860335195,38.81578947368421,77.56756756756756,,
|
117 |
+
116,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,70.03734328271229,94.1340782122905,38.81578947368421,77.16216216216216,,
|
118 |
+
117,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B</a>",Seq. Classifier,69.66957334431098,96.92737430167598,49.780701754385966,57.83783783783784,73.62395645768537,70.68641939562845
|
119 |
+
118,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-3040k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.44952818151877,93.85474860335195,37.06140350877193,77.43243243243244,,
|
120 |
+
119,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1900k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.2421964746281,94.41340782122904,35.74561403508772,77.56756756756756,,
|
121 |
+
120,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-7B-4096"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-7B-4096</a>",Seq. Classifier,69.22303170109127,94.97206703910615,50.219298245614034,56.08108108108108,75.10912860806461,70.24413536208964
|
122 |
+
121,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-760k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,69.04502561956252,94.41340782122904,35.96491228070175,76.75675675675676,,
|
123 |
+
122,"<a target=""_blank"" href=""https://huggingface.co/openbmb/UltraRM-13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/UltraRM-13b</a>",Seq. Classifier,69.02867919901104,96.36871508379889,55.48245614035088,59.86486486486486,62.44270748076608,72.94062565153789
|
124 |
+
123,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5</a>",Seq. Classifier,69.00517292135855,88.54748603351955,48.68421052631579,63.108108108108105,77.51882468489114,65.32929758655776
|
125 |
+
124,"<a target=""_blank"" href=""https://huggingface.co/openbmb/Eurus-7b-kto"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/Eurus-7b-kto</a>",DPO,68.99912142883106,95.25139664804469,53.728070175438596,60.54054054054054,74.67261417580619,52.606849779819356
|
126 |
+
125,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-2280k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.95403268602327,93.85474860335195,37.06140350877193,75.94594594594595,,
|
127 |
+
126,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-14B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-14B-Chat</a>",DPO,68.64045386840729,57.262569832402235,70.17543859649123,71.21621621621621,89.61129753914987,41.23304044714641
|
128 |
+
127,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-checkpoint-1140k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized-3.8m-check...</a>",Seq. Classifier,68.08398077583611,93.01675977653632,35.96491228070175,75.27027027027027,,
|
129 |
+
128,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/LLaMA3-iterative-DPO-final"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/LLaMA3-iterative-DPO-final</a>",DPO,67.82774529803461,83.79888268156425,59.21052631578947,78.64864864864865,61.60650952147105,43.920573347364794
|
130 |
+
129,"<a target=""_blank"" href=""https://huggingface.co/HuggingFaceH4/zephyr-7b-gemma-v0.1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">HuggingFaceH4/zephyr-7b-gemma-v0.1</a>",DPO,67.57835885153328,95.81005586592178,49.56140350877193,58.24324324324324,74.63476018988378,51.70630404815817
|
131 |
+
130,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0-nectar-binarized.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0-nectar-binarized.json</a>",Seq. Classifier,67.55772237983352,91.34078212290503,39.03508771929825,72.29729729729729,,
|
132 |
+
131,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-7B-Chat</a>",DPO,67.50138253417825,53.63128491620112,69.07894736842105,69.1891891891892,90.41475691602555,42.884086027930344
|
133 |
+
132,"<a target=""_blank"" href=""https://huggingface.co/openbmb/MiniCPM-2B-dpo-fp32"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openbmb/MiniCPM-2B-dpo-fp32</a>",DPO,67.304776500488,89.10614525139665,49.3421052631579,57.2972972972973,82.33378348884159,49.58432590300511
|
134 |
+
133,"<a target=""_blank"" href=""https://huggingface.co/mightbe/Better-PairRM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">mightbe/Better-PairRM</a>",Custom Classifier,67.29754324103595,95.53072625698324,39.25438596491228,82.02702702702703,49.826076280897034,72.40145810968448
|
135 |
+
134,"<a target=""_blank"" href=""https://huggingface.co/allenai/OLMo-7B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/OLMo-7B-Instruct</a>",DPO,67.27282652187517,89.66480446927375,50.6578947368421,64.86486486486487,71.6763518306324,51.72760689365022
|
136 |
+
135,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-72B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-72B-Chat</a>",DPO,67.23151527906012,62.29050279329609,66.00877192982456,67.56756756756756,85.54352867354177,42.26289558308108
|
137 |
+
136,"<a target=""_blank"" href=""https://huggingface.co/ai2/tulu-2-7b-rm-v0.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/tulu-2-7b-rm-v0.json</a>",Seq. Classifier,66.54559072450868,93.29608938547486,45.39473684210526,60.945945945945944,,
|
138 |
+
137,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-MoE-A2.7B-Chat</a>",DPO,66.4408456376338,72.90502793296089,63.1578947368421,62.83783783783784,77.40082937742129,45.364430968579995
|
139 |
+
138,"<a target=""_blank"" href=""https://huggingface.co/RLHFlow/RewardModel-Mistral-7B-for-DPA-v1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">RLHFlow/RewardModel-Mistral-7B-for-DPA-v1</a>",Seq. Classifier,66.33145463112653,87.98882681564245,49.780701754385966,70.67567567567568,59.70835379494734,60.675975598835954
|
140 |
+
139,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stablelm-2-zephyr-1_6b</a>",DPO,65.73535970393974,96.64804469273743,46.71052631578947,60.270270270270274,67.84218639166257,48.67618199453821
|
141 |
+
140,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo</a>",Generative,65.65164437199641,80.72625698324022,49.780701754385966,63.986486486486484,68.11313226387297,
|
142 |
+
141,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/RM-Gemma-2B"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/RM-Gemma-2B</a>",Seq. Classifier,65.48909618129333,94.41340782122904,40.78947368421053,49.86486486486486,76.37399738091341,66.51837812920436
|
143 |
+
142,"<a target=""_blank"" href=""https://huggingface.co/openai/gpt-3.5-turbo-0125"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">openai/gpt-3.5-turbo-0125</a>",Generative,65.34011575979856,92.17877094972067,44.51754385964912,65.47297297297297,59.12315163420091,65.4761630050997
|
144 |
+
143,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-70b-preference-mix-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-70b-preference-mix-rm</a>",Seq. Classifier,65.15941759094567,77.37430167597765,59.21052631578947,84.86486486486487,41.37508866699405,60.785195271258935
|
145 |
+
144,"<a target=""_blank"" href=""https://huggingface.co/wenbopan/Faro-Yi-9B-DPO"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">wenbopan/Faro-Yi-9B-DPO</a>",DPO,64.61094996096162,92.17877094972067,53.07017543859649,55.13513513513514,58.392672013968465,63.945042573813076
|
146 |
+
145,"<a target=""_blank"" href=""https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">meta-llama/Meta-Llama-3-8B-Instruct</a>",Generative,64.49786646478918,85.47486033519553,41.55701754385965,67.97297297297297,64.82341627107546,60.82426393689548
|
147 |
+
146,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-ultrafeedback-60k.jsonl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-ultrafeedback-60k.jsonl</a>",Seq. Classifier,64.3955076805709,94.41340782122904,45.39473684210526,53.37837837837838,,
|
148 |
+
147,"<a target=""_blank"" href=""https://huggingface.co/IDEA-CCNL/Ziya-LLaMA-7B-Reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">IDEA-CCNL/Ziya-LLaMA-7B-Reward</a>",Seq. Classifier,63.784551529691385,86.87150837988827,46.05263157894737,64.05405405405405,57.74540295738528,64.61376982667257
|
149 |
+
148,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-reward</a>",Seq. Classifier,63.66172878401215,89.94413407821229,36.40350877192982,60.4054054054054,68.87004146887108,61.70937960727216
|
150 |
+
149,"<a target=""_blank"" href=""https://huggingface.co/stabilityai/stable-code-instruct-3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stabilityai/stable-code-instruct-3b</a>",DPO,62.1618132126384,57.82122905027933,58.55263157894737,65.54054054054055,75.28271130026737,45.06209397367635
|
151 |
+
150,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1</a>",Seq. Classifier,61.501047673154666,92.45810055865921,37.280701754385966,54.45945945945946,58.55022644186174,68.01245262965921
|
152 |
+
151,"<a target=""_blank"" href=""https://huggingface.co/OpenAssistant/reward-model-deberta-v3-large-v2"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">OpenAssistant/reward-model-deberta-v3-large-v2</a>",Seq. Classifier,61.25988488574668,89.3854748603352,45.175438596491226,73.37837837837837,38.54968079882141,58.361018703667625
|
153 |
+
152,"<a target=""_blank"" href=""https://huggingface.co/llm-blender/PairRM-hf"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">llm-blender/PairRM-hf</a>",Custom Classifier,60.868838250756006,90.22346368715084,52.19298245614035,47.7027027027027,48.983739837398375,69.61376689001952
|
154 |
+
153,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v2.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v2.0-cost</a>",Seq. Classifier,59.56778097839703,57.262569832402235,45.6140350877193,76.08108108108108,62.111570360670044,53.97151608182796
|
155 |
+
154,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama13b</a>",DPO,59.52205456101889,84.07821229050279,37.719298245614034,46.486486486486484,70.76683308779397,57.5968308283755
|
156 |
+
155,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama30b</a>",DPO,59.00687538053444,84.35754189944134,40.57017543859649,60.54054054054054,50.75435150324658,58.616659661160035
|
157 |
+
156,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-1.8B-Chat</a>",DPO,58.89567615638699,56.14525139664804,60.30701754385965,48.37837837837838,77.93283134173623,44.53412808623833
|
158 |
+
157,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-7b-nectar-3.8m.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-7b-nectar-3.8m.json</a>",Seq. Classifier,58.426789771247286,86.31284916201118,26.535087719298247,62.432432432432435,,
|
159 |
+
158,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-cost"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-cost</a>",Seq. Classifier,57.97567401900532,61.73184357541899,42.324561403508774,73.51351351351352,54.82109728815409,56.999034609857176
|
160 |
+
159,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama30b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama30b</a>",DPO,56.18285201407361,69.27374301675978,44.73684210526316,62.83783783783784,47.449118786489876,57.0505846339612
|
161 |
+
160,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia1-4b</a>",DPO,55.809930200702766,68.43575418994413,37.93859649122807,52.567567567567565,64.47488677906914,55.455761750707126
|
162 |
+
161,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia6-9b</a>",DPO,55.6117865296703,77.6536312849162,36.18421052631579,53.648648648648646,54.153707644459004,57.22568255835343
|
163 |
+
162,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia2-8b</a>",DPO,54.96592159422631,75.69832402234637,34.21052631578947,47.432432432432435,62.1572679652971,55.69619287630597
|
164 |
+
163,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-4B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-4B-Chat</a>",DPO,54.77003940637828,38.8268156424581,62.719298245614034,55.67567567567568,66.89344955530092,44.69987641930703
|
165 |
+
164,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama13b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama13b</a>",DPO,53.99846978252061,71.22905027932961,42.98245614035088,56.486486486486484,44.013272766955865,56.56369669643977
|
166 |
+
165,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_llama7b</a>",DPO,53.883046644273705,55.865921787709496,43.64035087719298,45.67567567567568,69.41432040159329,55.754882314120465
|
167 |
+
166,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_llama7b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_llama7b</a>",DPO,53.036829672694374,57.82122905027933,44.51754385964912,52.027027027027025,56.58147814699623,55.43691088634592
|
168 |
+
167,"<a target=""_blank"" href=""https://huggingface.co/Qwen/Qwen1.5-0.5B-Chat"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">Qwen/Qwen1.5-0.5B-Chat</a>",DPO,52.982802188122534,35.47486033519553,62.93859649122807,57.027027027027025,59.83862607082447,46.28699984455265
|
169 |
+
168,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia2-8b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia2-8b</a>",DPO,52.857927047782155,80.72625698324022,33.55263157894737,44.729729729729726,51.34671522889725,55.0106763884103
|
170 |
+
169,"<a target=""_blank"" href=""https://huggingface.co/my_model/"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">my_model/</a>",Seq. Classifier,52.672491797862534,45.53072625698324,55.921052631578945,43.91891891891892,65.319269383969,
|
171 |
+
170,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia6-9b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia6-9b</a>",DPO,52.6326255248281,74.86033519553072,34.21052631578947,51.75675675675676,48.470153325694326,55.09808653591037
|
172 |
+
171,"<a target=""_blank"" href=""https://huggingface.co/ai2/llama-2-chat-nectar-180k.json"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ai2/llama-2-chat-nectar-180k.json</a>",Seq. Classifier,52.34906620822528,88.26815642458101,28.50877192982456,40.270270270270274,,
|
173 |
+
172,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia1-4b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia1-4b</a>",DPO,52.334628884533196,63.96648044692738,37.280701754385966,50.4054054054054,56.71652479947619,54.27343514840888
|
174 |
+
173,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-xl"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-xl</a>",Custom Classifier,51.34535042343637,85.47486033519553,36.8421052631579,37.83783783783784,38.41156490423965,64.97541713006551
|
175 |
+
174,"<a target=""_blank"" href=""https://huggingface.co/SultanR/SmolTulu-1.7b-RM"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">SultanR/SmolTulu-1.7b-RM</a>",Seq. Classifier,50.93872947030961,74.30167597765363,44.078947368421055,57.16216216216216,28.212132373001584,
|
176 |
+
175,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-kto_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-kto_pythia12-0b</a>",DPO,50.52988550561952,74.86033519553072,36.18421052631579,47.567567567567565,41.27175751623288,55.001227939281776
|
177 |
+
176,"<a target=""_blank"" href=""https://huggingface.co/weqweasdas/hh_rlhf_rm_open_llama_3b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">weqweasdas/hh_rlhf_rm_open_llama_3b</a>",Seq. Classifier,50.274817067272814,81.84357541899442,37.280701754385966,41.486486486486484,32.80815190702243,65.63552247167672
|
178 |
+
177,"<a target=""_blank"" href=""https://huggingface.co/ContextualAI/archangel_sft-dpo_pythia12-0b"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">ContextualAI/archangel_sft-dpo_pythia12-0b</a>",DPO,50.08791349970499,66.75977653631286,36.40350877192982,54.32432432432432,41.39384514650516,53.02831193920059
|
179 |
+
178,"<a target=""_blank"" href=""https://huggingface.co/random"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">random</a>",,50.0,50.0,50.0,50.0,50.0,50.0
|
180 |
+
179,"<a target=""_blank"" href=""https://huggingface.co/stanfordnlp/SteamSHP-flan-t5-large"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">stanfordnlp/SteamSHP-flan-t5-large</a>",Custom Classifier,49.62050475651485,85.75418994413408,33.1140350877193,37.432432432432435,35.62673923719103,62.72974940567991
|
181 |
+
180,"<a target=""_blank"" href=""https://huggingface.co/allenai/tulu-v2.5-13b-uf-rm"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">allenai/tulu-v2.5-13b-uf-rm</a>",Seq. Classifier,48.05551076423311,39.385474860335194,42.324561403508774,55.54054054054054,47.36897746494243,63.26048833944414
|
182 |
+
181,"<a target=""_blank"" href=""https://huggingface.co/PKU-Alignment/beaver-7b-v1.0-reward"" style=""color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;"">PKU-Alignment/beaver-7b-v1.0-reward</a>",Seq. Classifier,47.26664990676508,81.84357541899442,28.728070175438596,37.567567567567565,34.596155944780925,59.929110947322734
|
leaderboard/retired-app.py
ADDED
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
from huggingface_hub import HfApi, snapshot_download
|
4 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
5 |
+
from datasets import load_dataset
|
6 |
+
from src.utils import load_all_data
|
7 |
+
from src.md import ABOUT_TEXT, TOP_TEXT
|
8 |
+
from src.plt import plot_avg_correlation
|
9 |
+
from src.constants import subset_mapping, length_categories, example_counts
|
10 |
+
from src.css import custom_css
|
11 |
+
import numpy as np
|
12 |
+
|
13 |
+
api = HfApi()

# Token with access to the private results dataset and permission to restart the Space.
COLLAB_TOKEN = os.environ.get("COLLAB_TOKEN")
# Dataset repo holding the aggregate RewardBench evaluation results.
evals_repo = "allenai/reward-bench-results"

# Dataset repo holding the evaluation prompts (used for the random-sample viewer).
eval_set_repo = "allenai/reward-bench"
# Local directory the results snapshot is downloaded into.
repo_dir_rewardbench = "./evals/rewardbench/"
|
20 |
+
|
21 |
+
def restart_space():
    """Restart the leaderboard Space so it reloads fresh evaluation results."""
    api.restart_space(repo_id="allenai/reward-bench", token=COLLAB_TOKEN)
|
23 |
+
|
24 |
+
print("Pulling evaluation results")
# Download the results dataset locally, skipping the large per-example score
# files — only the aggregate per-subset results are needed for the tables.
repo = snapshot_download(
    local_dir=repo_dir_rewardbench,
    ignore_patterns=["pref-sets-scores/*", "eval-set-scores/*"],
    repo_id=evals_repo,
    use_auth_token=COLLAB_TOKEN,
    tqdm_class=None,
    etag_timeout=30,
    repo_type="dataset",
)
|
34 |
+
|
35 |
+
|
36 |
+
def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    """
    Compute per-section and overall averages for the RewardBench leaderboard.

    We average over 4 core sections (per prompt weighting):
    1. Chat: easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
    2. Chat Hard: hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
    3. Safety: safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
    4. Reasoning: code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
    5. Prior Sets (0.5 weight): test sets (anthropic_helpful, anthropic_hhh, shp, summarize)

    Args:
        dataframe_core: per-subset scores for the core eval set (one row per model).
        dataframe_prefs: per-subset scores for the prior preference test sets.

    Returns:
        DataFrame with columns [model, model_type, average, <4 sections>,
        "Prior Sets (0.5 weight)"]; models absent from dataframe_prefs get NaN
        for the prior-sets column and their overall average reweights accordingly.
    """
    new_df = dataframe_core.copy()
    dataframe_prefs = dataframe_prefs.copy()

    # For each main section, take the per-prompt weighted average of its subsets.
    for subset, sub_subsets in subset_mapping.items():
        subset_cols = [col for col in new_df.columns if col in sub_subsets]
        sub_data = new_df[subset_cols].values  # relevant column values
        sub_counts = [example_counts[s] for s in subset_cols]  # per-subset prompt counts
        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts)

    data_cols = list(subset_mapping.keys())
    keep_columns = ["model", "model_type"] + data_cols
    new_df = new_df[keep_columns]

    # Unweighted mean over the prior preference test sets (NaNs ignored).
    pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
    pref_data = dataframe_prefs[pref_columns].values
    dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)

    # Vectorized lookup of each model's prior-sets score (replaces a per-row
    # iterrows scan). keep="first" matches the old behavior of taking the
    # first match if a model somehow appears twice; missing models map to NaN.
    prior_lookup = (
        dataframe_prefs.drop_duplicates(subset="model", keep="first")
        .set_index("model")["Prior Sets (0.5 weight)"]
    )
    new_df["Prior Sets (0.5 weight)"] = new_df["model"].map(prior_lookup)

    # Overall average: core sections weighted 2, prior sets weighted 1.
    # A masked array drops NaN entries so a missing prior-sets score
    # redistributes its weight instead of zeroing the average.
    data_cols += ["Prior Sets (0.5 weight)"]
    final_data = new_df[data_cols].values
    masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
    weights = [2, 2, 2, 2, 1]
    average = np.ma.average(masked_data, axis=1, weights=weights)
    new_df["average"] = average.filled(np.nan)

    # make average the third column
    keep_columns = ["model", "model_type", "average"] + data_cols
    new_df = new_df[keep_columns]
    return new_df
|
97 |
+
|
98 |
+
def expand_subsets(dataframe):
    """Unimplemented placeholder for expanding per-subset scores into their own columns."""
    # TODO need to modify data/ script to do this
    pass
|
101 |
+
|
102 |
+
|
103 |
+
def length_bias_check(dataframe):
    """
    Takes the raw rewardbench dataframe and splits the data into new buckets according to length_categories.
    Then, take the average of the three buckets as "average".

    Args:
        dataframe: raw per-subset scores; first three columns are assumed to be
            model, model_type, average, followed by one column per subset.

    Returns:
        DataFrame with columns [model, "Length Bias", "Neutral", "Terse Bias"].
    """
    new_df = dataframe.copy()
    existing_subsets = new_df.columns[3:]  # skip model, model_type, average
    final_subsets = ["Length Bias", "Neutral", "Terse Bias"]
    # length_categories labels subsets "True" / "Neutral" / "False"; map each
    # label to its display bucket.
    bucket_for_label = {"True": "Length Bias", "Neutral": "Neutral", "False": "Terse Bias"}
    # new data is empty list dict for each final subset
    new_data = {s: [] for s in final_subsets}

    # Route each subset's scores to the bucket matching its length-bias label.
    # .get avoids a KeyError if a column is not present in length_categories;
    # such columns are simply skipped.
    for subset in existing_subsets:
        bucket = bucket_for_label.get(length_categories.get(subset))
        if bucket is not None:
            new_data[bucket].append(new_df[subset].values)

    # Average each bucket into a column. An empty bucket becomes NaN instead of
    # letting np.nanmean fail on an empty list.
    for subset in final_subsets:
        new_df[subset] = np.nanmean(new_data[subset], axis=0) if new_data[subset] else np.nan
    keep_columns = ["model"] + final_subsets
    new_df = new_df[keep_columns]

    return new_df
|
136 |
+
|
137 |
+
|
138 |
+
|
139 |
+
# Load per-subset scores for the core eval set and the prior preference sets,
# each sorted best-first by raw average.
rewardbench_data = load_all_data(repo_dir_rewardbench, subdir="eval-set").sort_values(by='average', ascending=False)
rewardbench_data_length = length_bias_check(rewardbench_data).sort_values(by='Terse Bias', ascending=False)
prefs_data = load_all_data(repo_dir_rewardbench, subdir="pref-sets").sort_values(by='average', ascending=False)
# prefs_data_sub = expand_subsets(prefs_data).sort_values(by='average', ascending=False)

# Main leaderboard table: section averages plus the weighted overall score.
rewardbench_data_avg = avg_over_rewardbench(rewardbench_data, prefs_data).sort_values(by='average', ascending=False)
|
145 |
+
|
146 |
+
def prep_df(df):
    """
    Prepare a leaderboard dataframe for display: add a 1-based rank column,
    prettify column names, and normalize generative model-type labels.

    Args:
        df: leaderboard dataframe with lowercase "model"/"model_type"/"average" columns.

    Returns:
        A new DataFrame; the input is left unmodified.
    """
    # work on a copy — df.insert mutates in place, and the original version
    # silently modified the caller's frame
    df = df.copy()
    # add column to 0th entry with count (column name itself empty)
    df.insert(0, '', range(1, 1 + len(df)))

    # replace "model" with "Model" and "model_type" with "Model Type" and "average" with "Average"
    df = df.rename(columns={"model": "Model", "model_type": "Model Type", "average": "Average"})

    if "Model Type" in df.columns:
        # normalize any label containing "generative" (case-insensitive) to "Generative"
        mask = df["Model Type"].str.contains("generative", case=False, na=False)
        df.loc[mask, "Model Type"] = "Generative"

    return df
|
162 |
+
|
163 |
+
# add count column to all dataframes
rewardbench_data = prep_df(rewardbench_data)
# main table displays the overall average under the name "Score"
rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others

# save rewardbench_data_avg to csv or json
rewardbench_data_avg.to_csv("rewardbench_data_avg.csv", index=False)

rewardbench_data_length = prep_df(rewardbench_data_length)
prefs_data = prep_df(prefs_data)
|
173 |
+
|
174 |
+
# Gradio Dataframe column types: rank (number), model link (markdown),
# model type (str), then numeric score columns.
# NOTE(review): the type lists are built from len(columns) - 1 after prepending
# two or three fixed entries, so their length can exceed the column count —
# presumably Gradio ignores the extras; confirm against the Dataframe component.
col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
col_types_rewardbench_avg = ["number"] + ["markdown"]+ ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
cols_rewardbench_data_length = ["markdown"] + ["number"] * (len(rewardbench_data_length.columns) - 1)
col_types_prefs = ["number"] + ["markdown"] + ["number"] * (len(prefs_data.columns) - 1)
# col_types_prefs_sub = ["markdown"] + ["number"] * (len(prefs_data_sub.columns) - 1)
|
179 |
+
|
180 |
+
# for showing random samples
# NOTE(review): use_auth_token is deprecated in newer `datasets` releases in
# favor of token= — confirm the pinned version before changing.
eval_set = load_dataset(eval_set_repo, use_auth_token=COLLAB_TOKEN, split="filtered")
|
182 |
+
def random_sample(r: gr.Request, subset):
    """
    Return one random eval-set example rendered as markdown.

    Args:
        r: Gradio request object (unused; required by the event signature).
        subset: subset name or list of names to restrict sampling to;
            None or [] samples from the full eval set.

    Returns:
        Markdown string with one "**key**:\\n\\nvalue" section per example field.
    """
    if subset is None or subset == []:
        # np.random.randint's upper bound is exclusive, so len(...) makes the
        # last row reachable (the old len(...) - 1 bound could never pick it).
        sample_index = np.random.randint(0, len(eval_set))
        sample = eval_set[sample_index]
    else:  # filter by subsets (can be list)
        if isinstance(subset, str):
            subset = [subset]
        # filter down dataset to only include the subset(s)
        eval_set_filtered = eval_set.filter(lambda x: x["subset"] in subset)
        # len(...) bound also avoids ValueError when exactly one row matches
        sample_index = np.random.randint(0, len(eval_set_filtered))
        sample = eval_set_filtered[sample_index]

    markdown_text = '\n\n'.join([f"**{key}**:\n\n{value}" for key, value in sample.items()])
    return markdown_text
|
196 |
+
|
197 |
+
# all subset names present in the eval set (choices for the sample viewer)
subsets = eval_set.unique("subset")

# Background colors for each model type in the leaderboard tables.
color_map = {
    "Generative": "#7497db",
    "Custom Classifier": "#E8ECF2",
    "Seq. Classifier": "#ffcd75",
    "DPO": "#75809c",
}
|
205 |
+
def color_model_type_column(df, color_map):
|
206 |
+
"""
|
207 |
+
Apply color to the 'Model Type' column of the DataFrame based on a given color mapping.
|
208 |
+
|
209 |
+
Parameters:
|
210 |
+
df (pd.DataFrame): The DataFrame containing the 'Model Type' column.
|
211 |
+
color_map (dict): A dictionary mapping model types to colors.
|
212 |
+
|
213 |
+
Returns:
|
214 |
+
pd.Styler: The styled DataFrame.
|
215 |
+
"""
|
216 |
+
# Function to apply color based on the model type
|
217 |
+
def apply_color(val):
|
218 |
+
color = color_map.get(val, "default") # Default color if not specified in color_map
|
219 |
+
return f'background-color: {color}'
|
220 |
+
|
221 |
+
# Format for different columns
|
222 |
+
format_dict = {col: "{:.1f}" for col in df.columns if col not in ['Average', 'Model', 'Model Type']}
|
223 |
+
format_dict['Average'] = "{:.2f}"
|
224 |
+
format_dict[''] = "{:d}"
|
225 |
+
|
226 |
+
return df.style.applymap(apply_color, subset=['Model Type']).format(format_dict, na_rep='')
|
227 |
+
|
228 |
+
def regex_table(dataframe, regex, filter_button, style=True):
    """
    Filter the leaderboard by comma-separated model-name regexes and by the selected
    model-type checkboxes, then rank and format the result.

    Parameters:
        dataframe (pd.DataFrame): Leaderboard data with at least 'Model',
            'Model Type', and 'Score' columns.
        regex (str): Comma-separated regex patterns; rows whose 'Model' matches any
            pattern are kept (an empty string keeps everything).
        filter_button (list | str): Selected model types; types absent from it are
            dropped. If "Prior Sets" is absent and the prior-sets column exists,
            that column is removed and 'Score' is recomputed without it.
        style (bool): When True, return a pandas Styler with colored 'Model Type'
            cells (via the module-level color_map); otherwise a plain DataFrame.

    Returns:
        pd.DataFrame | Styler: Table sorted by 'Score' descending, with a rank
        column '' and numeric columns rounded for display.
    """
    # Split regex statement by comma and trim whitespace around regexes
    regex_list = [x.strip() for x in regex.split(",")]
    # Join the list into a single regex pattern with '|' acting as OR
    combined_regex = '|'.join(regex_list)

    # remove internal ai2 data
    dataframe = dataframe[~dataframe["Model"].str.contains("ai2", case=False, na=False)]

    # drop model types that are not selected in the checkbox group
    update_scores = False
    if isinstance(filter_button, list) or isinstance(filter_button, str):
        if "Prior Sets" not in filter_button and 'Prior Sets (0.5 weight)' in dataframe.columns:
            update_scores = True
            # remove the column "Prior Sets (0.5 weight)" from the outputted table
            dataframe = dataframe.drop(columns=['Prior Sets (0.5 weight)'])
        if "Seq. Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Seq. Classifier", case=False, na=False)]
        if "DPO" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
        if "Custom Classifiers" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
        if "Generative" not in filter_button:
            dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
    # Filter the dataframe such that 'model' contains any of the regex patterns.
    # .copy() is required: the assignments below would otherwise write into a view
    # of the caller's dataframe (SettingWithCopyWarning, silent no-ops under
    # pandas copy-on-write).
    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)].copy()

    # if update the score to not use prior sets, do so
    if update_scores:
        data["Score"] = (data["Chat"] + data["Chat Hard"] + data["Safety"] + data["Reasoning"]) / 4
        # if "Prior Sets (0.5 weight)" in data.columns:
        #     data["Prior Sets (0.5 weight)"] = np.nan
    # sort array by Score column
    data = data.sort_values(by='Score', ascending=False)

    data.reset_index(drop=True, inplace=True)

    # replace column '' with count/rank
    data[''] = np.arange(1, 1 + len(data))

    # if Score exists, round to 2 decimals
    if "Score" in data.columns:
        data["Score"] = np.round(np.array(data["Score"].values).astype(float), 2)
    if "Average" in data.columns:
        data["Average"] = np.round(np.array(data["Average"].values).astype(float), 1)
    # round all others to 1 decimal
    for col in data.columns:
        if col not in ["", "Model", "Model Type", "Score", "Average"]:
            # replace any data[col].values == '' with np.nan
            data[col] = data[col].replace('', np.nan)
            data[col] = np.round(np.array(data[col].values).astype(float), 1)
    if style:
        # apply color to the 'Model Type' column and number formatting for display
        data = color_model_type_column(data, color_map)

    return data
|
287 |
+
|
288 |
+
# Count of models on the fully unfiltered leaderboard (interpolated into the page header).
total_models = len(regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False).values)
|
291 |
+
|
292 |
+
# Build the Gradio UI. Each leaderboard tab keeps a hidden, unfiltered Dataframe as
# the stable data source for its regex_table callbacks, plus a visible, filtered copy.
with gr.Blocks(css=custom_css) as app:
    # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
    with gr.Row():
        with gr.Column(scale=6):
            gr.Markdown(TOP_TEXT.format(str(total_models)))
        with gr.Column(scale=4):
            # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
            # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
            # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
            gr.Markdown("""
            
            """)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("π RewardBench Leaderboard"):
            with gr.Row():
                search_1 = gr.Textbox(label="Model Search (delimit with , )",
                                      placeholder="Model Search (delimit with , )",
                                      show_label=False)
                model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "Prior Sets"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                # reference data: hidden, unfiltered table used as input to the filter callbacks
                rewardbench_table_hidden = gr.Dataframe(
                    rewardbench_data_avg.values,
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    visible=False,
                )
                rewardbench_table = gr.Dataframe(
                    regex_table(rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"]),
                    datatype=col_types_rewardbench_avg,
                    headers=rewardbench_data_avg.columns.tolist(),
                    elem_id="rewardbench_dataframe_avg",
                    height=1000,
                )

        with gr.TabItem("π RewardBench - Detailed"):
            with gr.Row():
                search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 value=["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include."
                                                 )
            with gr.Row():
                # ref data
                rewardbench_table_detailed_hidden = gr.Dataframe(
                    rewardbench_data.values,
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    visible=False,
                )
                rewardbench_table_detailed = gr.Dataframe(
                    regex_table(rewardbench_data.copy(), "", ["Seq. Classifiers", "DPO", "Generative", "Custom Classifiers"]),
                    datatype=col_types_rewardbench,
                    headers=rewardbench_data.columns.tolist(),
                    elem_id="rewardbench_dataframe",
                    height=1000,
                )
        # with gr.TabItem("rewardbench Eval Set - Length Bias"):
        #     with gr.Row():
        #         # backup
        #         rewardbench_table_len_hidden = gr.Dataframe(
        #             rewardbench_data_length.values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             visible=False,
        #         )
        #         rewardbench_table_len = gr.Dataframe(
        #             regex_table(rewardbench_data_length.copy(), "", False).values,
        #             datatype=cols_rewardbench_data_length,
        #             headers=rewardbench_data_length.columns.tolist(),
        #             elem_id="rewardbench_dataframe_length",
        #             height=1000,
        #         )
        with gr.TabItem("Prior Test Sets"):
            with gr.Row():
                search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
                                                 value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                                 label="Model Types",
                                                 show_label=False,
                                                 # info="Which model types to include.",
                                                 )
            with gr.Row():
                PREF_SET_TEXT = """
For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
"""
                gr.Markdown(PREF_SET_TEXT)
            with gr.Row():
                # backup
                pref_sets_table_hidden = gr.Dataframe(
                    prefs_data.values,
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    visible=False,
                )
                pref_sets_table = gr.Dataframe(
                    regex_table(prefs_data.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers"]),
                    datatype=col_types_prefs,
                    headers=prefs_data.columns.tolist(),
                    elem_id="prefs_dataframe",
                    height=1000,
                )


        with gr.TabItem("About"):
            with gr.Row():
                gr.Markdown(ABOUT_TEXT)

        with gr.TabItem("Dataset Viewer"):
            with gr.Row():
                # loads one sample
                gr.Markdown("""## Random Dataset Sample Viewer
Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
                subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                button = gr.Button("Show Random Sample")

            with gr.Row():
                sample_display = gr.Markdown("{sampled data loads here}")

            button.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
    # removed plot because not pretty enough
    # with gr.TabItem("Model Correlation"):
    #     with gr.Row():
    #         plot = plot_avg_correlation(rewardbench_data_avg, prefs_data)
    #         gr.Plot(plot)

    # Wire the search boxes and model-type checkboxes to re-filter their tables;
    # each callback reads from the hidden (unfiltered) table and writes the visible one.
    search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    # search.change(regex_table, inputs=[rewardbench_table_len_hidden, search, filter_button], outputs=rewardbench_table_len)
    search_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

    model_types_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
    model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
    model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)

    with gr.Row():
        with gr.Accordion("π Citation", open=False):
            citation_button = gr.Textbox(
                value=r"""@misc{RewardBench,
    title={RewardBench: Evaluating Reward Models for Language Modeling},
    author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
    year={2024},
    howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}
}""",
                lines=7,
                label="Copy the following to cite these results.",
                elem_id="citation-button",
                show_copy_button=True,
            )
|
448 |
+
# Load data when app starts, TODO make this used somewhere...
# def load_data_on_start():
#     data_rewardbench = load_all_data(repo_dir_rewardbench)
#     rewardbench_table.update(data_rewardbench)

#     data_rewardbench_avg = avg_over_rewardbench(repo_dir_rewardbench)
#     rewardbench_table.update(data_rewardbench_avg)

#     data_prefs = load_all_data(repo_dir_prefs)
#     pref_sets_table.update(data_prefs)

# Periodically restart the Space so freshly uploaded eval results are re-downloaded.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=10800)  # restarted every 3h
scheduler.start()
app.launch(allowed_paths=['src/'])  # had .queue() before launch before... not sure if that's necessary
|