Commit · c259566
Parent(s): f460af4
works ish

Files changed:
- app.py +57 -44
- leaderboard/md.py +1 -1
- leaderboard/utils.py +6 -3
app.py
CHANGED

@@ -12,6 +12,14 @@ from leaderboard.css import custom_css
 from leaderboard.md import *
 from leaderboard.utils import load_all_data
 
+# get v1 data
+rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
+# rename column "Unnamed: 0" to ""
+rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
+# rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
+rb_orig_snapshot.reset_index(drop=True, inplace=True)
+
+# import ipdb; ipdb.set_trace()
 #######################################################
 # Setup                                               #
 #######################################################

@@ -152,11 +160,10 @@ rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by=
 rewardbench_data = prep_df(rewardbench_data)
 rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
 
-col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
-col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
 
-
-
+
+col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
+col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
 
 ###########################################
 # Leaderboard Helpers & Setting          #

@@ -297,6 +304,11 @@ total_models = len(
         rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
     ).values
 )
+total_models_v1 = len(
+    regex_table(
+        rb_orig_snapshot.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
+    ).values
+)
 assets = Path("leaderboard").resolve()  # absolute dir with the image
 
 # Using a string for a predefined color

@@ -352,7 +364,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
             # reference data
             rewardbench_table_hidden = gr.Dataframe(
                 rewardbench_data_avg.values,
-                datatype=col_types_rewardbench_avg,
+                datatype=col_types_rewardbench_v1,
                 headers=rewardbench_data_avg.columns.tolist(),
                 visible=False,
             )

@@ -362,7 +374,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
                     "",
                     ["Seq. Classifiers", "Custom Classifiers", "Generative"],
                 ),
-                datatype=col_types_rewardbench_avg,
+                datatype=col_types_rewardbench_v1,
                 headers=rewardbench_data_avg.columns.tolist(),
                 elem_id="rewardbench_dataframe_avg",
                 max_height=800,  # 800 px → ~25 rows on default row-height

@@ -385,42 +397,42 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
             button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
         with gr.TabItem("RewardBench", scale=1.5):
             with gr.Row():
-                gr.Markdown(CAPTION_V1.format(str(
+                gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
             with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
                 with gr.TabItem("Leaderboard"):
                     pass
-                    (32 blank lines removed)
+                    with gr.Row():
+                        search_1_v1 = gr.Textbox(
+                            label="Model Search (delimit with , )",
+                            placeholder="Model Search (delimit with , )",
+                            show_label=False,
+                        )
+                        model_types_1_v1 = gr.CheckboxGroup(
+                            ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
+                            value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
+                            label="Model Types",
+                            show_label=False,
+                            # info="Which model types to include.",
+                        )
+                    with gr.Row():
+                        # reference data
+                        rewardbench_table_hidden_v1 = gr.Dataframe(
+                            rb_orig_snapshot.values,
+                            datatype=col_types_rewardbench,
+                            headers=rb_orig_snapshot.columns.tolist(),
+                            visible=False,
+                        )
+                        rewardbench_table_v1 = gr.Dataframe(
+                            regex_table(
+                                rb_orig_snapshot.copy(),
+                                "",
+                                ["Seq. Classifiers", "Custom Classifiers", "Generative"],
+                            ),
+                            datatype=col_types_rewardbench,
+                            headers=rb_orig_snapshot.columns.tolist(),
+                            elem_id="rewardbench_dataframe_avg_v1",
+                            max_height=800,  # 800 px → ~25 rows on default row-height
+                        )
         with gr.TabItem("About"):
             with gr.Row():
                 gr.Markdown(ABOUT_TEXT_V1)

@@ -433,19 +445,20 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
             button_data_v1 = gr.Button("Show Random Sample")
 
             with gr.Row():
-
-
-                button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display])
+                sample_display_v1 = gr.Markdown("{sampled data loads here}")
 
+            button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
 
 
     search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
-
+    search_1_v1.change(regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1)
 
     model_types_1.change(
         regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
     )
-
+    model_types_1_v1.change(
+        regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
+    )
 
     with gr.Row():
         with gr.Accordion("📚 Citation", open=False):
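A note on the "Unnamed: 0" rename at the top of app.py: that column name is the usual artifact of writing a DataFrame to CSV with its index and then reading it back. A minimal sketch of the round trip and two common fixes (the snapshot.csv file name is illustrative; the commit takes the rename route):

import pandas as pd

df = pd.DataFrame({"Model": ["a", "b"], "Score": [0.9, 0.8]})
df.to_csv("snapshot.csv")  # to_csv writes the index by default

back = pd.read_csv("snapshot.csv")
print(back.columns.tolist())  # ['Unnamed: 0', 'Model', 'Score']

# Fix 1: read the stray first column back in as the index
back = pd.read_csv("snapshot.csv", index_col=0)

# Fix 2 (what app.py does): keep the column but blank out its header
back = pd.read_csv("snapshot.csv").rename(columns={"Unnamed: 0": ""})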
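The new v1 tab repeats the wiring pattern the app already uses for the v2 leaderboard: a hidden gr.Dataframe holds an unfiltered reference copy of the data, and the .change events on the search box and model-type checkboxes re-run regex_table against that copy to refresh the visible table, so successive filters never compound on an already-filtered view. The datatype list passed to gr.Dataframe assigns one renderer ("number", "markdown", "str") per column. A self-contained sketch of the pattern, with filter_table standing in for the app's regex_table (the filter logic below is an assumption, not the repo's implementation):

import gradio as gr
import pandas as pd

# Illustrative data; the real app loads leaderboard/final-rbv1-data.csv.
snapshot = pd.DataFrame(
    {
        "Model": ["model-a", "model-b", "model-c"],
        "Model Type": ["DPO", "Generative", "Seq. Classifiers"],
        "Score": [91.2, 88.7, 86.0],
    }
)

def filter_table(hidden_df, query, types):
    # Hypothetical stand-in for regex_table: comma-delimited search terms
    # are OR-ed into one regex, and rows are filtered by model type.
    df = hidden_df[hidden_df["Model Type"].isin(types)]
    if query:
        pattern = "|".join(term.strip() for term in query.split(","))
        df = df[df["Model"].str.contains(pattern, case=False, regex=True)]
    return df

with gr.Blocks() as demo:
    search = gr.Textbox(show_label=False, placeholder="Model Search (delimit with , )")
    types = gr.CheckboxGroup(
        ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
        value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
        show_label=False,
    )
    # Hidden, never-filtered reference copy; every event filters from here.
    hidden = gr.Dataframe(snapshot, visible=False)
    table = gr.Dataframe(
        filter_table(snapshot, "", types.value),
        datatype=["markdown", "str", "number"],  # one renderer per column
    )
    search.change(filter_table, inputs=[hidden, search, types], outputs=table)
    types.change(filter_table, inputs=[hidden, search, types], outputs=table)

demo.launch()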
leaderboard/md.py
CHANGED

@@ -112,7 +112,7 @@ CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human
 
 CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
 
-**Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available
+**Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.
 
 ⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300).
 """
leaderboard/utils.py
CHANGED

@@ -6,7 +6,10 @@ import numpy as np
 import pandas as pd
 from datasets import load_dataset
 
-UNVERIFIED_MODELS = [
+UNVERIFIED_MODELS = []
+CONTAMINATED_MODELS = []
+
+UNVERIFIED_MODELS_V1 = [
     "nvidia/Nemotron-4-340B-Reward",
     "nvidia/Llama3-70B-SteerLM-RM",
     "Cohere May 2024",

@@ -24,7 +27,8 @@ UNVERIFIED_MODELS = [
     "nvidia/Llama-3.1-Nemotron-70B-Reward",
 ]
 
-CONTAMINATED_MODELS = [
+# No longer used
+CONTAMINATED_MODELS_V1 = [
     "Skywork/Skywork-Reward-Gemma-2-27B",
     "Skywork/Skywork-Critic-Llama-3.1-70B",
     "LxzGordon/URM-LLaMa-3.1-8B",

@@ -39,7 +43,6 @@
     "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
 ]
 
-
 # From Open LLM Leaderboard
 def model_hyperlink(link, model_name):
     # if model_name is above 50 characters, return first 47 characters and "..."
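The utils.py change empties UNVERIFIED_MODELS and CONTAMINATED_MODELS for the new leaderboard while preserving the old lists under *_V1 names. Lists like these are typically consumed while prepping the display dataframe, for example to badge flagged rows rather than drop them; a hedged sketch of that kind of consumer (flag_v1_models is a hypothetical helper, not a function in this repo):

import pandas as pd

from leaderboard.utils import CONTAMINATED_MODELS_V1, UNVERIFIED_MODELS_V1

def flag_v1_models(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper: annotate model names so the UI can surface
    # unverified results and the contamination warning from CAPTION_V1
    # without removing the rows themselves.
    df = df.copy()
    df.loc[df["Model"].isin(UNVERIFIED_MODELS_V1), "Model"] += " *unverified*"
    df.loc[df["Model"].isin(CONTAMINATED_MODELS_V1), "Model"] += " ⚠️"
    return df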