natolambert committed on
Commit
c259566
·
1 Parent(s): f460af4
Files changed (3) hide show
  1. app.py +57 -44
  2. leaderboard/md.py +1 -1
  3. leaderboard/utils.py +6 -3
app.py CHANGED
@@ -12,6 +12,14 @@ from leaderboard.css import custom_css
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
 
 
 
 
 
 
 
 
15
  #######################################################
16
  # Setup #
17
  #######################################################
@@ -152,11 +160,10 @@ rewardbench_data_avg = avg_over_rewardbench_v2(rewardbench_data).sort_values(by=
152
  rewardbench_data = prep_df(rewardbench_data)
153
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
154
 
155
- col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data.columns) - 1)
156
- col_types_rewardbench_avg = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
157
 
158
- # get v1 data
159
- rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
 
160
 
161
  ###########################################
162
  # Leaderboard Helpers & Setting #
@@ -297,6 +304,11 @@ total_models = len(
297
  rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
298
  ).values
299
  )
 
 
 
 
 
300
  assets = Path("leaderboard").resolve() # absolute dir with the image
301
 
302
  # Using a string for a predefined color
@@ -352,7 +364,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
352
  # reference data
353
  rewardbench_table_hidden = gr.Dataframe(
354
  rewardbench_data_avg.values,
355
- datatype=col_types_rewardbench_avg,
356
  headers=rewardbench_data_avg.columns.tolist(),
357
  visible=False,
358
  )
@@ -362,7 +374,7 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
362
  "",
363
  ["Seq. Classifiers", "Custom Classifiers", "Generative"],
364
  ),
365
- datatype=col_types_rewardbench_avg,
366
  headers=rewardbench_data_avg.columns.tolist(),
367
  elem_id="rewardbench_dataframe_avg",
368
  max_height=800, # 800 px ≈ ~25 rows on default row-height
@@ -385,42 +397,42 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
385
  button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
386
  with gr.TabItem("RewardBench", scale=1.5):
387
  with gr.Row():
388
- gr.Markdown(CAPTION_V1.format(str(total_models)))
389
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
390
  with gr.TabItem("Leaderboard"):
391
  pass
392
- # with gr.Row():
393
- # search_1 = gr.Textbox(
394
- # label="Model Search (delimit with , )",
395
- # placeholder="Model Search (delimit with , )",
396
- # show_label=False,
397
- # )
398
- # model_types_1 = gr.CheckboxGroup(
399
- # ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
400
- # value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
401
- # label="Model Types",
402
- # show_label=False,
403
- # # info="Which model types to include.",
404
- # )
405
- # with gr.Row():
406
- # # reference data
407
- # rewardbench_table_hidden = gr.Dataframe(
408
- # rewardbench_data_avg.values,
409
- # datatype=col_types_rewardbench_avg,
410
- # headers=rewardbench_data_avg.columns.tolist(),
411
- # visible=False,
412
- # )
413
- # rewardbench_table = gr.Dataframe(
414
- # regex_table(
415
- # rewardbench_data_avg.copy(),
416
- # "",
417
- # ["Seq. Classifiers", "Custom Classifiers", "Generative"],
418
- # ),
419
- # datatype=col_types_rewardbench_avg,
420
- # headers=rewardbench_data_avg.columns.tolist(),
421
- # elem_id="rewardbench_dataframe_avg",
422
- # max_height=800, # 800 px β‰ˆ ~25 rows on default row-height
423
- # )
424
  with gr.TabItem("About"):
425
  with gr.Row():
426
  gr.Markdown(ABOUT_TEXT_V1)
@@ -433,19 +445,20 @@ with gr.Blocks(theme=theme, css=custom_css) as app:
433
  button_data_v1 = gr.Button("Show Random Sample")
434
 
435
  with gr.Row():
436
- sample_display = gr.Markdown("{sampled data loads here}")
437
-
438
- button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display])
439
 
 
440
 
441
 
442
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
443
- # search_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
444
 
445
  model_types_1.change(
446
  regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
447
  )
448
- # model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
 
 
449
 
450
  with gr.Row():
451
  with gr.Accordion("📚 Citation", open=False):
 
12
  from leaderboard.md import *
13
  from leaderboard.utils import load_all_data
14
 
15
+ # get v1 data
16
+ rb_orig_snapshot = pd.read_csv("leaderboard/final-rbv1-data.csv")
17
+ # rename column "Unnamed: 0" to ""
18
+ rb_orig_snapshot = rb_orig_snapshot.rename(columns={"Unnamed: 0": ""})
19
+ # rb_orig_snapshot = rb_orig_snapshot.drop(columns=["Unnamed: 0", ''])
20
+ rb_orig_snapshot.reset_index(drop=True, inplace=True)
21
+
22
+ # import ipdb; ipdb.set_trace()
23
  #######################################################
24
  # Setup #
25
  #######################################################
 
160
  rewardbench_data = prep_df(rewardbench_data)
161
  rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
162
 
 
 
163
 
164
+
165
+ col_types_rewardbench = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rewardbench_data_avg.columns) - 1)
166
+ col_types_rewardbench_v1 = ["number"] + ["markdown"] + ["str"] + ["number"] * (len(rb_orig_snapshot.columns) - 1)
167
 
168
  ###########################################
169
  # Leaderboard Helpers & Setting #
 
304
  rewardbench_data_avg.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
305
  ).values
306
  )
307
+ total_models_v1 = len(
308
+ regex_table(
309
+ rb_orig_snapshot.copy(), "", ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"], style=False
310
+ ).values
311
+ )
312
  assets = Path("leaderboard").resolve() # absolute dir with the image
313
 
314
  # Using a string for a predefined color
 
364
  # reference data
365
  rewardbench_table_hidden = gr.Dataframe(
366
  rewardbench_data_avg.values,
367
+ datatype=col_types_rewardbench_v1,
368
  headers=rewardbench_data_avg.columns.tolist(),
369
  visible=False,
370
  )
 
374
  "",
375
  ["Seq. Classifiers", "Custom Classifiers", "Generative"],
376
  ),
377
+ datatype=col_types_rewardbench_v1,
378
  headers=rewardbench_data_avg.columns.tolist(),
379
  elem_id="rewardbench_dataframe_avg",
380
  max_height=800, # 800 px ≈ ~25 rows on default row-height
 
397
  button_data.click(fn=random_sample, inputs=[subset_selector], outputs=[sample_display])
398
  with gr.TabItem("RewardBench", scale=1.5):
399
  with gr.Row():
400
+ gr.Markdown(CAPTION_V1.format(str(total_models_v1)))
401
  with gr.Tabs(elem_id="inner-tabs", elem_classes="tabs-small") as tabs:
402
  with gr.TabItem("Leaderboard"):
403
  pass
404
+ with gr.Row():
405
+ search_1_v1 = gr.Textbox(
406
+ label="Model Search (delimit with , )",
407
+ placeholder="Model Search (delimit with , )",
408
+ show_label=False,
409
+ )
410
+ model_types_1_v1 = gr.CheckboxGroup(
411
+ ["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative"],
412
+ value=["Seq. Classifiers", "Custom Classifiers", "Generative"],
413
+ label="Model Types",
414
+ show_label=False,
415
+ # info="Which model types to include.",
416
+ )
417
+ with gr.Row():
418
+ # reference data
419
+ rewardbench_table_hidden_v1 = gr.Dataframe(
420
+ rb_orig_snapshot.values,
421
+ datatype=col_types_rewardbench,
422
+ headers=rb_orig_snapshot.columns.tolist(),
423
+ visible=False,
424
+ )
425
+ rewardbench_table_v1 = gr.Dataframe(
426
+ regex_table(
427
+ rb_orig_snapshot.copy(),
428
+ "",
429
+ ["Seq. Classifiers", "Custom Classifiers", "Generative"],
430
+ ),
431
+ datatype=col_types_rewardbench,
432
+ headers=rb_orig_snapshot.columns.tolist(),
433
+ elem_id="rewardbench_dataframe_avg_v1",
434
+ max_height=800, # 800 px β‰ˆ ~25 rows on default row-height
435
+ )
436
  with gr.TabItem("About"):
437
  with gr.Row():
438
  gr.Markdown(ABOUT_TEXT_V1)
 
445
  button_data_v1 = gr.Button("Show Random Sample")
446
 
447
  with gr.Row():
448
+ sample_display_v1 = gr.Markdown("{sampled data loads here}")
 
 
449
 
450
+ button_data_v1.click(fn=random_sample_v1, inputs=[subset_selector_v1], outputs=[sample_display_v1])
451
 
452
 
453
  search_1.change(regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table)
454
+ search_1_v1.change(regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1)
455
 
456
  model_types_1.change(
457
  regex_table, inputs=[rewardbench_table_hidden, search_1, model_types_1], outputs=rewardbench_table
458
  )
459
+ model_types_1_v1.change(
460
+ regex_table, inputs=[rewardbench_table_hidden_v1, search_1_v1, model_types_1_v1], outputs=rewardbench_table_v1
461
+ )
462
 
463
  with gr.Row():
464
  with gr.Accordion("📚 Citation", open=False):
leaderboard/md.py CHANGED
@@ -112,7 +112,7 @@ CAPTION_V2 = f"""The *new version* of RewardBench that is based on unseen human
112
 
113
  CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
114
 
115
- **Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available [here](TODO).
116
 
117
  ⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300).
118
  """
 
112
 
113
  CAPTION_V1 = """The original RewardBench -- the first reward model evaluation.
114
 
115
+ **Note**: This leaderboard is frozen and will not be updated. The final version of the evaluation results are available in the source for this application.
116
 
117
  ⚠️ Many of the top models were trained on unintentionally contaminated, AI-generated data, for more information, see this [gist](https://gist.github.com/natolambert/1aed306000c13e0e8c5bc17c1a5dd300).
118
  """
leaderboard/utils.py CHANGED
@@ -6,7 +6,10 @@ import numpy as np
6
  import pandas as pd
7
  from datasets import load_dataset
8
 
9
- UNVERIFIED_MODELS = [
 
 
 
10
  "nvidia/Nemotron-4-340B-Reward",
11
  "nvidia/Llama3-70B-SteerLM-RM",
12
  "Cohere May 2024",
@@ -24,7 +27,8 @@ UNVERIFIED_MODELS = [
24
  "nvidia/Llama-3.1-Nemotron-70B-Reward",
25
  ]
26
 
27
- CONTAMINATED_MODELS = [
 
28
  "Skywork/Skywork-Reward-Gemma-2-27B",
29
  "Skywork/Skywork-Critic-Llama-3.1-70B",
30
  "LxzGordon/URM-LLaMa-3.1-8B",
@@ -39,7 +43,6 @@ CONTAMINATED_MODELS = [
39
  "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
40
  ]
41
 
42
-
43
  # From Open LLM Leaderboard
44
  def model_hyperlink(link, model_name):
45
  # if model_name is above 50 characters, return first 47 characters and "..."
 
6
  import pandas as pd
7
  from datasets import load_dataset
8
 
9
+ UNVERIFIED_MODELS = []
10
+ CONTAMINATED_MODELS = []
11
+
12
+ UNVERIFIED_MODELS_V1 = [
13
  "nvidia/Nemotron-4-340B-Reward",
14
  "nvidia/Llama3-70B-SteerLM-RM",
15
  "Cohere May 2024",
 
27
  "nvidia/Llama-3.1-Nemotron-70B-Reward",
28
  ]
29
 
30
+ # No longer used
31
+ CONTAMINATED_MODELS_V1 = [
32
  "Skywork/Skywork-Reward-Gemma-2-27B",
33
  "Skywork/Skywork-Critic-Llama-3.1-70B",
34
  "LxzGordon/URM-LLaMa-3.1-8B",
 
43
  "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
44
  ]
45
 
 
46
  # From Open LLM Leaderboard
47
  def model_hyperlink(link, model_name):
48
  # if model_name is above 50 characters, return first 47 characters and "..."