WeijianQi1999 committed
Commit 9da9431 · 1 Parent(s): ad604a4

refine leaderboard

Files changed (2):
  1. app.py +3 -2
  2. content.py +7 -4
app.py CHANGED
@@ -8,7 +8,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 
 # InfoStrings
 from scorer import question_scorer
-from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, SUBMISSION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
+from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, LEADERBOARD_HTML, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -58,7 +58,8 @@ with demo:
             elem_id="citation-button",
             lines=10,
         )
-
+        # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
+        gr.HTML(LEADERBOARD_HTML)
     with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
         human_leaderboard_table_test = gr.components.Dataframe(
             value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
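For context on the app.py hunk above: the commit replaces a blank line with a `gr.HTML(LEADERBOARD_HTML)` call and leaves a Markdown variant commented out. The sketch below is only an illustration of how the two Gradio calls compare inside a Blocks layout; the placeholder strings and the surrounding scaffolding are assumptions, not the repository's actual code.

```python
# Illustrative sketch only -- not the repository's app.py. It shows the two
# rendering options touched by this commit: gr.Markdown (Markdown string) vs.
# gr.HTML (raw HTML string). The placeholder values below are assumptions.
import gradio as gr

LEADERBOARD_TEXT = "### Leaderboard\nWe maintain two leaderboards."            # placeholder Markdown
LEADERBOARD_HTML = "<h3>Leaderboard</h3><p>We maintain two leaderboards.</p>"  # placeholder HTML

with gr.Blocks() as demo:
    # Markdown variant (the line left commented out in the diff):
    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
    # HTML variant (the line this commit enables):
    gr.HTML(LEADERBOARD_HTML)

if __name__ == "__main__":
    demo.launch()
```

The practical difference is that `gr.HTML` renders the string verbatim as HTML, while `gr.Markdown` renders it as Markdown with the component's default styling.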
content.py CHANGED
@@ -9,11 +9,14 @@ LINKS = """
 """
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains. Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
-
-## Leaderboard
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
+Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
+"""
 
-We maintain two leaderboards: one for automated evaluation, conducted internally using participant-submitted trajectories, and another for human evaluation—agents will be included in the human-eval leaderboard after submitted results successfully pass our validation process.
+LEADERBOARD_TEXT = """
+### Leaderboard
+We maintain two leaderboards—one for automated evaluation and another for human evaluation.
+All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
 """
 
 SUBMISSION_TEXT = """
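
One thing the content.py hunk does not show is the definition of `LEADERBOARD_HTML`, even though app.py now imports and renders it. Purely as an assumption about what such a constant might contain (the actual markup in the repository may differ), a minimal sketch:

```python
# Hypothetical sketch: LEADERBOARD_HTML is imported by app.py but not defined in
# the hunk shown above, so this markup is an assumption, not the committed value.
LEADERBOARD_HTML = """
<h3>Leaderboard</h3>
<p>We maintain two leaderboards: one for automated evaluation and another for human evaluation.</p>
<p>All submissions will be auto-evaluated internally, and if human evaluation results are provided,
a subset will be selected for rigorous spot-check verification.</p>
"""
```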