Commit 9da9431
Parent: ad604a4

refine leaderboard

- app.py +3 -2
- content.py +7 -4
app.py CHANGED
@@ -8,7 +8,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 
 # InfoStrings
 from scorer import question_scorer
-from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT,
+from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, LEADERBOARD_HTML, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -58,7 +58,8 @@ with demo:
         elem_id="citation-button",
         lines=10,
     )
-
+    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
+    gr.HTML(LEADERBOARD_HTML)
     with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
         human_leaderboard_table_test = gr.components.Dataframe(
             value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
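For context, a minimal runnable sketch of what the changed section of app.py does after this commit: the leaderboard blurb is rendered as raw HTML rather than Markdown. The placeholder string values and the bare Blocks layout are assumptions for illustration, not code from this repo.

import gradio as gr

# Hypothetical stand-ins for the constants imported from content.py.
LEADERBOARD_TEXT = "### Leaderboard\nWe maintain two leaderboards ..."
LEADERBOARD_HTML = "<h3>Leaderboard</h3><p>We maintain two leaderboards ...</p>"

with gr.Blocks() as demo:
    # The commit keeps the Markdown call around, commented out, and renders
    # the same copy as HTML instead, which allows arbitrary markup and styling.
    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
    gr.HTML(LEADERBOARD_HTML)

if __name__ == "__main__":
    demo.launch()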
content.py CHANGED
@@ -9,11 +9,14 @@ LINKS = """
 """
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
-
-
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
+Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
+"""
 
-
+LEADERBOARD_TEXT = """
+### Leaderboard
+We maintain two leaderboards—one for automated evaluation and another for human evaluation.
+All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
 """
 
 SUBMISSION_TEXT = """