Commit 9da9431
Parent: ad604a4

refine leaderboard

- app.py +3 -2
- content.py +7 -4
app.py CHANGED
@@ -8,7 +8,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 
 # InfoStrings
 from scorer import question_scorer
-from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT,
+from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, LEADERBOARD_HTML, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION
 
 TOKEN = os.environ.get("TOKEN", None)
 
@@ -58,7 +58,8 @@ with demo:
         elem_id="citation-button",
         lines=10,
     )
-
+    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
+    gr.HTML(LEADERBOARD_HTML)
     with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
         human_leaderboard_table_test = gr.components.Dataframe(
             value=human_eval_dataframe_test, datatype=TYPES, interactive=False,
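For context, a minimal runnable sketch of what the changed section of app.py does after this commit: the leaderboard blurb is rendered as raw HTML rather than Markdown. The placeholder string values and the bare Blocks layout are assumptions for illustration, not code from this repo.

import gradio as gr

# Hypothetical stand-ins for the constants imported from content.py.
LEADERBOARD_TEXT = "### Leaderboard\nWe maintain two leaderboards ..."
LEADERBOARD_HTML = "<h3>Leaderboard</h3><p>We maintain two leaderboards ...</p>"

with gr.Blocks() as demo:
    # The commit keeps the Markdown call around, commented out, and renders
    # the same copy as HTML instead, which allows arbitrary markup and styling.
    # gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")
    gr.HTML(LEADERBOARD_HTML)

if __name__ == "__main__":
    demo.launch()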
content.py CHANGED
@@ -9,11 +9,14 @@ LINKS = """
 """
 
 INTRODUCTION_TEXT = """
-Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
-
-
+Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
+Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
+"""
 
-
+LEADERBOARD_TEXT = """
+### Leaderboard
+We maintain two leaderboards—one for automated evaluation and another for human evaluation.
+All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
 """
 
 SUBMISSION_TEXT = """