Commit 1f3e8c7
Parent(s): 84c9c9b
update 0511

Files changed:
- README.md +1 -1
- app.py +3 -2
- auto_gpt4o_Mind2Web-Online - Leaderboard_data.csv +7 -0
- auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv +7 -0
- content.py +5 -0
README.md
CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Online-Mind2Web Leaderboard
-emoji:
+emoji: π
 colorFrom: yellow
 colorTo: indigo
 sdk: gradio
app.py
CHANGED
@@ -23,12 +23,13 @@ def get_dataframe_from_results(eval_path):
     df[format_column] = df[format_column].map('{:.1f}'.format)
     return df
 
-auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+# auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
 human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
 TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
 
 def refresh():
-    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
     return auto_eval_dataframe_test, human_eval_dataframe_test
 
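For orientation, here is a minimal sketch of the loading path this change touches: read one of the leaderboard CSVs added in this commit and format the score columns to one decimal place, mirroring the `map('{:.1f}'.format)` call visible in the hunk above. The standalone function name, the score-column list, and the `__main__` guard are assumptions for illustration; the Space's actual `get_dataframe_from_results` body is only partially shown in the diff.

```python
# Illustrative sketch only -- not the Space's actual get_dataframe_from_results.
# Column names come from the CSVs added in this commit; everything else is assumed.
import pandas as pd

SCORE_COLUMNS = ["Easy", "Medium", "Hard", "Average SR"]  # numeric columns in the new CSVs

def load_leaderboard(csv_path: str) -> pd.DataFrame:
    """Load a leaderboard CSV and render each score with one decimal place (e.g. 59 -> "59.0")."""
    df = pd.read_csv(csv_path)
    for col in SCORE_COLUMNS:
        df[col] = df[col].map('{:.1f}'.format)
    return df

if __name__ == "__main__":
    # The same file app.py now loads for the auto-eval table.
    print(load_leaderboard("./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv"))
```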
auto_gpt4o_Mind2Web-Online - Leaderboard_data.csv
ADDED
@@ -0,0 +1,7 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+Claude Computer Use 3.5,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
+Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,81.5,56.2,42,59.7,2025-4-20
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv
ADDED
@@ -0,0 +1,7 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,73.5,59.4,39.2,58.3,2025-5-11
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,51.8,28,9.5,30,2025-5-11
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,44.6,23.1,10.8,26,2025-5-11
+Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,OSU NLP,51.8,16.1,8.1,24,2025-5-11
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,51.8,23.1,6.8,27,2025-5-11
+Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,75.9,41.3,27,47.3,2025-5-11
content.py
CHANGED
@@ -61,6 +61,11 @@ You should use the script provided in our GitHub repository to obtain automatic
 To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
 If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.
 
+## Important Notes for Reliable Evaluation:
+- To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or alternative websites can lead agents to use different websites to solve the task, resulting in varying difficulty levels and potentially skewed evaluation results.
+- The action history should contain only the actions taken by the agent to complete the task (e.g., clicking elements and typing text). Please avoid including the final response, as it may contain hallucinated content, leading to a high rate of false positives.
+- WebJudge powered by o4-mini demonstrates higher alignment with human judgment, achieving an average agreement rate of 85.7% and a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
 ## β Please submit the trajectory file with the following format:
 The result of each task is stored in a folder named as its `task_id`, containing:
 - `trajectory/`: Stores screenshots of each step.
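As a companion to the submission format described in the content.py addition above, here is a small hypothetical layout check. Only the `trajectory/` screenshots folder appears in this hunk, so the helper name, the results root, and the `.png` extension are assumptions; the full required layout is defined in the Online-Mind2Web repository.

```python
# Hypothetical check (assumed helper name, root path, and file extension) for the
# submission layout described above: one folder per task_id, each containing a
# trajectory/ directory with a screenshot for every step.
from pathlib import Path

def check_submission(root: str) -> None:
    for task_dir in sorted(Path(root).iterdir()):
        if not task_dir.is_dir():
            continue  # ignore stray files at the top level
        trajectory = task_dir / "trajectory"
        screenshots = sorted(trajectory.glob("*.png")) if trajectory.is_dir() else []
        status = f"{len(screenshots)} screenshots" if screenshots else "missing trajectory/ screenshots"
        print(f"{task_dir.name}: {status}")

if __name__ == "__main__":
    check_submission("./results")  # assumed root folder holding one directory per task_id
```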