WeijianQi1999 committed
Commit 1f3e8c7 · 1 Parent(s): 84c9c9b

update 0511

README.md CHANGED
@@ -1,6 +1,6 @@
 ---
 title: Online-Mind2Web Leaderboard
-emoji: 🏆
+emoji: 🌐
 colorFrom: yellow
 colorTo: indigo
 sdk: gradio
app.py CHANGED
@@ -23,12 +23,13 @@ def get_dataframe_from_results(eval_path):
     df[format_column] = df[format_column].map('{:.1f}'.format)
     return df
 
-auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+# auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
 human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
 TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]
 
 def refresh():
-    auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
+    auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
     human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
     return auto_eval_dataframe_test, human_eval_dataframe_test
 
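For context, the hunk above shows only the tail of the loader. A minimal sketch of what it plausibly looks like, assuming pandas and the column names from the CSVs added below; the sort key and the set of formatted columns are assumptions, not the Space's exact code:

```python
import pandas as pd

def get_dataframe_from_results(eval_path):
    # Load one leaderboard CSV shipped with the Space.
    df = pd.read_csv(eval_path)
    # Assumed: rank agents by average success rate, best first.
    df = df.sort_values(by='Average SR', ascending=False)
    # Render each score column with one decimal place, matching the
    # map('{:.1f}'.format) line visible in the hunk above.
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    return df
```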
auto_gpt4o_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,7 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,80.3,73.4,59,71.8,2025-3-22
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,65.1,36.1,18.5,39.8,2025-3-22
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,58.6,37.5,24.3,40.1,2025-3-22
+Claude Computer Use 3.5,claude-3-5-sonnet-20241022,Anthropic,OSU NLP,61.9,28.1,21.2,35.8,2025-3-22
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,57.4,31.9,14.4,34.7,2025-3-22
+Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,81.5,56.2,42,59.7,2025-4-20
auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv ADDED
@@ -0,0 +1,7 @@
+Agent,Model,Organization,Source,Easy,Medium,Hard,Average SR,Date
+Operator,OpenAI Computer-Using Agent,OpenAI,OSU NLP,73.5,59.4,39.2,58.3,2025-5-11
+SeeAct,gpt-4o-2024-08-06,OSU,OSU NLP,51.8,28,9.5,30,2025-5-11
+Browser Use,gpt-4o-2024-08-06,Browser Use,OSU NLP,44.6,23.1,10.8,26,2025-5-11
+Claude Computer Use 3.5,Claude-3-5-sonnet-20241022,Anthropic,OSU NLP,51.8,16.1,8.1,24,2025-5-11
+Agent-E,gpt-4o-2024-08-06,Emergence AI,OSU NLP,51.8,23.1,6.8,27,2025-5-11
+Claude Computer Use 3.7 (w/o thinking),Claude-3-7-sonnet-20250219,Anthropic,OSU NLP,75.9,41.3,27,47.3,2025-5-11
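Since the commit keeps both auto-eval files side by side, a quick throwaway snippet (not part of the Space; file paths as above) shows how much the judge backbone alone shifts the reported numbers:

```python
import pandas as pd

# Compare gpt-4o-judged vs. o4-mini-judged auto-eval results.
gpt4o = pd.read_csv('./auto_gpt4o_Mind2Web-Online - Leaderboard_data.csv')
o4mini = pd.read_csv('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')

merged = gpt4o.merge(o4mini, on='Agent', suffixes=('_gpt4o', '_o4mini'))
merged['delta'] = merged['Average SR_o4mini'] - merged['Average SR_gpt4o']
# Every agent scores lower under the o4-mini judge, e.g. Operator
# drops from 71.8 to 58.3.
print(merged[['Agent', 'Average SR_gpt4o', 'Average SR_o4mini', 'delta']])
```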
content.py CHANGED
@@ -61,6 +61,11 @@ You should use the script provided in our GitHub repository to obtain automatic
 To ensure the authenticity and reliability of the reported results, we will also conduct a verification of auto-eval results.
 If you have conducted your own human evaluation, please also attach your human-eval results. We will spot-check these before adding them to the human-eval table.
 
+## Important Notes for Reliable Evaluation:
+- To enable fair comparisons, please ensure that each task starts from the specified website in our benchmark. Starting from Google Search or an alternative website can lead agents to solve the task through different sites, changing the difficulty level and potentially skewing evaluation results.
+- The action history should contain only the actions the agent took to complete the task (e.g., clicking elements and typing text). Please avoid including the final response, as it may contain hallucinated content and lead to a high rate of false positives.
+- WebJudge powered by o4-mini demonstrates higher alignment with human judgment, achieving an average agreement rate of 85.7% and a narrow success rate gap of just 3.8%. Therefore, please use o4-mini as the backbone for automatic evaluation.
+
 ## ⚠ Please submit the trajectory file with the following format:
 The result of each task is stored in a folder named as its `task_id`, containing:
 - `trajectory/`: Stores screenshots of each step.
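As a hypothetical illustration of the submission layout described in the hunk above (only the `trajectory/` requirement is visible here; any further required files are cut off by the hunk), a submitter could sanity-check their folder tree with something like:

```python
from pathlib import Path

def check_submission(root: str) -> list[str]:
    """Flag task folders that are missing the trajectory/ directory."""
    problems = []
    for task_dir in Path(root).iterdir():
        if not task_dir.is_dir():
            continue  # each task's results live in a folder named by task_id
        if not (task_dir / 'trajectory').is_dir():
            problems.append(f'{task_dir.name}: missing trajectory/')
    return problems

print(check_submission('./my_agent_results'))
```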