import os
import json

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
from apscheduler.schedulers.background import BackgroundScheduler

from scorer import question_scorer
from content import format_error, format_warning, format_log, TITLE, LINKS, INTRODUCTION_TEXT, LEADERBOARD_TEXT, CITATION_BUTTON_LABEL, EVALUATION_DETAILS, CITATION_BUTTON_TEXT, model_hyperlink, SUBMIT_INTRODUCTION

TOKEN = os.environ.get("TOKEN", None)
OWNER = "Online-Mind2Web"
YEAR_VERSION = "2025"
LOCAL_DEBUG = True


def get_dataframe_from_results(eval_path):
    """Load a leaderboard CSV, sort by Average SR, and format score columns to one decimal."""
    df = pd.read_csv(eval_path)
    df = df.sort_values(by=["Average SR"], ascending=False)
    for format_column in ['Easy', 'Medium', 'Hard', 'Average SR']:
        df[format_column] = df[format_column].map('{:.1f}'.format)
    return df


# auto_eval_dataframe_test = get_dataframe_from_results('./auto_Mind2Web-Online - Leaderboard_data.csv')
auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')

TYPES = ["str", "str", "str", "str", "number", "number", "number", "number", "str"]


def refresh():
    """Reload both leaderboard CSVs and return the refreshed dataframes."""
    auto_eval_dataframe_test = get_dataframe_from_results('./auto_o4-mini_Mind2Web-Online - Leaderboard_data.csv')
    human_eval_dataframe_test = get_dataframe_from_results('./human_Mind2Web-Online - Leaderboard_data.csv')
    return auto_eval_dataframe_test, human_eval_dataframe_test


def plot_heatmap_with_performance_bar(json_file):
    """Build a task-level success heatmap plus a summary bar chart from the human-label JSON."""
    with open(json_file, "r") as f:
        data = json.load(f)

    # Agent columns are identified by the "_human_label" suffix.
    agents = [k for k in data[0].keys() if k.endswith("_human_label")]
    records = []
    original_ids = [task["task_id"] for task in data]

    # Flatten the per-task labels into (task, agent, success) records, coercing labels to 0/1.
    for task in data:
        task_id = task["task_id"]
        for agent in agents:
            raw_val = task.get(agent, "0")
            try:
                val = int(raw_val)
            except ValueError:
                val = 0
            val = 1 if val == 1 else 0
            records.append({
                "Task ID": task_id,
                "Agent": agent.replace("_human_label", ""),
                "Success": val
            })

    df = pd.DataFrame(records)
    pivot = df.pivot_table(index="Agent", columns="Task ID", values="Success", aggfunc="max")

    # Ensure every task appears as a column (missing tasks count as failures) and keep the original order.
    for task_id in original_ids:
        if task_id not in pivot.columns:
            pivot[task_id] = 0
    pivot = pivot[original_ids]

    # Sort agents by overall success rate (highest first).
    agent_success_rate = pivot.sum(axis=1) / pivot.shape[1]
    pivot["SuccessRate"] = agent_success_rate
    pivot = pivot.sort_values(by="SuccessRate", ascending=False)
    pivot = pivot.drop(columns=["SuccessRate"])

    # Map internal agent identifiers to display names and append each agent's success rate.
    agent_name_map = {
        "Operator": "Operator",
        "Agent-E": "Agent-E",
        "Browser_Use": "Browser Use",
        "Claude_Computer_Use": "Claude Computer Use",
        "SeeAct": "SeeAct"
    }
    sorted_agents = pivot.index.tolist()
    pivot.index = [
        f"{agent_name_map.get(agent, agent)} ({agent_success_rate[agent] * 100:.1f}%)"
        for agent in sorted_agents
    ]

    custom_labels = [["Success" if val == 1 else "Failure" for val in row] for row in pivot.values]

    # Tasks solved by at least one agent vs. tasks solved by the single best agent.
    any_agent_solved = pivot.max(axis=0).sum()
    best_agent_solved = pivot.sum(axis=1).max()
    total_tasks = len(original_ids)

    fig = make_subplots(
        rows=2, cols=1,
        row_heights=[0.8, 0.2],
        vertical_spacing=0.08,
        subplot_titles=("TASK ID", ""),
        shared_xaxes=False
    )

    # Top panel: per-task success heatmap.
    fig.add_trace(go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale=[[0, "white"], [1, "skyblue"]],
        zmin=0, zmax=1,
        showscale=False,
        customdata=custom_labels,
        hovertemplate="Agent: %{y}<br>Task ID: %{x}<br>Completion: %{customdata}"
    ), row=1, col=1)

    # Bottom panel: "any agent" vs. "best agent" coverage bars.
    fig.add_trace(go.Bar(
        y=["Any agent", "Best agent"],
        x=[any_agent_solved, best_agent_solved],
        orientation='h',
        marker_color=["dodgerblue", "mediumseagreen"],
        text=[
            f"{int(any_agent_solved)}/{total_tasks} ({any_agent_solved / total_tasks:.1%})",
            f"{int(best_agent_solved)}/{total_tasks} ({best_agent_solved / total_tasks:.1%})"
        ],
        textposition="auto",
        showlegend=False
    ), row=2, col=1)

    # Invisible scatter traces used only to populate the Success/Failure legend.
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='skyblue'),
        name='Success'
    ))
    fig.add_trace(go.Scatter(
        x=[None], y=[None],
        mode='markers',
        marker=dict(size=10, color='white', line=dict(width=1, color='black')),
        name='Failure'
    ))

    fig.update_xaxes(range=[0, total_tasks], row=2, col=1)
    fig.update_layout(
        height=600,
        xaxis=dict(showticklabels=False),
        yaxis=dict(title="Agent"),
        yaxis2=dict(title=""),
        margin=dict(t=60)
    )
    return fig


def gradio_plot_wrapper(json_file):
    """Wrapper for file-upload components, which pass a tempfile object rather than a path."""
    return plot_heatmap_with_performance_bar(json_file.name)


demo = gr.Blocks(css="""#human-leaderboard-table { width: auto; min-width: calc(100% + 20px); }""")
with demo:
    gr.HTML(TITLE)
    gr.HTML(LINKS)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                elem_id="citation-button",
                lines=10,
            )

    gr.Markdown(LEADERBOARD_TEXT, elem_classes="markdown-text")

    with gr.Tab("Human Evaluation", elem_id="human-tab", id=1):
        human_leaderboard_table_test = gr.Dataframe(
            value=human_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )
        gr.Markdown("### Visualization")
        gr.Markdown("This figure presents a fine-grained heatmap illustrating task-level completion across different agents. Each row corresponds to a specific agent, and each column represents a task (identified by its task ID). Blue bars indicate successful completions, while white spaces denote failures. Any agent: a task is considered successful if at least one agent is able to complete it. (This style of visualization is inspired by [HAL](https://hal.cs.princeton.edu/).)")
        fig = plot_heatmap_with_performance_bar("./human_label.json")
        gr.Plot(fig)
        gr.Markdown(EVALUATION_DETAILS)

    with gr.Tab("Auto Evaluation", elem_id="auto-tab", id=2):
        auto_leaderboard_table_test = gr.Dataframe(
            value=auto_eval_dataframe_test,
            datatype=TYPES,
            interactive=False,
            wrap=False
        )

    with gr.Tab("Submission Guideline", elem_id="submit-tab", id=3):
        with gr.Row():
            gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")

    refresh_button = gr.Button("Refresh")
    refresh_button.click(
        refresh,
        inputs=[],
        outputs=[auto_leaderboard_table_test, human_leaderboard_table_test],
    )

# Background scheduler is started here; no jobs are registered in this script.
scheduler = BackgroundScheduler()
scheduler.start()

if __name__ == "__main__":
    demo.launch(debug=True)