Lakoc committed
Commit: 7fc9a28
Parent: 84a47e4

Leaderboard split into 4 categories, updates of the logic and GT added, simplified CER for Mandarin
Files changed:
- app.py (+18 -12)
- content.py (+8 -1)
- leaderboard_server.py (+172 -73)
- references/multi_channel_gt_diar/aishell4.json (+0 -0)
- references/multi_channel_gt_diar/alimeeting.json (+0 -0)
- references/multi_channel_gt_diar/ami-mdm.json (+0 -0)
- references/multi_channel_gt_diar/chime6-mdm.json (+0 -0)
- references/multi_channel_gt_diar/notsofar1-small-mdm.json (+0 -0)
- references/multi_channel_real_diar/aishell4.json (+0 -0)
- references/multi_channel_real_diar/alimeeting.json (+0 -0)
- references/multi_channel_real_diar/ami-mdm.json (+0 -0)
- references/multi_channel_real_diar/chime6-mdm.json (+0 -0)
- references/multi_channel_real_diar/notsofar1-small-mdm.json (+0 -0)
- requirements.txt (+2 -1)
- tasks_metadata.json (+6 -0)
app.py CHANGED

@@ -3,7 +3,7 @@ import os
 import gradio as gr
 from gradio_modal import Modal
 
-from content import HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN, ADDITIONAL_NOTES_MARKDOWN
+from content import HEADER_MARKDOWN, LEADERBOARD_TAB_TITLE_MARKDOWN, SUBMISSION_TAB_TITLE_MARKDOWN, ADDITIONAL_NOTES_MARKDOWN, LEADERBOARD_CSS
 from leaderboard_server import LeaderboardServer
 
 # Initialize server and task list
@@ -23,11 +23,11 @@ def update_datasets(task):
     return gr.CheckboxGroup(choices=get_datasets_for_task(task), value=get_datasets_for_task(task))
 
 
-def submit_model(task, datasets, hyp_file, submitted_by, model_id, token, normalize):
+def submit_model(task, datasets, hyp_file, submitted_by, model_id, model_link_input, token, normalize):
     if not hyp_file:
         return gr.update(visible=True, value="⚠️ Please upload a hypothesis file.")
 
-    if not submitted_by.strip() or not model_id.strip() or not token.strip():
+    if not submitted_by.strip() or not model_id.strip() or not model_link_input.strip() or not token.strip():
         return gr.update(visible=True, value="⚠️ All fields are required.")
 
     if token.strip() != EXPECTED_TOKEN:
@@ -36,10 +36,11 @@ def submit_model(task, datasets, hyp_file, submitted_by, model_id, token, normal
     metadata = {
         "submitted_by": submitted_by.strip(),
         "model_id": model_id.strip(),
+        "model_link": model_link_input.strip(),
         "normalize": normalize  # Include normalization info in metadata if needed
     }
 
-    leaderboard_df = server.get_leaderboard()
+    leaderboard_df = server.get_leaderboard(task)
     if len(leaderboard_df) > 0:
         existing = leaderboard_df[
             (leaderboard_df["Submitted by"] == submitted_by.strip()) &
@@ -52,27 +53,30 @@ def submit_model(task, datasets, hyp_file, submitted_by, model_id, token, normal
         server.prepare_model_for_submission(
             hyp_file.name, metadata, task, datasets, normalize=normalize
         )
-        server.update_leaderboard()
         return gr.update(visible=True, value="✅ Submission successful!")
     except Exception as e:
         print(e)
         return gr.update(visible=True, value=f"❌ Error: {str(e)}")
 
-def get_leaderboard_df():
-    return server.get_leaderboard()
+def get_leaderboard_df(task):
+    return server.get_leaderboard(task)
 
 # Gradio UI
-with gr.Blocks() as demo:
+with gr.Blocks(css=LEADERBOARD_CSS) as demo:
     gr.Markdown(HEADER_MARKDOWN)
 
     with gr.Tabs(selected=0) as tabs:
         with gr.Tab("📈 Leaderboard"):
             gr.Markdown(LEADERBOARD_TAB_TITLE_MARKDOWN)
-            ...
+            leaderboard_task_dropdown = gr.Dropdown(choices=TASKS, value=TASKS[0], label="Select Task for Leaderboard")
+            leaderboard_output = gr.components.Dataframe(
+                datatype=["markdown", "markdown", "float", "float", "float", "float", "float", "float"],
+                value=lambda: get_leaderboard_df(TASKS[0]),
                 interactive=False,
                 label="Leaderboard"
             )
+            leaderboard_task_dropdown.change(fn=get_leaderboard_df, inputs=leaderboard_task_dropdown,
+                                             outputs=leaderboard_output)
 
         with gr.Tab("📤 Submit"):
             gr.Markdown(SUBMISSION_TAB_TITLE_MARKDOWN)
@@ -85,6 +89,7 @@ with gr.Blocks() as demo:
             with gr.Row():
                 submitted_by_input = gr.Text(label="Submitted by")
                 model_id_input = gr.Text(label="Model Identifier")
+                model_link_input = gr.Text(label="Model Link", placeholder="Link to model or code repository")
                 token_input = gr.Text(label="Submission Token", type="password")
 
             hyp_file_upload = gr.File(label="Upload Hypothesis JSON", file_types=[".json"])
@@ -103,10 +108,11 @@ with gr.Blocks() as demo:
             ).then(
                 fn=submit_model,
                 inputs=[task_dropdown, dataset_checkboxes, hyp_file_upload,
-                        submitted_by_input, model_id_input, token_input, normalize_checkbox],
+                        submitted_by_input, model_id_input, model_link_input, token_input, normalize_checkbox],
                 outputs=[feedback_text],
             ).then(
-                lambda: ...
+                fn=lambda task: get_leaderboard_df(task),
+                inputs=task_dropdown,
                 outputs=leaderboard_output
             )
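Note on the change above: the leaderboard tab is now driven by a task dropdown whose change event re-queries the server. A minimal, self-contained sketch of that dropdown-to-Dataframe refresh pattern follows; the task names mirror tasks_metadata.json, while the stand-in get_leaderboard_df and its dummy row are illustrative only (the real app calls server.get_leaderboard(task)).

import gradio as gr
import pandas as pd

TASKS = ["single_channel_gt_diar", "single_channel_real_diar",
         "multi_channel_gt_diar", "multi_channel_real_diar"]

def get_leaderboard_df(task: str) -> pd.DataFrame:
    # Stand-in for server.get_leaderboard(task); returns a dummy row for the selected task.
    return pd.DataFrame([{"Model ID": "demo-model", "Submitted by": "demo", "Task": task}])

with gr.Blocks() as demo:
    task_dropdown = gr.Dropdown(choices=TASKS, value=TASKS[0], label="Select Task for Leaderboard")
    table = gr.Dataframe(value=get_leaderboard_df(TASKS[0]), interactive=False, label="Leaderboard")
    # Re-render the table whenever the selected task changes.
    task_dropdown.change(fn=get_leaderboard_df, inputs=task_dropdown, outputs=table)

if __name__ == "__main__":
    demo.launch()

The same function supplies both the initial table value and the change handler, which is the wiring the diff above uses for app.py.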
content.py CHANGED

@@ -10,7 +10,9 @@ Welcome to the official leaderboard for benchmarking **multi-talker ASR systems*
 LEADERBOARD_TAB_TITLE_MARKDOWN = """
 ## Leaderboard
 
-Below you’ll find the latest results submitted to the benchmark. Models are evaluated using **`meeteval`** with **TCP-WER [%] (collar=5s)**.
+Below you’ll find the latest results submitted to the benchmark. Models are evaluated using **`meeteval`** with **TCP-WER [%] (collar=5s)**.
+
+For AISHELL-4 and AliMeeting conversion to simplified Mandarin is applied, and tcpCER [%] is used.
 """
 
 SUBMISSION_TAB_TITLE_MARKDOWN = """
@@ -43,3 +45,8 @@ You can choose to disable this using the checkbox above.
 """
 
 
+LEADERBOARD_CSS = """
+#leaderboard-table th .header-content {
+    white-space: nowrap;
+}
+"""
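The tcpCER note added above corresponds to the Mandarin handling in leaderboard_server.py: hypotheses and references are run through OpenCC and split into space-separated characters, so meeteval's word-level tcpWER over those tokens behaves like a character error rate. A small sketch of that preprocessing, reusing the same OpenCC config string as the diff; the sample sentence is made up.

import opencc

# Same converter configuration as in leaderboard_server.py.
converter = opencc.OpenCC('s2t.json')

def to_char_tokens(text: str) -> str:
    # Convert the script, then split into individual characters so that a
    # word-level tcpWER computed over these tokens is effectively a tcpCER.
    converted = converter.convert(text)
    return " ".join(list(converted))

print(to_char_tokens("欢迎参加会议"))  # made-up example utterance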
leaderboard_server.py CHANGED

@@ -1,110 +1,209 @@
 import json
-import os
+from pathlib import Path
+from typing import Dict, List
 
 import meeteval.io
 import pandas as pd
-from utils import calc_wer, aggregate_wer_metrics
-from txt_norm import get_text_norm
-
-
-
-TASKS_METADATA_PATH = os.path.abspath("tasks_metadata.json")
-
-def list_files(startpath):
-    for root, dirs, files in os.walk(startpath):
-        level = root.replace(startpath, '').count(os.sep)
-        indent = ' ' * 4 * (level)
-        print('{}{}/'.format(indent, os.path.basename(root)))
-        subindent = ' ' * 4 * (level + 1)
-        for f in files:
-            print('{}{}'.format(subindent, f))
+
+from txt_norm import get_text_norm
+from utils import calc_wer, aggregate_wer_metrics
+
 
 class LeaderboardServer:
-    ...
+    """Manages ASR model submissions and leaderboard generation."""
+
+    def __init__(self,
+                 reference_base_path: str = "references",
+                 tasks_metadata_path: str = "tasks_metadata.json",
+                 local_leaderboard_path: str = "submissions"):
+        """Initialize the leaderboard server.
+
+        Args:
+            reference_base_path: Base path for reference files
+            tasks_metadata_path: Path to tasks metadata JSON file
+            local_leaderboard_path: Directory for storing submissions
+        """
+        self.reference_base_path = Path(reference_base_path).resolve()
+        self.tasks_metadata_path = Path(tasks_metadata_path).resolve()
+        self.local_leaderboard = Path(local_leaderboard_path).resolve()
+
+        # Load tasks metadata
+        self.tasks_metadata = self._load_tasks_metadata()
+
+        # Initialize storage
+        self.local_leaderboard.mkdir(exist_ok=True)
         self.text_normalizer = get_text_norm("whisper_nsf")
 
-    def ...
-        ...
-        with open(results_path) as f:
-            ...
+    def _load_tasks_metadata(self) -> Dict:
+        """Load tasks metadata from JSON file."""
+        try:
+            with open(self.tasks_metadata_path) as f:
+                return json.load(f)["tasks"]
+        except (FileNotFoundError, KeyError, json.JSONDecodeError) as e:
+            raise ValueError(f"Failed to load tasks metadata: {e}")
+
+    def _get_results_file_path(self, task: str) -> Path:
+        """Get the path to the results file for a specific task."""
+        return self.local_leaderboard / f"{task}_results.json"
+
+    def _create_submission_id(self, metadata: Dict[str, str]) -> str:
+        """Create a unique submission ID from metadata."""
+        return f"{metadata['submitted_by']}_{metadata['model_id']}"
+
+    def _normalize_text_if_needed(self, segment: Dict, normalize: bool) -> Dict:
+        """Apply text normalization to a segment if requested."""
+        if normalize:
+            return {**segment, "words": self.text_normalizer(segment["words"])}
+        return segment
+
+    def _evaluate_dataset(self,
+                          hyp_seglst,
+                          ref_seglst,
+                          normalize: bool = False) -> Dict:
+        """Evaluate WER for a single dataset."""
+        # Apply normalization if requested
+        if normalize:
+            ref_seglst = ref_seglst.map(lambda seg: self._normalize_text_if_needed(seg, True))
+            hyp_seglst = hyp_seglst.map(lambda seg: self._normalize_text_if_needed(seg, True))
+
+        # Calculate WER metrics
+        per_session_wers = calc_wer(
+            tcp_hyp_seglst=hyp_seglst,
+            ref_seglst=ref_seglst,
+            collar=5,
+            metrics_list=["tcp_wer"]
+        )
+
+        return aggregate_wer_metrics(per_session_wers, ["tcp_wer"])
+
+    def _load_existing_results(self, task: str) -> Dict:
+        """Load existing results for a task, or return empty dict."""
+        results_path = self._get_results_file_path(task)
+        if results_path.exists():
+            with open(results_path) as f:
+                return json.load(f)
+        return {}
 
+    def _save_results(self, task: str, results: Dict) -> None:
+        """Save results to the task results file."""
+        results_path = self._get_results_file_path(task)
+        with open(results_path, "w") as f:
+            json.dump(results, f, indent=2)
+
+    def _save_hypothesis_file(self,
+                              task: str,
+                              submission_id: str,
+                              source_file: str) -> None:
+        """Save the hypothesis file for future reference."""
+        hyp_filename = f"{task}_{submission_id}_hyp.json"
+        hyp_filepath = self.local_leaderboard / hyp_filename
+
+        with open(hyp_filepath, "w") as out_f:
+            with open(source_file, "r") as in_f:
+                out_f.write(in_f.read())
+
+    def prepare_model_for_submission(self,
+                                     file: str,
+                                     metadata: Dict[str, str],
+                                     task: str,
+                                     datasets: List[str],
+                                     normalize: bool = False) -> None:
+        """Prepare and evaluate a model submission.
+
+        Args:
+            file: Path to the hypothesis file
+            metadata: Submission metadata containing 'submitted_by' and 'model_id'
+            task: Task name
+            datasets: List of dataset names to evaluate on
+            normalize: Whether to apply text normalization
+        """
+        submission_id = self._create_submission_id(metadata)
+
+        # Load hypothesis segments
         hyp_seglst = meeteval.io.load(file)
 
+        # Evaluate on each dataset
+        results = {}
         for dataset in datasets:
-            ref_path = ...
+            ref_path = self.reference_base_path / task / f"{dataset}.json"
+
+            if not ref_path.exists():
+                raise FileNotFoundError(f"Reference file not found: {ref_path}")
+
             ref_seglst = meeteval.io.load(ref_path)
             sessions = ref_seglst.unique('session_id')
+
+            # Filter hypotheses to match reference sessions
             local_hyps = hyp_seglst.filter(lambda seg: seg['session_id'] in sessions)
-            ref_seglst = ref_seglst.map(lambda seg: {**seg, "words":self.text_normalizer(seg["words"]) if normalize else seg["words"]})
-            local_hyps = local_hyps.map(lambda seg: {**seg, "words":self.text_normalizer(seg["words"]) if normalize else seg["words"]})
-            per_session_wers = calc_wer(tcp_hyp_seglst=local_hyps, ref_seglst=ref_seglst, collar=5, metrics_list=["tcp_wer"])
-            metrics = aggregate_wer_metrics(per_session_wers, ["tcp_wer"])
-            results[dataset] = metrics
 
-        ...
+            if "alimeeting" in dataset or "aishell4" in dataset:
+                import opencc
+                converter = opencc.OpenCC('s2t.json')
+                local_hyps = local_hyps.map(lambda seg: {**seg, "words": " ".join(list(converter.convert(seg["words"])))})
+                ref_seglst = ref_seglst.map(lambda seg: {**seg, "words": " ".join(list(converter.convert(seg["words"])))})
+
+            # Evaluate this dataset
+            results[dataset] = self._evaluate_dataset(local_hyps, ref_seglst, normalize)
 
+        # Update results file
+        all_results = self._load_existing_results(task)
+        all_results[submission_id] = {
+            "model_link": metadata["model_link"],
+            "model_id": metadata["model_id"],
+            "submitted_by": metadata["submitted_by"],
             "results": results
         }
 
-        ...
+        self._save_results(task, all_results)
+        self._save_hypothesis_file(task, submission_id, file)
 
-    def ...
-        ...
+    @staticmethod
+    def make_clickable_model(model_name, link):
+        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+    def get_leaderboard(self, task: str) -> pd.DataFrame:
+        """Generate leaderboard DataFrame for a specific task.
+
+        Args:
+            task: Task name
+
+        Returns:
+            DataFrame containing leaderboard results
+        """
+        results_path = self._get_results_file_path(task)
+
+        if not results_path.exists():
             return pd.DataFrame(columns=["No submissions yet"])
 
         with open(results_path) as f:
             results = json.load(f)
 
+        if not results:
+            return pd.DataFrame(columns=["No submissions yet"])
+
+        # Build rows for DataFrame
         rows = []
         for content in results.values():
-            row = {
-                ...
+            row = {
+                "Model ID": self.make_clickable_model(content["model_id"], content["model_link"]),
+                "Submitted by": content["submitted_by"]
+            }
+
+            # Add dataset results
+            for dataset, metrics in content["results"].items():
+                row[dataset] = metrics.get("tcp_wer")
+
            rows.append(row)
 
-        df = ...
+        df = pd.DataFrame(rows)
+
+        if df.empty:
+            return df
+
+        # Convert WER to percentage and format
+        numeric_columns = df.select_dtypes(include=['number']).columns
+        df[numeric_columns] *= 100.0
         df = df.round(2)
+        df = df.fillna("-")
 
         return df

(Removed lines shown as "..." were not rendered in the extracted diff.)
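For orientation, a short usage sketch of the reworked LeaderboardServer API defined above; all paths, metadata values, and dataset choices are placeholders (in the Space these calls are made from app.py).

from leaderboard_server import LeaderboardServer

server = LeaderboardServer()  # defaults: references/, tasks_metadata.json, submissions/

metadata = {
    "submitted_by": "example-team",       # placeholder
    "model_id": "example-asr-v1",         # placeholder
    "model_link": "https://example.org",  # placeholder
}

# Score a hypothesis file on one task's datasets and persist the results.
server.prepare_model_for_submission(
    "hyp.json",                           # placeholder hypothesis path
    metadata,
    task="multi_channel_gt_diar",
    datasets=["ami-mdm", "notsofar1-small-mdm"],
    normalize=True,
)

# Per-task leaderboard as a DataFrame (TCP-WER / tcpCER in %).
print(server.get_leaderboard("multi_channel_gt_diar"))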
references/multi_channel_gt_diar/aishell4.json ADDED (diff too large to render; see raw diff)
references/multi_channel_gt_diar/alimeeting.json ADDED (diff too large to render; see raw diff)
references/multi_channel_gt_diar/ami-mdm.json ADDED (diff too large to render; see raw diff)
references/multi_channel_gt_diar/chime6-mdm.json ADDED (diff too large to render; see raw diff)
references/multi_channel_gt_diar/notsofar1-small-mdm.json ADDED (diff too large to render; see raw diff)
references/multi_channel_real_diar/aishell4.json ADDED (diff too large to render; see raw diff)
references/multi_channel_real_diar/alimeeting.json ADDED (diff too large to render; see raw diff)
references/multi_channel_real_diar/ami-mdm.json ADDED (diff too large to render; see raw diff)
references/multi_channel_real_diar/chime6-mdm.json ADDED (diff too large to render; see raw diff)
references/multi_channel_real_diar/notsofar1-small-mdm.json ADDED (diff too large to render; see raw diff)
requirements.txt CHANGED

@@ -9,4 +9,5 @@ simplejson
 more-itertools
 meeteval
 gradio_modal
-regex
+regex
+opencc
tasks_metadata.json CHANGED

@@ -5,6 +5,12 @@
     },
     "single_channel_real_diar": {
       "name": "Single Channel - Real Diarization"
+    },
+    "multi_channel_gt_diar": {
+      "name": "Multi Channel - Ground Truth Diarization"
+    },
+    "multi_channel_real_diar": {
+      "name": "Multi Channel - Real Diarization"
     }
   }
 }
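With the two new entries above, the metadata file now describes four task categories. A sketch of how they can be listed follows; app.py derives its TASKS list from this file via LeaderboardServer, but that code is outside this diff, so the snippet is only an assumed equivalent.

import json

# Lists the task categories defined in tasks_metadata.json (four after this commit);
# the "tasks" key matches what LeaderboardServer._load_tasks_metadata reads.
with open("tasks_metadata.json") as f:
    tasks = json.load(f)["tasks"]

for task_id, info in tasks.items():
    print(task_id, "->", info["name"])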