import base64
import os
import random
from collections import defaultdict

import gradio as gr
import pandas as pd
from datasets import (
    Dataset,  # used by the commented-out push_to_hub path below
    load_dataset,
)

TOKEN = os.environ['TOKEN']


def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string


def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center;">
        <img src="data:image/{media_type};base64,{media_base64}"
             alt="Displayed Media"
             style="max-width: 100%; height: auto;">
    </div>
""" return html_string MASKED_LM_MODELS = [ "BounharAbdelaziz/XLM-RoBERTa-Morocco", "SI2M-Lab/DarijaBERT", "BounharAbdelaziz/ModernBERT-Morocco", "google-bert/bert-base-multilingual-cased", "FacebookAI/xlm-roberta-large", "aubmindlab/bert-base-arabertv02", ] CAUSAL_LM_MODELS = [ "BounharAbdelaziz/Al-Atlas-LLM-0.5B", "Qwen/Qwen2.5-0.5B", "tiiuae/Falcon3-1B-Base", "MBZUAI-Paris/Atlas-Chat-2B", ] class LMBattleArena: def __init__(self, dataset_path): """Initialize battle arena with dataset""" self.df = pd.read_csv(dataset_path) print(self.df.head()) self.current_index = 0 self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations self.evaluation_results_masked = [] self.evaluation_results_causal = [] self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0}) def get_next_battle_pair(self, is_causal): """Retrieve next pair of summaries for comparison""" # if self.current_index >= len(self.df): # return None row = self.df.iloc[self.current_index] if is_causal: model_summary_cols = [ col for col in CAUSAL_LM_MODELS ] else: model_summary_cols = [ col for col in MASKED_LM_MODELS ] selected_models = random.sample(model_summary_cols, 2) battle_data = { 'prompt': row['masked_sentence'] if not is_causal else row['causal_sentence'], 'model_1': row[selected_models[0]], 'model_2': row[selected_models[1]], 'model1_name': selected_models[0], 'model2_name': selected_models[1] } self.current_index += 1 return battle_data def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name, is_causal): """Record user's model preference and update scores""" self.model_scores[model1_name]['total_comparisons'] += 1 self.model_scores[model2_name]['total_comparisons'] += 1 if preferred_models == "Both Good": self.model_scores[model1_name]['wins'] += 1 self.model_scores[model2_name]['wins'] += 1 elif preferred_models == "Model A": # Maps to first model self.model_scores[model1_name]['wins'] += 1 elif preferred_models == "Model B": # Maps to second model self.model_scores[model2_name]['wins'] += 1 # "Both Bad" case - no wins recorded evaluation = { 'input_text': input_text, 'output1': output1, 'output2': output2, 'model1_name': model1_name, 'model2_name': model2_name, 'preferred_models': preferred_models } if is_causal: self.evaluation_results_causal.append(evaluation) else: self.evaluation_results_masked.append(evaluation) return self.get_model_scores_df(is_causal) def get_model_scores_df(self, is_causal): """Convert model scores to DataFrame""" scores_data = [] for model, stats in self.model_scores.items(): if is_causal: if model not in CAUSAL_LM_MODELS: continue else: if model not in MASKED_LM_MODELS: continue win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0 scores_data.append({ 'Model': model, 'Wins': stats['wins'], 'Total Comparisons': stats['total_comparisons'], 'Win Rate (%)': round(win_rate, 2) }) results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False) # save the results in a huggingface dataset if self.current_index % self.saving_freq == 0 and self.current_index > 0: # results_dataset = Dataset.from_pandas(results_df) # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True) results_df.to_csv('human_eval_results.csv') return results_df def create_battle_arena(dataset_path, is_gif, is_causal): arena = LMBattleArena(dataset_path) def battle_round(is_causal): battle_data = arena.get_next_battle_pair(is_causal) if 
            return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
        return (
            battle_data['prompt'],
            battle_data['model_1'],
            battle_data['model_2'],
            battle_data['model1_name'],
            battle_data['model2_name'],
            gr.DataFrame(visible=True),
        )

    def submit_preference(input_text, output_1, output_2, model1_name, model2_name,
                          preferred_models, is_causal):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2,
            model1_name, model2_name, is_causal,
        )
        next_battle = battle_round(is_causal)
        # Replace the DataFrame visibility update with the refreshed leaderboard.
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))

        with gr.Tabs():
            with gr.Tab("Masked LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                # gr.State stores the boolean without displaying it
                is_causal = gr.State(value=False)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False,
                    )
                    model1_name = gr.State()  # hidden state for model 1's name
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False,
                    )
                    model2_name = gr.State()  # hidden state for model 2's name

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"],
                )
                submit_btn = gr.Button("Vote", variant="primary")
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard",
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name,
                            preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )
                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

            with gr.Tab("Causal LM Battle Arena"):
                gr.Markdown("# 🤖 Pretrained SmolLMs Battle Arena")
                # gr.State stores the boolean without displaying it
                is_causal = gr.State(value=True)

                input_text = gr.Textbox(
                    label="Input prompt",
                    interactive=False,
                )
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A",
                        interactive=False,
                    )
                    model1_name = gr.State()  # hidden state for model 1's name
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B",
                        interactive=False,
                    )
                    model2_name = gr.State()  # hidden state for model 2's name

                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"],
                )
                submit_btn = gr.Button("Vote", variant="primary")
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="🏆 Leaderboard",
                )

                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name,
                            preferred_models, is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )
                demo.load(
                    battle_round,
                    inputs=[is_causal],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table],
                )

    return demo


if __name__ == "__main__":
    dataset_path = 'human_eval_dataset.csv'
    is_gif = True

    # Download the dataset that contains the LMs' outputs and cache it locally as a CSV.
    load_dataset("atlasia/LM-Moroccan-Darija-Bench", split='test', token=TOKEN).to_csv(dataset_path)

    # Build the arena with both tabs (masked and causal LM).
    demo = create_battle_arena(dataset_path, is_gif)
    demo.launch(debug=True)
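
# ---------------------------------------------------------------------------
# Usage sketch (an assumption, not part of the original script): running the
# app requires a Hugging Face access token with read access to
# `atlasia/LM-Moroccan-Darija-Bench` in the `TOKEN` environment variable, and
# a `battle_leaderboard.gif` file next to this script. For example:
#
#   export TOKEN=hf_...   # your own token
#   python app.py         # hypothetical filename for this script
#
# `demo.launch(debug=True)` then serves the arena locally (Gradio binds to
# http://127.0.0.1:7860 by default).
# ---------------------------------------------------------------------------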