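"""Gradio battle arena for pairwise human evaluation of language model outputs.

Loads a dataset of prompts with one completion per model, shows two anonymized
completions side by side, records the evaluator's preference, and maintains a
win-rate leaderboard that is periodically saved to CSV.
"""
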
import base64
import os
import random
from collections import defaultdict

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset  # Dataset backs the (optional) push-to-hub path

def encode_image_to_base64(image_path):
    """Encode an image or GIF file to base64."""
    with open(image_path, "rb") as file:
        encoded_string = base64.b64encode(file.read()).decode()
    return encoded_string

def create_html_media(media_path, is_gif=False):
    """Create HTML for displaying an image or GIF."""
    media_base64 = encode_image_to_base64(media_path)
    media_type = "gif" if is_gif else "jpeg"
    
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 450px; margin: auto;">
            <img src="data:image/{media_type};base64,{media_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Media">
        </div>
    </div>
    """
    return html_string

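# Core evaluation state: serves prompt/output pairs, tallies votes, and
# periodically snapshots the leaderboard to CSV.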
class LMBattleArena:
    def __init__(self, dataset_path):
        """Initialize battle arena with dataset"""
        self.df = pd.read_csv(dataset_path)
        print(self.df.head())
        self.current_index = 0
        self.saving_freq = 10 # save the results in csv/push to hub every 10 evaluations
        self.evaluation_results = []
        self.model_scores = defaultdict(lambda: {'wins': 0, 'total_comparisons': 0})
    
    def get_next_battle_pair(self):
        """Retrieve the next prompt and a randomly chosen pair of model outputs."""
        if self.current_index >= len(self.df):
            return None

        row = self.df.iloc[self.current_index]
        # Every column except 'prompt' holds one model's output for this row.
        model_output_cols = [
            col
            for col in row.index
            if col.upper() != 'PROMPT'
        ]
        # The pair is sampled at random and shown anonymized as Model A / Model B.
        selected_models = random.sample(model_output_cols, 2)
        battle_data = {
            'prompt': row['prompt'],
            'model_1': row[selected_models[0]],
            'model_2': row[selected_models[1]],
            'model1_name': selected_models[0],
            'model2_name': selected_models[1]
        }
        self.current_index += 1
        return battle_data
    
    def record_evaluation(self, preferred_models, input_text, output1, output2, model1_name, model2_name):
        """Record user's model preference and update scores"""
        self.model_scores[model1_name]['total_comparisons'] += 1
        self.model_scores[model2_name]['total_comparisons'] += 1
        
        if preferred_models == "Both Good":
            self.model_scores[model1_name]['wins'] += 1
            self.model_scores[model2_name]['wins'] += 1
        elif preferred_models == "Model A":  # Maps to first model
            self.model_scores[model1_name]['wins'] += 1
        elif preferred_models == "Model B":  # Maps to second model
            self.model_scores[model2_name]['wins'] += 1
        # "Both Bad" case - no wins recorded
        
        evaluation = {
            'input_text': input_text,
            'output1': output1,
            'output2': output2,
            'model1_name': model1_name,
            'model2_name': model2_name,
            'preferred_models': preferred_models
        }
        self.evaluation_results.append(evaluation)
        
        return self.get_model_scores_df()
    
    def get_model_scores_df(self):
        """Convert model scores to DataFrame"""
        scores_data = []
        for model, stats in self.model_scores.items():
            win_rate = (stats['wins'] / stats['total_comparisons'] * 100) if stats['total_comparisons'] > 0 else 0
            scores_data.append({
                'Model': model,
                'Wins': stats['wins'],
                'Total Comparisons': stats['total_comparisons'],
                'Win Rate (%)': round(win_rate, 2)
            })
        results_df = pd.DataFrame(scores_data).sort_values('Win Rate (%)', ascending=False)
        
        # Periodically persist the leaderboard; pushing to the hub as a dataset
        # is currently disabled in favor of a local CSV snapshot.
        if self.current_index % self.saving_freq == 0 and self.current_index > 0:
            # results_dataset = Dataset.from_pandas(results_df)
            # results_dataset.push_to_hub('atlasia/Res-Moroccan-Darija-LLM-Battle-Al-Atlas', private=True)
            results_df.to_csv('human_eval_results.csv', index=False)
            
        return results_df
    

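# Builds the Gradio Blocks UI around an LMBattleArena instance: a header image,
# the prompt, two anonymized outputs, a preference radio, and a leaderboard.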
def create_battle_arena(dataset_path, is_gif):
    arena = LMBattleArena(dataset_path)
    
    def battle_round():
        battle_data = arena.get_next_battle_pair()
        
        if battle_data is None:
            return "No more texts to evaluate!", "", "", "", "", gr.DataFrame(visible=False)
        
        return (
            battle_data['prompt'], 
            battle_data['model_1'], 
            battle_data['model_2'],
            battle_data['model1_name'], 
            battle_data['model2_name'],
            gr.DataFrame(visible=True)
        )
    
    def submit_preference(input_text, output_1, output_2, model1_name, model2_name, preferred_models):
        scores_df = arena.record_evaluation(
            preferred_models, input_text, output_1, output_2, model1_name, model2_name
        )
        # Advance to the next pair; the trailing visibility update from
        # battle_round is replaced by the refreshed leaderboard DataFrame.
        next_battle = battle_round()
        return (*next_battle[:-1], scores_df)

    with gr.Blocks(css="footer{display:none !important}") as demo:
        
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'battle_leaderboard.gif')
        gr.HTML(create_html_media(local_image_path, is_gif=is_gif))
        
        with gr.Tabs():
            with gr.Tab("Battle Arena"):
                gr.Markdown("# πŸ€– Pretrained SmolLMs Battle Arena")
                
                input_text = gr.Textbox(
                    label="Input prompt", 
                    interactive=False,
                )
                
                with gr.Row():
                    output_1 = gr.Textbox(
                        label="Model A", 
                        interactive=False
                    )
                    model1_name = gr.State()  # Hidden state for model1 name
                
                with gr.Row():
                    output_2 = gr.Textbox(
                        label="Model B", 
                        interactive=False
                    )
                    model2_name = gr.State()  # Hidden state for model2 name
                
                preferred_models = gr.Radio(
                    label="Which model is better?",
                    choices=["Model A", "Model B", "Both Good", "Both Bad"]
                )
                submit_btn = gr.Button("Vote", variant="primary")
                
                scores_table = gr.DataFrame(
                    headers=['Model', 'Wins', 'Total Comparisons', 'Win Rate (%)'],
                    label="πŸ† Leaderboard"
                )
                
                submit_btn.click(
                    submit_preference,
                    inputs=[input_text, output_1, output_2, model1_name, model2_name, preferred_models],
                    outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table]
                )
                
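                # Populate the first battle pair (and show the table) on page load.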
                demo.load(battle_round, outputs=[input_text, output_1, output_2, model1_name, model2_name, scores_table])
                        
    return demo

if __name__ == "__main__":

    # Load the hub dataset that contains the LM outputs and cache it locally as CSV.
    load_dataset("atlasia/Moroccan-Darija-LLM-Battle-Al-Atlas", split='train').to_csv('human_eval_dataset.csv')

    dataset_path = 'human_eval_dataset.csv'
    is_gif = True
    demo = create_battle_arena(dataset_path, is_gif)
    demo.launch(debug=True)
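    # Note: demo.launch(debug=True) serves locally (Gradio defaults to port
    # 7860); passing share=True would expose a temporary public link.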