import re

import gradio as gr
import pandas as pd
from pandas.io.formats.style import Styler

from dataset import get_dataframe
from markdown import GUIDELINES, PANEL_MARKDOWN
|
|
# Load the contamination table once at import time. This module-level frame
# is read by filter_dataframe_corpus / filter_dataframe_model and by the UI
# code that declares the table headers.
df = get_dataframe()
|
|
|
|
| def filter_dataframe(dataframe, eval_dataset, cont_source, checkboxes): |
| """ |
| Filter the dataframe based on the provided evaluation dataset, contaminated source, and checkboxes. |
| |
| Args: |
| dataframe (pandas.DataFrame): The input dataframe to filter. |
| eval_dataset (str): The evaluation dataset to filter by. |
| cont_source (str): The contaminated source to filter by. |
| checkboxes (list): The checkboxes to filter by. |
| |
| Returns: |
| pandas.DataFrame: The filtered dataframe. |
| """ |
| if isinstance(eval_dataset, str): |
| dataframe = dataframe[ |
| dataframe["Evaluation Dataset"].str.contains(eval_dataset) |
| ] |
| if isinstance(cont_source, str): |
| dataframe = dataframe[ |
| dataframe["Contaminated Source"].str.contains(cont_source) |
| ] |
| if isinstance(checkboxes, list) and "Exclude model-based evidences" in checkboxes: |
| dataframe = dataframe[dataframe["Approach"] != "model-based"] |
| if isinstance(checkboxes, list) and "Show only contaminated" in checkboxes: |
| dataframe = dataframe[ |
| (dataframe["Train Split"] > 0.0) |
| | (dataframe["Development Split"] > 0.0) |
| | (dataframe["Test Split"] > 0.0) |
| ] |
|
|
| dataframe = dataframe.sort_values("Test Split", ascending=False) |
|
|
| return dataframe.style.format( |
| { |
| "Train Split": "{:.1%}", |
| "Development Split": "{:.1%}", |
| "Test Split": "{:.1%}", |
| }, |
| na_rep="Unknown", |
| ) |
|
|
|
|
def filter_dataframe_corpus(*args, **kwargs) -> Styler:
    """
    Filter the dataframe for corpus contamination.

    Selects the rows of the module-level ``df`` whose "Model or corpus"
    value is "corpus", drops that column and passes the result to
    ``filter_dataframe``.

    Args:
        *args: Positional arguments forwarded to ``filter_dataframe`` after
            the dataframe.
        **kwargs: Keyword arguments forwarded to ``filter_dataframe``.

    Returns:
        pandas.io.formats.style.Styler: The styled table that
        ``filter_dataframe`` returns for the corpus rows.
    """
    corpus_rows = df[df["Model or corpus"] == "corpus"]
    corpus_rows = corpus_rows.drop(columns=["Model or corpus"])
    return filter_dataframe(corpus_rows, *args, **kwargs)
|
|
|
|
def filter_dataframe_model(*args, **kwargs) -> Styler:
    """
    Filter the dataframe for model contamination.

    Selects the rows of the module-level ``df`` whose "Model or corpus"
    value is "model", drops that column and passes the result to
    ``filter_dataframe``.

    Args:
        *args: Positional arguments forwarded to ``filter_dataframe`` after
            the dataframe.
        **kwargs: Keyword arguments forwarded to ``filter_dataframe``.

    Returns:
        pandas.io.formats.style.Styler: The styled table that
        ``filter_dataframe`` returns for the model rows.
    """
    model_rows = df[df["Model or corpus"] == "model"]
    model_rows = model_rows.drop(columns=["Model or corpus"])
    return filter_dataframe(model_rows, *args, **kwargs)
|
|
|
|
# Application theme: Gradio's "Soft" preset with custom hues, sizes and font.
theme = gr.themes.Soft(
    primary_hue="emerald",
    secondary_hue="cyan",
    text_size="md",
    spacing_size="lg",
    # The font list is a fallback chain, so repeating the same font adds
    # nothing. Poppins is followed by generic families so the page still
    # renders in a sans-serif face when the Google font cannot be loaded.
    font=[
        gr.themes.GoogleFont("Poppins"),
        "ui-sans-serif",
        "system-ui",
        "sans-serif",
    ],
).set(
    block_background_fill="*neutral_50",
    block_background_fill_dark="*neutral_950",
    section_header_text_size="*text_lg",
    section_header_text_weight="800",
)
|
|
|
|
# Headers of the two result tables. Both filter callbacks drop the
# "Model or corpus" column before returning, so it is left out here to keep
# the declared headers in line with the data that is displayed.
_TABLE_HEADERS = [column for column in df.columns if column != "Model or corpus"]

# Gradio cell type of each displayed column, in order.
# NOTE(review): presumably one entry per remaining column; confirm against
# the frame returned by get_dataframe().
_TABLE_DATATYPES = (
    "markdown",
    "markdown",
    "number",
    "number",
    "number",
    "str",
    "markdown",
    "markdown",
)

# Option labels; filter_dataframe compares against these exact strings.
_SEARCH_OPTIONS = ("Exclude model-based evidences", "Show only contaminated")


with gr.Blocks(
    theme=theme,
    # NOTE(review): the leading characters look like a mis-decoded emoji.
    # The literal is kept as-is; restore the intended character from the
    # original file.
    title="π¨ Data Contamination Database",
    analytics_enabled=False,
    fill_height=True,
) as demo:
    gr.Markdown(PANEL_MARKDOWN)

    with gr.Tab("Corpus contamination") as tab_corpus:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_corpus = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_corpora = gr.Textbox(
                    placeholder="Pre-training corpora",
                    label="Pre-training corpora",
                    value="",
                )
            with gr.Column():
                checkboxes_corpus = gr.CheckboxGroup(
                    list(_SEARCH_OPTIONS),
                    label="Search options",
                    value=[],
                )

                filter_corpus_btn = gr.Button("Filter")

        corpus_dataframe = gr.DataFrame(
            # Initial, unfiltered view. None is neither a str nor a list,
            # so filter_dataframe applies no filter for it.
            value=filter_dataframe_corpus(None, None, None),
            headers=list(_TABLE_HEADERS),
            datatype=list(_TABLE_DATATYPES),
        )

    with gr.Tab("Model contamination") as tab_model:
        with gr.Row(variant="compact"):
            with gr.Column():
                eval_dataset_model = gr.Textbox(
                    placeholder="Evaluation dataset",
                    label="Evaluation dataset",
                    value="",
                )
                cont_model = gr.Textbox(
                    placeholder="Model", label="Pre-trained model", value=""
                )
            with gr.Column():
                checkboxes_model = gr.CheckboxGroup(
                    list(_SEARCH_OPTIONS),
                    label="Search options",
                    value=[],
                )

                filter_model_btn = gr.Button("Filter")

        model_dataframe = gr.DataFrame(
            # Initial, unfiltered view (see the corpus table above).
            value=filter_dataframe_model(None, None, None),
            headers=list(_TABLE_HEADERS),
            datatype=list(_TABLE_DATATYPES),
        )

    # Each "Filter" button recomputes its own table from the current values
    # of the inputs in its tab.
    filter_corpus_btn.click(
        filter_dataframe_corpus,
        inputs=[eval_dataset_corpus, cont_corpora, checkboxes_corpus],
        outputs=corpus_dataframe,
    )
    filter_model_btn.click(
        filter_dataframe_model,
        inputs=[eval_dataset_model, cont_model, checkboxes_model],
        outputs=model_dataframe,
    )

    with gr.Tab("Contribution Guidelines") as tab_guidelines:
        gr.Markdown(GUIDELINES)


# Start the Gradio server for the interface built above.
demo.launch()
|
|