|
import streamlit as st |
|
|
|
from draw_utils import PAGE_MARKDOWN, PAGE_INFO, LENGTHS |
|
from draw_utils import load_results, style_dataframe |
|
|
|
# Page-wide settings; this is the first Streamlit call issued by the module.
st.set_page_config(
    page_title="Leaderboard App",
    layout="wide",
)

# PAGE_MARKDOWN comes from draw_utils and is rendered with raw HTML enabled.
st.markdown(PAGE_MARKDOWN, unsafe_allow_html=True)
|
|
|
|
|
def _select_task_rows(df, task, columns, search_term="", drop_incomplete=False):
    """Return the rows of ``df`` to show in the tab for ``task``.

    Args:
        df: Results frame with a ``task`` column plus every name in ``columns``.
        task: Task label to keep (compared against ``df.task``).
        columns: Column labels to keep, in display order; must include
            ``"model_name"``.
        search_term: Case-insensitive literal substring to look for in
            ``model_name``. An empty string disables the filter.
        drop_incomplete: When true, drop rows that have a missing value in
            any of the first five selected columns.

    Returns:
        A new frame with a fresh 0..n-1 index.
    """
    task_df = df[df.task == task][columns]

    if drop_incomplete:
        task_df = task_df.dropna(subset=list(task_df.columns[:5]))

    if search_term:
        # regex=False: the text comes straight from a user input box, so it
        # must be matched literally ("c++" or "(" would otherwise raise
        # re.error). na=False: rows without a model name never match instead
        # of poisoning the boolean mask with NaN.
        matches = task_df["model_name"].str.contains(
            search_term, case=False, regex=False, na=False
        )
        task_df = task_df[matches]

    # Always renumber so the displayed row labels do not depend on whether a
    # filter was applied.
    return task_df.reset_index(drop=True)


def draw_leaderboard() -> None:
    """Render the leaderboard page: header, search box and one tab per task.

    Results are loaded with ``load_results()``. The first tab
    (``avg(qa1-5)``) hides rows with missing values in the leading columns;
    every tab is filtered by the search box and shown as a styled table whose
    height is sized to fit all rows.
    """
    df = load_results()

    tasks = ["avg(qa1-5)"] + [f"qa{i}" for i in range(1, 11)]
    # NOTE(review): "β€" looks like a mis-decoded "≤" (i.e. "≤32k", "≤128k").
    # Left untouched because the labels must match the columns produced by
    # load_results() — confirm the file encoding against draw_utils.
    columns = ["model_name", "β€32k", "β€128k"] + LENGTHS

    # NOTE(review): the characters around the title look like mis-decoded
    # emoji; restore them from the original UTF-8 source if so.
    st.title("πππͺ‘πβ BABILong Leaderboard π")
    st.markdown(PAGE_INFO)
    st.subheader("Evaluation results:")
    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
    st.markdown(
        'Predictions of all evaluated models: '
        '[BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)'
    )
    # Surrounding whitespace is almost always accidental in a search box.
    search_term = st.text_input("Search models:", "").strip()

    # Approximate pixel height of one table row; the "+ 1" below accounts for
    # the header row so the table is shown without an inner scrollbar.
    row_height = 35

    for tab, task in zip(st.tabs(tasks), tasks):
        with tab:
            task_df = _select_task_rows(
                df,
                task,
                columns,
                search_term=search_term,
                # Only the averaged tab hides models with incomplete scores.
                drop_incomplete=(task == tasks[0]),
            )

            height = (len(task_df) + 1) * row_height
            styled_df = style_dataframe(task_df).format(precision=1)

            st.dataframe(
                styled_df,
                width=1070,
                height=height,
                column_config={
                    "model_name": st.column_config.Column(width=260, pinned=True),
                },
            )
|
|
|
|
|
# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    draw_leaderboard()
|
|