|
import streamlit as st |
|
|
|
from draw_utils import PAGE_MARKDOWN, PAGE_INFO, LENGTHS |
|
from draw_utils import load_results, style_dataframe |
|
|
|
# Page-level configuration: wide layout and the browser tab title.
st.set_page_config(
    page_title="Leaderboard App",
    layout="wide",
)

# Inject the shared page markup; raw HTML is explicitly allowed here.
st.markdown(PAGE_MARKDOWN, unsafe_allow_html=True)
|
|
|
|
|
def draw_leaderboard():
    """Render the leaderboard page: header, search box and one tab per task.

    The results table is obtained from ``load_results()``. For every task
    (``avg`` plus ``qa1`` .. ``qa10``) a tab is drawn that shows the rows of
    that task restricted to the model name, the two aggregate columns and
    the per-length columns from ``LENGTHS``.

    The search box filters rows by model name. The text is matched as a
    literal, case-insensitive substring (not as a regular expression), so
    names containing characters such as ``(``, ``+`` or ``.`` can be
    searched without raising or mismatching. Rows without a model name
    never match.
    """
    df = load_results()

    tasks = ["avg"] + [f"qa{i}" for i in range(1, 11)]
    columns = ["model_name", "<=32k", "<=128k"] + LENGTHS
    # Pixels reserved per table row when sizing the dataframe widget.
    row_height = 35

    # NOTE(review): the symbols around the title look like emoji decoded
    # with the wrong charset -- confirm against the original file encoding.
    st.title("πππͺ‘πβ BABILong Leaderboard π")
    st.markdown(PAGE_INFO)
    st.subheader("Evaluation results:")
    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
    st.markdown(
        'Predictions of all evaluated models: '
        '[BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)'
    )
    search_term = st.text_input("Search models:", "")

    tabs = st.tabs(tasks)
    for task, tab in zip(tasks, tabs):
        with tab:
            # Single .loc selection avoids chained indexing on the frame.
            task_df = df.loc[df["task"] == task, columns]

            if search_term:
                # regex=False: user input is plain text, not a pattern.
                # na=False: keep the mask strictly boolean if a name is missing.
                matches = task_df["model_name"].str.contains(
                    search_term, case=False, regex=False, na=False
                )
                task_df = task_df[matches]
            task_df = task_df.reset_index(drop=True)

            # One extra row accounts for the header line.
            height = (len(task_df) + 1) * row_height

            # presumably style_dataframe returns a pandas Styler; verify in draw_utils
            styled_df = style_dataframe(task_df).format(precision=1)

            st.dataframe(
                styled_df,
                width=1030,
                height=height,
            )
|
|
|
|
|
# Script entry point: draw the page only when this file is run directly.
if __name__ == "__main__":
    draw_leaderboard()
|
|