David Pomerenke
committed
Commit · 4746aca · 1 Parent(s): 678e066

Rough draft of individual language view
app.py CHANGED
@@ -798,6 +798,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark", css=css, head=shortcut
 
 for lang in tqdm(languages[:20], desc="Generating pages"):
     with demo.route(lang['language_name'], f"/{lang['bcp_47']}"):
+        gr.Button("← Back to Main Dashboard", link="/")
         url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
         gr.Markdown(
             f'''
@@ -808,6 +809,167 @@ for lang in tqdm(languages[:20], desc="Generating pages"):
             ''',
             sanitize_html=False
         )
+
+        # Language overview section
+        with gr.Row():
+            with gr.Column(scale=2):
+                gr.Markdown(f"""
+                ## Language Overview
+                - **Native name**: {lang.get('native_name', 'N/A')}
+                - **Language family**: {lang.get('language_family', 'N/A')}
+                - **BCP-47 code**: `{lang['bcp_47']}`
+                - **ISO 639-3 code**: `{lang.get('iso_639_3', 'N/A')}`
+                - **Number of speakers**: {format_number(lang['speakers'])}
+                - **Script**: {lang.get('script', 'N/A')}
+                - **CommonVoice hours**: {round(lang.get('commonvoice_hours', 0) or 0)}
+                """)
+
+                # Resource links
+                resource_links = []
+                if lang.get('commonvoice_locale'):
+                    resource_links.append(f"[CommonVoice Dataset](https://commonvoice.mozilla.org/{lang['commonvoice_locale']})")
+                if lang.get('wikipedia_code'):
+                    resource_links.append(f"[Wikipedia](https://{lang['wikipedia_code']}.wikipedia.org)")
+                if lang.get('bcp_47'):
+                    resource_links.append(f"[FLORES+ Dataset](https://huggingface.co/datasets/openlanguagedata/flores_plus/viewer/all/{lang['bcp_47']})")
+
+                if resource_links:
+                    gr.Markdown("### Resources\n" + "\n".join(resource_links))
+
+            with gr.Column(scale=3):
+                # Create a mini-map showing where the language is spoken
+                country_data = {}
+                if "population" in lang:
+                    for country_code, speakers in lang["population"].items():
+                        try:
+                            country = pycountry.countries.get(alpha_2=country_code)
+                            if country:
+                                country_data[country.alpha_3] = speakers / lang["speakers"]
+                        except (KeyError, AttributeError):
+                            continue
+
+                locations = list(country_data.keys())
+                values = list(country_data.values())
+
+                if locations:
+                    fig = go.Figure(data=go.Choropleth(
+                        locations=locations,
+                        z=values,
+                        locationmode="ISO-3",
+                        colorscale="Blues",
+                        marker_line_color='white',
+                        marker_line_width=0.5,
+                        colorbar_title="Speaker %"
+                    ))
+
+                    fig.update_layout(
+                        title_text=f"Distribution of {lang['language_name']} Speakers",
+                        geo=dict(
+                            showframe=False,
+                            showcoastlines=True,
+                            projection_type='natural earth'
+                        ),
+                        height=300,
+                        margin={"r": 0, "t": 30, "l": 0, "b": 0}
+                    )
+
+                    gr.Plot(value=fig)
+                else:
+                    gr.Markdown("*Geographic data not available*")
+
+        # Performance metrics section
+        gr.Markdown("## AI Model Performance")
+
+        with gr.Row():
+            with gr.Column():
+                # Create metrics dashboard for this language
+                metrics_data = []
+                for metric_key, display_name in [
+                    ("t2t_score", "Overall Text Performance"),
+                    ("mt_bleu", "Translation (BLEU)"),
+                    ("mt_chrf", "Translation (ChrF)"),
+                    ("cls_acc", "Classification"),
+                    ("mlm_chrf", "Masked Language Modeling"),
+                    ("s2t_score", "Overall Speech Performance"),
+                    ("asr_wer", "Speech Recognition (WER)"),
+                    ("asr_chrf", "Speech Recognition (ChrF)")
+                ]:
+                    if metric_key in lang and lang[metric_key] is not None:
+                        value = lang[metric_key]
+                        color = "green" if value > 0.5 else "orange" if value > 0.25 else "red"
+
+                        # For WER, lower is better, so invert the color logic
+                        if metric_key == "asr_wer":
+                            color = "green" if value < 0.3 else "orange" if value < 0.6 else "red"
+
+                        metrics_data.append({
+                            "Metric": display_name,
+                            "Value": round(value, 3),
+                            "Visual": make_colored_bar(value if metric_key != "asr_wer" else 1 - value)
+                        })
+
+                if metrics_data:
+                    gr.DataFrame(
+                        pd.DataFrame(metrics_data),
+                        label=f"Performance Metrics for {lang['language_name']}",
+                        show_search=False
+                    )
+                else:
+                    gr.Markdown("*No performance metrics available*")
+
+        # Model comparison table
+        gr.Markdown("## Model Comparison")
+
+        with gr.Row():
+            models_data = []
+            for score in lang["scores"]:
+                if score.get("t2t_score") is not None:
+                    model_name = score["model"].split("/")[-1]
+                    models_data.append({
+                        "Model": model_name,
+                        "Overall": round(score.get("t2t_score", 0), 3),
+                        "Translation": round(score.get("mt_chrf", 0), 3),
+                        "Classification": round(score.get("cls_acc", 0), 3),
+                        "Lang Model": round(score.get("mlm_chrf", 0), 3),
+                        "Speech": round(score.get("asr_chrf", 0), 3) if "asr_chrf" in score else "N/A"
+                    })
+
+            if models_data:
+                df = pd.DataFrame(models_data).sort_values("Overall", ascending=False)
+                gr.DataFrame(
+                    df,
+                    label=f"Model Performance on {lang['language_name']}",
+                    show_search=False
+                )
+            else:
+                gr.Markdown("*No model comparison data available*")
+
+        # Performance comparison with similar languages
+        if lang.get("language_family"):
+            gr.Markdown("## Comparison with Related Languages")
+
+            # Find related languages
+            related_langs = [l for l in languages if l.get("language_family") == lang["language_family"] and l["t2t_score"] is not None]
+            related_langs = sorted(related_langs, key=lambda x: x["t2t_score"], reverse=True)[:10]
+
+            if len(related_langs) > 1:
+                lang_names = [l["language_name"] for l in related_langs]
+                t2t_scores = [l["t2t_score"] for l in related_langs]
+
+                fig = px.bar(
+                    x=lang_names,
+                    y=t2t_scores,
+                    labels={"x": "Language", "y": "Text-to-Text Score"},
+                    title=f"Performance Across {lang['language_family']} Languages"
+                )
+
+                # Highlight the current language
+                for i, name in enumerate(lang_names):
+                    if name == lang["language_name"]:
+                        fig.data[0].marker.color = ["lightblue"] * i + ["orange"] + ["lightblue"] * (len(lang_names) - i - 1)
+
+                fig.update_layout(height=400)
+                gr.Plot(value=fig)
 
 
 demo.launch()
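For context, `demo.route` is Gradio's multipage routing: each `with demo.route(...)` block renders as its own page at the given path, which is how the per-language pages above are generated, and a `gr.Button` with `link=` gives plain navigation back to the root. A minimal, self-contained sketch with a hypothetical page name and path:

```python
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("# Main dashboard")

# Each route renders as a separate page at its own URL path.
with demo.route("Example language", "/xx"):  # hypothetical name and path
    gr.Button("← Back to Main Dashboard", link="/")
    gr.Markdown("Per-language content goes here.")

demo.launch()
```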
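The choropleth needs ISO 3166-1 alpha-3 codes, while `lang["population"]` is keyed by alpha-2 codes; that is what the `pycountry` loop converts. A standalone sketch of the same conversion, using a made-up population dict (current `pycountry` returns `None` for unknown codes, which is why the loop also guards with `if country:`):

```python
import pycountry

# Hypothetical population dict in the same shape as lang["population"]
population = {"TZ": 47_000_000, "KE": 16_000_000, "XX": 100}
total = sum(population.values())

country_data = {}
for alpha_2, speakers in population.items():
    country = pycountry.countries.get(alpha_2=alpha_2)  # None for unknown codes like "XX"
    if country:
        country_data[country.alpha_3] = speakers / total

print(country_data)  # {'TZA': 0.746..., 'KEN': 0.253...}
```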
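`make_colored_bar` is referenced in the metrics table but not defined in this hunk. If it renders a 0–1 score as a text progress bar (which would fit the "Visual" column, and explains why the WER row passes `1 - value`), one plausible implementation might look like this — a sketch, not the actual helper:

```python
def make_colored_bar(value: float, width: int = 10) -> str:
    """Render a score in [0, 1] as a text bar, e.g. 0.7 -> '███████░░░'.
    Hypothetical stand-in for the make_colored_bar helper used above."""
    value = max(0.0, min(1.0, value))
    filled = round(value * width)
    return "█" * filled + "░" * (width - filled)

assert make_colored_bar(0.7) == "███████░░░"
```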
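The `asr_wer` branch flips both the color thresholds and the bar value because word error rate is a lower-is-better metric, unlike the ChrF and accuracy scores. Factored into a helper (a sketch of the same logic, not code from the commit):

```python
def metric_color(metric_key: str, value: float) -> str:
    """Traffic-light color for a metric; ASR word error rate is
    inverted because a lower WER means better performance."""
    if metric_key == "asr_wer":
        return "green" if value < 0.3 else "orange" if value < 0.6 else "red"
    return "green" if value > 0.5 else "orange" if value > 0.25 else "red"

print(metric_color("asr_wer", 0.2))  # green: low error rate is good
print(metric_color("mt_chrf", 0.2))  # red: low score is bad
```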
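Finally, the related-languages chart highlights the current language by assigning a per-bar color list to `fig.data[0].marker.color`, which Plotly accepts in place of a single color. The index loop above can be written as one comprehension; a standalone sketch with made-up scores:

```python
import plotly.express as px

# Hypothetical related-language scores for illustration
names = ["Swahili", "Zulu", "Xhosa"]
scores = [0.61, 0.48, 0.44]
highlight = "Zulu"  # the language whose page is being rendered

fig = px.bar(x=names, y=scores, labels={"x": "Language", "y": "Text-to-Text Score"})
# One color per bar: orange for the current language, lightblue elsewhere.
fig.data[0].marker.color = [
    "orange" if name == highlight else "lightblue" for name in names
]
fig.show()
```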