David Pomerenke committed on
Commit
4746aca
·
1 Parent(s): 678e066

Rough draft of individual language view

Browse files
Files changed (1) hide show
  1. app.py +162 -0
app.py CHANGED
@@ -798,6 +798,7 @@ with gr.Blocks(title="AI Language Proficiency Benchmark", css=css, head=shortcut
798
 
799
  for lang in tqdm(languages[:20], desc="Generating pages"):
800
  with demo.route(lang['language_name'], f"/{lang['bcp_47']}"):
 
801
  url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
802
  gr.Markdown(
803
  f'''
@@ -808,6 +809,167 @@ for lang in tqdm(languages[:20], desc="Generating pages"):
808
  ''',
809
  sanitize_html=False
810
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
811
 
812
 
813
  demo.launch()
 
798
 
799
  for lang in tqdm(languages[:20], desc="Generating pages"):
800
  with demo.route(lang['language_name'], f"/{lang['bcp_47']}"):
801
+ gr.Button("← Back to Main Dashboard", link="/")
802
  url = f"hf.co/spaces/datenlaborbmz/ai-language-monitor?lang={lang['bcp_47']}"
803
  gr.Markdown(
804
  f'''
 
809
  ''',
810
  sanitize_html=False
811
  )
812
+
813
# Language overview section: static metadata, external resource links, and a
# mini world map of where the language is spoken.
with gr.Row():
    with gr.Column(scale=2):
        gr.Markdown(f"""
        ## Language Overview
        - **Native name**: {lang.get('native_name', 'N/A')}
        - **Language family**: {lang.get('language_family', 'N/A')}
        - **BCP-47 code**: `{lang['bcp_47']}`
        - **ISO 639-3 code**: `{lang.get('iso_639_3', 'N/A')}`
        - **Number of speakers**: {format_number(lang['speakers'])}
        - **Script**: {lang.get('script', 'N/A')}
        - **CommonVoice hours**: {round(lang.get('commonvoice_hours', 0) or 0)}
        """)

        # Resource links — each one is emitted only when its identifier is
        # present for this language.
        resource_links = []
        if lang.get('commonvoice_locale'):
            resource_links.append(f"[CommonVoice Dataset](https://commonvoice.mozilla.org/{lang['commonvoice_locale']})")
        if lang.get('wikipedia_code'):
            resource_links.append(f"[Wikipedia](https://{lang['wikipedia_code']}.wikipedia.org)")
        if lang.get('bcp_47'):
            resource_links.append(f"[FLORES+ Dataset](https://huggingface.co/datasets/openlanguagedata/flores_plus/viewer/all/{lang['bcp_47']})")

        if resource_links:
            gr.Markdown("### Resources\n" + "\n".join(resource_links))

    with gr.Column(scale=3):
        # Mini choropleth showing each country's share of the language's
        # speakers. Values are fractions of the global speaker count.
        country_data = {}
        # Guard the divisor: a missing or zero "speakers" value would
        # otherwise raise ZeroDivisionError/KeyError and kill page generation.
        total_speakers = lang.get("speakers") or 0
        if "population" in lang and total_speakers:
            # NOTE(review): assumes lang["population"] maps ISO alpha-2
            # country codes to speaker counts — confirm against data source.
            for country_code, speakers in lang["population"].items():
                try:
                    country = pycountry.countries.get(alpha_2=country_code)
                    if country:
                        country_data[country.alpha_3] = speakers / total_speakers
                except (KeyError, AttributeError):
                    # Unknown / malformed country code: skip this entry.
                    continue

        locations = list(country_data.keys())
        values = list(country_data.values())

        if locations:
            fig = go.Figure(data=go.Choropleth(
                locations=locations,
                z=values,
                locationmode="ISO-3",
                colorscale="Blues",
                marker_line_color='white',
                marker_line_width=0.5,
                colorbar_title="Speaker %"
            ))

            fig.update_layout(
                title_text=f"Distribution of {lang['language_name']} Speakers",
                geo=dict(
                    showframe=False,
                    showcoastlines=True,
                    projection_type='natural earth'
                ),
                height=300,
                margin={"r": 0, "t": 30, "l": 0, "b": 0}
            )

            gr.Plot(value=fig)
        else:
            gr.Markdown("*Geographic data not available*")
879
+
880
# Performance metrics section: one table row per metric available for this
# language.
gr.Markdown("## AI Model Performance")

with gr.Row():
    with gr.Column():
        metrics_data = []
        for metric_key, display_name in [
            ("t2t_score", "Overall Text Performance"),
            ("mt_bleu", "Translation (BLEU)"),
            ("mt_chrf", "Translation (ChrF)"),
            ("cls_acc", "Classification"),
            ("mlm_chrf", "Masked Language Modeling"),
            ("s2t_score", "Overall Speech Performance"),
            ("asr_wer", "Speech Recognition (WER)"),
            ("asr_chrf", "Speech Recognition (ChrF)")
        ]:
            if metric_key in lang and lang[metric_key] is not None:
                value = lang[metric_key]
                # WER is an error rate (lower is better), so invert it for
                # the visual bar so every bar reads "higher is better".
                # (The original also computed a threshold color here, but it
                # was never used — removed as dead code.)
                bar_value = 1 - value if metric_key == "asr_wer" else value
                metrics_data.append({
                    "Metric": display_name,
                    "Value": round(value, 3),
                    "Visual": make_colored_bar(bar_value)
                })

        if metrics_data:
            gr.DataFrame(
                pd.DataFrame(metrics_data),
                label=f"Performance Metrics for {lang['language_name']}",
                show_search=False
            )
        else:
            gr.Markdown("*No performance metrics available*")
919
+
920
# Model comparison table: per-model scores for this language, sorted by the
# overall text-to-text score.
gr.Markdown("## Model Comparison")

with gr.Row():
    models_data = []
    for score in lang["scores"]:
        # Skip models with no overall text score.
        if score.get("t2t_score") is None:
            continue
        # Display only the model id's last path segment (drop the org prefix).
        model_name = score["model"].split("/")[-1]
        # `or 0` (rather than .get(key, 0)) also covers keys that are
        # present with an explicit None, which would crash round().
        asr_chrf = score.get("asr_chrf")
        models_data.append({
            "Model": model_name,
            "Overall": round(score.get("t2t_score") or 0, 3),
            "Translation": round(score.get("mt_chrf") or 0, 3),
            "Classification": round(score.get("cls_acc") or 0, 3),
            "Lang Model": round(score.get("mlm_chrf") or 0, 3),
            "Speech": round(asr_chrf, 3) if asr_chrf is not None else "N/A"
        })

    if models_data:
        df = pd.DataFrame(models_data).sort_values("Overall", ascending=False)
        gr.DataFrame(
            df,
            label=f"Model Performance on {lang['language_name']}",
            show_search=False
        )
    else:
        gr.Markdown("*No model comparison data available*")
946
+
947
# Performance comparison with languages of the same family.
if lang.get("language_family"):
    gr.Markdown("## Comparison with Related Languages")

    # Top 10 same-family languages by text-to-text score. Use .get() for
    # t2t_score too: a plain l["t2t_score"] raises KeyError for language
    # entries that lack the key (the family lookup on the same line already
    # uses .get() for exactly that reason).
    related_langs = [
        l for l in languages
        if l.get("language_family") == lang["language_family"]
        and l.get("t2t_score") is not None
    ]
    related_langs = sorted(related_langs, key=lambda x: x["t2t_score"], reverse=True)[:10]

    # A chart with a single bar is not a comparison — require at least two.
    if len(related_langs) > 1:
        lang_names = [l["language_name"] for l in related_langs]
        t2t_scores = [l["t2t_score"] for l in related_langs]

        fig = px.bar(
            x=lang_names,
            y=t2t_scores,
            labels={"x": "Language", "y": "Text-to-Text Score"},
            title=f"Performance Across {lang['language_family']} Languages"
        )

        # Highlight the current language's bar in orange; break after the
        # first match instead of rebuilding the color list on every hit.
        for i, name in enumerate(lang_names):
            if name == lang["language_name"]:
                colors = ["lightblue"] * len(lang_names)
                colors[i] = "orange"
                fig.data[0].marker.color = colors
                break

        fig.update_layout(height=400)
        gr.Plot(value=fig)
973
 
974
 
975
  demo.launch()