David Pomerenke committed on
Commit
08735bb
·
1 Parent(s): 63fd3b1

Add language families

Browse files
Files changed (6) hide show
  1. .gitignore +1 -6
  2. data/data.txt +1 -0
  3. evals.py +15 -6
  4. pyproject.toml +1 -0
  5. results.json +38 -19
  6. uv.lock +0 -0
.gitignore CHANGED
@@ -1,15 +1,10 @@
1
  floresp-*
 
2
  LanguageCodes.tab
3
  ScriptCodes.csv
4
  .cache
5
  .env
6
 
7
- # Observable
8
- .DS_Store
9
- /dist/
10
- node_modules/
11
- yarn-error.log
12
-
13
  # Python-generated files
14
  __pycache__/
15
  *.py[oc]
 
1
  floresp-*
2
+ glottolog-*
3
  LanguageCodes.tab
4
  ScriptCodes.csv
5
  .cache
6
  .env
7
 
 
 
 
 
 
 
8
  # Python-generated files
9
  __pycache__/
10
  *.py[oc]
data/data.txt CHANGED
@@ -1,4 +1,5 @@
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
 
2
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
3
  LanguageCodes.tab: https://www.ethnologue.com/
4
  ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
 
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
2
+ glottolog-5.1: https://github.com/glottolog/glottolog
3
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
4
  LanguageCodes.tab: https://www.ethnologue.com/
5
  ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
evals.py CHANGED
@@ -19,6 +19,7 @@ from requests import get
19
  from rich import print
20
  from tqdm.asyncio import tqdm_asyncio
21
  from transformers import NllbTokenizer
 
22
 
23
  # config
24
  models = [
@@ -73,6 +74,15 @@ def population(bcp_47):
73
  return items
74
 
75
 
 
 
 
 
 
 
 
 
 
76
  def script_name(iso15924):
77
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
78
 
@@ -406,12 +416,8 @@ async def main():
406
  "scores": results,
407
  "mt_bleu": mean([s["mt_bleu"] for s in results]),
408
  "mt_chrf": mean([s["mt_chrf"] for s in results]),
409
- "cls_acc": mean(
410
- [s["cls_acc"] for s in results]
411
- ),
412
- "mlm_chrf": mean(
413
- [s["mlm_chrf"] for s in results]
414
- ),
415
  "overall_score": mean([s["overall_score"] for s in results]),
416
  "commonvoice_hours": language.commonvoice_hours
417
  if not pd.isna(language.commonvoice_hours)
@@ -420,6 +426,9 @@ async def main():
420
  if not pd.isna(language.commonvoice_locale)
421
  else None,
422
  "population": population(language.bcp_47),
 
 
 
423
  }
424
  )
425
  with open("results.json", "w") as f:
 
19
  from rich import print
20
  from tqdm.asyncio import tqdm_asyncio
21
  from transformers import NllbTokenizer
22
+ from pyglottolog import Glottolog
23
 
24
  # config
25
  models = [
 
74
  return items
75
 
76
 
77
+ glottolog = Glottolog("data/glottolog-5.1")
78
+
79
+
80
+ @cache
81
+ def language_family(iso_639_3):
82
+ languoid = glottolog.languoid(iso_639_3)
83
+ return languoid.family.name if languoid else None
84
+
85
+
86
  def script_name(iso15924):
87
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
88
 
 
416
  "scores": results,
417
  "mt_bleu": mean([s["mt_bleu"] for s in results]),
418
  "mt_chrf": mean([s["mt_chrf"] for s in results]),
419
+ "cls_acc": mean([s["cls_acc"] for s in results]),
420
+ "mlm_chrf": mean([s["mlm_chrf"] for s in results]),
 
 
 
 
421
  "overall_score": mean([s["overall_score"] for s in results]),
422
  "commonvoice_hours": language.commonvoice_hours
423
  if not pd.isna(language.commonvoice_hours)
 
426
  if not pd.isna(language.commonvoice_locale)
427
  else None,
428
  "population": population(language.bcp_47),
429
+ "language_family": language_family(
430
+ language.flores_path.split("_")[0]
431
+ ),
432
  }
433
  )
434
  with open("results.json", "w") as f:
pyproject.toml CHANGED
@@ -20,6 +20,7 @@ dev-dependencies = [
20
  "langcodes>=3.5.0",
21
  "openai>=1.52.2",
22
  "protobuf>=5.28.3",
 
23
  "python-dotenv>=1.0.1",
24
  "sacrebleu>=2.4.3",
25
  "sentencepiece>=0.2.0",
 
20
  "langcodes>=3.5.0",
21
  "openai>=1.52.2",
22
  "protobuf>=5.28.3",
23
+ "pyglottolog>=3.14.0",
24
  "python-dotenv>=1.0.1",
25
  "sacrebleu>=2.4.3",
26
  "sentencepiece>=0.2.0",
results.json CHANGED
@@ -208,7 +208,8 @@
208
  "ZA": 17503716,
209
  "ZM": 2788256,
210
  "ZW": 6109446
211
- }
 
212
  },
213
  {
214
  "language_name": "Chinese",
@@ -252,7 +253,8 @@
252
  "TW": 22422850,
253
  "US": 2295209,
254
  "VN": 1085934
255
- }
 
256
  },
257
  {
258
  "language_name": "Hindi",
@@ -282,7 +284,8 @@
282
  "NP": 127377,
283
  "UG": 2206,
284
  "ZA": 1129272
285
- }
 
286
  },
287
  {
288
  "language_name": "Spanish",
@@ -345,7 +348,8 @@
345
  "US": 31933344,
346
  "UY": 2981097,
347
  "VE": 23488572
348
- }
 
349
  },
350
  {
351
  "language_name": "Arabic",
@@ -407,7 +411,8 @@
407
  "TN": 10549080,
408
  "TR": 459298,
409
  "YE": 22114456
410
- }
 
411
  },
412
  {
413
  "language_name": "Urdu",
@@ -436,7 +441,8 @@
436
  "IN": 66304500,
437
  "MU": 71727,
438
  "PK": 221825950
439
- }
 
440
  },
441
  {
442
  "language_name": "French",
@@ -522,7 +528,8 @@
522
  "VU": 149166,
523
  "WF": 7610,
524
  "YT": 110580
525
- }
 
526
  },
527
  {
528
  "language_name": "Bangla",
@@ -551,7 +558,8 @@
551
  "GB": 263044,
552
  "IN": 107413290,
553
  "NP": 28508
554
- }
 
555
  },
556
  {
557
  "language_name": "Portuguese",
@@ -591,7 +599,8 @@
591
  "PT": 9890592,
592
  "ST": 179454,
593
  "TL": 816395
594
- }
 
595
  },
596
  {
597
  "language_name": "Punjabi",
@@ -621,7 +630,8 @@
621
  "KE": 10170,
622
  "PK": 163450700,
623
  "SG": 9314
624
- }
 
625
  },
626
  {
627
  "language_name": "Russian",
@@ -668,7 +678,8 @@
668
  "UA": 20204534,
669
  "US": 798334,
670
  "UZ": 4279156
671
- }
 
672
  },
673
  {
674
  "language_name": "Swahili",
@@ -701,7 +712,8 @@
701
  "UG": 32439750,
702
  "YT": 2716,
703
  "ZA": 1016
704
- }
 
705
  },
706
  {
707
  "language_name": "Indonesian",
@@ -727,7 +739,8 @@
727
  "population": {
728
  "ID": 170896640,
729
  "NL": 311047
730
- }
 
731
  },
732
  {
733
  "language_name": "German",
@@ -778,7 +791,8 @@
778
  "SI": 883126,
779
  "SK": 1196932,
780
  "US": 1563403
781
- }
 
782
  },
783
  {
784
  "language_name": "Japanese",
@@ -805,7 +819,8 @@
805
  "BR": 444604,
806
  "CA": 52772,
807
  "JP": 119231650
808
- }
 
809
  },
810
  {
811
  "language_name": "Telugu",
@@ -830,7 +845,8 @@
830
  "commonvoice_locale": "te",
831
  "population": {
832
  "IN": 95478480
833
- }
 
834
  },
835
  {
836
  "language_name": "Marathi",
@@ -855,7 +871,8 @@
855
  "commonvoice_locale": "mr",
856
  "population": {
857
  "IN": 92826300
858
- }
 
859
  },
860
  {
861
  "language_name": "Javanese",
@@ -881,7 +898,8 @@
881
  "population": {
882
  "ID": 90788840,
883
  "MY": 391825
884
- }
 
885
  },
886
  {
887
  "language_name": "Vietnamese",
@@ -909,6 +927,7 @@
909
  "CN": 6970,
910
  "US": 1130973,
911
  "VN": 84900318
912
- }
 
913
  }
914
  ]
 
208
  "ZA": 17503716,
209
  "ZM": 2788256,
210
  "ZW": 6109446
211
+ },
212
+ "language_family": "Indo-European"
213
  },
214
  {
215
  "language_name": "Chinese",
 
253
  "TW": 22422850,
254
  "US": 2295209,
255
  "VN": 1085934
256
+ },
257
+ "language_family": "Sino-Tibetan"
258
  },
259
  {
260
  "language_name": "Hindi",
 
284
  "NP": 127377,
285
  "UG": 2206,
286
  "ZA": 1129272
287
+ },
288
+ "language_family": "Indo-European"
289
  },
290
  {
291
  "language_name": "Spanish",
 
348
  "US": 31933344,
349
  "UY": 2981097,
350
  "VE": 23488572
351
+ },
352
+ "language_family": "Indo-European"
353
  },
354
  {
355
  "language_name": "Arabic",
 
411
  "TN": 10549080,
412
  "TR": 459298,
413
  "YE": 22114456
414
+ },
415
+ "language_family": "Afro-Asiatic"
416
  },
417
  {
418
  "language_name": "Urdu",
 
441
  "IN": 66304500,
442
  "MU": 71727,
443
  "PK": 221825950
444
+ },
445
+ "language_family": "Indo-European"
446
  },
447
  {
448
  "language_name": "French",
 
528
  "VU": 149166,
529
  "WF": 7610,
530
  "YT": 110580
531
+ },
532
+ "language_family": "Indo-European"
533
  },
534
  {
535
  "language_name": "Bangla",
 
558
  "GB": 263044,
559
  "IN": 107413290,
560
  "NP": 28508
561
+ },
562
+ "language_family": "Indo-European"
563
  },
564
  {
565
  "language_name": "Portuguese",
 
599
  "PT": 9890592,
600
  "ST": 179454,
601
  "TL": 816395
602
+ },
603
+ "language_family": "Indo-European"
604
  },
605
  {
606
  "language_name": "Punjabi",
 
630
  "KE": 10170,
631
  "PK": 163450700,
632
  "SG": 9314
633
+ },
634
+ "language_family": "Indo-European"
635
  },
636
  {
637
  "language_name": "Russian",
 
678
  "UA": 20204534,
679
  "US": 798334,
680
  "UZ": 4279156
681
+ },
682
+ "language_family": "Indo-European"
683
  },
684
  {
685
  "language_name": "Swahili",
 
712
  "UG": 32439750,
713
  "YT": 2716,
714
  "ZA": 1016
715
+ },
716
+ "language_family": "Atlantic-Congo"
717
  },
718
  {
719
  "language_name": "Indonesian",
 
739
  "population": {
740
  "ID": 170896640,
741
  "NL": 311047
742
+ },
743
+ "language_family": "Austronesian"
744
  },
745
  {
746
  "language_name": "German",
 
791
  "SI": 883126,
792
  "SK": 1196932,
793
  "US": 1563403
794
+ },
795
+ "language_family": "Indo-European"
796
  },
797
  {
798
  "language_name": "Japanese",
 
819
  "BR": 444604,
820
  "CA": 52772,
821
  "JP": 119231650
822
+ },
823
+ "language_family": "Japonic"
824
  },
825
  {
826
  "language_name": "Telugu",
 
845
  "commonvoice_locale": "te",
846
  "population": {
847
  "IN": 95478480
848
+ },
849
+ "language_family": "Dravidian"
850
  },
851
  {
852
  "language_name": "Marathi",
 
871
  "commonvoice_locale": "mr",
872
  "population": {
873
  "IN": 92826300
874
+ },
875
+ "language_family": "Indo-European"
876
  },
877
  {
878
  "language_name": "Javanese",
 
898
  "population": {
899
  "ID": 90788840,
900
  "MY": 391825
901
+ },
902
+ "language_family": "Austronesian"
903
  },
904
  {
905
  "language_name": "Vietnamese",
 
927
  "CN": 6970,
928
  "US": 1130973,
929
  "VN": 84900318
930
+ },
931
+ "language_family": "Austroasiatic"
932
  }
933
  ]
uv.lock CHANGED
The diff for this file is too large to render. See raw diff