David Pomerenke
committed on
Commit
·
08735bb
1
Parent(s):
63fd3b1
Add language families
Browse files
- .gitignore +1 -6
- data/data.txt +1 -0
- evals.py +15 -6
- pyproject.toml +1 -0
- results.json +38 -19
- uv.lock +0 -0
.gitignore
CHANGED
@@ -1,15 +1,10 @@
|
|
1 |
floresp-*
|
|
|
2 |
LanguageCodes.tab
|
3 |
ScriptCodes.csv
|
4 |
.cache
|
5 |
.env
|
6 |
|
7 |
-
# Observable
|
8 |
-
.DS_Store
|
9 |
-
/dist/
|
10 |
-
node_modules/
|
11 |
-
yarn-error.log
|
12 |
-
|
13 |
# Python-generated files
|
14 |
__pycache__/
|
15 |
*.py[oc]
|
|
|
1 |
floresp-*
|
2 |
+
glottolog-*
|
3 |
LanguageCodes.tab
|
4 |
ScriptCodes.csv
|
5 |
.cache
|
6 |
.env
|
7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
8 |
# Python-generated files
|
9 |
__pycache__/
|
10 |
*.py[oc]
|
data/data.txt
CHANGED
@@ -1,4 +1,5 @@
|
|
1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
|
|
2 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
3 |
LanguageCodes.tab: https://www.ethnologue.com/
|
4 |
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
|
|
1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
2 |
+
glottolog-5.1: https://github.com/glottolog/glottolog
|
3 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
4 |
LanguageCodes.tab: https://www.ethnologue.com/
|
5 |
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
evals.py
CHANGED
@@ -19,6 +19,7 @@ from requests import get
|
|
19 |
from rich import print
|
20 |
from tqdm.asyncio import tqdm_asyncio
|
21 |
from transformers import NllbTokenizer
|
|
|
22 |
|
23 |
# config
|
24 |
models = [
|
@@ -73,6 +74,15 @@ def population(bcp_47):
|
|
73 |
return items
|
74 |
|
75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
def script_name(iso15924):
|
77 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
78 |
|
@@ -406,12 +416,8 @@ async def main():
|
|
406 |
"scores": results,
|
407 |
"mt_bleu": mean([s["mt_bleu"] for s in results]),
|
408 |
"mt_chrf": mean([s["mt_chrf"] for s in results]),
|
409 |
-
"cls_acc": mean(
|
410 |
-
|
411 |
-
),
|
412 |
-
"mlm_chrf": mean(
|
413 |
-
[s["mlm_chrf"] for s in results]
|
414 |
-
),
|
415 |
"overall_score": mean([s["overall_score"] for s in results]),
|
416 |
"commonvoice_hours": language.commonvoice_hours
|
417 |
if not pd.isna(language.commonvoice_hours)
|
@@ -420,6 +426,9 @@ async def main():
|
|
420 |
if not pd.isna(language.commonvoice_locale)
|
421 |
else None,
|
422 |
"population": population(language.bcp_47),
|
|
|
|
|
|
|
423 |
}
|
424 |
)
|
425 |
with open("results.json", "w") as f:
|
|
|
19 |
from rich import print
|
20 |
from tqdm.asyncio import tqdm_asyncio
|
21 |
from transformers import NllbTokenizer
|
22 |
+
from pyglottolog import Glottolog
|
23 |
|
24 |
# config
|
25 |
models = [
|
|
|
74 |
return items
|
75 |
|
76 |
|
77 |
+
glottolog = Glottolog("data/glottolog-5.1")
|
78 |
+
|
79 |
+
|
80 |
+
@cache
|
81 |
+
def language_family(iso_639_3):
|
82 |
+
languoid = glottolog.languoid(iso_639_3)
|
83 |
+
return languoid.family.name if languoid else None
|
84 |
+
|
85 |
+
|
86 |
def script_name(iso15924):
|
87 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
88 |
|
|
|
416 |
"scores": results,
|
417 |
"mt_bleu": mean([s["mt_bleu"] for s in results]),
|
418 |
"mt_chrf": mean([s["mt_chrf"] for s in results]),
|
419 |
+
"cls_acc": mean([s["cls_acc"] for s in results]),
|
420 |
+
"mlm_chrf": mean([s["mlm_chrf"] for s in results]),
|
|
|
|
|
|
|
|
|
421 |
"overall_score": mean([s["overall_score"] for s in results]),
|
422 |
"commonvoice_hours": language.commonvoice_hours
|
423 |
if not pd.isna(language.commonvoice_hours)
|
|
|
426 |
if not pd.isna(language.commonvoice_locale)
|
427 |
else None,
|
428 |
"population": population(language.bcp_47),
|
429 |
+
"language_family": language_family(
|
430 |
+
language.flores_path.split("_")[0]
|
431 |
+
),
|
432 |
}
|
433 |
)
|
434 |
with open("results.json", "w") as f:
|
pyproject.toml
CHANGED
@@ -20,6 +20,7 @@ dev-dependencies = [
|
|
20 |
"langcodes>=3.5.0",
|
21 |
"openai>=1.52.2",
|
22 |
"protobuf>=5.28.3",
|
|
|
23 |
"python-dotenv>=1.0.1",
|
24 |
"sacrebleu>=2.4.3",
|
25 |
"sentencepiece>=0.2.0",
|
|
|
20 |
"langcodes>=3.5.0",
|
21 |
"openai>=1.52.2",
|
22 |
"protobuf>=5.28.3",
|
23 |
+
"pyglottolog>=3.14.0",
|
24 |
"python-dotenv>=1.0.1",
|
25 |
"sacrebleu>=2.4.3",
|
26 |
"sentencepiece>=0.2.0",
|
results.json
CHANGED
@@ -208,7 +208,8 @@
|
|
208 |
"ZA": 17503716,
|
209 |
"ZM": 2788256,
|
210 |
"ZW": 6109446
|
211 |
-
}
|
|
|
212 |
},
|
213 |
{
|
214 |
"language_name": "Chinese",
|
@@ -252,7 +253,8 @@
|
|
252 |
"TW": 22422850,
|
253 |
"US": 2295209,
|
254 |
"VN": 1085934
|
255 |
-
}
|
|
|
256 |
},
|
257 |
{
|
258 |
"language_name": "Hindi",
|
@@ -282,7 +284,8 @@
|
|
282 |
"NP": 127377,
|
283 |
"UG": 2206,
|
284 |
"ZA": 1129272
|
285 |
-
}
|
|
|
286 |
},
|
287 |
{
|
288 |
"language_name": "Spanish",
|
@@ -345,7 +348,8 @@
|
|
345 |
"US": 31933344,
|
346 |
"UY": 2981097,
|
347 |
"VE": 23488572
|
348 |
-
}
|
|
|
349 |
},
|
350 |
{
|
351 |
"language_name": "Arabic",
|
@@ -407,7 +411,8 @@
|
|
407 |
"TN": 10549080,
|
408 |
"TR": 459298,
|
409 |
"YE": 22114456
|
410 |
-
}
|
|
|
411 |
},
|
412 |
{
|
413 |
"language_name": "Urdu",
|
@@ -436,7 +441,8 @@
|
|
436 |
"IN": 66304500,
|
437 |
"MU": 71727,
|
438 |
"PK": 221825950
|
439 |
-
}
|
|
|
440 |
},
|
441 |
{
|
442 |
"language_name": "French",
|
@@ -522,7 +528,8 @@
|
|
522 |
"VU": 149166,
|
523 |
"WF": 7610,
|
524 |
"YT": 110580
|
525 |
-
}
|
|
|
526 |
},
|
527 |
{
|
528 |
"language_name": "Bangla",
|
@@ -551,7 +558,8 @@
|
|
551 |
"GB": 263044,
|
552 |
"IN": 107413290,
|
553 |
"NP": 28508
|
554 |
-
}
|
|
|
555 |
},
|
556 |
{
|
557 |
"language_name": "Portuguese",
|
@@ -591,7 +599,8 @@
|
|
591 |
"PT": 9890592,
|
592 |
"ST": 179454,
|
593 |
"TL": 816395
|
594 |
-
}
|
|
|
595 |
},
|
596 |
{
|
597 |
"language_name": "Punjabi",
|
@@ -621,7 +630,8 @@
|
|
621 |
"KE": 10170,
|
622 |
"PK": 163450700,
|
623 |
"SG": 9314
|
624 |
-
}
|
|
|
625 |
},
|
626 |
{
|
627 |
"language_name": "Russian",
|
@@ -668,7 +678,8 @@
|
|
668 |
"UA": 20204534,
|
669 |
"US": 798334,
|
670 |
"UZ": 4279156
|
671 |
-
}
|
|
|
672 |
},
|
673 |
{
|
674 |
"language_name": "Swahili",
|
@@ -701,7 +712,8 @@
|
|
701 |
"UG": 32439750,
|
702 |
"YT": 2716,
|
703 |
"ZA": 1016
|
704 |
-
}
|
|
|
705 |
},
|
706 |
{
|
707 |
"language_name": "Indonesian",
|
@@ -727,7 +739,8 @@
|
|
727 |
"population": {
|
728 |
"ID": 170896640,
|
729 |
"NL": 311047
|
730 |
-
}
|
|
|
731 |
},
|
732 |
{
|
733 |
"language_name": "German",
|
@@ -778,7 +791,8 @@
|
|
778 |
"SI": 883126,
|
779 |
"SK": 1196932,
|
780 |
"US": 1563403
|
781 |
-
}
|
|
|
782 |
},
|
783 |
{
|
784 |
"language_name": "Japanese",
|
@@ -805,7 +819,8 @@
|
|
805 |
"BR": 444604,
|
806 |
"CA": 52772,
|
807 |
"JP": 119231650
|
808 |
-
}
|
|
|
809 |
},
|
810 |
{
|
811 |
"language_name": "Telugu",
|
@@ -830,7 +845,8 @@
|
|
830 |
"commonvoice_locale": "te",
|
831 |
"population": {
|
832 |
"IN": 95478480
|
833 |
-
}
|
|
|
834 |
},
|
835 |
{
|
836 |
"language_name": "Marathi",
|
@@ -855,7 +871,8 @@
|
|
855 |
"commonvoice_locale": "mr",
|
856 |
"population": {
|
857 |
"IN": 92826300
|
858 |
-
}
|
|
|
859 |
},
|
860 |
{
|
861 |
"language_name": "Javanese",
|
@@ -881,7 +898,8 @@
|
|
881 |
"population": {
|
882 |
"ID": 90788840,
|
883 |
"MY": 391825
|
884 |
-
}
|
|
|
885 |
},
|
886 |
{
|
887 |
"language_name": "Vietnamese",
|
@@ -909,6 +927,7 @@
|
|
909 |
"CN": 6970,
|
910 |
"US": 1130973,
|
911 |
"VN": 84900318
|
912 |
-
}
|
|
|
913 |
}
|
914 |
]
|
|
|
208 |
"ZA": 17503716,
|
209 |
"ZM": 2788256,
|
210 |
"ZW": 6109446
|
211 |
+
},
|
212 |
+
"language_family": "Indo-European"
|
213 |
},
|
214 |
{
|
215 |
"language_name": "Chinese",
|
|
|
253 |
"TW": 22422850,
|
254 |
"US": 2295209,
|
255 |
"VN": 1085934
|
256 |
+
},
|
257 |
+
"language_family": "Sino-Tibetan"
|
258 |
},
|
259 |
{
|
260 |
"language_name": "Hindi",
|
|
|
284 |
"NP": 127377,
|
285 |
"UG": 2206,
|
286 |
"ZA": 1129272
|
287 |
+
},
|
288 |
+
"language_family": "Indo-European"
|
289 |
},
|
290 |
{
|
291 |
"language_name": "Spanish",
|
|
|
348 |
"US": 31933344,
|
349 |
"UY": 2981097,
|
350 |
"VE": 23488572
|
351 |
+
},
|
352 |
+
"language_family": "Indo-European"
|
353 |
},
|
354 |
{
|
355 |
"language_name": "Arabic",
|
|
|
411 |
"TN": 10549080,
|
412 |
"TR": 459298,
|
413 |
"YE": 22114456
|
414 |
+
},
|
415 |
+
"language_family": "Afro-Asiatic"
|
416 |
},
|
417 |
{
|
418 |
"language_name": "Urdu",
|
|
|
441 |
"IN": 66304500,
|
442 |
"MU": 71727,
|
443 |
"PK": 221825950
|
444 |
+
},
|
445 |
+
"language_family": "Indo-European"
|
446 |
},
|
447 |
{
|
448 |
"language_name": "French",
|
|
|
528 |
"VU": 149166,
|
529 |
"WF": 7610,
|
530 |
"YT": 110580
|
531 |
+
},
|
532 |
+
"language_family": "Indo-European"
|
533 |
},
|
534 |
{
|
535 |
"language_name": "Bangla",
|
|
|
558 |
"GB": 263044,
|
559 |
"IN": 107413290,
|
560 |
"NP": 28508
|
561 |
+
},
|
562 |
+
"language_family": "Indo-European"
|
563 |
},
|
564 |
{
|
565 |
"language_name": "Portuguese",
|
|
|
599 |
"PT": 9890592,
|
600 |
"ST": 179454,
|
601 |
"TL": 816395
|
602 |
+
},
|
603 |
+
"language_family": "Indo-European"
|
604 |
},
|
605 |
{
|
606 |
"language_name": "Punjabi",
|
|
|
630 |
"KE": 10170,
|
631 |
"PK": 163450700,
|
632 |
"SG": 9314
|
633 |
+
},
|
634 |
+
"language_family": "Indo-European"
|
635 |
},
|
636 |
{
|
637 |
"language_name": "Russian",
|
|
|
678 |
"UA": 20204534,
|
679 |
"US": 798334,
|
680 |
"UZ": 4279156
|
681 |
+
},
|
682 |
+
"language_family": "Indo-European"
|
683 |
},
|
684 |
{
|
685 |
"language_name": "Swahili",
|
|
|
712 |
"UG": 32439750,
|
713 |
"YT": 2716,
|
714 |
"ZA": 1016
|
715 |
+
},
|
716 |
+
"language_family": "Atlantic-Congo"
|
717 |
},
|
718 |
{
|
719 |
"language_name": "Indonesian",
|
|
|
739 |
"population": {
|
740 |
"ID": 170896640,
|
741 |
"NL": 311047
|
742 |
+
},
|
743 |
+
"language_family": "Austronesian"
|
744 |
},
|
745 |
{
|
746 |
"language_name": "German",
|
|
|
791 |
"SI": 883126,
|
792 |
"SK": 1196932,
|
793 |
"US": 1563403
|
794 |
+
},
|
795 |
+
"language_family": "Indo-European"
|
796 |
},
|
797 |
{
|
798 |
"language_name": "Japanese",
|
|
|
819 |
"BR": 444604,
|
820 |
"CA": 52772,
|
821 |
"JP": 119231650
|
822 |
+
},
|
823 |
+
"language_family": "Japonic"
|
824 |
},
|
825 |
{
|
826 |
"language_name": "Telugu",
|
|
|
845 |
"commonvoice_locale": "te",
|
846 |
"population": {
|
847 |
"IN": 95478480
|
848 |
+
},
|
849 |
+
"language_family": "Dravidian"
|
850 |
},
|
851 |
{
|
852 |
"language_name": "Marathi",
|
|
|
871 |
"commonvoice_locale": "mr",
|
872 |
"population": {
|
873 |
"IN": 92826300
|
874 |
+
},
|
875 |
+
"language_family": "Indo-European"
|
876 |
},
|
877 |
{
|
878 |
"language_name": "Javanese",
|
|
|
898 |
"population": {
|
899 |
"ID": 90788840,
|
900 |
"MY": 391825
|
901 |
+
},
|
902 |
+
"language_family": "Austronesian"
|
903 |
},
|
904 |
{
|
905 |
"language_name": "Vietnamese",
|
|
|
927 |
"CN": 6970,
|
928 |
"US": 1130973,
|
929 |
"VN": 84900318
|
930 |
+
},
|
931 |
+
"language_family": "Austroasiatic"
|
932 |
}
|
933 |
]
|
uv.lock
CHANGED
The diff for this file is too large to render.
See raw diff
|
|