David Pomerenke
commited on
Commit
·
c527cda
1
Parent(s):
f18ff7d
Move data files
Browse files
README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
---
|
2 |
title: AI Language Monitor
|
3 |
-
emoji:
|
4 |
colorFrom: purple
|
5 |
colorTo: pink
|
6 |
sdk: static
|
|
|
1 |
---
|
2 |
title: AI Language Monitor
|
3 |
+
emoji: π
|
4 |
colorFrom: purple
|
5 |
colorTo: pink
|
6 |
sdk: static
|
observablehq.config.js
CHANGED
@@ -6,15 +6,11 @@ export default {
|
|
6 |
// The pages and sections in the sidebar. If you don't specify this option,
|
7 |
// all pages will be listed in alphabetical order. Listing pages explicitly
|
8 |
// lets you organize them into sections and have unlisted pages.
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
// {name: "Report", path: "/example-report"}
|
15 |
-
// ]
|
16 |
-
// }
|
17 |
-
// ],
|
18 |
|
19 |
// Content to add to the head of the page, e.g. for a favicon:
|
20 |
head: '<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>π</text></svg>">',
|
|
|
6 |
// The pages and sections in the sidebar. If you don't specify this option,
|
7 |
// all pages will be listed in alphabetical order. Listing pages explicitly
|
8 |
// lets you organize them into sections and have unlisted pages.
|
9 |
+
pages: [
|
10 |
+
{ name: "Compare Languages", path: "/compare-languages" },
|
11 |
+
{ name: "Compare AI Models", path: "/compare-ai-models" },
|
12 |
+
{ name: "Methodology", path: "/methodology" },
|
13 |
+
],
|
|
|
|
|
|
|
|
|
14 |
|
15 |
// Content to add to the head of the page, e.g. for a favicon:
|
16 |
head: '<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>π</text></svg>">',
|
src/{compare-ais.md β compare-ai-models.md}
RENAMED
File without changes
|
data.txt β src/data/data.txt
RENAMED
@@ -1,4 +1,4 @@
|
|
1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
2 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
3 |
LanguageCodes.tab: https://www.ethnologue.com/
|
4 |
-
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
|
|
1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
2 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
3 |
LanguageCodes.tab: https://www.ethnologue.com/
|
4 |
+
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
src/data/languagebench.json
DELETED
@@ -1,62 +0,0 @@
|
|
1 |
-
[
|
2 |
-
{
|
3 |
-
"language_name": "English",
|
4 |
-
"language_code": "eng",
|
5 |
-
"speakers": 1132366680.0,
|
6 |
-
"scores": [
|
7 |
-
{
|
8 |
-
"model": "anthropic/claude-3.5-haiku",
|
9 |
-
"bleu": 0.4114123099745433
|
10 |
-
}
|
11 |
-
],
|
12 |
-
"bleu": 0.4114123099745433
|
13 |
-
},
|
14 |
-
{
|
15 |
-
"language_name": "Mandarin Chinese",
|
16 |
-
"language_code": "cmn",
|
17 |
-
"speakers": 1074000000.0,
|
18 |
-
"scores": [
|
19 |
-
{
|
20 |
-
"model": "anthropic/claude-3.5-haiku",
|
21 |
-
"bleu": 0.22799274850984375
|
22 |
-
}
|
23 |
-
],
|
24 |
-
"bleu": 0.22799274850984375
|
25 |
-
},
|
26 |
-
{
|
27 |
-
"language_name": "Spanish",
|
28 |
-
"language_code": "spa",
|
29 |
-
"speakers": 485000000.0,
|
30 |
-
"scores": [
|
31 |
-
{
|
32 |
-
"model": "anthropic/claude-3.5-haiku",
|
33 |
-
"bleu": 0.27814703404841756
|
34 |
-
}
|
35 |
-
],
|
36 |
-
"bleu": 0.27814703404841756
|
37 |
-
},
|
38 |
-
{
|
39 |
-
"language_name": "Hindi",
|
40 |
-
"language_code": "hin",
|
41 |
-
"speakers": 341000000.0,
|
42 |
-
"scores": [
|
43 |
-
{
|
44 |
-
"model": "anthropic/claude-3.5-haiku",
|
45 |
-
"bleu": 0.2607691459848629
|
46 |
-
}
|
47 |
-
],
|
48 |
-
"bleu": 0.2607691459848629
|
49 |
-
},
|
50 |
-
{
|
51 |
-
"language_name": "Bengali",
|
52 |
-
"language_code": "ben",
|
53 |
-
"speakers": 300000000.0,
|
54 |
-
"scores": [
|
55 |
-
{
|
56 |
-
"model": "anthropic/claude-3.5-haiku",
|
57 |
-
"bleu": 0.2504671437388243
|
58 |
-
}
|
59 |
-
],
|
60 |
-
"bleu": 0.2504671437388243
|
61 |
-
}
|
62 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/data/languagebench.json.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import os
|
|
|
4 |
from os import getenv
|
|
|
5 |
|
6 |
import evaluate
|
7 |
import pandas as pd
|
@@ -43,9 +45,9 @@ def reorder(language_name):
|
|
43 |
return language_name.split(",")[1] + " " + language_name.split(",")[0]
|
44 |
return language_name
|
45 |
|
46 |
-
|
47 |
# load benchmark languages and scripts
|
48 |
-
|
|
|
49 |
benchmark_languages = pd.DataFrame(
|
50 |
[f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
|
51 |
columns=["language_code", "script_code"],
|
@@ -56,7 +58,7 @@ benchmark_languages["in_benchmark"] = True
|
|
56 |
|
57 |
# load Ethnologue language names
|
58 |
language_names = (
|
59 |
-
pd.read_csv("LanguageCodes.tab", sep="\t")
|
60 |
.rename(columns={"LangID": "language_code", "Name": "language_name"})[
|
61 |
["language_code", "language_name"]
|
62 |
]
|
@@ -65,7 +67,7 @@ language_names = (
|
|
65 |
|
66 |
# load Wikidata speaker stats
|
67 |
language_stats = (
|
68 |
-
pd.read_csv("languages.tsv", sep="\t")
|
69 |
.rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
|
70 |
["language_code", "speakers"]
|
71 |
]
|
@@ -84,7 +86,7 @@ language_stats = language_stats[
|
|
84 |
]
|
85 |
|
86 |
# load unicode script names
|
87 |
-
script_names = pd.read_csv("ScriptCodes.csv").rename(
|
88 |
columns={"Code": "script_code", "English Name": "script_name"}
|
89 |
)[["script_code", "script_name"]]
|
90 |
|
@@ -166,7 +168,7 @@ async def main():
|
|
166 |
if not pd.isna(language.language_name)
|
167 |
else language.language_code
|
168 |
)
|
169 |
-
print(name)
|
170 |
scores = []
|
171 |
if language.in_benchmark:
|
172 |
target_sentences = load_sentences(language)[:n_sentences]
|
@@ -185,7 +187,7 @@ async def main():
|
|
185 |
load_sentences(lang)[i]
|
186 |
for i, lang in enumerate(_original_languages.itertuples())
|
187 |
]
|
188 |
-
print(model)
|
189 |
predictions = [
|
190 |
translate(
|
191 |
model, language.language_name, language.script_name, sentence
|
@@ -220,8 +222,7 @@ async def main():
|
|
220 |
# "bert_score": mean([s["bert_score"] for s in scores]),
|
221 |
}
|
222 |
)
|
223 |
-
|
224 |
-
json.dump(results, f, indent=2, ensure_ascii=False)
|
225 |
|
226 |
|
227 |
if __name__ == "__main__":
|
|
|
1 |
import asyncio
|
2 |
import json
|
3 |
import os
|
4 |
+
import sys
|
5 |
from os import getenv
|
6 |
+
from pathlib import Path
|
7 |
|
8 |
import evaluate
|
9 |
import pandas as pd
|
|
|
45 |
return language_name.split(",")[1] + " " + language_name.split(",")[0]
|
46 |
return language_name
|
47 |
|
|
|
48 |
# load benchmark languages and scripts
|
49 |
+
data = Path("src/data")
|
50 |
+
benchmark_dir = data / "floresp-v2.0-rc.3/dev"
|
51 |
benchmark_languages = pd.DataFrame(
|
52 |
[f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
|
53 |
columns=["language_code", "script_code"],
|
|
|
58 |
|
59 |
# load Ethnologue language names
|
60 |
language_names = (
|
61 |
+
pd.read_csv(data / "LanguageCodes.tab", sep="\t")
|
62 |
.rename(columns={"LangID": "language_code", "Name": "language_name"})[
|
63 |
["language_code", "language_name"]
|
64 |
]
|
|
|
67 |
|
68 |
# load Wikidata speaker stats
|
69 |
language_stats = (
|
70 |
+
pd.read_csv(data / "languages.tsv", sep="\t")
|
71 |
.rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
|
72 |
["language_code", "speakers"]
|
73 |
]
|
|
|
86 |
]
|
87 |
|
88 |
# load unicode script names
|
89 |
+
script_names = pd.read_csv(data / "ScriptCodes.csv").rename(
|
90 |
columns={"Code": "script_code", "English Name": "script_name"}
|
91 |
)[["script_code", "script_name"]]
|
92 |
|
|
|
168 |
if not pd.isna(language.language_name)
|
169 |
else language.language_code
|
170 |
)
|
171 |
+
print(name, file=sys.stderr)
|
172 |
scores = []
|
173 |
if language.in_benchmark:
|
174 |
target_sentences = load_sentences(language)[:n_sentences]
|
|
|
187 |
load_sentences(lang)[i]
|
188 |
for i, lang in enumerate(_original_languages.itertuples())
|
189 |
]
|
190 |
+
print(model, file=sys.stderr)
|
191 |
predictions = [
|
192 |
translate(
|
193 |
model, language.language_name, language.script_name, sentence
|
|
|
222 |
# "bert_score": mean([s["bert_score"] for s in scores]),
|
223 |
}
|
224 |
)
|
225 |
+
print(json.dumps(results, indent=2, ensure_ascii=False))
|
|
|
226 |
|
227 |
|
228 |
if __name__ == "__main__":
|
languages.rq β src/data/languages.rq
RENAMED
File without changes
|
languages.tsv β src/data/languages.tsv
RENAMED
File without changes
|