David Pomerenke committed on
Commit
c527cda
·
1 Parent(s): f18ff7d

Move data files

Browse files
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
  title: AI Language Monitor
3
- emoji: 👀
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: static
 
1
  ---
2
  title: AI Language Monitor
3
+ emoji: 🌍
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: static
observablehq.config.js CHANGED
@@ -6,15 +6,11 @@ export default {
6
  // The pages and sections in the sidebar. If you don’t specify this option,
7
  // all pages will be listed in alphabetical order. Listing pages explicitly
8
  // lets you organize them into sections and have unlisted pages.
9
- // pages: [
10
- // {
11
- // name: "Examples",
12
- // pages: [
13
- // {name: "Dashboard", path: "/example-dashboard"},
14
- // {name: "Report", path: "/example-report"}
15
- // ]
16
- // }
17
- // ],
18
 
19
  // Content to add to the head of the page, e.g. for a favicon:
20
  head: '<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>🌍</text></svg>">',
 
6
  // The pages and sections in the sidebar. If you don’t specify this option,
7
  // all pages will be listed in alphabetical order. Listing pages explicitly
8
  // lets you organize them into sections and have unlisted pages.
9
+ pages: [
10
+ { name: "Compare Languages", path: "/compare-languages" },
11
+ { name: "Compare AI Models", path: "/compare-ai-models" },
12
+ { name: "Methodology", path: "/methodology" },
13
+ ],
 
 
 
 
14
 
15
  // Content to add to the head of the page, e.g. for a favicon:
16
  head: '<link rel="icon" href="data:image/svg+xml,<svg xmlns=%22http://www.w3.org/2000/svg%22 viewBox=%220 0 100 100%22><text y=%22.9em%22 font-size=%2290%22 fill=%22black%22>🌍</text></svg>">',
src/{compare-ais.md → compare-ai-models.md} RENAMED
File without changes
data.txt → src/data/data.txt RENAMED
@@ -1,4 +1,4 @@
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
2
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
3
  LanguageCodes.tab: https://www.ethnologue.com/
4
- ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
 
1
  floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
2
  languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
3
  LanguageCodes.tab: https://www.ethnologue.com/
4
+ ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
src/data/languagebench.json DELETED
@@ -1,62 +0,0 @@
1
- [
2
- {
3
- "language_name": "English",
4
- "language_code": "eng",
5
- "speakers": 1132366680.0,
6
- "scores": [
7
- {
8
- "model": "anthropic/claude-3.5-haiku",
9
- "bleu": 0.4114123099745433
10
- }
11
- ],
12
- "bleu": 0.4114123099745433
13
- },
14
- {
15
- "language_name": "Mandarin Chinese",
16
- "language_code": "cmn",
17
- "speakers": 1074000000.0,
18
- "scores": [
19
- {
20
- "model": "anthropic/claude-3.5-haiku",
21
- "bleu": 0.22799274850984375
22
- }
23
- ],
24
- "bleu": 0.22799274850984375
25
- },
26
- {
27
- "language_name": "Spanish",
28
- "language_code": "spa",
29
- "speakers": 485000000.0,
30
- "scores": [
31
- {
32
- "model": "anthropic/claude-3.5-haiku",
33
- "bleu": 0.27814703404841756
34
- }
35
- ],
36
- "bleu": 0.27814703404841756
37
- },
38
- {
39
- "language_name": "Hindi",
40
- "language_code": "hin",
41
- "speakers": 341000000.0,
42
- "scores": [
43
- {
44
- "model": "anthropic/claude-3.5-haiku",
45
- "bleu": 0.2607691459848629
46
- }
47
- ],
48
- "bleu": 0.2607691459848629
49
- },
50
- {
51
- "language_name": "Bengali",
52
- "language_code": "ben",
53
- "speakers": 300000000.0,
54
- "scores": [
55
- {
56
- "model": "anthropic/claude-3.5-haiku",
57
- "bleu": 0.2504671437388243
58
- }
59
- ],
60
- "bleu": 0.2504671437388243
61
- }
62
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/data/languagebench.json.py CHANGED
@@ -1,7 +1,9 @@
1
  import asyncio
2
  import json
3
  import os
 
4
  from os import getenv
 
5
 
6
  import evaluate
7
  import pandas as pd
@@ -43,9 +45,9 @@ def reorder(language_name):
43
  return language_name.split(",")[1] + " " + language_name.split(",")[0]
44
  return language_name
45
 
46
-
47
  # load benchmark languages and scripts
48
- benchmark_dir = "floresp-v2.0-rc.3/dev"
 
49
  benchmark_languages = pd.DataFrame(
50
  [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
51
  columns=["language_code", "script_code"],
@@ -56,7 +58,7 @@ benchmark_languages["in_benchmark"] = True
56
 
57
  # load Ethnologue language names
58
  language_names = (
59
- pd.read_csv("LanguageCodes.tab", sep="\t")
60
  .rename(columns={"LangID": "language_code", "Name": "language_name"})[
61
  ["language_code", "language_name"]
62
  ]
@@ -65,7 +67,7 @@ language_names = (
65
 
66
  # load Wikidata speaker stats
67
  language_stats = (
68
- pd.read_csv("languages.tsv", sep="\t")
69
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
70
  ["language_code", "speakers"]
71
  ]
@@ -84,7 +86,7 @@ language_stats = language_stats[
84
  ]
85
 
86
  # load unicode script names
87
- script_names = pd.read_csv("ScriptCodes.csv").rename(
88
  columns={"Code": "script_code", "English Name": "script_name"}
89
  )[["script_code", "script_name"]]
90
 
@@ -166,7 +168,7 @@ async def main():
166
  if not pd.isna(language.language_name)
167
  else language.language_code
168
  )
169
- print(name)
170
  scores = []
171
  if language.in_benchmark:
172
  target_sentences = load_sentences(language)[:n_sentences]
@@ -185,7 +187,7 @@ async def main():
185
  load_sentences(lang)[i]
186
  for i, lang in enumerate(_original_languages.itertuples())
187
  ]
188
- print(model)
189
  predictions = [
190
  translate(
191
  model, language.language_name, language.script_name, sentence
@@ -220,8 +222,7 @@ async def main():
220
  # "bert_score": mean([s["bert_score"] for s in scores]),
221
  }
222
  )
223
- with open("src/data/languagebench.json", "w") as f:
224
- json.dump(results, f, indent=2, ensure_ascii=False)
225
 
226
 
227
  if __name__ == "__main__":
 
1
  import asyncio
2
  import json
3
  import os
4
+ import sys
5
  from os import getenv
6
+ from pathlib import Path
7
 
8
  import evaluate
9
  import pandas as pd
 
45
  return language_name.split(",")[1] + " " + language_name.split(",")[0]
46
  return language_name
47
 
 
48
  # load benchmark languages and scripts
49
+ data = Path("src/data")
50
+ benchmark_dir = data / "floresp-v2.0-rc.3/dev"
51
  benchmark_languages = pd.DataFrame(
52
  [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
53
  columns=["language_code", "script_code"],
 
58
 
59
  # load Ethnologue language names
60
  language_names = (
61
+ pd.read_csv(data / "LanguageCodes.tab", sep="\t")
62
  .rename(columns={"LangID": "language_code", "Name": "language_name"})[
63
  ["language_code", "language_name"]
64
  ]
 
67
 
68
  # load Wikidata speaker stats
69
  language_stats = (
70
+ pd.read_csv(data / "languages.tsv", sep="\t")
71
  .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
72
  ["language_code", "speakers"]
73
  ]
 
86
  ]
87
 
88
  # load unicode script names
89
+ script_names = pd.read_csv(data / "ScriptCodes.csv").rename(
90
  columns={"Code": "script_code", "English Name": "script_name"}
91
  )[["script_code", "script_name"]]
92
 
 
168
  if not pd.isna(language.language_name)
169
  else language.language_code
170
  )
171
+ print(name, file=sys.stderr)
172
  scores = []
173
  if language.in_benchmark:
174
  target_sentences = load_sentences(language)[:n_sentences]
 
187
  load_sentences(lang)[i]
188
  for i, lang in enumerate(_original_languages.itertuples())
189
  ]
190
+ print(model, file=sys.stderr)
191
  predictions = [
192
  translate(
193
  model, language.language_name, language.script_name, sentence
 
222
  # "bert_score": mean([s["bert_score"] for s in scores]),
223
  }
224
  )
225
+ print(json.dumps(results, indent=2, ensure_ascii=False))
 
226
 
227
 
228
  if __name__ == "__main__":
languages.rq → src/data/languages.rq RENAMED
File without changes
languages.tsv → src/data/languages.tsv RENAMED
File without changes