David Pomerenke committed on
Commit
8190782
·
1 Parent(s): d5fc8b3

Add links to add CommonVoice recordings

Browse files
Files changed (3) hide show
  1. app.py +3 -2
  2. evals.py +23 -7
  3. results.json +20 -10
app.py CHANGED
@@ -178,6 +178,7 @@ def create_language_stats_df(results):
178
  model = best_score['model']
179
  model_name = model.split('/')[-1] if model else "N/A"
180
  model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" if model else "N/A"
 
181
  row = {
182
  "Language": f"**{lang['language_name']}**",
183
  "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
@@ -189,7 +190,7 @@ def create_language_stats_df(results):
189
  "Best Model BLEU": round(best_score["bleu"], 3)
190
  if best_score["bleu"] is not None
191
  else "N/A",
192
- "CommonVoice Hours": lang["commonvoice_hours"],
193
  }
194
  flat_data.append(row)
195
 
@@ -198,7 +199,7 @@ def create_language_stats_df(results):
198
  value=df,
199
  label="Language Results",
200
  show_search="search",
201
- datatype=["markdown", "number", "number", "number", "markdown", "number"],
202
  )
203
 
204
 
 
178
  model = best_score['model']
179
  model_name = model.split('/')[-1] if model else "N/A"
180
  model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" if model else "N/A"
181
+ commonvoice_link = f"<!--{lang['commonvoice_hours']:07} (for sorting)--> <a href='https://commonvoice.mozilla.org/{lang['commonvoice_locale']}/speak' style='text-decoration: none; color: inherit;'>🎙️ {lang['commonvoice_hours']}</a>" if lang["commonvoice_hours"] else "N/A"
182
  row = {
183
  "Language": f"**{lang['language_name']}**",
184
  "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
 
190
  "Best Model BLEU": round(best_score["bleu"], 3)
191
  if best_score["bleu"] is not None
192
  else "N/A",
193
+ "CommonVoice Hours": commonvoice_link,
194
  }
195
  flat_data.append(row)
196
 
 
199
  value=df,
200
  label="Language Results",
201
  show_search="search",
202
+ datatype=["markdown", "number", "number", "number", "markdown", "number", "markdown"],
203
  )
204
 
205
 
evals.py CHANGED
@@ -61,11 +61,15 @@ languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"]
61
  languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
62
 
63
  # load script codes and names
64
- scripts = pd.read_csv("data/ScriptCodes.csv").rename(columns={"Code": "iso15924", "English Name": "script_name"})
 
 
 
65
 
66
  def script_name(iso15924):
67
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
68
 
 
69
  # load benchmark languages and scripts
70
  benchmark_dir = "data/floresp-v2.0-rc.3/dev"
71
  benchmark_languages = pd.DataFrame(
@@ -94,16 +98,20 @@ def get_commonvoice_stats(date: date):
94
 
95
 
96
  commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
97
- columns={"locale": "bcp_47", "validatedHours": "commonvoice_hours"}
98
- )[["bcp_47", "commonvoice_hours"]]
99
  # ignore country (language is language) (in practice this is only relevant to zh-CN/zh-TW/zh-HK)
100
- commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
101
  lambda x: re.sub(r"-[A-Z]{2}$", "", x)
102
  )
103
  commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
104
  lambda x: standardize_tag(x, macro=True)
105
  ) # this does not really seem to get macrolanguages though, e.g. not for Quechua
106
- commonvoice_stats = commonvoice_stats.groupby("bcp_47").sum().reset_index()
 
 
 
 
107
 
108
  # merge data
109
  languages = pd.merge(
@@ -149,6 +157,7 @@ async def complete(**kwargs):
149
  raise Exception(response)
150
  return response
151
 
 
152
  async def translate(model, target_language, sentence):
153
  script = script_name(target_language.iso15924)
154
  reply = await complete(
@@ -170,7 +179,9 @@ def mean(l):
170
 
171
 
172
  def load_sentences(language):
173
- return open(f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}").readlines()
 
 
174
 
175
 
176
  # evaluation!
@@ -196,7 +207,11 @@ async def main():
196
  original_sentences, target_languages.itertuples()
197
  )
198
  ]
199
- predictions = await tqdm_asyncio.gather(*predictions, miniters=1, desc=f"{language.name} {model.split('/')[0]}")
 
 
 
 
200
  target_sentences = [
201
  load_sentences(lang)[i]
202
  for i, lang in enumerate(target_languages.itertuples())
@@ -227,6 +242,7 @@ async def main():
227
  "bleu": mean([s["bleu"] for s in scores]) if scores else None,
228
  # "bert_score": mean([s["bert_score"] for s in scores]),
229
  "commonvoice_hours": language.commonvoice_hours,
 
230
  }
231
  )
232
  with open("results.json", "w") as f:
 
61
  languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
62
 
63
  # load script codes and names
64
+ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
65
+ columns={"Code": "iso15924", "English Name": "script_name"}
66
+ )
67
+
68
 
69
  def script_name(iso15924):
70
  return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
71
 
72
+
73
  # load benchmark languages and scripts
74
  benchmark_dir = "data/floresp-v2.0-rc.3/dev"
75
  benchmark_languages = pd.DataFrame(
 
98
 
99
 
100
  commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
101
+ columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
102
+ )[["commonvoice_locale", "commonvoice_hours"]]
103
  # ignore country (language is language) (in practice this is only relevant to zh-CN/zh-TW/zh-HK)
104
+ commonvoice_stats["bcp_47"] = commonvoice_stats["commonvoice_locale"].apply(
105
  lambda x: re.sub(r"-[A-Z]{2}$", "", x)
106
  )
107
  commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
108
  lambda x: standardize_tag(x, macro=True)
109
  ) # this does not really seem to get macrolanguages though, e.g. not for Quechua
110
+ commonvoice_stats = (
111
+ commonvoice_stats.groupby("bcp_47")
112
+ .agg({"commonvoice_hours": "sum", "commonvoice_locale": "first"})
113
+ .reset_index()
114
+ )
115
 
116
  # merge data
117
  languages = pd.merge(
 
157
  raise Exception(response)
158
  return response
159
 
160
+
161
  async def translate(model, target_language, sentence):
162
  script = script_name(target_language.iso15924)
163
  reply = await complete(
 
179
 
180
 
181
  def load_sentences(language):
182
+ return open(
183
+ f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}"
184
+ ).readlines()
185
 
186
 
187
  # evaluation!
 
207
  original_sentences, target_languages.itertuples()
208
  )
209
  ]
210
+ predictions = await tqdm_asyncio.gather(
211
+ *predictions,
212
+ miniters=1,
213
+ desc=f"{language.name} {model.split('/')[0]}",
214
+ )
215
  target_sentences = [
216
  load_sentences(lang)[i]
217
  for i, lang in enumerate(target_languages.itertuples())
 
242
  "bleu": mean([s["bleu"] for s in scores]) if scores else None,
243
  # "bert_score": mean([s["bert_score"] for s in scores]),
244
  "commonvoice_hours": language.commonvoice_hours,
245
+ "commonvoice_locale": language.commonvoice_locale,
246
  }
247
  )
248
  with open("results.json", "w") as f:
results.json CHANGED
@@ -10,7 +10,8 @@
10
  }
11
  ],
12
  "bleu": 0.4931825583688982,
13
- "commonvoice_hours": 2649.0
 
14
  },
15
  {
16
  "language_name": "Chinese",
@@ -43,7 +44,8 @@
43
  }
44
  ],
45
  "bleu": 0.4356399559223496,
46
- "commonvoice_hours": 422.0
 
47
  },
48
  {
49
  "language_name": "Hindi",
@@ -56,7 +58,8 @@
56
  }
57
  ],
58
  "bleu": 0.42910938007537924,
59
- "commonvoice_hours": 16.0
 
60
  },
61
  {
62
  "language_name": "Spanish",
@@ -69,7 +72,8 @@
69
  }
70
  ],
71
  "bleu": 0.3335615012680206,
72
- "commonvoice_hours": 446.0
 
73
  },
74
  {
75
  "language_name": "Arabic",
@@ -82,7 +86,8 @@
82
  }
83
  ],
84
  "bleu": 0.19072998559991275,
85
- "commonvoice_hours": 91.0
 
86
  },
87
  {
88
  "language_name": "Urdu",
@@ -115,7 +120,8 @@
115
  }
116
  ],
117
  "bleu": 0.32276445473356513,
118
- "commonvoice_hours": 76.0
 
119
  },
120
  {
121
  "language_name": "French",
@@ -128,7 +134,8 @@
128
  }
129
  ],
130
  "bleu": 0.40595466651226686,
131
- "commonvoice_hours": 1051.0
 
132
  },
133
  {
134
  "language_name": "Bangla",
@@ -141,7 +148,8 @@
141
  }
142
  ],
143
  "bleu": 0.30570858536443696,
144
- "commonvoice_hours": 49.0
 
145
  },
146
  {
147
  "language_name": "Portuguese",
@@ -174,7 +182,8 @@
174
  }
175
  ],
176
  "bleu": 0.3778453994295843,
177
- "commonvoice_hours": 176.0
 
178
  },
179
  {
180
  "language_name": "Punjabi",
@@ -187,6 +196,7 @@
187
  }
188
  ],
189
  "bleu": 0.34311946995454473,
190
- "commonvoice_hours": 2.3
 
191
  }
192
  ]
 
10
  }
11
  ],
12
  "bleu": 0.4931825583688982,
13
+ "commonvoice_hours": 2649.0,
14
+ "commonvoice_locale": "en"
15
  },
16
  {
17
  "language_name": "Chinese",
 
44
  }
45
  ],
46
  "bleu": 0.4356399559223496,
47
+ "commonvoice_hours": 422.0,
48
+ "commonvoice_locale": "zh-TW"
49
  },
50
  {
51
  "language_name": "Hindi",
 
58
  }
59
  ],
60
  "bleu": 0.42910938007537924,
61
+ "commonvoice_hours": 16.0,
62
+ "commonvoice_locale": "hi-IN"
63
  },
64
  {
65
  "language_name": "Spanish",
 
72
  }
73
  ],
74
  "bleu": 0.3335615012680206,
75
+ "commonvoice_hours": 446.0,
76
+ "commonvoice_locale": "es"
77
  },
78
  {
79
  "language_name": "Arabic",
 
86
  }
87
  ],
88
  "bleu": 0.19072998559991275,
89
+ "commonvoice_hours": 91.0,
90
+ "commonvoice_locale": "ar"
91
  },
92
  {
93
  "language_name": "Urdu",
 
120
  }
121
  ],
122
  "bleu": 0.32276445473356513,
123
+ "commonvoice_hours": 76.0,
124
+ "commonvoice_locale": "ur"
125
  },
126
  {
127
  "language_name": "French",
 
134
  }
135
  ],
136
  "bleu": 0.40595466651226686,
137
+ "commonvoice_hours": 1051.0,
138
+ "commonvoice_locale": "fr"
139
  },
140
  {
141
  "language_name": "Bangla",
 
148
  }
149
  ],
150
  "bleu": 0.30570858536443696,
151
+ "commonvoice_hours": 49.0,
152
+ "commonvoice_locale": "bn"
153
  },
154
  {
155
  "language_name": "Portuguese",
 
182
  }
183
  ],
184
  "bleu": 0.3778453994295843,
185
+ "commonvoice_hours": 176.0,
186
+ "commonvoice_locale": "pt"
187
  },
188
  {
189
  "language_name": "Punjabi",
 
196
  }
197
  ],
198
  "bleu": 0.34311946995454473,
199
+ "commonvoice_hours": 2.3,
200
+ "commonvoice_locale": "pa-IN"
201
  }
202
  ]