David Pomerenke committed · Commit d5fc8b3 · 1 Parent(s): 8beab26

Use langcodes for language matching

Files changed (4):
  1. evals.py +79 -80
  2. pyproject.toml +1 -0
  3. results.json +71 -461
  4. uv.lock +81 -0
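In short: instead of joining Ethnologue and Wikidata tables on ISO 639-3 codes, the script now normalizes every language to a bare BCP-47 tag with langcodes and pulls display names and speaker populations from language_data. A minimal sketch of that matching step (illustrative only, not part of the diff below; the expected outputs in the comments are assumptions about the pinned langcodes 3.5.0 / language-data 1.3.0 behavior):

import re

from langcodes import Language, standardize_tag
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION


def flores_to_bcp47(flores_code: str) -> str:
    """Map a Flores+ file code like 'arb_Arab' to a bare BCP-47 language tag."""
    iso639_3, iso15924 = flores_code.split("_", 1)
    tag = standardize_tag(f"{iso639_3}-{iso15924}", macro=True)  # e.g. 'arb-Arab' should become 'ar'
    return re.sub(r"-[A-Z][a-z]+$", "", tag)  # drop a script subtag if one remains


tag = flores_to_bcp47("arb_Arab")
print(tag)                                    # expected: 'ar'
print(Language.get(tag).display_name())       # expected: 'Arabic'
print(LANGUAGE_SPEAKING_POPULATION.get(tag))  # CLDR-based speaker estimate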
evals.py CHANGED
@@ -1,6 +1,7 @@
 import asyncio
 import json
 import os
+import re
 from os import getenv
 
 import evaluate
@@ -14,17 +15,19 @@ from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer
 from datetime import date
 from requests import get
+from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
+from langcodes import standardize_tag, Language
 
 # config
 models = [
-    "openai/gpt-4o-mini", # 0.6$/M tokens
+    "openai/gpt-4o-mini", # 0.6$/M tokens
     # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
-    "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
-    "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
-    "google/gemini-2.0-flash-001", # 0.4$/M tokens
+    "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
+    "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
+    "google/gemini-2.0-flash-001", # 0.4$/M tokens
     # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
-    "deepseek/deepseek-chat", # 0.9$/M tokens
-    "microsoft/phi-4", # 0.07$/M tokens
+    "deepseek/deepseek-chat", # 0.9$/M tokens
+    "microsoft/phi-4", # 0.07$/M tokens
 ]
 fast_model = "meta-llama/llama-3.3-70b-instruct"
 n_sentences = 30
@@ -47,73 +50,79 @@ def reorder(language_name):
         return language_name.split(",")[1] + " " + language_name.split(",")[0]
     return language_name
 
+
+# load general language data
+languages = {
+    lang: pop
+    for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
+    if not re.match(r".*-[A-Z]{2}$", lang)
+}
+languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
+languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
+
+# load script codes and names
+scripts = pd.read_csv("data/ScriptCodes.csv").rename(columns={"Code": "iso15924", "English Name": "script_name"})
+
+def script_name(iso15924):
+    return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
+
 # load benchmark languages and scripts
 benchmark_dir = "data/floresp-v2.0-rc.3/dev"
 benchmark_languages = pd.DataFrame(
     [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
-    columns=["language_code", "script_code"],
+    columns=["iso639_3", "iso15924"],
 )
-# hack: drop additional script codes for languages with multiple scripts
-benchmark_languages = benchmark_languages.groupby("language_code").head(1)
-benchmark_languages["in_benchmark"] = True
-
-# load Ethnologue language names
-language_names = (
-    pd.read_csv("data/LanguageCodes.tab", sep="\t")
-    .rename(columns={"LangID": "language_code", "Name": "language_name"})[
-        ["language_code", "language_name"]
-    ]
-    .assign(language_name=lambda df: df["language_name"].apply(reorder).str.strip())
+benchmark_languages["bcp_47"] = benchmark_languages.apply(
+    lambda row: standardize_tag(row["iso639_3"] + "-" + row["iso15924"], macro=True),
+    axis=1,
 )
-
-# load Wikidata speaker stats
-language_stats = (
-    pd.read_csv("data/languages.tsv", sep="\t")
-    .rename(columns={"iso639_3": "language_code", "maxSpeakers": "speakers"})[
-        ["language_code", "speakers", "iso639_1"]
-    ]
-    .dropna(subset=["language_code"])
+# ignore script (language is language)
+benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
+    lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
+)
+benchmark_languages = (
+    benchmark_languages.groupby("bcp_47")
+    .agg({"iso639_3": "first", "iso15924": "first"})
+    .reset_index()
 )
-language_stats["speakers"] = pd.to_numeric(language_stats["speakers"], errors="coerce")
-ignored_languages = [
-    "zho", # Chinese -> use Mandarin (cmn) instead
-    "ara", # Arabic -> use Standard Arabic (arb) instead
-    "pus", # Pashto -> use Nothern / Central / Southern Pashto instead (pbt / pst / pbu)
-    "fas", # Persian -> use Iranian Persian (pes) instead
-    "msa", # Malay -> use Indonesian (ind) instead
-]
-language_stats = language_stats[
-    ~language_stats["language_code"].isin(ignored_languages)
-]
-
-# load unicode script names
-script_names = pd.read_csv("data/ScriptCodes.csv").rename(
-    columns={"Code": "script_code", "English Name": "script_name"}
-)[["script_code", "script_name"]]
 
-# merge data
-languages = pd.merge(language_stats, language_names, on="language_code", how="outer")
-languages = pd.merge(benchmark_languages, languages, on="language_code", how="outer")
-languages = pd.merge(languages, script_names, on="script_code", how="left")
-languages["in_benchmark"] = languages["in_benchmark"].fillna(False)
-languages = languages.sort_values(by="speakers", ascending=False)
-languages = languages.iloc[:30]
 
-# retrieve CommonVoice stats
-@cache # cache for 1 day
+# load CommonVoice stats
+@cache # cache for 1 day
 def get_commonvoice_stats(date: date):
     return get("https://commonvoice.mozilla.org/api/v1/stats/languages").json()
 
-commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today()))
+
+commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
+    columns={"locale": "bcp_47", "validatedHours": "commonvoice_hours"}
+)[["bcp_47", "commonvoice_hours"]]
+# ignore country (language is language) (in practive this is only relevant to zh-CN/zh-TW/zh-HK)
+commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
+    lambda x: re.sub(r"-[A-Z]{2}$", "", x)
+)
+commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
+    lambda x: standardize_tag(x, macro=True)
+) # this does not really seem to get macrolanguages though, e.g. not for Quechua
+commonvoice_stats = commonvoice_stats.groupby("bcp_47").sum().reset_index()
+
+# merge data
+languages = pd.merge(
+    languages, benchmark_languages, on="bcp_47", how="left"
+) # "left" because keep it simple for now
+languages = pd.merge(
+    languages, commonvoice_stats, on="bcp_47", how="left"
+) # "left" because keep it simple for now
+languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
+
+languages = languages.sort_values(by="speakers", ascending=False)
+languages = languages.iloc[:10]
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(
-    n=10, random_state=42
-)
+detailed_languages = languages[languages["in_benchmark"]].sample(n=3, random_state=42)
 
 
 # utils
@@ -140,15 +149,14 @@ async def complete(**kwargs):
         raise Exception(response)
     return response
 
-
-@cache
-async def translate(model, target_language, target_script, sentence):
+async def translate(model, target_language, sentence):
+    script = script_name(target_language.iso15924)
     reply = await complete(
         model=model,
         messages=[
            {
                 "role": "user",
-                "content": f"Translate the following text to the {target_language} language; use the {target_script} script; reply only with the translation:\n\n{sentence}",
+                "content": f"Translate the following text to the {target_language.name} language; use the {script} script; reply only with the translation:\n\n{sentence}",
             }
         ],
         temperature=0,
@@ -162,40 +170,33 @@ def mean(l):
 
 
 def load_sentences(language):
-    return open(
-        f"{benchmark_dir}/dev.{language.language_code}_{language.script_code}"
-    ).readlines()
+    return open(f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}").readlines()
 
 
 # evaluation!
 async def main():
     results = []
     for language in list(languages.itertuples()):
-        name = (
-            language.language_name
-            if not pd.isna(language.language_name)
-            else language.language_code
-        )
-        print(name)
         scores = []
         if language.in_benchmark:
             original_sentences = load_sentences(language)[:n_sentences]
             for model in models:
                 if (
                     model != fast_model
-                    and language.language_code
-                    not in detailed_languages.language_code.values
+                    and language.bcp_47 not in detailed_languages.bcp_47.values
                 ):
                     continue
-
-                print(model)
                 predictions = [
                     translate(
-                        model, language.language_name, language.script_name, sentence
+                        model,
+                        language,
+                        sentence,
+                    )
+                    for sentence, language in zip(
+                        original_sentences, target_languages.itertuples()
                     )
-                    for sentence, language in zip(original_sentences, target_languages.itertuples())
                 ]
-                predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
+                predictions = await tqdm_asyncio.gather(*predictions, miniters=1, desc=f"{language.name} {model.split('/')[0]}")
                 target_sentences = [
                     load_sentences(lang)[i]
                     for i, lang in enumerate(target_languages.itertuples())
@@ -217,17 +218,15 @@ async def main():
                         # "bert_score": mean(metrics_bert["f1"]),
                     }
                 )
-        commonvoice_hours = commonvoice_stats[commonvoice_stats["locale"] == language.iso639_1]["validatedHours"].values
-        commonvoice_hours = commonvoice_hours[0] if commonvoice_hours.size > 0 else "N/A"
         results.append(
             {
-                "language_name": name,
-                "language_code": language.language_code,
+                "language_name": language.name,
+                "bcp_47": language.bcp_47,
                 "speakers": language.speakers if not pd.isna(language.speakers) else 0,
                 "scores": scores,
                 "bleu": mean([s["bleu"] for s in scores]) if scores else None,
                 # "bert_score": mean([s["bert_score"] for s in scores]),
-                "commonvoice_hours": commonvoice_hours,
+                "commonvoice_hours": language.commonvoice_hours,
             }
         )
    with open("results.json", "w") as f:
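A side note on the CommonVoice handling above: locale-specific rows such as zh-CN / zh-TW / zh-HK are collapsed onto one bare tag before the merge, so their validated hours add up. A toy example of that groupby (made-up hour counts, not the real API response):

import re

import pandas as pd
from langcodes import standardize_tag

# Hypothetical stand-in for the CommonVoice API response (locale -> validated hours).
stats = pd.DataFrame(
    {
        "bcp_47": ["zh-CN", "zh-TW", "zh-HK", "pt"],
        "commonvoice_hours": [100.0, 50.0, 20.0, 30.0],
    }
)
stats["bcp_47"] = stats["bcp_47"].apply(lambda x: re.sub(r"-[A-Z]{2}$", "", x))  # drop country
stats["bcp_47"] = stats["bcp_47"].apply(lambda x: standardize_tag(x, macro=True))
print(stats.groupby("bcp_47").sum().reset_index())
# The three zh-* rows collapse into a single 'zh' row with 170.0 hours; 'pt' stays as-is.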
pyproject.toml CHANGED
@@ -16,6 +16,7 @@ dev-dependencies = [
     "bert-score>=0.3.13",
     "evaluate==0.4.0",
     "joblib>=1.4.2",
+    "langcodes>=3.5.0",
     "openai>=1.52.2",
     "protobuf>=5.28.3",
     "python-dotenv>=1.0.1",
results.json CHANGED
@@ -1,582 +1,192 @@
1
  [
2
  {
3
  "language_name": "English",
4
- "language_code": "eng",
5
- "speakers": 1132366680.0,
6
  "scores": [
7
- {
8
- "model": "openai/gpt-4o-mini",
9
- "bleu": 0.47104084248165595
10
- },
11
  {
12
  "model": "meta-llama/llama-3.3-70b-instruct",
13
- "bleu": 0.4207265890491719
14
- },
15
- {
16
- "model": "mistralai/mistral-small-24b-instruct-2501",
17
- "bleu": 0.4642719176436136
18
- },
19
- {
20
- "model": "google/gemini-2.0-flash-001",
21
- "bleu": 0.5237470882988915
22
- },
23
- {
24
- "model": "deepseek/deepseek-chat",
25
- "bleu": 0.516570670982587
26
- },
27
- {
28
- "model": "microsoft/phi-4",
29
- "bleu": 0.44668905281921456
30
  }
31
  ],
32
- "bleu": 0.47384102687918905,
33
  "commonvoice_hours": 2649.0
34
  },
35
  {
36
- "language_name": "Mandarin Chinese",
37
- "language_code": "cmn",
38
- "speakers": 1074000000.0,
39
- "scores": [
40
- {
41
- "model": "meta-llama/llama-3.3-70b-instruct",
42
- "bleu": 0.48254866511762295
43
- }
44
- ],
45
- "bleu": 0.48254866511762295,
46
- "commonvoice_hours": "N/A"
47
- },
48
- {
49
- "language_name": "Spanish",
50
- "language_code": "spa",
51
- "speakers": 485000000.0,
52
- "scores": [
53
- {
54
- "model": "meta-llama/llama-3.3-70b-instruct",
55
- "bleu": 0.31606621368361204
56
- }
57
- ],
58
- "bleu": 0.31606621368361204,
59
- "commonvoice_hours": 446.0
60
- },
61
- {
62
- "language_name": "Hindi",
63
- "language_code": "hin",
64
- "speakers": 341000000.0,
65
- "scores": [
66
- {
67
- "model": "meta-llama/llama-3.3-70b-instruct",
68
- "bleu": 0.3273225856613046
69
- }
70
- ],
71
- "bleu": 0.3273225856613046,
72
- "commonvoice_hours": 16.0
73
- },
74
- {
75
- "language_name": "Bengali",
76
- "language_code": "ben",
77
- "speakers": 300000000.0,
78
- "scores": [
79
- {
80
- "model": "meta-llama/llama-3.3-70b-instruct",
81
- "bleu": 0.23110496173302814
82
- }
83
- ],
84
- "bleu": 0.23110496173302814,
85
- "commonvoice_hours": 49.0
86
- },
87
- {
88
- "language_name": "Portuguese",
89
- "language_code": "por",
90
- "speakers": 254300000.0,
91
- "scores": [
92
- {
93
- "model": "meta-llama/llama-3.3-70b-instruct",
94
- "bleu": 0.35032125995743685
95
- }
96
- ],
97
- "bleu": 0.35032125995743685,
98
- "commonvoice_hours": 176.0
99
- },
100
- {
101
- "language_name": "French",
102
- "language_code": "fra",
103
- "speakers": 208157220.0,
104
- "scores": [
105
- {
106
- "model": "meta-llama/llama-3.3-70b-instruct",
107
- "bleu": 0.31625053573185663
108
- }
109
- ],
110
- "bleu": 0.31625053573185663,
111
- "commonvoice_hours": 1051.0
112
- },
113
- {
114
- "language_name": "Indonesian",
115
- "language_code": "ind",
116
- "speakers": 198996550.0,
117
- "scores": [
118
- {
119
- "model": "meta-llama/llama-3.3-70b-instruct",
120
- "bleu": 0.3112185444311794
121
- }
122
- ],
123
- "bleu": 0.3112185444311794,
124
- "commonvoice_hours": 33.0
125
- },
126
- {
127
- "language_name": "Russian",
128
- "language_code": "rus",
129
- "speakers": 171428900.0,
130
  "scores": [
131
  {
132
  "model": "openai/gpt-4o-mini",
133
- "bleu": 0.32615858913589074
134
  },
135
  {
136
  "model": "meta-llama/llama-3.3-70b-instruct",
137
- "bleu": 0.3244999119385425
138
  },
139
  {
140
  "model": "mistralai/mistral-small-24b-instruct-2501",
141
- "bleu": 0.315801608032821
142
  },
143
  {
144
  "model": "google/gemini-2.0-flash-001",
145
- "bleu": 0.3683733679689521
146
  },
147
  {
148
  "model": "deepseek/deepseek-chat",
149
- "bleu": 0.35988734604889566
150
  },
151
  {
152
  "model": "microsoft/phi-4",
153
- "bleu": 0.31289371159965956
154
  }
155
  ],
156
- "bleu": 0.3346024224541269,
157
- "commonvoice_hours": 241.0
158
  },
159
  {
160
- "language_name": "Japanese",
161
- "language_code": "jpn",
162
- "speakers": 128000000.0,
163
  "scores": [
164
- {
165
- "model": "openai/gpt-4o-mini",
166
- "bleu": 0.28991739992953497
167
- },
168
  {
169
  "model": "meta-llama/llama-3.3-70b-instruct",
170
- "bleu": 0.2675679907215641
171
- },
172
- {
173
- "model": "mistralai/mistral-small-24b-instruct-2501",
174
- "bleu": 0.21348802780641032
175
- },
176
- {
177
- "model": "google/gemini-2.0-flash-001",
178
- "bleu": 0.3345265427223546
179
- },
180
- {
181
- "model": "deepseek/deepseek-chat",
182
- "bleu": 0.3101203037558905
183
- },
184
- {
185
- "model": "microsoft/phi-4",
186
- "bleu": 0.2585222780278109
187
  }
188
  ],
189
- "bleu": 0.2790237571605942,
190
- "commonvoice_hours": 222.0
191
  },
192
  {
193
- "language_name": "Eastern Punjabi",
194
- "language_code": "pan",
195
- "speakers": 125000000.0,
196
  "scores": [
197
  {
198
  "model": "meta-llama/llama-3.3-70b-instruct",
199
- "bleu": 0.27325501919134315
200
  }
201
  ],
202
- "bleu": 0.27325501919134315,
203
- "commonvoice_hours": "N/A"
204
  },
205
  {
206
- "language_name": "Standard German",
207
- "language_code": "deu",
208
- "speakers": 105000000.0,
209
  "scores": [
210
- {
211
- "model": "openai/gpt-4o-mini",
212
- "bleu": 0.39019323183176663
213
- },
214
  {
215
  "model": "meta-llama/llama-3.3-70b-instruct",
216
- "bleu": 0.37266353070949576
217
- },
218
- {
219
- "model": "mistralai/mistral-small-24b-instruct-2501",
220
- "bleu": 0.3647632576435612
221
- },
222
- {
223
- "model": "google/gemini-2.0-flash-001",
224
- "bleu": 0.4466723425292597
225
- },
226
- {
227
- "model": "deepseek/deepseek-chat",
228
- "bleu": 0.4045496243095387
229
- },
230
- {
231
- "model": "microsoft/phi-4",
232
- "bleu": 0.36047992103881465
233
  }
234
  ],
235
- "bleu": 0.3898869846770727,
236
- "commonvoice_hours": 1357.0
237
- },
238
- {
239
- "language_name": "Egyptian Arabic",
240
- "language_code": "arz",
241
- "speakers": 100542400.0,
242
- "scores": [
243
- {
244
- "model": "openai/gpt-4o-mini",
245
- "bleu": 0.2339779422333898
246
- },
247
- {
248
- "model": "meta-llama/llama-3.3-70b-instruct",
249
- "bleu": 0.20475486619797384
250
- },
251
- {
252
- "model": "mistralai/mistral-small-24b-instruct-2501",
253
- "bleu": 0.20783660453505234
254
- },
255
- {
256
- "model": "google/gemini-2.0-flash-001",
257
- "bleu": 0.2840808045687292
258
- },
259
- {
260
- "model": "deepseek/deepseek-chat",
261
- "bleu": 0.2786287793608212
262
- },
263
- {
264
- "model": "microsoft/phi-4",
265
- "bleu": 0.19969813973959594
266
- }
267
- ],
268
- "bleu": 0.23482952277259375,
269
- "commonvoice_hours": "N/A"
270
  },
271
  {
272
  "language_name": "Urdu",
273
- "language_code": "urd",
274
- "speakers": 94022900.0,
275
  "scores": [
276
  {
277
  "model": "openai/gpt-4o-mini",
278
- "bleu": 0.297325653414119
279
  },
280
  {
281
  "model": "meta-llama/llama-3.3-70b-instruct",
282
- "bleu": 0.24593966310665433
283
  },
284
  {
285
  "model": "mistralai/mistral-small-24b-instruct-2501",
286
- "bleu": 0.21988755291389567
287
  },
288
  {
289
  "model": "google/gemini-2.0-flash-001",
290
- "bleu": 0.31796430998058983
291
  },
292
  {
293
  "model": "deepseek/deepseek-chat",
294
- "bleu": 0.3043614136242901
295
  },
296
  {
297
  "model": "microsoft/phi-4",
298
- "bleu": 0.2285337340113323
299
  }
300
  ],
301
- "bleu": 0.2690020545084802,
302
  "commonvoice_hours": 76.0
303
  },
304
  {
305
- "language_name": "Filipino",
306
- "language_code": "fil",
307
- "speakers": 90000000.0,
308
- "scores": [
309
- {
310
- "model": "meta-llama/llama-3.3-70b-instruct",
311
- "bleu": 0.33268969497468076
312
- }
313
- ],
314
- "bleu": 0.33268969497468076,
315
- "commonvoice_hours": "N/A"
316
- },
317
- {
318
- "language_name": "Javanese",
319
- "language_code": "jav",
320
- "speakers": 84308740.0,
321
- "scores": [
322
- {
323
- "model": "meta-llama/llama-3.3-70b-instruct",
324
- "bleu": 0.2528746866064681
325
- }
326
- ],
327
- "bleu": 0.2528746866064681,
328
- "commonvoice_hours": 0.0
329
- },
330
- {
331
- "language_name": "Marathi",
332
- "language_code": "mar",
333
- "speakers": 83100000.0,
334
- "scores": [
335
- {
336
- "model": "meta-llama/llama-3.3-70b-instruct",
337
- "bleu": 0.24876051941895777
338
- }
339
- ],
340
- "bleu": 0.24876051941895777,
341
- "commonvoice_hours": 20.0
342
- },
343
- {
344
- "language_name": "Swahili",
345
- "language_code": "swh",
346
- "speakers": 82300000.0,
347
- "scores": [
348
- {
349
- "model": "openai/gpt-4o-mini",
350
- "bleu": 0.34863560100932933
351
- },
352
- {
353
- "model": "meta-llama/llama-3.3-70b-instruct",
354
- "bleu": 0.30524292832054034
355
- },
356
- {
357
- "model": "mistralai/mistral-small-24b-instruct-2501",
358
- "bleu": 0.23580256234118713
359
- },
360
- {
361
- "model": "google/gemini-2.0-flash-001",
362
- "bleu": 0.3871437234807849
363
- },
364
- {
365
- "model": "deepseek/deepseek-chat",
366
- "bleu": 0.3476225063617937
367
- },
368
- {
369
- "model": "microsoft/phi-4",
370
- "bleu": 0.21803176063271826
371
- }
372
- ],
373
- "bleu": 0.3070798470243923,
374
- "commonvoice_hours": "N/A"
375
- },
376
- {
377
- "language_name": "Turkish",
378
- "language_code": "tur",
379
- "speakers": 82231620.0,
380
- "scores": [
381
- {
382
- "model": "meta-llama/llama-3.3-70b-instruct",
383
- "bleu": 0.29874140544434125
384
- }
385
- ],
386
- "bleu": 0.29874140544434125,
387
- "commonvoice_hours": 127.0
388
- },
389
- {
390
- "language_name": "Telugu",
391
- "language_code": "tel",
392
- "speakers": 82000000.0,
393
- "scores": [
394
- {
395
- "model": "meta-llama/llama-3.3-70b-instruct",
396
- "bleu": 0.28869836899054496
397
- }
398
- ],
399
- "bleu": 0.28869836899054496,
400
- "commonvoice_hours": 0.3
401
- },
402
- {
403
- "language_name": "Wu Chinese",
404
- "language_code": "wuu",
405
- "speakers": 81400000.0,
406
- "scores": [],
407
- "bleu": null,
408
- "commonvoice_hours": "N/A"
409
- },
410
- {
411
- "language_name": "Korean",
412
- "language_code": "kor",
413
- "speakers": 77300000.0,
414
- "scores": [
415
- {
416
- "model": "meta-llama/llama-3.3-70b-instruct",
417
- "bleu": 0.2566453806044083
418
- }
419
- ],
420
- "bleu": 0.2566453806044083,
421
- "commonvoice_hours": 1.7
422
- },
423
- {
424
- "language_name": "Vietnamese",
425
- "language_code": "vie",
426
- "speakers": 76000000.0,
427
- "scores": [
428
- {
429
- "model": "openai/gpt-4o-mini",
430
- "bleu": 0.3104431723374164
431
- },
432
- {
433
- "model": "meta-llama/llama-3.3-70b-instruct",
434
- "bleu": 0.3098478561790782
435
- },
436
- {
437
- "model": "mistralai/mistral-small-24b-instruct-2501",
438
- "bleu": 0.28074941515909896
439
- },
440
- {
441
- "model": "google/gemini-2.0-flash-001",
442
- "bleu": 0.37327273228460267
443
- },
444
- {
445
- "model": "deepseek/deepseek-chat",
446
- "bleu": 0.3487726531917602
447
- },
448
- {
449
- "model": "microsoft/phi-4",
450
- "bleu": 0.18355331419148843
451
- }
452
- ],
453
- "bleu": 0.3011065238905742,
454
- "commonvoice_hours": 5.9
455
- },
456
- {
457
- "language_name": "Tamil",
458
- "language_code": "tam",
459
- "speakers": 75000000.0,
460
- "scores": [
461
- {
462
- "model": "openai/gpt-4o-mini",
463
- "bleu": 0.24593649157372188
464
- },
465
- {
466
- "model": "meta-llama/llama-3.3-70b-instruct",
467
- "bleu": 0.24009996232522382
468
- },
469
- {
470
- "model": "mistralai/mistral-small-24b-instruct-2501",
471
- "bleu": 0.16785828803139252
472
- },
473
- {
474
- "model": "google/gemini-2.0-flash-001",
475
- "bleu": 0.3411457686951495
476
- },
477
- {
478
- "model": "deepseek/deepseek-chat",
479
- "bleu": 0.2875340171253509
480
- },
481
- {
482
- "model": "microsoft/phi-4",
483
- "bleu": 0.12646276530642359
484
- }
485
- ],
486
- "bleu": 0.23483954884287706,
487
- "commonvoice_hours": 234.0
488
- },
489
- {
490
- "language_name": "Yue Chinese",
491
- "language_code": "yue",
492
- "speakers": 73100000.0,
493
  "scores": [
494
  {
495
  "model": "meta-llama/llama-3.3-70b-instruct",
496
- "bleu": 0.2663995648378034
497
  }
498
  ],
499
- "bleu": 0.2663995648378034,
500
- "commonvoice_hours": "N/A"
501
  },
502
  {
503
- "language_name": "Italian",
504
- "language_code": "ita",
505
- "speakers": 64819790.0,
506
  "scores": [
507
  {
508
  "model": "meta-llama/llama-3.3-70b-instruct",
509
- "bleu": 0.3190660116366235
510
  }
511
  ],
512
- "bleu": 0.3190660116366235,
513
- "commonvoice_hours": 362.0
514
  },
515
  {
516
- "language_name": "Gujarati",
517
- "language_code": "guj",
518
- "speakers": 56400000.0,
519
  "scores": [
520
  {
521
  "model": "openai/gpt-4o-mini",
522
- "bleu": 0.25754571533357745
523
  },
524
  {
525
  "model": "meta-llama/llama-3.3-70b-instruct",
526
- "bleu": 0.24145756515188838
527
  },
528
  {
529
  "model": "mistralai/mistral-small-24b-instruct-2501",
530
- "bleu": 0.20092063514315023
531
  },
532
  {
533
  "model": "google/gemini-2.0-flash-001",
534
- "bleu": 0.3664134239402827
535
  },
536
  {
537
  "model": "deepseek/deepseek-chat",
538
- "bleu": 0.2908883229704476
539
  },
540
  {
541
  "model": "microsoft/phi-4",
542
- "bleu": 0.19669824113063106
543
  }
544
  ],
545
- "bleu": 0.2589873172783296,
546
- "commonvoice_hours": "N/A"
547
- },
548
- {
549
- "language_name": "Iranian Persian",
550
- "language_code": "pes",
551
- "speakers": 52800000.0,
552
- "scores": [
553
- {
554
- "model": "meta-llama/llama-3.3-70b-instruct",
555
- "bleu": 0.28359916806993934
556
- }
557
- ],
558
- "bleu": 0.28359916806993934,
559
- "commonvoice_hours": "N/A"
560
  },
561
  {
562
- "language_name": "Bhojpuri",
563
- "language_code": "bho",
564
- "speakers": 52200000.0,
565
  "scores": [
566
  {
567
  "model": "meta-llama/llama-3.3-70b-instruct",
568
- "bleu": 0.24311504988281543
569
  }
570
  ],
571
- "bleu": 0.24311504988281543,
572
- "commonvoice_hours": "N/A"
573
- },
574
- {
575
- "language_name": "Hakka Chinese",
576
- "language_code": "hak",
577
- "speakers": 48200000.0,
578
- "scores": [],
579
- "bleu": null,
580
- "commonvoice_hours": "N/A"
581
  }
582
  ]
 
1
  [
2
  {
3
  "language_name": "English",
4
+ "bcp_47": "en",
5
+ "speakers": 1636485840,
6
  "scores": [
7
  {
8
  "model": "meta-llama/llama-3.3-70b-instruct",
9
+ "bleu": 0.4931825583688982
10
  }
11
  ],
12
+ "bleu": 0.4931825583688982,
13
  "commonvoice_hours": 2649.0
14
  },
15
  {
16
+ "language_name": "Chinese",
17
+ "bcp_47": "zh",
18
+ "speakers": 1304678914,
19
  "scores": [
20
  {
21
  "model": "openai/gpt-4o-mini",
22
+ "bleu": 0.4807599914028467
23
  },
24
  {
25
  "model": "meta-llama/llama-3.3-70b-instruct",
26
+ "bleu": 0.48224897154012053
27
  },
28
  {
29
  "model": "mistralai/mistral-small-24b-instruct-2501",
30
+ "bleu": 0.2688927547323512
31
  },
32
  {
33
  "model": "google/gemini-2.0-flash-001",
34
+ "bleu": 0.4876059353172742
35
  },
36
  {
37
  "model": "deepseek/deepseek-chat",
38
+ "bleu": 0.46126489333496423
39
  },
40
  {
41
  "model": "microsoft/phi-4",
42
+ "bleu": 0.43306718920654086
43
  }
44
  ],
45
+ "bleu": 0.4356399559223496,
46
+ "commonvoice_hours": 422.0
47
  },
48
  {
49
+ "language_name": "Hindi",
50
+ "bcp_47": "hi",
51
+ "speakers": 546882144,
52
  "scores": [
53
  {
54
  "model": "meta-llama/llama-3.3-70b-instruct",
55
+ "bleu": 0.42910938007537924
56
  }
57
  ],
58
+ "bleu": 0.42910938007537924,
59
+ "commonvoice_hours": 16.0
60
  },
61
  {
62
+ "language_name": "Spanish",
63
+ "bcp_47": "es",
64
+ "speakers": 493528077,
65
  "scores": [
66
  {
67
  "model": "meta-llama/llama-3.3-70b-instruct",
68
+ "bleu": 0.3335615012680206
69
  }
70
  ],
71
+ "bleu": 0.3335615012680206,
72
+ "commonvoice_hours": 446.0
73
  },
74
  {
75
+ "language_name": "Arabic",
76
+ "bcp_47": "ar",
77
+ "speakers": 351664197,
78
  "scores": [
79
  {
80
  "model": "meta-llama/llama-3.3-70b-instruct",
81
+ "bleu": 0.19072998559991275
82
  }
83
  ],
84
+ "bleu": 0.19072998559991275,
85
+ "commonvoice_hours": 91.0
86
  },
87
  {
88
  "language_name": "Urdu",
89
+ "bcp_47": "ur",
90
+ "speakers": 290790290,
91
  "scores": [
92
  {
93
  "model": "openai/gpt-4o-mini",
94
+ "bleu": 0.3223557428811336
95
  },
96
  {
97
  "model": "meta-llama/llama-3.3-70b-instruct",
98
+ "bleu": 0.3361392064611452
99
  },
100
  {
101
  "model": "mistralai/mistral-small-24b-instruct-2501",
102
+ "bleu": 0.30361668093990973
103
  },
104
  {
105
  "model": "google/gemini-2.0-flash-001",
106
+ "bleu": 0.38811035932918286
107
  },
108
  {
109
  "model": "deepseek/deepseek-chat",
110
+ "bleu": 0.33221997814253806
111
  },
112
  {
113
  "model": "microsoft/phi-4",
114
+ "bleu": 0.2541447606474814
115
  }
116
  ],
117
+ "bleu": 0.32276445473356513,
118
  "commonvoice_hours": 76.0
119
  },
120
  {
121
+ "language_name": "French",
122
+ "bcp_47": "fr",
123
+ "speakers": 278611507,
124
  "scores": [
125
  {
126
  "model": "meta-llama/llama-3.3-70b-instruct",
127
+ "bleu": 0.40595466651226686
128
  }
129
  ],
130
+ "bleu": 0.40595466651226686,
131
+ "commonvoice_hours": 1051.0
132
  },
133
  {
134
+ "language_name": "Bangla",
135
+ "bcp_47": "bn",
136
+ "speakers": 267193288,
137
  "scores": [
138
  {
139
  "model": "meta-llama/llama-3.3-70b-instruct",
140
+ "bleu": 0.30570858536443696
141
  }
142
  ],
143
+ "bleu": 0.30570858536443696,
144
+ "commonvoice_hours": 49.0
145
  },
146
  {
147
+ "language_name": "Portuguese",
148
+ "bcp_47": "pt",
149
+ "speakers": 237496885,
150
  "scores": [
151
  {
152
  "model": "openai/gpt-4o-mini",
153
+ "bleu": 0.4122096638493346
154
  },
155
  {
156
  "model": "meta-llama/llama-3.3-70b-instruct",
157
+ "bleu": 0.39250552075952033
158
  },
159
  {
160
  "model": "mistralai/mistral-small-24b-instruct-2501",
161
+ "bleu": 0.22643923104785263
162
  },
163
  {
164
  "model": "google/gemini-2.0-flash-001",
165
+ "bleu": 0.42197093736929103
166
  },
167
  {
168
  "model": "deepseek/deepseek-chat",
169
+ "bleu": 0.42783260235353093
170
  },
171
  {
172
  "model": "microsoft/phi-4",
173
+ "bleu": 0.38611444119797594
174
  }
175
  ],
176
+ "bleu": 0.3778453994295843,
177
+ "commonvoice_hours": 176.0
178
  },
179
  {
180
+ "language_name": "Punjabi",
181
+ "bcp_47": "pa",
182
+ "speakers": 203571210,
183
  "scores": [
184
  {
185
  "model": "meta-llama/llama-3.3-70b-instruct",
186
+ "bleu": 0.34311946995454473
187
  }
188
  ],
189
+ "bleu": 0.34311946995454473,
190
+ "commonvoice_hours": 2.3
191
  }
192
  ]
uv.lock CHANGED
@@ -898,6 +898,30 @@ wheels = [
898
  { url = "https://files.pythonhosted.org/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715 },
899
  ]
900
 
901
  [[package]]
902
  name = "languagebench"
903
  version = "0.1.0"
@@ -914,6 +938,7 @@ dev = [
914
  { name = "bert-score" },
915
  { name = "evaluate" },
916
  { name = "joblib" },
917
  { name = "openai" },
918
  { name = "protobuf" },
919
  { name = "python-dotenv" },
@@ -937,6 +962,7 @@ dev = [
937
  { name = "bert-score", specifier = ">=0.3.13" },
938
  { name = "evaluate", specifier = "==0.4.0" },
939
  { name = "joblib", specifier = ">=1.4.2" },
940
  { name = "openai", specifier = ">=1.52.2" },
941
  { name = "protobuf", specifier = ">=5.28.3" },
942
  { name = "python-dotenv", specifier = ">=1.0.1" },
@@ -1029,6 +1055,61 @@ wheels = [
1029
  { url = "https://files.pythonhosted.org/packages/ba/b2/6a22fb5c0885da3b00e116aee81f0b829ec9ac8f736cd414b4a09413fc7d/lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba", size = 3487557 },
1030
  ]
1031
 
1032
  [[package]]
1033
  name = "markdown-it-py"
1034
  version = "3.0.0"
 
898
  { url = "https://files.pythonhosted.org/packages/ea/8b/d7497df4a1cae9367adf21665dd1f896c2a7aeb8769ad77b662c5e2bcce7/kiwisolver-1.4.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:eb542fe7933aa09d8d8f9d9097ef37532a7df6497819d16efe4359890a2f417a", size = 55715 },
899
  ]
900
 
901
+ [[package]]
902
+ name = "langcodes"
903
+ version = "3.5.0"
904
+ source = { registry = "https://pypi.org/simple" }
905
+ dependencies = [
906
+ { name = "language-data" },
907
+ ]
908
+ sdist = { url = "https://files.pythonhosted.org/packages/3a/7a/5a97e327063409a5caa21541e6d08ae4a0f2da328447e9f2c7b39e179226/langcodes-3.5.0.tar.gz", hash = "sha256:1eef8168d07e51e131a2497ffecad4b663f6208e7c3ae3b8dc15c51734a6f801", size = 191030 }
909
+ wheels = [
910
+ { url = "https://files.pythonhosted.org/packages/c3/6b/068c2ea7a712bf805c62445bd9e9c06d7340358ef2824150eceac027444b/langcodes-3.5.0-py3-none-any.whl", hash = "sha256:853c69d1a35e0e13da2f427bb68fb2fa4a8f4fb899e0c62ad8df8d073dcfed33", size = 182974 },
911
+ ]
912
+
913
+ [[package]]
914
+ name = "language-data"
915
+ version = "1.3.0"
916
+ source = { registry = "https://pypi.org/simple" }
917
+ dependencies = [
918
+ { name = "marisa-trie" },
919
+ ]
920
+ sdist = { url = "https://files.pythonhosted.org/packages/dd/ce/3f144716a9f2cbf42aa86ebc8b085a184be25c80aa453eea17c294d239c1/language_data-1.3.0.tar.gz", hash = "sha256:7600ef8aa39555145d06c89f0c324bf7dab834ea0b0a439d8243762e3ebad7ec", size = 5129310 }
921
+ wheels = [
922
+ { url = "https://files.pythonhosted.org/packages/5d/e9/5a5ffd9b286db82be70d677d0a91e4d58f7912bb8dd026ddeeb4abe70679/language_data-1.3.0-py3-none-any.whl", hash = "sha256:e2ee943551b5ae5f89cd0e801d1fc3835bb0ef5b7e9c3a4e8e17b2b214548fbf", size = 5385760 },
923
+ ]
924
+
925
  [[package]]
926
  name = "languagebench"
927
  version = "0.1.0"
 
938
  { name = "bert-score" },
939
  { name = "evaluate" },
940
  { name = "joblib" },
941
+ { name = "langcodes" },
942
  { name = "openai" },
943
  { name = "protobuf" },
944
  { name = "python-dotenv" },
 
962
  { name = "bert-score", specifier = ">=0.3.13" },
963
  { name = "evaluate", specifier = "==0.4.0" },
964
  { name = "joblib", specifier = ">=1.4.2" },
965
+ { name = "langcodes", specifier = ">=3.5.0" },
966
  { name = "openai", specifier = ">=1.52.2" },
967
  { name = "protobuf", specifier = ">=5.28.3" },
968
  { name = "python-dotenv", specifier = ">=1.0.1" },
 
1055
  { url = "https://files.pythonhosted.org/packages/ba/b2/6a22fb5c0885da3b00e116aee81f0b829ec9ac8f736cd414b4a09413fc7d/lxml-5.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:6e91cf736959057f7aac7adfc83481e03615a8e8dd5758aa1d95ea69e8931dba", size = 3487557 },
1056
  ]
1057
 
1058
+ [[package]]
1059
+ name = "marisa-trie"
1060
+ version = "1.2.1"
1061
+ source = { registry = "https://pypi.org/simple" }
1062
+ dependencies = [
1063
+ { name = "setuptools" },
1064
+ ]
1065
+ sdist = { url = "https://files.pythonhosted.org/packages/31/15/9d9743897e4450b2de199ee673b50cb018980c4ced477d41cf91304a85e3/marisa_trie-1.2.1.tar.gz", hash = "sha256:3a27c408e2aefc03e0f1d25b2ff2afb85aac3568f6fa2ae2a53b57a2e87ce29d", size = 416124 }
1066
+ wheels = [
1067
+ { url = "https://files.pythonhosted.org/packages/e4/83/ccf5b33f2123f3110705c608f8e0caa82002626511aafafc58f82e50d322/marisa_trie-1.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a2eb41d2f9114d8b7bd66772c237111e00d2bae2260824560eaa0a1e291ce9e8", size = 362200 },
1068
+ { url = "https://files.pythonhosted.org/packages/9d/74/f7ce1fc2ee480c7f8ceadd9b992caceaba442a97e5e99d6aea00d3635a0b/marisa_trie-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e956e6a46f604b17d570901e66f5214fb6f658c21e5e7665deace236793cef6", size = 192309 },
1069
+ { url = "https://files.pythonhosted.org/packages/e4/52/5dbbc13e57ce54c2ef0d04962d7d8f66edc69ed34310c734a2913199a581/marisa_trie-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:bd45142501300e7538b2e544905580918b67b1c82abed1275fe4c682c95635fa", size = 174713 },
1070
+ { url = "https://files.pythonhosted.org/packages/57/49/2580372f3f980aea95c23d05b2c1d3bbb9ee1ab8cfd441545153e44f1be7/marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a8443d116c612cfd1961fbf76769faf0561a46d8e317315dd13f9d9639ad500c", size = 1314808 },
1071
+ { url = "https://files.pythonhosted.org/packages/5a/ba/e12a4d450f265414cc68df6a116a78beece72b95f774f04d29cd48e08d19/marisa_trie-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:875a6248e60fbb48d947b574ffa4170f34981f9e579bde960d0f9a49ea393ecc", size = 1346678 },
1072
+ { url = "https://files.pythonhosted.org/packages/b2/81/8e130cb1eea741fd17694d821096f7ec9841f0e3d3c69b740257f5eeafa8/marisa_trie-1.2.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:746a7c60a17fccd3cfcfd4326926f02ea4fcdfc25d513411a0c4fc8e4a1ca51f", size = 1307254 },
1073
+ { url = "https://files.pythonhosted.org/packages/d7/d0/3deb5ea2bf7e4d845339875dbb31f3c3f66c8d6568723db1d137fb08a91c/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:e70869737cc0e5bd903f620667da6c330d6737048d1f44db792a6af68a1d35be", size = 2194712 },
1074
+ { url = "https://files.pythonhosted.org/packages/9c/5f/b38d728dd30954816497b53425cfaddaf7b93ac0912db5911888f191b07a/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:06b099dd743676dbcd8abd8465ceac8f6d97d8bfaabe2c83b965495523b4cef2", size = 2355625 },
1075
+ { url = "https://files.pythonhosted.org/packages/7e/4f/61c0faa9ae9e53600a1b7a0c367bc9db1a4fdc625402ec232c755a05e094/marisa_trie-1.2.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d2a82eb21afdaf22b50d9b996472305c05ca67fc4ff5a026a220320c9c961db6", size = 2290290 },
1076
+ { url = "https://files.pythonhosted.org/packages/7c/7d/713b970fb3043248881ed776dbf4d54918398aa5dde843a38711d0d62c8f/marisa_trie-1.2.1-cp310-cp310-win32.whl", hash = "sha256:8951e7ce5d3167fbd085703b4cbb3f47948ed66826bef9a2173c379508776cf5", size = 130743 },
1077
+ { url = "https://files.pythonhosted.org/packages/cc/94/3d619cc82c30daeacd18a88674f4e6540ebfb7b4b7752ca0552793be80cf/marisa_trie-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:5685a14b3099b1422c4f59fa38b0bf4b5342ee6cc38ae57df9666a0b28eeaad3", size = 151891 },
1078
+ { url = "https://files.pythonhosted.org/packages/4a/93/ffb01dfa22b6eee918e798e0bc3487427036c608aa4c065725f31aaf4104/marisa_trie-1.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:ed3fb4ed7f2084597e862bcd56c56c5529e773729a426c083238682dba540e98", size = 362823 },
1079
+ { url = "https://files.pythonhosted.org/packages/6d/1d/5c36500ac350c278c9bdfd88e17fa846fa4136d75597c167141ed973cdf2/marisa_trie-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0fe69fb9ffb2767746181f7b3b29bbd3454d1d24717b5958e030494f3d3cddf3", size = 192741 },
1080
+ { url = "https://files.pythonhosted.org/packages/e8/04/87dd0840f3f720e511eba56193c02bf64d7d96df1ca9f6d19994f55154be/marisa_trie-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4728ed3ae372d1ea2cdbd5eaa27b8f20a10e415d1f9d153314831e67d963f281", size = 174995 },
1081
+ { url = "https://files.pythonhosted.org/packages/c9/51/9e903a7e13b7593e2e675d0ec4c390ca076dc5df1c1a0d5e85a513b886a3/marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8cf4f25cf895692b232f49aa5397af6aba78bb679fb917a05fce8d3cb1ee446d", size = 1384728 },
1082
+ { url = "https://files.pythonhosted.org/packages/e8/3f/7362a5ac60c2b0aad0f52cd57e7bd0c708f20d2660d8df85360f3d8f1c4b/marisa_trie-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7cca7f96236ffdbf49be4b2e42c132e3df05968ac424544034767650913524de", size = 1412620 },
1083
+ { url = "https://files.pythonhosted.org/packages/1f/bc/aaa3eaf6875f78a204a8da9692d56e3a36f89997dad2c388628385614576/marisa_trie-1.2.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d7eb20bf0e8b55a58d2a9b518aabc4c18278787bdba476c551dd1c1ed109e509", size = 1361555 },
1084
+ { url = "https://files.pythonhosted.org/packages/18/98/e11b5a6206c5d110f32adab37fa84a85410d684e9c731acdd5c9250e2ce4/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b1ec93f0d1ee6d7ab680a6d8ea1a08bf264636358e92692072170032dda652ba", size = 2257717 },
1085
+ { url = "https://files.pythonhosted.org/packages/d2/9d/6b4a40867875e738a67c5b29f83e2e490a66bd9067ace3dd9a5c497e2b7f/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:e2699255d7ac610dee26d4ae7bda5951d05c7d9123a22e1f7c6a6f1964e0a4e4", size = 2417044 },
1086
+ { url = "https://files.pythonhosted.org/packages/fe/61/e25613c72f2931757334b8bcf6b501569ef713f5ee9c6c7688ec460bd720/marisa_trie-1.2.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:c484410911182457a8a1a0249d0c09c01e2071b78a0a8538cd5f7fa45589b13a", size = 2351960 },
1087
+ { url = "https://files.pythonhosted.org/packages/19/0a/a90ccaf3eb476d13ec261f80c6c52defaf10ebc7f35eb2bcd7dfb533aef7/marisa_trie-1.2.1-cp311-cp311-win32.whl", hash = "sha256:ad548117744b2bcf0e3d97374608be0a92d18c2af13d98b728d37cd06248e571", size = 130446 },
1088
+ { url = "https://files.pythonhosted.org/packages/fc/98/574b4e143e0a2f5f71af8716b6c4a8a46220f75a6e0847ce7d11ee0ba4aa/marisa_trie-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:436f62d27714970b9cdd3b3c41bdad046f260e62ebb0daa38125ef70536fc73b", size = 152037 },
1089
+ { url = "https://files.pythonhosted.org/packages/4e/bf/8bd4ac8436b33fd46c9e1ffe3c2a131cd9744cc1649dbbe13308f744ef2b/marisa_trie-1.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:638506eacf20ca503fff72221a7e66a6eadbf28d6a4a6f949fcf5b1701bb05ec", size = 360041 },
1090
+ { url = "https://files.pythonhosted.org/packages/ab/dd/4d3151e302e66ae387885f6ec265bd189e096b0c43c1379bfd9a3b9d2543/marisa_trie-1.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de1665eaafefa48a308e4753786519888021740501a15461c77bdfd57638e6b4", size = 190520 },
1091
+ { url = "https://files.pythonhosted.org/packages/00/28/ae5991c74fb90b173167a366a634c83445f948ad044d37287b478d6b457e/marisa_trie-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f713af9b8aa66a34cd3a78c7d150a560a75734713abe818a69021fd269e927fa", size = 174175 },
1092
+ { url = "https://files.pythonhosted.org/packages/5a/6a/fbfa89a8680eaabc6847a6c421e65427c43182db0c4bdb60e1516c81c822/marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2a7d00f53f4945320b551bccb826b3fb26948bde1a10d50bb9802fabb611b10", size = 1354995 },
1093
+ { url = "https://files.pythonhosted.org/packages/9e/4c/2ba0b385e5f64ca4ddb0c10ec52ddf881bc4521f135948786fc339d1d6c8/marisa_trie-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98042040d1d6085792e8d0f74004fc0f5f9ca6091c298f593dd81a22a4643854", size = 1390989 },
1094
+ { url = "https://files.pythonhosted.org/packages/6b/22/0791ed3045c91d0938345a86be472fc7c188b894f16c5dfad2ef31e7f882/marisa_trie-1.2.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6532615111eec2c79e711965ece0bc95adac1ff547a7fff5ffca525463116deb", size = 1328810 },
1095
+ { url = "https://files.pythonhosted.org/packages/9d/7d/3f566e563abae6efce7fc311c63282a447c611739b3cd66c0e36077c86f8/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:20948e40ab2038e62b7000ca6b4a913bc16c91a2c2e6da501bd1f917eeb28d51", size = 2230222 },
1096
+ { url = "https://files.pythonhosted.org/packages/a5/0b/38fbb4611b5d1030242ddc2aa62e524438c8076e26f87395dbbf222dc62d/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:66b23e5b35dd547f85bf98db7c749bc0ffc57916ade2534a6bbc32db9a4abc44", size = 2383620 },
1097
+ { url = "https://files.pythonhosted.org/packages/ae/17/4553c63de29904d5d2521a24cad817bc7883cfa90506ab702ec4dae59a7b/marisa_trie-1.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6704adf0247d2dda42e876b793be40775dff46624309ad99bc7537098bee106d", size = 2329202 },
1098
+ { url = "https://files.pythonhosted.org/packages/45/08/6307a630e63cd763fe77ac56516faa67fa9cd342060691e40fabc84be6b0/marisa_trie-1.2.1-cp312-cp312-win32.whl", hash = "sha256:3ad356442c2fea4c2a6f514738ddf213d23930f942299a2b2c05df464a00848a", size = 129652 },
1099
+ { url = "https://files.pythonhosted.org/packages/a1/fe/67c357bfd92710d95a16b86e1453c663d565415d7f7838781c79ff7e1a7e/marisa_trie-1.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:f2806f75817392cedcacb24ac5d80b0350dde8d3861d67d045c1d9b109764114", size = 150845 },
1100
+ { url = "https://files.pythonhosted.org/packages/2a/a4/a110cd9952f0e72da7bafea1f0084b18b9e03952110d9083bfda52279f5c/marisa_trie-1.2.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b5ea16e69bfda0ac028c921b58de1a4aaf83d43934892977368579cd3c0a2554", size = 354439 },
1101
+ { url = "https://files.pythonhosted.org/packages/3c/a5/a6099eb1c3fd8d7e93408c45501e1d08536ac57dfef02ec331f78e1ace18/marisa_trie-1.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9f627f4e41be710b6cb6ed54b0128b229ac9d50e2054d9cde3af0fef277c23cf", size = 188187 },
1102
+ { url = "https://files.pythonhosted.org/packages/7c/cc/f637127e2beffa920d21f7fc45b4029575bcd1b28a90c0d90cb2b08c2205/marisa_trie-1.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5e649f3dc8ab5476732094f2828cc90cac3be7c79bc0c8318b6fda0c1d248db4", size = 171484 },
1103
+ { url = "https://files.pythonhosted.org/packages/6d/0f/29f2ad7260b956570f69f25a542efa51ba76eb76ecd53c63ee9d21987c3d/marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46e528ee71808c961baf8c3ce1c46a8337ec7a96cc55389d11baafe5b632f8e9", size = 1319770 },
1104
+ { url = "https://files.pythonhosted.org/packages/f2/12/0b69ed61fba59551a5f3d569af367afae614db7214ce1da12946ba9a433a/marisa_trie-1.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36aa4401a1180615f74d575571a6550081d84fc6461e9aefc0bb7b2427af098e", size = 1356488 },
1105
+ { url = "https://files.pythonhosted.org/packages/33/23/483b110db7ffe8729d6ebea2bf74258aef51f10fef5775f99e4bac7aef69/marisa_trie-1.2.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce59bcd2cda9bb52b0e90cc7f36413cd86c3d0ce7224143447424aafb9f4aa48", size = 1302334 },
1106
+ { url = "https://files.pythonhosted.org/packages/1c/6f/46c2be99ce925985127fdf78900f1673bce8cb72debfebee6dccd11032c6/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f4cd800704a5fc57e53c39c3a6b0c9b1519ebdbcb644ede3ee67a06eb542697d", size = 2202624 },
1107
+ { url = "https://files.pythonhosted.org/packages/fd/b6/ef642327dbd4ec35be55d5682520b8f70fca98a54024f441ef2732f6b305/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2428b495003c189695fb91ceeb499f9fcced3a2dce853e17fa475519433c67ff", size = 2364206 },
1108
+ { url = "https://files.pythonhosted.org/packages/69/04/ef8197a79d0ab5043b781cc9b457bd11b81d4204fe78adf7625a67f48c21/marisa_trie-1.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:735c363d9aaac82eaf516a28f7c6b95084c2e176d8231c87328dc80e112a9afa", size = 2304801 },
1109
+ { url = "https://files.pythonhosted.org/packages/03/72/f87564d653daf31d8f33d9bf0121e99ccc21f18f5c485fb404ba06abc10e/marisa_trie-1.2.1-cp313-cp313-win32.whl", hash = "sha256:eba6ca45500ca1a042466a0684aacc9838e7f20fe2605521ee19f2853062798f", size = 128799 },
1110
+ { url = "https://files.pythonhosted.org/packages/27/40/5f9eb8b73030cc4b0d6817176e66079a62a2ddd9d5530da54f8011473428/marisa_trie-1.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:aa7cd17e1c690ce96c538b2f4aae003d9a498e65067dd433c52dd069009951d4", size = 149035 },
1111
+ ]
1112
+
1113
  [[package]]
1114
  name = "markdown-it-py"
1115
  version = "3.0.0"