|
import asyncio |
|
import json |
|
import os |
|
import re |
|
from os import getenv |
|
|
|
import evaluate |
|
import pandas as pd |
|
import requests |
|
from aiolimiter import AsyncLimiter |
|
from dotenv import load_dotenv |
|
from joblib.memory import Memory |
|
from openai import AsyncOpenAI |
|
from tqdm.asyncio import tqdm_asyncio |
|
from transformers import NllbTokenizer |
|
from datetime import date |
|
from requests import get |
|
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION |
|
from langcodes import standardize_tag, Language |
|
|
|
|
|
# Models under evaluation, identified by their OpenRouter IDs
# ("provider/model-name").
models = [
    "openai/gpt-4o-mini",
    "meta-llama/llama-3.3-70b-instruct",
    "mistralai/mistral-small-24b-instruct-2501",
    "google/gemini-2.0-flash-001",
    "deepseek/deepseek-chat",
    "microsoft/phi-4",
]

# Model run on every language; all other models are restricted to the small
# `detailed_languages` subset (see the skip logic in main()).
fast_model = "meta-llama/llama-3.3-70b-instruct"

# Number of benchmark sentences translated per language/model pair.
n_sentences = 30
|
|
|
|
|
# Pull OPENROUTER_API_KEY (and any other settings) from a local .env file.
load_dotenv()

# OpenAI-compatible async client pointed at the OpenRouter aggregator.
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)

# Disk-backed memoization decorator; repeated identical calls (e.g. API
# completions) are replayed from the .cache directory.
cache = Memory(location=".cache", verbose=0).cache

# Evaluation metrics. NOTE(review): bertscore is loaded but not used in the
# visible scoring code — confirm whether it is still needed.
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

# NLLB tokenizer, passed to BLEU as the tokenizer so scoring works across
# the many scripts in the benchmark.
tokenizer = NllbTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

# Client-side throttle: at most 20 requests per 1-second window.
rate_limit = AsyncLimiter(max_rate=20, time_period=1)
|
|
|
|
|
def reorder(language_name):
    """Convert an inverted "Family, Qualifier" language name to natural order.

    E.g. "Arabic, Egyptian" -> "Egyptian Arabic". Names containing a
    parenthesized part (e.g. "Greek, Ancient (to 1453)") and names without
    a comma are returned unchanged.
    """
    if "," in language_name and "(" not in language_name:
        family, _, qualifier = language_name.partition(",")
        # strip() removes the space that usually follows the comma; the
        # previous concatenation produced a leading and doubled space here.
        return f"{qualifier.strip()} {family.strip()}"
    return language_name
|
|
|
|
|
|
|
# All languages with known speaker populations, excluding region-specific
# variants such as "en-US" (keep only the bare language tags).
languages = {
    lang: pop
    for lang, pop in LANGUAGE_SPEAKING_POPULATION.items()
    if not re.match(r".*-[A-Z]{2}$", lang)
}
languages = pd.DataFrame(list(languages.items()), columns=["bcp_47", "speakers"])
# Human-readable English display name for each BCP-47 tag.
languages["name"] = languages["bcp_47"].apply(lambda x: Language.get(x).display_name())
|
|
|
|
|
# ISO 15924 script codes mapped to English script names (e.g. "Latn" -> "Latin").
scripts = pd.read_csv("data/ScriptCodes.csv").rename(columns={"Code": "iso15924", "English Name": "script_name"})
|
|
|
def script_name(iso15924):
    """Return the English name for an ISO 15924 script code.

    Raises IndexError if the code is absent from the scripts table.
    """
    matching_rows = scripts.loc[scripts["iso15924"] == iso15924, "script_name"]
    return matching_rows.values[0]
|
|
|
|
|
# FLORES+ dev split: one file per language, named "dev.<iso639_3>_<iso15924>".
benchmark_dir = "data/floresp-v2.0-rc.3/dev"
benchmark_languages = pd.DataFrame(
    [f.split(".")[1].split("_", 1) for f in os.listdir(benchmark_dir)],
    columns=["iso639_3", "iso15924"],
)
# Normalize to BCP-47, folding individual languages into their macrolanguage.
benchmark_languages["bcp_47"] = benchmark_languages.apply(
    lambda row: standardize_tag(row["iso639_3"] + "-" + row["iso15924"], macro=True),
    axis=1,
)

# Drop any script subtag (e.g. "-Latn") that standardize_tag retained.
benchmark_languages["bcp_47"] = benchmark_languages["bcp_47"].apply(
    lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
)
# Multiple script variants can collapse into the same tag; keep the first
# (iso639_3, iso15924) pair per tag.
benchmark_languages = (
    benchmark_languages.groupby("bcp_47")
    .agg({"iso639_3": "first", "iso15924": "first"})
    .reset_index()
)
|
|
|
|
|
|
|
@cache
def get_commonvoice_stats(date: date):
    """Fetch per-language statistics from the Common Voice API.

    The `date` argument is not sent with the request; it exists purely as
    the joblib cache key, so the data is re-fetched at most once per day.
    """
    url = "https://commonvoice.mozilla.org/api/v1/stats/languages"
    return requests.get(url).json()
|
|
|
|
|
# Today's Common Voice stats (cached per day via the date argument), reduced
# to the locale tag and its validated hours of recordings.
commonvoice_stats = pd.DataFrame(get_commonvoice_stats(date.today())).rename(
    columns={"locale": "bcp_47", "validatedHours": "commonvoice_hours"}
)[["bcp_47", "commonvoice_hours"]]

# Strip region subtags (e.g. "pt-BR" -> "pt") ...
commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
    lambda x: re.sub(r"-[A-Z]{2}$", "", x)
)
# ... normalize to macrolanguage tags ...
commonvoice_stats["bcp_47"] = commonvoice_stats["bcp_47"].apply(
    lambda x: standardize_tag(x, macro=True)
)
# ... then sum hours across locales that collapsed into the same tag.
commonvoice_stats = commonvoice_stats.groupby("bcp_47").sum().reset_index()
|
|
|
|
|
# Attach benchmark metadata and Common Voice hours to every language
# (left joins: languages without data get NaN columns).
languages = pd.merge(
    languages, benchmark_languages, on="bcp_47", how="left"
)
languages = pd.merge(
    languages, commonvoice_stats, on="bcp_47", how="left"
)
languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])

# Keep only the 10 most-spoken languages.
languages = languages.sort_values(by="speakers", ascending=False)
languages = languages.iloc[:10]

# One target language per benchmark sentence, sampled with replacement
# proportionally to speaker count; fixed seed for reproducibility.
target_languages = languages[languages["in_benchmark"]].sample(
    n=n_sentences, weights="speakers", replace=True, random_state=42
)

# Small subset on which every model (not just fast_model) is evaluated.
detailed_languages = languages[languages["in_benchmark"]].sample(n=3, random_state=42)
|
|
|
|
|
|
|
def check_rate_limit():
    """Print the OpenRouter API key status and one model's listing.

    Diagnostic helper; not called from the main flow.
    """
    headers = {"Authorization": f"Bearer {getenv('OPENROUTER_API_KEY')}"}
    key_status = requests.get(
        "https://openrouter.ai/api/v1/auth/key",
        headers=headers,
    ).json()
    print(key_status)
    # `available` avoids shadowing the module-level `models` list.
    available = requests.get(
        "https://openrouter.ai/api/v1/models",
        headers=headers,
    ).json()["data"]
    model = next((m for m in available if m["id"] == "google/gemini-flash-1.5"), None)
    print(model)
|
|
|
|
|
@cache
async def complete(**kwargs):
    """Issue one rate-limited chat-completion request and return the response.

    Raises Exception when the response carries no choices (e.g. an error
    payload returned with HTTP 200).

    NOTE(review): joblib's Memory.cache is applied to an async function —
    whether the awaited result (vs. a one-shot coroutine object) is what gets
    cached and replayed depends on joblib's handling; verify cache hits
    actually return usable responses.
    """
    async with rate_limit:
        response = await client.chat.completions.create(**kwargs)
    if not response.choices:
        raise Exception(response)
    return response
|
|
|
async def translate(model, target_language, sentence):
    """Ask `model` to translate `sentence` into `target_language`.

    `target_language` must expose `name` and `iso15924` attributes (e.g. a
    row namedtuple from the languages DataFrame). Returns the raw model
    reply text.
    """
    script = script_name(target_language.iso15924)
    prompt = (
        f"Translate the following text to the {target_language.name} language; "
        f"use the {script} script; reply only with the translation:\n\n{sentence}"
    )
    response = await complete(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=1024,
    )
    return response.choices[0].message.content
|
|
|
|
|
def mean(l):
    """Arithmetic mean of the values in `l`; 0 for an empty sequence."""
    if not l:
        return 0
    return sum(l) / len(l)
|
|
|
|
|
def load_sentences(language):
    """Read the FLORES dev sentences for `language`, one per line.

    `language` must expose `iso639_3` and `iso15924` attributes matching a
    file named "dev.<iso639_3>_<iso15924>" inside `benchmark_dir`.
    Returns the list of lines with trailing newlines preserved.
    """
    path = f"{benchmark_dir}/dev.{language.iso639_3}_{language.iso15924}"
    # Context manager closes the handle deterministically (the original
    # left it open until garbage collection). FLORES files are UTF-8, so
    # be explicit rather than relying on the platform default encoding.
    with open(path, encoding="utf-8") as f:
        return f.readlines()
|
|
|
|
|
|
|
async def main():
    """Translate benchmark sentences with each model, score with BLEU, and
    write the per-language results to results.json."""
    results = []
    for language in list(languages.itertuples()):
        scores = []
        if language.in_benchmark:
            # Source sentences in this language, capped at n_sentences.
            original_sentences = load_sentences(language)[:n_sentences]
            for model in models:
                # Only fast_model runs on every language; the other models
                # are restricted to the detailed_languages subset.
                if (
                    model != fast_model
                    and language.bcp_47 not in detailed_languages.bcp_47.values
                ):
                    continue
                # One translation coroutine per sentence; sentence i is
                # translated into the i-th sampled target language. The
                # comprehension's `language` shadows the outer loop variable
                # inside the comprehension scope only (no leak in Python 3).
                predictions = [
                    translate(
                        model,
                        language,
                        sentence,
                    )
                    for sentence, language in zip(
                        original_sentences, target_languages.itertuples()
                    )
                ]
                predictions = await tqdm_asyncio.gather(*predictions, miniters=1, desc=f"{language.name} {model.split('/')[0]}")
                # References: sentence i of the benchmark in target language i.
                target_sentences = [
                    load_sentences(lang)[i]
                    for i, lang in enumerate(target_languages.itertuples())
                ]
                metrics_bleu = bleu.compute(
                    predictions=predictions,
                    references=target_sentences,
                    tokenizer=tokenizer.tokenize,
                )
                scores.append(
                    {
                        "model": model,
                        "bleu": metrics_bleu["bleu"],
                    }
                )
        results.append(
            {
                "language_name": language.name,
                "bcp_47": language.bcp_47,
                # speakers may be NaN after the left merges; coerce to 0.
                "speakers": language.speakers if not pd.isna(language.speakers) else 0,
                "scores": scores,
                "bleu": mean([s["bleu"] for s in scores]) if scores else None,
                # NOTE(review): commonvoice_hours may be NaN, which json.dump
                # serializes as non-standard `NaN` — confirm consumers cope.
                "commonvoice_hours": language.commonvoice_hours,
            }
        )
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
|
|
|
|
|
if __name__ == "__main__":
    # Entry point: run the full evaluation and write results.json.
    asyncio.run(main())
|
|