import asyncio
import json
import os
import random
from os import getenv

import evaluate
import pandas as pd
import requests
from aiolimiter import AsyncLimiter
from dotenv import load_dotenv
from joblib.memory import Memory
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio

# config
models = [
    "openai/gpt-4o-mini",
    "anthropic/claude-3.5-sonnet",
    "meta-llama/llama-3.1-70b-instruct",  # lots of slow repetitions for low-resource languages
    "mistralai/mistral-nemo",
    "google/gemini-flash-1.5",  # very fast
    "qwen/qwen-2.5-72b-instruct",  # somewhat slow
]
original_language = "eng_Latn"
dataset = "floresp-v2.0-rc.3/dev"

random.seed(42)
target_languages = [f.split(".")[1] for f in os.listdir(dataset)]
# pick 15 target languages (random.choices samples with replacement, so
# duplicates are possible) and always include German
target_languages = random.choices(target_languages, k=15) + ["deu_Latn"]

# setup
load_dotenv()
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)
# on-disk cache; note that joblib may not persist the results of async
# functions, since coroutines are not picklable
cache = Memory(location=".cache", verbose=0).cache
bleu = evaluate.load("sacrebleu")
# at most 2 requests per 0.1 s, i.e. 20 requests per second
rate_limit = AsyncLimiter(max_rate=2, time_period=0.1)


def check_rate_limit():
    # print the API key's rate-limit status and the list of available models
    print(
        requests.get(
            "https://openrouter.ai/api/v1/auth/key",
            headers={"Authorization": f"Bearer {getenv('OPENROUTER_API_KEY')}"},
        ).json()
    )
    print(
        requests.get(
            "https://openrouter.ai/api/v1/models",
            headers={"Authorization": f"Bearer {getenv('OPENROUTER_API_KEY')}"},
        ).json()
    )


@cache
async def complete(**kwargs):
    async with rate_limit:
        response = await client.chat.completions.create(**kwargs)
    return response


def reorder(language_name):
    # turn names like "Arabic, Standard" into "Standard Arabic"
    if "," in language_name and "(" not in language_name:
        return language_name.split(",")[1].strip() + " " + language_name.split(",")[0]
    return language_name


language_names = pd.read_csv("LanguageCodes.tab", sep="\t")
language_names["Name"] = language_names["Name"].apply(reorder)
language_stats = pd.read_csv("languages.tsv", sep="\t")
script_names = pd.read_csv("ScriptCodes.csv")


@cache
async def translate(model, target_language, target_script, sentence):
    reply = await complete(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Translate the following text to the {target_language} language; use the {target_script} script; reply only with the translation:\n\n{sentence}",
            }
        ],
        temperature=0.1,
        max_tokens=1024,
    )
    return reply.choices[0].message.content


def get_language_stats(language_code):
    # look up speaker statistics, script name, and language name for a code
    # like "deu_Latn"
    lang, script = language_code.split("_")
    stats = language_stats[language_stats["iso639_3"] == lang]
    if not stats.empty:
        stats = stats.iloc[0].to_dict()
    else:
        stats = dict()
    stats["script"] = script_names[script_names["Code"] == script]["English Name"].iloc[0]
    stats["name"] = language_names[language_names["LangID"] == lang]["Name"].iloc[0]
    return stats


async def main():
    n = 30  # number of sentences to translate per language
    results = []
    original_sentences = open(f"{dataset}/dev.{original_language}").readlines()
    for target_language in target_languages:
        if target_language == original_language:
            continue
        target_sentences = open(f"{dataset}/dev.{target_language}").readlines()
        stats = get_language_stats(target_language)
        for model in models:
            print(f"{model} -> {stats['name']}")
            predictions = [
                translate(model, stats["name"], stats["script"], sentence)
                for sentence in original_sentences[:n]
            ]
            predictions = await tqdm_asyncio.gather(*predictions, miniters=1)
            metrics = bleu.compute(
                predictions=predictions,
                references=target_sentences[:n],
                tokenize="char",
            )
            results.append(
                {
                    "model": model,
                    "original_language": original_language,
                    "target_language": target_language,
                    "target_language_name": stats["name"],
                    "speakers": int(stats.get("maxSpeakers", 0)),
                    "bleu": metrics["score"],
                }
            )
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    # per-language summary, averaged over models
    pd.DataFrame(results).groupby("target_language_name").agg(
        {"bleu": "mean", "speakers": "mean"}
    ).reset_index().to_json("results_summary.json", indent=2, orient="records")


if __name__ == "__main__":
    # check_rate_limit()
    asyncio.run(main())