# evals-for-every-language / languagebench.py
# Author: David Pomerenke — commit 3a246c9 ("Init")
import asyncio
import json
import os
from os import getenv
import evaluate
from dotenv import load_dotenv
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from joblib.memory import Memory
# --- Configuration ---------------------------------------------------------
# OpenRouter model identifiers to benchmark.
models = [
    "openai/gpt-4o-mini",
    "google/gemini-flash-1.5",
    "anthropic/claude-3.5-sonnet",
    "qwen/qwen-2.5-72b-instruct",
    "meta-llama/llama-3.1-8b-instruct",
]
original_language = "eng_Latn"
# FLORES+ dev split; each file is named dev.<lang_code>, one sentence per line.
dataset = "floresp-v2.0-rc.3/dev"
# To benchmark every language present in the dataset instead, use:
# target_languages = [f.split(".")[1] for f in os.listdir(dataset)]
target_languages = [
    "eng_Latn",
    "deu_Latn",
    "fra_Latn",
    "spa_Latn",
    "cmn_Hans",
]

# --- Setup -----------------------------------------------------------------
# Load .env BEFORE reading OPENROUTER_API_KEY: the original constructed the
# client first and called load_dotenv() afterwards, so a key stored only in
# .env was silently ignored (api_key ended up None).
load_dotenv()
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)
cache = Memory(location=".cache", verbose=0).cache
bleu = evaluate.load("bleu")
@cache
async def translate(model, target_language, sentence):
    """Translate `sentence` from the module-level `original_language` into
    `target_language` using `model` via the OpenRouter chat-completions API.

    Returns the model's reply text (the raw translation string).

    NOTE(review): joblib's `Memory.cache` memoizes the wrapped function's
    *return value*; for an `async def` that value is a coroutine object,
    which joblib cannot meaningfully persist — so this cache likely never
    stores a usable result. Verify, and consider caching a sync wrapper
    around the awaited string instead.
    """
    reply = await client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Translate the following text from {original_language} to {target_language}:\n\n{sentence}",
            }
        ],
        temperature=0,  # deterministic sampling so repeated runs are comparable
    )
    return reply.choices[0].message.content
async def main():
    """Benchmark every (model, target_language) pair on the first `n`
    dataset sentences and write BLEU scores to results.json.

    Side effects: prints progress, issues API requests via `translate`,
    and overwrites results.json in the working directory.
    """
    n = 30  # sentences per language pair; keeps API cost and runtime bounded
    results = []
    # Context managers + explicit encoding: the original leaked the file
    # handles returned by bare open(...).readlines() and relied on the
    # platform default encoding, which breaks on non-UTF-8 locales.
    with open(f"{dataset}/dev.{original_language}", encoding="utf-8") as f:
        original_sentences = f.readlines()
    for target_language in target_languages:
        with open(f"{dataset}/dev.{target_language}", encoding="utf-8") as f:
            target_sentences = f.readlines()
        for model in models:
            print(f"{model} -> {target_language}")
            # Fire all n translation requests concurrently with a progress bar.
            predictions = await tqdm_asyncio.gather(
                *[
                    translate(model, target_language, sentence)
                    for sentence in original_sentences[:n]
                ],
            )
            metrics = bleu.compute(
                predictions=predictions, references=target_sentences[:n]
            )
            results.append(
                {
                    "model": model,
                    "original_language": original_language,
                    "target_language": target_language,
                    "bleu": metrics["bleu"],
                }
            )
    with open("results.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)


if __name__ == "__main__":
    asyncio.run(main())