David Pomerenke commited on
Commit
3a246c9
·
0 Parent(s):
Files changed (9) hide show
  1. .env.example +1 -0
  2. .gitignore +14 -0
  3. .python-version +1 -0
  4. README.md +9 -0
  5. dashboard.py +22 -0
  6. languagebench.py +86 -0
  7. pyproject.toml +16 -0
  8. results.json +152 -0
  9. uv.lock +0 -0
.env.example ADDED
@@ -0,0 +1 @@
 
 
1
+ OPENROUTER_API_KEY=
.gitignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ floresp-*
2
+ .cache
3
+ .env
4
+
5
+ # Python-generated files
6
+ __pycache__/
7
+ *.py[oc]
8
+ build/
9
+ dist/
10
+ wheels/
11
+ *.egg-info
12
+
13
+ # Virtual environments
14
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # languagebench 🌍
2
+
3
+ Benchmarking all big AI models on all benchmarkable languages.
4
+
5
+ Sources:
6
+
7
+ 1. For AI models: [OpenRouter](https://openrouter.ai/)
8
+ 2. For language benchmarks: [FLORES+](https://github.com/openlanguagedata/flores)
9
+ 3. For language statistics: [Wikidata](https://gist.github.com/unhammer/3e8f2e0f79972bf5008a4c970081502d) (Potential alternative: [Ethnologue](https://www.ethnologue.com/browse/names/))
dashboard.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
import pandas as pd
import plotly.express as px

st.title("Language Bench")

st.write("## Results")


# Load the benchmark output produced by languagebench.py and show it raw.
results = pd.read_json("results.json")

st.dataframe(results)

# One section per target language: a bar chart of BLEU per model.
# BLEU is bounded in [0, 1], so the y-axis is fixed for comparability.
for language in results["target_language"].unique():
    st.write(f"## {language}")
    language_rows = results[results["target_language"] == language]
    chart = px.bar(language_rows, x="model", y="bleu", range_y=[0, 1])
    st.plotly_chart(chart)
languagebench.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import asyncio
import json
import os
from os import getenv

import evaluate
from dotenv import load_dotenv
from openai import AsyncOpenAI
from tqdm.asyncio import tqdm_asyncio
from joblib.memory import Memory

# config
models = [
    "openai/gpt-4o-mini",
    "google/gemini-flash-1.5",
    "anthropic/claude-3.5-sonnet",
    "qwen/qwen-2.5-72b-instruct",
    "meta-llama/llama-3.1-8b-instruct",
]
original_language = "eng_Latn"
dataset = "floresp-v2.0-rc.3/dev"
# target_languages = [f.split(".")[1] for f in os.listdir(dataset)]
target_languages = [
    "eng_Latn",
    "deu_Latn",
    "fra_Latn",
    "spa_Latn",
    "cmn_Hans",
]

# setup
# load_dotenv() must run BEFORE the client is constructed: getenv() below
# reads OPENROUTER_API_KEY, which may only exist in the .env file.
load_dotenv()
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)
# Disk-backed memoization so repeated runs don't re-pay for API calls.
cache = Memory(location=".cache", verbose=0).cache
bleu = evaluate.load("bleu")
40
+
41
@cache
async def translate(model: str, target_language: str, sentence: str) -> str:
    """Translate one sentence from the configured original language into
    *target_language* using *model* via the OpenRouter API.

    Results are memoized on disk by joblib's Memory via @cache, keyed on
    (model, target_language, sentence), so re-runs skip the API call.
    NOTE(review): joblib Memory caching an `async def` is unusual — it wraps
    the coroutine-returning function, and whether the awaited result (rather
    than the coroutine object) is what gets persisted should be verified.

    temperature=0 is used to make the translation as deterministic as the
    provider allows, which matters for cache validity and reproducibility.
    """
    reply = await client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": f"Translate the following text from {original_language} to {target_language}:\n\n{sentence}",
            }
        ],
        temperature=0,
    )
    # Return only the message text of the first (and only) completion choice.
    return reply.choices[0].message.content
54
+
55
+
56
async def main():
    """Benchmark every configured model on every target language.

    For each (target_language, model) pair, translate the first *n* FLORES+
    dev sentences from the original language, score the batch with BLEU
    against the reference translations, and dump all results to results.json.

    Side effects: network calls via translate(), console progress output,
    and writing results.json in the working directory.
    """
    n = 30  # number of sentences benchmarked per (model, language) pair
    results = []
    # Use context managers so the dataset files are closed deterministically
    # (the original left them open). readlines() keeps trailing newlines,
    # matching what bleu.compute previously received.
    with open(f"{dataset}/dev.{original_language}") as f:
        original_sentences = f.readlines()
    for target_language in target_languages:
        with open(f"{dataset}/dev.{target_language}") as f:
            target_sentences = f.readlines()
        for model in models:
            print(f"{model} -> {target_language}")
            # Fire all n translation requests concurrently with a progress bar.
            predictions = await tqdm_asyncio.gather(
                *[
                    translate(model, target_language, sentence)
                    for sentence in original_sentences[:n]
                ],
            )
            metrics = bleu.compute(
                predictions=predictions, references=target_sentences[:n]
            )
            results.append(
                {
                    "model": model,
                    "original_language": original_language,
                    "target_language": target_language,
                    "bleu": metrics["bleu"],
                }
            )
    # Persist the full result set once all pairs are scored.
    with open("results.json", "w") as f:
        json.dump(results, f, indent=2)
83
+
84
+
85
# Script entry point: run the whole benchmark inside a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "languagebench"
3
+ version = "0.1.0"
4
+ description = "Benchmark of AI language models across many languages using FLORES+ and BLEU"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "evaluate>=0.4.3",
9
+ "joblib>=1.4.2",
10
+ "openai>=1.52.2",
11
+ "pandas>=2.2.3",
12
+ "plotly>=5.24.1",
13
+ "python-dotenv>=1.0.1",
14
+ "streamlit>=1.39.0",
15
+ "tqdm>=4.66.6",
16
+ ]
results.json ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "model": "openai/gpt-4o-mini",
4
+ "original_language": "eng_Latn",
5
+ "target_language": "eng_Latn",
6
+ "bleu": 0.9601875101934466
7
+ },
8
+ {
9
+ "model": "google/gemini-flash-1.5",
10
+ "original_language": "eng_Latn",
11
+ "target_language": "eng_Latn",
12
+ "bleu": 0.796483772261889
13
+ },
14
+ {
15
+ "model": "anthropic/claude-3.5-sonnet",
16
+ "original_language": "eng_Latn",
17
+ "target_language": "eng_Latn",
18
+ "bleu": 0.4789694173473208
19
+ },
20
+ {
21
+ "model": "qwen/qwen-2.5-72b-instruct",
22
+ "original_language": "eng_Latn",
23
+ "target_language": "eng_Latn",
24
+ "bleu": 0.5708253125905761
25
+ },
26
+ {
27
+ "model": "meta-llama/llama-3.1-8b-instruct",
28
+ "original_language": "eng_Latn",
29
+ "target_language": "eng_Latn",
30
+ "bleu": 0.7139866196167579
31
+ },
32
+ {
33
+ "model": "openai/gpt-4o-mini",
34
+ "original_language": "eng_Latn",
35
+ "target_language": "deu_Latn",
36
+ "bleu": 0.42769123869791453
37
+ },
38
+ {
39
+ "model": "google/gemini-flash-1.5",
40
+ "original_language": "eng_Latn",
41
+ "target_language": "deu_Latn",
42
+ "bleu": 0.481667025275085
43
+ },
44
+ {
45
+ "model": "anthropic/claude-3.5-sonnet",
46
+ "original_language": "eng_Latn",
47
+ "target_language": "deu_Latn",
48
+ "bleu": 0.47566381880734276
49
+ },
50
+ {
51
+ "model": "qwen/qwen-2.5-72b-instruct",
52
+ "original_language": "eng_Latn",
53
+ "target_language": "deu_Latn",
54
+ "bleu": 0.3886704151083369
55
+ },
56
+ {
57
+ "model": "meta-llama/llama-3.1-8b-instruct",
58
+ "original_language": "eng_Latn",
59
+ "target_language": "deu_Latn",
60
+ "bleu": 0.3229429355718441
61
+ },
62
+ {
63
+ "model": "openai/gpt-4o-mini",
64
+ "original_language": "eng_Latn",
65
+ "target_language": "fra_Latn",
66
+ "bleu": 0.4770220301445618
67
+ },
68
+ {
69
+ "model": "google/gemini-flash-1.5",
70
+ "original_language": "eng_Latn",
71
+ "target_language": "fra_Latn",
72
+ "bleu": 0.4950529382461408
73
+ },
74
+ {
75
+ "model": "anthropic/claude-3.5-sonnet",
76
+ "original_language": "eng_Latn",
77
+ "target_language": "fra_Latn",
78
+ "bleu": 0.505571990673057
79
+ },
80
+ {
81
+ "model": "qwen/qwen-2.5-72b-instruct",
82
+ "original_language": "eng_Latn",
83
+ "target_language": "fra_Latn",
84
+ "bleu": 0.4343766704709354
85
+ },
86
+ {
87
+ "model": "meta-llama/llama-3.1-8b-instruct",
88
+ "original_language": "eng_Latn",
89
+ "target_language": "fra_Latn",
90
+ "bleu": 0.3738013101452592
91
+ },
92
+ {
93
+ "model": "openai/gpt-4o-mini",
94
+ "original_language": "eng_Latn",
95
+ "target_language": "spa_Latn",
96
+ "bleu": 0.34656060748435535
97
+ },
98
+ {
99
+ "model": "google/gemini-flash-1.5",
100
+ "original_language": "eng_Latn",
101
+ "target_language": "spa_Latn",
102
+ "bleu": 0.3449205632717461
103
+ },
104
+ {
105
+ "model": "anthropic/claude-3.5-sonnet",
106
+ "original_language": "eng_Latn",
107
+ "target_language": "spa_Latn",
108
+ "bleu": 0.34586378905270954
109
+ },
110
+ {
111
+ "model": "qwen/qwen-2.5-72b-instruct",
112
+ "original_language": "eng_Latn",
113
+ "target_language": "spa_Latn",
114
+ "bleu": 0.3341419407814188
115
+ },
116
+ {
117
+ "model": "meta-llama/llama-3.1-8b-instruct",
118
+ "original_language": "eng_Latn",
119
+ "target_language": "spa_Latn",
120
+ "bleu": 0.29470460185415065
121
+ },
122
+ {
123
+ "model": "openai/gpt-4o-mini",
124
+ "original_language": "eng_Latn",
125
+ "target_language": "cmn_Hans",
126
+ "bleu": 0.0
127
+ },
128
+ {
129
+ "model": "google/gemini-flash-1.5",
130
+ "original_language": "eng_Latn",
131
+ "target_language": "cmn_Hans",
132
+ "bleu": 0.0
133
+ },
134
+ {
135
+ "model": "anthropic/claude-3.5-sonnet",
136
+ "original_language": "eng_Latn",
137
+ "target_language": "cmn_Hans",
138
+ "bleu": 0.0
139
+ },
140
+ {
141
+ "model": "qwen/qwen-2.5-72b-instruct",
142
+ "original_language": "eng_Latn",
143
+ "target_language": "cmn_Hans",
144
+ "bleu": 0.0
145
+ },
146
+ {
147
+ "model": "meta-llama/llama-3.1-8b-instruct",
148
+ "original_language": "eng_Latn",
149
+ "target_language": "cmn_Hans",
150
+ "bleu": 0.0
151
+ }
152
+ ]
uv.lock ADDED
The diff for this file is too large to render. See raw diff