David Pomerenke committed on
Commit
8941a67
·
1 Parent(s): f52ec6e

Update models

Browse files
Files changed (2) hide show
  1. evals/main.py +1 -1
  2. evals/models.py +23 -20
evals/main.py CHANGED
@@ -13,7 +13,7 @@ from tasks import tasks
13
 
14
  n_sentences = 10
15
  n_languages = 10
16
- n_models = 3
17
 
18
  # ===== run evaluation and aggregate results =====
19
 
 
13
 
14
  n_sentences = 10
15
  n_languages = 10
16
+ n_models = 20
17
 
18
  # ===== run evaluation and aggregate results =====
19
 
evals/models.py CHANGED
@@ -16,25 +16,28 @@ from requests import HTTPError, get
16
  # for development purposes, all languages will be evaluated on the fast models
17
  # and only a sample of languages will be evaluated on all models
18
  models = [
19
- "openai/gpt-4o-mini", # 0.6$/M tokens
20
- # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
21
- "meta-llama/llama-4-maverick", # 0.6$/M tokens
22
- "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
23
- "meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
24
- "meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
25
- "mistralai/mistral-small-3.1-24b-instruct", # 0.3$/M tokens
26
- # "mistralai/mistral-saba", # 0.6$/M tokens
27
- # "mistralai/mistral-nemo", # 0.08$/M tokens
28
- "google/gemini-2.0-flash-001", # 0.4$/M tokens
29
- # "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
30
- "google/gemma-3-27b-it", # 0.2$/M tokens
31
- # "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
32
- "qwen/qwq-32b", # 0.2$/M tokens
33
- "deepseek/deepseek-chat-v3-0324", # 1.1$/M tokens
34
- # "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
35
- "microsoft/phi-4-multimodal-instruct", # 0.1$/M tokens
36
- "amazon/nova-micro-v1", # 0.09$/M tokens
37
- # "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
 
 
 
38
  ]
39
 
40
  transcription_models = [
@@ -63,7 +66,7 @@ def get_popular_models(date: date):
63
 
64
 
65
  pop_models = get_popular_models(date.today())
66
- models += [m for m in pop_models if m not in models][:1]
67
 
68
  load_dotenv()
69
  client = AsyncOpenAI(
 
16
  # for development purposes, all languages will be evaluated on the fast models
17
  # and only a sample of languages will be evaluated on all models
18
  models = [
19
+ "meta-llama/llama-4-maverick", # 0.6$
20
+ "meta-llama/llama-3.3-70b-instruct", # 0.3$
21
+ "meta-llama/llama-3.1-70b-instruct", # 0.3$
22
+ "meta-llama/llama-3-70b-instruct", # 0.4$
23
+ # "meta-llama/llama-2-70b-chat", # 0.9$; not enough context
24
+ "openai/gpt-4.1-nano", # 0.4$
25
+ "openai/gpt-4o-mini", # 0.6$
26
+ # "openai/gpt-3.5-turbo-0613", # 2$
27
+ # "openai/gpt-3.5-turbo", # 1.5$
28
+ # "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
29
+ "mistralai/mistral-small-3.1-24b-instruct", # 0.3$
30
+ # "mistralai/mistral-saba", # 0.6$
31
+ # "mistralai/mistral-nemo", # 0.08$
32
+ "google/gemini-2.5-flash-preview", # 0.6$
33
+ # "google/gemini-2.0-flash-lite-001", # 0.3$
34
+ "google/gemma-3-27b-it", # 0.2$
35
+ # "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
36
+ "qwen/qwq-32b", # 0.2$
37
+ "deepseek/deepseek-chat-v3-0324", # 1.1$
38
+ # "microsoft/phi-4", # 0.07$; only 16k tokens context
39
+ "microsoft/phi-4-multimodal-instruct", # 0.1$
40
+ "amazon/nova-micro-v1", # 0.09$
41
  ]
42
 
43
  transcription_models = [
 
66
 
67
 
68
  pop_models = get_popular_models(date.today())
69
+ # models += [m for m in pop_models if m not in models][:1]
70
 
71
  load_dotenv()
72
  client = AsyncOpenAI(