Spaces:

fair-forward
/

evals-for-every-language

Running

App Files Files Community

davidpomerenke committed on May 22

Commit

913253a

verified ·

1 Parent(s): 7fce0be

Upload from GitHub Actions: Use FLORES+ via Huggingface

Browse files

Files changed (10) hide show

README.md +10 -1
evals/backend.py +1 -1
evals/datasets_/flores.py +16 -17
evals/download_data.py +0 -83
evals/main.py +5 -6
evals/models.py +17 -9
evals/tasks.py +16 -25
languages.json +196 -196
models.json +0 -55
results.json +0 -0

README.md CHANGED Viewed

@@ -43,6 +43,15 @@ For tag meaning, see https://huggingface.co/spaces/leaderboards/LeaderboardsExpl
 _Tracking language proficiency of AI models for every language_
 ```bash
-uv run evals/main.py
 ```

 _Tracking language proficiency of AI models for every language_
+## Evaluate
+```bash
+uv run --extra dev evals/main.py
+```
+## Explore
 ```bash
+uv run evals/backend.py
+cd frontend && npm i && npm start
 ```

evals/backend.py CHANGED Viewed

@@ -23,7 +23,7 @@ def mean(lst):
 task_metrics = [
     "translation_from_bleu",
     "translation_to_bleu",
-    # "classification_accuracy",
     "mmlu_accuracy",
 ]

 task_metrics = [
     "translation_from_bleu",
     "translation_to_bleu",
+    "classification_accuracy",
     "mmlu_accuracy",
 ]

evals/datasets_/flores.py CHANGED Viewed

@@ -1,15 +1,19 @@
-from langcodes import Language, standardize_tag
-import pandas as pd
-import os
 import re
-flores_dir = "data/floresp-v2.0-rc.3/dev"
-def flores_sentences(language) -> list[str] | None:
-    try:
-        return open(f"{flores_dir}/dev.{language.flores_path}").readlines()
-    except FileNotFoundError:
         return None
 def aggregate_flores_paths(flores_paths):
     # takes a list of paths from the same language but different scripts
@@ -22,20 +26,15 @@ def aggregate_flores_paths(flores_paths):
     ]
     return flores_paths.values[populations.index(max(populations))]
-flores = pd.DataFrame(
-    [f.split(".")[1] for f in os.listdir(flores_dir)],
-    columns=["flores_path"],
-)
 flores["bcp_47"] = flores["flores_path"].apply(
     lambda x: standardize_tag(x, macro=True),
 )
 # ignore script (language is language)
 flores["bcp_47"] = flores["bcp_47"].apply(
-    lambda x: re.sub(r"-[A-Z][a-z]+$", "", x)
 )
 flores = (
-    flores.groupby("bcp_47")
-    .agg({"flores_path": aggregate_flores_paths})
-    .reset_index()
 )

 import re
+import pandas as pd
+from datasets_.util import _get_dataset_config_names, _load_dataset
+from langcodes import Language, standardize_tag
+slug = "openlanguagedata/flores_plus"
+splits = _get_dataset_config_names(slug)
+splits.remove("default")
+def flores_sentences(language) -> pd.DataFrame | None:
+    if language.flores_path not in splits:
         return None
+    return _load_dataset(slug, subset=language.flores_path, split="dev").to_pandas()
 def aggregate_flores_paths(flores_paths):
     # takes a list of paths from the same language but different scripts
     ]
     return flores_paths.values[populations.index(max(populations))]
+flores = pd.DataFrame(splits, columns=["flores_path"])
 flores["bcp_47"] = flores["flores_path"].apply(
     lambda x: standardize_tag(x, macro=True),
 )
 # ignore script (language is language)
 flores["bcp_47"] = flores["bcp_47"].apply(
+    lambda x: re.sub(r"-[A-Z][a-z0-9\-]+$", "", x)
 )
 flores = (
+    flores.groupby("bcp_47").agg({"flores_path": aggregate_flores_paths}).reset_index()
 )

evals/download_data.py CHANGED Viewed

@@ -24,9 +24,6 @@ DATA_DIR = project_root / "data"
 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
-FLORES_PLUS_HF_ID = "openlanguagedata/flores_plus"
-FLORES_TARGET_DIR = DATA_DIR / "floresp-v2.0-rc.3" / "dev_parquet" # Note: Saving as parquet
 GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"
@@ -142,37 +139,6 @@ def download_fleurs_data():
         else:
              print(f"Found extracted audio: {audio_extracted_marker}")
-def download_flores_plus_data():
-    """Downloads Flores+ data using Hugging Face datasets library."""
-    print("\n--- Downloading Flores+ Data (requires HF login & accepted terms) ---")
-    FLORES_TARGET_DIR.mkdir(parents=True, exist_ok=True)
-    try:
-        # Check login status first
-        token = huggingface_hub.HfFolder.get_token()
-        if not token:
-            print("Hugging Face token not found. Please log in using `huggingface-cli login`.")
-            print("You also need to accept the terms for 'openlanguagedata/flores_plus' on the HF website.")
-            return
-        print(f"Attempting to download '{FLORES_PLUS_HF_ID}' (dev split)...")
-        # Load only the 'dev' split
-        ds = load_dataset(FLORES_PLUS_HF_ID, split='dev', verification_mode='no_checks')
-        # Save as parquet files, potentially one per language if needed later
-        # For simplicity now, save the whole dev split as one parquet file
-        target_file = FLORES_TARGET_DIR / "dev_split.parquet"
-        print(f"Saving dev split to {target_file}...")
-        ds.to_parquet(target_file)
-        print("Flores+ dev split downloaded and saved as parquet.")
-    except huggingface_hub.utils.GatedRepoError:
-        print(f"Error: Access to '{FLORES_PLUS_HF_ID}' is gated.")
-        print("Please ensure you are logged in (`huggingface-cli login`) and have accepted the terms ")
-        print(f"on the dataset page: https://huggingface.co/datasets/{FLORES_PLUS_HF_ID}")
-    except Exception as e:
-        print(f"An error occurred downloading or saving Flores+: {e}")
 def download_glottolog_data():
     """Downloads and extracts Glottolog languoid CSV."""
@@ -227,53 +193,6 @@ def download_spbleu_data():
     else:
         print(f"Found: {target_dict_file}")
-# --- Conversion Function ---
-def convert_flores_parquet_to_text():
-    """Converts the downloaded Flores+ parquet dev split to text files."""
-    print("\n--- Converting Flores+ Parquet to Text Files ---")
-    parquet_file = FLORES_TARGET_DIR / "dev_split.parquet"
-    text_dir = project_root / "data" / "floresp-v2.0-rc.3" / "dev" # Original expected dir
-    if not parquet_file.exists():
-        print(f"Parquet file not found: {parquet_file}. Skipping conversion.")
-        return
-    try:
-        print(f"Reading parquet file: {parquet_file}")
-        df = pd.read_parquet(parquet_file)
-        print(f"Read {len(df)} rows from parquet.")
-        if not all(col in df.columns for col in ['iso_639_3', 'iso_15924', 'text']):
-            print("Error: Parquet file missing required columns (iso_639_3, iso_15924, text).")
-            return
-        text_dir.mkdir(parents=True, exist_ok=True)
-        print(f"Target directory for text files: {text_dir}")
-        # Group by language and script to create individual files
-        grouped = df.groupby(['iso_639_3', 'iso_15924'])
-        count = 0
-        for (lang, script), group in grouped:
-            target_filename = f"dev.{lang}_{script}"
-            target_path = text_dir / target_filename
-            print(f"Writing {len(group)} sentences to {target_path}...")
-            try:
-                with open(target_path, 'w', encoding='utf-8') as f:
-                    for sentence in group['text']:
-                        f.write(sentence + '\n')
-                count += 1
-            except Exception as e:
-                print(f"Error writing file {target_path}: {e}")
-        print(f"Successfully wrote {count} language/script files to {text_dir}.")
-    except ImportError:
-        print("Error: pandas or pyarrow might be missing. Cannot read parquet.")
-        print("Please install them: pip install pandas pyarrow")
-    except Exception as e:
-        print(f"An error occurred during parquet conversion: {e}")
 # --- Main Execution ---
@@ -282,8 +201,6 @@ def main():
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)
-    download_flores_plus_data()
-    convert_flores_parquet_to_text()
     #download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()

 FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
 FLEURS_TARGET_DIR = DATA_DIR / "fleurs"
 GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip" # Assumed direct link from https://glottolog.org/meta/downloads
 GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
 GLOTTOLOG_CSV_NAME = "languoid.csv"
         else:
              print(f"Found extracted audio: {audio_extracted_marker}")
 def download_glottolog_data():
     """Downloads and extracts Glottolog languoid CSV."""
     else:
         print(f"Found: {target_dict_file}")
 # --- Main Execution ---
     print("Starting data download process...")
     DATA_DIR.mkdir(exist_ok=True)
     #download_fleurs_data()
     download_glottolog_data()
     download_scriptcodes_data()

evals/main.py CHANGED Viewed

@@ -9,8 +9,8 @@ from tqdm.asyncio import tqdm_asyncio
 # ===== config =====
 n_sentences = 10
-n_languages = 20
-n_models = 30
 # ===== run evaluation and aggregate results =====
@@ -31,9 +31,8 @@ async def evaluate():
     ]
     # filter out combinations that have already been evaluated
     combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-    print(combis["model"].unique())
     # run evaluations
     results = [
         tasks[task_name](model, bcp_47, i)
@@ -51,7 +50,7 @@ async def evaluate():
             .reset_index()
         )
         # save results
-        results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)

 # ===== config =====
 n_sentences = 10
+n_languages = 10
+n_models = 10
 # ===== run evaluation and aggregate results =====
     ]
     # filter out combinations that have already been evaluated
     combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    # combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    # combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
     # run evaluations
     results = [
         tasks[task_name](model, bcp_47, i)
             .reset_index()
         )
         # save results
+        # results = pd.concat([old_results, results])
         results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
         results.to_json("results.json", **args)

evals/models.py CHANGED Viewed

@@ -12,6 +12,7 @@ from huggingface_hub import AsyncInferenceClient, HfApi
 from joblib.memory import Memory
 from openai import AsyncOpenAI
 from requests import HTTPError, get
 # for development purposes, all languages will be evaluated on the fast models
 # and only a sample of languages will be evaluated on all models
@@ -111,11 +112,17 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
 @cache
 async def complete(**kwargs):
     async with openrouter_rate_limit:
-        response = await client.chat.completions.create(**kwargs)
     if not response.choices:
         raise Exception(response)
-    return response
 @cache
 async def transcribe_elevenlabs(path, model):
@@ -199,12 +206,13 @@ def get_cost(row):
 @cache
 def load_models(date: date):
-    popular_models = (
-        get_historical_popular_models(date.today())[:15]
-        + get_current_popular_models(date.today())[:15]
-    )
-    popular_models = [m["slug"] for m in popular_models]
-    models = set(important_models + popular_models) - set(blocklist)
     models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)

 from joblib.memory import Memory
 from openai import AsyncOpenAI
 from requests import HTTPError, get
+from openai import PermissionDeniedError
 # for development purposes, all languages will be evaluated on the fast models
 # and only a sample of languages will be evaluated on all models
 @cache
 async def complete(**kwargs):
     async with openrouter_rate_limit:
+        try:
+            response = await client.chat.completions.create(**kwargs)
+        except PermissionDeniedError as e:
+            if e["error"]["metadata"]["reason"] in ["violence", "hate", "sexual", "self-harm", "harassment"]:
+                print(e)
+                return None
+            else:
+                raise e
     if not response.choices:
         raise Exception(response)
+    return response.choices[0].message.content.strip()
 @cache
 async def transcribe_elevenlabs(path, model):
 @cache
 def load_models(date: date):
+    # popular_models = (
+    #     get_historical_popular_models(date.today())[:15]
+    #     + get_current_popular_models(date.today())[:15]
+    # )
+    # popular_models = [m["slug"] for m in popular_models]
+    # models = set(important_models + popular_models) - set(blocklist)
+    models = set(important_models) - set(blocklist)
     models = pd.DataFrame(sorted(list(models)), columns=["id"])
     or_metadata = models["id"].apply(get_or_metadata)
     hf_metadata = or_metadata.apply(get_hf_metadata)

evals/tasks.py CHANGED Viewed

@@ -30,12 +30,12 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
             pass
         case "to":
             original_language, target_language = target_language, original_language
-    if not flores_sentences(original_language) or not flores_sentences(target_language):
         return []
-    original_sentence = flores_sentences(original_language)[sentence_nr].strip()
-    target_sentence = flores_sentences(target_language)[sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    reply = await complete(
         model=model,
         messages=[
             {
@@ -46,8 +46,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
         temperature=0,
         max_tokens=1024,
     )
-    prediction = reply.choices[0].message.content.strip()
-    if prediction.strip():
         bleu_score = bleu.compute(
             predictions=[prediction],
             references=[target_sentence],
@@ -71,21 +70,15 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
         )
     ]
-# metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = flores_sentences(language)
-    if not sentences:
         return []
-    sentences = pd.DataFrame(sentences, columns=["text"])
-    sentences = pd.concat([metadata, sentences], axis=1)
     sentences = sentences.dropna(subset=["topic"])
     sentences["topic"] = sentences["topic"].str.lower()
     paragraphs = (
-        sentences.groupby("URL").agg({"text": " ".join, "topic": "first"}).reset_index()
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
@@ -95,7 +88,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
             for t in top_topics
         ]
     ).sample(frac=1, random_state=nr)
-    test_paragraphs = paragraphs[~paragraphs["URL"].isin(examples["URL"])].sample(
         frac=1, random_state=42
     )
     test_paragraph = test_paragraphs.iloc[nr]
@@ -112,7 +105,7 @@ async def classify_and_evaluate(model, bcp_47, nr):
     # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
     # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
     try:
-        reply = await complete(
             model=model,
             messages=[
                 *messages,
@@ -124,12 +117,11 @@ async def classify_and_evaluate(model, bcp_47, nr):
             temperature=0,
             max_tokens=30,
         )
-        response = reply.choices[0].message.content.strip().lower()
         true = test_paragraph.topic
         others = [t for t in top_topics if t != true]
         acc = int(
-            response.startswith(true)
-            or (true in response and not any(o in response for o in others))
         )
     except Exception as e:
         if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
@@ -160,7 +152,7 @@ def corrupt_sentence(sentence):
 async def mlm_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     sentences = flores_sentences(language)
-    if not sentences:
         return []
     sentences = pd.DataFrame(sentences, columns=["text"])
     sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
@@ -175,7 +167,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
             {"role": "user", "content": example.corrupt_text},
             {"role": "assistant", "content": example.text},
         ]
-    reply = await complete(
         model=model,
         messages=[
             *messages,
@@ -187,7 +179,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
         temperature=0,
         max_tokens=1024,
     )
-    prediction = reply.choices[0].message.content.strip()
     chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
     return [
         {
@@ -224,13 +215,13 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
         ]
     messages += [{"role": "user", "content": format_item(task)}]
     try:
-        reply = await complete(
             model=model,
             messages=messages,
             temperature=0,
             max_tokens=1,
         )
-        acc = int(reply.choices[0].message.content[:1].strip() == task["answer"])
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
@@ -282,7 +273,7 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
 tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
-    # "classification": classify_and_evaluate,
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     # "asr": transcribe_and_evaluate,

             pass
         case "to":
             original_language, target_language = target_language, original_language
+    if flores_sentences(original_language) is None or flores_sentences(target_language) is None:
         return []
+    original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
+    target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
+    prediction = await complete(
         model=model,
         messages=[
             {
         temperature=0,
         max_tokens=1024,
     )
+    if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
             references=[target_sentence],
         )
     ]
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = flores_sentences(language)
+    if sentences is None:
         return []
     sentences = sentences.dropna(subset=["topic"])
     sentences["topic"] = sentences["topic"].str.lower()
     paragraphs = (
+        sentences.groupby("url").agg({"text": " ".join, "topic": "first"}).reset_index()
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
             for t in top_topics
         ]
     ).sample(frac=1, random_state=nr)
+    test_paragraphs = paragraphs[~paragraphs["url"].isin(examples["url"])].sample(
         frac=1, random_state=42
     )
     test_paragraph = test_paragraphs.iloc[nr]
     # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
     # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
     try:
+        pred = await complete(
             model=model,
             messages=[
                 *messages,
             temperature=0,
             max_tokens=30,
         )
         true = test_paragraph.topic
         others = [t for t in top_topics if t != true]
         acc = int(
+            pred.startswith(true)
+            or (true in pred and not any(o in pred for o in others))
         )
     except Exception as e:
         if "`inputs` tokens + `max_new_tokens` must be <= 4097" in str(e):
 async def mlm_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     sentences = flores_sentences(language)
+    if sentences is None:
         return []
     sentences = pd.DataFrame(sentences, columns=["text"])
     sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
             {"role": "user", "content": example.corrupt_text},
             {"role": "assistant", "content": example.text},
         ]
+    prediction = await complete(
         model=model,
         messages=[
             *messages,
         temperature=0,
         max_tokens=1024,
     )
     chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
     return [
         {
         ]
     messages += [{"role": "user", "content": format_item(task)}]
     try:
+        response = await complete(
             model=model,
             messages=messages,
             temperature=0,
             max_tokens=1,
         )
+        acc = int(response[:1].strip() == task["answer"])
     except Exception as e:
         if "ResponsibleAIPolicyViolation" in str(e):
             acc = 0
 tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
+    "classification": classify_and_evaluate,
     # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     # "asr": transcribe_and_evaluate,

languages.json CHANGED Viewed

@@ -485,7 +485,7 @@
     "language_name":"North Levantine Arabic",
     "autonym":"العامية",
     "family":"Afro-Asiatic",
-    "flores_path":"apc_Arab",
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "commonvoice_locale":null,
@@ -876,10 +876,10 @@
     "in_benchmark":true
   },
   {
-    "bcp_47":"mwr",
     "speakers":15913080,
-    "language_name":"Marwari",
-    "autonym":"Marwari",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -888,10 +888,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bgc",
     "speakers":15913080,
-    "language_name":"Haryanvi",
-    "autonym":"हरियाणवी",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -1073,7 +1073,7 @@
     "language_name":"Akan",
     "autonym":"Akan",
     "family":"Atlantic-Congo",
-    "flores_path":"twi_Latn",
     "fleurs_tag":null,
     "commonvoice_hours":0.2,
     "commonvoice_locale":"tw",
@@ -1171,7 +1171,7 @@
     "family":"Afro-Asiatic",
     "flores_path":"tir_Ethi",
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
     "commonvoice_locale":"ti",
     "in_benchmark":true
   },
@@ -1195,7 +1195,7 @@
     "family":"Atlantic-Congo",
     "flores_path":"lua_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":1.9,
     "commonvoice_locale":"lua",
     "in_benchmark":true
   },
@@ -1955,18 +1955,6 @@
     "commonvoice_locale":"gom",
     "in_benchmark":true
   },
-  {
-    "bcp_47":"kln",
-    "speakers":4068120,
-    "language_name":"Kalenjin",
-    "autonym":"Kalenjin",
-    "family":"Nilotic",
-    "flores_path":null,
-    "fleurs_tag":null,
-    "commonvoice_hours":43.0,
-    "commonvoice_locale":"kln",
-    "in_benchmark":false
-  },
   {
     "bcp_47":"kam",
     "speakers":4068120,
@@ -1979,6 +1967,18 @@
     "commonvoice_locale":"kam",
     "in_benchmark":true
   },
   {
     "bcp_47":"bjn",
     "speakers":4010288,
@@ -2124,10 +2124,10 @@
     "in_benchmark":true
   },
   {
-    "bcp_47":"gbm",
     "speakers":3580443,
-    "language_name":"Garhwali",
-    "autonym":"Garhwali",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -2136,10 +2136,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"lmn",
     "speakers":3580443,
-    "language_name":"Lambadi",
-    "autonym":"Lambadi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -2352,27 +2352,27 @@
     "in_benchmark":true
   },
   {
-    "bcp_47":"efi",
     "speakers":2996392,
-    "language_name":"Efik",
-    "autonym":"Efik",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
-    "bcp_47":"ibb",
     "speakers":2996392,
-    "language_name":"Ibibio",
-    "autonym":"Ibibio",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":11.0,
-    "commonvoice_locale":"ibb",
     "in_benchmark":false
   },
   {
@@ -2544,11 +2544,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"wbq",
     "speakers":2386962,
-    "language_name":"Waddar",
-    "autonym":"Waddar",
-    "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -2556,11 +2556,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"sck",
     "speakers":2386962,
-    "language_name":"Sadri",
-    "autonym":"Sadri",
-    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -2724,10 +2724,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"khn",
     "speakers":1989135,
-    "language_name":"Khandesi",
-    "autonym":"Khandesi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -2748,10 +2748,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"wbr",
     "speakers":1989135,
-    "language_name":"Wagdi",
-    "autonym":"Wagdi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -3535,7 +3535,7 @@
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
-    "commonvoice_hours":377.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
@@ -3559,7 +3559,7 @@
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":61.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
@@ -3684,10 +3684,10 @@
     "in_benchmark":true
   },
   {
-    "bcp_47":"ksb",
     "speakers":995398,
-    "language_name":"Shambala",
-    "autonym":"Kishambaa",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
@@ -3696,10 +3696,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bez",
     "speakers":995398,
-    "language_name":"Bena",
-    "autonym":"Hibena",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
@@ -4512,27 +4512,27 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"nhw",
     "speakers":501735,
-    "language_name":"Western Huasteca Nahuatl",
-    "autonym":"Western Huasteca Nahuatl",
     "family":"Uto-Aztecan",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
-    "bcp_47":"nhe",
     "speakers":501735,
-    "language_name":"Eastern Huasteca Nahuatl",
-    "autonym":"Eastern Huasteca Nahuatl",
     "family":"Uto-Aztecan",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
-    "commonvoice_locale":"nhe",
     "in_benchmark":false
   },
   {
@@ -4553,11 +4553,11 @@
     "language_name":"Kara-Kalpak",
     "autonym":"Kara-Kalpak",
     "family":"Turkic",
-    "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":0.0,
     "commonvoice_locale":"kaa",
-    "in_benchmark":false
   },
   {
     "bcp_47":"gju",
@@ -4715,18 +4715,6 @@
     "commonvoice_locale":null,
     "in_benchmark":false
   },
-  {
-    "bcp_47":"jmc",
-    "speakers":433291,
-    "language_name":"Machame",
-    "autonym":"Kimachame",
-    "family":"Atlantic-Congo",
-    "flores_path":null,
-    "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
-    "in_benchmark":false
-  },
   {
     "bcp_47":"vun",
     "speakers":433291,
@@ -4747,10 +4735,22 @@
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":1.2,
     "commonvoice_locale":"rof",
     "in_benchmark":false
   },
   {
     "bcp_47":"kjg",
     "speakers":431949,
@@ -5124,27 +5124,27 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bas",
     "speakers":332940,
-    "language_name":"Basaa",
-    "autonym":"Ɓàsàa",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":12.0,
-    "commonvoice_locale":"bas",
     "in_benchmark":false
   },
   {
-    "bcp_47":"bax",
     "speakers":332940,
-    "language_name":"Bamun",
-    "autonym":"Bamun",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":11.0,
-    "commonvoice_locale":"bax",
     "in_benchmark":false
   },
   {
@@ -5232,11 +5232,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bfq",
     "speakers":305001,
-    "language_name":"Badaga",
-    "autonym":"Badaga",
-    "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -5244,11 +5244,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"njo",
     "speakers":305001,
-    "language_name":"Ao Naga",
-    "autonym":"Ao Naga",
-    "family":"Sino-Tibetan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -5388,10 +5388,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"tdd",
     "speakers":264864,
-    "language_name":"Tai Nüa",
-    "autonym":"Tai Nüa",
     "family":"Tai-Kadai",
     "flores_path":null,
     "fleurs_tag":null,
@@ -5400,10 +5400,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"khb",
     "speakers":264864,
-    "language_name":"Lü",
-    "autonym":"Lü",
     "family":"Tai-Kadai",
     "flores_path":null,
     "fleurs_tag":null,
@@ -5508,10 +5508,10 @@
     "in_benchmark":true
   },
   {
-    "bcp_47":"sxn",
     "speakers":245664,
-    "language_name":"Sangir",
-    "autonym":"Sangir",
     "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
@@ -5520,10 +5520,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"mdr",
     "speakers":245664,
-    "language_name":"Mandar",
-    "autonym":"Mandar",
     "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
@@ -5904,10 +5904,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bss",
     "speakers":149823,
-    "language_name":"Akoose",
-    "autonym":"Akoose",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
@@ -5916,10 +5916,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"kkj",
     "speakers":149823,
-    "language_name":"Kako",
-    "autonym":"Kakɔ",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
@@ -6367,7 +6367,7 @@
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":3.1,
     "commonvoice_locale":"btv",
     "in_benchmark":false
   },
@@ -7272,11 +7272,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"bku",
     "speakers":7970,
-    "language_name":"Buhid",
-    "autonym":"Buhid",
-    "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7284,11 +7284,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"twq",
     "speakers":7970,
-    "language_name":"Tasawaq",
-    "autonym":"Tasawaq Senni",
-    "family":"Songhay",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7836,11 +7836,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"crl",
     "speakers":377,
-    "language_name":"Northern East Cree",
-    "autonym":"Northern East Cree",
-    "family":"Algic",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7848,11 +7848,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"kwk",
     "speakers":377,
-    "language_name":"Kwakʼwala",
-    "autonym":"KwakʼWala",
-    "family":"Wakashan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7968,11 +7968,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"sgs",
     "speakers":0,
-    "language_name":"Samogitian",
-    "autonym":"Samogitian",
-    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7980,11 +7980,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"rgn",
     "speakers":0,
-    "language_name":"Romagnol",
-    "autonym":"Romagnol",
-    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -7992,22 +7992,22 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"ann",
     "speakers":0,
-    "language_name":"Obolo",
-    "autonym":"Obolo",
-    "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
-    "bcp_47":"pfl",
     "speakers":0,
-    "language_name":"Palatine German",
-    "autonym":"Palatine German",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -8016,23 +8016,23 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"osa",
     "speakers":0,
-    "language_name":"Osage",
-    "autonym":"𐓏𐓘𐓻𐓘𐓻𐓟",
-    "family":"Siouan",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":null,
-    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
-    "bcp_47":"lzh",
     "speakers":0,
-    "language_name":"Literary Chinese",
-    "autonym":"Literary Chinese",
-    "family":"Sino-Tibetan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -8040,23 +8040,23 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"jbo",
     "speakers":0,
-    "language_name":"Lojban",
-    "autonym":"La .Lojban.",
-    "family":"Artificial Language",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.0,
-    "commonvoice_locale":"jbo",
     "in_benchmark":false
   },
   {
-    "bcp_47":"io",
     "speakers":0,
-    "language_name":"Ido",
-    "autonym":"Ido",
-    "family":"Artificial Language",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -8064,10 +8064,10 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"jut",
     "speakers":0,
-    "language_name":"Jutish",
-    "autonym":"Jutish",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
@@ -8076,11 +8076,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"gez",
     "speakers":0,
-    "language_name":"Geez",
-    "autonym":"Geez",
-    "family":"Afro-Asiatic",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -8088,11 +8088,11 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"cu",
     "speakers":0,
-    "language_name":"Church Slavic",
-    "autonym":"Church Slavic",
-    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
@@ -8112,23 +8112,23 @@
     "in_benchmark":false
   },
   {
-    "bcp_47":"vot",
     "speakers":0,
-    "language_name":"Votic",
-    "autonym":"Votic",
-    "family":"Uralic",
     "flores_path":null,
     "fleurs_tag":null,
-    "commonvoice_hours":0.1,
-    "commonvoice_locale":"vot",
     "in_benchmark":false
   },
   {
-    "bcp_47":"cad",
     "speakers":0,
-    "language_name":"Caddo",
-    "autonym":"Caddo",
-    "family":"Caddoan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,

     "language_name":"North Levantine Arabic",
     "autonym":"العامية",
     "family":"Afro-Asiatic",
+    "flores_path":"apc_Arab_nort3139",
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "commonvoice_locale":null,
     "in_benchmark":true
   },
   {
+    "bcp_47":"bgc",
     "speakers":15913080,
+    "language_name":"Haryanvi",
+    "autonym":"हरियाणवी",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"mwr",
     "speakers":15913080,
+    "language_name":"Marwari",
+    "autonym":"Marwari",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "language_name":"Akan",
     "autonym":"Akan",
     "family":"Atlantic-Congo",
+    "flores_path":"twi_Latn_akua1239",
     "fleurs_tag":null,
     "commonvoice_hours":0.2,
     "commonvoice_locale":"tw",
     "family":"Afro-Asiatic",
     "flores_path":"tir_Ethi",
     "fleurs_tag":null,
+    "commonvoice_hours":0.1,
     "commonvoice_locale":"ti",
     "in_benchmark":true
   },
     "family":"Atlantic-Congo",
     "flores_path":"lua_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":2.2,
     "commonvoice_locale":"lua",
     "in_benchmark":true
   },
     "commonvoice_locale":"gom",
     "in_benchmark":true
   },
   {
     "bcp_47":"kam",
     "speakers":4068120,
     "commonvoice_locale":"kam",
     "in_benchmark":true
   },
+  {
+    "bcp_47":"kln",
+    "speakers":4068120,
+    "language_name":"Kalenjin",
+    "autonym":"Kalenjin",
+    "family":"Nilotic",
+    "flores_path":null,
+    "fleurs_tag":null,
+    "commonvoice_hours":43.0,
+    "commonvoice_locale":"kln",
+    "in_benchmark":false
+  },
   {
     "bcp_47":"bjn",
     "speakers":4010288,
     "in_benchmark":true
   },
   {
+    "bcp_47":"lmn",
     "speakers":3580443,
+    "language_name":"Lambadi",
+    "autonym":"Lambadi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"gbm",
     "speakers":3580443,
+    "language_name":"Garhwali",
+    "autonym":"Garhwali",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":true
   },
   {
+    "bcp_47":"ibb",
     "speakers":2996392,
+    "language_name":"Ibibio",
+    "autonym":"Ibibio",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":11.0,
+    "commonvoice_locale":"ibb",
     "in_benchmark":false
   },
   {
+    "bcp_47":"efi",
     "speakers":2996392,
+    "language_name":"Efik",
+    "autonym":"Efik",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":null,
+    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
     "in_benchmark":false
   },
   {
+    "bcp_47":"sck",
     "speakers":2386962,
+    "language_name":"Sadri",
+    "autonym":"Sadri",
+    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"wbq",
     "speakers":2386962,
+    "language_name":"Waddar",
+    "autonym":"Waddar",
+    "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"wbr",
     "speakers":1989135,
+    "language_name":"Wagdi",
+    "autonym":"Wagdi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"khn",
     "speakers":1989135,
+    "language_name":"Khandesi",
+    "autonym":"Khandesi",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "family":null,
     "flores_path":"eus_Latn",
     "fleurs_tag":null,
+    "commonvoice_hours":379.0,
     "commonvoice_locale":"eu",
     "in_benchmark":true
   },
     "family":"Abkhaz-Adyge",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":62.0,
     "commonvoice_locale":"kbd",
     "in_benchmark":false
   },
     "in_benchmark":true
   },
   {
+    "bcp_47":"bez",
     "speakers":995398,
+    "language_name":"Bena",
+    "autonym":"Hibena",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"ksb",
     "speakers":995398,
+    "language_name":"Shambala",
+    "autonym":"Kishambaa",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"nhe",
     "speakers":501735,
+    "language_name":"Eastern Huasteca Nahuatl",
+    "autonym":"Eastern Huasteca Nahuatl",
     "family":"Uto-Aztecan",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"nhe",
     "in_benchmark":false
   },
   {
+    "bcp_47":"nhw",
     "speakers":501735,
+    "language_name":"Western Huasteca Nahuatl",
+    "autonym":"Western Huasteca Nahuatl",
     "family":"Uto-Aztecan",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":null,
+    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
     "language_name":"Kara-Kalpak",
     "autonym":"Kara-Kalpak",
     "family":"Turkic",
+    "flores_path":"kaa_Latn",
     "fleurs_tag":null,
     "commonvoice_hours":0.0,
     "commonvoice_locale":"kaa",
+    "in_benchmark":true
   },
   {
     "bcp_47":"gju",
     "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
     "bcp_47":"vun",
     "speakers":433291,
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":2.5,
     "commonvoice_locale":"rof",
     "in_benchmark":false
   },
+  {
+    "bcp_47":"jmc",
+    "speakers":433291,
+    "language_name":"Machame",
+    "autonym":"Kimachame",
+    "family":"Atlantic-Congo",
+    "flores_path":null,
+    "fleurs_tag":null,
+    "commonvoice_hours":null,
+    "commonvoice_locale":null,
+    "in_benchmark":false
+  },
   {
     "bcp_47":"kjg",
     "speakers":431949,
     "in_benchmark":false
   },
   {
+    "bcp_47":"bax",
     "speakers":332940,
+    "language_name":"Bamun",
+    "autonym":"Bamun",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":11.0,
+    "commonvoice_locale":"bax",
     "in_benchmark":false
   },
   {
+    "bcp_47":"bas",
     "speakers":332940,
+    "language_name":"Basaa",
+    "autonym":"Ɓàsàa",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":12.0,
+    "commonvoice_locale":"bas",
     "in_benchmark":false
   },
   {
     "in_benchmark":false
   },
   {
+    "bcp_47":"njo",
     "speakers":305001,
+    "language_name":"Ao Naga",
+    "autonym":"Ao Naga",
+    "family":"Sino-Tibetan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"bfq",
     "speakers":305001,
+    "language_name":"Badaga",
+    "autonym":"Badaga",
+    "family":"Dravidian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"khb",
     "speakers":264864,
+    "language_name":"Lü",
+    "autonym":"Lü",
     "family":"Tai-Kadai",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"tdd",
     "speakers":264864,
+    "language_name":"Tai Nüa",
+    "autonym":"Tai Nüa",
     "family":"Tai-Kadai",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":true
   },
   {
+    "bcp_47":"mdr",
     "speakers":245664,
+    "language_name":"Mandar",
+    "autonym":"Mandar",
     "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"sxn",
     "speakers":245664,
+    "language_name":"Sangir",
+    "autonym":"Sangir",
     "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"kkj",
     "speakers":149823,
+    "language_name":"Kako",
+    "autonym":"Kakɔ",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"bss",
     "speakers":149823,
+    "language_name":"Akoose",
+    "autonym":"Akoose",
     "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":4.6,
     "commonvoice_locale":"btv",
     "in_benchmark":false
   },
     "in_benchmark":false
   },
   {
+    "bcp_47":"twq",
     "speakers":7970,
+    "language_name":"Tasawaq",
+    "autonym":"Tasawaq Senni",
+    "family":"Songhay",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"bku",
     "speakers":7970,
+    "language_name":"Buhid",
+    "autonym":"Buhid",
+    "family":"Austronesian",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"kwk",
     "speakers":377,
+    "language_name":"Kwakʼwala",
+    "autonym":"KwakʼWala",
+    "family":"Wakashan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"crl",
     "speakers":377,
+    "language_name":"Northern East Cree",
+    "autonym":"Northern East Cree",
+    "family":"Algic",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"lzh",
     "speakers":0,
+    "language_name":"Literary Chinese",
+    "autonym":"Literary Chinese",
+    "family":"Sino-Tibetan",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"io",
     "speakers":0,
+    "language_name":"Ido",
+    "autonym":"Ido",
+    "family":"Artificial Language",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"jbo",
     "speakers":0,
+    "language_name":"Lojban",
+    "autonym":"La .Lojban.",
+    "family":"Artificial Language",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.0,
+    "commonvoice_locale":"jbo",
     "in_benchmark":false
   },
   {
+    "bcp_47":"jut",
     "speakers":0,
+    "language_name":"Jutish",
+    "autonym":"Jutish",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"vot",
     "speakers":0,
+    "language_name":"Votic",
+    "autonym":"Votic",
+    "family":"Uralic",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":0.1,
+    "commonvoice_locale":"vot",
     "in_benchmark":false
   },
   {
+    "bcp_47":"gez",
     "speakers":0,
+    "language_name":"Geez",
+    "autonym":"Geez",
+    "family":"Afro-Asiatic",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"osa",
     "speakers":0,
+    "language_name":"Osage",
+    "autonym":"𐓏𐓘𐓻𐓘𐓻𐓟",
+    "family":"Siouan",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":null,
+    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"rgn",
     "speakers":0,
+    "language_name":"Romagnol",
+    "autonym":"Romagnol",
+    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"cu",
     "speakers":0,
+    "language_name":"Church Slavic",
+    "autonym":"Church Slavic",
     "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"sgs",
     "speakers":0,
+    "language_name":"Samogitian",
+    "autonym":"Samogitian",
+    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"ann",
     "speakers":0,
+    "language_name":"Obolo",
+    "autonym":"Obolo",
+    "family":"Atlantic-Congo",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"cad",
     "speakers":0,
+    "language_name":"Caddo",
+    "autonym":"Caddo",
+    "family":"Caddoan",
     "flores_path":null,
     "fleurs_tag":null,
+    "commonvoice_hours":null,
+    "commonvoice_locale":null,
     "in_benchmark":false
   },
   {
+    "bcp_47":"pfl",
     "speakers":0,
+    "language_name":"Palatine German",
+    "autonym":"Palatine German",
+    "family":"Indo-European",
     "flores_path":null,
     "fleurs_tag":null,
     "commonvoice_hours":null,

models.json CHANGED Viewed

@@ -32,28 +32,6 @@
     "license":"Mit",
     "creation_date":1742774400000
   },
-  {
-    "id":"deepseek\/deepseek-r1",
-    "name":"R1 (free)",
-    "provider_name":"DeepSeek",
-    "cost":0.0,
-    "hf_id":"deepseek-ai\/DeepSeek-R1",
-    "size":684531386000.0,
-    "type":"Open",
-    "license":"Mit",
-    "creation_date":1737331200000
-  },
-  {
-    "id":"google\/gemini-2.0-flash-001",
-    "name":"Gemini 2.0 Flash",
-    "provider_name":"Google",
-    "cost":0.4,
-    "hf_id":null,
-    "size":null,
-    "type":"Commercial",
-    "license":null,
-    "creation_date":1738713600000
-  },
   {
     "id":"google\/gemini-2.0-flash-lite-001",
     "name":"Gemini 2.0 Flash Lite",
@@ -76,28 +54,6 @@
     "license":null,
     "creation_date":1744848000000
   },
-  {
-    "id":"google\/gemini-flash-1.5",
-    "name":"Gemini 1.5 Flash ",
-    "provider_name":"Google",
-    "cost":0.3,
-    "hf_id":null,
-    "size":null,
-    "type":"Commercial",
-    "license":null,
-    "creation_date":1715644800000
-  },
-  {
-    "id":"google\/gemini-flash-1.5-8b",
-    "name":"Gemini 1.5 Flash 8B",
-    "provider_name":"Google",
-    "cost":0.15,
-    "hf_id":null,
-    "size":null,
-    "type":"Commercial",
-    "license":null,
-    "creation_date":1727913600000
-  },
   {
     "id":"google\/gemma-3-27b-it",
     "name":"Gemma 3 27B (free)",
@@ -109,17 +65,6 @@
     "license":"Gemma",
     "creation_date":1740787200000
   },
-  {
-    "id":"gryphe\/mythomax-l2-13b",
-    "name":"MythoMax 13B",
-    "provider_name":"MythoMax 13B",
-    "cost":0.07,
-    "hf_id":"Gryphe\/MythoMax-L2-13b",
-    "size":null,
-    "type":"Open",
-    "license":"Other",
-    "creation_date":1691625600000
-  },
   {
     "id":"meta-llama\/llama-3-70b-instruct",
     "name":"Llama 3 70B Instruct",

     "license":"Mit",
     "creation_date":1742774400000
   },
   {
     "id":"google\/gemini-2.0-flash-lite-001",
     "name":"Gemini 2.0 Flash Lite",
     "license":null,
     "creation_date":1744848000000
   },
   {
     "id":"google\/gemma-3-27b-it",
     "name":"Gemma 3 27B (free)",
     "license":"Gemma",
     "creation_date":1740787200000
   },
   {
     "id":"meta-llama\/llama-3-70b-instruct",
     "name":"Llama 3 70B Instruct",

results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff