Spaces:

fair-forward
/

evals-for-every-language

Running

File size: 1,898 Bytes

import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from joblib.memory import Memory
import pandas as pd
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
from language_data.util import data_filename

# Decorator that memoizes function results on disk (joblib) under ./.cache;
# verbose=0 silences joblib's cache logging.
cache = Memory(location=".cache", verbose=0).cache

@cache
def get_population_data():
    """Return the population of every territory listed in supplementalData.xml.

    The file is located via ``data_filename("supplementalData.xml")`` and each
    ``territoryInfo/territory`` element contributes one entry.

    Returns:
        dict: maps each territory's ``type`` attribute to its ``population``
        attribute converted to ``float``.

    Raises:
        KeyError: if a territory element lacks ``type`` or ``population``.
        ValueError: if a ``population`` attribute is not a valid number.
    """
    filename = data_filename("supplementalData.xml")
    # ET.parse opens and closes the file itself (no leaked handle) and decodes
    # it according to the XML declaration rather than the locale default.
    root = ET.parse(filename).getroot()
    return {
        territory.attrib["type"]: float(territory.attrib["population"])
        for territory in root.findall("./territoryInfo/territory")
    }


def population(bcp_47, *, populations=None):
    """Return the speaker population of a language broken down by region.

    Looks for entries whose tag is exactly ``<bcp_47>-<REGION>``, where
    ``REGION`` is a two-letter uppercase code, and keys the result by that
    region code alone.

    Args:
        bcp_47: Language tag to look up, e.g. ``"en"`` or ``"sr-Latn"``.
        populations: Optional mapping of tag -> speaker count to search.
            Defaults to ``LANGUAGE_SPEAKING_POPULATION``.

    Returns:
        dict: maps region code (e.g. ``"US"``) to the speaker count recorded
        for ``<bcp_47>-<REGION>``. Empty if no entry matches.
    """
    if populations is None:
        populations = LANGUAGE_SPEAKING_POPULATION
    # Escape the tag so it is matched literally, and capture the region
    # directly: stripping only the first lowercase subtag would leave a script
    # subtag in the key (e.g. "Latn-RS" instead of "RS" for "sr-Latn").
    pattern = re.compile(rf"{re.escape(bcp_47)}-([A-Z]{{2}})")
    by_region = {}
    for tag, speakers in populations.items():
        found = pattern.fullmatch(tag)
        if found:
            by_region[found.group(1)] = speakers
    return by_region

@cache
def make_country_table(language_table):
    """Aggregate per-language scores into one row per country.

    For every row of ``language_table`` the regional speaker counts are
    obtained from ``population(row.bcp_47)``; each country then receives the
    speaker-weighted mean of the scores of the languages spoken there.

    Args:
        language_table: DataFrame with ``bcp_47``, ``language_name`` and
            ``average`` columns (read via ``itertuples``).

    Returns:
        pandas.DataFrame with columns ``iso2`` (region code), ``score``
        (speaker-weighted mean score) and ``languages`` (list of dicts with
        ``name``, ``bcp_47``, ``population`` and ``score``). The columns are
        present even when no country is found.
    """
    languages_by_country = defaultdict(list)
    for lang in language_table.itertuples():
        # A language without an average score contributes a score of 0.
        lang_score = 0 if pd.isna(lang.average) else lang.average
        for country, speaker_pop in population(lang.bcp_47).items():
            languages_by_country[country].append(
                {
                    "name": lang.language_name,
                    "bcp_47": lang.bcp_47,
                    "population": speaker_pop,
                    "score": lang_score,
                }
            )

    rows = []
    for country, languages in languages_by_country.items():
        total_speakers = sum(entry["population"] for entry in languages)
        weighted_sum = sum(
            entry["score"] * entry["population"] for entry in languages
        )
        # Guard against a country whose listed speaker counts sum to zero,
        # which would otherwise raise ZeroDivisionError.
        score = weighted_sum / total_speakers if total_speakers else 0.0
        rows.append({"iso2": country, "score": score, "languages": languages})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["iso2", "score", "languages"])