David Pomerenke
Speed things up
566c57e
raw
history blame
1.9 kB
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from joblib.memory import Memory
import pandas as pd
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
from language_data.util import data_filename
# Memoization decorator provided by joblib's Memory; cached results are stored
# under the ".cache" directory, and verbose=0 is passed to keep joblib quiet.
cache = Memory(location=".cache", verbose=0).cache
@cache
def get_population_data():
    """Load the population of every territory listed in supplementalData.xml.

    The file is located via ``data_filename`` and every
    ``territoryInfo/territory`` element below the document root is read.

    Returns:
        dict: Maps each territory's ``type`` attribute (str) to its
        ``population`` attribute converted to ``float``. If a type occurs
        more than once, the last occurrence wins.

    Raises:
        KeyError: If a territory element lacks ``type`` or ``population``.
        ValueError: If a ``population`` attribute is not a valid number.
    """
    filename = data_filename("supplementalData.xml")
    # ET.parse opens the file itself, honours the encoding declared in the
    # XML document and closes the handle when done; the previous bare
    # open(...).read() left the handle open and used the locale's encoding.
    root = ET.parse(filename).getroot()
    return {
        territory.attrib["type"]: float(territory.attrib["population"])
        for territory in root.findall("./territoryInfo/territory")
    }
def population(bcp_47, populations=None):
    """Return the speaker population of a language, broken down by region.

    Selects every key of the form ``"<bcp_47>-<REGION>"``, where ``<REGION>``
    is exactly two uppercase ASCII letters, and maps the region to the
    associated population.

    Args:
        bcp_47: Language tag to look up, e.g. ``"en"`` or ``"zh-Hans"``. It is
            matched literally, never interpreted as a regular expression.
        populations: Optional mapping from tags to population figures. When
            omitted, ``LANGUAGE_SPEAKING_POPULATION`` is used.

    Returns:
        dict: Maps region code (str) to the population stored for
        ``"<bcp_47>-<REGION>"``. Empty if no such key exists.
    """
    if populations is None:
        populations = LANGUAGE_SPEAKING_POPULATION
    # re.escape keeps the tag literal; the capture group extracts only the
    # region, so tags with extra subtags ("zh-Hans-CN") yield "CN" instead of
    # "Hans-CN" as the old "strip the first lowercase subtag" approach did.
    pattern = re.compile(rf"{re.escape(bcp_47)}-([A-Z]{{2}})")
    regions = {}
    for tag, speakers in populations.items():
        match = pattern.fullmatch(tag)
        if match:
            regions[match.group(1)] = speakers
    return regions
@cache
def make_country_table(language_table):
    """Aggregate per-language scores into one row per country.

    Each row of ``language_table`` is expanded into the regions returned by
    ``population(row.bcp_47)``. For every region the score is the average of
    its languages' scores weighted by their speaker populations.

    Args:
        language_table: DataFrame whose rows expose ``language_name``,
            ``bcp_47`` and ``average`` (read via ``itertuples``).

    Returns:
        pandas.DataFrame: One row per country with the columns ``iso2``
        (region code), ``score`` (population-weighted average) and
        ``languages`` (list of dicts with the keys ``name``, ``bcp_47``,
        ``population`` and ``score``). The columns are present even when
        there are no rows.
    """
    languages_by_country = defaultdict(list)
    for lang in language_table.itertuples():
        # A language without a score contributes 0 to the weighted average.
        lang_score = 0 if pd.isna(lang.average) else lang.average
        for country, speaker_pop in population(lang.bcp_47).items():
            languages_by_country[country].append(
                {
                    "name": lang.language_name,
                    "bcp_47": lang.bcp_47,
                    "population": speaker_pop,
                    "score": lang_score,
                }
            )

    rows = []
    for country, languages in languages_by_country.items():
        total_speakers = sum(entry["population"] for entry in languages)
        weighted_sum = sum(
            entry["score"] * entry["population"] for entry in languages
        )
        # Guard against a country whose listed populations sum to zero, which
        # would otherwise raise ZeroDivisionError.
        score = weighted_sum / total_speakers if total_speakers else 0
        rows.append({"iso2": country, "score": score, "languages": languages})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["iso2", "score", "languages"])