File size: 1,898 Bytes
723f963 566c57e 2c21cf7 723f963 566c57e 723f963 566c57e 723f963 566c57e 723f963 92b2164 723f963 92b2164 2c21cf7 723f963 92b2164 723f963 2c21cf7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
from joblib.memory import Memory
import pandas as pd
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
from language_data.util import data_filename
# Memoization decorator backed by a joblib Memory store located at ".cache"
# (verbose=0). Applied below to the data-loading / table-building functions.
cache = Memory(location=".cache", verbose=0).cache
@cache
def get_population_data():
    """Return the population recorded for every territory in the supplemental data.

    Parses the ``supplementalData.xml`` file located via ``data_filename`` and
    reads each ``territoryInfo/territory`` element.

    Returns:
        dict: maps each territory's ``type`` attribute to its ``population``
        attribute converted to ``float``. If a ``type`` occurs more than once,
        the last occurrence wins.

    Raises:
        KeyError: if a territory element lacks ``type`` or ``population``.
        ValueError: if a ``population`` attribute is not a valid number.
    """
    filename = data_filename("supplementalData.xml")
    # ET.parse opens and closes the file itself and honours the encoding
    # declared by the XML document, instead of leaking an open handle and
    # decoding with the platform default as open(...).read() would.
    root = ET.parse(filename).getroot()
    return {
        territory.attrib["type"]: float(territory.attrib["population"])
        for territory in root.findall("./territoryInfo/territory")
    }
def population(bcp_47):
    """Return the per-region entries of LANGUAGE_SPEAKING_POPULATION for a tag.

    Selects every key of ``LANGUAGE_SPEAKING_POPULATION`` that has the form
    ``<bcp_47>-<REGION>``, where ``REGION`` is exactly two uppercase ASCII
    letters.

    Args:
        bcp_47: language tag to look up; matched literally.

    Returns:
        dict: maps the two-letter region code to the value stored for the
        matching key. Empty when no key matches.
    """
    # Escape the tag so it is matched literally, and capture the region so
    # that a multi-part tag ("xx-Yyyy") yields "ZZ" rather than "Yyyy-ZZ".
    pattern = re.compile(rf"{re.escape(bcp_47)}-([A-Z]{{2}})")
    regions = {}
    for lang, pop in LANGUAGE_SPEAKING_POPULATION.items():
        match = pattern.fullmatch(lang)
        if match:
            regions[match.group(1)] = pop
    return regions
@cache
def make_country_table(language_table):
    """Aggregate per-language scores into one population-weighted score per country.

    For every row of ``language_table`` the regions returned by
    ``population(row.bcp_47)`` are looked up, and the language is recorded
    under each of those regions together with its population value there.
    Each country's score is the average of its languages' scores weighted by
    those population values.

    Args:
        language_table: pandas DataFrame whose rows expose ``bcp_47``,
            ``language_name`` and ``average``. A missing (NaN) ``average``
            counts as a score of 0.

    Returns:
        pandas.DataFrame: one row per country with the columns ``iso2`` (the
        region code), ``score`` (the weighted average) and ``languages`` (list
        of dicts with keys ``name``, ``bcp_47``, ``population``, ``score``).
        The columns are present even when there are no rows.
    """
    by_country = defaultdict(list)
    for lang in language_table.itertuples():
        regions = population(lang.bcp_47)
        if not regions:
            continue
        lang_score = 0 if pd.isna(lang.average) else lang.average
        for country, speaker_pop in regions.items():
            by_country[country].append(
                {
                    "name": lang.language_name,
                    "bcp_47": lang.bcp_47,
                    "population": speaker_pop,
                    "score": lang_score,
                }
            )

    # Build the output rows in a separate list rather than rebinding the
    # values of the dict that is being iterated.
    rows = []
    for country, languages in by_country.items():
        total_pop = sum(entry["population"] for entry in languages)
        weighted_sum = sum(
            entry["score"] * entry["population"] for entry in languages
        )
        # A country whose recorded populations sum to zero would otherwise
        # raise ZeroDivisionError; score it 0 like a missing average.
        score = weighted_sum / total_pop if total_pop else 0
        rows.append({"iso2": country, "score": score, "languages": languages})

    # Explicit columns keep the schema stable when there are no rows.
    return pd.DataFrame(rows, columns=["iso2", "score", "languages"])
|