# David Pomerenke
# Refactor eval code into files
# da6e1bc
# raw
# history blame
# 1.11 kB
import re
from datetime import date
import pandas as pd
from joblib.memory import Memory
from langcodes import standardize_tag
from requests import get
# Disk-backed memoization decorator (joblib); results are stored under ".cache".
cache = Memory(location=".cache", verbose=0).cache
# load CommonVoice stats
@cache  # cache for 1 day
def get_commonvoice_stats(date: date):
    """Fetch per-language statistics from the Mozilla CommonVoice API.

    Args:
        date: Not used in the request itself; it only takes part in the
            cache key, so passing today's date makes a cached response
            stop being reused once the day changes.

    Returns:
        The decoded JSON body of the stats endpoint.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
    """
    # Bound the wait and reject error responses, so that a failed request is
    # never decoded (and then persisted by the cache) as if it were valid data.
    response = get(
        "https://commonvoice.mozilla.org/api/v1/stats/languages", timeout=30
    )
    response.raise_for_status()
    return response.json()
# Fetch today's CommonVoice statistics and keep only locale + validated hours.
commonvoice = pd.DataFrame(get_commonvoice_stats(date.today()))
commonvoice = commonvoice.rename(
    columns={"locale": "commonvoice_locale", "validatedHours": "commonvoice_hours"}
)
commonvoice = commonvoice[["commonvoice_locale", "commonvoice_hours"]]


def _locale_to_bcp_47(locale):
    """Map a CommonVoice locale to a country-less, standardized BCP-47 tag."""
    # Ignore the country (language is language); in practice this is only
    # relevant to zh-CN / zh-TW / zh-HK.
    language_only = re.sub(r"-[A-Z]{2}$", "", locale)
    # NOTE: macro=True does not really seem to yield macrolanguages,
    # e.g. not for Quechua.
    return standardize_tag(language_only, macro=True)


commonvoice["bcp_47"] = commonvoice["commonvoice_locale"].apply(_locale_to_bcp_47)

# Collapse locales that share a tag: total the hours, keep the first locale seen.
commonvoice = (
    commonvoice.groupby("bcp_47")
    .agg(
        commonvoice_hours=("commonvoice_hours", "sum"),
        commonvoice_locale=("commonvoice_locale", "first"),
    )
    .reset_index()
)