David Pomerenke
committed on
Commit
·
0c05388
1
Parent(s):
7f54946
Use real population data in map
Browse files
app.py
CHANGED
@@ -251,9 +251,24 @@ def format_number(n):
|
|
251 |
return f"{n/1_000:.0f}K"
|
252 |
return str(n)
|
253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
def create_world_map(results):
|
256 |
# Collect all country data
|
|
|
257 |
country_data = {}
|
258 |
for lang in results:
|
259 |
if "population" not in lang or lang["bleu"] is None:
|
@@ -270,6 +285,7 @@ def create_world_map(results):
|
|
270 |
if iso3_code not in country_data:
|
271 |
country_data[iso3_code] = {
|
272 |
"total_speakers": 0,
|
|
|
273 |
"weighted_bleu_sum": 0,
|
274 |
"languages": [],
|
275 |
}
|
@@ -324,7 +340,6 @@ def create_world_map(results):
|
|
324 |
|
325 |
# Sort languages by number of speakers
|
326 |
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
|
327 |
-
total_speakers = sum(lang["speakers"] for lang in langs)
|
328 |
|
329 |
# Take top 5 languages and summarize the rest
|
330 |
main_langs = langs[:5]
|
@@ -333,7 +348,7 @@ def create_world_map(results):
|
|
333 |
# Create language rows with bars
|
334 |
lang_rows = []
|
335 |
for lang in main_langs:
|
336 |
-
percentage = (lang["speakers"] /
|
337 |
speaker_bar = make_black_bar(percentage / 100)
|
338 |
bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
|
339 |
|
@@ -346,7 +361,7 @@ def create_world_map(results):
|
|
346 |
# Add summary for other languages if any
|
347 |
if other_langs:
|
348 |
other_speakers = sum(lang["speakers"] for lang in other_langs)
|
349 |
-
other_percentage = (other_speakers /
|
350 |
other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
|
351 |
other_langs
|
352 |
)
|
@@ -360,15 +375,8 @@ def create_world_map(results):
|
|
360 |
f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
|
361 |
)
|
362 |
|
363 |
-
# Create overall BLEU visualization
|
364 |
-
bleu_percentage = (weighted_avg - 0.2) / 0.2 # Scale from 0.2-0.4 to 0-1
|
365 |
-
overall_bleu_bar = make_colored_bar(bleu_percentage)
|
366 |
-
|
367 |
hover_text = (
|
368 |
f"<b>{country_name}</b><br><br>"
|
369 |
-
f"{format_number(data['total_speakers'])} speakers*<br>"
|
370 |
-
f"{overall_bleu_bar} {weighted_avg:.3f} BLEU<br><br>"
|
371 |
-
f"<b>Languages:</b><br><br>"
|
372 |
f"{'<br>'.join(lang_rows)}"
|
373 |
)
|
374 |
|
|
|
251 |
return f"{n/1_000:.0f}K"
|
252 |
return str(n)
|
253 |
|
254 |
+
def get_population_data():
    """Load per-territory population figures from ``supplementalData.xml``.

    The file is located through ``language_data.util.data_filename`` and every
    ``territoryInfo/territory`` element in it is read.

    Returns:
        dict: maps each territory's ``type`` attribute (a string code) to its
        ``population`` attribute converted to ``float``.

    Raises:
        KeyError: if a territory element lacks a ``type`` or ``population``
            attribute.
        ValueError: if a ``population`` attribute is not a valid number.
    """
    # Imports stay function-local, as in the original, so the rest of the
    # module does not need these names at import time.
    import xml.etree.ElementTree as ET

    from language_data.util import data_filename

    filename = data_filename("supplementalData.xml")
    # ET.parse opens and closes the file itself and lets the XML parser pick
    # the encoding from the document, instead of leaking an open handle and
    # decoding with the platform's default text encoding.
    root = ET.parse(filename).getroot()
    return {
        territory.attrib["type"]: float(territory.attrib["population"])
        for territory in root.findall("./territoryInfo/territory")
    }
|
268 |
|
269 |
def create_world_map(results):
|
270 |
# Collect all country data
|
271 |
+
population_data = get_population_data()
|
272 |
country_data = {}
|
273 |
for lang in results:
|
274 |
if "population" not in lang or lang["bleu"] is None:
|
|
|
285 |
if iso3_code not in country_data:
|
286 |
country_data[iso3_code] = {
|
287 |
"total_speakers": 0,
|
288 |
+
"population": population_data.get(country_code, 0),
|
289 |
"weighted_bleu_sum": 0,
|
290 |
"languages": [],
|
291 |
}
|
|
|
340 |
|
341 |
# Sort languages by number of speakers
|
342 |
langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
|
|
|
343 |
|
344 |
# Take top 5 languages and summarize the rest
|
345 |
main_langs = langs[:5]
|
|
|
348 |
# Create language rows with bars
|
349 |
lang_rows = []
|
350 |
for lang in main_langs:
|
351 |
+
percentage = (lang["speakers"] / data["population"]) * 100
|
352 |
speaker_bar = make_black_bar(percentage / 100)
|
353 |
bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
|
354 |
|
|
|
361 |
# Add summary for other languages if any
|
362 |
if other_langs:
|
363 |
other_speakers = sum(lang["speakers"] for lang in other_langs)
|
364 |
+
other_percentage = (other_speakers / data["population"]) * 100
|
365 |
other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
|
366 |
other_langs
|
367 |
)
|
|
|
375 |
f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
|
376 |
)
|
377 |
|
|
|
|
|
|
|
|
|
378 |
hover_text = (
|
379 |
f"<b>{country_name}</b><br><br>"
|
|
|
|
|
|
|
380 |
f"{'<br>'.join(lang_rows)}"
|
381 |
)
|
382 |
|