David Pomerenke committed on
Commit
0c05388
·
1 Parent(s): 7f54946

Use real population data in map

Browse files
Files changed (1) hide show
  1. app.py +18 -10
app.py CHANGED
@@ -251,9 +251,24 @@ def format_number(n):
251
  return f"{n/1_000:.0f}K"
252
  return str(n)
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  def create_world_map(results):
256
  # Collect all country data
 
257
  country_data = {}
258
  for lang in results:
259
  if "population" not in lang or lang["bleu"] is None:
@@ -270,6 +285,7 @@ def create_world_map(results):
270
  if iso3_code not in country_data:
271
  country_data[iso3_code] = {
272
  "total_speakers": 0,
 
273
  "weighted_bleu_sum": 0,
274
  "languages": [],
275
  }
@@ -324,7 +340,6 @@ def create_world_map(results):
324
 
325
  # Sort languages by number of speakers
326
  langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
327
- total_speakers = sum(lang["speakers"] for lang in langs)
328
 
329
  # Take top 5 languages and summarize the rest
330
  main_langs = langs[:5]
@@ -333,7 +348,7 @@ def create_world_map(results):
333
  # Create language rows with bars
334
  lang_rows = []
335
  for lang in main_langs:
336
- percentage = (lang["speakers"] / total_speakers) * 100
337
  speaker_bar = make_black_bar(percentage / 100)
338
  bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
339
 
@@ -346,7 +361,7 @@ def create_world_map(results):
346
  # Add summary for other languages if any
347
  if other_langs:
348
  other_speakers = sum(lang["speakers"] for lang in other_langs)
349
- other_percentage = (other_speakers / total_speakers) * 100
350
  other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
351
  other_langs
352
  )
@@ -360,15 +375,8 @@ def create_world_map(results):
360
  f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
361
  )
362
 
363
- # Create overall BLEU visualization
364
- bleu_percentage = (weighted_avg - 0.2) / 0.2 # Scale from 0.2-0.4 to 0-1
365
- overall_bleu_bar = make_colored_bar(bleu_percentage)
366
-
367
  hover_text = (
368
  f"<b>{country_name}</b><br><br>"
369
- f"{format_number(data['total_speakers'])} speakers*<br>"
370
- f"{overall_bleu_bar} {weighted_avg:.3f} BLEU<br><br>"
371
- f"<b>Languages:</b><br><br>"
372
  f"{'<br>'.join(lang_rows)}"
373
  )
374
 
 
251
  return f"{n/1_000:.0f}K"
252
  return str(n)
253
 
254
+ def get_population_data():
255
+ import xml.etree.ElementTree as ET
256
+ from language_data.util import data_filename
257
+
258
+ filename = data_filename("supplementalData.xml")
259
+ root = ET.fromstring(open(filename).read())
260
+ territories = root.findall("./territoryInfo/territory")
261
+
262
+ data = {}
263
+ for territory in territories:
264
+ t_code = territory.attrib['type']
265
+ t_population = float(territory.attrib['population'])
266
+ data[t_code] = t_population
267
+ return data
268
 
269
  def create_world_map(results):
270
  # Collect all country data
271
+ population_data = get_population_data()
272
  country_data = {}
273
  for lang in results:
274
  if "population" not in lang or lang["bleu"] is None:
 
285
  if iso3_code not in country_data:
286
  country_data[iso3_code] = {
287
  "total_speakers": 0,
288
+ "population": population_data.get(country_code, 0),
289
  "weighted_bleu_sum": 0,
290
  "languages": [],
291
  }
 
340
 
341
  # Sort languages by number of speakers
342
  langs = sorted(data["languages"], key=lambda x: x["speakers"], reverse=True)
 
343
 
344
  # Take top 5 languages and summarize the rest
345
  main_langs = langs[:5]
 
348
  # Create language rows with bars
349
  lang_rows = []
350
  for lang in main_langs:
351
+ percentage = (lang["speakers"] / data["population"]) * 100
352
  speaker_bar = make_black_bar(percentage / 100)
353
  bleu_bar = make_colored_bar((lang["bleu"] - 0.2) / 0.2)
354
 
 
361
  # Add summary for other languages if any
362
  if other_langs:
363
  other_speakers = sum(lang["speakers"] for lang in other_langs)
364
+ other_percentage = (other_speakers / data["population"]) * 100
365
  other_avg_bleu = sum(lang["bleu"] for lang in other_langs) / len(
366
  other_langs
367
  )
 
375
  f"{bleu_bar} {other_avg_bleu:.3f} BLEU<br>"
376
  )
377
 
 
 
 
 
378
  hover_text = (
379
  f"<b>{country_name}</b><br><br>"
 
 
 
380
  f"{'<br>'.join(lang_rows)}"
381
  )
382