Spaces:
Running
Running
kovacsvi
committed on
Commit
·
af68a82
1
Parent(s):
ecdbfcf
flat min media pred
Browse files- interfaces/cap_minor_media.py +53 -8
- label_dicts.py +221 -0
- utils.py +4 -0
interfaces/cap_minor_media.py
CHANGED
@@ -12,7 +12,8 @@ from huggingface_hub import HfApi
|
|
12 |
from collections import defaultdict
|
13 |
|
14 |
from label_dicts import (CAP_MEDIA_NUM_DICT, CAP_MEDIA_LABEL_NAMES,
|
15 |
-
CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES
|
|
|
16 |
|
17 |
from .utils import is_disk_full
|
18 |
|
@@ -54,8 +55,11 @@ def check_huggingface_path(checkpoint_path: str):
|
|
54 |
except:
|
55 |
return False
|
56 |
|
57 |
-
def build_huggingface_path(language: str, domain: str):
|
58 |
-
|
|
|
|
|
|
|
59 |
|
60 |
#@spaces.GPU(duration=30)
|
61 |
def predict(text, major_model_id, minor_model_id, tokenizer_id, HF_TOKEN=None):
|
@@ -141,16 +145,57 @@ def predict(text, major_model_id, minor_model_id, tokenizer_id, HF_TOKEN=None):
|
|
141 |
|
142 |
return interpretation_info, output_pred, output_info
|
143 |
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
149 |
if is_disk_full():
|
150 |
os.system('rm -rf /data/models*')
|
151 |
os.system('rm -r ~/.cache/huggingface/hub')
|
152 |
|
153 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
154 |
|
155 |
description = """
|
156 |
You can choose between two approaches for making predictions:
|
|
|
12 |
from collections import defaultdict
|
13 |
|
14 |
from label_dicts import (CAP_MEDIA_NUM_DICT, CAP_MEDIA_LABEL_NAMES,
|
15 |
+
CAP_MIN_NUM_DICT, CAP_MIN_LABEL_NAMES,
|
16 |
+
CAP_MIN_MEDIA_NUM_DICT)
|
17 |
|
18 |
from .utils import is_disk_full
|
19 |
|
|
|
55 |
except:
|
56 |
return False
|
57 |
|
58 |
+
def build_huggingface_path(language: str, domain: str, hierarchical: bool):
    """Return the HuggingFace model id(s) for CAP minor+media classification.

    hierarchical=True  -> a (major_model_id, minor_model_id) pair for the
                          two-step (media, then minor topic) pipeline.
    hierarchical=False -> the single flat media+minor model id.

    ``language`` and ``domain`` are currently unused; they are kept so the
    signature stays parallel with the other interfaces' helpers.
    """
    if not hierarchical:
        return "poltextlab/xlm-roberta-large-pooled-cap-media-minor"
    return (
        "poltextlab/xlm-roberta-large-pooled-cap-media",
        "poltextlab/xlm-roberta-large-pooled-cap-minor-v3",
    )
|
63 |
|
64 |
#@spaces.GPU(duration=30)
|
65 |
def predict(text, major_model_id, minor_model_id, tokenizer_id, HF_TOKEN=None):
|
|
|
145 |
|
146 |
return interpretation_info, output_pred, output_info
|
147 |
|
148 |
+
|
149 |
+
def predict_flat(text, model_id, tokenizer_id, HF_TOKEN=None):
    """Run the single flat media+minor model on ``text``.

    Returns the same (interpretation_info, output_pred, output_info) triple
    as the hierarchical predict(), so both paths feed the same UI outputs.
    output_pred maps "[code] label" strings to softmax probabilities for the
    ten most confident labels.
    """
    device = torch.device("cpu")
    # NOTE(review): device_map="auto" followed by .to(cpu) mirrors the
    # hierarchical path; confirm accelerate never offloads weights here.
    model = AutoModelForSequenceClassification.from_pretrained(model_id, low_cpu_mem_usage=True, device_map="auto", offload_folder="offload", token=HF_TOKEN).to(device)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)

    encoded = tokenizer(
        text,
        max_length=256,
        truncation=True,
        padding="do_not_pad",
        return_tensors="pt",
    ).to(device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    probabilities = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
    # Indices of the ten most confident labels, highest probability first.
    ranked = np.argsort(probabilities)[::-1][:10]

    # Media labels and minor-topic labels live in disjoint dicts; merge them
    # so a single lookup covers every code the flat model can emit.
    CAP_MIN_MEDIA_LABEL_NAMES = {**CAP_MEDIA_LABEL_NAMES, **CAP_MIN_LABEL_NAMES}
    output_pred = {}
    for idx in ranked:
        code = CAP_MIN_MEDIA_NUM_DICT[idx]
        output_pred[f"[{code}] {CAP_MIN_MEDIA_LABEL_NAMES[code]}"] = probabilities[idx]
    output_info = f'<p style="text-align: center; display: block">Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model.</p>'

    interpretation_info = """
    ## How to Interpret These Values (Flat Classification)

    This method returns predictions made by a single model. Both media codes and minor topics may appear in the output list. Only the top 10 most confident labels are displayed.
    """

    return interpretation_info, output_pred, output_info
|
181 |
+
|
182 |
+
|
183 |
+
def predict_cap(tmp, method, text, language, domain):
    """UI entry point: dispatch to hierarchical or flat CAP prediction.

    ``tmp`` is accepted but unused here (kept for the UI callback signature).
    """
    # Best-effort cache purge when the disk fills up; failures are ignored.
    if is_disk_full():
        os.system('rm -rf /data/models*')
        os.system('rm -r ~/.cache/huggingface/hub')

    # Translate the human-readable domain choice into its internal key.
    domain = domains[domain]
    tokenizer_id = "xlm-roberta-large"

    if method == "Hierarchical Classification":
        major_model_id, minor_model_id = build_huggingface_path(language, domain, True)
        return predict(text, major_model_id, minor_model_id, tokenizer_id)

    model_id = build_huggingface_path(language, domain, False)
    return predict_flat(text, model_id, tokenizer_id)
|
199 |
|
200 |
description = """
|
201 |
You can choose between two approaches for making predictions:
|
label_dicts.py
CHANGED
@@ -329,6 +329,227 @@ CAP_LABEL_NAMES = {
|
|
329 |
999: "No Policy Content"
|
330 |
}
|
331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
332 |
CAP_MIN_LABEL_NAMES = {
|
333 |
# 1. Macroeconomics
|
334 |
100: "General",
|
|
|
329 |
999: "No Policy Content"
|
330 |
}
|
331 |
|
332 |
+
# Maps the flat model's output indices (0-219) to CAP minor-topic / media
# codes. The index order follows the classifier head's label layout, so it
# must not be reordered. Codes are grouped by leading major-topic digits;
# the trailing small codes (24..99) are media labels.
_CAP_MIN_MEDIA_CODES = [
    # 1xx
    100, 101, 103, 104, 105, 107, 108, 110, 199,
    # 2xx
    200, 201, 202, 204, 205, 206, 207, 208, 209, 299,
    # 3xx
    300, 301, 302, 321, 322, 323, 324, 325, 331, 332, 333, 334, 335, 341, 342, 398, 399,
    # 4xx
    400, 401, 402, 403, 404, 405, 408, 498, 499,
    # 5xx
    500, 501, 502, 503, 504, 505, 506, 529, 599,
    # 6xx
    600, 601, 602, 603, 604, 606, 607, 698, 699,
    # 7xx
    700, 701, 703, 704, 705, 707, 708, 709, 711, 798, 799,
    # 8xx
    800, 801, 802, 803, 805, 806, 807, 898, 899,
    # 9xx
    900,
    # 10xx
    1000, 1001, 1002, 1003, 1005, 1007, 1010, 1098, 1099,
    # 12xx
    1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1210, 1211, 1227, 1299,
    # 13xx
    1300, 1302, 1303, 1304, 1305, 1308, 1399,
    # 14xx
    1400, 1401, 1403, 1404, 1405, 1406, 1407, 1408, 1409, 1498, 1499,
    # 15xx
    1500, 1501, 1502, 1504, 1505, 1507, 1520, 1521, 1522, 1523, 1524, 1525, 1526, 1598, 1599,
    # 16xx
    1600, 1602, 1603, 1604, 1605, 1606, 1608, 1610, 1611, 1612, 1614, 1615, 1616, 1617, 1619, 1620, 1698, 1699,
    # 17xx
    1700, 1701, 1704, 1705, 1706, 1707, 1708, 1709, 1798, 1799,
    # 18xx
    1800, 1802, 1803, 1804, 1806, 1807, 1808, 1899,
    # 19xx
    1900, 1901, 1902, 1905, 1906, 1910, 1921, 1925, 1926, 1927, 1929, 1999,
    # 20xx
    2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2030, 2099,
    # 21xx
    2100, 2101, 2102, 2103, 2104, 2105,
    # 23xx and no-content
    2300, 9999,
    # media codes
    24, 26, 27, 29, 30, 31, 99,
]
CAP_MIN_MEDIA_NUM_DICT = dict(enumerate(_CAP_MIN_MEDIA_CODES))
|
552 |
+
|
553 |
CAP_MIN_LABEL_NAMES = {
|
554 |
# 1. Macroeconomics
|
555 |
100: "General",
|
utils.py
CHANGED
@@ -13,6 +13,7 @@ from interfaces.illframes import domains as domains_illframes
|
|
13 |
|
14 |
from interfaces.cap import build_huggingface_path as hf_cap_path
|
15 |
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
|
|
|
16 |
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
|
17 |
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
|
18 |
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
|
@@ -37,6 +38,9 @@ for language in languages_cap:
|
|
37 |
|
38 |
# cap media
|
39 |
models.append(hf_cap_media_path("", ""))
|
|
|
|
|
|
|
40 |
|
41 |
# emotion9
|
42 |
for language in languages_emotion9:
|
|
|
13 |
|
14 |
from interfaces.cap import build_huggingface_path as hf_cap_path
|
15 |
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
|
16 |
+
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
|
17 |
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
|
18 |
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
|
19 |
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
|
|
|
38 |
|
39 |
# cap media
|
40 |
models.append(hf_cap_media_path("", ""))
|
41 |
+
|
42 |
+
# cap minor media
|
43 |
+
models.append(hf_cap_minor_media_path("", "", False))
|
44 |
|
45 |
# emotion9
|
46 |
for language in languages_emotion9:
|