import os
import torch
import spacy
import spaces
import numpy as np
import pandas as pd
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import gradio as gr
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
from matplotlib.ticker import PercentFormatter
import matplotlib.colors as mcolors
import plotly.express as px
import seaborn as sns
from tqdm import tqdm
PATH = "/data/" # at least 150GB storage needs to be attached
os.environ["TRANSFORMERS_CACHE"] = PATH
os.environ["HF_HOME"] = PATH
os.environ["HF_DATASETS_CACHE"] = PATH
os.environ["TORCH_HOME"] = PATH
css = """
.info {font-size: 3em !important;}
.title_ {text-align: center;}
"""
HF_TOKEN = os.environ["hf_read"]
SENTIMENT_LABEL_NAMES = {
0: "Negative",
1: "No sentiment or Neutral sentiment",
2: "Positive",
}
LANGUAGES = ["Czech", "English", "French", "German", "Hungarian", "Polish", "Slovakian"]
id2label = {
0: "Anger",
1: "Fear",
2: "Disgust",
3: "Sadness",
4: "Joy",
5: "None of Them",
}
emotion_colors = {
"Anger": "#D96459",
"Fear": "#6A8EAE",
"Disgust": "#A4C639",
"Sadness": "#9DBCD4",
"Joy": "#F3E9A8",
"None of Them": "#C0C0C0",
}
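# Load the multilingual spaCy model used for sentence segmentation, downloading it on first use if necessary.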
def load_spacy_model(model_name="xx_sent_ud_sm"):
try:
model = spacy.load(model_name)
except OSError:
spacy.cli.download(model_name)
model = spacy.load(model_name)
return model
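# Split the input text into sentences using only spaCy's senter component.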
def split_sentences(text, model):
    # Keep only the sentence splitter: disable every component, then re-enable "senter"
    model.select_pipes(disable=model.pipe_names)
    model.enable_pipe("senter")
doc = model(text)
sentences = [sent.text for sent in doc.sents]
return sentences
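# Czech and Slovak texts are routed to the pooled V4 RoBERTa model; all other supported languages use the multilingual XLM-RoBERTa emotion model.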
def build_huggingface_path(language: str):
if language == "Czech" or language == "Slovakian":
return "visegradmedia-emotion/Emotion_RoBERTa_pooled_V4"
return "poltextlab/xlm-roberta-large-pooled-emotions6"
@spaces.GPU
def predict(text, model_id, tokenizer_id):
model = AutoModelForSequenceClassification.from_pretrained(
model_id,
low_cpu_mem_usage=True,
device_map="auto",
offload_folder="offload",
token=HF_TOKEN,
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
inputs = tokenizer(
text, max_length=64, truncation=True, padding="do_not_pad", return_tensors="pt"
)
model.eval()
with torch.no_grad():
logits = model(**inputs).logits
probs = torch.nn.functional.softmax(logits, dim=1).cpu().numpy().flatten()
return probs
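# Return the idx-th most probable label (idx=1 is the top prediction, idx=2 the runner-up) and its probability as a percentage string.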
def get_most_probable_label(probs, idx=1):
sorted_indices = probs.argsort()[::-1]
selected_idx = sorted_indices[idx - 1]
label = id2label[selected_idx]
probability = f"{round(100 * probs[selected_idx], 2)}%"
return label, probability
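# Collect per-sentence confidences into an emotions-by-sentences DataFrame; columns are truncated sentence previews.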
def prepare_heatmap_data(data):
heatmap_data = pd.DataFrame(0.0, index=id2label.values(), columns=range(len(data)))
for idx, row in enumerate(data):
confidences = row["emotions"].tolist()
for idy, confidence in enumerate(confidences):
emotion = id2label[idy]
heatmap_data.at[emotion, idx] = round(confidence, 4)
heatmap_data.columns = [item["sentence"][:18] + "..." for item in data]
return heatmap_data
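# Draw the sentence/emotion heatmap by blending from white towards each emotion's base colour in proportion to its row-normalised confidence.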
def plot_emotion_heatmap(heatmap_data):
# Transpose: now rows = sentences, columns = emotions
heatmap_data = heatmap_data.T
# Normalize each row (sentence-wise)
normalized_data = heatmap_data.copy()
for row in normalized_data.index:
max_val = normalized_data.loc[row].max()
normalized_data.loc[row] = (
normalized_data.loc[row] / max_val if max_val > 0 else 0
)
# Create color matrix
color_matrix = np.empty(
(len(normalized_data.index), len(normalized_data.columns), 3)
)
for i, sentence in enumerate(normalized_data.index):
for j, emotion in enumerate(normalized_data.columns):
val = normalized_data.loc[sentence, emotion]
base_rgb = mcolors.to_rgb(emotion_colors[emotion])
# Blend from white to base color
blended = tuple(1 - val * (1 - c) for c in base_rgb)
color_matrix[i, j] = blended
fig, ax = plt.subplots(
figsize=(
len(normalized_data.columns) * 0.8 + 2,
len(normalized_data.index) * 0.5 + 2,
)
)
ax.imshow(color_matrix, aspect="auto")
# Set ticks and labels
ax.set_xticks(np.arange(len(normalized_data.columns)))
ax.set_xticklabels(normalized_data.columns, rotation=45, ha="right", fontsize=10)
ax.set_yticks(np.arange(len(normalized_data.index)))
ax.set_yticklabels(normalized_data.index, rotation=0, fontsize=10)
ax.set_xlabel("Emotions")
ax.set_ylabel("Sentences")
plt.tight_layout()
return fig
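# Average each emotion's confidence across all sentences and plot the result as a horizontal bar chart with percentage labels.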
def plot_average_emotion_barplot(heatmap_data):
# Compute average emotion scores
all_emotion_scores = np.array([item["emotions"] for item in heatmap_data])
mean_scores = all_emotion_scores.mean(axis=0)
labels = [id2label[i] for i in range(len(mean_scores))]
scores = mean_scores
colors = [emotion_colors[label] for label in labels]
fig, ax = plt.subplots(figsize=(8, 6))
    sns.barplot(x=list(scores), y=list(labels), palette=colors, ax=ax)
ax.xaxis.set_major_formatter(PercentFormatter(xmax=1.0, decimals=0))
# Add percentage labels
for i, score in enumerate(scores):
ax.text(score + 0.01, i, f"{score*100:.1f}%", va="center")
ax.set_title("Which emotions showed up most in the text?", fontsize=14)
ax.set_xlabel("Average Confidence")
ax.set_ylabel("Emotions")
plt.tight_layout()
return fig
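# Gradio callback: split the input into sentences, classify each one, and build the results table, bar chart, heatmap, and model/funding note.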
def predict_wrapper(text, language):
model_id = build_huggingface_path(language)
tokenizer_id = "xlm-roberta-large"
spacy_model = load_spacy_model()
sentences = split_sentences(text, spacy_model)
results = []
results_heatmap = []
for sentence in tqdm(sentences):
probs = predict(sentence, model_id, tokenizer_id)
label1, probability1 = get_most_probable_label(probs, 1)
label2, probability2 = get_most_probable_label(probs, 2)
results.append([sentence, label1, probability1, label2, probability2])
results_heatmap.append({"sentence": sentence, "emotions": probs})
    # Print the raw per-sentence results so they show up in the Space logs.
    print(results)
    print(results_heatmap)
figure = plot_average_emotion_barplot(results_heatmap)
heatmap = plot_emotion_heatmap(prepare_heatmap_data(results_heatmap))
output_info = f'Prediction was made using the <a href="https://huggingface.co/{model_id}">{model_id}</a> model. '
funding_info = "The research was funded by European Union’s Horizon 2020 research and innovation program, “MORES” project (Grant No.: 101132601)"
return results, figure, heatmap, output_info + funding_info
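# Gradio UI: text input and language selector on top, followed by the bar chart, per-sentence table, and heatmap, each with a short explanation.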
with gr.Blocks(css=css) as demo:
placeholder = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
introduction = """
This platform is designed to detect and visualize emotions in text. The model behind it operates using a 6-label codebook, including the following labels: `Anger`, `Fear`, `Disgust`, `Sadness`, `Joy`, and `None of Them`.
The [model](https://huggingface.co/poltextlab/xlm-roberta-large-pooled-emotions6) is optimized for sentence-level analysis and makes predictions in the following languages: Czech, English, French, German, Hungarian, Polish, and Slovak.
The text you enter in the input box is automatically divided into sentences, and the analysis is performed on each sentence. Depending on the length of the text, this process may take a few seconds, but for longer texts, it can take up to 2-3 minutes.
Read our Q&A about Pulse (here)[].
"""
gr.HTML("<h1>MORES Pulse</h1>", elem_classes="title_")
gr.Markdown(introduction, elem_classes="info")
with gr.Row():
with gr.Column():
input_text = gr.Textbox(
lines=6, label="Input", placeholder="Enter your text here..."
)
with gr.Column():
with gr.Row():
language_choice = gr.Dropdown(
choices=LANGUAGES, label="Language", value="English"
)
with gr.Row():
predict_button = gr.Button("Submit")
with gr.Row():
with gr.Column(scale=7):
plot = gr.Plot()
with gr.Column(scale=3):
gr.Markdown(
"The chart gives an overview of the main emotions found in the text and how strongly each one is present.",
elem_classes="info",
)
with gr.Row():
with gr.Column(scale=7):
result_table = gr.Dataframe(
headers=["Sentence", "Prediction (1)", "Confidence (1)", "Prediction (2)", "Confidence (2)"],
column_widths=["46%", "17%", "10%", "17%", "10%"],
                wrap=True,  # wrap long sentences instead of truncating them
)
with gr.Column(scale=3):
gr.Markdown(
"This table shows the two most probable emotions detected in each sentence, along with how confident our predictions are. For all emotions check the heatmap below.",
elem_classes="info",
)
with gr.Row():
with gr.Column(scale=7):
heatmap = gr.Plot()
with gr.Column(scale=3):
gr.Markdown(
"This heatmap shows how strongly each emotion appears in every sentence. Darker colours mean stronger presence.",
elem_classes="info",
)
with gr.Row():
model_info = gr.Markdown()
predict_button.click(
fn=predict_wrapper,
inputs=[input_text, language_choice],
outputs=[result_table, plot, heatmap, model_info],
)
if __name__ == "__main__":
demo.launch()