import os
# Must be set before transformers is imported so the cache location takes effect
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
import demoji
import re
import pandas as pd
import numpy as np
import keras
import warnings
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from string import punctuation
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
# --- Disable warnings ---
warnings.filterwarnings("ignore")
# --- Configuration & Global Variables ---
MAX_LENGTH = 128
base_path = os.path.join('data')
model_path = os.path.join('Model')
# --- Load Resources ---
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
# Load stopwords from Excel
stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
stopwords_df = pd.read_excel(stopwords_excel_path)
stop_words = set(stopwords_df['stopword'].astype(str).tolist())
# --- Load HuggingFace IndoBERT ---
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = BertModel.from_pretrained("indobenchmark/indobert-large-p1")
# Load LSTM model (Keras)
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
# --- Preprocessing Function ---
def process_text(text):
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)  # drop digits
    # Collapse literal "\n" escape sequences left over in scraped text
    text = text.replace('\\n\\n\\n', ' ').replace('\\n\\n', ' ').replace('\\n', ' ')
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)  # strip URLs at line start
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)  # replace punctuation with spaces
    text = demoji.replace(text, "")  # remove emoji
    text = " ".join(text.split())  # normalize whitespace
    words = text.split()
    # Map slang ("alay") words to their normal forms via the dictionary
    words = [alay_dict_map.get(word, word) for word in words]
    return ' '.join(words)
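
# A minimal sketch of what process_text yields (the normalized form shown is
# an assumption; actual output depends on the entries in kamus_alay.csv):
#
#   process_text("Gk tau knp 123!!!")
#   # -> 'gak tahu kenapa'  (hypothetical alay-to-normal mapping)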
# --- Load TFLite Model ---
def load_tflite_model(tflite_path=os.path.join(model_path, "indobert_lstm_model.tflite")):
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter
# --- Emotion Prediction ---
def predict_emotion(text, interpreter):
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="pt", padding='max_length',
                       truncation=True, max_length=MAX_LENGTH)
    # IndoBERT produces the sequence embeddings that feed the TFLite LSTM head
    with torch.no_grad():
        outputs = bert_model(**tokens)
    embeddings = outputs.last_hidden_state.numpy().astype(np.float32)
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], embeddings)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])
    label = np.argmax(output, axis=1)[0]
    emotions = ['anger', 'fear', 'sadness']
    return emotions[label]
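
# Illustrative use of the two functions above (a sketch; the order of the
# `emotions` list must match how the classifier head was trained):
#
#   interpreter = load_tflite_model()
#   predict_emotion("aku takut sekali", interpreter)
#   # -> e.g. 'fear'  (output depends on the trained model)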
# --- Keyword Extraction ---
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1
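# Worked example of the inversion, assuming rank 1 marks the most frequent
# keyword: with max rank 100, rank 1 maps to new_rank = 100 - 1 + 1 = 100 and
# rank 100 maps to new_rank = 1, so higher new_rank means a more prominent keyword.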
def rank_keywords(row):
    total_ranking = 0
    total_keyword = 0
    for kw in row:  # named kw to avoid shadowing the keyword() function below
        # Look up the keyword's inverted rank in the reference table
        frekuensi_rank = df_rank_keyword.loc[df_rank_keyword['keyword'] == kw]
        if not frekuensi_rank.empty:
            total_ranking += frekuensi_rank['new_rank'].values[0]
            total_keyword += 1
    # Average inverted rank over the keywords found; 0 if none matched
    return total_ranking / total_keyword if total_keyword > 0 else 0
def keyword(text):
    # Note: the sentence model is re-loaded on every call; caching it at
    # module level would avoid the repeated load cost
    sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
    kw_model = KeyBERT(model=sentence_model)
    filtered_text = ' '.join([w for w in text.split() if w not in stop_words])
    processed = process_text(filtered_text)
    keywords = kw_model.extract_keywords(processed, top_n=5)
    key_list = [kw for kw, _ in keywords]  # extract_keywords returns (keyword, score) pairs
    rank = rank_keywords(key_list)
    return key_list, rank
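
# End-to-end sketch, assuming the data/ and Model/ files above are in place
# (the sample sentence and its outputs are illustrative, not guaranteed):
if __name__ == "__main__":
    interpreter = load_tflite_model()
    sample = "aku sedih dan kecewa hari ini"  # hypothetical input
    print("emotion :", predict_emotion(sample, interpreter))
    keys, avg_rank = keyword(sample)
    print("keywords:", keys, "| avg rank:", avg_rank)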