# HuggingFace Spaces app: Indonesian emotion classification (IndoBERT + LSTM/TFLite)
# and keyword extraction (KeyBERT).
# The cache directory must be set BEFORE importing transformers for it to
# take effect, so `os` is imported (and the env var set) first.
import os

os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'

import re
import warnings
from string import punctuation

import demoji
import keras
import numpy as np
import pandas as pd
import tensorflow as tf
import torch
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, BertModel

# --- Disable warnings ---
warnings.filterwarnings("ignore")
# --- Configuration & Global Variables ---
MAX_LENGTH = 128  # fixed token length for BERT inputs (padding/truncation)
base_path = os.path.join('data')
model_path = os.path.join('Model')

# --- Load Resources ---
# Slang ("alay") -> normal word dictionary, used by process_text().
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'),
                        names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))

# Indonesian stopwords from an Excel sheet (expects a 'stopword' column).
stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
stopwords_df = pd.read_excel(stopwords_excel_path)
stop_words = set(stopwords_df['stopword'].astype(str).tolist())

# --- Load HuggingFace IndoBERT ---
# Tokenizer + encoder pair used to produce contextual embeddings.
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = BertModel.from_pretrained("indobenchmark/indobert-large-p1")

# Load LSTM classification head (Keras).
# NOTE(review): lstm_model appears unused here — predict_emotion() uses the
# TFLite interpreter instead. Confirm whether this load is still needed.
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
# --- Preprocessing Function --- | |
def process_text(text):
    """Normalize raw Indonesian text for the models.

    Lowercases, strips digits, removes literal ``\\n`` escape sequences,
    URLs, punctuation and emoji, collapses whitespace, then maps slang
    ("alay") words to their normal form via the module-level
    ``alay_dict_map``.

    Args:
        text: Arbitrary input; coerced to ``str``.

    Returns:
        The cleaned, single-space-joined string.
    """
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    # NOTE: these target the two-character sequence backslash+n (escaped
    # newlines left over in the dataset), not real newline characters.
    text = text.replace('\\n\\n\\n', ' ').replace('\\n\\n', ' ').replace('\\n', ' ')
    # Drop lines that start with a URL (real newlines still delimit lines here).
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)
    text = demoji.replace(text, "")
    text = " ".join(text.split())
    # Replace slang tokens with their normalized form where known.
    words = [alay_dict_map.get(word, word) for word in text.split()]
    return ' '.join(words)
# --- Load TFLite Model --- | |
def load_tflite_model(tflite_path=os.path.join(model_path, "indobert_lstm_model.tflite")):
    """Create a TFLite interpreter for the LSTM classifier.

    Args:
        tflite_path: Path to the ``.tflite`` model file. The default is
            resolved once, at import time, from ``model_path``.

    Returns:
        A ``tf.lite.Interpreter`` with tensors already allocated,
        ready to be passed to predict_emotion().
    """
    runner = tf.lite.Interpreter(model_path=tflite_path)
    runner.allocate_tensors()
    return runner
# --- Emotion Prediction --- | |
def predict_emotion(text, interpreter):
    """Classify the emotion of ``text`` with IndoBERT embeddings + TFLite LSTM.

    Args:
        text: Raw input text; normalized with process_text() first.
        interpreter: An allocated ``tf.lite.Interpreter``
            (see load_tflite_model()).

    Returns:
        One of ``'anger'``, ``'fear'``, ``'sadness'``.
    """
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="pt", padding='max_length',
                       truncation=True, max_length=MAX_LENGTH)
    # Produce contextual embeddings without tracking gradients.
    with torch.no_grad():
        outputs = bert_model(**tokens)
    embeddings = outputs.last_hidden_state.numpy().astype(np.float32)

    # Feed the (1, MAX_LENGTH, hidden) embedding tensor to the TFLite model.
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], embeddings)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])

    # Class order must match the label encoding used at training time —
    # TODO confirm against the training pipeline.
    emotions = ('anger', 'fear', 'sadness')
    return emotions[int(np.argmax(output, axis=1)[0])]
# --- Keyword Extraction --- | |
# Reference keyword ranking produced offline with KeyBERT; keywords are
# normalized the same way as incoming text so lookups match.
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
# Invert the rank so rank 1 (best) gets the highest score: new_rank in [1, max].
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1
def rank_keywords(row):
    """Average the inverted reference ranks of the given keywords.

    Args:
        row: Iterable of (already normalized) keyword strings.

    Returns:
        Mean of ``new_rank`` over the keywords that appear in
        ``df_rank_keyword``, or ``0`` when none of them are found.
    """
    total_ranking = 0
    total_keyword = 0
    for kw in row:
        # Select only the 'new_rank' column instead of materializing whole
        # matching rows — same result, less copying per lookup.
        match = df_rank_keyword.loc[df_rank_keyword['keyword'] == kw, 'new_rank']
        if not match.empty:
            total_ranking += match.iloc[0]
            total_keyword += 1
    return total_ranking / total_keyword if total_keyword > 0 else 0
def keyword(text):
    """Extract the top-5 keywords from ``text`` and score them.

    Args:
        text: Raw input text.

    Returns:
        Tuple ``(key_list, rank)`` — up to five keyword strings, and their
        average inverted reference rank (see rank_keywords()).
    """
    # Build the (expensive) sentence-embedding + KeyBERT models once and
    # cache them on the function, instead of reloading them on every call.
    if not hasattr(keyword, '_kw_model'):
        sentence_model = SentenceTransformer("denaya/indoSBERT-large",
                                             trust_remote_code=True)
        keyword._kw_model = KeyBERT(model=sentence_model)
    kw_model = keyword._kw_model

    # Remove stopwords before normalization, mirroring the original order.
    filtered_text = ' '.join(w for w in text.split() if w not in stop_words)
    processed = process_text(filtered_text)
    extracted = kw_model.extract_keywords(processed, top_n=5)
    key_list = [kw for kw, _score in extracted]
    rank = rank_keywords(key_list)
    return key_list, rank