# keluh-cerdas/helper.py
import os
# Point the Transformers cache at a writable directory (on Hugging Face
# Spaces, only /tmp is writable at runtime). Must be set before importing
# transformers below.
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
import demoji
import re
import pandas as pd
import numpy as np
import keras
import warnings
import torch
import tensorflow as tf
from transformers import BertTokenizer, BertModel
from string import punctuation
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
# --- Disable warnings ---
warnings.filterwarnings("ignore")
# --- Configuration & Global Variables ---
MAX_LENGTH = 128
base_path = os.path.join('data')
model_path = os.path.join('Model')
# --- Load Resources ---
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
# Load Indonesian stopwords from Excel
stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
stopwords_df = pd.read_excel(stopwords_excel_path)
stop_words = set(stopwords_df['stopword'].astype(str).tolist())
# --- Load HuggingFace IndoBERT ---
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = BertModel.from_pretrained("indobenchmark/indobert-large-p1")
bert_model.eval()  # inference only: disable dropout
# Load the Keras LSTM classifier (kept for reference; inference in
# predict_emotion below uses the TFLite conversion of this model)
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
# --- Preprocessing Function ---
def process_text(text):
    """Lowercase, strip noise, and normalise slang ('alay') words."""
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)                          # drop digits
    text = text.replace('\\n', ' ').replace('\n', ' ')       # literal "\n" sequences and real newlines
    text = re.sub(r'https?://\S+', '', text)                 # drop URLs anywhere in the text
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)  # punctuation -> space
    text = demoji.replace(text, "")                          # strip emoji
    text = " ".join(text.split())                            # collapse whitespace
    words = text.split()
    words = [alay_dict_map.get(word, word) for word in words]  # slang -> standard form
    return ' '.join(words)
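# Illustrative (hypothetical) behaviour, assuming 'bgt' -> 'banget' is an
# entry in kamus_alay.csv:
#   process_text("Jalanan RUSAK bgt!! 😡 https://example.com")
#   # -> "jalanan rusak banget"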
# --- Load TFLite Model ---
def load_tflite_model(tflite_path=os.path.join(model_path, "indobert_lstm_model.tflite")):
    """Load the TFLite LSTM classifier and allocate its tensors."""
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter
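# Sketch of inspecting the interpreter's expected input. The shape is assumed
# to be (1, MAX_LENGTH, 1024) for indobert-large embeddings; verify against
# your converted model:
#   interpreter = load_tflite_model()
#   print(interpreter.get_input_details()[0]['shape'])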
# --- Emotion Prediction ---
def predict_emotion(text, interpreter):
    """Embed the text with IndoBERT, then classify with the TFLite LSTM."""
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="pt", padding='max_length', truncation=True, max_length=MAX_LENGTH)
    with torch.no_grad():
        outputs = bert_model(**tokens)
    # (1, MAX_LENGTH, hidden_size) token embeddings, cast to float32 for TFLite
    embeddings = outputs.last_hidden_state.numpy().astype(np.float32)
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], embeddings)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])
    label = np.argmax(output, axis=1)[0]
    emotions = ['anger', 'fear', 'sadness']
    return emotions[label]
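# Usage sketch (the complaint string is a made-up example):
#   interpreter = load_tflite_model()
#   predict_emotion("jalanan rusak parah dan tidak pernah diperbaiki", interpreter)
#   # -> one of 'anger', 'fear', 'sadness'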
# --- Keyword Extraction ---
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
# Invert the rank so that rank 1 (the top keyword) gets the highest score
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1
def rank_keywords(row):
    """Average the inverted rank score over the keywords found in the lookup table."""
    total_ranking = 0
    total_keyword = 0
    for keyword in row:
        frekuensi_rank = df_rank_keyword.loc[df_rank_keyword['keyword'] == keyword]
        if not frekuensi_rank.empty:
            total_ranking += frekuensi_rank['new_rank'].values[0]
            total_keyword += 1
    return total_ranking / total_keyword if total_keyword > 0 else 0
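# Worked example with hypothetical ranks: if 'banjir' has new_rank 10, 'macet'
# has new_rank 4, and 'xyz' is absent from Keyword_KeyBERT.xlsx, then
# rank_keywords(['banjir', 'macet', 'xyz']) == (10 + 4) / 2 == 7.0;
# keywords missing from the table are skipped, not averaged in as zero.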
_kw_model = None  # cached so the SBERT model is only loaded once, not per call

def keyword(text):
    """Extract the top 5 keywords with KeyBERT and score them via rank_keywords."""
    global _kw_model
    if _kw_model is None:
        sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
        _kw_model = KeyBERT(model=sentence_model)
    filtered_text = ' '.join(w for w in text.split() if w not in stop_words)
    processed = process_text(filtered_text)
    keywords = _kw_model.extract_keywords(processed, top_n=5)
    key_list = [kw for kw, _ in keywords]
    rank = rank_keywords(key_list)
    return key_list, rank
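
# Minimal end-to-end sketch, assuming all data/model files referenced above
# exist; the complaint string is a made-up example.
if __name__ == "__main__":
    interpreter = load_tflite_model()
    sample = "banjir di jalan utama sudah seminggu tidak surut"
    print("emotion :", predict_emotion(sample, interpreter))
    keys, score = keyword(sample)
    print("keywords:", keys, "| rank score:", score)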