import os
os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'

import re
import warnings
from string import punctuation

import demoji
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
import torch
from transformers import BertTokenizer, BertModel
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

# --- Disable warnings ---
warnings.filterwarnings("ignore")

# --- Configuration & Global Variables ---
MAX_LENGTH = 128
base_path = os.path.join('data')
model_path = os.path.join('Model')

# --- Load Resources ---
# Slang ("alay") dictionary: maps informal Indonesian words to their normalized forms.
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'),
                        names=['alay', 'normal'], encoding='latin-1')
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))

# Load Indonesian stopwords from Excel
stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
stopwords_df = pd.read_excel(stopwords_excel_path)
stop_words = set(stopwords_df['stopword'].astype(str).tolist())

# --- Load HuggingFace IndoBERT ---
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
bert_model = BertModel.from_pretrained("indobenchmark/indobert-large-p1")

# Load LSTM model (Keras)
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))

# --- Preprocessing Function ---
def process_text(text):
    """Lowercase, strip digits/URLs/punctuation/emoji, and normalize slang."""
    text = str(text).lower()
    text = re.sub(r'\d+', '', text)
    # Replace literal "\n" sequences left over from scraping with spaces.
    text = text.replace('\\n\\n\\n', ' ').replace('\\n\\n', ' ').replace('\\n', ' ')
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(f"[{re.escape(punctuation)}]", " ", text)
    text = demoji.replace(text, "")
    text = " ".join(text.split())
    # Map slang words to their normalized forms via the alay dictionary.
    words = [alay_dict_map.get(word, word) for word in text.split()]
    return ' '.join(words)

# --- Load TFLite Model ---
def load_tflite_model(tflite_path=os.path.join(model_path, "indobert_lstm_model.tflite")):
    interpreter = tf.lite.Interpreter(model_path=tflite_path)
    interpreter.allocate_tensors()
    return interpreter

# --- Emotion Prediction ---
def predict_emotion(text, interpreter):
    """Embed the cleaned text with IndoBERT, then classify with the TFLite LSTM."""
    cleaned = process_text(text)
    tokens = tokenizer(cleaned, return_tensors="pt", padding='max_length',
                       truncation=True, max_length=MAX_LENGTH)
    with torch.no_grad():
        outputs = bert_model(**tokens)
    # Shape: (1, MAX_LENGTH, hidden_size); TFLite expects float32.
    embeddings = outputs.last_hidden_state.numpy().astype(np.float32)

    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    interpreter.set_tensor(input_details[0]['index'], embeddings)
    interpreter.invoke()
    output = interpreter.get_tensor(output_details[0]['index'])

    label = np.argmax(output, axis=1)[0]
    emotions = ['anger', 'fear', 'sadness']
    return emotions[label]

# --- Keyword Extraction ---
df_rank_keyword = pd.read_excel(os.path.join(base_path, 'Keyword_KeyBERT.xlsx'))
df_rank_keyword['keyword'] = df_rank_keyword['keyword'].apply(process_text)
# Invert the rank so that a higher new_rank means a more important keyword.
df_rank_keyword['new_rank'] = df_rank_keyword['rank'].max() - df_rank_keyword['rank'] + 1

def rank_keywords(keywords):
    """Average the new_rank scores of the keywords found in the ranking table."""
    total_ranking = 0
    total_keyword = 0
    for kw in keywords:
        frekuensi_rank = df_rank_keyword.loc[df_rank_keyword['keyword'] == kw]
        if not frekuensi_rank.empty:
            total_ranking += frekuensi_rank['new_rank'].values[0]
            total_keyword += 1
    return total_ranking / total_keyword if total_keyword > 0 else 0

# Load the SBERT model once at import time rather than on every keyword() call.
sentence_model = SentenceTransformer("denaya/indoSBERT-large", trust_remote_code=True)
kw_model = KeyBERT(model=sentence_model)

def keyword(text):
    """Extract the top 5 keywords from the text and compute their average rank."""
    filtered_text = ' '.join(w for w in text.split() if w not in stop_words)
    processed = process_text(filtered_text)
    keywords = kw_model.extract_keywords(processed, top_n=5)
    key_list = [kw for kw, _ in keywords]
    rank = rank_keywords(key_list)
    return key_list, rank
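
# --- Example Usage ---
# A minimal sketch of how the pieces above fit together, assuming the data
# files and models referenced earlier exist locally. The sample sentence is
# hypothetical and only illustrates the expected input.
if __name__ == "__main__":
    interpreter = load_tflite_model()
    sample = "aku takut banget besok ujian"  # hypothetical Indonesian input
    emotion = predict_emotion(sample, interpreter)
    keys, avg_rank = keyword(sample)
    print(f"Emotion: {emotion}")
    print(f"Keywords: {keys} (avg rank: {avg_rank:.2f})")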