Spaces:
Sleeping
Sleeping
Update helper.py
Browse files
helper.py
CHANGED
@@ -2,7 +2,6 @@ import demoji
|
|
2 |
import re
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
5 |
-
import nltk
|
6 |
import keras
|
7 |
import os
|
8 |
import warnings
|
@@ -11,7 +10,6 @@ import tensorflow as tf
|
|
11 |
from transformers import BertTokenizer, TFBertModel
|
12 |
from string import punctuation
|
13 |
from keybert import KeyBERT
|
14 |
-
from nltk.corpus import stopwords
|
15 |
from sentence_transformers import SentenceTransformer
|
16 |
|
17 |
# --- Disable warnings ---
|
@@ -22,16 +20,16 @@ MAX_LENGTH = 128
|
|
22 |
base_path = os.path.join('data')
|
23 |
model_path = os.path.join('Model')
|
24 |
|
25 |
-
# --- Setup NLTK data directory (custom path for Hugging Face) ---
|
26 |
-
nltk_data_dir = os.path.join(tempfile.gettempdir(), "nltk_data")
|
27 |
-
os.makedirs(nltk_data_dir, exist_ok=True)
|
28 |
-
nltk.data.path.append(nltk_data_dir)
|
29 |
-
nltk.download('stopwords', download_dir=nltk_data_dir)
|
30 |
-
|
31 |
# --- Load Resources ---
|
32 |
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
|
33 |
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
|
36 |
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
|
37 |
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
|
|
|
2 |
import re
|
3 |
import pandas as pd
|
4 |
import numpy as np
|
|
|
5 |
import keras
|
6 |
import os
|
7 |
import warnings
|
|
|
10 |
from transformers import BertTokenizer, TFBertModel
|
11 |
from string import punctuation
|
12 |
from keybert import KeyBERT
|
|
|
13 |
from sentence_transformers import SentenceTransformer
|
14 |
|
15 |
# --- Disable warnings ---
|
|
|
20 |
base_path = os.path.join('data')
|
21 |
model_path = os.path.join('Model')
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# --- Load Resources ---
|
24 |
alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
|
25 |
alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
|
26 |
+
|
27 |
+
# Load stopwords from the Excel file
|
28 |
+
stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
|
29 |
+
stopwords_df = pd.read_excel(stopwords_excel_path)
|
30 |
+
stop_words = set(stopwords_df['stopword'].astype(str).tolist())
|
31 |
+
|
32 |
+
# Load tokenizer & model
|
33 |
tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
|
34 |
bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
|
35 |
lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
|