mahfudl26 committed · verified
Commit a325a3b · 1 Parent(s): c81f8a8

Update helper.py

Files changed (1)
  1. helper.py +7 -9
helper.py CHANGED
@@ -2,7 +2,6 @@ import demoji
 import re
 import pandas as pd
 import numpy as np
-import nltk
 import keras
 import os
 import warnings
@@ -11,7 +10,6 @@ import tensorflow as tf
 from transformers import BertTokenizer, TFBertModel
 from string import punctuation
 from keybert import KeyBERT
-from nltk.corpus import stopwords
 from sentence_transformers import SentenceTransformer
 
 # --- Disable warnings ---
@@ -22,16 +20,16 @@ MAX_LENGTH = 128
 base_path = os.path.join('data')
 model_path = os.path.join('Model')
 
-# --- Setup NLTK data directory (custom path for Hugging Face) ---
-nltk_data_dir = os.path.join(tempfile.gettempdir(), "nltk_data")
-os.makedirs(nltk_data_dir, exist_ok=True)
-nltk.data.path.append(nltk_data_dir)
-nltk.download('stopwords', download_dir=nltk_data_dir)
-
 # --- Load Resources ---
 alay_dict = pd.read_csv(os.path.join(base_path, 'kamus_alay.csv'), names=['alay', 'normal'], encoding='latin-1')
 alay_dict_map = dict(zip(alay_dict['alay'], alay_dict['normal']))
-stop_words = set(stopwords.words('indonesian'))
+
+# Load stopwords from Excel
+stopwords_excel_path = os.path.join(base_path, 'stopwords_indonesia.xlsx')
+stopwords_df = pd.read_excel(stopwords_excel_path)
+stop_words = set(stopwords_df['stopword'].astype(str).tolist())
+
+# Load tokenizer & model
 tokenizer = BertTokenizer.from_pretrained("indobenchmark/indobert-large-p1")
 bert_model = TFBertModel.from_pretrained("indobenchmark/indobert-large-p1")
 lstm_model = keras.models.load_model(os.path.join(model_path, 'indobert_lstm_model.keras'))
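
The commit swaps the NLTK stopword list for one read from a bundled Excel file, which removes the nltk.download('stopwords') call at import time (a common failure point in restricted runtimes such as Hugging Face Spaces). A minimal sketch of the new loading step and a downstream use of stop_words follows; the remove_stopwords helper is hypothetical (the rest of helper.py is not shown), the 'stopword' column name is taken from the diff, and reading .xlsx files requires the openpyxl package to be installed alongside pandas.

import pandas as pd

# Load stopwords from the bundled Excel file.
# Assumes a column named 'stopword', as in the diff; pd.read_excel
# needs openpyxl available to parse .xlsx files.
stopwords_df = pd.read_excel("data/stopwords_indonesia.xlsx")
stop_words = set(stopwords_df["stopword"].astype(str).str.lower().tolist())

def remove_stopwords(text: str) -> str:
    # Hypothetical helper: drop any whitespace-separated token
    # whose lowercase form appears in stop_words.
    return " ".join(tok for tok in text.split() if tok.lower() not in stop_words)

print(remove_stopwords("ini adalah contoh kalimat"))
# -> "contoh kalimat", assuming 'ini' and 'adalah' are in the stopword list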