# NOTE(review): the lines previously here ("Spaces:", "Sleeping", file size,
# commit hash, line-number gutter) were web-page scraping artifacts, not
# source code; replaced with this comment so the module parses.
import pickle
from config import (
MODEL_NAME,
SETENCE_EMBEDDING_FILE,
SETENCE_SIMILARITY_FILE,
SAMPLE_DATA_FILE, SUBJECT_DATA_FILE
)
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
from data_lib.subject_data import SubjectData
from data_lib.sample_name_data import SampleNameData
from clustering_lib.sentence_clustering_lib import SentenceClusteringLib
from data_lib.base_data import COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
class SentenceTransformerService:
    """Holds the sentence-transformer model and all pre-computed data.

    Intended to be instantiated once (see the module-level singleton) and
    initialised by a single call to load_model_data() at application startup.
    """

    def __init__(self):
        # Heavy resources are loaded lazily by load_model_data(); until then
        # every attribute is None, which also serves as the "loaded?" flag.
        self.sentenceTransformerHelper = None
        self.dic_standard_subject = None
        self.sample_name_sentence_embeddings = None
        self.sample_name_sentence_similarities = None
        self.sampleData = None
        self.sentence_clustering_lib = None
        self.name_groups = None

    @staticmethod
    def _read_pickle(path):
        """Deserialize and return the object stored in the pickle at *path*."""
        # NOTE(review): pickle is only safe on trusted, locally produced files.
        with open(path, "rb") as fh:
            return pickle.load(fh)

    def load_model_data(self):
        """Load model and data only once at startup"""
        # Guard clause: a non-None helper means initialisation already ran.
        if self.sentenceTransformerHelper is not None:
            print("Model already loaded. Skipping reload.")
            return

        print("Loading models and data...")

        # Sentence-transformer model (assigned before loading so a failed
        # load still marks the service as "attempted").
        self.sentenceTransformerHelper = SentenceTransformerHelper(
            convert_to_zenkaku_flag=True, replace_words=None, keywords=None
        )
        self.sentenceTransformerHelper.load_model_by_name(MODEL_NAME)

        # Standard-subject dictionary.
        self.dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(SUBJECT_DATA_FILE)

        # Pre-computed embeddings / pairwise similarities.
        self.sample_name_sentence_embeddings = self._read_pickle(SETENCE_EMBEDDING_FILE)
        self.sample_name_sentence_similarities = self._read_pickle(SETENCE_SIMILARITY_FILE)

        # Sample data: load, then run its processing pipeline.
        self.sampleData = SampleNameData()
        self.sampleData.load_data_from_csv(SAMPLE_DATA_FILE)
        self.sampleData.process_data()

        # Cluster the name embeddings; eps value presumably tuned offline —
        # TODO confirm where 0.07 came from.
        self.sentence_clustering_lib = SentenceClusteringLib(self.sample_name_sentence_embeddings)
        eps = 0.07
        self.name_groups, _ = self.sentence_clustering_lib.create_sentence_cluster(eps)

        # Finalise sample data: key column, cluster labels, search tree.
        # NOTE(review): _create_key_column is private to SampleNameData;
        # consider exposing a public wrapper there.
        self.sampleData._create_key_column(
            COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
        )
        self.sampleData.set_name_sentence_labels(self.name_groups)
        self.sampleData.build_search_tree()

        print("Models and data loaded successfully")
# Module-level singleton: import this shared instance instead of constructing
# SentenceTransformerService yourself, and call load_model_data() on it once
# at startup (repeat calls are no-ops).
sentence_transformer_service = SentenceTransformerService()
# NOTE(review): stray "|" gutter character from scraping removed.