Spaces:
Sleeping
Sleeping
import pickle | |
from config import ( | |
MODEL_NAME, | |
SETENCE_EMBEDDING_FILE, | |
SETENCE_SIMILARITY_FILE, | |
SAMPLE_DATA_FILE, SUBJECT_DATA_FILE | |
) | |
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper | |
from data_lib.subject_data import SubjectData | |
from data_lib.sample_name_data import SampleNameData | |
from clustering_lib.sentence_clustering_lib import SentenceClusteringLib | |
from data_lib.base_data import COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME | |
class SentenceTransformerService:
    """Hold the sentence-transformer model and the sample-name lookup data.

    Every attribute starts as ``None`` and is populated by
    :meth:`load_model_data`, which is intended to run once at startup.
    """

    # Epsilon passed to ``SentenceClusteringLib.create_sentence_cluster`` when
    # grouping the pre-computed sample-name embeddings.
    NAME_CLUSTER_EPS = 0.07

    def __init__(self):
        # Also serves as the "already loaded" marker checked by
        # load_model_data().
        self.sentenceTransformerHelper = None
        self.dic_standard_subject = None
        self.sample_name_sentence_embeddings = None
        self.sample_name_sentence_similarities = None
        self.sampleData = None
        self.sentence_clustering_lib = None
        self.name_groups = None

    def load_model_data(self):
        """Load model and data only once at startup.

        If ``sentenceTransformerHelper`` is already set, a message is printed
        and nothing is reloaded.

        All artifacts are built in local variables and published to the
        instance only after every step has succeeded. A failure part-way
        through (for example a missing embedding file) therefore leaves the
        service untouched, and a later call performs a full retry instead of
        reporting "already loaded" on a half-initialised instance.

        Raises:
            OSError: If an embedding/similarity file cannot be opened.
            Exception: Anything raised by the underlying model, data or
                clustering helpers is propagated unchanged.
        """
        if self.sentenceTransformerHelper is not None:
            print("Model already loaded. Skipping reload.")
            return  # Do not reload when a model is already present.

        print("Loading models and data...")

        # Load sentence transformer model
        helper = SentenceTransformerHelper(
            convert_to_zenkaku_flag=True, replace_words=None, keywords=None
        )
        helper.load_model_by_name(MODEL_NAME)

        # Load standard subject dictionary
        dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(
            SUBJECT_DATA_FILE
        )

        # Load pre-computed embeddings and similarities.
        # NOTE(review): pickle.load can execute arbitrary code while
        # unpickling; these files must be trusted, locally produced artifacts.
        with open(SETENCE_EMBEDDING_FILE, "rb") as f:
            embeddings = pickle.load(f)
        with open(SETENCE_SIMILARITY_FILE, "rb") as f:
            similarities = pickle.load(f)

        # Load and process sample data
        sample_data = SampleNameData()
        sample_data.load_data_from_csv(SAMPLE_DATA_FILE)
        sample_data.process_data()

        # Create sentence clusters
        clustering_lib = SentenceClusteringLib(embeddings)
        name_groups, _ = clustering_lib.create_sentence_cluster(
            self.NAME_CLUSTER_EPS
        )

        sample_data._create_key_column(
            COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
        )
        sample_data.set_name_sentence_labels(name_groups)
        sample_data.build_search_tree()

        # Publish only now that every step has succeeded. The helper is
        # assigned last because it is the attribute the guard above checks.
        self.dic_standard_subject = dic_standard_subject
        self.sample_name_sentence_embeddings = embeddings
        self.sample_name_sentence_similarities = similarities
        self.sampleData = sample_data
        self.sentence_clustering_lib = clustering_lib
        self.name_groups = name_groups
        self.sentenceTransformerHelper = helper

        print("Models and data loaded successfully")
# Shared module-level instance: importers use this one object rather than
# constructing their own. It starts empty; call load_model_data() to fill it.
sentence_transformer_service = SentenceTransformerService()