import pickle
from config import (
    MODEL_NAME,
    SETENCE_EMBEDDING_FILE,  # constant names keep the spelling defined in config
    SETENCE_SIMILARITY_FILE,
    SAMPLE_DATA_FILE,
    SUBJECT_DATA_FILE,
)
from sentence_transformer_lib.sentence_transformer_helper import SentenceTransformerHelper
from data_lib.subject_data import SubjectData
from data_lib.sample_name_data import SampleNameData
from clustering_lib.sentence_clustering_lib import SentenceClusteringLib
from data_lib.base_data import COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME

class SentenceTransformerService:
    """Loads the sentence-transformer model, pre-computed embeddings, and
    processed sample data once and keeps them in memory for reuse (see the
    module-level singleton below)."""

    def __init__(self):
        self.sentenceTransformerHelper = None
        self.dic_standard_subject = None
        self.sample_name_sentence_embeddings = None
        self.sample_name_sentence_similarities = None
        self.sampleData = None
        self.sentence_clustering_lib = None
        self.name_groups = None

    def load_model_data(self):
        """Load model and data only once at startup"""
        if self.sentenceTransformerHelper is not None:
            print("Model already loaded. Skipping reload.")
            return  # don't reload if the model is already loaded

        print("Loading models and data...")
        # Load sentence transformer model
        self.sentenceTransformerHelper = SentenceTransformerHelper(
            convert_to_zenkaku_flag=True, replace_words=None, keywords=None
        )
        self.sentenceTransformerHelper.load_model_by_name(MODEL_NAME)

        # Load standard subject dictionary
        self.dic_standard_subject = SubjectData.create_standard_subject_dic_from_file(SUBJECT_DATA_FILE)

        # Load pre-computed embeddings and similarities
        with open(SETENCE_EMBEDDING_FILE, "rb") as f:
            self.sample_name_sentence_embeddings = pickle.load(f)

        with open(SETENCE_SIMILARITY_FILE, "rb") as f:
            self.sample_name_sentence_similarities = pickle.load(f)

        # Load and process sample data
        self.sampleData = SampleNameData()
        self.sampleData.load_data_from_csv(SAMPLE_DATA_FILE)
        self.sampleData.process_data()

        # Create sentence clusters
        self.sentence_clustering_lib = SentenceClusteringLib(self.sample_name_sentence_embeddings)
        best_name_eps = 0.07  # clustering distance threshold (eps) for grouping similar names
        self.name_groups, _ = self.sentence_clustering_lib.create_sentence_cluster(best_name_eps)

        # Build the standard-name key column from subject and name, attach the
        # cluster labels, and build the search tree used for lookups
        self.sampleData._create_key_column(
            COL_STANDARD_NAME_KEY, COL_STANDARD_SUBJECT, COL_STANDARD_NAME
        )
        self.sampleData.set_name_sentence_labels(self.name_groups)
        self.sampleData.build_search_tree()

        print("Models and data loaded successfully")

# Global instance (singleton)
sentence_transformer_service = SentenceTransformerService()
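
# Example usage (a minimal sketch; the __main__ guard is illustrative only):
# the global instance above is meant to be imported and initialized once at
# application startup. Calling load_model_data() a second time demonstrates
# the reload guard, which prints "Model already loaded. Skipping reload."
# and returns early.
if __name__ == "__main__":
    sentence_transformer_service.load_model_data()
    sentence_transformer_service.load_model_data()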