File size: 1,026 Bytes
0063d17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import torch
import random
import numpy as np


# Data settings
# Make sure the data folder and each of its sub-folders exist. Paths are
# relative to the current working directory; existing folders are left as-is.
for _data_dir in (
    'data',
    'data/original',
    'data/processed',
    'data/retrieval',
):
    os.makedirs(_data_dir, exist_ok=True)
del _data_dir  # keep the loop variable out of the module namespace


# Model settings
# MODEL_ID: identifier of the base checkpoint (presumably a Hugging Face Hub
#           id -- confirm against the code that loads it).
# MODEL_NAME: name of this project's model; it determines the cache and
#             output folder names below.
MODEL_ID   = 'google-bert/bert-base-multilingual-cased'
MODEL_NAME = 'VN-legalDocs-SBERT'

# Per-model working folders, relative to the current working directory.
CACHE_DIR  = f"cache/{MODEL_NAME}"
OUTPUT_DIR = f"models/{MODEL_NAME}"

# Create both folders (and their parents) if they are not there yet.
for _model_dir in (CACHE_DIR, OUTPUT_DIR):
    os.makedirs(_model_dir, exist_ok=True)
del _model_dir  # keep the loop variable out of the module namespace


# Reproducibility
# Seed Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices)
# with the same value, in that order.
SEED = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)
del _seed_fn  # keep the loop variable out of the module namespace

# cuDNN trade-off -- pick one pair:
#   reproducible runs:        deterministic=True,  benchmark=False
#   faster inference/training: deterministic=False, benchmark=True
# The speed-oriented pair is the one applied here.
# NOTE(review): with these flags the seeding above may not be enough for
# bit-identical GPU results -- confirm this is the intended trade-off.
torch.backends.cudnn.benchmark     = True
torch.backends.cudnn.deterministic = False


# Hyperparameters
MAX_SEQ_LEN = 512    # maximum sequence length (presumably in tokens -- confirm)
EPOCHS      = 5      # number of epochs
LR          = 3e-5   # learning rate
BATCH_SIZE  = 128    # batch size

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f"Using device: {DEVICE}")