import os
import shutil
import glob
import subprocess
from contextlib import contextmanager
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes
from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media_demo import build_huggingface_path as hf_cap_media_path # why... just follow the name template the next time pls
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
from huggingface_hub import scan_cache_dir

JIT_DIR = "/data/jit_models"
HF_TOKEN = os.environ["hf_read"]

# should be a temporary solution
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_ontolisst_path(""),
]

# it gets more difficult with cap
domains_cap = list(domains_cap.values())
for language in languages_cap:
    for domain in domains_cap:
        models.append(hf_cap_path(language, domain))

# cap media
models.append(hf_cap_media_path("", ""))
# cap media2
models.append(hf_cap_media2_path("", ""))
# cap minor media
models.append(hf_cap_minor_media_path("", "", False))

# emotion9
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes (domains is a dict for some reason?)
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))
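
# At this point `models` holds Hugging Face repo ids ("org/name" strings
# produced by the interfaces.* build_huggingface_path helpers).
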
tokenizers = ["xlm-roberta-large"]


def download_hf_models():
    os.makedirs(JIT_DIR, exist_ok=True)

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"⏩ Skipping JIT — already exists: {traced_model_path}")
        else:
            print(f"⚙️ Tracing and saving: {traced_model_path}")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_id,
                token=HF_TOKEN,
                device_map="auto",
            )
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
            model.eval()

            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64,
            )

            # JIT trace
            traced_model = torch.jit.trace(
                model,
                (dummy_input["input_ids"], dummy_input["attention_mask"]),
                strict=False,
            )

            # Save traced model
            traced_model.save(traced_model_path)
            print(f"✔️ Saved JIT model to: {traced_model_path}")


def df_h():
    df_result = subprocess.run(["df", "-H"], capture_output=True, text=True)
    print("=== Disk Free Space (df -H) ===")
    print(df_result.stdout)

    du_result = subprocess.run(["du", "-h", "--max-depth=2", "/data/"], capture_output=True, text=True)
    print("=== Disk Usage for /data/ (du -h --max-depth=2) ===")
    print(du_result.stdout)


def delete_unused_bin_files(model_id: str):
    # the Hub caches repos under <cache_dir>/models--<org>--<name>
    target_path = f"/data/models--{model_id.replace('/', '--')}"

    # delete files in blobs/
    blob_bins = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
    # delete .bin files in snapshots/, except config.json
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    files_to_delete = blob_bins + snapshot_bins
    for file_path in files_to_delete:
        if os.path.basename(file_path) == "config.json":
            continue
        if os.path.isfile(file_path):
            print(f"Deleting file: {file_path}")
            os.remove(file_path)
        elif os.path.isdir(file_path):
            print(f"Deleting directory: {file_path}")
            shutil.rmtree(file_path)


def delete_http_folders():
    http_folders = glob.glob("/data/http*")
    for folder in http_folders:
        if os.path.isdir(folder):
            print(f"Deleting: {folder}")
            shutil.rmtree(folder)


@contextmanager
def hf_cleanup():
    # remove leftover HTTP download folders before and after the wrapped block
    delete_http_folders()
    try:
        yield
    finally:
        delete_http_folders()
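
# Usage sketch: wrapping a download in hf_cleanup() clears stray /data/http*
# folders on entry and again on exit, even if the body raises.
#
#     with hf_cleanup():
#         download_hf_models()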


def scan_cache():
    # Scan Hugging Face model cache
    cache_dir = os.environ.get("TRANSFORMERS_CACHE", os.path.expanduser("~/.cache/huggingface/transformers"))
    scan_result = scan_cache_dir(cache_dir)

    print("=== 🤗 Hugging Face Model Cache ===")
    print(f"Cache size: {scan_result.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(scan_result.repos)}")
    for repo in scan_result.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}) — {repo.size_on_disk / 1e6:.2f} MB")

    print("\n=== 🧊 TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    total_size = 0
    for filename in os.listdir(JIT_DIR):
        if filename.endswith(".pt"):
            path = os.path.join(JIT_DIR, filename)
            size = os.path.getsize(path)
            total_size += size
            print(f"- {filename}: {size / 1e6:.2f} MB")

    print(f"Total JIT cache size: {total_size / 1e6:.2f} MB")


def set_hf_cache_dir(path: str):
    os.environ["TRANSFORMERS_CACHE"] = path
    os.environ["HF_HOME"] = path
    os.environ["HF_DATASETS_CACHE"] = path
    os.environ["TORCH_HOME"] = path
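
# Note: the libraries read these variables when they first resolve their cache
# paths (often at import time), so set_hf_cache_dir() should run before
# transformers/torch download anything, e.g. set_hf_cache_dir("/data") at
# process startup.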


def set_torch_threads():
    # cap intra-op parallelism to a single thread; the OMP/MKL variables only
    # take effect if set before the native thread pools are initialized
    torch.set_num_threads(1)
    os.environ["OMP_NUM_THREADS"] = "1"
    os.environ["MKL_NUM_THREADS"] = "1"


def is_disk_full(min_free_space_in_GB=10):
    total, used, free = shutil.disk_usage("/")
    free_gb = free / (1024 ** 3)
    return free_gb < min_free_space_in_GB
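
# Usage sketch: a pre-download guard built from the helpers above.
#
#     if is_disk_full(min_free_space_in_GB=20):
#         df_h()                  # log where the space went
#         delete_http_folders()   # reclaim leftover download folders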