Spaces:
Running
Running
File size: 7,762 Bytes
e390ccc 4bba8df b1b87fb 4bba8df 3f77878 e390ccc ca62943 7e0dad9 ca62943 4bba8df e390ccc c554973 4bba8df e390ccc 4bba8df af68a82 7e0dad9 0c9d7b1 e390ccc 4bba8df e390ccc 44d3c68 caa0374 d68fe8b e390ccc c554973 0c08f54 4bba8df 6d39e54 c554973 0c08f54 2926563 af68a82 0c9d7b1 af68a82 0c08f54 4bba8df 0c08f54 4bba8df c554973 e390ccc 0c08f54 3abd99d caa0374 41bc8d2 caa0374 0c08f54 caa0374 0c08f54 fb1a253 8453705 654bf8b 0c08f54 8453705 0c08f54 8453705 654bf8b 0c08f54 654bf8b 0c08f54 654bf8b 0c08f54 4bba8df 04d7b9c 0c08f54 04d7b9c 0c08f54 8453705 bf07f99 65e6711 10307a1 0c08f54 bf07f99 8d3cc6e 0c08f54 3f77878 b1b87fb 0c08f54 3f77878 0c08f54 af77a1c 0a394ee 0c08f54 44d3c68 0c08f54 0a394ee 44d3c68 0a394ee 0c08f54 0a394ee 0c08f54 8027e9b 4bba8df e390ccc 4bba8df 0c08f54 4bba8df 7e0dad9 410b6ef 7e0dad9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 |
import os
import shutil
import glob
import subprocess
from contextlib import contextmanager
import torch
import pandas as pd
import json
from google.oauth2 import service_account
from pandas_gbq import to_gbq
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from interfaces.cap import languages as languages_cap
from interfaces.cap import domains as domains_cap
from interfaces.emotion9 import languages as languages_emotion9
from interfaces.illframes import domains as domains_illframes
from interfaces.cap import build_huggingface_path as hf_cap_path
from interfaces.cap_minor import build_huggingface_path as hf_cap_minor_path
from interfaces.cap_minor_media import build_huggingface_path as hf_cap_minor_media_path
from interfaces.cap_media import build_huggingface_path as hf_cap_media_path
from interfaces.cap_media2 import build_huggingface_path as hf_cap_media2_path
from interfaces.manifesto import build_huggingface_path as hf_manifesto_path
from interfaces.sentiment import build_huggingface_path as hf_sentiment_path
from interfaces.emotion import build_huggingface_path as hf_emotion_path
from interfaces.emotion9 import build_huggingface_path as hf_emotion9_path
from interfaces.ontolisst import build_huggingface_path as hf_ontlisst_path
from interfaces.illframes import build_huggingface_path as hf_illframes_path
from interfaces.ontolisst import build_huggingface_path as hf_ontolisst_path
from huggingface_hub import scan_cache_dir
# Directory that holds the TorchScript-traced model files.
JIT_DIR = "/data/jit_models"
# Hugging Face read token; a missing "hf_read" variable raises KeyError here.
HF_TOKEN = os.environ["hf_read"]

# Hard-coded list of model ids -- meant to be a temporary solution.
# These interfaces are queried with empty placeholder arguments.
models = [
    hf_manifesto_path(""),
    hf_sentiment_path(""),
    hf_emotion_path(""),
    hf_cap_minor_path("", ""),
    hf_cap_minor_path("", "social"),
    hf_ontolisst_path(""),
]

# CAP is more involved: one entry per (language, domain) combination.
domains_cap = list(domains_cap.values())
for language in languages_cap:
    models.extend(hf_cap_path(language, domain) for domain in domains_cap)

# CAP media, CAP media2 and CAP minor media, in that order.
models += [
    hf_cap_media_path("", ""),
    hf_cap_media2_path("", ""),
    hf_cap_minor_media_path("", "", False),
]

# emotion9: one entry per language.
for language in languages_emotion9:
    models.append(hf_emotion9_path(language))

# illframes: `domains` is a dict here, so iterate over its values.
for domain in domains_illframes.values():
    models.append(hf_illframes_path(domain))

tokenizers = ["xlm-roberta-large"]
def download_hf_models():
    """Download every model in ``models`` and save a TorchScript trace of it.

    The traced file for a model id is ``JIT_DIR/<id with "/" -> "_">.pt``.
    When that file already exists, tracing is skipped and
    ``delete_unused_bin_files`` is called for the model instead. Otherwise
    the model is loaded with ``HF_TOKEN``, put into eval mode, traced on a
    short dummy input and written to the traced-model path.
    """
    os.makedirs(JIT_DIR, exist_ok=True)

    # The tracing input does not depend on the model, so the tokenizer is
    # loaded at most once -- and only if some model actually needs tracing.
    dummy_input = None

    for model_id in models:
        print(f"Downloading + JIT tracing model: {model_id}")
        safe_model_name = model_id.replace("/", "_")
        traced_model_path = os.path.join(JIT_DIR, f"{safe_model_name}.pt")

        if os.path.exists(traced_model_path):
            delete_unused_bin_files(model_id)
            print(f"β© Skipping JIT β already exists: {traced_model_path}")
            continue

        print(f"βοΈ Tracing and saving: {traced_model_path}")
        model = AutoModelForSequenceClassification.from_pretrained(
            model_id, token=HF_TOKEN, device_map="auto"
        )
        model.eval()

        if dummy_input is None:
            tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-large")
            # Dummy input for tracing
            dummy_input = tokenizer(
                "Hello, world!",
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=64,
            )

        # JIT trace; strict=False is passed through to torch.jit.trace.
        traced_model = torch.jit.trace(
            model,
            (dummy_input["input_ids"], dummy_input["attention_mask"]),
            strict=False,
        )

        # Save traced model
        traced_model.save(traced_model_path)
        print(f"βοΈ Saved JIT model to: {traced_model_path}")

        # Drop the references before the next model is loaded so that two
        # models are not kept alive at the same time.
        del model, traced_model
def df_h():
    """Print free disk space (``df -H``) and disk usage below ``/data/``.

    Each command is run with its output captured; a heading is printed,
    followed by the command's stdout.
    """
    reports = (
        ("=== Disk Free Space (df -H) ===", ["df", "-H"]),
        (
            "=== Disk Usage for /data/ (du -h --max-depth=2) ===",
            ["du", "-h", "--max-depth=2", "/data/"],
        ),
    )
    for heading, command in reports:
        completed = subprocess.run(command, capture_output=True, text=True)
        print(heading)
        print(completed.stdout)
def delete_unused_bin_files(model_id: str, cache_root: str = "/data"):
    """Delete cached weight files of a model from the Hugging Face cache.

    Removes everything below ``blobs/`` and every ``*.bin`` entry below
    ``snapshots/`` of the model's cache folder. Entries named ``config.json``
    are never deleted. A cache folder that does not exist is a no-op.

    Args:
        model_id: Model name. A bare name is looked up under the
            ``poltextlab`` organisation; an ``org/name`` id is mapped to
            ``models--org--name``.
        cache_root: Directory that contains the ``models--*`` cache folders.
    """
    # NOTE(review): ids with an org prefix are assumed to follow the hub
    # cache naming ("/" replaced by "--") -- confirm against the ids that
    # build_huggingface_path returns.
    if "/" in model_id:
        repo_folder = "models--" + model_id.replace("/", "--")
    else:
        repo_folder = f"models--poltextlab--{model_id}"

    # Escape the fixed part so special characters in it are not treated as
    # glob wildcards.
    target_path = glob.escape(os.path.join(cache_root, repo_folder))

    # everything in blobs/
    blob_paths = glob.glob(f"{target_path}/blobs/**/*", recursive=True)
    # .bin files in snapshots/
    snapshot_bins = glob.glob(f"{target_path}/snapshots/**/*.bin", recursive=True)

    for path in blob_paths + snapshot_bins:
        if os.path.basename(path) == "config.json":
            continue
        # islink() comes first: a snapshot entry may be a symlink whose blob
        # was just removed, and isfile()/isdir() are both False for a
        # dangling link.
        if os.path.islink(path) or os.path.isfile(path):
            print(f"Deleting file: {path}")
            os.remove(path)
        elif os.path.isdir(path):
            print(f"Deleting directory: {path}")
            shutil.rmtree(path)
def delete_http_folders():
    """Remove every directory matching ``/data/http*``.

    Matches that are not directories are left alone.
    """
    for candidate in glob.glob("/data/http*"):
        if not os.path.isdir(candidate):
            continue
        print(f"Deleting: {candidate}")
        shutil.rmtree(candidate)
@contextmanager
def hf_cleanup():
    """Context manager that clears the ``/data/http*`` folders around a block.

    ``delete_http_folders`` runs once on entry and once more on exit; the
    exit call also happens when the wrapped block raises.
    """
    # Clean up before the wrapped block starts.
    delete_http_folders()
    try:
        yield
    finally:
        # Clean up again afterwards, whether or not the block succeeded.
        delete_http_folders()
def scan_cache():
    """Print a size report for the Hugging Face cache and the JIT model dir.

    The Hugging Face cache location is read from ``TRANSFORMERS_CACHE``,
    falling back to ``~/.cache/huggingface/transformers``. The second part
    lists every ``.pt`` file in ``JIT_DIR`` with its size and the total.
    """
    # Hugging Face model cache
    default_cache = os.path.expanduser("~/.cache/huggingface/transformers")
    cache_dir = os.environ.get("TRANSFORMERS_CACHE", default_cache)
    report = scan_cache_dir(cache_dir)

    print("=== π€ Hugging Face Model Cache ===")
    print(f"Cache size: {report.size_on_disk / 1e6:.2f} MB")
    print(f"Number of repos: {len(report.repos)}")
    for repo in report.repos:
        print(f"- {repo.repo_id} ({repo.repo_type}) β {repo.size_on_disk / 1e6:.2f} MB")

    # TorchScript files
    print("\n=== π§ TorchScript JIT Cache ===")
    if not os.path.exists(JIT_DIR):
        print(f"(Directory does not exist: {JIT_DIR})")
        return

    jit_bytes = 0
    for filename in os.listdir(JIT_DIR):
        if not filename.endswith(".pt"):
            continue
        size = os.path.getsize(os.path.join(JIT_DIR, filename))
        jit_bytes += size
        print(f"- {filename}: {size / 1e6:.2f} MB")
    print(f"Total JIT cache size: {jit_bytes / 1e6:.2f} MB")
def set_hf_cache_dir(path: str):
    """Set the four cache-related environment variables to *path*.

    The variables are ``TRANSFORMERS_CACHE``, ``HF_HOME``,
    ``HF_DATASETS_CACHE`` and ``TORCH_HOME``, assigned in that order.
    """
    cache_variables = (
        "TRANSFORMERS_CACHE",
        "HF_HOME",
        "HF_DATASETS_CACHE",
        "TORCH_HOME",
    )
    for variable in cache_variables:
        os.environ[variable] = path
def set_torch_threads():
    """Limit torch to one thread and set the OMP/MKL thread variables to "1"."""
    torch.set_num_threads(1)
    for variable in ("OMP_NUM_THREADS", "MKL_NUM_THREADS"):
        os.environ[variable] = "1"
def is_disk_full(min_free_space_in_GB=10, path="/"):
    """Tell whether the filesystem holding *path* is short on free space.

    Args:
        min_free_space_in_GB: Required free space in GiB (1024**3 bytes).
        path: Any path on the filesystem to check. Defaults to ``"/"``;
            pass e.g. the model directory when it lives on another mount.

    Returns:
        ``True`` when less than ``min_free_space_in_GB`` is free, else
        ``False``.
    """
    free_gb = shutil.disk_usage(path).free / (1024**3)
    # Written as a negated ">=" so that the "enough space" comparison is the
    # one being evaluated.
    return not free_gb >= min_free_space_in_GB
def update_bq_model_table():
    """Replace the BigQuery model table with the current ``models`` list.

    Configuration is read from the environment: ``BQ_DATASET_ID`` and
    ``BQ_TABLE_ID`` (both required), ``BQ_PROJECT_ID`` (passed through as
    ``None`` when unset) and ``GCP_SERVICE_ACCOUNT_JSON`` (service-account
    credentials as a JSON string).

    This is best effort: any failure is caught and reported on stdout, so
    the function never raises.
    """
    try:
        dataset_id = os.environ.get("BQ_DATASET_ID")
        table_id = os.environ.get("BQ_TABLE_ID")  # hf_space_models
        # Fail early: an unset variable would otherwise be formatted into
        # the literal destination table name "None.None".
        missing = [
            name
            for name, value in (
                ("BQ_DATASET_ID", dataset_id),
                ("BQ_TABLE_ID", table_id),
            )
            if not value
        ]
        if missing:
            raise ValueError(
                f"missing environment variable(s): {', '.join(missing)}"
            )
        project_id = os.environ.get("BQ_PROJECT_ID")

        # Load BQ credentials from HF secret
        service_account_info = json.loads(os.environ["GCP_SERVICE_ACCOUNT_JSON"])
        credentials = service_account.Credentials.from_service_account_info(
            service_account_info
        )

        to_gbq(
            pd.DataFrame({"model_id": models}),
            destination_table=f"{dataset_id}.{table_id}",
            project_id=project_id,
            if_exists="replace",
            credentials=credentials,
        )
        # TO-DO: add timestamp column?
        print("Updated BigQuery model table!")
    except Exception as e:
        # Deliberately broad: a failed table update must not take down the
        # caller.
        print(f"BigQuery model table update failed: {e}")
|