Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
@@ -4,7 +4,12 @@ import spaces
|
|
4 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
|
5 |
import os
|
6 |
from threading import Thread
|
|
|
7 |
|
|
|
|
|
|
|
|
|
8 |
MODEL_LIST = ["THUDM/GLM-4-Z1-32B-0414"]
|
9 |
|
10 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
@@ -46,7 +51,8 @@ def stream_chat(message, history: list, temperature: float, max_length: int, top
|
|
46 |
low_cpu_mem_usage=True,
|
47 |
trust_remote_code=True,
|
48 |
quantization_config=quantization_config,
|
49 |
-
device_map="auto"
|
|
|
50 |
)
|
51 |
|
52 |
print(f'message is - {message}')
|
|
|
4 |
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
|
5 |
import os
|
6 |
from threading import Thread
|
7 |
+
from accelerate import init_empty_weights
|
8 |
|
9 |
+
max_memory = {
|
10 |
+
0: "40GiB", # بسته به VRAM کارت گرافیکت، مثلاً اگه 8 گیگ داری، بذار 6 یا 5
|
11 |
+
"cpu": "32GiB", # بسته به RAM سیستمت
|
12 |
+
}
|
13 |
MODEL_LIST = ["THUDM/GLM-4-Z1-32B-0414"]
|
14 |
|
15 |
HF_TOKEN = os.environ.get("HF_TOKEN", None)
|
|
|
51 |
low_cpu_mem_usage=True,
|
52 |
trust_remote_code=True,
|
53 |
quantization_config=quantization_config,
|
54 |
+
device_map="auto",
|
55 |
+
max_memory=max_memory,
|
56 |
)
|
57 |
|
58 |
print(f'message is - {message}')
|