Gumelar Teja Sukma committed on
Commit fe79cb6
1 Parent(s): 56f7ef6
Files changed (2)
  1. app.py +11 -6
  2. requirements.txt +2 -1
app.py CHANGED
@@ -2,13 +2,18 @@ import torch
 import gradio as gr
 from transformers import AutoTokenizer
 from auto_gptq import AutoGPTQForCausalLM
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid extra CPU load
 
 # Load model & tokenizer
 # model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
 print("PyTorch Version",torch.__version__) # PyTorch version
 print("Is GPU Available",torch.cuda.is_available()) # Is a GPU detected?
+print("CPU cores:", psutil.cpu_count())
+print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
 
-model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
+# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
+model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
 # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 
@@ -17,13 +22,13 @@ model = AutoGPTQForCausalLM.from_quantized(
     model_basename="model",
     # device_map="auto", # Auto-detects GPU/CPU
     device_map="cpu",
-    torch_dtype=torch.float32, # <-- Add this
+    torch_dtype=torch.float32, # Avoid float16 on CPU
     use_safetensors=True,
     trust_remote_code=True,
     use_triton=False,
-    inject_fused_attention=False,
+    inject_fused_attention=False, # Required for CPU
     inject_fused_mlp=False,
-    disable_exllama=True,
+    disable_exllama=True, # Required for CPU
     disable_exllamav2=True,
 )
 
@@ -56,10 +61,10 @@ def chat(user_input, chat_history):
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids=input_ids,
-            max_new_tokens=50,
+            max_new_tokens=32,
             use_cache=True,
             # do_sample=True,
-            do_sample=False,
+            do_sample=False, # Turn off sampling to speed things up
             temperature=0.7,
             top_p=0.95
         )
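For reference, below is a minimal sketch of the CPU-only load-and-generate path that this commit converges on, assembled from the hunks above; it is not the full app. The prompt string and the final decode/print line are illustrative additions, and `import psutil` is included here because the new print statements call psutil without the diff adding an import. The Gradio `chat()` wiring and the rest of app.py are untouched by this commit and omitted.

import os
import psutil  # assumption: needed for the CPU/RAM prints; the diff itself does not add this import
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# Keep tokenizer threads from competing for the CPU the model runs on
os.environ["TOKENIZERS_PARALLELISM"] = "false"

print("PyTorch Version", torch.__version__)
print("Is GPU Available", torch.cuda.is_available())
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))

model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# CPU-only GPTQ load: float32 weights, no fused kernels, no exllama backends
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    device_map="cpu",
    torch_dtype=torch.float32,      # avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,   # required for CPU
    inject_fused_mlp=False,
    disable_exllama=True,           # required for CPU
    disable_exllamav2=True,
)

prompt = "Hello, how are you?"  # illustrative prompt, not from the commit
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.inference_mode():
    output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=32,   # keep generations short on CPU
        use_cache=True,
        do_sample=False,     # greedy decoding; temperature/top_p are ignored when sampling is off
    )

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))  # illustrative output step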
requirements.txt CHANGED
@@ -1,7 +1,8 @@
 torch>=2.0.0
 gradio>=3.0.0
 transformers>=4.30.0
-auto-gptq>=0.4.0 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cpu/
+auto-gptq>=0.4.0 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cpu
+# auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118
 ninja>=1.11.0
 accelerate>=0.20.0
 bitsandbytes>=0.40.0
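The only substantive change to requirements.txt is dropping the trailing slash from the CPU wheel index URL, with the CUDA 11.8 index kept as a commented-out alternative. The file is consumed the usual way, e.g. pip install -r requirements.txt. If a given pip version rejects the inline --extra-index-url option on the requirement line, an equivalent layout (an assumption about your pip version, not part of this commit) puts the index option on its own line:

--extra-index-url https://huggingface.github.io/autogptq-index/whl/cpu
auto-gptq>=0.4.0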