import torch
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import os
import psutil  # used below for the CPU/RAM diagnostics

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid extra tokenizer threads on CPU
# Load model & tokenizer
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
print("PyTorch Version",torch.__version__) # Versi PyTorch
print("Is GPU Available",torch.cuda.is_available()) # Apakah GPU terdeteksi?
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",  # auto-detects GPU/CPU
    device_map="cpu",
    torch_dtype=torch.float32,  # avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,  # required for CPU
    inject_fused_mlp=False,
    disable_exllama=True,  # required for CPU
    disable_exllamav2=True,
)
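# Optional smoke test before wiring up the UI (a minimal sketch; CPU generation
# on a 7B GPTQ model is slow, so keep max_new_tokens tiny):
# test_ids = tokenizer("Hello", return_tensors="pt").input_ids
# print(tokenizer.decode(model.generate(input_ids=test_ids, max_new_tokens=8)[0]))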
# Prompt template (Llama-2 chat style: [INST] ... [/INST] around a <<SYS>> block)
SYSTEM_PROMPT = "<<SYS>>\nYou are a laid-back AI assistant who loves coding.\n<</SYS>>\n\n"
def build_prompt(chat_history):
    # Llama-2 chat format: each user turn sits inside [INST] ... [/INST];
    # assistant turns follow as plain text, closed with </s><s> before the next [INST].
    prompt = f"[INST] {SYSTEM_PROMPT}"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt
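# Example: build_prompt([{"role": "user", "content": "Hi"}]) returns
# '[INST] <<SYS>>\nYou are a laid-back AI assistant who loves coding.\n<</SYS>>\n\nHi [/INST] '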
def chat(user_input, chat_history):
    print("🛠️ DEBUG - input:", user_input)
    print("🛠️ DEBUG - history:", chat_history)
    if not chat_history:
        chat_history = []
    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=32,
            use_cache=True,
            # do_sample=True, temperature=0.7, top_p=0.95,  # sampling variant
            do_sample=False,  # greedy decoding; faster on CPU
        )
print("๐ ๏ธ DEBUG - output_ids:", output_ids)
response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
print("๐ ๏ธ DEBUG - Response:", response) # Akan muncul di terminal/logs
chat_history.append({"role": "assistant", "content": response})
return chat_history, chat_history
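# Possible extension (a sketch, not wired in): stream tokens to the UI with
# transformers' TextIteratorStreamer instead of waiting for the full reply.
# from threading import Thread
# from transformers import TextIteratorStreamer
# streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
# Thread(target=model.generate,
#        kwargs=dict(input_ids=input_ids, max_new_tokens=32, streamer=streamer)).start()
# for token_text in streamer:
#     ...  # append token_text to the chat window incrementally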
# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### 🤖 Ujang v3 - Mistral-7B GPTQ Chatbot")
    # chatbot = gr.Chatbot()
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Type a message:")
    clear = gr.Button("🧹 Clear")
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch()
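# To expose the app beyond localhost (assumption: LAN/public access is wanted):
# demo.launch(server_name="0.0.0.0", share=True)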