"""Gradio chat UI around a GPTQ-quantized causal language model running on CPU.

The script loads the tokenizer and quantized model once at import time,
exposes ``build_prompt`` / ``chat`` as the conversation logic, and wires them
into a small ``gr.Blocks`` interface.
"""

import os

import gradio as gr
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

try:
    # Optional: only used for the start-up diagnostics printed below.
    import psutil
except ImportError:  # fall back to the standard library
    psutil = None

# Avoid extra CPU load from tokenizer worker threads.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Checkpoint to serve. Previously tried alternatives:
#   "TheBloke/Llama-2-7B-Chat-GPTQ", "TheBloke/Llama-2-7B-Chat-GGUF"
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"

# Everything (weights and input tensors) is placed on this device.
DEVICE = "cpu"

# Upper bound on generated tokens per reply; kept small because CPU
# generation is slow.
MAX_NEW_TOKENS = 32

# System prompt placed at the start of every conversation.
# NOTE(review): the bare "<>" delimiters look like "<<SYS>>" / "<</SYS>>"
# tags whose contents were stripped at some point -- confirm the intended
# template before changing them; they are reproduced unchanged here.
SYSTEM_PROMPT = "<>\nKamu adalah asisten AI yang santuy dan suka ngoding.\n<>\n\n"


def _total_ram_gb():
    """Return total physical RAM in GiB, or ``None`` if it cannot be read."""
    if psutil is not None:
        return psutil.virtual_memory().total / (1024**3)
    try:
        page_size = os.sysconf("SC_PAGE_SIZE")
        page_count = os.sysconf("SC_PHYS_PAGES")
    except (AttributeError, ValueError, OSError):
        # os.sysconf is missing on some platforms, or the keys are unsupported.
        return None
    return page_size * page_count / (1024**3)


def _print_environment():
    """Print PyTorch / hardware diagnostics to stdout at start-up."""
    cpu_cores = psutil.cpu_count() if psutil is not None else os.cpu_count()
    ram_gb = _total_ram_gb()
    print("PyTorch Version", torch.__version__)
    print("Is GPU Available", torch.cuda.is_available())
    print("CPU cores:", cpu_cores)
    print("RAM (GB):", ram_gb if ram_gb is not None else "unknown")


_print_environment()

# Load tokenizer and quantized model once; both are shared by every request.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    device_map=DEVICE,
    torch_dtype=torch.float32,  # avoid float16 on CPU
    use_safetensors=True,
    # NOTE(review): this allows code shipped with the checkpoint to execute
    # locally -- confirm it is actually required for this model.
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,  # required for CPU
    inject_fused_mlp=False,
    disable_exllama=True,  # required for CPU
    disable_exllamav2=True,
)


def build_prompt(chat_history):
    """Render a conversation into a single ``[INST]``-style prompt string.

    Args:
        chat_history: Sequence of ``{"role": ..., "content": ...}`` dicts in
            chronological order.

    Returns:
        ``"[INST] "`` + ``SYSTEM_PROMPT`` followed by every message; messages
        with role ``"user"`` are closed with ``" [/INST] "`` and all other
        messages with ``" [INST] "`` (which opens the next user turn).
    """
    parts = ["[INST] ", SYSTEM_PROMPT]
    for message in chat_history:
        closing_tag = " [/INST] " if message["role"] == "user" else " [INST] "
        parts.append(f"{message['content']}{closing_tag}")
    return "".join(parts)


def chat(user_input, chat_history):
    """Generate the assistant reply for ``user_input`` and extend the history.

    Args:
        user_input: Text submitted from the textbox.
        chat_history: Prior messages as ``{"role", "content"}`` dicts; may be
            ``None`` or empty for a fresh conversation.

    Returns:
        A ``(chatbot_messages, state)`` pair. Both elements are the same
        updated history list, matching the two Gradio outputs it feeds.
    """
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)

    # Work on a copy so the caller's list is never mutated in place.
    chat_history = list(chat_history or [])

    # Nothing to answer: skip the (slow) generation entirely.
    if not user_input or not user_input.strip():
        return chat_history, chat_history

    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - prompt:", prompt)

    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(DEVICE)
    print("🛠️ DEBUG - input_ids:", input_ids)

    with torch.inference_mode():
        # Greedy decoding (sampling disabled for speed). Sampling-only knobs
        # such as temperature / top_p are intentionally not passed: they have
        # no effect without sampling and only trigger warnings.
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            use_cache=True,
            do_sample=False,
        )
    print("🛠️ DEBUG - output_ids:", output_ids)

    # The returned sequence starts with the prompt tokens. Decode only what
    # was newly generated instead of splitting the full text on "[/INST]",
    # which breaks when a message itself contains that marker.
    new_token_ids = output_ids[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()
    print("🛠️ DEBUG - Response:", response)  # shows up in the terminal/logs

    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history


# Gradio UI. `demo` stays at module level so external launchers can find it.
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    # NOTE(review): the heading says "LLaMA 2" while the configured checkpoint
    # is a Mistral model -- confirm which one is intended.
    gr.Markdown("### 🤖 Ujang v3 - LLaMA 2 Chatbot GPTQ")
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Ketik pesan:")
    clear = gr.Button("🧹 Bersihkan")
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    # Reset both the visible chat and the stored history.
    clear.click(lambda: ([], []), None, [chatbot, state])


if __name__ == "__main__":
    demo.launch()