import torch
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Hindari beban CPU

# Load model & tokenizer
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
print("PyTorch Version",torch.__version__)  # Versi PyTorch
print("Is GPU Available",torch.cuda.is_available())  # Apakah GPU terdeteksi?
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))

# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",  # Auto-detects GPU/CPU
    device_map="cpu",
    torch_dtype=torch.float32,  # Hindari float16 di CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False, # Wajib untuk CPU
    inject_fused_mlp=False,
    disable_exllama=True, # Wajib untuk CPU
    disable_exllamav2=True,
)

# Prompt template
SYSTEM_PROMPT = "<<SYS>>\nKamu adalah asisten AI yang santuy dan suka ngoding.\n<</SYS>>\n\n"

# def build_prompt(history):
#     prompt = f"<s>[INST] {SYSTEM_PROMPT}{history[-1][0]} [/INST]"
#     return prompt
def build_prompt(chat_history):
    prompt = "<s>[INST] <<SYS>>\nKamu adalah asisten AI yang santuy dan suka ngoding.\n<</SYS>>\n\n"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt

def chat(user_input, chat_history):
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)
    if not chat_history:
        chat_history = []
    
    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids, 
            max_new_tokens=32,
            use_cache=True,
            # do_sample=True,
            do_sample=False, # Matikan sampling untuk percepat
            temperature=0.7,
            top_p=0.95
        )
    
    # output_ids = model.generate(
    #     input_ids=input_ids, 
    #     max_new_tokens=128,
    #     use_cache=True,
    #     do_sample=True,
    #     temperature=0.7,
    #     top_p=0.95
    # )
    print("🛠️ DEBUG - output_ids:", output_ids)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print("🛠️ DEBUG - Response:", response)  # Akan muncul di terminal/logs
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history

# def chat(user_input, chat_history):
#     if not chat_history:
#         chat_history = []

#     prompt = build_prompt(chat_history + [[user_input, ""]])
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

#     output_ids = model.generate(
#         input_ids=input_ids,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )

#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     response = generated_text.split("[/INST]")[-1].strip()
#     chat_history.append([user_input, response])
#     return chat_history, chat_history

# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### 🤖 Ujang v3 - LLaMA 2 Chatbot GPTQ")
    # chatbot = gr.Chatbot()
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Ketik pesan:")
    clear = gr.Button("🧹 Bersihkan")
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch()