import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model():
    # Download the model from HuggingFace
    repo_id = "forestav/gguf_lora_model"
    model_file = "unsloth.F16.gguf"

    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_file
    )

    # Initialize the model
    model = Llama(
        model_path=local_path,
        n_ctx=2048,
        n_threads=8
    )

    return model


def generate_response(message, history):
    # Generate a response. Note: this handler is stateless -- only the
    # latest user message is sent, so `history` is ignored here.
    response = model.create_chat_completion(
        messages=[
            {"role": "user", "content": message}
        ],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )

    return response['choices'][0]['message']['content']


# Load model globally
model = load_model()

# Create Gradio interface with updated parameters
demo = gr.ChatInterface(
    fn=generate_response,
    title="Your GGUF Model Chat",
    description="A conversational AI model using GGUF format",
    examples=["Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"]
)

if __name__ == "__main__":
    demo.launch()
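
# ---------------------------------------------------------------------------
# Optional: history-aware responses. Because generate_response above is
# stateless, the model never sees earlier turns of the conversation. The
# sketch below is one way to forward the running history; it assumes Gradio's
# openai-style history format (ChatInterface(type="messages")), where
# `history` arrives as a list of {"role": ..., "content": ...} dicts. If your
# Gradio version passes (user, assistant) tuples instead, convert them first.
# generate_response_with_history is a hypothetical name, not part of the
# original script; move it above the ChatInterface construction if you wire
# it in.
# ---------------------------------------------------------------------------
def generate_response_with_history(message, history):
    # Prepend prior turns so the model answers with full conversational context
    messages = list(history) + [{"role": "user", "content": message}]
    response = model.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']

# To enable it, build the interface with:
#   demo = gr.ChatInterface(fn=generate_response_with_history, type="messages", ...)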