import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


def load_model():
    # Download the model from HuggingFace
    repo_id = "forestav/gguf_lora_model"
    model_file = "unsloth.F16.gguf"

    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=model_file
    )

    # Initialize the model
    model = Llama(
        model_path=local_path,
        n_ctx=2048,
        n_threads=8
    )

    return model


def generate_response(message, history):
    # Generate a response. Note: this handler is stateless -- only the
    # latest user message is sent, so `history` is ignored here.
    response = model.create_chat_completion(
        messages=[
            {"role": "user", "content": message}
        ],
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )

    return response['choices'][0]['message']['content']


# Load model globally
model = load_model()

# Create Gradio interface with updated parameters
demo = gr.ChatInterface(
    fn=generate_response,
    title="Your GGUF Model Chat",
    description="A conversational AI model using GGUF format",
    examples=["Continue the fibonacci sequence: 1, 1, 2, 3, 5, 8,"]
)

if __name__ == "__main__":
    demo.launch()
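
# ---------------------------------------------------------------------------
# Optional: history-aware responses. Because generate_response above is
# stateless, the model never sees earlier turns of the conversation. The
# sketch below is one way to forward the running history; it assumes Gradio's
# openai-style history format (ChatInterface(type="messages")), where
# `history` arrives as a list of {"role": ..., "content": ...} dicts. If your
# Gradio version passes (user, assistant) tuples instead, convert them first.
# generate_response_with_history is a hypothetical name, not part of the
# original script; move it above the ChatInterface construction if you wire
# it in.
# ---------------------------------------------------------------------------
def generate_response_with_history(message, history):
    # Prepend prior turns so the model answers with full conversational context
    messages = list(history) + [{"role": "user", "content": message}]
    response = model.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.95,
    )
    return response['choices'][0]['message']['content']

# To enable it, build the interface with:
#   demo = gr.ChatInterface(fn=generate_response_with_history, type="messages", ...)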