# Hugging Face Space -- status at capture time: running on Zero (ZeroGPU) hardware.
import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hub repository that hosts the GGUF builds of the model.
MODEL_NAME = "speakleash/Bielik-11B-v2.3-Instruct-GGUF"
# Specific GGUF file inside that repository to load.
MODEL_FILE = "Bielik-11B-v2.3-Instruct.Q4_K_M.gguf"
# ZeroGPU only attaches a CUDA device while a @spaces.GPU-decorated function
# is running, so the function that touches "cuda" must be wrapped.
# NOTE(review): duration=120 is a guess to leave room for loading the
# checkpoint inside the call -- tune against real load times.
@spaces.GPU(duration=120)
def test():
    """Generate a reply from the Bielik model to a fixed Polish greeting.

    Loads the tokenizer and model from the GGUF file ``MODEL_FILE`` in the
    ``MODEL_NAME`` repository, moves the model to the CUDA device, and
    generates up to 128 new tokens for a hard-coded prompt.

    Returns:
        str: The first generated sequence decoded with special tokens
        skipped.
    """
    device = torch.device("cuda")

    # The repository is a GGUF repo, so both the tokenizer and the weights
    # are read from the GGUF file via transformers' ``gguf_file`` argument.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, gguf_file=MODEL_FILE)

    # The previous keyword arguments (model_file, model_type, gpu_layers, hf)
    # belong to the ctransformers loader, not to transformers'
    # AutoModelForCausalLM.from_pretrained; ``gguf_file`` is the transformers
    # equivalent.
    # NOTE(review): transformers dequantizes GGUF weights on load; float16 is
    # requested here to limit memory use -- confirm it suits the target GPU.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        gguf_file=MODEL_FILE,
        torch_dtype=torch.float16,
    ).to(device)

    inputs = tokenizer(
        "Cześć Bielik, jak się masz?", return_tensors="pt"
    ).to(device)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=128,
            # Reuse the EOS id as the padding id for generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Input-less Gradio interface: triggering it calls test() and shows the
# returned string in a text output component.
demo = gr.Interface(fn=test, inputs=None, outputs=gr.Text())
demo.launch()