Spaces: Running on Zero
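
# Gradio app that serves the speakleash/Bielik-11B-v2.3-Instruct model as a
# streaming Polish-language chatbot, loaded in 4-bit for ZeroGPU.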
import gradio as gr
import torch
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
from threading import Thread
MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.split("/")[-1]
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)
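
# Reuse the EOS token for padding (the tokenizer may not define a dedicated pad token).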
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)
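
# On ZeroGPU Spaces, the @spaces.GPU decorator requests a GPU for the duration
# of each call to this function.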
@spaces.GPU
def generate(
    prompt,
    temperature,
    max_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
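    # Stream decoded tokens as they are produced instead of waiting for the
    # full completion; the prompt and special tokens are dropped from the output.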
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
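    # System prompt (Polish): "You are a chatbot answering questions in Polish."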
system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
messages = []
if system:
messages.append({"role": "system", "content": system})
messages.append({"role": "user", "content": prompt})
    tokenizer_output = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
    )
    if torch.cuda.is_available():
        model_input_ids = tokenizer_output.input_ids.to(device)
        model_attention_mask = tokenizer_output.attention_mask.to(device)
    else:
        model_input_ids = tokenizer_output.input_ids
        model_attention_mask = tokenizer_output.attention_mask
    generate_kwargs = {
        "input_ids": model_input_ids,
        "attention_mask": model_attention_mask,
        "streamer": streamer,
        "do_sample": temperature > 0,
        "temperature": temperature,
        "max_new_tokens": max_tokens,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "top_p": top_p,
    }
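    # Run generation in a background thread; the streamer yields decoded
    # tokens here in the main thread as they arrive.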
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
    partial_response = ""
    for new_token in streamer:
        partial_response += new_token
        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
            break
        yield partial_response
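
# Expose the generator through a simple Gradio form: one textbox in, streamed
# text out, with the sampling parameters exposed as sliders.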
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Textbox(label="Your question", placeholder="Type your question here..."),
        gr.Slider(0, 1, 0.6, label="Temperature"),
        gr.Slider(128, 4096, 1024, label="Max new tokens"),
        gr.Slider(1, 80, 40, step=1, label="Top K sampling"),
        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
        gr.Slider(0, 1, 0.95, label="Top P sampling"),
    ],
    outputs=gr.Textbox(label="Answer", lines=5),
    title="Polish Chatbot",
    description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model",
)
demo.launch()