import gradio as gr
import torch
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
from threading import Thread

MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.split("/")[-1]

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Quantize to 4-bit so the 11B model fits in GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)


@spaces.GPU
def generate(
    prompt,
    temperature,
    max_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    # Stream tokens back to the UI as soon as they are produced
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )
    # System prompt (Polish): "You are a chatbot answering questions in Polish"
    system = "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    # Append the generation prompt so the model answers as the assistant
    tokenizer_output = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    )
    if torch.cuda.is_available():
        model_input_ids = tokenizer_output.input_ids.to(device)
        model_attention_mask = tokenizer_output.attention_mask.to(device)
    else:
        model_input_ids = tokenizer_output.input_ids
        model_attention_mask = tokenizer_output.attention_mask

    generate_kwargs = {
        "input_ids": model_input_ids,
        "attention_mask": model_attention_mask,
        "streamer": streamer,
        "do_sample": temperature > 0,
        "temperature": temperature,
        "max_new_tokens": max_tokens,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "top_p": top_p,
    }

    # Run generation in a background thread and stream partial output as it arrives
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_response = ""
    for new_token in streamer:
        partial_response += new_token
        # Stop once the model emits an end-of-turn or end-of-text marker
        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
            break
        yield partial_response


with gr.Blocks() as demo:
    gr.Markdown("# Bielik Tools - narzędzia dla modelu Bielik v2.3")
    gr.Markdown("Bielik czeka na Twoje pytanie - zadaj je śmiało i otrzymaj odpowiedź!")
    with gr.Row():
        prompt = gr.Textbox(
            label="Twoje pytanie", placeholder="Zadaj swoje pytanie tutaj...", lines=10
        )
        output = gr.Textbox(label="Answer", lines=10)
    btn = gr.Button("Generuj odpowiedź")
    with gr.Accordion("⚙️ Parametry", open=False):
        temperature = gr.Slider(0, 1, 0.3, step=0.1, label="Temperatura")
        max_tokens = gr.Slider(128, 4096, 1024, label="Maksymalna długość odpowiedzi")
        top_k = gr.Slider(1, 80, 40, step=1, label="Top K")
        # Default to 1.0 (no penalty); transformers rejects penalties <= 0
        repetition_penalty = gr.Slider(1, 2, 1, step=0.1, label="Penalizacja powtórzeń")
        top_p = gr.Slider(0, 1, 1, step=0.1, label="Top P")

    btn.click(
        generate,
        inputs=[prompt, temperature, max_tokens, top_k, repetition_penalty, top_p],
        outputs=output,
    )

demo.launch()