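# Residue from the hosting site's file viewer (not part of the program):
# avatar "janbanot's picture", commit "fix: refactor" (16d3aa3),
# links "raw" / "history blame", file size 2.23 kB.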
import gradio as gr
import torch
import spaces
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextStreamer,
)
# Hub repository id of the model; MODEL_NAME is the part after the last "/".
MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Select the torch device: CUDA when torch reports it as available, else CPU,
# and print which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU.")
# Load the weights in 4-bit precision, with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    load_in_4bit=True,
)

# The end-of-sequence token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal LM for MODEL_ID with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
)
@spaces.GPU
def test():
    """Generate the model's reply to a fixed Polish prompt.

    Builds a system + user conversation, renders it with the tokenizer's chat
    template (including the assistant generation prompt), runs
    ``model.generate`` with a ``TextStreamer`` attached, and decodes the
    returned sequences.

    Returns:
        list[str]: result of ``tokenizer.batch_decode`` on the generated
        sequences, with special tokens kept (``skip_special_tokens=False``).
    """
    max_tokens = 5000
    temperature = 0
    top_k = 0
    top_p = 0

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    prompt = "Kim jesteś?"
    # NOTE(review): "chatboem" looks like a typo for "chatbotem"; left
    # unchanged here because the text is part of the prompt sent to the model.
    system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True makes the rendered prompt end with the
    # assistant-turn header, so the model starts writing the answer directly.
    tokenizer_output = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    # .to(device) is valid for a CPU device as well, so no CUDA branch is needed.
    model_input_ids = tokenizer_output.input_ids.to(device)
    model_attention_mask = tokenizer_output.attention_mask.to(device)

    # Sampling knobs are only meaningful when sampling; with greedy decoding
    # (temperature == 0) they are omitted instead of being passed as zeros.
    do_sample = bool(temperature)
    sampling_kwargs = {}
    if do_sample:
        sampling_kwargs = {
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
        }

    outputs = model.generate(
        model_input_ids,
        attention_mask=model_attention_mask,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=do_sample,
        **sampling_kwargs,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=False)
# Gradio interface with no input components; test() is the handler and its
# return value is shown in a text component. Launched immediately.
demo = gr.Interface(
    fn=test,
    inputs=None,
    outputs=gr.Text(),
)
demo.launch()