Spaces: Running on Zero
import gradio as gr
import torch
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
)
MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.split("/")[-1]
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")
# Load the weights in 4-bit (bitsandbytes) with bfloat16 compute so the 11B model fits on a single GPU.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the EOS token as the pad token so generation has a valid pad_token_id.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)
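# Optional sanity check (an addition, not in the original app): report the memory
# footprint of the loaded weights to confirm that 4-bit quantization took effect.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB")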
# On ZeroGPU ("Running on Zero") Spaces the GPU is attached only while a function
# decorated with @spaces.GPU is running, so the CUDA work happens inside it.
@spaces.GPU
def test():
    max_tokens = 5000
    # temperature/top_k/top_p of 0 mean sampling is effectively disabled;
    # generation below falls back to greedy decoding (do_sample=False).
    temperature = 0
    top_k = 0
    top_p = 0
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generation_config.pad_token_id = tokenizer.pad_token_id
    prompt = "Kim jesteś?"  # "Who are you?"
    # "You are a chatbot answering questions in Polish."
    system = "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})
    # add_generation_prompt=True appends the assistant turn header so the model
    # starts generating a reply instead of continuing the user message.
    tokenizer_output = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        return_dict=True,
        add_generation_prompt=True,
    )
    if torch.cuda.is_available():
        model_input_ids = tokenizer_output.input_ids.to(device)
        model_attention_mask = tokenizer_output.attention_mask.to(device)
    else:
        model_input_ids = tokenizer_output.input_ids
        model_attention_mask = tokenizer_output.attention_mask
    outputs = model.generate(
        model_input_ids,
        attention_mask=model_attention_mask,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=True if temperature else False,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # Decode only the newly generated tokens (the prompt is echoed in `outputs`)
    # and drop special tokens so the UI shows just the model's reply.
    answer = tokenizer.decode(
        outputs[0][model_input_ids.shape[-1]:], skip_special_tokens=True
    )
    return answer
demo = gr.Interface(fn=test, inputs=None, outputs=gr.Text())
demo.launch()
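For reference, a minimal sketch (an assumption, not part of the original Space) of the same pipeline with the prompt taken from a Gradio Textbox instead of being hard-coded; it reuses the globally loaded model and tokenizer and would replace the test-only Interface above:

@spaces.GPU
def answer_prompt(prompt: str) -> str:
    # Hypothetical helper: same chat-template + greedy-generation flow as test().
    messages = [
        {"role": "system", "content": "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"},
        {"role": "user", "content": prompt},
    ]
    inputs = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True, add_generation_prompt=True
    ).to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=1024)
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True
    )

demo = gr.Interface(fn=answer_prompt, inputs=gr.Textbox(label="Prompt"), outputs=gr.Text())
demo.launch()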