Spaces:
Running
on
Zero
Running
on
Zero
File size: 2,229 Bytes
daeec09 36704dc 524b722 daeec09 524b722 16d3aa3 524b722 16d3aa3 ff9698f daeec09 16d3aa3 daeec09 a2ced42 daeec09 ff9698f daeec09 524b722 36704dc daeec09 ff9698f daeec09 524b722 daeec09 524b722 5f24a61 daeec09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import gradio as gr
import torch
import spaces
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
TextStreamer,
)
# Hub repository id of the checkpoint served by this app.
MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
# Repository id without the organisation prefix.
MODEL_NAME = MODEL_ID.rsplit("/", 1)[-1]

# Pick the compute device once at import time and report the choice.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU.")

# 4-bit weight quantization with bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# The EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

# Load the quantized causal LM once, at module import.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)
@spaces.GPU
def test():
    """Generate a reply to a fixed Polish prompt with the module-level model.

    Builds a two-message chat (system + user), renders it with the
    tokenizer's chat template, runs ``model.generate`` while streaming the
    new tokens through a ``TextStreamer``, and decodes the result.

    Returns:
        The list returned by ``tokenizer.batch_decode`` for the raw
        ``generate`` output, decoded with special tokens kept.
    """
    max_tokens = 5000
    temperature = 0
    top_k = 0
    top_p = 0

    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    prompt = "Kim jesteś?"
    # NOTE(review): "chatboem" looks like a typo for "chatbotem"; left as-is
    # because this string is sent to the model verbatim.
    system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    # add_generation_prompt=True appends the assistant-turn header so the
    # model answers the user instead of continuing the user's message.
    tokenizer_output = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )

    # `device` is already the CPU device when CUDA is unavailable, so the
    # transfer can be unconditional.
    model_input_ids = tokenizer_output.input_ids.to(device)
    model_attention_mask = tokenizer_output.attention_mask.to(device)

    do_sample = bool(temperature)
    generation_kwargs = {
        "attention_mask": model_attention_mask,
        "streamer": streamer,
        "max_new_tokens": max_tokens,
        "do_sample": do_sample,
    }
    if do_sample:
        # Sampling knobs only apply in sampling mode; passing them alongside
        # do_sample=False triggers generation-config validation warnings.
        generation_kwargs.update(
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )

    outputs = model.generate(model_input_ids, **generation_kwargs)

    answer = tokenizer.batch_decode(outputs, skip_special_tokens=False)
    return answer
# Minimal Gradio UI: no input widgets, a single text box showing what
# `test` returns.
demo = gr.Interface(
    outputs=gr.Text(),
    inputs=None,
    fn=test,
)
# Start the web server at module level.
demo.launch()
|