import gradio as gr
import torch
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextStreamer,
)

MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.split("/")[-1]

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Load the model weights in 4-bit (bitsandbytes) with bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)


@spaces.GPU
def test():
    max_tokens = 5000
    temperature = 0
    top_k = 0
    top_p = 0

    # Stream generated tokens to stdout as they are produced.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    model.generation_config.pad_token_id = tokenizer.pad_token_id

    prompt = "Kim jesteś?"  # "Who are you?"
    system = (
        "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"
    )  # "You are a chatbot answering questions in Polish"

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    # Build the model input from the chat template.
    tokenizer_output = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True
    )
    if torch.cuda.is_available():
        model_input_ids = tokenizer_output.input_ids.to(device)
        model_attention_mask = tokenizer_output.attention_mask.to(device)
    else:
        model_input_ids = tokenizer_output.input_ids
        model_attention_mask = tokenizer_output.attention_mask

    generate_kwargs = dict(
        attention_mask=model_attention_mask,
        streamer=streamer,
        max_new_tokens=max_tokens,
        do_sample=bool(temperature),
    )
    # Only pass sampling parameters when sampling is enabled; sending
    # temperature=0 / top_k=0 / top_p=0 alongside greedy decoding triggers
    # transformers warnings.
    if temperature:
        generate_kwargs.update(temperature=temperature, top_k=top_k, top_p=top_p)

    outputs = model.generate(model_input_ids, **generate_kwargs)

    # Decode only the newly generated tokens (the prompt is excluded) and
    # return a plain string for the Gradio text output.
    generated_tokens = outputs[0][model_input_ids.shape[-1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return answer


demo = gr.Interface(fn=test, inputs=None, outputs=gr.Text())
demo.launch()