"""Gradio demo for speakleash/Bielik-11B-v2.3-Instruct.

Loads the model with 4-bit quantization and streams answers to questions
asked in Polish.
"""

import gradio as gr
import torch
import spaces
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TextIteratorStreamer,
)
from threading import Thread

MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
MODEL_NAME = MODEL_ID.split("/")[-1]

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Quantize to 4-bit so the 11B-parameter model fits in a single GPU's memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse the EOS token for padding, since the tokenizer defines no pad token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)


@spaces.GPU
def generate(
    prompt,
    temperature,
    max_tokens,
    top_k,
    repetition_penalty,
    top_p,
):
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # System prompt (Polish): "You are a chatbot answering questions in Polish."
    system = "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"

    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    tokenizer_output = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True
    )

    if torch.cuda.is_available():
        model_input_ids = tokenizer_output.input_ids.to(device)
        model_attention_mask = tokenizer_output.attention_mask.to(device)
    else:
        model_input_ids = tokenizer_output.input_ids
        model_attention_mask = tokenizer_output.attention_mask

    generate_kwargs = {
        "input_ids": model_input_ids,
        "attention_mask": model_attention_mask,
        "streamer": streamer,
        "do_sample": temperature > 0,  # greedy decoding when temperature is 0
        "temperature": temperature,
        "max_new_tokens": max_tokens,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "top_p": top_p,
    }

    # Run generation in a background thread so tokens can be yielded as they arrive.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_response = ""
    for new_token in streamer:
        partial_response += new_token
        # Stop early if an end-of-turn marker slips into the decoded text.
        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
            break
        # Strip leading whitespace and newlines
        cleaned_response = partial_response.lstrip()
        yield cleaned_response


def clear():
    # Reset both the question and the answer textboxes.
    return "", ""


# The UI copy is in Polish, matching the model's target audience.
with gr.Blocks() as demo:
    gr.Markdown("# Bielik Tools - narzędzia dla modelu Bielik v2.3")
    gr.Markdown("Bielik czeka na Twoje pytanie - zadaj je śmiało i otrzymaj odpowiedź!")
    with gr.Row():
        prompt = gr.Textbox(
            label="Twoje pytanie", placeholder="Zadaj swoje pytanie tutaj...", lines=10
        )
        output = gr.Textbox(label="Odpowiedź", lines=10)
    with gr.Row():
        btn = gr.Button("Generuj odpowiedź")
        clear_btn = gr.Button("Wyczyść")
    with gr.Accordion("⚙️ Parametry", open=False):
        temperature = gr.Slider(0, 1, 0.3, step=0.1, label="Temperatura")
        max_tokens = gr.Slider(128, 4096, 1024, label="Maksymalna długość odpowiedzi")
        top_k = gr.Slider(1, 80, 40, step=1, label="Top K")
        repetition_penalty = gr.Slider(
            0, 2, 1.1, step=0.1, label="Penalizacja powtórzeń"
        )
        top_p = gr.Slider(0, 1, 0.95, step=0.05, label="Top P")

    btn.click(
        generate,
        inputs=[prompt, temperature, max_tokens, top_k, repetition_penalty, top_p],
        outputs=output,
    )
    clear_btn.click(clear, inputs=[], outputs=[prompt, output])

demo.launch()