Spaces: Running on Zero
fix: refactor + interface change
app.py CHANGED
@@ -20,8 +20,8 @@ else:
     print("CUDA is not available. Using CPU.")
 
 quantization_config = BitsAndBytesConfig(
-
-
+    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
+)
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 tokenizer.pad_token = tokenizer.eos_token
 model = AutoModelForCausalLM.from_pretrained(
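For context, a minimal sketch of how this 4-bit setup plugs into the loading code around the hunk. The repo id and the from_pretrained arguments are assumptions, since the diff does not show them:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Assumption: repo id inferred from the Space description, not visible in the diff.
MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"

# The config added by this commit: 4-bit weights with bfloat16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quantization_config,  # assumption: how the config is passed
    device_map="auto",                        # assumption: not shown in the diff
)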
@@ -33,13 +33,17 @@ model = AutoModelForCausalLM.from_pretrained(
 
 
 @spaces.GPU
-def
-
-    temperature
-
-
-
-
+def generate(
+    prompt,
+    temperature,
+    max_tokens,
+    top_k,
+    repetition_penalty,
+    top_p,
+):
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
     system = "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"
     messages = []
 
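The system prompt translates from Polish as "You are a chatbot answering questions in Polish". Note that TextIteratorStreamer only yields text while model.generate runs concurrently; the thread launch sits outside this hunk, so the following is a sketch under that assumption (generation_kwargs is the dict assembled in the later hunks):

from threading import Thread

# Run generation in the background; the streamer then yields decoded
# text fragments on the main thread as soon as they are produced.
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()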
@@ -54,9 +58,7 @@ def test(prompt):
 
     if torch.cuda.is_available():
         model_input_ids = tokenizer_output.input_ids.to(device)
-
         model_attention_mask = tokenizer_output.attention_mask.to(device)
-
     else:
         model_input_ids = tokenizer_output.input_ids
         model_attention_mask = tokenizer_output.attention_mask
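An aside, not part of the commit: the branch above could be collapsed, because BatchEncoding.to() moves all tensors at once and moving a CPU tensor to "cpu" returns it unchanged. A branch-free equivalent would be:

# Illustration only; the commit keeps the explicit if/else.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer_output = tokenizer_output.to(device)
model_input_ids = tokenizer_output.input_ids
model_attention_mask = tokenizer_output.attention_mask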
@@ -65,10 +67,11 @@ def test(prompt):
         "input_ids": model_input_ids,
         "attention_mask": model_attention_mask,
         "streamer": streamer,
-        "max_new_tokens": max_tokens,
         "do_sample": True if temperature else False,
         "temperature": temperature,
+        "max_new_tokens": max_tokens,
         "top_k": top_k,
+        "repetition_penalty": repetition_penalty,
         "top_p": top_p,
     }
 
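The "do_sample": True if temperature else False entry means a Temperature slider value of 0 switches generation to greedy decoding, and transformers then ignores the sampling knobs (temperature, top_k, top_p), typically with a warning. An equivalent, slightly tighter spelling, shown purely as an illustration:

# Same truthiness test as the ternary above; 0 or 0.0 disables sampling.
generation_kwargs["do_sample"] = bool(temperature)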
@@ -78,17 +81,23 @@ def test(prompt):
     partial_response = ""
     for new_token in streamer:
         partial_response += new_token
-        # Stop if we hit any of the special tokens
         if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
             break
         yield partial_response
 
 
 demo = gr.Interface(
-    fn=
-    inputs=
+    fn=generate,
+    inputs=[
+        gr.Textbox(label="Your question", placeholder="Type your question here..."),
+        gr.Slider(0, 1, 0.6, label="Temperature"),
+        gr.Slider(128, 4096, 1024, label="Max new tokens"),
+        gr.Slider(1, 80, 40, step=1, label="Top K sampling"),
+        gr.Slider(0, 2, 1.1, label="Repetition penalty"),
+        gr.Slider(0, 1, 0.95, label="Top P sampling"),
+    ],
     outputs=gr.Textbox(label="Answer", lines=5),
     title="Polish Chatbot",
-    description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model"
+    description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model",
 )
 demo.launch()
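Because the new generate function is a generator (it yields partial_response), gr.Interface streams the growing answer into the output Textbox. The six inputs map positionally onto the new signature, with gr.Slider taking (minimum, maximum, initial value) as its positional arguments:

# Positional mapping of the inputs list onto generate():
#   gr.Textbox                 -> prompt
#   gr.Slider(0, 1, 0.6)       -> temperature
#   gr.Slider(128, 4096, 1024) -> max_tokens
#   gr.Slider(1, 80, 40)       -> top_k
#   gr.Slider(0, 2, 1.1)       -> repetition_penalty
#   gr.Slider(0, 1, 0.95)      -> top_p
demo.queue()   # assumption: explicit queueing; recent Gradio versions queue by default
demo.launch()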
|