Spaces:

janbanot
/

bielik_goblin_zero

Running on Zero

App Files Files Community

janbanot committed on Feb 20

Commit

4631bc7

1 Parent(s): d54309a

fix: output streaming

Browse files

Files changed (1) hide show

app.py +25 -20

app.py CHANGED Viewed

@@ -5,8 +5,9 @@ from transformers import (
     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
-    TextStreamer,
 )
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
 MODEL_NAME = MODEL_ID.split("/")[-1]
@@ -38,9 +39,7 @@ def test(prompt):
     top_k = 0
     top_p = 0
-    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    model.generation_config.pad_token_id = tokenizer.pad_token_id
     system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
     messages = []
@@ -62,27 +61,33 @@ def test(prompt):
         model_input_ids = tokenizer_output.input_ids
         model_attention_mask = tokenizer_output.attention_mask
-    outputs = model.generate(
-        model_input_ids,
-        attention_mask=model_attention_mask,
-        streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True if temperature else False,
-        temperature=temperature,
-        top_k=top_k,
-        top_p=top_p,
-    )
-    answer = tokenizer.batch_decode(outputs, skip_special_tokens=False)
-    # Extract just the assistant's response after last user message
-    response = answer[0].split("<|im_end|>")[2].strip()
-    return response
 demo = gr.Interface(
     fn=test,
     inputs=gr.Textbox(label="Your question", placeholder="Type your question here..."),
-    outputs=gr.Text(label="Answer"),
     title="Polish Chatbot",
     description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model"
 )

     AutoModelForCausalLM,
     AutoTokenizer,
     BitsAndBytesConfig,
+    TextIteratorStreamer,
 )
+from threading import Thread
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
 MODEL_NAME = MODEL_ID.split("/")[-1]
     top_k = 0
     top_p = 0
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
     messages = []
         model_input_ids = tokenizer_output.input_ids
         model_attention_mask = tokenizer_output.attention_mask
+    generate_kwargs = {
+        "input_ids": model_input_ids,
+        "attention_mask": model_attention_mask,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "do_sample": True if temperature else False,
+        "temperature": temperature,
+        "top_k": top_k,
+        "top_p": top_p,
+    }
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    partial_response = ""
+    for new_token in streamer:
+        partial_response += new_token
+        # Stop if we hit any of the special tokens
+        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
+            break
+        yield partial_response
 demo = gr.Interface(
     fn=test,
     inputs=gr.Textbox(label="Your question", placeholder="Type your question here..."),
+    outputs=gr.Textbox(label="Answer", lines=5),
     title="Polish Chatbot",
     description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model"
 )