Spaces:

janbanot
/

bielik_goblin_zero

Running on Zero

App Files Files Community

janbanot committed on Feb 18

Commit

524b722

1 Parent(s): a2ced42

fix: try again

Browse files

Files changed (1) hide show

app.py +178 -20

app.py CHANGED Viewed

@@ -1,30 +1,188 @@
-import gradio as gr
-import spaces
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
-MODEL_NAME = "speakleash/Bielik-11B-v2.3-Instruct-GGUF"
-MODEL_FILE = "Bielik-11B-v2.3-Instruct.Q4_K_M.gguf"
-@spaces.GPU
-def test():
-    device = torch.device("cuda")
-    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-    model = AutoModelForCausalLM.from_pretrained(
-            MODEL_NAME,
-            model_file=MODEL_FILE,
-            model_type="mistral", gpu_layers=50, hf=True).to(device)
-    inputs = tokenizer("Cześć Bielik, jak się masz?", return_tensors="pt").to(device)
-    with torch.no_grad():
-        outputs = model.generate(
-            **inputs, max_new_tokens=128, pad_token_id=tokenizer.eos_token_id
         )
-    return tokenizer.decode(outputs[0], skip_special_tokens=True)
-demo = gr.Interface(fn=test, inputs=None, outputs=gr.Text())
-demo.launch()

+import os
+import subprocess
+from threading import Thread
+import random
 import torch
+import spaces
+import gradio as gr
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TextIteratorStreamer,
+)
+subprocess.run(
+    "pip install flash-attn --no-build-isolation",
+    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+    shell=True,
+)
+MODEL_ID = "speakleash/Bielik-7B-Instruct-v0.1"
+CHAT_TEMPLATE = "ChatML"
+MODEL_NAME = MODEL_ID.split("/")[-1]
+CONTEXT_LENGTH = 1024
+COLOR = os.environ.get("COLOR")
+EMOJI = os.environ.get("EMOJI")
+DESCRIPTION = os.environ.get("DESCRIPTION")
+# Load model
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
+    attn_implementation="flash_attention_2",
+)
+@spaces.GPU()
+def generate(
+    instruction,
+    stop_tokens,
+    temperature,
+    max_new_tokens,
+    top_k,
+    repetition_penalty,
+    top_p,
+):
+    streamer = TextIteratorStreamer(
+        tokenizer, skip_prompt=True, skip_special_tokens=True
+    )
+    enc = tokenizer([instruction], return_tensors="pt", padding=True, truncation=True)
+    input_ids, attention_mask = enc.input_ids, enc.attention_mask
+    if input_ids.shape[1] > CONTEXT_LENGTH:
+        input_ids = input_ids[:, -CONTEXT_LENGTH:]
+    generate_kwargs = dict(
+        {
+            "input_ids": input_ids.to(device),
+            "attention_mask": attention_mask.to(device),
+        },
+        streamer=streamer,
+        do_sample=True if temperature else False,
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty,
+        top_p=top_p,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for new_token in streamer:
+        outputs.append(new_token)
+        if new_token in stop_tokens:
+            break
+        yield "".join(outputs)
+def predict(
+    message,
+    history,
+    system_prompt,
+    temperature,
+    max_new_tokens,
+    top_k,
+    repetition_penalty,
+    top_p,
+):
+    repetition_penalty = float(repetition_penalty)
+    print(
+        "LLL",
+        [
+            message,
+            history,
+            system_prompt,
+            temperature,
+            max_new_tokens,
+            top_k,
+            repetition_penalty,
+            top_p,
+        ],
+    )
+    # Format history with a given chat template
+    if CHAT_TEMPLATE == "ChatML":
+        stop_tokens = ["<|endoftext|>", "<|im_end|>"]
+        instruction = "<|im_start|>system\n" + system_prompt + "\n<|im_end|>\n"
+        for human, assistant in history:
+            instruction += (
+                "<|im_start|>user\n"
+                + human
+                + "\n<|im_end|>\n<|im_start|>assistant\n"
+                + assistant
+            )
+        instruction += (
+            "\n<|im_start|>user\n" + message + "\n<|im_end|>\n<|im_start|>assistant\n"
+        )
+    elif CHAT_TEMPLATE == "Mistral Instruct":
+        stop_tokens = ["</s>", "[INST]", "[INST] ", "<s>", "[/INST]", "[/INST] "]
+        instruction = "<s>[INST] " + system_prompt
+        for human, assistant in history:
+            instruction += human + " [/INST] " + assistant + "</s>[INST]"
+        instruction += " " + message + " [/INST]"
+    elif CHAT_TEMPLATE == "Bielik":
+        stop_tokens = ["</s>"]
+        prompt_builder = ["<s>[INST] "]
+        if system_prompt:
+            prompt_builder.append(f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n")
+        for human, assistant in history:
+            prompt_builder.append(f"{human} [/INST] {assistant}</s>[INST] ")
+        prompt_builder.append(f"{message} [/INST]")
+        instruction = "".join(prompt_builder)
+    else:
+        raise Exception(
+            "Incorrect chat template, select 'ChatML' or 'Mistral Instruct'"
         )
+    print(instruction)
+    for output_text in generate(
+        instruction,
+        stop_tokens,
+        temperature,
+        max_new_tokens,
+        top_k,
+        repetition_penalty,
+        top_p,
+    ):
+        yield output_text
+# Create Gradio interface
+def update_examples():
+    exs = [["Kim jesteś?"], ["Ile to jest 9+2-1?"], ["Napisz mi coś miłego."]]
+    random.shuffle(exs)
+    return gr.Dataset(samples=exs)
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot(label="Chatbot", likeable=True, render=False)
+    chat = gr.ChatInterface(
+        predict,
+        chatbot=chatbot,
+        title=EMOJI + " " + MODEL_NAME + " - online chat demo",
+        description=DESCRIPTION,
+        examples=[["Kim jesteś?"], ["Ile to jest 9+2-1?"], ["Napisz mi coś miłego."]],
+        additional_inputs_accordion=gr.Accordion(
+            label="⚙️ Parameters", open=False, render=False
+        ),
+        additional_inputs=[
+            gr.Textbox("", label="System prompt", render=False),
+            gr.Slider(0, 1, 0.6, label="Temperature", render=False),
+            gr.Slider(128, 4096, 1024, label="Max new tokens", render=False),
+            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
+            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
+            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
+        ],
+        theme=gr.themes.Soft(primary_hue=COLOR),
+    )
+    demo.load(update_examples, None, chat.examples_handler.dataset)
+demo.queue(max_size=20).launch()