Spaces:

janbanot
/

bielik_goblin_zero

Running on Zero

App Files Files Community

janbanot committed on Feb 20

Commit

b1c28de

1 Parent(s): 4631bc7

fix: refactor

Browse files

chore: refactor

fix: wrong parameter name

Files changed (1) hide show

app.py +143 -78

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 import torch
 import spaces
@@ -9,86 +10,150 @@ from transformers import (
 )
 from threading import Thread
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
-MODEL_NAME = MODEL_ID.split("/")[-1]
-if torch.cuda.is_available():
-    device = torch.device("cuda")
-    print("Using GPU:", torch.cuda.get_device_name(0))
-else:
-    device = torch.device("cpu")
-    print("CUDA is not available. Using CPU.")
-quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
     )
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-tokenizer.pad_token = tokenizer.eos_token
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    quantization_config=quantization_config,
-    low_cpu_mem_usage=True,
-)
-@spaces.GPU
-def test(prompt):
-    max_tokens = 5000
-    temperature = 0
-    top_k = 0
-    top_p = 0
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-    system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
-    messages = []
-    if system:
-        messages.append({"role": "system", "content": system})
-    messages.append({"role": "user", "content": prompt})
-    tokenizer_output = tokenizer.apply_chat_template(
-        messages, return_tensors="pt", return_dict=True
-    )
-    if torch.cuda.is_available():
-        model_input_ids = tokenizer_output.input_ids.to(device)
-        model_attention_mask = tokenizer_output.attention_mask.to(device)
-    else:
-        model_input_ids = tokenizer_output.input_ids
-        model_attention_mask = tokenizer_output.attention_mask
-    generate_kwargs = {
-        "input_ids": model_input_ids,
-        "attention_mask": model_attention_mask,
-        "streamer": streamer,
-        "max_new_tokens": max_tokens,
-        "do_sample": True if temperature else False,
-        "temperature": temperature,
-        "top_k": top_k,
-        "top_p": top_p,
-    }
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    partial_response = ""
-    for new_token in streamer:
-        partial_response += new_token
-        # Stop if we hit any of the special tokens
-        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
-            break
-        yield partial_response
-demo = gr.Interface(
-    fn=test,
-    inputs=gr.Textbox(label="Your question", placeholder="Type your question here..."),
-    outputs=gr.Textbox(label="Answer", lines=5),
-    title="Polish Chatbot",
-    description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model"
-)
-demo.launch()

+from typing import Dict, Generator, List, Optional
 import gradio as gr
 import torch
 import spaces
 )
 from threading import Thread
+# Configuration
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
+# Default system message (Polish): "You are a chatbot that answers questions
+# in Polish." It is used whenever the caller does not supply its own prompt.
+SYSTEM_PROMPT = "Jesteś chatbotem udzielającym odpowiedzi na pytania w języku polskim"
+# Keyword arguments forwarded to ``model.generate``. ``temperature`` doubles as
+# the sampling switch: ``generate_response`` sets ``do_sample`` to
+# ``bool(temperature)``, so the value 0 here selects non-sampling decoding.
+DEFAULT_GENERATION_PARAMS = {
+    "max_new_tokens": 5000,
+    "temperature": 0,
+    "top_k": 0,
+    "top_p": 0,
+}
+class ModelLoader:
+    """Load the tokenizer and the 4-bit quantized model, and record the device.
+
+    After construction the instance exposes ``device``,
+    ``quantization_config``, ``tokenizer`` and ``model``.
+    """
+    def __init__(self, model_id: str):
+        """Pick the device, then load the tokenizer and model for ``model_id``."""
+        self.device = self._get_device()
+        # Must be set before ``_load_model`` runs, which reads it.
+        self.quantization_config = BitsAndBytesConfig(
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            load_in_4bit=True,
+        )
+        self.tokenizer = self._load_tokenizer(model_id)
+        self.model = self._load_model(model_id)
+    def _get_device(self) -> torch.device:
+        """Return the CUDA device when available, otherwise the CPU device."""
+        if not torch.cuda.is_available():
+            print("CUDA is not available. Using CPU.")
+            return torch.device("cpu")
+        gpu_name = torch.cuda.get_device_name(0)
+        print(f"Using GPU: {gpu_name}")
+        return torch.device("cuda")
+    def _load_tokenizer(self, model_id: str) -> AutoTokenizer:
+        """Load the tokenizer and reuse its EOS token as the padding token."""
+        tok = AutoTokenizer.from_pretrained(model_id)
+        tok.pad_token = tok.eos_token
+        return tok
+    def _load_model(self, model_id: str) -> AutoModelForCausalLM:
+        """Load the causal LM with the stored quantization config."""
+        load_options = {
+            "torch_dtype": torch.bfloat16,
+            "quantization_config": self.quantization_config,
+            "low_cpu_mem_usage": True,
+            "device_map": "auto",
+        }
+        return AutoModelForCausalLM.from_pretrained(model_id, **load_options)
+class ChatInterface:
+    """Turn a user prompt into a streamed answer from the loaded model.
+
+    Wraps the tokenizer and model held by a ``ModelLoader`` and exposes
+    ``generate_response``, a generator that yields the growing answer text.
+    """
+    # Text markers that end the visible answer if they appear in the stream.
+    _STOP_TOKENS = ("<|im_end|>", "<|endoftext|>")
+    def __init__(self, model_loader: ModelLoader):
+        """Keep references to the loader's model, tokenizer and device."""
+        self.model = model_loader.model
+        self.tokenizer = model_loader.tokenizer
+        self.device = model_loader.device
+    @spaces.GPU
+    def generate_response(
+        self, prompt: str, system_prompt: Optional[str] = None
+    ) -> Generator[str, None, None]:
+        """Yield the answer to ``prompt`` as it grows, chunk by chunk.
+
+        Args:
+            prompt: The user's question.
+            system_prompt: Optional override; ``SYSTEM_PROMPT`` is used when
+                this is ``None`` or empty.
+
+        Yields:
+            The accumulated answer text after each newly decoded chunk.
+        """
+        generation_params = DEFAULT_GENERATION_PARAMS.copy()
+        streamer = TextIteratorStreamer(
+            self.tokenizer, skip_prompt=True, skip_special_tokens=True
+        )
+        messages = self._build_messages(prompt, system_prompt or SYSTEM_PROMPT)
+        model_inputs = self._prepare_inputs(messages)
+        generate_kwargs = {
+            **generation_params,
+            **model_inputs,
+            "streamer": streamer,
+            # A temperature of 0 turns sampling off.
+            "do_sample": bool(generation_params["temperature"]),
+        }
+        self._start_generation_thread(generate_kwargs)
+        yield from self._stream_response(streamer)
+    def _build_messages(self, prompt: str, system_prompt: str) -> List[Dict[str, str]]:
+        """Return the chat as a system message followed by the user message."""
+        return [
+            {"role": "system", "content": system_prompt},
+            {"role": "user", "content": prompt},
+        ]
+    def _prepare_inputs(
+        self, messages: List[Dict[str, str]]
+    ) -> Dict[str, torch.Tensor]:
+        """Tokenize ``messages`` and place the tensors on the model's device.
+
+        Returns:
+            A dict with ``input_ids`` and ``attention_mask``.
+        """
+        tokenizer_output = self.tokenizer.apply_chat_template(
+            messages, return_tensors="pt", return_dict=True
+        )
+        # Follow the model instead of moving it. The model is loaded 4-bit
+        # quantized with device_map="auto", so its device is already decided
+        # at load time and may be an indexed device ("cuda:0") that does not
+        # compare equal to the un-indexed torch.device("cuda") kept in
+        # self.device. Calling model.to() on such a model is unnecessary and,
+        # depending on the installed library versions, can raise.
+        target_device = self.model.device
+        return {
+            "input_ids": tokenizer_output.input_ids.to(target_device),
+            "attention_mask": tokenizer_output.attention_mask.to(target_device),
+        }
+    def _start_generation_thread(self, generate_kwargs: Dict) -> Thread:
+        """Run ``model.generate`` in a background thread and return the thread.
+
+        Generation runs off the calling thread so the caller can read from
+        the streamer while tokens are still being produced.
+        """
+        worker = Thread(target=self.model.generate, kwargs=generate_kwargs)
+        worker.start()
+        return worker
+    def _stream_response(
+        self, streamer: TextIteratorStreamer
+    ) -> Generator[str, None, None]:
+        """Yield the accumulated text after each chunk read from ``streamer``.
+
+        Stops without yielding as soon as the accumulated text contains one
+        of the stop markers.
+        """
+        partial_response = ""
+        for new_token in streamer:
+            partial_response += new_token
+            if any(stop_token in partial_response for stop_token in self._STOP_TOKENS):
+                break
+            yield partial_response
+def create_gradio_interface(chat_interface: ChatInterface) -> gr.Interface:
+    """Build the one-question, one-answer Gradio UI backed by ``chat_interface``."""
+    question_box = gr.Textbox(
+        label="Your question", placeholder="Type your question here..."
+    )
+    answer_box = gr.Textbox(label="Answer", lines=5)
+    return gr.Interface(
+        fn=chat_interface.generate_response,
+        inputs=question_box,
+        outputs=answer_box,
+        title="Polish Chatbot",
+        description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model",
     )
+if __name__ == "__main__":
+    # Initialize components. ModelLoader.__init__ loads the tokenizer and the
+    # model, so the loading work happens on this line.
+    model_loader = ModelLoader(MODEL_ID)
+    chat_interface = ChatInterface(model_loader)
+    # Create and launch interface
+    demo = create_gradio_interface(chat_interface)
+    demo.launch()