Spaces:

janbanot
/

bielik_goblin_zero

Running on Zero

App Files Community

janbanot committed on Feb 20

Commit

7d2afe0

1 Parent(s): b1c28de

Revert "fix: refactor"

Browse files

This reverts commit b1c28de92515add5f0b6debbd169c837aa7b9be6.

Files changed (1) hide show

app.py +78 -143

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
-from typing import Dict, Generator, List, Optional
 import gradio as gr
 import torch
 import spaces
@@ -10,150 +9,86 @@ from transformers import (
 )
 from threading import Thread
-# Configuration
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
-SYSTEM_PROMPT = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
-DEFAULT_GENERATION_PARAMS = {
-    "max_new_tokens": 5000,
-    "temperature": 0,
-    "top_k": 0,
-    "top_p": 0,
-}
-class ModelLoader:
-    """Handles model loading and device setup"""
-    def __init__(self, model_id: str):
-        self.device = self._get_device()
-        self.quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
-        )
-        self.tokenizer = self._load_tokenizer(model_id)
-        self.model = self._load_model(model_id)
-    def _get_device(self) -> torch.device:
-        """Determine and return the appropriate device"""
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-            print(f"Using GPU: {torch.cuda.get_device_name(0)}")
-        else:
-            device = torch.device("cpu")
-            print("CUDA is not available. Using CPU.")
-        return device
-    def _load_tokenizer(self, model_id: str) -> AutoTokenizer:
-        """Load and configure the tokenizer"""
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-        tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
-    def _load_model(self, model_id: str) -> AutoModelForCausalLM:
-        """Load and configure the model"""
-        return AutoModelForCausalLM.from_pretrained(
-            model_id,
-            torch_dtype=torch.bfloat16,
-            quantization_config=self.quantization_config,
-            low_cpu_mem_usage=True,
-            device_map="auto",
-        )
-class ChatInterface:
-    """Handles chat interactions and response generation"""
-    def __init__(self, model_loader: ModelLoader):
-        self.model = model_loader.model
-        self.tokenizer = model_loader.tokenizer
-        self.device = model_loader.device
-    @spaces.GPU
-    def generate_response(
-        self, prompt: str, system_prompt: Optional[str] = None
-    ) -> Generator[str, None, None]:
-        """Generate streaming response for the given prompt"""
-        generation_params = DEFAULT_GENERATION_PARAMS.copy()
-        streamer = TextIteratorStreamer(
-            self.tokenizer, skip_prompt=True, skip_special_tokens=True
-        )
-        messages = self._build_messages(prompt, system_prompt or SYSTEM_PROMPT)
-        tokenizer_output = self._prepare_inputs(messages)
-        generate_kwargs = {
-            **generation_params,
-            **tokenizer_output,
-            "streamer": streamer,
-            "do_sample": bool(generation_params["temperature"]),
-        }
-        self._start_generation_thread(generate_kwargs)
-        yield from self._stream_response(streamer)
-    def _build_messages(self, prompt: str, system_prompt: str) -> List[Dict[str, str]]:
-        """Build the message structure for the model"""
-        messages = [{"role": "system", "content": system_prompt}]
-        messages.append({"role": "user", "content": prompt})
-        return messages
-    def _prepare_inputs(
-        self, messages: List[Dict[str, str]]
-    ) -> Dict[str, torch.Tensor]:
-        """Prepare model inputs from messages"""
-        tokenizer_output = self.tokenizer.apply_chat_template(
-            messages, return_tensors="pt", return_dict=True
-        )
-        # Ensure all tensors are on the correct device
-        inputs = {
-            "input_ids": tokenizer_output.input_ids.to(self.device),
-            "attention_mask": tokenizer_output.attention_mask.to(self.device),
-        }
-        # Move model to device if not already there
-        if self.model.device != self.device:
-            self.model.to(self.device)
-        return inputs
-    def _start_generation_thread(self, generate_kwargs: Dict):
-        """Start model generation in a separate thread"""
-        t = Thread(target=self.model.generate, kwargs=generate_kwargs)
-        t.start()
-    def _stream_response(
-        self, streamer: TextIteratorStreamer
-    ) -> Generator[str, None, None]:
-        """Stream the response token by token"""
-        partial_response = ""
-        for new_token in streamer:
-            partial_response += new_token
-            if any(
-                stop_token in partial_response
-                for stop_token in ["<|im_end|>", "<|endoftext|>"]
-            ):
-                break
-            yield partial_response
-def create_gradio_interface(chat_interface: ChatInterface) -> gr.Interface:
-    """Create and configure the Gradio interface"""
-    return gr.Interface(
-        fn=chat_interface.generate_response,
-        inputs=gr.Textbox(
-            label="Your question", placeholder="Type your question here..."
-        ),
-        outputs=gr.Textbox(label="Answer", lines=5),
-        title="Polish Chatbot",
-        description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model",
     )
-if __name__ == "__main__":
-    # Initialize components
-    model_loader = ModelLoader(MODEL_ID)
-    chat_interface = ChatInterface(model_loader)
-    # Create and launch interface
-    demo = create_gradio_interface(chat_interface)
-    demo.launch()

 import gradio as gr
 import torch
 import spaces
 )
 from threading import Thread
 MODEL_ID = "speakleash/Bielik-11B-v2.3-Instruct"
+MODEL_NAME = MODEL_ID.split("/")[-1]
+if torch.cuda.is_available():
+    device = torch.device("cuda")
+    print("Using GPU:", torch.cuda.get_device_name(0))
+else:
+    device = torch.device("cpu")
+    print("CUDA is not available. Using CPU.")
+quantization_config = BitsAndBytesConfig(
+        load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16
     )
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+tokenizer.pad_token = tokenizer.eos_token
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    quantization_config=quantization_config,
+    low_cpu_mem_usage=True,
+)
+@spaces.GPU
+def test(prompt):
+    max_tokens = 5000
+    temperature = 0
+    top_k = 0
+    top_p = 0
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+    system = "Jesteś chatboem udzielającym odpowiedzi na pytania w języku polskim"
+    messages = []
+    if system:
+        messages.append({"role": "system", "content": system})
+    messages.append({"role": "user", "content": prompt})
+    tokenizer_output = tokenizer.apply_chat_template(
+        messages, return_tensors="pt", return_dict=True
+    )
+    if torch.cuda.is_available():
+        model_input_ids = tokenizer_output.input_ids.to(device)
+        model_attention_mask = tokenizer_output.attention_mask.to(device)
+    else:
+        model_input_ids = tokenizer_output.input_ids
+        model_attention_mask = tokenizer_output.attention_mask
+    generate_kwargs = {
+        "input_ids": model_input_ids,
+        "attention_mask": model_attention_mask,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "do_sample": True if temperature else False,
+        "temperature": temperature,
+        "top_k": top_k,
+        "top_p": top_p,
+    }
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    partial_response = ""
+    for new_token in streamer:
+        partial_response += new_token
+        # Stop if we hit any of the special tokens
+        if "<|im_end|>" in partial_response or "<|endoftext|>" in partial_response:
+            break
+        yield partial_response
+demo = gr.Interface(
+    fn=test,
+    inputs=gr.Textbox(label="Your question", placeholder="Type your question here..."),
+    outputs=gr.Textbox(label="Answer", lines=5),
+    title="Polish Chatbot",
+    description="Ask questions in Polish to the Bielik-11B-v2.3-Instruct model"
+)
+demo.launch()