# Standard library imports
import os
import threading
# Third-party imports
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from peft import PeftModel

# Hugging Face access token, read from the environment (needed if the repos are gated/private)
HF_TOKEN = os.getenv("HF_TOKEN")

# Load the tokenizer from the fine-tuned (reasoning) repository
tokenizer = AutoTokenizer.from_pretrained(
    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
    token=HF_TOKEN,
    trust_remote_code=True
)

# Load the base model that the adapter was trained on
base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto",
    token=HF_TOKEN
)
# Resize the embedding matrix in case the fine-tuned tokenizer added tokens
base_model.resize_token_embeddings(len(tokenizer))

# Load the PEFT adapter on top of the base model
model = PeftModel.from_pretrained(
    base_model,
    "bunyaminergen/Qwen2.5-Coder-1.5B-Instruct-Reasoning",
    token=HF_TOKEN
)
model.eval()
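
# Optional (not part of the original script): the adapter weights could be merged
# into the base model to remove the PEFT indirection at inference time, e.g.:
#   model = model.merge_and_unload()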


def respond(
    message: str,
    history: list[tuple[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
):
    """Stream a chat completion, yielding the cumulative output for gr.ChatInterface."""
    # Rebuild the conversation as chat messages, starting with the system prompt
    messages = [{"role": "system", "content": system_message}]
    for u, a in history:
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": message})

    # Render the messages with the model's chat template and tokenize
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Stream generated tokens back to the UI as they are produced
    streamer = TextIteratorStreamer(
        tokenizer,
        timeout=10.0,
        skip_prompt=True,
        skip_special_tokens=True
    )
    generation_kwargs = {
        **inputs,
        "max_new_tokens": max_tokens,
        "do_sample": True,  # sample so the temperature/top_p sliders actually take effect
        "temperature": temperature,
        "top_p": top_p,
        "streamer": streamer,
    }
    # Run generation in a background thread so partial output can be yielded
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    output = ""
    for chunk in streamer:
        output += chunk
        yield output
    thread.join()  # make sure the generation thread has finished before returning
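
# A minimal smoke-test sketch (assumption, not part of the original app): driving
# the generator directly, without the Gradio UI. The prompt and parameter values
# below are illustrative only.
#
# for partial in respond(
#     "Write a function that reverses a string.",
#     history=[],
#     system_message="You are a helpful coding assistant.",
#     max_tokens=256,
#     temperature=0.7,
#     top_p=0.95,
# ):
#     print(partial, end="\r")  # `partial` is the cumulative streamed answer so far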

# Build the Gradio chat UI; the extra inputs map to the respond() parameters
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a helpful coding assistant.", label="System message"),
        gr.Slider(minimum=512, maximum=8192, value=2048, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)
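
# Note (assumption, not in the original script): if multiple concurrent users are
# expected, enabling the request queue before launching is generally recommended
# for streaming apps, e.g.:
#   demo.queue().launch(share=True)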

if __name__ == "__main__":
    demo.launch(share=True)