codewithdark committed
Commit 2dc2198 · verified · 1 Parent(s): fc75b8b

Update app.py

Files changed (1)
  1. app.py +24 -17
app.py CHANGED
@@ -1,17 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 # Initialize Hugging Face Inference API client
 hf_client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
-# Load the second model
+# Load the second model (local)
 local_model_name = "codewithdark/latent-recurrent-depth-lm"
 tokenizer = AutoTokenizer.from_pretrained(local_model_name)
-model = AutoModel.from_pretrained(local_model_name, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(local_model_name, trust_remote_code=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
+model.to(device).eval()  # Set model to evaluation mode
 
 def generate_response(
     message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, model_choice
@@ -28,22 +28,29 @@ def generate_response(
 
     if model_choice == "Zephyr-7B (API)":
         response = ""
-        for message in hf_client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = message.choices[0].delta.content
-            response += token
-            yield response
+        try:
+            for message in hf_client.chat_completion(
+                messages=messages,
+                max_tokens=max_tokens,
+                stream=True,
+                temperature=temperature,
+                top_p=top_p,
+            ):
+                token = message.choices[0].delta.content if message.choices else ""
+                response += token
+                yield response
+        except Exception as e:
+            yield f"Error in API response: {e}"
     else:
         input_text = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
-        output = model.generate(input_text, max_length=max_tokens, temperature=temperature, top_p=top_p)
-        response = tokenizer.decode(output[0], skip_special_tokens=True)
-        yield response
+        with torch.no_grad():
+            output = model.generate(input_text, max_length=max_tokens, temperature=temperature, top_p=top_p)
+        response = tokenizer.decode(output[0], skip_special_tokens=True).strip()
+
+        for i in range(len(response)):
+            yield response[: i + 1]
 
+# Gradio UI
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[