codewithdark committed
Commit 2dc2198 · verified · 1 Parent(s): fc75b8b

Update app.py

Files changed (1)
  1. app.py +24 -17
app.py CHANGED
@@ -1,17 +1,17 @@
 import gradio as gr
 from huggingface_hub import InferenceClient
-from transformers import AutoModel, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 
 # Initialize Hugging Face Inference API client
 hf_client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
 
-# Load the second model
+# Load the second model (local)
 local_model_name = "codewithdark/latent-recurrent-depth-lm"
 tokenizer = AutoTokenizer.from_pretrained(local_model_name)
-model = AutoModel.from_pretrained(local_model_name, trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(local_model_name, trust_remote_code=True)
 device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
+model.to(device).eval()  # Set model to evaluation mode
 
 def generate_response(
     message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p, model_choice
@@ -28,22 +28,29 @@ def generate_response(
 
     if model_choice == "Zephyr-7B (API)":
         response = ""
-        for message in hf_client.chat_completion(
-            messages,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-        ):
-            token = message.choices[0].delta.content
-            response += token
-            yield response
+        try:
+            for message in hf_client.chat_completion(
+                messages=messages,
+                max_tokens=max_tokens,
+                stream=True,
+                temperature=temperature,
+                top_p=top_p,
+            ):
+                token = message.choices[0].delta.content if message.choices else ""
+                response += token
+                yield response
+        except Exception as e:
+            yield f"Error in API response: {e}"
     else:
         input_text = tokenizer.apply_chat_template(messages, return_tensors="pt").to(device)
-        output = model.generate(input_text, max_length=max_tokens, temperature=temperature, top_p=top_p)
-        response = tokenizer.decode(output[0], skip_special_tokens=True)
-        yield response
+        with torch.no_grad():
+            output = model.generate(input_text, max_length=max_tokens, temperature=temperature, top_p=top_p)
+        response = tokenizer.decode(output[0], skip_special_tokens=True).strip()
+
+        for i in range(len(response)):
+            yield response[: i + 1]
 
+# Gradio UI
 demo = gr.ChatInterface(
     generate_response,
     additional_inputs=[