import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")

# Initialize the OpenAI-compatible client against the Hugging Face Inference API
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)


def respond(
    message,
    history,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
):
    # Log the incoming request
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System Message: {system_message}")
    print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")

    # Convert seed to None if -1 (random)
    if seed == -1:
        seed = None

    # Construct the messages list for the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for user_message, assistant_message in history:
        if user_message:
            messages.append({"role": "user", "content": user_message})
            print(f"Added user message: {user_message}")
        if assistant_message:
            messages.append({"role": "assistant", "content": assistant_message})
            print(f"Added assistant message: {assistant_message}")

    # Append the latest message
    messages.append({"role": "user", "content": message})

    # Accumulate the streamed response
    response = ""

    # Make the streaming API request
    for chunk in client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        stream=True,
    ):
        # Streamed chunks carry the token text in `delta`, not `message`
        token = chunk.choices[0].delta.content
        if token is not None:
            response += token
            yield response


# Create the Gradio Chatbot component
chatbot = gr.Chatbot(height=600)

# Define the Gradio ChatInterface. ChatInterface supplies the message textbox
# and conversation history itself, so only the extra controls are passed in,
# via `additional_inputs` (ChatInterface has no `inputs` parameter).
demo = gr.ChatInterface(
    fn=respond,
    chatbot=chatbot,
    additional_inputs=[
        gr.Textbox(label="System Message"),
        gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
        gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
        gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
        gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
        gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
    ],
    theme="Nymbo/Nymbo_Theme",
)

# ChatInterface subclasses Blocks, so additional components can be appended
# by re-entering the demo's context; components created outside it would
# never be attached to the app.
with demo:
    # Create the "Featured Models" accordion
    with gr.Accordion("Featured Models", open=True) as featured_models:
        # Textbox for searching models
        model_search = gr.Textbox(label="Filter Models")

        # List of featured models (duplicate entries removed)
        models = [
            "meta-llama/Llama-3.3-70B-Instruct",
            "meta-llama/Llama-2-70B-Chat-hf",
            "TheBloke/Llama-2-13B-Chat-GGML",
            "TheBloke/Llama-2-70B-Chat-GGML",
            "TheBloke/Llama-2-13B-Chat-GGML-v2",
            "TheBloke/Llama-2-70B-Chat-GGML-v2",
            "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
            "TheBloke/Llama-2-70b-chat-hf",
            "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
            "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
            # Add more models as needed...
        ]

        # Radio buttons for selecting a model
        model_radio = gr.Radio(choices=models, label="Select a Model")

        # Update the model list based on search input
        def filter_models(search_term):
            filtered_models = [
                model for model in models if search_term.lower() in model.lower()
            ]
            return gr.update(choices=filtered_models)

        # Update the model list when the search box is used
        model_search.change(filter_models, inputs=model_search, outputs=model_radio)

    # Create a "Custom Model" textbox
    custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")

    # Create the "Information" tab
    with gr.Tab("Information"):
        # Featured Models accordion
        with gr.Accordion("Featured Models", open=False):
            gr.Markdown(
                """
                # Featured Models

                Here's a list of some popular models available on Hugging Face:

                - meta-llama/Llama-3.3-70B-Instruct
                - meta-llama/Llama-2-70B-Chat-hf
                - TheBloke/Llama-2-13B-Chat-GGML
                - TheBloke/Llama-2-70B-Chat-GGML
                - TheBloke/Llama-2-13B-Chat-GGML-v2
                - TheBloke/Llama-2-70B-Chat-GGML-v2
                - ... (and many more)

                You can search and select a model from the list above, or use your own custom model path.
                """
            )

        # Parameters Overview accordion
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                # Parameters Overview

                Here's a brief explanation of the parameters you can adjust:

                - **Max Tokens**: The maximum number of tokens to generate in the response.
                - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
                - **Top P**: Also known as nucleus sampling; only the smallest set of most-probable tokens whose cumulative probability reaches Top P is kept for sampling. Lower values make the output more focused.
                - **Frequency Penalty**: Penalizes tokens in proportion to how often they have already appeared, discouraging repetition.
                - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.

                Feel free to experiment with these settings to achieve the desired output.
                """
            )

# Launch the Gradio interface
demo.launch(share=True)
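# --- Usage sketch (a hedged example; the shell commands and the filename
# `app.py` are assumptions, not part of the app above) ---
#
#   $ pip install gradio openai
#   $ export HF_TOKEN=hf_...   # a Hugging Face access token with Inference API access
#   $ python app.py
#
# With share=True, launch() prints both a local URL and a temporary public
# *.gradio.live link for the running chat UI.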