import os
from html import escape

import gradio as gr
from transformers import AutoTokenizer


def get_available_models() -> list[str]:
    """Return every model under the models directory that contains a tokenizer.json."""
    models_dir = "models"
    if not os.path.exists(models_dir):
        return []

    available_models = []
    for model_name in os.listdir(models_dir):
        model_path = os.path.join(models_dir, model_name)
        tokenizer_file = os.path.join(model_path, "tokenizer.json")
        if os.path.isdir(model_path) and os.path.isfile(tokenizer_file):
            available_models.append(model_name)
    return sorted(available_models)


def tokenize_text(
    builtin_model: str, custom_model: str | None, text: str
) -> tuple[str | None, int, int]:
    """Handle a tokenize request and return (html, token_count, char_count)."""
    if not builtin_model:
        return "Please choose a model and input some text", 0, 0
    if not text:
        text = "Please choose a model and input some text"

    try:
        # Load the tokenizer: a custom model name takes precedence over the
        # built-in model selected from the models/ directory.
        if custom_model:
            tokenizer = AutoTokenizer.from_pretrained(
                custom_model, trust_remote_code=True, device_map="cpu"
            )
        else:
            model_path = os.path.join("models", builtin_model)
            tokenizer = AutoTokenizer.from_pretrained(
                model_path, trust_remote_code=True, device_map="cpu"
            )

        # Tokenize the input text.
        input_ids = tokenizer.encode(text, add_special_tokens=True)

        # Build color-coded HTML, one "chip" per token.
        colors = ["#A8D8EA", "#AA96DA", "#FCBAD3"]
        html_parts = []
        for i, token_id in enumerate(input_ids):
            # Escape HTML special characters in the decoded token.
            safe_token = escape(tokenizer.decode(token_id))
            # Alternate background colors.
            color = colors[i % len(colors)]
            # Token text on top, token id underneath (inline styling is illustrative).
            html_part = (
                f'<span style="background-color: {color}; display: inline-block; '
                f'padding: 2px 6px; border-radius: 4px; text-align: center;">'
                f"{safe_token}<br>"
" f'{token_id}' f"
" ) html_parts.append(html_part) # 统计信息 token_len = len(input_ids) char_len = len(text) return "".join(html_parts), token_len, char_len except Exception as e: error_msg = f"Error: {str(e)}" return error_msg, 0, 0 banner_md = """# 🎨 Tokenize it! Powerful token visualization tool for your text inputs. 🚀 Works for LLMs both online and *locally* on your machine!""" banner = gr.Markdown(banner_md) model_selector = gr.Dropdown( label="Built-in Model", choices=get_available_models(), interactive=True ) custom_model = gr.Textbox( label="Custom Model", placeholder="Enter your custom model name. e.g. Qwen/QwQ", lines=1, ) text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4) submit_btn = gr.Button("🚀 Tokenize!", variant="primary") output_html = gr.HTML(label="Tokenized Output", elem_classes="token-output") token_count = gr.Number(label="Token Count", value=0, interactive=False) char_count = gr.Number(label="Character Count", value=0, interactive=False) with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui: banner.render() with gr.Column(): with gr.TabItem("Built-in Model"): model_selector.render() with gr.TabItem("Custom Model"): custom_model.render() text_input.render() submit_btn.render() with gr.Column(): with gr.Row(): token_count.render() char_count.render() output_html.render() # 定义CSS样式 webui.css = """ .token-output span { margin: 3px; vertical-align: top; } .stats-output { font-weight: bold !important; color: #2c3e50 !important; } .gradio-container { /* 针对 Gradio 的主容器 */ width: 100%; /* 根据需要调整宽度 */ display: flex; justify-content: center; align-items: center; } .gradio-container > div { /* 直接子元素,通常包含你的内容 */ width: 90%; /* 或者你想要的固定宽度 */ max-width: 1200px; /* 设置最大宽度 */ } """ submit_btn.click( fn=tokenize_text, inputs=[model_selector, custom_model, text_input], outputs=[output_html, token_count, char_count], ) if __name__ == "__main__": os.makedirs("models", exist_ok=True) webui.launch(pwa=True)