Spaces:
Running
Running
File size: 4,015 Bytes
dd1b211 d0dc6a4 dd1b211 9579fc0 dd1b211 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
import gradio as gr
from html import escape
from transformers import AutoTokenizer
def get_available_models(models_dir="models"):
    """Return the sorted names of model directories that contain ``config.json``.

    Args:
        models_dir: Directory to scan. Each immediate sub-directory is a
            candidate model. Defaults to ``"models"``.

    Returns:
        A sorted list of sub-directory names that hold a ``config.json``
        file, or an empty list when ``models_dir`` is missing or is not a
        directory.
    """
    # isdir (rather than exists) also rejects a regular file sitting at the
    # path, which would otherwise make os.listdir raise NotADirectoryError.
    if not os.path.isdir(models_dir):
        return []

    available_models = []
    for model_name in os.listdir(models_dir):
        model_path = os.path.join(models_dir, model_name)
        config_file = os.path.join(model_path, "config.json")
        if os.path.isdir(model_path) and os.path.isfile(config_file):
            available_models.append(model_name)
    return sorted(available_models)
def tokenize_text(model_name, text):
    """Tokenize *text* with a local model and render the tokens as coloured HTML.

    Args:
        model_name: Name of a sub-directory of ``models``. A falsy value
            short-circuits with a prompt message.
        text: Text to tokenize. When empty, the prompt message itself is
            tokenized instead.

    Returns:
        A ``(html, token_count, char_count)`` tuple. On failure ``html`` is an
        HTML-escaped ``"Error: ..."`` message and both counts are ``0``, so
        the tuple always has three items.
    """
    if not model_name:
        return "Please choose a model and input some texts", 0, 0
    if not text:
        text = "Please choose a model and input some texts"

    try:
        # Load the tokenizer from the local model directory.
        # NOTE(review): trust_remote_code=True lets the model directory run
        # its own Python code; only put trusted models under ``models``.
        model_path = os.path.join("models", model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, device_map="cpu")

        # Encode the text, special tokens included.
        input_ids = tokenizer.encode(text, add_special_tokens=True)

        # Build one coloured <span> per token.
        colors = ["#A8D8EA", "#AA96DA", "#FCBAD3"]
        html_parts = []
        for i, token_id in enumerate(input_ids):
            # Escape HTML special characters in the decoded token.
            safe_token = escape(tokenizer.decode(token_id))
            # Cycle through the palette so neighbouring tokens differ.
            color = colors[i % len(colors)]
            html_parts.append(
                f'<span style="background-color: {color};'
                f"margin: 2px; padding: 2px 5px; border-radius: 3px;"
                f'display: inline-block; font-size: 1.2em;">'
                f"{safe_token}<br/>"
                f'<sub style="font-size: 0.9em;">{token_id}</sub>'
                f"</span>"
            )

        # Statistics shown next to the rendered tokens.
        token_len = len(input_ids)
        char_len = len(text)
        return "".join(html_parts), token_len, char_len
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary and any failure
        # should be shown to the user. Return three values like the success
        # path, and escape the message because it is rendered as HTML.
        error_msg = escape(f"Error: {e}")
        return error_msg, 0, 0
# UI components are created here, outside any layout, and placed into the
# page later by calling .render() on each of them.
banner_md = "\n".join(
    (
        "# 🎨 Tokenize it!",
        "Powerful token visualization tool for your text inputs. 🚀",
        "Works for LLMs both online and *locally* on your machine!",
    )
)
banner = gr.Markdown(banner_md)

# Inputs: model choice, free text, and the trigger button.
model_selector = gr.Dropdown(
    label="Choose Model",
    choices=get_available_models(),
    interactive=True,
)
text_input = gr.Textbox(
    label="Input Text",
    placeholder="Hello World!",
    lines=4,
)
submit_btn = gr.Button("🚀 Tokenize!", variant="primary")

# Outputs: rendered tokens plus two read-only counters.
output_html = gr.HTML(
    label="Tokenized Output",
    elem_classes="token-output",
)
token_count = gr.Number(
    label="Token Count",
    value=0,
    interactive=False,
)
char_count = gr.Number(
    label="Character Count",
    value=0,
    interactive=False,
)
# Page layout, styling and event wiring.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped — confirm it against the intended layout.
with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
    banner.render()

    # Input area: model choice, text box and submit button.
    with gr.Column():
        model_selector.render()
        text_input.render()
        submit_btn.render()

    # Output area: the two counters side by side, then the rendered tokens.
    with gr.Column():
        with gr.Row():
            token_count.render()
            char_count.render()
        output_html.render()

    # Define the CSS styles.
    webui.css = """
.token-output span {
margin: 3px;
vertical-align: top;
}
.stats-output {
font-weight: bold !important;
color: #2c3e50 !important;
}
.gradio-container { /* 针对 Gradio 的主容器 */
width: 100%; /* 根据需要调整宽度 */
display: flex;
justify-content: center;
align-items: center;
}
.gradio-container > div { /* 直接子元素,通常包含你的内容 */
width: 90%; /* 或者你想要的固定宽度 */
max-width: 1200px; /* 设置最大宽度 */
}
"""

    # Run the tokenizer on click and fan its three results out to the
    # HTML view and the two counters.
    submit_btn.click(
        fn=tokenize_text,
        inputs=[model_selector, text_input],
        outputs=[output_html, token_count, char_count],
    )
if __name__ == "__main__":
    # Create the model directory if it is absent, then start the web UI.
    os.makedirs("models", exist_ok=True)
    webui.launch(pwa=True)
|