File size: 4,015 Bytes
dd1b211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0dc6a4
dd1b211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9579fc0
dd1b211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import gradio as gr
from html import escape
from transformers import AutoTokenizer


def get_available_models():
    """Return the sorted names of the model folders found under ``models``.

    A sub-directory of ``models`` (relative to the current working
    directory) is listed when it directly contains a ``config.json`` file.

    Returns:
        list[str]: Matching folder names in ascending order. The list is
        empty when ``models`` does not exist or is not a directory.
    """
    models_dir = "models"
    # Use isdir rather than exists: a stray *file* called "models" must give
    # an empty list instead of letting os.listdir raise NotADirectoryError.
    if not os.path.isdir(models_dir):
        return []

    # isfile() on "<models>/<name>/config.json" can only succeed when
    # "<models>/<name>" is itself a directory, so no separate isdir check
    # is needed for each entry.
    return sorted(
        name
        for name in os.listdir(models_dir)
        if os.path.isfile(os.path.join(models_dir, name, "config.json"))
    )


def tokenize_text(model_name, text):
    """Tokenize *text* with the selected model and render the tokens as HTML.

    Args:
        model_name: Name of a folder under ``models`` holding the tokenizer
            files. A falsy value short-circuits with a hint message.
        text: Text to tokenize. When empty, the hint sentence itself is
            tokenized instead.

    Returns:
        tuple: ``(html, token_count, char_count)``. ``html`` is one coloured
        ``<span>`` per token showing the decoded token and its id. On any
        failure ``html`` is an HTML-escaped ``"Error: ..."`` message and both
        counts are ``0``, so the result always has three values.
    """
    if not model_name:
        return "Please choose a model and input some texts", 0, 0
    if not text:
        text = "Please choose a model and input some texts"

    try:
        # Load the tokenizer from the local model folder.
        # NOTE(review): trust_remote_code=True lets code shipped inside the
        # model folder run on load; only place trusted models in "models".
        model_path = os.path.join("models", model_name)
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, device_map="cpu")

        # Encode the text into token ids.
        input_ids = tokenizer.encode(text, add_special_tokens=True)

        # Build one coloured HTML chip per token.
        colors = ["#A8D8EA", "#AA96DA", "#FCBAD3"]
        html_parts = []

        for i, token_id in enumerate(input_ids):
            # Escape HTML special characters in the decoded token.
            safe_token = escape(tokenizer.decode(token_id))
            # Cycle through the palette so neighbouring tokens differ.
            color = colors[i % len(colors)]
            html_part = (
                f'<span style="background-color: {color};'
                f"margin: 2px; padding: 2px 5px; border-radius: 3px;"
                f'display: inline-block; font-size: 1.2em;">'
                f"{safe_token}<br/>"
                f'<sub style="font-size: 0.9em;">{token_id}</sub>'
                f"</span>"
            )
            html_parts.append(html_part)

        # Statistics shown next to the visualisation.
        token_len = len(input_ids)
        char_len = len(text)

        return "".join(html_parts), token_len, char_len

    except Exception as e:
        # Top-level UI boundary: report the failure in the output panel.
        # The message is escaped because it is rendered as HTML, and three
        # values are returned to match the success path.
        error_msg = f"Error: {escape(str(e))}"
        return error_msg, 0, 0


# Markdown source for the page banner.
banner_md = """# 🎨 Tokenize it!

Powerful token visualization tool for your text inputs. 🚀

Works for LLMs both online and *locally* on your machine!"""

# The components below are constructed at module level and are placed into
# the page later by the .render() calls inside the gr.Blocks context.
banner = gr.Markdown(banner_md)
# The dropdown choices are computed once, when this module is imported, by
# scanning the models directory; folders added afterwards are not listed
# until the script is restarted.
model_selector = gr.Dropdown(
    label="Choose Model", choices=get_available_models(), interactive=True
)
text_input = gr.Textbox(label="Input Text", placeholder="Hello World!", lines=4)
submit_btn = gr.Button("🚀 Tokenize!", variant="primary")

# Output widgets: the coloured token HTML plus two read-only counters.
# "token-output" is the class targeted by the custom CSS set on the app.
output_html = gr.HTML(label="Tokenized Output", elem_classes="token-output")
token_count = gr.Number(label="Token Count", value=0, interactive=False)
char_count = gr.Number(label="Character Count", value=0, interactive=False)

# Assemble the page: banner, an input column (model, text, button) and an
# output column (the two counters side by side, then the token HTML).
with gr.Blocks(title="Token Visualizer", theme="NoCrypt/miku") as webui:
    banner.render()

    with gr.Column():
        model_selector.render()
        text_input.render()
        submit_btn.render()

    with gr.Column():
        with gr.Row():
            token_count.render()
            char_count.render()
        output_html.render()

    # Define the custom CSS styles.
    # NOTE(review): the CSS is assigned to the attribute after construction
    # rather than passed to gr.Blocks(...) — confirm the installed Gradio
    # version picks it up. The ".stats-output" rule matches no component
    # created in this file.
    webui.css = """
    .token-output span {
        margin: 3px;
        vertical-align: top;
    }
    .stats-output {
        font-weight: bold !important;
        color: #2c3e50 !important;
    }
    .gradio-container { /* 针对 Gradio 的主容器 */
        width: 100%; /* 根据需要调整宽度 */
        display: flex;
        justify-content: center;
        align-items: center;
    }
    .gradio-container > div { /* 直接子元素,通常包含你的内容 */
        width: 90%; /* 或者你想要的固定宽度 */
        max-width: 1200px; /* 设置最大宽度 */
    }
    """

    # Clicking the button runs tokenize_text on the selected model and the
    # entered text; its three return values fill the HTML panel, the token
    # counter and the character counter, in that order.
    submit_btn.click(
        fn=tokenize_text,
        inputs=[model_selector, text_input],
        outputs=[output_html, token_count, char_count],
    )

if __name__ == "__main__":
    # Make sure the models folder exists so users have a place to drop model
    # directories. The dropdown choices were already computed when the module
    # was imported, so this does not change the list shown in this run.
    os.makedirs("models", exist_ok=True)
    webui.launch(pwa=True)