import base64
import datetime
import os

import dashscope
import gradio as gr
import numpy as np

# Fail fast at startup if the DashScope credential is not configured.
API_KEY = os.environ['API_KEY']

# Dropdown display label ("English / Chinese") -> DashScope voice identifier.
VOICE_OPTIONS = {
    "Cherry / 芊悦": "Cherry",
    "Ethan / 晨煦": "Ethan",
    "Jennifer / 詹妮弗": "Jennifer",
    "Ryan / 甜茶": "Ryan",
    "Katerina / 卡捷琳娜": "Katerina",
    "Nofish / 不吃鱼": "Nofish",
    "Elias / 墨讲师": "Elias",
    "Li / 南京-老李": "Li",
    "Marcus / 陕西-秦川": "Marcus",
    "Roy / 闽南-阿杰": "Roy",
    "Peter / 天津-李彼得": "Peter",
    "Eric / 四川-程川": "Eric",
    "Rocky / 粤语-阿强": "Rocky",
    "Kiki / 粤语-阿清": "Kiki",
    "Sunny / 四川-晴儿": "Sunny",
    "Jada / 上海-阿珍": "Jada",
    "Dylan / 北京-晓东": "Dylan",
}

DEFAULT_VOICE = 'Cherry / 芊悦'

# Dropdown display label -> API `language_type` value.
LANGUAGE_MAP = {
    "Auto / 自动": "Auto",
    "English / 英文": "English",
    "Chinese / 中文": "Chinese",
    "German / 德语": "German",
    "Italian / 意大利语": "Italian",
    "Portuguese / 葡萄牙语": "Portuguese",
    "Spanish / 西班牙语": "Spanish",
    "Japanese / 日语": "Japanese",
    "Korean / 韩语": "Korean",
    "French / 法语": "French",
    "Russian / 俄语": "Russian",
}

# Single source of truth: the dropdown choices are exactly the map's keys,
# in insertion order.
LANGUAGE_OPTIONS = list(LANGUAGE_MAP)


def tts_interface(text, voice_display, language_display):
    """Synthesize *text* via qwen3-tts-flash and return audio for gr.Audio.

    Parameters
    ----------
    text : str
        The text to synthesize.
    voice_display : str
        A key of ``VOICE_OPTIONS`` (the bilingual dropdown label).
    language_display : str
        A key of ``LANGUAGE_MAP`` (the bilingual dropdown label).

    Returns
    -------
    tuple[int, numpy.ndarray | None]
        ``(24000, waveform)`` where *waveform* is float32 in [-1.0, 1.0],
        or ``(24000, None)`` when the stream produced no audio.
    """
    voice_name = VOICE_OPTIONS[voice_display]
    # Map the bilingual display label to the API's language identifier.
    language = LANGUAGE_MAP[language_display]
    print(f"text: {text}, {voice_name}, {language} time: {datetime.datetime.now()}\n")

    responses = dashscope.MultiModalConversation.call(
        api_key=API_KEY,
        model="qwen3-tts-flash",
        text=text,
        voice=voice_name,
        stream=True,
        language_type=language,
    )

    audio_frames = []
    for chunk in responses:
        try:
            audio_string = chunk.output.audio.data
        except AttributeError:
            # Chunk without an audio payload (e.g. status/usage message):
            # log it for diagnosis and skip it instead of decoding "".
            print(chunk)
            continue
        # Payload is base64-encoded 16-bit PCM; normalize to float32 [-1, 1].
        wav_bytes = base64.b64decode(audio_string)
        audio_np = np.frombuffer(wav_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        audio_frames.append(audio_np)

    full_audio = np.concatenate(audio_frames) if audio_frames else None
    sample_rate = 24000  # the stream is decoded as 24 kHz PCM
    return (sample_rate, full_audio)
with gr.Blocks(
    theme=gr.themes.Soft(
        font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
    ),
    css=".gradio-container {max-width: none !important;}",
) as demo:
    gr.Markdown("# 🎤 Qwen3-TTS Demo")
    with gr.Row():
        with gr.Column():
            # Input text — English label first
            text_input = gr.Textbox(
                label="Input Text / 输入文本",
                placeholder="Enter text to synthesis here... / 在此输入要合成为语音的文本...",
                lines=4,
                max_lines=8,
            )
            # Voice selection — English label first
            voice_select = gr.Dropdown(
                label="Select Voice / 选择发音人",
                choices=list(VOICE_OPTIONS.keys()),
                value=DEFAULT_VOICE,
            )
            # Language selection — English label first
            language_select = gr.Dropdown(
                label="Select Text Language / 选择文本语言",
                choices=LANGUAGE_OPTIONS,
                value="Auto / 自动",
            )
            # Generate button — English label first
            generate_btn = gr.Button("Generate Speech / 生成语音", variant="primary")
        with gr.Column():
            # Audio output — English label first
            audio_output = gr.Audio(label="Generated Speech / 生成的语音", interactive=False)

    # Example inputs — English label first
    examples = gr.Examples(
        examples=[
            ["你好,我是通义千问,很高兴认识你。", "Cherry / 芊悦", "Chinese / 中文"],
            ["你好,我是通义千问,很高兴认识你。", "Dylan / 北京-晓东", "Chinese / 中文"],
            ["Hello, this is a text-to-speech demo", "Jennifer / 詹妮弗", "English / 英文"],
            ["こんにちは、これはデモです", "Cherry / 芊悦", "Japanese / 日语"],
        ],
        inputs=[text_input, voice_select, language_select],
        label="Examples / 示例文本",
    )

    # Wire the button to the synthesis function.
    generate_btn.click(
        fn=tts_interface,
        inputs=[text_input, voice_select, language_select],
        outputs=audio_output,
    )

if __name__ == "__main__":
    demo.launch()