####################################################################################### # # MIT License # # Copyright (c) [2025] [leonelhs@gmail.com] # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ####################################################################################### # This file implements an API endpoint for the Mandarin Chinese Kokoro Text-to-Speech (TTS) system. # It provides functionality to generate TTS audio from input Mandarin Chinese text using the Kokoro voice model. # Source code is based on or inspired by several projects. 
# For more details and proper attribution, please refer to the following resources:
#
# - [Kokoro] - [https://github.com/hexgrad/kokoro]
# - [Misaki] - [https://github.com/hexgrad/misaki]
# - [Kokoro-82M] - [https://huggingface.co/hexgrad/Kokoro-82M]
# - [Kokoro-onnx] - [https://github.com/thewh1teagle/kokoro-onnx]

import os

import gradio as gr
from huggingface_hub import snapshot_download
from kokoro_onnx import Kokoro
from misaki import zh

# Hugging Face repository the model and voice files are downloaded from.
KOKORO_REPO_ID = "leonelhs/kokoro-thewh1teagle"

# Maps the label shown in the UI dropdown to the voice ID passed to Kokoro.
# IDs prefixed "zf_" are listed under the 🚺 label, "zm_" under the 🚹 label.
VOICES = {
    '🚺 001': 'zf_001', '🚺 002': 'zf_002', '🚺 003': 'zf_003', '🚺 004': 'zf_004',
    '🚺 005': 'zf_005', '🚺 006': 'zf_006', '🚺 007': 'zf_007', '🚺 008': 'zf_008',
    '🚺 017': 'zf_017', '🚺 018': 'zf_018', '🚺 019': 'zf_019', '🚺 021': 'zf_021',
    '🚺 022': 'zf_022', '🚺 023': 'zf_023', '🚺 024': 'zf_024', '🚺 026': 'zf_026',
    '🚺 027': 'zf_027', '🚺 028': 'zf_028', '🚺 032': 'zf_032', '🚺 036': 'zf_036',
    '🚺 038': 'zf_038', '🚺 039': 'zf_039', '🚺 040': 'zf_040', '🚺 042': 'zf_042',
    '🚺 043': 'zf_043', '🚺 044': 'zf_044', '🚺 046': 'zf_046', '🚺 047': 'zf_047',
    '🚺 048': 'zf_048', '🚺 049': 'zf_049', '🚺 051': 'zf_051', '🚺 059': 'zf_059',
    '🚺 060': 'zf_060', '🚺 067': 'zf_067', '🚺 070': 'zf_070', '🚺 071': 'zf_071',
    '🚺 072': 'zf_072', '🚺 073': 'zf_073', '🚺 074': 'zf_074', '🚺 075': 'zf_075',
    '🚺 076': 'zf_076', '🚺 077': 'zf_077', '🚺 078': 'zf_078', '🚺 079': 'zf_079',
    '🚺 083': 'zf_083', '🚺 084': 'zf_084', '🚺 085': 'zf_085', '🚺 086': 'zf_086',
    '🚺 087': 'zf_087', '🚺 088': 'zf_088', '🚺 090': 'zf_090', '🚺 092': 'zf_092',
    '🚺 093': 'zf_093', '🚺 094': 'zf_094', '🚺 099': 'zf_099',
    '🚹 009': 'zm_009', '🚹 010': 'zm_010', '🚹 011': 'zm_011', '🚹 012': 'zm_012',
    '🚹 013': 'zm_013', '🚹 014': 'zm_014', '🚹 015': 'zm_015', '🚹 016': 'zm_016',
    '🚹 020': 'zm_020', '🚹 025': 'zm_025', '🚹 029': 'zm_029', '🚹 030': 'zm_030',
    '🚹 031': 'zm_031', '🚹 033': 'zm_033', '🚹 034': 'zm_034', '🚹 035': 'zm_035',
    '🚹 037': 'zm_037', '🚹 041': 'zm_041', '🚹 045': 'zm_045', '🚹 050': 'zm_050',
    '🚹 052': 'zm_052', '🚹 053': 'zm_053', '🚹 054': 'zm_054', '🚹 055': 'zm_055',
    '🚹 056': 'zm_056', '🚹 057': 'zm_057', '🚹 058': 'zm_058', '🚹 061': 'zm_061',
    '🚹 062': 'zm_062', '🚹 063': 'zm_063', '🚹 064': 'zm_064', '🚹 065': 'zm_065',
    '🚹 066': 'zm_066', '🚹 068': 'zm_068', '🚹 069': 'zm_069', '🚹 080': 'zm_080',
    '🚹 081': 'zm_081', '🚹 082': 'zm_082', '🚹 089': 'zm_089', '🚹 091': 'zm_091',
    '🚹 095': 'zm_095', '🚹 096': 'zm_096', '🚹 097': 'zm_097', '🚹 098': 'zm_098',
    '🚹 100': 'zm_100',
}

# Download (or reuse the cached copy of) the model repository; `snapshot` is
# the local directory that holds the downloaded files.
snapshot = snapshot_download(repo_id=KOKORO_REPO_ID)

# Misaki G2P with espeak-ng fallback
g2p = zh.ZHG2P(version="1.1")

# Kokoro
model_path = os.path.join(snapshot, "kokoro-v1.1-zh.onnx")
voices_path = os.path.join(snapshot, "voices-v1.1-zh.bin")
# NOTE(review): "config.json" is a relative path, so it is resolved against the
# current working directory rather than `snapshot` like the two paths above —
# confirm this file ships next to the script and not inside the snapshot.
kokoro = Kokoro(model_path, voices_path, vocab_config="config.json")


def predict(text: str, voice: str = 'zf_001', speed: float = 1):
    """
    Generate speech audio from mandarin chinese text input.

    Parameters:
        text (string): The text to be converted into speech.
        voice (string): The selected male or female voice profile (specific voice ID, e.g. 'zf_001').
        speed (float): The speaking rate multiplier (e.g., 1.0 = normal speed, 0.8 = slower, 1.2 = faster).

    Returns:
        tuple: A (sample_rate, samples) pair holding the sampling rate and the generated audio samples.
    """
    # Reject empty input up front with a readable message instead of letting
    # the G2P / model call fail on it.
    if not text or not text.strip():
        raise gr.Error("Input text is empty.")

    # Convert the text to phonemes first; the second return value is unused.
    phonemes, _ = g2p(text)
    # The model is fed phonemes directly, hence is_phonemes=True.
    samples, sample_rate = kokoro.create(phonemes, voice, speed, is_phonemes=True)
    # gr.Audio receives the result as a (sample_rate, samples) tuple.
    return sample_rate, samples


app = gr.Interface(
    predict,
    [
        gr.Textbox(label='Input Text'),
        # Choices are (label, value) pairs, so the default is given as a voice ID.
        gr.Dropdown(list(VOICES.items()), value='zf_001', label='Voice'),
        gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed'),
    ],
    gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True),
    description="Kokoro TTS 🇨🇳 API Endpoint",
)

# The queue must be configured before launching: launch() with debug=True
# blocks the main thread, so a queue() call placed after it never takes effect.
app.queue()
app.launch(share=False, debug=True, show_error=True, mcp_server=True)