3
为方便测试，我写了一个基于 Kivy 的小框架：
import torch
import ChatTTS
from kivy.app import App
from kivy.uix.boxlayout import BoxLayout
from kivy.uix.button import Button
from kivy.uix.textinput import TextInput
from kivy.core.audio import SoundLoader
import soundfile as sf
import tempfile
# RNG seeds keyed by voice label; the selected seed is fed to
# torch.manual_seed() right before a speaker embedding is sampled.
# Labels, in order: narrator, middle-aged woman, young woman,
# middle-aged man, young man.
seeds = {
    label: {"seed": value}
    for label, value in (
        ("旁白", 2222),
        ("中年女性", 7869),
        ("年轻女性", 6615),
        ("中年男性", 4099),
        ("年轻男性", 6653),
    )
}
class ChatApp(App):
    """Small Kivy window for trying out ChatTTS.

    The window holds a single-line text box and a "Submit" button. Pressing
    the button synthesizes the entered text with ChatTTS, writes the audio
    to a temporary wav file and plays it back.

    Generated wav files are kept in one temporary directory that is removed
    when the application stops, so repeated clicks do not leave files behind.
    """

    # Key into the module-level ``seeds`` table selecting the voice to use.
    speaker_label = "年轻女性"

    # Directory holding the generated wav files; created on first synthesis.
    _audio_dir = None
    # Most recently loaded sound; kept so it can be stopped and unloaded.
    _sound = None

    def build(self):
        """Load the ChatTTS models and build the widget tree.

        Returns:
            The root ``BoxLayout`` containing the text input and the button.
        """
        self.chat = ChatTTS.Chat()
        self.chat.load_models(source='local', local_path='models')
        # Speaker statistics are loaded as one tensor and split into halves.
        self.std, self.mean = torch.load('models/asset/spk_stat.pt').chunk(2)

        layout = BoxLayout(orientation='vertical')
        self.input_text = TextInput(size_hint=(1, 0.8), multiline=False)
        submit_button = Button(text='Submit', size_hint=(1, 0.2))
        submit_button.bind(on_press=self.infer_and_play)
        layout.add_widget(self.input_text)
        layout.add_widget(submit_button)
        return layout

    def on_stop(self):
        """Release the last sound and delete every generated wav file."""
        self._release_sound()
        if self._audio_dir is not None:
            try:
                self._audio_dir.cleanup()
            except OSError as exc:
                # Best effort: a file still held open by the audio backend
                # must not turn application shutdown into a crash.
                print("could not remove temporary audio directory:", exc)
            self._audio_dir = None

    def infer_and_play(self, instance):
        """Synthesize the text currently in the input box and play it.

        Args:
            instance: The widget that fired the ``on_press`` event (unused).

        Note:
            Inference runs synchronously inside the event handler, so the
            window does not respond until synthesis has finished.
        """
        text = self.input_text.text
        if not text.strip():
            # Nothing to synthesize; avoid handing blank text to the model.
            return

        # Seed torch before sampling; presumably this makes
        # sample_random_speaker() return the same voice on every click.
        torch.manual_seed(seeds[self.speaker_label]["seed"])
        rnd_spk_emb = self.chat.sample_random_speaker()
        # Further sampling options ('temperature', 'top_P', 'top_K') were
        # left disabled by the original author and can be added to this dict.
        params_infer_code = {
            'spk_emb': rnd_spk_emb,
        }
        # Text refinement was likewise left disabled; to enable it, pass
        # params_refine_text={'prompt': '[oral_2][laugh_0][break_6]'}.
        wav = self.chat.infer(
            text,
            params_infer_code=params_infer_code,
            use_decoder=True,
        )[0][0]

        if self._audio_dir is None:
            self._audio_dir = tempfile.TemporaryDirectory()

        # delete=False keeps the file on disk after the handle is closed so
        # the audio loader can open it by name; the enclosing temporary
        # directory takes care of removing it later.
        with tempfile.NamedTemporaryFile(
            dir=self._audio_dir.name, delete=False, suffix=".wav"
        ) as temp_audio_file:
            sf.write(temp_audio_file, wav, 24000, format='WAV', subtype='PCM_24')
        print("temp_audio_file.name:", temp_audio_file.name)

        # Drop the previous clip before starting the new one.
        self._release_sound()
        sound = SoundLoader.load(temp_audio_file.name)
        if sound:
            self._sound = sound
            sound.volume = 1.0
            sound.play()
        else:
            print("could not load audio file:", temp_audio_file.name)

    def _release_sound(self):
        """Stop and unload the currently held sound, if there is one."""
        sound = self._sound
        if sound is not None:
            sound.stop()
            sound.unload()
        self._sound = None
# Start the GUI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    app = ChatApp()
    app.run()