import os
import torch
import argparse
import gradio as gr
from zipfile import ZipFile
import langid
import se_extractor
from api import BaseSpeakerTTS, ToneColorConverter

# command-line options
parser = argparse.ArgumentParser()
parser.add_argument("--share", action='store_true', default=False, help="make link public")
args = parser.parse_args()

# checkpoint locations and output directory
en_ckpt_base = 'checkpoints/base_speakers/EN'
ckpt_converter = 'checkpoints/converter'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# load models
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# load source speaker embeddings for the English base speaker
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)

# this online demo currently supports English only
supported_languages = ['en']


def predict(prompt, style, audio_file_pth, agree):
    # initialize an empty info message
    text_hint = ''

    # first detect the language of the input text
    language_predicted = langid.classify(prompt)[0].strip()
    print(f"Detected language: {language_predicted}")

    if language_predicted not in supported_languages:
        text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
        gr.Warning(
            f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
        )
        return (
            text_hint,
            None,
            None,
        )
    else:
        tts_model = en_base_speaker_tts
        if style == 'default':
            source_se = en_source_default_se
        else:
            source_se = en_source_style_se
        language = 'English'
        if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
            text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
            gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
            return (
                text_hint,
                None,
                None,
            )

    speaker_wav = audio_file_pth

    if len(prompt) < 2:
        text_hint += "[ERROR] Please give a longer prompt text \n"
        gr.Warning("Please give a longer prompt text")
        return (
            text_hint,
            None,
            None,
        )

    # note: diffusion_conditioning is not used by hifigan (the default mode); it will be empty but still needs to be passed to model.inference
    try:
        # extract the tone color embedding of the reference speaker
        target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
    except Exception as e:
        text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
        gr.Warning(
            f"[ERROR] Get target tone color error {str(e)} \n"
        )
        return (
            text_hint,
            None,
            None,
        )

    # synthesize speech with the base speaker TTS in the requested style
    src_path = f'{output_dir}/tmp.wav'
    tts_model.tts(prompt, src_path, speaker=style, language=language)

    save_path = f'{output_dir}/output.wav'

    # Run the tone color converter
    encode_message = "@MyShell"
    tone_color_converter.convert(
        audio_src_path=src_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=save_path,
        message=encode_message)

    text_hint += '''Get response successfully \n'''

    return (
        text_hint,
        save_path,
        speaker_wav,
    )


title = "MyShell OpenVoice"

description = """
We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
"""

markdown_table = """