upgrade to rocm6.0.2 and added xtts-ui
This commit is contained in:
parent
575f789fda
commit
2e8975c9a6
58
Dockerfile
Normal file → Executable file
58
Dockerfile
Normal file → Executable file
@ -1,4 +1,4 @@
|
||||
FROM rocm/dev-ubuntu-22.04:5.7-complete
|
||||
FROM rocm/dev-ubuntu-22.04:6.0.2-complete
|
||||
ENV DEBIAN_FRONTEND=noninteractive \
|
||||
PYTHONUNBUFFERED=1 \
|
||||
PYTHONIOENCODING=UTF-8
|
||||
@ -16,7 +16,12 @@ RUN apt-get update &&\
|
||||
python3.10-tk \
|
||||
python-is-python3 \
|
||||
python3.10-venv \
|
||||
rsync
|
||||
rsync \
|
||||
ffmpeg \
|
||||
python3.10-distutils \
|
||||
cron \
|
||||
unzip
|
||||
|
||||
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
|
||||
RUN python3.10 -m pip install --upgrade pip wheel setuptools
|
||||
|
||||
@ -24,7 +29,7 @@ RUN python3.10 -m pip install --upgrade pip wheel setuptools
|
||||
# Refresh the package index in the same layer so a cached, stale index cannot break the install
RUN apt-get update && apt-get install -y hipblas hipblaslt hipsparse hipcub hip-runtime-amd rocthrust rocthrust-dev rocrand
|
||||
|
||||
# Install PyTorch
|
||||
RUN python3.10 -m pip install torch torchvision --index-url https://download.pytorch.org/whl/nightly/rocm5.7
|
||||
RUN python3.10 -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.0
|
||||
|
||||
# Create ai folder for saving projects
|
||||
RUN mkdir /ai/
|
||||
@ -35,7 +40,7 @@ RUN git clone https://github.com/TimDettmers/bitsandbytes.git /ai/git/bitsandbyt
|
||||
# Check out the ROCm PR branch because it has not been merged upstream yet
|
||||
RUN git fetch origin refs/pull/756/head:rocmport && git checkout rocmport
|
||||
# Set the env variables to the container ROCM setup
|
||||
ENV ROCM_HOME=/opt/rocm-5.7.0
|
||||
ENV ROCM_HOME=/opt/rocm-6.0.2
|
||||
ENV ROCM_TARGET=gfx1030
|
||||
# Build and install globally
|
||||
RUN make hip && pip install .
|
||||
@ -67,6 +72,7 @@ RUN cp -R /ai/git/llamacpp/* /ai/llamacpp/
|
||||
RUN make LLAMA_HIPBLAS=1 -j4
|
||||
RUN pip install --ignore-installed flask requests
|
||||
|
||||
|
||||
# Install kohya_ss in /ai/
|
||||
WORKDIR /ai/kohya_ss
|
||||
# RUN apt-get install -y libgl1 libglib2.0-0 libgoogle-perftools-dev python3-html5lib python3-apt python3.10-distutils
|
||||
@ -78,11 +84,49 @@ RUN python3.10 -m venv /ai/venv/kohya_ss/ --system-site-packages
|
||||
# Install python requirements
|
||||
RUN /ai/venv/kohya_ss/bin/python -m pip install --upgrade pip wheel
|
||||
# Install the project requirements, then drop stock tensorflow (tensorflow-rocm is installed in the next step).
# -y is required: without it pip asks for confirmation, and the build has no stdin to answer the prompt.
RUN /ai/venv/kohya_ss/bin/python -m pip install -r requirements.txt && /ai/venv/kohya_ss/bin/python -m pip uninstall -y tensorflow
|
||||
RUN /ai/venv/kohya_ss/bin/python -m pip install accelerate tensorboard tensorflow-rocm lion_pytorch
|
||||
RUN /ai/venv/kohya_ss/bin/python -m pip install accelerate tensorboard tensorflow-rocm lion_pytorch scipy
|
||||
RUN /ai/venv/kohya_ss/bin/python -m pip install typing_extensions --upgrade
|
||||
|
||||
# Set safe directory for extensions and stuff
|
||||
RUN git config --global --add safe.directory "*"
|
||||
|
||||
# Install OpenVoice
|
||||
# Not fully working (errors out during inference)
|
||||
# WORKDIR /ai/openvoice
|
||||
# RUN git clone https://github.com/myshell-ai/OpenVoice.git /ai/git/openvoice
|
||||
# RUN cp -R /ai/git/openvoice/* /ai/openvoice/
|
||||
# RUN python3.10 -m venv /ai/venv/openvoice/ --system-site-packages
|
||||
# RUN sed -i '/wavmark==0.0.2/d' requirements.txt
|
||||
# RUN /ai/venv/openvoice/bin/python -m pip install -r requirements.txt
|
||||
# RUN /ai/venv/openvoice/bin/python -m pip install wavmark --no-dependencies # to avoid it reinstalling torch /(O_O)/
|
||||
# # Set safe directory for extensions and stuff
|
||||
# RUN git config --global --add safe.directory "*"
|
||||
# WORKDIR /ai/openvoice/checkpoints
|
||||
# RUN wget https://myshell-public-repo-hosting.s3.amazonaws.com/checkpoints_1226.zip
|
||||
# RUN unzip checkpoints_1226.zip -d ../
|
||||
# COPY ./openvoice/openvoice_app.py /ai/openvoice/
|
||||
# WORKDIR /root/.cache/torch/hub
|
||||
# RUN wget https://github.com/snakers4/silero-vad/archive/refs/tags/v4.0.zip
|
||||
# RUN unzip v4.0.zip && rm v4.0.zip
|
||||
# RUN mv silero-vad-4.0/ snakers4_silero-vad_v4.0/
|
||||
# RUN mkdir snakers4_silero-vad_master
|
||||
# RUN ln -s snakers4_silero-vad_v4.0/ snakers4_silero-vad_master
|
||||
# RUN sed -i 's/method="silero"/method="silero:v4.0"/' /ai/openvoice/se_extractor.py
|
||||
# RUN /ai/venv/openvoice/bin/python -m pip install --upgrade whisper_timestamped
|
||||
# ENV LD_PRELOAD=/usr/lib/aarch64-linux-gnu/libgomp.so.1
|
||||
|
||||
# Install XTTS-WebUI
|
||||
WORKDIR /ai/xtts
|
||||
RUN git clone https://github.com/daswer123/xtts-webui /ai/git/xtts
|
||||
RUN cp -R /ai/git/xtts/* /ai/xtts/
|
||||
RUN python3.10 -m venv /ai/venv/xtts/ --system-site-packages
|
||||
RUN sed -i '/torch==2.1.1/d' requirements.txt
|
||||
RUN sed -i '/torchaudio==2.1.1/d' requirements.txt
|
||||
RUN /ai/venv/xtts/bin/python -m pip install -r requirements.txt
|
||||
# Install deepspeed and ninja even though it doesn't work with XTTS (dunno why, compilation errors)
|
||||
RUN /ai/venv/xtts/bin/python -m pip install deepspeed ninja
|
||||
# Refresh the package index in the same layer so a cached, stale index cannot break the install
RUN apt-get update && apt-get install -y ninja-build
|
||||
|
||||
# Re-own the Stable Diffusion outputs every 10 seconds (six passes per one-minute cron tick).
# /etc/crontab entries need a user field before the command, and cron's default /bin/sh does not
# expand the bash-only {1..6}, so the iterations are listed explicitly.
# NOTE(review): nothing visible here starts the cron daemon; confirm it is launched at container start.
RUN sh -c 'echo "* * * * * root for i in 1 2 3 4 5 6; do chown -R 1000:0 /ai/stablediffusion-webui/outputs/ & sleep 10; done" >> /etc/crontab'
|
||||
|
||||
|
||||
# Go back to ai folder when done
|
||||
WORKDIR /ai
|
||||
|
39
docker-compose.yml
Normal file → Executable file
39
docker-compose.yml
Normal file → Executable file
@ -2,7 +2,7 @@ version: '3'
|
||||
|
||||
services:
|
||||
stablediff-rocm:
|
||||
image: ai-suite-rocm:5.7
|
||||
image: ai-suite-rocm:6.0
|
||||
container_name: stablediffusion-rocm
|
||||
environment:
|
||||
TZ: "Europe/Zurich"
|
||||
@ -19,7 +19,7 @@ services:
|
||||
echo 'You may need sudo to perform this action'
|
||||
exit 1
|
||||
fi;
|
||||
chmod -R 777 /ai/stablediffusion-webui/outputs;
|
||||
chown -R 1000:0 /ai/stablediffusion-webui/outputs/;
|
||||
/ai/venv/stablediffusion/bin/python launch.py"
|
||||
ports:
|
||||
- "5000:7860"
|
||||
@ -41,7 +41,7 @@ services:
|
||||
- ./stablediffusion/outputs:/ai/stablediffusion-webui/outputs/
|
||||
|
||||
kobold-rocm:
|
||||
image: ai-suite-rocm:5.7
|
||||
image: ai-suite-rocm:6.0
|
||||
container_name: koboldai-rocm
|
||||
environment:
|
||||
TZ: "Europe/Zurich"
|
||||
@ -65,7 +65,7 @@ services:
|
||||
- ./koboldai/models:/ai/koboldai/localmodels
|
||||
|
||||
llamacpp-rocm:
|
||||
image: ai-suite-rocm:5.7
|
||||
image: ai-suite-rocm:6.0
|
||||
container_name: llamacpp-rocm
|
||||
environment:
|
||||
TZ: "Europe/Zurich"
|
||||
@ -96,7 +96,7 @@ services:
|
||||
- ./llamacpp/extra:/ai/llamacpp/extra
|
||||
|
||||
koyhass-rocm:
|
||||
image: ai-suite-rocm:5.7
|
||||
image: ai-suite-rocm:6.0
|
||||
container_name: koyhass-rocm
|
||||
environment:
|
||||
TZ: "Europe/Zurich"
|
||||
@ -115,4 +115,31 @@ services:
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
- seccomp:unconfined
|
||||
volumes:
|
||||
- ./kohyass/models:/ai/kohya_ss/models
|
||||
- ./kohyass/extra:/ai/kohya_ss/extra
|
||||
|
||||
xtts-rocm:
|
||||
image: ai-suite-rocm:6.0
|
||||
container_name: xtts-rocm
|
||||
environment:
|
||||
TZ: "Europe/Zurich"
|
||||
entrypoint: ["/bin/sh", "-c"]
|
||||
working_dir: /ai/xtts/
|
||||
command: ["/ai/venv/xtts/bin/python app.py --host 0.0.0.0 -v v2.0.3"]
|
||||
ports:
|
||||
- "5005:8010"
|
||||
devices:
|
||||
- "/dev/kfd:/dev/kfd"
|
||||
- "/dev/dri:/dev/dri"
|
||||
group_add:
|
||||
- video
|
||||
ipc: host
|
||||
cap_add:
|
||||
- SYS_PTRACE
|
||||
security_opt:
|
||||
- seccomp:unconfined
|
||||
volumes:
|
||||
- ./xtts/outputs:/ai/xtts/output
|
||||
- ./xtts/models:/ai/xtts/models
|
||||
|
0
koboldai/config.kcpps
Normal file → Executable file
0
koboldai/config.kcpps
Normal file → Executable file
1
kohyass/models
Symbolic link
1
kohyass/models
Symbolic link
@ -0,0 +1 @@
|
||||
/mnt/DATA/SD_MODELS/
|
@ -2,3 +2,5 @@ mkdir -p stablediffusion koboldai llamacpp
|
||||
ln -s '/mnt/DATA/SD_MODELS/' ./stablediffusion/models
|
||||
ln -s '/mnt/DATA/LLM_MODELS/' ./koboldai/models
|
||||
ln -s '/mnt/DATA/LLM_MODELS/' ./llamacpp/models
|
||||
# The directory must be spelled "kohyass" to match the ./kohyass/models volume in docker-compose.yml
ln -s '/mnt/DATA/SD_MODELS/' ./kohyass/models
|
||||
ln -s '/mnt/DATA/XTTS_MODELS/' ./xtts/models
|
||||
|
238
openvoice(deprecated)/openvoice_app.py
Normal file
238
openvoice(deprecated)/openvoice_app.py
Normal file
@ -0,0 +1,238 @@
|
||||
import os
|
||||
import torch
|
||||
import argparse
|
||||
import gradio as gr
|
||||
from zipfile import ZipFile
|
||||
import langid
|
||||
import se_extractor
|
||||
from api import BaseSpeakerTTS, ToneColorConverter
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--share", action='store_true', default=False, help="make link public")
|
||||
args = parser.parse_args()
|
||||
|
||||
en_ckpt_base = 'checkpoints/base_speakers/EN'
|
||||
ckpt_converter = 'checkpoints/converter'
|
||||
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
output_dir = 'outputs'
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
# load models
|
||||
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
|
||||
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
|
||||
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
|
||||
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
|
||||
|
||||
# load speaker embeddings
|
||||
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
|
||||
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
|
||||
|
||||
# Only the English base speaker is loaded above, so only 'en' input is accepted
|
||||
supported_languages = ['en']
|
||||
|
||||
def predict(prompt, style, audio_file_pth, agree):
|
||||
# initialize an empty info message
|
||||
text_hint = ''
|
||||
|
||||
# first detect the input language
|
||||
language_predicted = langid.classify(prompt)[0].strip()
|
||||
print(f"Detected language:{language_predicted}")
|
||||
|
||||
if language_predicted not in supported_languages:
|
||||
text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
|
||||
gr.Warning(
|
||||
f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
|
||||
)
|
||||
|
||||
return (
|
||||
text_hint,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
else:
|
||||
tts_model = en_base_speaker_tts
|
||||
if style == 'default':
|
||||
source_se = en_source_default_se
|
||||
else:
|
||||
source_se = en_source_style_se
|
||||
language = 'English'
|
||||
if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
|
||||
text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
|
||||
gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
|
||||
return (
|
||||
text_hint,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
speaker_wav = audio_file_pth
|
||||
|
||||
if len(prompt) < 2:
|
||||
text_hint += f"[ERROR] Please give a longer prompt text \n"
|
||||
gr.Warning("Please give a longer prompt text")
|
||||
return (
|
||||
text_hint,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
# note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
|
||||
try:
|
||||
target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
|
||||
except Exception as e:
|
||||
text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
|
||||
# f-string so the exception text is interpolated (the plain string showed "{str(e)}" literally)
gr.Warning(
    f"[ERROR] Get target tone color error {str(e)} \n"
)
|
||||
return (
|
||||
text_hint,
|
||||
None,
|
||||
None,
|
||||
)
|
||||
|
||||
src_path = f'{output_dir}/tmp.wav'
|
||||
tts_model.tts(prompt, src_path, speaker=style, language=language)
|
||||
|
||||
save_path = f'{output_dir}/output.wav'
|
||||
# Run the tone color converter
|
||||
encode_message = "@MyShell"
|
||||
tone_color_converter.convert(
|
||||
audio_src_path=src_path,
|
||||
src_se=source_se,
|
||||
tgt_se=target_se,
|
||||
output_path=save_path,
|
||||
message=encode_message)
|
||||
|
||||
text_hint += f'''Get response successfully \n'''
|
||||
|
||||
return (
|
||||
text_hint,
|
||||
save_path,
|
||||
speaker_wav,
|
||||
)
|
||||
|
||||
|
||||
|
||||
title = "MyShell OpenVoice"
|
||||
|
||||
description = """
|
||||
We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
|
||||
"""
|
||||
|
||||
markdown_table = """
|
||||
<div align="center" style="margin-bottom: 10px;">
|
||||
|
||||
| | | |
|
||||
| :-----------: | :-----------: | :-----------: |
|
||||
| **OpenSource Repo** | **Project Page** | **Join the Community** |
|
||||
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
|
||||
|
||||
</div>
|
||||
"""
|
||||
|
||||
markdown_table_v2 = """
|
||||
<div align="center" style="margin-bottom: 2px;">
|
||||
|
||||
| | | | |
|
||||
| :-----------: | :-----------: | :-----------: | :-----------: |
|
||||
| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
|
||||
|
||||
| | |
|
||||
| :-----------: | :-----------: |
|
||||
**Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
|
||||
|
||||
</div>
|
||||
"""
|
||||
content = """
|
||||
<div>
|
||||
<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
|
||||
This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
|
||||
</div>
|
||||
"""
|
||||
wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
|
||||
|
||||
|
||||
examples = [
|
||||
[
|
||||
"今天天气真好,我们一起出去吃饭吧。",
|
||||
'default',
|
||||
"resources/demo_speaker1.mp3",
|
||||
True,
|
||||
],[
|
||||
"This audio is generated by open voice with a half-performance model.",
|
||||
'whispering',
|
||||
"resources/demo_speaker2.mp3",
|
||||
True,
|
||||
],
|
||||
[
|
||||
"He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
||||
'sad',
|
||||
"resources/demo_speaker0.mp3",
|
||||
True,
|
||||
],
|
||||
]
|
||||
|
||||
with gr.Blocks(analytics_enabled=False) as demo:
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
with gr.Row():
|
||||
gr.Markdown(
|
||||
"""
|
||||
## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
|
||||
"""
|
||||
)
|
||||
with gr.Row():
|
||||
gr.Markdown(markdown_table_v2)
|
||||
with gr.Row():
|
||||
gr.Markdown(description)
|
||||
with gr.Column():
|
||||
gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
|
||||
|
||||
with gr.Row():
|
||||
gr.HTML(wrapped_markdown_content)
|
||||
|
||||
with gr.Row():
|
||||
with gr.Column():
|
||||
input_text_gr = gr.Textbox(
|
||||
label="Text Prompt",
|
||||
info="One or two sentences at a time is better. Up to 200 text characters.",
|
||||
value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
||||
)
|
||||
style_gr = gr.Dropdown(
|
||||
label="Style",
|
||||
info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
|
||||
choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
|
||||
max_choices=1,
|
||||
value="default",
|
||||
)
|
||||
ref_gr = gr.Audio(
|
||||
label="Reference Audio",
|
||||
info="Click on the ✎ button to upload your own target speaker audio",
|
||||
type="filepath",
|
||||
value="resources/demo_speaker2.mp3",
|
||||
)
|
||||
tos_gr = gr.Checkbox(
|
||||
label="Agree",
|
||||
value=False,
|
||||
info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
|
||||
)
|
||||
|
||||
tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
|
||||
|
||||
|
||||
with gr.Column():
|
||||
out_text_gr = gr.Text(label="Info")
|
||||
audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
|
||||
ref_audio_gr = gr.Audio(label="Reference Audio Used")
|
||||
|
||||
gr.Examples(examples,
|
||||
label="Examples",
|
||||
inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
|
||||
outputs=[out_text_gr, audio_gr, ref_audio_gr],
|
||||
fn=predict,
|
||||
cache_examples=False,)
|
||||
tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
||||
|
||||
demo.queue()
|
||||
demo.launch(server_name='0.0.0.0', server_port=7862, debug=True, show_api=True, share=False)
|
1
xtts/models
Symbolic link
1
xtts/models
Symbolic link
@ -0,0 +1 @@
|
||||
/mnt/DATA/XTTS_MODELS/
|
Loading…
Reference in New Issue
Block a user