import json
import os
import subprocess
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
from demucs.apply import apply_model
from demucs.pretrained import DEFAULT_MODEL, get_model
from huggingface_hub import hf_hub_download, list_repo_files
from so_vits_svc_fork.hparams import HParams
from so_vits_svc_fork.inference.core import Svc

###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################
# The Hugging Face Hub repo ID - change repo_id here; it can be swapped for any trained model!
repo_id = "kevinwang676/talktalkai-qing"
# If None, uses the latest ckpt in the repo
ckpt_name = None
# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None
# Set the default f0 method - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"
# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5
# Limit on the duration of audio at inference time. Increase it if you can.
# In this parent app, we set the limit to 30 seconds via an env var.
# If you didn't set the env var and you hit OOM, try changing 9e9 to <=300 or so.
duration_limit = int(os.environ.get("MAX_DURATION_SECONDS", 9e9))
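# For example, to apply the 30-second limit mentioned above (assuming this file is app.py):
#   MAX_DURATION_SECONDS=30 python app.py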
###################################################################

# Figure out the latest generator checkpoint by taking the one with the highest step count.
# E.g., if the repo has G_0.pth, G_100.pth, G_200.pth, we use G_200.pth.
if ckpt_name is None:
    latest_id = sorted(
        [
            int(Path(x).stem.split("_")[1])
            for x in list_repo_files(repo_id)
            if x.startswith("G_") and x.endswith(".pth")
        ]
    )[-1]
    ckpt_name = f"G_{latest_id}.pth"
cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in list_repo_files(repo_id):
    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
    cluster_model_path = None
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0
generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
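# Load the pretrained Demucs source-separation model; it is used below to split
# vocals from the accompaniment before voice conversion.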
demucs_model = get_model(DEFAULT_MODEL)


def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
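    """Split ``filename`` into vocal and instrumental stems with Demucs.

    Returns ``(vocal_wav, instrumental_wav)``: a mono vocal track and the sum
    of the remaining stems (shape: samples x channels), both as numpy arrays
    sampled at ``sr``.
    """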
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref.std() + ref.mean()
    # We take just the vocals stem. For this model, the vocals are at index -1.
    # If using a different model, check model.sources.index('vocals').
    vocal_wav = sources[-1]
    # Same normalization the so-vits model expects.
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = vocal_wav.numpy()
    vocal_wav = librosa.to_mono(vocal_wav)
    vocal_wav = vocal_wav.T
    instrumental_wav = sources[:-1].sum(0).numpy().T
    return vocal_wav, instrumental_wav
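
# A minimal usage sketch (assumption: "song.wav" exists locally):
#   vox, inst = extract_vocal_demucs(demucs_model, "song.wav", device=device)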


def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
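    """Download the audio of a clip from ``start_time`` to ``end_time`` with yt-dlp.

    Retries up to ``num_attempts`` times; returns the output path on success,
    ``None`` on failure. Despite the name, yt-dlp also accepts Bilibili URLs,
    which is how the Bilibili tab below uses this function.
    """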
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        else:
            output_path.unlink()

    quiet = "--quiet --no-warnings" if quiet else ""
    command = f"""
        yt-dlp {quiet} -x --audio-format wav -f bestaudio -o "{output_filename}" --download-sections "*{start_time}-{end_time}" "{url_base}{video_identifier}"
    """.strip()

    attempts = 0
    while True:
        try:
            _ = subprocess.check_output(command, shell=True, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            attempts += 1
            if attempts == num_attempts:
                return None
        else:
            break

    if output_path.exists():
        return output_path
    else:
        return None
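
# A minimal usage sketch (assumption: yt-dlp and ffmpeg are available on PATH):
#   clip_path = download_youtube_clip("<video-id>", 0, 15, "clip.wav")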


def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
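    """Run so-vits-svc voice conversion on an audio file.

    Loads ``audio`` at the model's target sample rate (capped at
    ``duration_limit`` seconds) and returns ``(sample_rate, converted_audio)``
    in the format ``gr.Audio`` expects.
    """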
    audio, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)
    audio = model.infer_silence(
        audio.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    return model.target_sample, audio
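
# A minimal usage sketch (assumption: "vocals.wav" exists locally):
#   sr, converted = predict(speakers[0], "vocals.wav", transpose=0)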


def predict_song_from_yt(
    ytid_or_url,
    start,
    end,
    speaker=speakers[0],
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "dio",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
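    """Download a clip, separate the vocals, convert them, and remix.

    Returns three ``(sample_rate, audio)`` pairs: the full remixed song, the
    converted vocals alone, and the instrumental alone.
    """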
    end = min(start + duration_limit, end)
    original_track_filepath = download_youtube_clip(
        ytid_or_url,
        start,
        end,
        "track.wav",
        force=True,
        url_base="" if ytid_or_url.startswith("http") else "https://www.youtube.com/watch?v=",
    )
    vox_wav, inst_wav = extract_vocal_demucs(demucs_model, original_track_filepath)
    if transpose != 0:
        inst_wav = librosa.effects.pitch_shift(inst_wav.T, sr=model.target_sample, n_steps=transpose).T
    cloned_vox = model.infer_silence(
        vox_wav.astype(np.float32),
        speaker=speaker,
        transpose=transpose,
        auto_predict_f0=auto_predict_f0,
        cluster_infer_ratio=cluster_infer_ratio,
        noise_scale=noise_scale,
        f0_method=f0_method,
        db_thresh=db_thresh,
        pad_seconds=pad_seconds,
        chunk_seconds=chunk_seconds,
        absolute_thresh=absolute_thresh,
    )
    full_song = inst_wav + np.expand_dims(cloned_vox, 1)
    return (model.target_sample, full_song), (model.target_sample, cloned_vox), (model.target_sample, inst_wav)
| image_markdown = (""" | |
| <h1 align="center"><a href="http://www.talktalkai.com"><img src="https://y.qq.com/music/photo_new/T001R300x300M0000025Gr0r2OXvrn_2.jpg", alt="talktalkai" border="0" style="margin: 0 auto; height: 200px;" /></a> </h1> | |
| """) | |

with gr.Blocks() as demo:
    gr.HTML(
        "<center>"
        "<h1>🌊💕🎶 - TalkTalkAI + Music: upload material straight from Bilibili, no background-music separation needed</h1>"
        "</center>"
    )
    with gr.Accordion("📒 About this app (collapsible)", open=True):
        gr.Markdown("## <center>🏞️ - TalkTalkAI offers AI voice services for every scenario (voice mimicry, AI singers, voice conversion, and more)</center>")
        gr.Markdown("### <center>🥳 - TalkTalkAI partner musician: [一清清清](https://space.bilibili.com/22960772?spm_id_from=333.337.0.0); an AI singer that sings whatever I want!</center>")
        gr.Markdown("### <center>🎡 - For more, visit [TalkTalkAI](http://www.talktalkai.com); TalkTalkAI, surging with love!💕</center>")
        gr.Markdown('<center>💡 - How to use this app: open the "Upload from a Bilibili video" tab above, fill in the video URL and the start/end times, then click the "Let the AI singer perform" button! You can also click the examples at the bottom of the page for a quick preview.</center>')
        gr.Markdown(image_markdown)
    with gr.Tab("📺 - Upload from a Bilibili video"):
        with gr.Row():
            with gr.Column():
                inp1 = gr.Textbox(
                    label="Bilibili URL", info="Enter a Bilibili URL containing the song you like; the bare BV id also works", value="https://www.bilibili.com/video/BV..."
                )
                inp2 = gr.Number(value=0, label="Start time (seconds)")
                inp3 = gr.Number(value=15, label="End time (seconds)")
                inp4 = gr.Dropdown(speakers, value=speakers[0], label="🎤AI singer🎶 - 🌟一清清清🌟")
                inp5 = gr.Slider(-12, 12, value=0, step=1, label="Transpose (default 0; signed, e.g. +2 raises the pitch by two keys)")
                inp6 = gr.Checkbox(False, label="Enable automatic f0 prediction", info="Check to enable; f0 prediction works better together with the cluster model; for speech conversion only", visible=False)
                inp7 = gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="Cluster model mix ratio", info="Between 0 and 1; 0 disables clustering. The cluster model improves timbre similarity at the cost of articulation")
                inp8 = gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (recommended to leave unchanged)", visible=False)
                inp9 = gr.Dropdown(
                    choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                    value=default_f0_method,
                    label="Inference method (crepe gives the best results)", visible=False
                )
                btn1 = gr.Button("Let the AI singer perform", variant="primary")
            with gr.Column():
                out1 = gr.Audio(label="AI singer + accompaniment🎶")
                out2 = gr.Audio(label="Vocals🎤")
                out3 = gr.Audio(label="Accompaniment🎵")
        btn1.click(fn=predict_song_from_yt, inputs=[inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8, inp9], outputs=[out1, out2, out3])
        gr.Examples(
            examples=[["https://www.bilibili.com/video/BV1ip4y1p7Pn", 87, 103, speakers[0], 0, False, default_cluster_infer_ratio, 0.4, default_f0_method]],
            inputs=[inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8, inp9], outputs=[out1, out2, out3], fn=predict_song_from_yt, cache_examples=True,
        )
| with gr.Tab("🎙️ - 从麦克风上传"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| inp10=gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟") | |
| inp11=gr.Audio(type="filepath", source="microphone", label="请用麦克风上传您想转换的歌曲") | |
| inp12=gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)") | |
| inp13=gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False) | |
| inp14=gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)") | |
| inp15=gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False) | |
| inp16=gr.Dropdown( | |
| choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], | |
| value=default_f0_method, | |
| label="模型推理方法 (crepe推理效果最好)", visible=False | |
| ) | |
| btn2=gr.Button("让AI歌手开始演唱吧", variant="primary") | |
| with gr.Column(): | |
| out4=gr.Audio(label="AI歌手演唱🎶") | |
| btn2.click(fn=predict, inputs=[inp10, inp11, inp12, inp13, inp14, inp15, inp16], outputs=[out4]) | |
| with gr.Tab("🎵 - 从文件上传"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| inp17=gr.Dropdown(speakers, value=speakers[0], label="🎤AI歌手🎶 - 🌟一清清清🌟") | |
| inp18=gr.Audio(type="filepath", source="upload", label="请上传您想转换的歌曲 (仅人声部分)") | |
| inp19=gr.Slider(-12, 12, value=0, step=1, label="变调 (默认为0;有正负值,+2为升高两个key)") | |
| inp20=gr.Checkbox(False, label="是否开启自动f0预测", info="勾选即为开启;配合聚类模型f0预测效果更好,仅限语音转换时使用", visible=False) | |
| inp21=gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="聚类模型混合比例", info="0-1之间,0即不启用聚类。使用聚类模型能提升音色相似度,但会导致咬字下降 (如果使用,建议0.5左右)") | |
| inp22=gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale (建议保持不变)", visible=False) | |
| inp23=gr.Dropdown( | |
| choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"], | |
| value=default_f0_method, | |
| label="模型推理方法 (crepe推理效果最好)", visible=False | |
| ) | |
| btn3=gr.Button("让AI歌手开始演唱吧", variant="primary") | |
| with gr.Column(): | |
| out5=gr.Audio(label="AI歌手演唱🎶") | |
| btn3.click(fn=predict, inputs=[inp17, inp18, inp19, inp20, inp21, inp22, inp23], outputs=[out5]) | |
| gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。</center>") | |
| gr.HTML(''' | |
| <div class="footer"> | |
| <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘 | |
| </p> | |
| </div> | |
| ''') | |

demo.launch(show_error=True)