VieNeu-TTS

Runtime error

App Files Files Community

pnnbao-ump committed on Nov 18

Commit

8dc4072

1 Parent(s): a7ef740

deploy VieNeu-TTS-1000h

Browse files

Files changed (1) hide show

app.py +122 -126

app.py CHANGED Viewed

@@ -1,208 +1,215 @@
 import spaces  # PHẢI import TRƯỚC mọi thứ!
 import os
 os.environ['SPACES_ZERO_GPU'] = '1'  # Set environment variable
 import gradio as gr
-import numpy as np
 import soundfile as sf
 import tempfile
 import torch
-# Import vieneutts SAU khi đã setup spaces
-from vieneutts import VieNeuTTS
-# Khởi tạo model trên CPU trước
 print("📦 Đang tải model...")
 tts = VieNeuTTS(
-    backbone_repo="pnnbao-ump/VieNeu-TTS",
-    backbone_device="cpu",  # Load trên CPU trước
     codec_repo="neuphonic/neucodec",
-    codec_device="cpu"
 )
 print("✅ Model đã tải xong!")
 # Danh sách giọng mẫu
 VOICE_SAMPLES = {
-    "Nam miền Nam": {
         "audio": "./sample/id_0001.wav",
         "text": "./sample/id_0001.txt"
     },
-    "Nữ miền Nam": {
         "audio": "./sample/id_0002.wav",
         "text": "./sample/id_0002.txt"
     }
 }
-@spaces.GPU(duration=120)  # Giữ GPU trong 120 giây
 def synthesize_speech(text, voice_choice, custom_audio=None, custom_text=None):
-    """
-    Tổng hợp giọng nói từ văn bản - Chạy trên GPU
-    """
     try:
-        # Kiểm tra text input
         if not text or text.strip() == "":
             return None, "❌ Vui lòng nhập văn bản cần tổng hợp"
-        # Giới hạn độ dài text
-        if len(text) > 500:
-            return None, "❌ Văn bản quá dài! Vui lòng nhập tối đa 500 ký tự"
         # Xác định reference audio và text
         if custom_audio is not None and custom_text:
             ref_audio_path = custom_audio
-            ref_text = custom_text
         elif voice_choice in VOICE_SAMPLES:
             ref_audio_path = VOICE_SAMPLES[voice_choice]["audio"]
             ref_text_path = VOICE_SAMPLES[voice_choice]["text"]
             with open(ref_text_path, "r", encoding="utf-8") as f:
-                ref_text = f.read()
         else:
             return None, "❌ Vui lòng chọn giọng hoặc tải lên audio tùy chỉnh"
-        # Di chuyển model lên GPU
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        if device == "cuda":
-            print("🚀 Đang chuyển model lên GPU...")
-            tts.backbone = tts.backbone.to("cuda")
-            tts.codec = tts.codec.to("cuda")
-        # Encode reference audio
         print(f"📝 Đang xử lý: {text[:50]}...")
         ref_codes = tts.encode_reference(ref_audio_path)
-        # Tổng hợp giọng nói
         print(f"🎵 Đang tổng hợp giọng nói trên {device.upper()}...")
-        wav = tts.infer(text, ref_codes, ref_text)
-        # Di chuyển model về CPU
-        if device == "cuda":
-            print("💾 Đang giải phóng GPU...")
-            tts.backbone = tts.backbone.to("cpu")
-            tts.codec = tts.codec.to("cpu")
-            torch.cuda.empty_cache()
-        # Lưu file tạm
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             sf.write(tmp_file.name, wav, 24000)
             output_path = tmp_file.name
         print("✅ Hoàn thành!")
-        return output_path, f"✅ Tổng hợp thành công trên {device.upper()}!"
     except Exception as e:
         print(f"❌ Lỗi: {str(e)}")
         import traceback
         traceback.print_exc()
-        # Giải phóng GPU khi có lỗi
-        try:
-            if torch.cuda.is_available():
-                tts.backbone = tts.backbone.to("cpu")
-                tts.codec = tts.codec.to("cpu")
-                torch.cuda.empty_cache()
-        except:
-            pass
         return None, f"❌ Lỗi: {str(e)}"
-# Các ví dụ mẫu
-examples = [
-    ["Legacy là một bộ phim đột phá về mặt âm nhạc, quay phim, hiệu ứng đặc biệt, và tôi rất mừng vì cuối cùng nó cũng được cả giới phê bình lẫn người hâm mộ đánh giá lại. Chúng ta đã quá bất công với bộ phim này vào năm 2010.", "Nam miền Nam"],
-    ["Từ nhiều nguồn tài liệu lịch sử, có thể thấy nuôi con theo phong cách Do Thái không chỉ tốt cho đứa trẻ mà còn tốt cho cả các bậc cha mẹ.", "Nữ miền Nam"],
-    ["Các bác sĩ đang nghiên cứu một loại vaccine mới chống lại virus cúm mùa. Thí nghiệm lâm sàng cho thấy phản ứng miễn dịch mạnh mẽ và ít tác dụng phụ, mở ra hy vọng phòng chống dịch bệnh hiệu quả hơn trong tương lai.", "Nam miền Nam"],
-]
-# Custom CSS
 custom_css = """
 .gradio-container {
     max-width: 900px !important;
 }
-#warning {
-    background-color: #fff3cd;
-    border: 1px solid #ffc107;
-    border-radius: 5px;
-    padding: 10px;
-    margin: 10px 0;
-}
-#info {
-    background-color: #d1ecf1;
-    border: 1px solid #17a2b8;
-    border-radius: 5px;
-    padding: 10px;
     margin: 10px 0;
 }
 """
-# Tạo giao diện Gradio
 with gr.Blocks(title="VieNeu-TTS", css=custom_css, theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
-    # 🎙️ VieNeu-TTS: Vietnamese Text-to-Speech
-    Hệ thống tổng hợp tiếng nói tiếng Việt được **finetune từ NeuTTS-Air** - một mô hình TTS tiên tiến sử dụng Large Language Model và Neural Codec.
-    Tác giả: [Phạm Nguyễn Ngọc Bảo](https://github.com/pnnbao97)
-    Model: [VieNeu-TTS](https://huggingface.co/pnnbao-ump/VieNeu-TTS)
-    Code: [GitHub](https://github.com/pnnbao97/VieNeu-TTS)
-    Demo: [Hugging Face](https://huggingface.co/spaces/pnnbao-ump/VieNeu-TTS)
-    """)
     with gr.Row():
-        with gr.Column():
-            # Input text
             text_input = gr.Textbox(
-                label="📝 Văn bản đầu vào (tối đa 500 ký tự)",
-                placeholder="Nhập văn bản tiếng Việt...",
-                lines=4,
-                max_lines=6,
-                value="Legacy là một bộ phim đột phá về mặt âm nhạc, quay phim, hiệu ứng đặc biệt, và tôi rất mừng vì cuối cùng nó cũng được cả giới phê bình lẫn người hâm mộ đánh giá lại. Chúng ta đã quá bất công với bộ phim này vào năm 2010."  # Example 1 làm mặc định
             )
-            # Character counter
-            char_count = gr.Markdown("209 / 500 ký tự")  # Update số ký tự của example 1
-            # Voice selection
             voice_select = gr.Radio(
                 choices=list(VOICE_SAMPLES.keys()),
-                label="🎤 Chọn giọng",
-                value="Nam miền Nam"
             )
-            # Custom voice option
-            with gr.Accordion("🎨 Hoặc sử dụng giọng tùy chỉnh", open=False):
-                gr.Markdown("*Upload file audio (.wav) và nội dung text tương ứng*")
-                custom_audio = gr.Audio(
-                    label="File audio mẫu",
-                    type="filepath"
-                )
                 custom_text = gr.Textbox(
-                    label="Nội dung của audio mẫu",
                     placeholder="Nhập chính xác nội dung...",
                     lines=2
                 )
-            # Submit button
-            submit_btn = gr.Button("🎵 Tổng hợp giọng nói", variant="primary", size="lg")
-        with gr.Column():
-            # Output
-            audio_output = gr.Audio(label="🔊 Kết quả")
-            status_output = gr.Textbox(label="📊 Trạng thái", interactive=False)
     # Examples
-    gr.Markdown("### 💡 Ví dụ nhanh")
-    gr.Examples(
-        examples=examples,
-        inputs=[text_input, voice_select],
-        outputs=[audio_output, status_output],
-        fn=synthesize_speech,
-        cache_examples=False
-    )
     # Update character count
     def update_char_count(text):
         count = len(text) if text else 0
-        color = "red" if count > 500 else "green"
-        return f"<span style='color: {color}'>{count} / 500 ký tự</span>"
     text_input.change(
         fn=update_char_count,
@@ -216,17 +223,6 @@ with gr.Blocks(title="VieNeu-TTS", css=custom_css, theme=gr.themes.Soft()) as de
         inputs=[text_input, voice_select, custom_audio, custom_text],
         outputs=[audio_output, status_output]
     )
-    gr.Markdown("""
-    ---
-    ### 📌 Thông tin
-    **Liên kết:**
-    - [GitHub Repository](https://github.com/pnnbao97/VieNeu-TTS)
-    - [Model Card](https://huggingface.co/pnnbao-ump/VieNeu-TTS)
-    <sub>Powered by VieNeu-TTS | Built with ❤️ for Vietnamese TTS</sub>
-    """)
 # Launch
 if __name__ == "__main__":

 import spaces  # PHẢI import TRƯỚC mọi thứ!
 import os
 os.environ['SPACES_ZERO_GPU'] = '1'  # Set environment variable
 import gradio as gr
 import soundfile as sf
 import tempfile
 import torch
+from vieneu_tts import VieNeuTTS
+print("⏳ Đang khởi động VieNeu-TTS...")
+# Initialise the model
 # The target device is picked once, here at import time, and both the backbone
 # and the codec are constructed directly on it. `device` is also read later by
 # synthesize_speech for its progress message.
 # NOTE(review): this app imports `spaces` and runs synthesis under
 # @spaces.GPU; confirm CUDA is expected to be visible at import time in that
 # setup - if it is not, `device` is "cpu" and nothing visible here changes it.
 print("📦 Đang tải model...")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"🖥️ Sử dụng thiết bị: {device.upper()}")
 tts = VieNeuTTS(
+    backbone_repo="pnnbao-ump/VieNeu-TTS-1000h",
+    backbone_device=device,
     codec_repo="neuphonic/neucodec",
+    codec_device=device
 )
 print("✅ Model đã tải xong!")
 # Preset reference voices: display name -> paths of the reference audio clip
 # ("audio") and of the text file read as its reference text ("text").
 # The keys are also used as the choices of the voice selector in the UI.
 # NOTE: sample ids are not contiguous - no entry references id_0006.
 VOICE_SAMPLES = {
+    "Nam 1": {
         "audio": "./sample/id_0001.wav",
         "text": "./sample/id_0001.txt"
     },
+    "Nữ 1": {
         "audio": "./sample/id_0002.wav",
         "text": "./sample/id_0002.txt"
+    },
+    "Nam 2": {
+        "audio": "./sample/id_0003.wav",
+        "text": "./sample/id_0003.txt"
+    },
+    "Nữ 2": {
+        "audio": "./sample/id_0004.wav",
+        "text": "./sample/id_0004.txt"
+    },
+    "Nam 3": {
+        "audio": "./sample/id_0005.wav",
+        "text": "./sample/id_0005.txt"
+    },
+    "Nam 4": {
+        "audio": "./sample/id_0007.wav",
+        "text": "./sample/id_0007.txt"
     }
 }
+@spaces.GPU(duration=120)
 def synthesize_speech(text, voice_choice, custom_audio=None, custom_text=None):
     """Synthesize speech for ``text`` with a preset or a custom reference voice.

     Args:
         text: Text to synthesize. Must be non-blank and at most 250 characters.
         voice_choice: Key of a preset voice in ``VOICE_SAMPLES``.
         custom_audio: Optional path to a reference audio file. Only used when
             ``custom_text`` is also provided.
         custom_text: Reference text for ``custom_audio``. A missing, empty or
             whitespace-only value means "no custom voice" and the preset
             selected by ``voice_choice`` is used instead.

     Returns:
         A ``(output_path, status_message)`` tuple. ``output_path`` is the path
         of the generated WAV file, or ``None`` when validation or synthesis
         failed. Exceptions are caught, logged with a traceback and reported
         through ``status_message`` rather than raised.
     """
     try:
         if not text or not text.strip():
             return None, "❌ Vui lòng nhập văn bản cần tổng hợp"
         if len(text) > 250:
             return None, "❌ Văn bản quá dài! Vui lòng nhập tối đa 250 ký tự. Để tổng hợp văn bản dài hơn, vui lòng tham khảo examples/infer_long_text.py"

         # Resolve the reference audio and its reference text.
         # A whitespace-only custom text is treated like an empty one, so the
         # request falls back to the preset voice instead of cloning with a
         # blank reference text.
         if custom_audio is not None and custom_text and custom_text.strip():
             ref_audio_path = custom_audio
             ref_text_raw = custom_text
             print("🎨 Sử dụng giọng tùy chỉnh")
         elif voice_choice in VOICE_SAMPLES:
             preset = VOICE_SAMPLES[voice_choice]
             ref_audio_path = preset["audio"]
             with open(preset["text"], "r", encoding="utf-8") as f:
                 ref_text_raw = f.read()
             print(f"🎤 Sử dụng giọng: {voice_choice}")
         else:
             return None, "❌ Vui lòng chọn giọng hoặc tải lên audio tùy chỉnh"

         # Encode the reference clip, then synthesize.
         print(f"📝 Đang xử lý: {text[:50]}...")
         ref_codes = tts.encode_reference(ref_audio_path)
         print(f"🎵 Đang tổng hợp giọng nói trên {device.upper()}...")
         wav = tts.infer(text, ref_codes, ref_text_raw)

         # Reserve a unique .wav path and close our handle BEFORE soundfile
         # opens the file by name: reopening a still-open NamedTemporaryFile is
         # not portable. delete=False keeps the file for the caller to serve.
         with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
             output_path = tmp_file.name
         sf.write(output_path, wav, 24000)  # written with a 24 kHz sample rate

         print("✅ Hoàn thành!")
         return output_path, "✅ Tổng hợp thành công"
     except Exception as e:
         # UI boundary: report the failure in the status box instead of raising.
         print(f"❌ Lỗi: {e}")
         import traceback
         traceback.print_exc()
         return None, f"❌ Lỗi: {e}"
+# Custom CSS - kept minimal: container width/centering and the .warning-box notice style
 custom_css = """
 .gradio-container {
     max-width: 900px !important;
+    margin: 0 auto !important;
 }
+.warning-box {
+    background-color: #fef3c7;
+    border-left: 4px solid #f59e0b;
+    padding: 12px 16px;
+    border-radius: 6px;
     margin: 10px 0;
+    color: #000000;
 }
 """
+# Tạo giao diện
 with gr.Blocks(title="VieNeu-TTS", css=custom_css, theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
+    # VieNeu-TTS
+    Hệ thống tổng hợp tiếng nói tiếng Việt sử dụng Large Language Model
+    **Phiên bản:** VieNeu-TTS-1000h (model mới nhất, train trên 1000 giờ dữ liệu)
+    [GitHub](https://github.com/pnnbao97/VieNeu-TTS) • [Model Card](https://huggingface.co/pnnbao-ump/VieNeu-TTS) • [Finetune Guide](https://github.com/pnnbao-ump/VieNeuTTS/blob/main/finetune.ipynb)
+    """)
+    # Main interface
     with gr.Row():
+        with gr.Column(scale=1):
             text_input = gr.Textbox(
+                label="Văn bản",
+                placeholder="Nhập văn bản tiếng Việt (khuyến cáo dưới 250 ký tự)...",
+                lines=5,
+                value="Trí tuệ nhân tạo đang cách mạng hóa nhiều lĩnh vực, từ y tế, giáo dục đến giao thông vận tải, mang lại những giải pháp thông minh và hiệu quả."
             )
+            char_count = gr.Markdown("**142 / 250 ký tự**")
             voice_select = gr.Radio(
                 choices=list(VOICE_SAMPLES.keys()),
+                label="Chọn giọng",
+                value="Nam 1"
             )
+            submit_btn = gr.Button("Tổng hợp", variant="primary", size="lg")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(label="Kết quả", type="filepath")
+            status_output = gr.Textbox(label="Trạng thái", interactive=False, show_label=False)
+            with gr.Accordion("Giọng tùy chỉnh", open=False):
+                gr.Markdown("""
+Tải lên file audio và nhập nội dung tương ứng. Để có kết quả tốt nhất, nên finetune model trên giọng của bạn.
+                """)
+                custom_audio = gr.Audio(label="File audio (.wav)", type="filepath")
                 custom_text = gr.Textbox(
+                    label="Nội dung audio",
                     placeholder="Nhập chính xác nội dung...",
                     lines=2
                 )
+            gr.HTML("""
+            <div class="warning-box" style="color: #000000;">
+                ⚠️ Chúng tôi khuyến cáo sử dụng đoạn văn bản <250 ký tự để đảm bảo chất lượng tốt nhất.
+                Nếu muốn tổng hợp văn bản dài hơn, vui lòng tham khảo code trong examples/infer_long_text.py
+            </div>
+            """)
     # Examples
+    with gr.Row():
+        gr.Examples(
+            examples=[
+                ["Trí tuệ nhân tạo đang cách mạng hóa nhiều lĩnh vực, từ y tế, giáo dục đến giao thông vận tải, mang lại những giải pháp thông minh và hiệu quả.", "Nam 1"],
+                ["Trên bầu trời xanh thẳm, những đám mây trắng lửng lờ trôi như những chiếc thuyền nhỏ đang lướt nhẹ theo dòng gió. Dưới mặt đất, cánh đồng lúa vàng rực trải dài tới tận chân trời, những bông lúa nghiêng mình theo từng làn gió.", "Nữ 2"],
+                ["Legacy là một bộ phim đột phá về mặt âm nhạc, quay phim, hiệu ứng đặc biệt, và tôi rất mừng vì cuối cùng nó cũng được cả giới phê bình lẫn người hâm mộ đánh giá lại. Chúng ta đã quá bất công với bộ phim này vào năm 2010.", "Nam 4"],
+                ["Thật đáng ngạc nhiên! Mặc dù con đường này rất xa và khó đi, nhưng với sự kiên trì và sự đồng lòng của tất cả mọi người, chúng ta đã hoàn thành được công việc sửa chữa trước 3 ngày so với kế hoạch ban đầu, bạn có tin không?", "Nữ 1"],
+                ["Các bác sĩ đang nghiên cứu một loại vaccine mới chống lại virus cúm mùa. Thí nghiệm lâm sàng cho thấy phản ứng miễn dịch mạnh mẽ và ít tác dụng phụ.", "Nam 2"],
+            ],
+            inputs=[text_input, voice_select],
+            outputs=[audio_output, status_output],
+            fn=synthesize_speech,
+            cache_examples=False
+        )
+    # Footer info
+    gr.Markdown("""
+---
+**Tác giả:** Phạm Nguyễn Ngọc Bảo • **Model:** VieNeu-TTS-1000h
+**Lưu ý:** Nếu muốn sử dụng model cũ VieNeu-TTS-140h, hãy thay đổi `backbone_repo` trong mã nguồn
+---
+### Ủng hộ dự án
+VieNeu-TTS là dự án miễn phí và mã nguồn mở. Tuy nhiên, việc train model TTS chất lượng cao trên 1000+ giờ dữ liệu đòi hỏi nguồn lực tính toán đáng kể.
+Nếu bạn thấy dự án này hữu ích, hãy cân nhắc ủng hộ:
+☕ [Buy Me a Coffee](https://buymeacoffee.com/pnnbao)
+    """)
     # Update character count
     def update_char_count(text, *, limit=250):
         """Render the live ``<count> / <limit>`` character counter as HTML.

         Args:
             text: Current content of the text box; ``None`` or an empty
                 string counts as 0 characters.
             limit: Character budget shown after the slash and used as the
                 colour threshold. Keyword-only; defaults to 250.

         Returns:
             An HTML ``<span>`` string. The text colour is ``#dc2626`` once
             the count exceeds ``limit`` and ``#374151`` otherwise.
         """
         count = len(text) if text else 0
         color = "#dc2626" if count > limit else "#374151"
         return f"<span style='color: {color}; font-weight: 500'>{count} / {limit} ký tự</span>"
     text_input.change(
         fn=update_char_count,
         inputs=[text_input, voice_select, custom_audio, custom_text],
         outputs=[audio_output, status_output]
     )
 # Launch
 if __name__ == "__main__":