alakxender committed
Commit d81be79
Parent: 91ab2cd
Files changed (5)
  1. app.py +0 -2
  2. cbox_dv.py +0 -451
  3. cbox_test.py +0 -79
  4. chatterbox_dhivehi.py +0 -210
  5. requirements.txt +0 -1
app.py CHANGED
@@ -16,7 +16,6 @@ from csm1b_dv import (
     change_model_and_update_ui
 )
 from dia_1_6B_dv import get_dia_1_6B_tab
-from cbox_dv import get_cbox_dv
 
 with gr.Blocks(
     title="Dhivehi (Thaana) Text-to-Speech",
@@ -38,7 +37,6 @@ with gr.Blocks(
 ) as app:
     get_csm1b_tab()
     get_dia_1_6B_tab()
-    get_cbox_dv()
 
 if __name__ == "__main__":
     app.launch(share=False)
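With the cbox_dv import and tab removed, app.py keeps only the CSM-1B and Dia-1.6B tabs. A minimal sketch of the resulting file, assuming only what the diff context shows (theme, CSS, and other gr.Blocks keyword arguments omitted):

```python
import gradio as gr

from csm1b_dv import get_csm1b_tab, change_model_and_update_ui
from dia_1_6B_dv import get_dia_1_6B_tab

with gr.Blocks(title="Dhivehi (Thaana) Text-to-Speech") as app:
    get_csm1b_tab()     # CSM-1B tab
    get_dia_1_6B_tab()  # Dia-1.6B tab

if __name__ == "__main__":
    app.launch(share=False)
```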
cbox_dv.py DELETED
@@ -1,451 +0,0 @@
-from pathlib import Path
-import os
-try:
-    from huggingface_hub import snapshot_download
-    _target = Path.home() / ".chatterbox-tts-dhivehi"
-    if not (_target.exists() and any(_target.rglob("*"))):
-        snapshot_download(
-            repo_id="alakxender/chatterbox-tts-dhivehi",
-            local_dir=str(_target),
-            local_dir_use_symlinks=False,
-            resume_download=True
-        )
-except Exception as _e:
-    pass
-
-from chatterbox.tts import ChatterboxTTS
-import torchaudio
-from pathlib import Path
-import torch
-import random
-import numpy as np
-import gradio as gr
-import tempfile
-import os
-import chatterbox_dhivehi
-import warnings
-
-warnings.filterwarnings("ignore")
-
-chatterbox_dhivehi.extend_dhivehi()
-
-class TTSApp:
-    def __init__(self, checkpoint=f"{_target}/kn_cbox"):
-        self.checkpoint = checkpoint
-        self.model = None
-        self.load_model()
-
-    def load_model(self):
-        """Load the TTS model"""
-        try:
-            print(f"Loading model with checkpoint: {self.checkpoint}")
-            self.model = ChatterboxTTS.from_dhivehi(
-                ckpt_dir=Path(self.checkpoint),
-                device="cuda" if torch.cuda.is_available() else "cpu"
-            )
-            print("Model loaded successfully!")
-        except Exception as e:
-            print(f"Error loading model: {e}")
-            raise e
-
-    def set_seed(self, seed: int):
-        """Set random seed for reproducibility"""
-        torch.manual_seed(seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed(seed)
-            torch.cuda.manual_seed_all(seed)
-        random.seed(seed)
-        np.random.seed(seed)
-
-    def generate_speech(self,
-                        text,
-                        reference_audio,
-                        exaggeration=0.5,
-                        temperature=0.1,
-                        cfg_weight=0.5,
-                        seed=42):
-        """Generate speech from text using voice cloning"""
-
-        # Clean the input text
-        text = self.clean_text(text)
-
-        if not text:
-            return None, "Please enter some text to generate speech."
-
-        if self.model is None:
-            return None, "Model not loaded. Please check your model paths."
-
-        try:
-            # Set seed for reproducibility
-            self.set_seed(seed)
-
-            # Handle reference audio - make it optional
-            audio_prompt_path = reference_audio
-
-            print(f"Generating audio for: {text[:50]}...")
-            if audio_prompt_path:
-                print(f"Using reference audio: {audio_prompt_path}")
-            else:
-                print("Generating without reference audio")
-
-            # Generate audio - handle optional reference audio
-            if audio_prompt_path:
-                audio = self.model.generate(
-                    text=text,
-                    audio_prompt_path=audio_prompt_path,
-                    exaggeration=exaggeration,
-                    temperature=temperature,
-                    cfg_weight=cfg_weight,
-                )
-            else:
-                # Try without reference audio
-                try:
-                    audio = self.model.generate(
-                        text=text,
-                        exaggeration=exaggeration,
-                        temperature=temperature,
-                        cfg_weight=cfg_weight,
-                    )
-                except TypeError:
-                    # If the model requires audio_prompt_path, try with empty string
-                    audio = self.model.generate(
-                        text=text,
-                        audio_prompt_path="",
-                        exaggeration=exaggeration,
-                        temperature=temperature,
-                        cfg_weight=cfg_weight,
-                    )
-
-            # Save to temporary file
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-                output_path = tmp_file.name
-
-            torchaudio.save(output_path, audio, 24000)
-
-            return output_path, f"Successfully generated speech! Audio length: {audio.shape[1]/24000:.2f} seconds"
-
-        except Exception as e:
-            error_msg = f"Error generating speech: {str(e)}"
-            print(error_msg)
-            return None, error_msg
-
-    def clean_text(self, text):
-        """Clean text by removing newlines at start/end, double spaces, and extra whitespace"""
-        import re
-
-        # Remove newlines at start and end
-        text = text.strip('\n\r')
-
-        # Replace multiple spaces with single space
-        text = re.sub(r'\s+', ' ', text)
-
-        # Strip leading and trailing spaces
-        text = text.strip()
-
-        return text
-
-    def split_sentences(self, text):
-        """Split text into sentences based on periods, ensuring each sentence is at least 150 characters"""
-        # Clean the input text first
-        text = self.clean_text(text)
-
-        # First, split by periods normally
-        initial_sentences = []
-        current_sentence = ""
-
-        for char in text:
-            current_sentence += char
-            if char == '.':
-                # Add sentence if it's not empty after stripping spaces from both sides
-                stripped_sentence = current_sentence.strip()
-                if stripped_sentence:
-                    initial_sentences.append(stripped_sentence)
-                current_sentence = ""
-
-        # Add remaining text if any (without period), stripped of spaces from both sides
-        stripped_remaining = current_sentence.strip()
-        if stripped_remaining:
-            initial_sentences.append(stripped_remaining)
-
-        # If we only have one sentence, return it
-        if len(initial_sentences) <= 1:
-            return initial_sentences
-
-        # Now combine sentences until each is at least 150 characters
-        final_sentences = []
-        combined_sentence = ""
-
-        for sentence in initial_sentences:
-            if combined_sentence:
-                combined_sentence += " " + sentence
-            else:
-                combined_sentence = sentence
-
-            # If combined sentence is >= 150 chars, add it to final list
-            if len(combined_sentence) >= 150:
-                final_sentences.append(combined_sentence.strip())
-                combined_sentence = ""
-
-        # Add any remaining combined sentence (even if < 150 chars)
-        if combined_sentence.strip():
-            final_sentences.append(combined_sentence.strip())
-
-        return final_sentences
-
-    def generate_speech_multi_sentence(self,
-                                       text,
-                                       reference_audio,
-                                       exaggeration=0.5,
-                                       temperature=0.1,
-                                       cfg_weight=0.5,
-                                       seed=42):
-        """Generate speech from text with multi-sentence support and progress tracking"""
-
-        # Clean the input text
-        text = self.clean_text(text)
-
-        if not text:
-            yield None, "Please enter some text to generate speech."
-            return
-
-        if self.model is None:
-            yield None, "Model not loaded. Please check your model paths."
-            return
-
-        # Split text into sentences
-        sentences = self.split_sentences(text)
-
-        # If only one sentence or no periods, use regular method
-        if len(sentences) <= 1:
-            yield None, "🎵 Generating single sentence..."
-            result_audio, result_status = self.generate_speech(text, reference_audio, exaggeration, temperature, cfg_weight, seed)
-            yield result_audio, result_status
-            return
-
-        try:
-            # Set seed for reproducibility
-            self.set_seed(seed)
-
-            # Handle reference audio - make it optional
-            audio_prompt_path = reference_audio
-
-            yield None, f"🚀 Starting generation for {len(sentences)} sentences..."
-            print(f"Processing {len(sentences)} sentences...")
-
-            all_audio_segments = []
-            total_duration = 0
-
-            for i, sentence in enumerate(sentences):
-                # Calculate progress percentage
-                progress_percent = int((i / len(sentences)) * 90)  # Reserve last 10% for combining
-                yield None, f"🎵 Generating sentence {i+1}/{len(sentences)} ({progress_percent}%): {sentence[:50]}..."
-
-                print(f"Generating audio for sentence {i+1}/{len(sentences)}: {sentence[:50]}...")
-
-                # Generate audio for this sentence
-                try:
-                    if audio_prompt_path:
-                        audio = self.model.generate(
-                            text=sentence,
-                            audio_prompt_path=audio_prompt_path,
-                            exaggeration=exaggeration,
-                            temperature=temperature,
-                            cfg_weight=cfg_weight,
-                        )
-                    else:
-                        # Try without reference audio
-                        try:
-                            audio = self.model.generate(
-                                text=sentence,
-                                exaggeration=exaggeration,
-                                temperature=temperature,
-                                cfg_weight=cfg_weight,
-                            )
-                        except TypeError:
-                            # If the model requires audio_prompt_path, try with empty string
-                            audio = self.model.generate(
-                                text=sentence,
-                                audio_prompt_path="",
-                                exaggeration=exaggeration,
-                                temperature=temperature,
-                                cfg_weight=cfg_weight,
-                            )
-                except Exception as model_error:
-                    # If the model fails due to missing reference audio, try with default behavior
-                    if "reference_voice.wav not found" in str(model_error) or "No reference audio provided" in str(model_error):
-                        print("Attempting generation without reference audio...")
-                        # Try different approaches for models that don't support None reference audio
-                        try:
-                            # Some models might accept an empty string
-                            audio = self.model.generate(
-                                text=sentence,
-                                audio_prompt_path="",
-                                exaggeration=exaggeration,
-                                temperature=temperature,
-                                cfg_weight=cfg_weight,
-                            )
-                        except:
-                            # If that fails, try without the audio_prompt_path parameter entirely
-                            audio = self.model.generate(
-                                text=sentence,
-                                exaggeration=exaggeration,
-                                temperature=temperature,
-                                cfg_weight=cfg_weight,
-                            )
-                    else:
-                        raise model_error
-
-                all_audio_segments.append(audio)
-                total_duration += audio.shape[1] / 24000
-
-            # Concatenate all audio segments
-            yield None, "🔧 Combining audio segments (95%)..."
-            print("Combining audio segments...")
-            combined_audio = torch.cat(all_audio_segments, dim=1)
-
-            # Save to temporary file
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
-                output_path = tmp_file.name
-
-            torchaudio.save(output_path, combined_audio, 24000)
-            print("Multi-sentence processing complete!")
-
-            yield output_path, f"✅ Successfully generated speech from {len(sentences)} sentences! Total audio length: {total_duration:.2f} seconds"
-
-        except Exception as e:
-            error_msg = f"❌ Error generating multi-sentence speech: {str(e)}"
-            print(error_msg)
-            yield None, error_msg
-
-def get_cbox_dv():
-    """Create the Gradio interface"""
-
-    # Initialize the TTS app
-    #tts_app = TTSApp()
-
-    # Sample texts in Dhivehi
-    sample_texts = [
-        "ކާޑު ނުލައި ފައިސާ ދެއްކޭ ނެޝަނަލް ކިއުއާރް ކޯޑް އެމްއެމްއޭ އިން ތައާރަފްކުރަނީ",
-        """ފުޓްބޯޅަ ސްކޫލްގެ ބިމާއި ގުދަންބަރި ބިމުގައި އިމާރާތް ކުރުމުގެ މަސައްކަތް ހުއްޓާލަން އަންގައިފި...
-Construction work on football school land and warehouse land has been ordered to stop""",
-        "ސިވިލް ސާވިސްގެ ހިދުމަތުގެ މުއްދަތު ގުނުމުގައި ކުންފުނިތަކާއި އިދާރާތަކަށް ހިދުމަތްކުރި މުއްދަތު ހިމަނަނީ",
-        """އެ ރަށުގެ ބިން ހިއްކުމާއި ބަނދަރުގެ ނެރު ބަދަލުކުރުމާއި ގޮނޑުދޮށް ހިމާޔަތް ކުރުމުގެ މަސައްކަތް އެމްޓީސީސީއާ މިނިސްޓްރީން ހަވާލުކުރީ މިދިޔަ މަހު ރައީސް އެ ރަށަށް ކުރެއްވި ދަތުރުފުޅުގައި.
-The ministry handed over the land reclamation, replacement of the port canal and beach protection to MTCC during the President's visit to the village last month"""
-    ]
-
-    with gr.Tab("🎤 ChatterboxTTS"):
-        gr.Markdown("# 🎤 ChatterboxTTS - Dhivehi Text-to-Speech with Voice Cloning")
-        gr.Markdown("Generate natural-sounding Dhivehi speech with voice cloning capabilities.")
-
-        # Row 1: Text input and Reference audio
-        with gr.Row():
-            text_input = gr.Textbox(
-                label="Text to Convert",
-                placeholder="Enter Dhivehi text here...",
-                lines=6,
-                value=sample_texts[0],
-                rtl=True,
-                elem_classes=["textbox1"]
-            )
-            reference_audio = gr.Audio(
-                label="Reference Voice Audio (optional - for voice cloning)",
-                type="filepath",
-                sources=["upload", "microphone"],
-            )
-
-        # Row 2: Example buttons
-        gr.Markdown("**Quick Examples:**")
-        with gr.Row():
-            sample_btn1 = gr.Button("Sample 1", size="sm")
-            sample_btn2 = gr.Button("Sample 2", size="sm")
-            sample_btn3 = gr.Button("Sample 3", size="sm")
-            sample_btn4 = gr.Button("Sample 4", size="sm")
-
-        # Row 3: Advanced settings
-        with gr.Accordion("Advanced Settings", open=False):
-            with gr.Row():
-                exaggeration = gr.Slider(
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=0.5,
-                    step=0.1,
-                    label="Exaggeration",
-                    info="Controls expressiveness"
-                )
-                temperature = gr.Slider(
-                    minimum=0.01,
-                    maximum=1.0,
-                    value=0.35,
-                    step=0.01,
-                    label="Temperature",
-                    info="Controls randomness"
-                )
-                cfg_weight = gr.Slider(
-                    minimum=0.0,
-                    maximum=2.0,
-                    value=0.3,
-                    step=0.1,
-                    label="CFG Weight",
-                    info="Classifier-free guidance weight"
-                )
-                seed = gr.Slider(
-                    minimum=0,
-                    maximum=9999,
-                    value=42,
-                    step=1,
-                    label="Seed",
-                    info="For reproducible results"
-                )
-
-        # Row 4: Generate button
-        generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")
-
-        # Row 5: Output section
-        with gr.Row():
-            with gr.Column():
-                output_audio = gr.Audio(label="Generated Speech", type="filepath")
-                status_message = gr.Textbox(label="Status", interactive=False)
-
-        # Event handlers
-        def set_sample_text(sample_idx):
-            return sample_texts[sample_idx]
-
-        sample_btn1.click(lambda: set_sample_text(0), outputs=[text_input])
-        sample_btn2.click(lambda: set_sample_text(1), outputs=[text_input])
-        sample_btn3.click(lambda: set_sample_text(2), outputs=[text_input])
-        sample_btn4.click(lambda: set_sample_text(3), outputs=[text_input])
-
-        def generate_with_progress(text, reference_audio, exaggeration, temperature, cfg_weight, seed):
-            """Generate speech with streaming progress updates"""
-            # Use the streaming generator from the TTS app
-            #for result_audio, result_status in tts_app.generate_speech_multi_sentence(
-            #    text, reference_audio, exaggeration, temperature, cfg_weight, seed
-            #):
-            #    yield result_audio, result_status
-
-        generate_btn.click(
-            fn=generate_with_progress,
-            inputs=[text_input, reference_audio, exaggeration, temperature, cfg_weight, seed],
-            outputs=[output_audio, status_message]
-        )
-
-        # Instructions
-        with gr.Accordion("Tips", open=False):
-            gr.Markdown("""
-            ### General Use (TTS and Voice Agents):
-            - The default settings (exaggeration=0.5, cfg=0.5) work well for most prompts.
-            - If the reference speaker has a fast speaking style, lowering cfg to around 0.3 can improve pacing.
-
-            ### Expressive or Dramatic Speech:
-            - Try lower cfg values (e.g. ~0.3) and increase exaggeration to around 0.7 or higher.
-            - Higher exaggeration tends to speed up speech; reducing cfg helps compensate with slower, more deliberate pacing.
-
-            ### Language Transfer Notes:
-            - Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language.
-            - To mitigate this, set the CFG weight to 0.
-
-            ### Additional Tips:
-            - For best voice cloning results, use clear audio with minimal background noise
-            - The reference audio should be 3-10 seconds long
-            - Use the same seed value for reproducible results
-            """)
cbox_test.py DELETED
@@ -1,79 +0,0 @@
-from pathlib import Path
-import os
-try:
-    from huggingface_hub import snapshot_download
-    _target = Path.home() / ".chatterbox-tts-dhivehi"
-    if not (_target.exists() and any(_target.rglob("*"))):
-        snapshot_download(
-            repo_id="alakxender/chatterbox-tts-dhivehi",
-            local_dir=str(_target),
-            local_dir_use_symlinks=False,
-            resume_download=True
-        )
-except Exception as _e:
-    pass
-
-from chatterbox.tts import ChatterboxTTS
-import chatterbox_dhivehi
-import torchaudio
-import torch
-import numpy as np
-import random
-# ---- User settings (edit these) ----
-CKPT_DIR = f"{_target}/kn_cbox"  # path to your finetuned checkpoint dir
-REF_WAV = f"{_target}/samples/reference_audio.wav"  # optional 3–10s clean reference; "" to disable
-#REF_WAV = ""
-TEXT = "މި ރިޕޯޓާ ގުޅޭ ގޮތުން އެނިމަލް ވެލްފެއާ މިނިސްޓްރީން އަދި ވާހަކައެއް ނުދައްކާ"  # sample Dhivehi text
-TEXT = f"{TEXT}, The Animal Welfare Ministry has not yet commented on the report"
-EXAGGERATION = 0.4
-TEMPERATURE = 0.3
-CFG_WEIGHT = 0.7
-SEED = 42
-SAMPLE_RATE = 24000
-OUT_PATH = "out.wav"
-# ------------------------------------
-
-# Extend Dhivehi support from local file
-chatterbox_dhivehi.extend_dhivehi()
-
-# Seed for reproducibility
-torch.manual_seed(SEED)
-if torch.cuda.is_available():
-    torch.cuda.manual_seed(SEED)
-    torch.cuda.manual_seed_all(SEED)
-random.seed(SEED)
-np.random.seed(SEED)
-
-# Load model
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading ChatterboxTTS from: {CKPT_DIR} on {device}")
-model = ChatterboxTTS.from_dhivehi(ckpt_dir=Path(CKPT_DIR), device=device)
-print("Model loaded.")
-
-# Generate (reference audio optional)
-print(f"Generating audio... ref={'yes' if REF_WAV else 'no'}")
-gen_kwargs = dict(
-    text=TEXT,
-    exaggeration=EXAGGERATION,
-    temperature=TEMPERATURE,
-    cfg_weight=CFG_WEIGHT,
-)
-
-try:
-    if REF_WAV:
-        gen_kwargs["audio_prompt_path"] = REF_WAV
-        audio = model.generate(**gen_kwargs)
-    else:
-        # Try without reference first; if backend requires audio_prompt_path, fall back to ""
-        try:
-            audio = model.generate(**gen_kwargs)
-        except TypeError:
-            gen_kwargs["audio_prompt_path"] = ""
-            audio = model.generate(**gen_kwargs)
-except Exception as e:
-    raise RuntimeError(f"Generation failed: {e}")
-
-# Save
-torchaudio.save(OUT_PATH, audio, SAMPLE_RATE)
-dur = audio.shape[1] / SAMPLE_RATE
-print(f"Saved {OUT_PATH} ({dur:.2f}s)")
chatterbox_dhivehi.py DELETED
@@ -1,210 +0,0 @@
-# chatterbox_dhivehi.py
-"""
-Dhivehi extension for ChatterboxTTS.
-
-Requires: chatterbox-tts 0.1.4 (not tested on any other version)
-
-Adds:
-  - load_t3_with_vocab(state_dict, device, force_vocab_size): load T3 with a specific vocab size,
-    resizing both the embedding and the projection head, and padding checkpoint weights if needed.
-  - from_dhivehi(...): classmethod for building a ChatterboxTTS from a checkpoint directory,
-    using load_t3_with_vocab under the hood (defaults to vocab=1199).
-  - extend_dhivehi(): attach the above to ChatterboxTTS (idempotent).
-
-Usage in app.py:
-    import chatterbox_dhivehi
-    chatterbox_dhivehi.extend_dhivehi()
-
-    self.model = ChatterboxTTS.from_dhivehi(
-        ckpt_dir=Path(self.checkpoint),
-        device="cuda" if torch.cuda.is_available() else "cpu",
-        force_vocab_size=2000,
-    )
-"""
-
-from __future__ import annotations
-import logging
-from pathlib import Path
-from typing import Optional, Union
-
-import torch
-import torch.nn as nn
-from safetensors.torch import load_file
-
-# Core chatterbox imports
-from chatterbox.tts import ChatterboxTTS, Conditionals
-from chatterbox.models.t3 import T3
-from chatterbox.models.s3gen import S3Gen
-from chatterbox.models.tokenizers import EnTokenizer
-from chatterbox.models.voice_encoder import VoiceEncoder
-
-
-# Helpers
-
-def _expand_or_trim_rows(t: torch.Tensor, new_rows: int, init_std: float = 0.02) -> torch.Tensor:
-    """
-    Return a tensor with first dimension resized to `new_rows`.
-    If expanding, newly added rows are randomly initialized N(0, init_std).
-    """
-    old_rows = t.shape[0]
-    if new_rows == old_rows:
-        return t.clone()
-    if new_rows < old_rows:
-        return t[:new_rows].clone()
-    # expand
-    out = t.new_empty((new_rows,) + t.shape[1:])
-    out[:old_rows] = t
-    out[old_rows:].normal_(mean=0.0, std=init_std)
-    return out
-
-
-def _prepare_resized_state_dict(sd: dict, new_vocab: int, init_std: float = 0.02) -> dict:
-    """
-    Create a modified copy of `sd` where text_emb/text_head weights (and bias) match `new_vocab`.
-    """
-    sd = sd.copy()
-
-    # text embedding: [vocab, dim]
-    if "text_emb.weight" in sd:
-        sd["text_emb.weight"] = _expand_or_trim_rows(sd["text_emb.weight"], new_vocab, init_std)
-
-    # text projection head: Linear(out=vocab, in=dim)
-    if "text_head.weight" in sd:
-        sd["text_head.weight"] = _expand_or_trim_rows(sd["text_head.weight"], new_vocab, init_std)
-    if "text_head.bias" in sd:
-        bias = sd["text_head.bias"]
-        if bias.ndim == 1:
-            sd["text_head.bias"] = _expand_or_trim_rows(bias.unsqueeze(1), new_vocab, init_std).squeeze(1)
-
-    return sd
-
-
-def _resize_model_vocab_layers(model: T3, new_vocab: int, dim: Optional[int] = None) -> None:
-    """
-    Rebuild model.text_emb and model.text_head to match `new_vocab`.
-    Embedding dim is inferred from existing layers if not provided.
-    """
-    if dim is None:
-        if hasattr(model, "text_emb") and isinstance(model.text_emb, nn.Embedding):
-            dim = model.text_emb.embedding_dim
-        elif hasattr(model, "text_head") and isinstance(model.text_head, nn.Linear):
-            dim = model.text_head.in_features
-        else:
-            raise RuntimeError("Cannot infer text embedding dimension from T3 model.")
-    model.text_emb = nn.Embedding(new_vocab, dim)
-    model.text_head = nn.Linear(dim, new_vocab, bias=True)
-
-
-# Public API
-
-def load_t3_with_vocab(
-    t3_state_dict: dict,
-    device: str = "cpu",
-    *,
-    force_vocab_size: Optional[int] = None,
-    init_std: float = 0.02,
-) -> T3:
-    """
-    Load a T3 model with a specified vocabulary size.
-
-    - Removes a leading "t3." prefix on state_dict keys if present.
-    - Resizes BOTH `text_emb` and `text_head` to `force_vocab_size` (or to the checkpoint vocab if not forced).
-    - Pads checkpoint weights when the target vocab is larger than the checkpoint's.
-
-    Args:
-        t3_state_dict: state dict loaded from t3_cfg.safetensors (or similar).
-        device: "cpu", "cuda", or "mps".
-        force_vocab_size: desired vocab size (e.g., 2000 for Dhivehi-extended models).
-        init_std: std for random init of padded rows.
-
-    Returns:
-        T3: model moved to `device` and set to eval().
-    """
-    logger = logging.getLogger(__name__)
-
-    # Strip "t3." prefix if present
-    if any(k.startswith("t3.") for k in t3_state_dict.keys()):
-        t3_state_dict = {k[len("t3."):]: v for k, v in t3_state_dict.items()}
-
-    # derive checkpoint vocab if available
-    ckpt_vocab_size = None
-    if "text_emb.weight" in t3_state_dict and t3_state_dict["text_emb.weight"].ndim == 2:
-        ckpt_vocab_size = int(t3_state_dict["text_emb.weight"].shape[0])
-    elif "text_head.weight" in t3_state_dict and t3_state_dict["text_head.weight"].ndim == 2:
-        ckpt_vocab_size = int(t3_state_dict["text_head.weight"].shape[0])
-
-    target_vocab = int(force_vocab_size) if force_vocab_size is not None else ckpt_vocab_size
-    if target_vocab is None:
-        raise RuntimeError("Could not determine vocab size. Provide force_vocab_size.")
-
-    logger.info(f"Loading T3 with vocab={target_vocab} (ckpt_vocab={ckpt_vocab_size})")
-
-    # Build a base model and resize layers to accept the incoming state dict
-    t3 = T3()
-    _resize_model_vocab_layers(t3, target_vocab)
-
-    # Patch the checkpoint tensors to the target vocab
-    patched_sd = _prepare_resized_state_dict(t3_state_dict, target_vocab, init_std)
-
-    # Load (strict=False to tolerate benign extra/missing keys)
-    t3.load_state_dict(patched_sd, strict=False)
-    return t3.to(device).eval()
-
-
-def from_dhivehi(
-    cls,
-    *,
-    ckpt_dir: Union[str, Path],
-    device: str = "cpu",
-    force_vocab_size: int = 1199,
-):
-    """
-    Construct a Dhivehi-extended ChatterboxTTS from a checkpoint directory.
-
-    Expected files in `ckpt_dir`:
-      - ve.safetensors
-      - t3_cfg.safetensors
-      - s3gen.safetensors
-      - tokenizer.json
-      - conds.pt (optional)
-    """
-    ckpt_dir = Path(ckpt_dir)
-
-    # Voice encoder
-    ve = VoiceEncoder()
-    ve.load_state_dict(load_file(ckpt_dir / "ve.safetensors"))
-    ve.to(device).eval()
-
-    # T3 with Dhivehi vocab extension
-    t3_state = load_file(ckpt_dir / "t3_cfg.safetensors")
-    t3 = load_t3_with_vocab(t3_state, device=device, force_vocab_size=force_vocab_size)
-
-    # S3Gen
-    s3gen = S3Gen()
-    s3gen.load_state_dict(load_file(ckpt_dir / "s3gen.safetensors"), strict=False)
-    s3gen.to(device).eval()
-
-    # Tokenizer
-    tokenizer = EnTokenizer(str(ckpt_dir / "tokenizer.json"))
-
-    # Optional conditionals
-    conds = None
-    conds_path = ckpt_dir / "conds.pt"
-    if conds_path.exists():
-        # Always safe-load to CPU first; .to(device) later
-        conds = Conditionals.load(conds_path, map_location="cpu").to(device)
-
-    return cls(t3, s3gen, ve, tokenizer, device, conds=conds)
-
-
-def extend_dhivehi():
-    """
-    Attach Dhivehi-specific helpers to ChatterboxTTS (idempotent).
-    - ChatterboxTTS.load_t3_with_vocab (staticmethod)
-    - ChatterboxTTS.from_dhivehi (classmethod)
-    """
-    if getattr(ChatterboxTTS, "_dhivehi_extended", False):
-        return
-    ChatterboxTTS.load_t3_with_vocab = staticmethod(load_t3_with_vocab)
-    ChatterboxTTS.from_dhivehi = classmethod(from_dhivehi)
-    ChatterboxTTS._dhivehi_extended = True
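The vocabulary resize in `_expand_or_trim_rows` keeps the pretrained rows and draws any newly added rows from N(0, 0.02); the same rule is applied to `text_emb.weight`, `text_head.weight`, and (via unsqueeze/squeeze) `text_head.bias`. A quick runnable check of the expand path, mirroring the helper's tensor ops:

```python
import torch

old = torch.arange(12, dtype=torch.float32).reshape(3, 4)  # checkpoint vocab = 3, dim = 4
new_rows = 5                                               # target (extended) vocab = 5
out = old.new_empty((new_rows,) + old.shape[1:])
out[:3] = old
out[3:].normal_(mean=0.0, std=0.02)  # freshly initialized rows for the new tokens

assert torch.equal(out[:3], old)     # pretrained rows survive the resize
print(out.shape)                     # torch.Size([5, 4])
```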
requirements.txt CHANGED
@@ -1,4 +1,3 @@
-chatterbox-tts==0.1.4
 transformers==4.53.0
 librosa==0.11.0
 accelerate==1.8.1