Gapeleon committed
Commit f60a5bb · 1 Parent(s): 8c00aff

add modules

kanitts/__init__.py ADDED
@@ -0,0 +1,24 @@
+ """NanoCodec TTS System - A modular text-to-speech system."""
+
+ from .config import Config, AudioConfig, ModelConfig
+ from .tokens import TokenRegistry
+ from .audio import NemoAudioPlayer, AudioProcessor, NemoAudioProcessor
+ from .models import KaniModel, InputProcessor, ModelInference
+ from .extractors import AudioCodeExtractor, TextExtractor
+ from .factory import TTSFactory
+
+ __all__ = [
+     'Config',
+     'AudioConfig',
+     'ModelConfig',
+     'TokenRegistry',
+     'NemoAudioPlayer',
+     'AudioProcessor',
+     'NemoAudioProcessor',
+     'KaniModel',
+     'InputProcessor',
+     'ModelInference',
+     'AudioCodeExtractor',
+     'TextExtractor',
+     'TTSFactory',
+ ]
kanitts/audio.py ADDED
@@ -0,0 +1,82 @@
+ """Audio processing components for the TTS system."""
+
+ import torch
+ import logging
+ from abc import ABC, abstractmethod
+ from typing import Tuple, Optional
+ from nemo.collections.tts.models import AudioCodecModel
+ from transformers import AutoTokenizer
+ from .config import Config, AudioConfig
+ from .extractors import AudioCodeExtractor, TextExtractor
+
+ from nemo.utils.nemo_logging import Logger
+
+ nemo_logger = Logger()
+ nemo_logger.remove_stream_handlers()
+
+ logger = logging.getLogger(__name__)
+
+
+ class AudioProcessor(ABC):
+     """Abstract base class for audio processing strategies."""
+
+     @abstractmethod
+     def decode_audio(self, audio_codes: torch.Tensor, length: torch.Tensor) -> torch.Tensor:
+         pass
+
+
+ class NemoAudioProcessor(AudioProcessor):
+     """NeMo-based audio processing implementation."""
+
+     def __init__(self, config: AudioConfig):
+         self.config = config
+         self.device = config.device or ('cuda' if torch.cuda.is_available() else 'cpu')
+         self._model = None
+
+     @property
+     def model(self):
+         if self._model is None:
+             logger.info(f"Loading NeMo codec model: {self.config.nemo_model_name}")
+             self._model = AudioCodecModel.from_pretrained(self.config.nemo_model_name).eval()
+             self._model.to(self.device)
+         return self._model
+
+     def decode_audio(self, audio_codes: torch.Tensor, length: torch.Tensor) -> torch.Tensor:
+         audio_codes, length = audio_codes.to(self.device), length.to(self.device)
+         with torch.inference_mode():
+             reconstructed_audio, _ = self.model.decode(tokens=audio_codes, tokens_len=length)
+         return reconstructed_audio.cpu().detach().numpy().squeeze()
+
+
+ class NemoAudioPlayer:
+     """Orchestrates audio generation from token sequences."""
+
+     def __init__(self, config: Config, text_tokenizer_name: Optional[str] = None):
+         self.config = config
+         self.tokens = config.tokens
+         self.audio_processor = NemoAudioProcessor(config.audio)
+         self.code_extractor = AudioCodeExtractor(config.tokens)
+
+         self.text_extractor = None
+         if text_tokenizer_name:
+             tokenizer = AutoTokenizer.from_pretrained(text_tokenizer_name)
+             self.text_extractor = TextExtractor(config.tokens, tokenizer)
+
+     def get_waveform(self, out_ids: torch.Tensor) -> Tuple[torch.Tensor, Optional[str]]:
+         """Generate waveform from model output tokens."""
+         try:
+             out_ids = out_ids.flatten()
+             self.code_extractor.validate_output(out_ids)
+             audio_codes, length = self.code_extractor.extract_audio_codes(out_ids)
+
+             output_audio = self.audio_processor.decode_audio(audio_codes, length)
+
+             text = None
+             if self.text_extractor:
+                 text = self.text_extractor.extract_text(out_ids)
+
+             return output_audio, text
+
+         except Exception as e:
+             logger.error(f"Error generating waveform: {e}")
+             raise
kanitts/config.py ADDED
@@ -0,0 +1,41 @@
+ """Configuration classes for the TTS system."""
+
+ from dataclasses import dataclass
+ from typing import Optional
+ from .tokens import TokenRegistry
+
+
+ @dataclass
+ class AudioConfig:
+     """Configuration for audio processing."""
+     nemo_model_name: str = "nvidia/nemo-nano-codec-22khz-0.6kbps-12.5fps"
+     sample_rate: int = 22050
+     device: Optional[str] = None
+
+
+ @dataclass
+ class ModelConfig:
+     """Configuration for language model."""
+     model_name: str = 'nineninesix/kani-tts-450m-0.1-pt'
+     device_map: str = "auto"
+     torch_dtype: str = "bfloat16"
+     max_new_tokens: int = 1200
+     temperature: float = 0.6
+     top_p: float = 0.95
+     repetition_penalty: float = 1.1
+
+
+ @dataclass
+ class Config:
+     """Main configuration container."""
+     model: ModelConfig
+     audio: AudioConfig
+     tokens: TokenRegistry
+
+     @classmethod
+     def default(cls) -> 'Config':
+         return cls(
+             model=ModelConfig(),
+             audio=AudioConfig(),
+             tokens=TokenRegistry()
+         )
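
For reference, a minimal sketch (not part of this commit) of overriding the defaults above instead of calling `Config.default()`; the package name `kanitts` follows the module paths in this commit, and the chosen values are purely illustrative:

```python
from kanitts import AudioConfig, Config, ModelConfig, TokenRegistry

# Illustrative overrides only; any field left out keeps the defaults from config.py.
config = Config(
    model=ModelConfig(temperature=0.8, max_new_tokens=800),
    audio=AudioConfig(device="cpu"),   # force CPU decoding of the codec
    tokens=TokenRegistry(),            # default special-token layout
)
```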
kanitts/extractors.py ADDED
@@ -0,0 +1,66 @@
+ """Extractors for processing audio and text from token sequences."""
+
+ import torch
+ from typing import Tuple, Optional
+ from .tokens import TokenRegistry
+
+
+ class AudioCodeExtractor:
+     """Handles extraction and validation of audio codes from token sequences."""
+
+     def __init__(self, token_registry: TokenRegistry):
+         self.tokens = token_registry
+
+     def validate_output(self, out_ids: torch.Tensor) -> None:
+         """Validate that required speech tokens are present."""
+         start_present = self.tokens.start_of_speech in out_ids
+         end_present = self.tokens.end_of_speech in out_ids
+
+         if not (start_present and end_present):
+             raise ValueError('Special speech tokens not found in output')
+
+     def extract_audio_codes(self, out_ids: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Extract and process audio codes from token sequence."""
+         try:
+             start_idx = (out_ids == self.tokens.start_of_speech).nonzero(as_tuple=True)[0][0].item()
+             end_idx = (out_ids == self.tokens.end_of_speech).nonzero(as_tuple=True)[0][0].item()
+         except IndexError:
+             raise ValueError('Speech tokens not found in sequence')
+
+         if start_idx >= end_idx:
+             raise ValueError('Invalid audio codes sequence - start token after end token')
+
+         audio_codes = out_ids[start_idx + 1:end_idx]
+
+         if len(audio_codes) % 4 != 0:
+             raise ValueError('Audio codes length must be a multiple of 4')
+
+         audio_codes = audio_codes.reshape(-1, 4)
+         audio_codes = audio_codes - torch.tensor([self.tokens.codebook_size * i for i in range(4)])
+         audio_codes = audio_codes - self.tokens.audio_tokens_start
+
+         if (audio_codes < 0).sum().item() > 0:
+             raise ValueError('Invalid audio tokens detected')
+
+         audio_codes = audio_codes.T.unsqueeze(0)
+         length = torch.tensor([audio_codes.shape[-1]])
+
+         return audio_codes, length
+
+
+ class TextExtractor:
+     """Handles text extraction from token sequences."""
+
+     def __init__(self, token_registry: TokenRegistry, tokenizer):
+         self.tokens = token_registry
+         self.tokenizer = tokenizer
+
+     def extract_text(self, out_ids: torch.Tensor) -> Optional[str]:
+         """Extract text from token sequence."""
+         try:
+             start_idx = (out_ids == self.tokens.start_of_text).nonzero(as_tuple=True)[0][0].item()
+             end_idx = (out_ids == self.tokens.end_of_text).nonzero(as_tuple=True)[0][0].item()
+             text_tokens = out_ids[start_idx:end_idx + 1]
+             return self.tokenizer.decode(text_tokens, skip_special_tokens=True)
+         except (IndexError, AttributeError):
+             return None
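
As context for `extract_audio_codes`, here is a small sketch (not part of this commit) of the frame arithmetic it inverts, assuming the packing convention implied by the offsets in `TokenRegistry`: a code `c` from codebook `i` (0-3) is stored in the LM vocabulary as `audio_tokens_start + i * codebook_size + c`.

```python
import torch

from kanitts.tokens import TokenRegistry

tokens = TokenRegistry()               # audio_tokens_start = 64410, codebook_size = 4032
frame = torch.tensor([5, 17, 100, 3])  # one 4-codebook frame of codec codes (illustrative values)

# Pack the frame into LM token ids using the assumed convention above.
packed = frame + tokens.audio_tokens_start + tokens.codebook_size * torch.arange(4)

# extract_audio_codes reverses this: subtract the per-codebook offset, then the base offset.
unpacked = packed - torch.tensor([tokens.codebook_size * i for i in range(4)]) - tokens.audio_tokens_start
assert torch.equal(unpacked, frame)
```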
kanitts/factory.py ADDED
@@ -0,0 +1,21 @@
+ """Factory for creating TTS system components."""
+
+ from typing import Optional, Tuple
+ from .config import Config
+ from .audio import NemoAudioPlayer
+ from .models import KaniModel
+
+
+ class TTSFactory:
+     """Factory for creating TTS system components."""
+
+     @staticmethod
+     def create_system(config: Optional[Config] = None) -> Tuple[KaniModel, NemoAudioPlayer]:
+         """Create a complete TTS system."""
+         if config is None:
+             config = Config.default()
+
+         player = NemoAudioPlayer(config)
+         model = KaniModel(config, player)
+
+         return model, player
kanitts/models.py ADDED
@@ -0,0 +1,92 @@
+ """Model inference components for the TTS system."""
+
+ import torch
+ import logging
+ from typing import Tuple
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from .config import Config, ModelConfig
+ from .tokens import TokenRegistry
+ from .audio import NemoAudioPlayer
+
+ logger = logging.getLogger(__name__)
+
+
+ class InputProcessor:
+     """Handles input text processing and tokenization."""
+
+     def __init__(self, tokenizer, token_registry: TokenRegistry):
+         self.tokenizer = tokenizer
+         self.tokens = token_registry
+
+     def prepare_input(self, text: str) -> Tuple[torch.Tensor, torch.Tensor]:
+         """Prepare input text for model inference."""
+         input_ids = self.tokenizer(text, return_tensors="pt").input_ids
+
+         start_token = torch.tensor([[self.tokens.start_of_human]], dtype=torch.int64)
+         end_tokens = torch.tensor([[self.tokens.end_of_text, self.tokens.end_of_human]], dtype=torch.int64)
+
+         modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1)
+         attention_mask = torch.ones(1, modified_input_ids.shape[1], dtype=torch.int64)
+
+         return modified_input_ids, attention_mask
+
+
+ class ModelInference:
+     """Handles model inference operations."""
+
+     def __init__(self, model, config: ModelConfig, token_registry: TokenRegistry):
+         self.model = model
+         self.config = config
+         self.tokens = token_registry
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+
+     def generate(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+         """Generate tokens from input."""
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
+
+         with torch.no_grad():
+             generated_ids = self.model.generate(
+                 input_ids=input_ids,
+                 attention_mask=attention_mask,
+                 max_new_tokens=self.config.max_new_tokens,
+                 do_sample=True,
+                 temperature=self.config.temperature,
+                 top_p=self.config.top_p,
+                 repetition_penalty=self.config.repetition_penalty,
+                 num_return_sequences=1,
+                 eos_token_id=self.tokens.end_of_speech,
+             )
+         return generated_ids.to('cpu')
+
+
+ class KaniModel:
+     """Main text-to-speech model orchestrator."""
+
+     def __init__(self, config: Config, player: NemoAudioPlayer):
+         self.config = config
+         self.player = player
+
+         logger.info(f"Loading model: {config.model.model_name}")
+         torch_dtype = getattr(torch, config.model.torch_dtype)
+         self.model = AutoModelForCausalLM.from_pretrained(
+             config.model.model_name,
+             torch_dtype=torch_dtype,
+             device_map=config.model.device_map,
+         )
+
+         self.tokenizer = AutoTokenizer.from_pretrained(config.model.model_name)
+         self.input_processor = InputProcessor(self.tokenizer, config.tokens)
+         self.inference = ModelInference(self.model, config.model, config.tokens)
+
+     def run_model(self, text: str) -> Tuple[torch.Tensor, str]:
+         """Generate audio from input text."""
+         try:
+             logger.info(f"Processing text: {text[:50]}...")
+             input_ids, attention_mask = self.input_processor.prepare_input(text)
+             model_output = self.inference.generate(input_ids, attention_mask)
+             audio, _ = self.player.get_waveform(model_output)
+             return audio, text
+         except Exception as e:
+             logger.error(f"Error in model execution: {e}")
+             raise
kanitts/tokens.py ADDED
@@ -0,0 +1,19 @@
+ """Token registry for managing special tokens in the TTS system."""
+
+
+ class TokenRegistry:
+     """Centralized token management for audio codec operations."""
+
+     def __init__(self, tokenizer_length: int = 64400):
+         self.tokenizer_length = tokenizer_length
+         self.start_of_text = 1
+         self.end_of_text = 2
+         self.start_of_speech = tokenizer_length + 1
+         self.end_of_speech = tokenizer_length + 2
+         self.start_of_human = tokenizer_length + 3
+         self.end_of_human = tokenizer_length + 4
+         self.start_of_ai = tokenizer_length + 5
+         self.end_of_ai = tokenizer_length + 6
+         self.pad_token = tokenizer_length + 7
+         self.audio_tokens_start = tokenizer_length + 10
+         self.codebook_size = 4032
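
Taken together, a minimal end-to-end sketch (not part of this commit) of how the added modules are meant to be wired up. It assumes `nemo_toolkit`, `transformers`, and `soundfile` are installed and that the checkpoints named in config.py can be downloaded; the input string and output path are illustrative only.

```python
import soundfile as sf

from kanitts import Config, TTSFactory

config = Config.default()
model, player = TTSFactory.create_system(config)       # loads the LM and wires in the codec-backed player

audio, text = model.run_model("Hello from Kani TTS.")   # numpy waveform plus the input text
sf.write("kani_tts_sample.wav", audio, config.audio.sample_rate)
```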