"""
DungeonMaster AI - Voice Integration Models

Pydantic models for voice profiles, synthesis results, and status tracking.
"""

from __future__ import annotations

# NOTE: ``datetime`` is referenced only inside an annotation, but the
# annotations are lazy strings here, so the name has to stay importable
# at module level for the model machinery to resolve it.
from datetime import datetime
from enum import Enum

from pydantic import BaseModel, Field


# =============================================================================
# Enumerations
# =============================================================================


class VoiceType(str, Enum):
    """Voice profile types for different speakers."""

    DM = "dm"
    NPC_MALE_GRUFF = "npc_male_gruff"
    NPC_FEMALE_GENTLE = "npc_female_gentle"
    NPC_MYSTERIOUS = "npc_mysterious"
    MONSTER = "monster"


class VoiceCircuitState(str, Enum):
    """Circuit breaker states for voice service."""

    # Normal operation: requests are allowed through.
    CLOSED = "closed"
    # Too many failures: requests are rejected.
    OPEN = "open"
    # Probing whether the service has recovered.
    HALF_OPEN = "half_open"


class VoiceServiceState(str, Enum):
    """Overall voice service availability state."""

    # Fully functional.
    AVAILABLE = "available"
    # Working, but experiencing issues.
    DEGRADED = "degraded"
    # Not available (auth error, quota, etc.).
    UNAVAILABLE = "unavailable"


class VoiceModelType(str, Enum):
    """ElevenLabs model types for synthesis."""

    TURBO_V2 = "eleven_turbo_v2"
    TURBO_V2_5 = "eleven_turbo_v2_5"
    MULTILINGUAL_V2 = "eleven_multilingual_v2"


# =============================================================================
# Voice Configuration Models
# =============================================================================


class VoiceSynthesisSettings(BaseModel):
    """Settings for voice synthesis quality and style."""

    # The three float knobs below are each validated to lie in [0.0, 1.0].
    stability: float = Field(
        0.5,
        ge=0.0,
        le=1.0,
        description="Voice stability (0.0-1.0). Lower = more variation.",
    )
    similarity_boost: float = Field(
        0.75,
        ge=0.0,
        le=1.0,
        description="How closely to match the original voice (0.0-1.0).",
    )
    style: float = Field(
        0.0,
        ge=0.0,
        le=1.0,
        description="Style exaggeration (0.0-1.0). Higher = more expressive.",
    )
    use_speaker_boost: bool = Field(
        True,
        description="Boost voice clarity and reduce background noise.",
    )


class VoiceProfile(BaseModel):
    """Complete voice profile definition."""

    # Required: name, voice_id and voice_type have no default.
    name: str = Field(..., description="Profile name identifier")
    voice_id: str = Field(..., description="ElevenLabs voice ID")
    description: str = Field("", description="Human-readable description")
    voice_type: VoiceType = Field(..., description="Type of voice profile")
    # A fresh settings object (all defaults) is built per profile when
    # none is supplied.
    settings: VoiceSynthesisSettings = Field(
        default_factory=VoiceSynthesisSettings,
        description="Synthesis settings for this voice",
    )


# =============================================================================
# Synthesis Request/Result Models
# =============================================================================


class SynthesisRequest(BaseModel):
    """Request for voice synthesis."""

    # Only ``text`` is required; everything else falls back to a default.
    text: str = Field(..., description="Text to synthesize")
    voice_type: VoiceType = Field(
        VoiceType.DM,
        description="Voice profile type to use",
    )
    voice_profile_override: str | None = Field(
        None,
        description="Override voice profile name (ignores voice_type)",
    )
    stream: bool = Field(
        True,
        description="Stream audio chunks for real-time playback",
    )
    model: VoiceModelType = Field(
        VoiceModelType.TURBO_V2,
        description="ElevenLabs model to use",
    )
    output_format: str = Field(
        "mp3_22050_32",
        description="Audio output format",
    )


class SynthesisResult(BaseModel):
    """Result of voice synthesis."""

    # Only ``success`` is required; the remaining fields are metadata
    # with neutral defaults.
    success: bool = Field(..., description="Whether synthesis succeeded")
    audio_bytes: bytes | None = Field(
        None,
        description="Synthesized audio data",
    )
    duration_ms: int | None = Field(
        None,
        description="Audio duration in milliseconds",
    )
    voice_type: VoiceType = Field(
        VoiceType.DM,
        description="Voice type used",
    )
    voice_id: str = Field(
        "",
        description="ElevenLabs voice ID used",
    )
    text_length: int = Field(
        0,
        description="Length of synthesized text",
    )
    model_used: str = Field(
        "",
        description="ElevenLabs model used",
    )
    from_cache: bool = Field(
        False,
        description="Whether result came from cache",
    )
    error_message: str | None = Field(
        None,
        description="Error message if synthesis failed",
    )


# =============================================================================
# Text Processing Models
# =============================================================================


class TextSegment(BaseModel):
    """A segment of text with assigned voice."""

    text: str = Field(..., description="Text content of this segment")
    voice_type: VoiceType = Field(
        VoiceType.DM,
        description="Voice type to use for this segment",
    )
    is_dialogue: bool = Field(
        False,
        description="Whether this is quoted dialogue",
    )
    speaker_name: str | None = Field(
        None,
        description="Name of the speaker if known",
    )
    # Pauses are expressed in milliseconds and default to no pause.
    pause_before_ms: int = Field(
        0,
        description="Pause duration before this segment in ms",
    )
    pause_after_ms: int = Field(
        0,
        description="Pause duration after this segment in ms",
    )


class ProcessedNarration(BaseModel):
    """Fully processed narration ready for synthesis."""

    # ``default_factory`` gives each instance its own empty list.
    segments: list[TextSegment] = Field(
        default_factory=list,
        description="List of text segments with voice assignments",
    )
    total_text: str = Field(
        "",
        description="Complete processed text",
    )
    primary_voice: VoiceType = Field(
        VoiceType.DM,
        description="Primary voice type used",
    )
    has_dialogue: bool = Field(
        False,
        description="Whether narration contains dialogue",
    )
    estimated_duration_ms: int = Field(
        0,
        description="Estimated audio duration in ms",
    )


# =============================================================================
# Service Status Models
# =============================================================================


class VoiceServiceStatus(BaseModel):
    """Status information for voice service."""

    # A default-constructed status reports UNAVAILABLE, not available and
    # not initialized, with the circuit state CLOSED.
    state: VoiceServiceState = Field(
        VoiceServiceState.UNAVAILABLE,
        description="Overall service state",
    )
    circuit_state: VoiceCircuitState = Field(
        VoiceCircuitState.CLOSED,
        description="Circuit breaker state",
    )
    is_available: bool = Field(
        False,
        description="Whether voice service is available for use",
    )
    is_initialized: bool = Field(
        False,
        description="Whether client has been initialized",
    )
    last_successful_call: datetime | None = Field(
        None,
        description="When the last successful synthesis occurred",
    )
    consecutive_failures: int = Field(
        0,
        description="Number of consecutive synthesis failures",
    )
    cache_size: int = Field(
        0,
        description="Number of cached audio entries",
    )
    # NOTE(review): the 0.0-1.0 range is documented only; unlike the
    # synthesis settings, no ge/le bound is enforced on this field.
    cache_hit_rate: float = Field(
        0.0,
        description="Cache hit rate (0.0-1.0)",
    )
    error_message: str | None = Field(
        None,
        description="Last error message if any",
    )


# =============================================================================
# Narration Result Model (for VoiceNarratorAgent in Phase 3)
# =============================================================================


class NarrationResult(BaseModel):
    """Result from voice narration including audio and metadata."""

    # Only ``success`` is required.
    success: bool = Field(..., description="Whether narration succeeded")
    audio: bytes | None = Field(
        None,
        description="Synthesized audio data",
    )
    format: str = Field(
        "mp3",
        description="Audio format",
    )
    voice_used: str = Field(
        "dm",
        description="Voice profile name used",
    )
    voice_type: VoiceType = Field(
        VoiceType.DM,
        description="Voice type used",
    )
    text_narrated: str = Field(
        "",
        description="Original text that was narrated",
    )
    text_processed: str = Field(
        "",
        description="Processed text after TTS preprocessing",
    )
    # Unlike SynthesisResult.duration_ms, this one is a plain int that
    # defaults to 0 rather than None.
    duration_ms: int = Field(
        0,
        description="Audio duration in milliseconds",
    )
    is_streaming: bool = Field(
        False,
        description="Whether this is a streaming result",
    )
    from_cache: bool = Field(
        False,
        description="Whether audio came from cache",
    )
    error_message: str | None = Field(
        None,
        description="Error message if narration failed",
    )