|
|
""" |
|
|
DungeonMaster AI - Voice Integration Models |
|
|
|
|
|
Pydantic models for voice profiles, synthesis results, and status tracking. |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from datetime import datetime |
|
|
from enum import Enum |
|
|
|
|
|
from pydantic import BaseModel, Field |
|
|
|
|
|
|
|
|
class VoiceType(str, Enum):
    """Speaker categories available for narration.

    Derives from ``str`` so members serialize as plain strings in the
    surrounding Pydantic models.
    """

    DM = "dm"  # narrator / dungeon-master voice
    NPC_MALE_GRUFF = "npc_male_gruff"  # rough-sounding male NPC
    NPC_FEMALE_GENTLE = "npc_female_gentle"  # soft-spoken female NPC
    NPC_MYSTERIOUS = "npc_mysterious"  # enigmatic NPC
    MONSTER = "monster"  # creature / monster voice
|
|
|
|
|
|
|
|
class VoiceCircuitState(str, Enum):
    """States of the circuit breaker protecting the voice service.

    Follows the conventional breaker lifecycle:
    CLOSED (normal) -> OPEN (failing fast) -> HALF_OPEN (probing recovery).
    """

    CLOSED = "closed"  # calls pass through normally
    OPEN = "open"  # calls are short-circuited
    HALF_OPEN = "half_open"  # limited trial calls permitted
|
|
|
|
|
|
|
|
class VoiceServiceState(str, Enum):
    """Coarse availability levels reported for the voice service."""

    AVAILABLE = "available"  # fully operational
    DEGRADED = "degraded"  # partially working
    UNAVAILABLE = "unavailable"  # not usable
|
|
|
|
|
|
|
|
class VoiceModelType(str, Enum):
    """Identifiers of the ElevenLabs models this app can request."""

    TURBO_V2 = "eleven_turbo_v2"  # turbo model, v2
    TURBO_V2_5 = "eleven_turbo_v2_5"  # turbo model, v2.5 revision
    MULTILINGUAL_V2 = "eleven_multilingual_v2"  # multilingual model, v2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VoiceSynthesisSettings(BaseModel):
    """Settings for voice synthesis quality and style."""

    # Every float knob is validated to stay within [0.0, 1.0].
    stability: float = Field(
        default=0.5, ge=0.0, le=1.0,
        description="Voice stability (0.0-1.0). Lower = more variation.",
    )
    similarity_boost: float = Field(
        default=0.75, ge=0.0, le=1.0,
        description="How closely to match the original voice (0.0-1.0).",
    )
    style: float = Field(
        default=0.0, ge=0.0, le=1.0,
        description="Style exaggeration (0.0-1.0). Higher = more expressive.",
    )
    use_speaker_boost: bool = Field(
        default=True, description="Boost voice clarity and reduce background noise.",
    )
|
|
|
|
|
|
|
|
class VoiceProfile(BaseModel):
    """Complete voice profile definition."""

    # Identity: local profile name plus the upstream voice it maps to.
    name: str = Field(
        description="Profile name identifier",
    )
    voice_id: str = Field(
        description="ElevenLabs voice ID",
    )
    description: str = Field(
        default="",
        description="Human-readable description",
    )
    voice_type: VoiceType = Field(
        description="Type of voice profile",
    )
    # default_factory gives each profile its own settings instance.
    settings: VoiceSynthesisSettings = Field(
        default_factory=VoiceSynthesisSettings, description="Synthesis settings for this voice",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class SynthesisRequest(BaseModel):
    """Request for voice synthesis."""

    text: str = Field(description="Text to synthesize")
    # Voice selection: an explicit profile override takes priority over voice_type.
    voice_type: VoiceType = Field(
        default=VoiceType.DM, description="Voice profile type to use"
    )
    voice_profile_override: str | None = Field(
        default=None, description="Override voice profile name (ignores voice_type)"
    )
    stream: bool = Field(
        default=True, description="Stream audio chunks for real-time playback"
    )
    model: VoiceModelType = Field(
        default=VoiceModelType.TURBO_V2, description="ElevenLabs model to use"
    )
    output_format: str = Field(
        default="mp3_22050_32", description="Audio output format"
    )
|
|
|
|
|
|
|
|
class SynthesisResult(BaseModel):
    """Result of voice synthesis."""

    success: bool = Field(description="Whether synthesis succeeded")
    audio_bytes: bytes | None = Field(
        default=None, description="Synthesized audio data"
    )
    duration_ms: int | None = Field(
        default=None, description="Audio duration in milliseconds"
    )
    # Metadata describing what was used to produce the audio.
    voice_type: VoiceType = Field(default=VoiceType.DM, description="Voice type used")
    voice_id: str = Field(default="", description="ElevenLabs voice ID used")
    text_length: int = Field(default=0, description="Length of synthesized text")
    model_used: str = Field(default="", description="ElevenLabs model used")
    from_cache: bool = Field(
        default=False, description="Whether result came from cache"
    )
    error_message: str | None = Field(
        default=None, description="Error message if synthesis failed"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TextSegment(BaseModel):
    """A segment of text with assigned voice."""

    text: str = Field(description="Text content of this segment")
    voice_type: VoiceType = Field(
        default=VoiceType.DM, description="Voice type to use for this segment"
    )
    is_dialogue: bool = Field(
        default=False, description="Whether this is quoted dialogue"
    )
    speaker_name: str | None = Field(
        default=None, description="Name of the speaker if known"
    )
    # Optional silence around the segment, expressed in milliseconds.
    pause_before_ms: int = Field(
        default=0, description="Pause duration before this segment in ms"
    )
    pause_after_ms: int = Field(
        default=0, description="Pause duration after this segment in ms"
    )
|
|
|
|
|
|
|
|
class ProcessedNarration(BaseModel):
    """Fully processed narration ready for synthesis."""

    # default_factory avoids a shared mutable default across instances.
    segments: list[TextSegment] = Field(
        default_factory=list, description="List of text segments with voice assignments"
    )
    total_text: str = Field(default="", description="Complete processed text")
    primary_voice: VoiceType = Field(
        default=VoiceType.DM, description="Primary voice type used"
    )
    has_dialogue: bool = Field(
        default=False, description="Whether narration contains dialogue"
    )
    estimated_duration_ms: int = Field(
        default=0, description="Estimated audio duration in ms"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VoiceServiceStatus(BaseModel):
    """Status information for voice service."""

    # Overall/derived availability of the service.
    state: VoiceServiceState = Field(
        default=VoiceServiceState.UNAVAILABLE,
        description="Overall service state",
    )
    circuit_state: VoiceCircuitState = Field(
        default=VoiceCircuitState.CLOSED,
        description="Circuit breaker state",
    )
    is_available: bool = Field(
        default=False,
        description="Whether voice service is available for use",
    )
    is_initialized: bool = Field(
        default=False,
        description="Whether client has been initialized",
    )
    last_successful_call: datetime | None = Field(
        default=None,
        description="When the last successful synthesis occurred",
    )
    # Counts can never be negative; enforce that at validation time.
    consecutive_failures: int = Field(
        default=0,
        ge=0,
        description="Number of consecutive synthesis failures",
    )
    cache_size: int = Field(
        default=0,
        ge=0,
        description="Number of cached audio entries",
    )
    # Fix: the description documents a 0.0-1.0 range but the field did not
    # enforce it; add the same ge/le bounds used by VoiceSynthesisSettings.
    cache_hit_rate: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Cache hit rate (0.0-1.0)",
    )
    error_message: str | None = Field(
        default=None,
        description="Last error message if any",
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class NarrationResult(BaseModel):
    """Result from voice narration including audio and metadata."""

    success: bool = Field(description="Whether narration succeeded")
    audio: bytes | None = Field(default=None, description="Synthesized audio data")
    format: str = Field(default="mp3", description="Audio format")
    voice_used: str = Field(default="dm", description="Voice profile name used")
    voice_type: VoiceType = Field(default=VoiceType.DM, description="Voice type used")
    # Both the original and the TTS-preprocessed text are retained.
    text_narrated: str = Field(
        default="", description="Original text that was narrated"
    )
    text_processed: str = Field(
        default="", description="Processed text after TTS preprocessing"
    )
    duration_ms: int = Field(
        default=0, description="Audio duration in milliseconds"
    )
    is_streaming: bool = Field(
        default=False, description="Whether this is a streaming result"
    )
    from_cache: bool = Field(
        default=False, description="Whether audio came from cache"
    )
    error_message: str | None = Field(
        default=None, description="Error message if narration failed"
    )
|
|
|