"""
DungeonMaster AI - Voice Integration Models
Pydantic models for voice profiles, synthesis results, and status tracking.
"""
from __future__ import annotations

from datetime import datetime
from enum import Enum

from pydantic import BaseModel, Field

class VoiceType(str, Enum):
    """Voice profile types for different speakers."""
    DM = "dm"
    NPC_MALE_GRUFF = "npc_male_gruff"
    NPC_FEMALE_GENTLE = "npc_female_gentle"
    NPC_MYSTERIOUS = "npc_mysterious"
    MONSTER = "monster"


class VoiceCircuitState(str, Enum):
    """Circuit breaker states for voice service."""
    CLOSED = "closed"  # Normal operation, requests allowed
    OPEN = "open"  # Too many failures, requests rejected
    HALF_OPEN = "half_open"  # Testing if service recovered


class VoiceServiceState(str, Enum):
    """Overall voice service availability state."""
    AVAILABLE = "available"  # Fully functional
    DEGRADED = "degraded"  # Working but experiencing issues
    UNAVAILABLE = "unavailable"  # Not available (auth error, quota, etc.)


class VoiceModelType(str, Enum):
    """ElevenLabs model types for synthesis."""
    TURBO_V2 = "eleven_turbo_v2"
    TURBO_V2_5 = "eleven_turbo_v2_5"
    MULTILINGUAL_V2 = "eleven_multilingual_v2"
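
# The circuit breaker enums above are consumed by the voice service layer.
# Illustrative transition sketch (the threshold and the names
# `consecutive_failures`, `cooldown_elapsed`, and `probe_succeeded` are
# assumptions for this example, not values defined in this module):
#
#     # CLOSED -> OPEN after repeated failures (threshold assumed here)
#     if state == VoiceCircuitState.CLOSED and consecutive_failures >= 3:
#         state = VoiceCircuitState.OPEN
#     # OPEN -> HALF_OPEN once a cool-down period has elapsed
#     elif state == VoiceCircuitState.OPEN and cooldown_elapsed:
#         state = VoiceCircuitState.HALF_OPEN
#     # HALF_OPEN -> CLOSED when a probe request succeeds
#     elif state == VoiceCircuitState.HALF_OPEN and probe_succeeded:
#         state = VoiceCircuitState.CLOSED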
# =============================================================================
# Voice Configuration Models
# =============================================================================

class VoiceSynthesisSettings(BaseModel):
    """Settings for voice synthesis quality and style."""
    stability: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Voice stability (0.0-1.0). Lower values allow more variation.",
    )
    similarity_boost: float = Field(
        default=0.75,
        ge=0.0,
        le=1.0,
        description="How closely to match the original voice (0.0-1.0).",
    )
    style: float = Field(
        default=0.0,
        ge=0.0,
        le=1.0,
        description="Style exaggeration (0.0-1.0). Higher values are more expressive.",
    )
    use_speaker_boost: bool = Field(
        default=True,
        description="Boost similarity to the original speaker (may add latency).",
    )


class VoiceProfile(BaseModel):
    """Complete voice profile definition."""
    name: str = Field(description="Profile name identifier")
    voice_id: str = Field(description="ElevenLabs voice ID")
    description: str = Field(default="", description="Human-readable description")
    voice_type: VoiceType = Field(description="Type of voice profile")
    settings: VoiceSynthesisSettings = Field(
        default_factory=VoiceSynthesisSettings,
        description="Synthesis settings for this voice",
    )
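
# Illustrative profile definition (a sketch only; the voice_id below is a
# placeholder, not a real ElevenLabs voice ID):
#
#     DM_PROFILE = VoiceProfile(
#         name="dm_default",
#         voice_id="<elevenlabs-voice-id>",
#         description="Primary Dungeon Master narrator",
#         voice_type=VoiceType.DM,
#         settings=VoiceSynthesisSettings(stability=0.6, style=0.2),
#     )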
# =============================================================================
# Synthesis Request/Result Models
# =============================================================================

class SynthesisRequest(BaseModel):
    """Request for voice synthesis."""
    text: str = Field(description="Text to synthesize")
    voice_type: VoiceType = Field(
        default=VoiceType.DM,
        description="Voice profile type to use",
    )
    voice_profile_override: str | None = Field(
        default=None,
        description="Voice profile name to use instead of voice_type, if set",
    )
    stream: bool = Field(
        default=True,
        description="Stream audio chunks for real-time playback",
    )
    model: VoiceModelType = Field(
        default=VoiceModelType.TURBO_V2,
        description="ElevenLabs model to use",
    )
    output_format: str = Field(
        default="mp3_22050_32",
        description="Audio output format",
    )


class SynthesisResult(BaseModel):
    """Result of voice synthesis."""
    success: bool = Field(description="Whether synthesis succeeded")
    audio_bytes: bytes | None = Field(
        default=None,
        description="Synthesized audio data",
    )
    duration_ms: int | None = Field(
        default=None,
        description="Audio duration in milliseconds",
    )
    voice_type: VoiceType = Field(
        default=VoiceType.DM,
        description="Voice type used",
    )
    voice_id: str = Field(
        default="",
        description="ElevenLabs voice ID used",
    )
    text_length: int = Field(
        default=0,
        description="Number of characters in the synthesized text",
    )
    model_used: str = Field(
        default="",
        description="ElevenLabs model used",
    )
    from_cache: bool = Field(
        default=False,
        description="Whether the result came from cache",
    )
    error_message: str | None = Field(
        default=None,
        description="Error message if synthesis failed",
    )
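
# Illustrative request/result pair (a sketch only; the error text is made up):
#
#     request = SynthesisRequest(text="You enter a torch-lit hall.", stream=False)
#     result = SynthesisResult(
#         success=False,
#         voice_type=request.voice_type,
#         text_length=len(request.text),
#         model_used=request.model.value,
#         error_message="quota exceeded",
#     )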
# =============================================================================
# Text Processing Models
# =============================================================================

class TextSegment(BaseModel):
    """A segment of text with assigned voice."""
    text: str = Field(description="Text content of this segment")
    voice_type: VoiceType = Field(
        default=VoiceType.DM,
        description="Voice type to use for this segment",
    )
    is_dialogue: bool = Field(
        default=False,
        description="Whether this is quoted dialogue",
    )
    speaker_name: str | None = Field(
        default=None,
        description="Name of the speaker if known",
    )
    pause_before_ms: int = Field(
        default=0,
        description="Pause duration before this segment in ms",
    )
    pause_after_ms: int = Field(
        default=0,
        description="Pause duration after this segment in ms",
    )


class ProcessedNarration(BaseModel):
    """Fully processed narration ready for synthesis."""
    segments: list[TextSegment] = Field(
        default_factory=list,
        description="List of text segments with voice assignments",
    )
    total_text: str = Field(
        default="",
        description="Complete processed text",
    )
    primary_voice: VoiceType = Field(
        default=VoiceType.DM,
        description="Primary voice type used",
    )
    has_dialogue: bool = Field(
        default=False,
        description="Whether narration contains dialogue",
    )
    estimated_duration_ms: int = Field(
        default=0,
        description="Estimated audio duration in ms",
    )
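
# Illustrative segmentation of mixed narration and dialogue (a sketch only;
# the actual splitting rules live in the text-processing layer, and the
# speaker name is invented for the example):
#
#     narration = ProcessedNarration(
#         segments=[
#             TextSegment(text="The innkeeper leans closer."),
#             TextSegment(
#                 text='"Keep your voice down,"',
#                 voice_type=VoiceType.NPC_FEMALE_GENTLE,
#                 is_dialogue=True,
#                 speaker_name="Mara",
#                 pause_before_ms=200,
#             ),
#         ],
#         total_text='The innkeeper leans closer. "Keep your voice down,"',
#         has_dialogue=True,
#     )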
# =============================================================================
# Service Status Models
# =============================================================================

class VoiceServiceStatus(BaseModel):
    """Status information for voice service."""
    state: VoiceServiceState = Field(
        default=VoiceServiceState.UNAVAILABLE,
        description="Overall service state",
    )
    circuit_state: VoiceCircuitState = Field(
        default=VoiceCircuitState.CLOSED,
        description="Circuit breaker state",
    )
    is_available: bool = Field(
        default=False,
        description="Whether voice service is available for use",
    )
    is_initialized: bool = Field(
        default=False,
        description="Whether client has been initialized",
    )
    last_successful_call: datetime | None = Field(
        default=None,
        description="When the last successful synthesis occurred",
    )
    consecutive_failures: int = Field(
        default=0,
        description="Number of consecutive synthesis failures",
    )
    cache_size: int = Field(
        default=0,
        description="Number of cached audio entries",
    )
    cache_hit_rate: float = Field(
        default=0.0,
        description="Cache hit rate (0.0-1.0)",
    )
    error_message: str | None = Field(
        default=None,
        description="Last error message if any",
    )
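
# Illustrative status snapshot for a service that is recovering (a sketch
# only; all numbers are example values):
#
#     status = VoiceServiceStatus(
#         state=VoiceServiceState.DEGRADED,
#         circuit_state=VoiceCircuitState.HALF_OPEN,
#         is_available=True,
#         is_initialized=True,
#         consecutive_failures=2,
#         cache_size=48,
#         cache_hit_rate=0.37,
#     )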
# =============================================================================
# Narration Result Model (for VoiceNarratorAgent in Phase 3)
# =============================================================================

class NarrationResult(BaseModel):
    """Result from voice narration including audio and metadata."""
    success: bool = Field(description="Whether narration succeeded")
    audio: bytes | None = Field(
        default=None,
        description="Synthesized audio data",
    )
    format: str = Field(
        default="mp3",
        description="Audio format",
    )
    voice_used: str = Field(
        default="dm",
        description="Voice profile name used",
    )
    voice_type: VoiceType = Field(
        default=VoiceType.DM,
        description="Voice type used",
    )
    text_narrated: str = Field(
        default="",
        description="Original text that was narrated",
    )
    text_processed: str = Field(
        default="",
        description="Processed text after TTS preprocessing",
    )
    duration_ms: int = Field(
        default=0,
        description="Audio duration in milliseconds",
    )
    is_streaming: bool = Field(
        default=False,
        description="Whether this is a streaming result",
    )
    from_cache: bool = Field(
        default=False,
        description="Whether audio came from cache",
    )
    error_message: str | None = Field(
        default=None,
        description="Error message if narration failed",
    )
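
# Illustrative successful narration (a sketch only; the audio payload and
# duration are placeholder values):
#
#     narration_result = NarrationResult(
#         success=True,
#         audio=b"...mp3 bytes...",
#         voice_used="dm_default",
#         voice_type=VoiceType.DM,
#         text_narrated="You enter a torch-lit hall.",
#         text_processed="You enter a torch-lit hall.",
#         duration_ms=2400,
#         from_cache=True,
#     )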