""" DungeonMaster AI - Text Processor for Voice Synthesis Preprocesses text for optimal TTS synthesis, including abbreviation expansion, dice notation conversion, dialogue detection, and dramatic pause insertion. """ from __future__ import annotations import logging import re from typing import TYPE_CHECKING, Mapping from .models import ProcessedNarration, TextSegment, VoiceType from .voice_profiles import select_voice_for_context if TYPE_CHECKING: pass logger = logging.getLogger(__name__) class NarrationProcessor: """ Preprocesses text for optimal TTS synthesis. Handles: - D&D abbreviation expansion (HP → hit points) - Dice notation conversion (2d6 → two dee six) - Markdown cleanup - Dramatic pause insertion - Dialogue detection and speaker splitting """ # D&D abbreviation expansions (uppercase keys for matching) ABBREVIATIONS: Mapping[str, str] = { "HP": "hit points", "AC": "armor class", "DC": "difficulty class", "DM": "Dungeon Master", "NPC": "N P C", "PC": "player character", "XP": "experience points", "GP": "gold pieces", "SP": "silver pieces", "CP": "copper pieces", "EP": "electrum pieces", "PP": "platinum pieces", "STR": "Strength", "DEX": "Dexterity", "CON": "Constitution", "INT": "Intelligence", "WIS": "Wisdom", "CHA": "Charisma", "ATK": "attack", "DMG": "damage", "INIT": "initiative", "LVL": "level", "HD": "hit dice", "CR": "challenge rating", "AOE": "area of effect", "AOO": "attack of opportunity", } # Dice notation pattern: matches "1d20", "2d6+3", "d8-1", etc. DICE_PATTERN = re.compile( r"(\d*)d(\d+)([+-]\d+)?", re.IGNORECASE, ) # Dialogue detection patterns DIALOGUE_DOUBLE_QUOTE = re.compile(r'"([^"]+)"') DIALOGUE_SINGLE_QUOTE = re.compile(r"'([^']+)'") # Number words for TTS NUMBER_WORDS: Mapping[int, str] = { 1: "one", 2: "two", 3: "three", 4: "four", 5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine", 10: "ten", 11: "eleven", 12: "twelve", 20: "twenty", 100: "one hundred", } # Dramatic words that benefit from a pause before them DRAMATIC_WORDS = frozenset([ "suddenly", "however", "but", "then", "finally", "unfortunately", "meanwhile", "beware", "alas", "behold", ]) # Average characters per second for duration estimation CHARS_PER_SECOND = 15 def __init__(self) -> None: """Initialize the narration processor.""" # Pre-compile abbreviation patterns for efficiency self._abbreviation_patterns: dict[str, re.Pattern[str]] = {} for abbr in self.ABBREVIATIONS: # Word boundary pattern for each abbreviation self._abbreviation_patterns[abbr] = re.compile( rf"\b{abbr}\b", re.IGNORECASE, ) def process_for_tts(self, text: str) -> str: """ Clean and prepare text for TTS synthesis. Operations: 1. Expand D&D abbreviations 2. Convert dice notation to spoken form 3. Clean markdown formatting 4. Normalize whitespace Args: text: Raw input text. Returns: Cleaned text ready for TTS. """ result = text # Expand abbreviations (case-insensitive) for abbr, expansion in self.ABBREVIATIONS.items(): result = self._abbreviation_patterns[abbr].sub(expansion, result) # Convert dice notation result = self.DICE_PATTERN.sub(self._convert_dice_notation, result) # Clean markdown formatting result = self._clean_markdown(result) # Normalize whitespace result = " ".join(result.split()) return result.strip() def add_dramatic_pauses(self, text: str) -> str: """ Insert timing markers for dramatic effect. Uses natural punctuation since ElevenLabs respects pauses. Args: text: Text to add pauses to. Returns: Text with dramatic pauses inserted. """ result = text # Normalize ellipsis (ensure space after) result = re.sub(r"\.{3,}", "... ", result) # Add slight pause before dramatic words for word in self.DRAMATIC_WORDS: # Match word at start of sentence or after space pattern = rf"(\s)({word})\b" result = re.sub( pattern, r"\1... \2", result, flags=re.IGNORECASE, ) # Clean up any double spaces created result = " ".join(result.split()) return result def handle_dialogue( self, text: str, default_npc_voice: VoiceType = VoiceType.NPC_MALE_GRUFF, ) -> list[TextSegment]: """ Split text into segments with speaker detection. Detects quoted text as dialogue and assigns appropriate voice types. Args: text: Text containing potential dialogue. default_npc_voice: Voice type to use for dialogue if not specified. Returns: List of TextSegments with appropriate voice assignments. """ segments: list[TextSegment] = [] last_end = 0 # Find all dialogue (double quotes) for match in self.DIALOGUE_DOUBLE_QUOTE.finditer(text): # Add narration before dialogue if match.start() > last_end: narration = text[last_end : match.start()].strip() if narration: segments.append( TextSegment( text=self.process_for_tts(narration), voice_type=VoiceType.DM, is_dialogue=False, ) ) # Add dialogue dialogue = match.group(1) segments.append( TextSegment( text=self.process_for_tts(dialogue), voice_type=default_npc_voice, is_dialogue=True, pause_before_ms=100, # Brief pause before dialogue pause_after_ms=100, # Brief pause after dialogue ) ) last_end = match.end() # Add remaining narration if last_end < len(text): remaining = text[last_end:].strip() if remaining: segments.append( TextSegment( text=self.process_for_tts(remaining), voice_type=VoiceType.DM, is_dialogue=False, ) ) # If no dialogue found, return single DM segment if not segments: segments.append( TextSegment( text=self.process_for_tts(text), voice_type=VoiceType.DM, is_dialogue=False, ) ) return segments def detect_voice_type( self, text: str, context: dict[str, object] | None = None, ) -> VoiceType: """ Analyze text and context to determine appropriate voice type. Args: text: Text to analyze. context: Game context dictionary with keys like: - in_combat: bool - current_npc: dict | None Returns: VoiceType to use for this text. """ context = context or {} # Check for current NPC current_npc = context.get("current_npc") if current_npc and isinstance(current_npc, dict): from .voice_profiles import select_voice_from_npc_data return select_voice_from_npc_data(current_npc) # Check if text contains dialogue has_dialogue = bool( self.DIALOGUE_DOUBLE_QUOTE.search(text) or self.DIALOGUE_SINGLE_QUOTE.search(text) ) # Get combat state in_combat = bool(context.get("in_combat", False)) return select_voice_for_context( is_dialogue=has_dialogue, is_combat=in_combat, ) def split_by_speaker( self, text: str, context: dict[str, object] | None = None, ) -> list[tuple[str, VoiceType]]: """ Split response into segments by speaker. Convenience method that returns simpler tuple format. Args: text: Text to split. context: Game context for voice selection. Returns: List of (text, voice_type) tuples. """ context = context or {} # Determine NPC voice from context npc_voice = VoiceType.NPC_MALE_GRUFF current_npc = context.get("current_npc") if current_npc and isinstance(current_npc, dict): from .voice_profiles import select_voice_from_npc_data npc_voice = select_voice_from_npc_data(current_npc) segments = self.handle_dialogue(text, default_npc_voice=npc_voice) return [(seg.text, seg.voice_type) for seg in segments] def process( self, text: str, context: dict[str, object] | None = None, add_pauses: bool = True, ) -> ProcessedNarration: """ Full processing pipeline for narration. Args: text: Raw text to process. context: Game context for voice selection. add_pauses: Whether to add dramatic pauses. Returns: ProcessedNarration with all segments and metadata. """ context = context or {} # Get segments with voice assignments npc_voice = VoiceType.NPC_MALE_GRUFF current_npc = context.get("current_npc") if current_npc and isinstance(current_npc, dict): from .voice_profiles import select_voice_from_npc_data npc_voice = select_voice_from_npc_data(current_npc) segments = self.handle_dialogue(text, default_npc_voice=npc_voice) # Add dramatic pauses if requested if add_pauses: for segment in segments: segment.text = self.add_dramatic_pauses(segment.text) # Build complete text total_text = " ".join(seg.text for seg in segments) # Determine primary voice primary_voice = VoiceType.DM for seg in segments: if not seg.is_dialogue: primary_voice = seg.voice_type break # Check for dialogue has_dialogue = any(seg.is_dialogue for seg in segments) # Estimate duration estimated_duration = int((len(total_text) / self.CHARS_PER_SECOND) * 1000) return ProcessedNarration( segments=segments, total_text=total_text, primary_voice=primary_voice, has_dialogue=has_dialogue, estimated_duration_ms=estimated_duration, ) def _convert_dice_notation(self, match: re.Match[str]) -> str: """ Convert dice notation to speakable text. Examples: - "1d20" → "one dee twenty" - "2d6+3" → "two dee six plus three" - "d8" → "one dee eight" """ num_dice = match.group(1) or "1" die_size = match.group(2) modifier = match.group(3) # Convert numbers to words where available num_dice_int = int(num_dice) die_size_int = int(die_size) num_word = self.NUMBER_WORDS.get(num_dice_int, num_dice) die_word = self.NUMBER_WORDS.get(die_size_int, die_size) result = f"{num_word} dee {die_word}" if modifier: mod_sign = modifier[0] mod_value = modifier[1:] if mod_sign == "+": result += f" plus {mod_value}" elif mod_sign == "-": result += f" minus {mod_value}" return result def _clean_markdown(self, text: str) -> str: """ Remove markdown formatting from text. Args: text: Text with potential markdown. Returns: Plain text without markdown. """ result = text # Remove bold/italic markers result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result) # **bold** result = re.sub(r"\*([^*]+)\*", r"\1", result) # *italic* result = re.sub(r"__([^_]+)__", r"\1", result) # __bold__ result = re.sub(r"_([^_]+)_", r"\1", result) # _italic_ # Remove headers result = re.sub(r"^#+\s*", "", result, flags=re.MULTILINE) # Remove horizontal rules result = re.sub(r"^[-*_]{3,}\s*$", "", result, flags=re.MULTILINE) # Remove bullet points (keep text) result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE) # Remove numbered lists (keep text) result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE) # Remove links but keep text: [text](url) → text result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result) # Remove inline code result = re.sub(r"`([^`]+)`", r"\1", result) # Remove code blocks result = re.sub(r"```[\s\S]*?```", "", result) return result def estimate_duration_ms(self, text: str) -> int: """ Estimate audio duration for text. Args: text: Text to estimate duration for. Returns: Estimated duration in milliseconds. """ # Process text first to get accurate character count processed = self.process_for_tts(text) return int((len(processed) / self.CHARS_PER_SECOND) * 1000)