# Spaces:

# MCP-1st-Birthday
# /

# DungeonMaster-AI

# Running

# File size: 14,114 Bytes

# f8ba6bf

"""
DungeonMaster AI - Text Processor for Voice Synthesis

Preprocesses text for optimal TTS synthesis, including abbreviation expansion,
dice notation conversion, dialogue detection, and dramatic pause insertion.
"""

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING, Mapping

from .models import ProcessedNarration, TextSegment, VoiceType
from .voice_profiles import select_voice_for_context

if TYPE_CHECKING:
    # Placeholder: there are currently no imports that are needed only for
    # static type checking, so this guard is intentionally empty.
    pass

# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)


class NarrationProcessor:
    """
    Preprocesses text for optimal TTS synthesis.

    Handles:
    - D&D abbreviation expansion (HP → hit points)
    - Dice notation conversion (2d6 → two dee six)
    - Markdown cleanup
    - Dramatic pause insertion
    - Dialogue detection and speaker splitting

    The only instance state is a set of regular expressions compiled once in
    ``__init__``; every processing method is a pure function of its inputs.
    """

    # D&D abbreviation expansions (uppercase keys for matching).
    # NOTE: matching is case-insensitive and whole-word, so a lowercase word
    # that happens to equal a key (e.g. "con", "int") is expanded as well.
    ABBREVIATIONS: Mapping[str, str] = {
        "HP": "hit points",
        "AC": "armor class",
        "DC": "difficulty class",
        "DM": "Dungeon Master",
        "NPC": "N P C",
        "PC": "player character",
        "XP": "experience points",
        "GP": "gold pieces",
        "SP": "silver pieces",
        "CP": "copper pieces",
        "EP": "electrum pieces",
        "PP": "platinum pieces",
        "STR": "Strength",
        "DEX": "Dexterity",
        "CON": "Constitution",
        "INT": "Intelligence",
        "WIS": "Wisdom",
        "CHA": "Charisma",
        "ATK": "attack",
        "DMG": "damage",
        "INIT": "initiative",
        "LVL": "level",
        "HD": "hit dice",
        "CR": "challenge rating",
        "AOE": "area of effect",
        "AOO": "attack of opportunity",
    }

    # Dice notation pattern: matches "1d20", "2d6+3", "d8-1", etc.
    # The leading \b keeps a "d<digits>" that sits inside a longer token
    # (e.g. "R2D2") from being read as dice.
    DICE_PATTERN = re.compile(
        r"\b(\d*)d(\d+)([+-]\d+)?",
        re.IGNORECASE,
    )

    # Dialogue detection patterns
    DIALOGUE_DOUBLE_QUOTE = re.compile(r'"([^"]+)"')
    # Single quotes only count as dialogue delimiters when they sit at the
    # edge of a word; an apostrophe inside a word ("dragon's", "isn't") is
    # neither an opening nor a closing quote.
    DIALOGUE_SINGLE_QUOTE = re.compile(r"(?<!\w)'(.+?)'(?!\w)", re.DOTALL)

    # Number words for TTS
    NUMBER_WORDS: Mapping[int, str] = {
        1: "one",
        2: "two",
        3: "three",
        4: "four",
        5: "five",
        6: "six",
        7: "seven",
        8: "eight",
        9: "nine",
        10: "ten",
        11: "eleven",
        12: "twelve",
        20: "twenty",
        100: "one hundred",
    }

    # Dramatic words that benefit from a pause before them
    DRAMATIC_WORDS = frozenset([
        "suddenly",
        "however",
        "but",
        "then",
        "finally",
        "unfortunately",
        "meanwhile",
        "beware",
        "alas",
        "behold",
    ])

    # Average characters per second for duration estimation
    CHARS_PER_SECOND = 15

    # Markdown clean-up rules, applied in order by ``_clean_markdown``.
    # Fenced code blocks must be removed before inline code is unwrapped:
    # otherwise the inline rule eats the innermost pair of backticks of a
    # fence and the fence rule can no longer match.
    _MARKDOWN_RULES = (
        (re.compile(r"```[\s\S]*?```"), ""),  # ```code block``` (dropped)
        (re.compile(r"\*\*([^*]+)\*\*"), r"\1"),  # **bold**
        (re.compile(r"\*([^*]+)\*"), r"\1"),  # *italic*
        # Underscore emphasis only counts at word edges, so identifiers
        # such as "sword_of_light" keep their underscores.
        (re.compile(r"(?<!\w)__([^_]+)__(?!\w)"), r"\1"),  # __bold__
        (re.compile(r"(?<!\w)_([^_]+)_(?!\w)"), r"\1"),  # _italic_
        (re.compile(r"^#+\s*", re.MULTILINE), ""),  # headers
        (re.compile(r"^[-*_]{3,}\s*$", re.MULTILINE), ""),  # horizontal rules
        (re.compile(r"^\s*[-*+]\s+", re.MULTILINE), ""),  # bullets (keep text)
        (re.compile(r"^\s*\d+\.\s+", re.MULTILINE), ""),  # numbered lists
        (re.compile(r"\[([^\]]+)\]\([^)]+\)"), r"\1"),  # [text](url) → text
        (re.compile(r"`([^`]+)`"), r"\1"),  # `inline code`
    )

    def __init__(self) -> None:
        """Initialize the narration processor and pre-compile its patterns."""
        # One whole-word, case-insensitive pattern per abbreviation.
        self._abbreviation_patterns: dict[str, re.Pattern[str]] = {
            abbr: re.compile(rf"\b{re.escape(abbr)}\b", re.IGNORECASE)
            for abbr in self.ABBREVIATIONS
        }

        # A single alternation covering every dramatic word. The words are
        # sorted so the pattern text is deterministic across runs.
        dramatic = "|".join(
            re.escape(word)
            for word in sorted(self.DRAMATIC_WORDS, key=lambda w: (-len(w), w))
        )
        # The whitespace run before the word must not itself follow an
        # ellipsis (a pause is already there) and must be matched from its
        # first character, so a run is never matched from its middle.
        self._dramatic_pause_pattern: re.Pattern[str] | None = (
            re.compile(
                rf"(?<!\.\.\.)(?<!…)(?<!\s)(\s+)({dramatic})\b",
                re.IGNORECASE,
            )
            if dramatic
            else None
        )

    def process_for_tts(self, text: str) -> str:
        """
        Clean and prepare text for TTS synthesis.

        Operations:
        1. Expand D&D abbreviations
        2. Convert dice notation to spoken form
        3. Clean markdown formatting
        4. Normalize whitespace

        Args:
            text: Raw input text.

        Returns:
            Cleaned text ready for TTS, with all whitespace runs collapsed to
            single spaces and no leading or trailing whitespace.
        """
        result = text

        # Expand abbreviations (case-insensitive, whole words only)
        for abbr, expansion in self.ABBREVIATIONS.items():
            result = self._abbreviation_patterns[abbr].sub(expansion, result)

        # Convert dice notation
        result = self.DICE_PATTERN.sub(self._convert_dice_notation, result)

        # Clean markdown formatting
        result = self._clean_markdown(result)

        # split()/join collapses whitespace and trims both ends in one step
        return " ".join(result.split())

    def add_dramatic_pauses(self, text: str) -> str:
        """
        Insert timing markers for dramatic effect.

        Uses natural punctuation since ElevenLabs respects pauses.

        An ellipsis is inserted before each dramatic word that follows
        whitespace, unless the word already follows an ellipsis. Calling the
        method again on its own output therefore adds nothing further.

        Args:
            text: Text to add pauses to.

        Returns:
            Text with dramatic pauses inserted and whitespace collapsed.
        """
        # Normalize ellipsis (ensure space after)
        result = re.sub(r"\.{3,}", "... ", text)

        # Add slight pause before dramatic words
        if self._dramatic_pause_pattern is not None:
            result = self._dramatic_pause_pattern.sub(r"\1... \2", result)

        # Clean up any double spaces created
        return " ".join(result.split())

    def handle_dialogue(
        self,
        text: str,
        default_npc_voice: VoiceType = VoiceType.NPC_MALE_GRUFF,
    ) -> list[TextSegment]:
        """
        Split text into segments with speaker detection.

        Text inside double quotes becomes a dialogue segment spoken with
        ``default_npc_voice``; everything between quotes becomes a narration
        segment spoken with the DM voice. Every segment's text is passed
        through ``process_for_tts``.

        Args:
            text: Text containing potential dialogue.
            default_npc_voice: Voice type to use for dialogue if not specified.

        Returns:
            List of TextSegments with appropriate voice assignments. The list
            is never empty: text without any usable content yields a single
            DM segment.
        """
        segments: list[TextSegment] = []
        cursor = 0

        for match in self.DIALOGUE_DOUBLE_QUOTE.finditer(text):
            # Narration between the previous quote (or the start) and this one
            narration = text[cursor : match.start()].strip()
            if narration:
                segments.append(self._narration_segment(narration))

            segments.append(
                TextSegment(
                    text=self.process_for_tts(match.group(1)),
                    voice_type=default_npc_voice,
                    is_dialogue=True,
                    pause_before_ms=100,  # Brief pause before dialogue
                    pause_after_ms=100,  # Brief pause after dialogue
                )
            )
            cursor = match.end()

        # Narration after the last quote
        trailing = text[cursor:].strip()
        if trailing:
            segments.append(self._narration_segment(trailing))

        # Nothing found at all (e.g. empty input): return a single DM segment
        if not segments:
            segments.append(self._narration_segment(text))

        return segments

    def detect_voice_type(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> VoiceType:
        """
        Analyze text and context to determine appropriate voice type.

        A non-empty ``current_npc`` dict in the context takes precedence;
        otherwise the choice is based on whether the text contains quoted
        dialogue and on the combat flag.

        Args:
            text: Text to analyze.
            context: Game context dictionary with keys like:
                     - in_combat: bool
                     - current_npc: dict | None

        Returns:
            VoiceType to use for this text.
        """
        context = context or {}

        npc_data = self._current_npc_data(context)
        if npc_data is not None:
            return self._voice_for_npc(npc_data)

        has_dialogue = bool(
            self.DIALOGUE_DOUBLE_QUOTE.search(text)
            or self.DIALOGUE_SINGLE_QUOTE.search(text)
        )
        in_combat = bool(context.get("in_combat", False))

        return select_voice_for_context(
            is_dialogue=has_dialogue,
            is_combat=in_combat,
        )

    def split_by_speaker(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> list[tuple[str, VoiceType]]:
        """
        Split response into segments by speaker.

        Convenience method that returns simpler tuple format.

        Args:
            text: Text to split.
            context: Game context for voice selection.

        Returns:
            List of (text, voice_type) tuples.
        """
        npc_voice = self._resolve_npc_voice(context or {})
        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)
        return [(seg.text, seg.voice_type) for seg in segments]

    def process(
        self,
        text: str,
        context: dict[str, object] | None = None,
        add_pauses: bool = True,
    ) -> ProcessedNarration:
        """
        Full processing pipeline for narration.

        Args:
            text: Raw text to process.
            context: Game context for voice selection.
            add_pauses: Whether to add dramatic pauses.

        Returns:
            ProcessedNarration with all segments and metadata. The duration
            estimate is derived from the length of the joined segment text.
        """
        npc_voice = self._resolve_npc_voice(context or {})
        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)

        # Add dramatic pauses if requested
        if add_pauses:
            for segment in segments:
                segment.text = self.add_dramatic_pauses(segment.text)

        total_text = " ".join(seg.text for seg in segments)

        # Primary voice is that of the first narration segment, DM otherwise
        primary_voice = next(
            (seg.voice_type for seg in segments if not seg.is_dialogue),
            VoiceType.DM,
        )

        return ProcessedNarration(
            segments=segments,
            total_text=total_text,
            primary_voice=primary_voice,
            has_dialogue=any(seg.is_dialogue for seg in segments),
            estimated_duration_ms=self._chars_to_ms(len(total_text)),
        )

    def _narration_segment(self, text: str) -> TextSegment:
        """Build a non-dialogue DM segment from raw narration text."""
        return TextSegment(
            text=self.process_for_tts(text),
            voice_type=VoiceType.DM,
            is_dialogue=False,
        )

    @staticmethod
    def _current_npc_data(context: dict[str, object]) -> dict | None:
        """Return the context's ``current_npc`` if it is a non-empty dict."""
        npc_data = context.get("current_npc")
        if npc_data and isinstance(npc_data, dict):
            return npc_data
        return None

    @staticmethod
    def _voice_for_npc(npc_data: dict) -> VoiceType:
        """Pick the voice for an NPC via ``select_voice_from_npc_data``."""
        # Imported at call time, as the original code did.
        from .voice_profiles import select_voice_from_npc_data

        return select_voice_from_npc_data(npc_data)

    def _resolve_npc_voice(self, context: dict[str, object]) -> VoiceType:
        """Return the current NPC's voice, or the gruff male default."""
        npc_data = self._current_npc_data(context)
        if npc_data is None:
            return VoiceType.NPC_MALE_GRUFF
        return self._voice_for_npc(npc_data)

    def _chars_to_ms(self, char_count: int) -> int:
        """Convert a character count to an estimated duration in ms."""
        return int((char_count / self.CHARS_PER_SECOND) * 1000)

    def _spoken_number(self, digits: str) -> str:
        """Return the word for a digit string, or the digits if unmapped."""
        return self.NUMBER_WORDS.get(int(digits), digits)

    def _convert_dice_notation(self, match: re.Match[str]) -> str:
        """
        Convert dice notation to speakable text.

        Numbers without an entry in ``NUMBER_WORDS`` are left as digits.

        Examples:
        - "1d20" → "one dee twenty"
        - "2d6+3" → "two dee six plus three"
        - "d8" → "one dee eight"

        Args:
            match: A match produced by ``DICE_PATTERN``.

        Returns:
            The spoken form of the matched dice expression.
        """
        # A missing dice count ("d8") means a single die
        count_word = self._spoken_number(match.group(1) or "1")
        size_word = self._spoken_number(match.group(2))
        spoken = f"{count_word} dee {size_word}"

        modifier = match.group(3)
        if modifier:
            # DICE_PATTERN only admits "+" or "-" as the sign character
            sign_word = "plus" if modifier[0] == "+" else "minus"
            spoken += f" {sign_word} {self._spoken_number(modifier[1:])}"

        return spoken

    def _clean_markdown(self, text: str) -> str:
        """
        Remove markdown formatting from text.

        Fenced code blocks are dropped entirely; for every other construct
        (emphasis, headers, list markers, links, inline code) only the markup
        is removed and the text is kept.

        Args:
            text: Text with potential markdown.

        Returns:
            Plain text without markdown.
        """
        result = text
        for pattern, replacement in self._MARKDOWN_RULES:
            result = pattern.sub(replacement, result)
        return result

    def estimate_duration_ms(self, text: str) -> int:
        """
        Estimate audio duration for text.

        Args:
            text: Text to estimate duration for.

        Returns:
            Estimated duration in milliseconds.
        """
        # Process text first to get accurate character count
        return self._chars_to_ms(len(self.process_for_tts(text)))