# DungeonMaster-AI/src/voice/text_processor.py
# Author: bhupesh-sf; first commit (f8ba6bf, verified)
"""
DungeonMaster AI - Text Processor for Voice Synthesis
Preprocesses text for optimal TTS synthesis, including abbreviation expansion,
dice notation conversion, dialogue detection, and dramatic pause insertion.
"""
from __future__ import annotations
import logging
import re
from typing import TYPE_CHECKING, Mapping
from .models import ProcessedNarration, TextSegment, VoiceType
from .voice_profiles import select_voice_for_context
if TYPE_CHECKING:
    # No type-only imports are needed at present; the guard is kept as a
    # placeholder so they can be added without a runtime import cost.
    pass

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class NarrationProcessor:
    """
    Preprocesses text for optimal TTS synthesis.

    Handles:
    - D&D abbreviation expansion (HP → hit points)
    - Dice notation conversion (2d6 → two dee six)
    - Markdown cleanup
    - Dramatic pause insertion
    - Dialogue detection and speaker splitting
    """

    # D&D abbreviation expansions. Keys are uppercase; matching is
    # case-insensitive and anchored on word boundaries.
    # NOTE(review): because case is ignored, ordinary lowercase words such as
    # "con" or "int" are expanded as well - confirm this is acceptable.
    ABBREVIATIONS: Mapping[str, str] = {
        "HP": "hit points",
        "AC": "armor class",
        "DC": "difficulty class",
        "DM": "Dungeon Master",
        "NPC": "N P C",
        "PC": "player character",
        "XP": "experience points",
        "GP": "gold pieces",
        "SP": "silver pieces",
        "CP": "copper pieces",
        "EP": "electrum pieces",
        "PP": "platinum pieces",
        "STR": "Strength",
        "DEX": "Dexterity",
        "CON": "Constitution",
        "INT": "Intelligence",
        "WIS": "Wisdom",
        "CHA": "Charisma",
        "ATK": "attack",
        "DMG": "damage",
        "INIT": "initiative",
        "LVL": "level",
        "HD": "hit dice",
        "CR": "challenge rating",
        "AOE": "area of effect",
        "AOO": "attack of opportunity",
    }

    # Dice notation pattern: matches "1d20", "2d6+3", "d8-1", etc.
    # - The leading \b stops matches inside other tokens ("abcd6", "R2D2").
    # - The look-ahead on the modifier stops "+3" being consumed when it is
    #   really the dice count of a following term, as in "2d6+3d4".
    DICE_PATTERN = re.compile(
        r"\b(\d*)d(\d+)([+-]\d+(?!\d*d\d))?",
        re.IGNORECASE,
    )

    # Dialogue detection patterns. Group 1 is the quoted text in both.
    # Double quotes: straight (") or typographic (U+201C / U+201D).
    DIALOGUE_DOUBLE_QUOTE = re.compile(r'["\u201c]([^"\u201c\u201d]+)["\u201d]')
    # Single quotes: the opening quote must not follow a word character and
    # the closing quote must not precede one, so apostrophes in contractions
    # ("don't", "it's") are not mistaken for quotation marks. Apostrophes
    # inside the quoted text are allowed when a word character follows them.
    DIALOGUE_SINGLE_QUOTE = re.compile(r"(?<!\w)'((?:[^']|'(?=\w))+)'(?!\w)")

    # Number words for TTS
    NUMBER_WORDS: Mapping[int, str] = {
        1: "one",
        2: "two",
        3: "three",
        4: "four",
        5: "five",
        6: "six",
        7: "seven",
        8: "eight",
        9: "nine",
        10: "ten",
        11: "eleven",
        12: "twelve",
        20: "twenty",
        100: "one hundred",
    }

    # Dramatic words that benefit from a pause before them
    DRAMATIC_WORDS = frozenset([
        "suddenly",
        "however",
        "but",
        "then",
        "finally",
        "unfortunately",
        "meanwhile",
        "beware",
        "alas",
        "behold",
    ])

    # Average characters per second for duration estimation
    CHARS_PER_SECOND = 15

    def __init__(self) -> None:
        """Initialize the narration processor and pre-compile its patterns."""
        # One word-bounded, case-insensitive pattern per abbreviation.
        self._abbreviation_patterns: dict[str, re.Pattern[str]] = {
            abbr: re.compile(rf"\b{re.escape(abbr)}\b", re.IGNORECASE)
            for abbr in self.ABBREVIATIONS
        }

        # Single pattern for every dramatic word. The look-behind skips words
        # that already follow an ellipsis so the pause is not doubled.
        alternation = "|".join(
            re.escape(word) for word in sorted(self.DRAMATIC_WORDS)
        )
        self._dramatic_pattern: re.Pattern[str] | None = (
            re.compile(rf"(?<!\.\.\.)(\s)({alternation})\b", re.IGNORECASE)
            if alternation
            else None
        )

    def process_for_tts(self, text: str) -> str:
        """
        Clean and prepare text for TTS synthesis.

        Operations, in order:
        1. Clean markdown formatting
        2. Expand D&D abbreviations
        3. Convert dice notation to spoken form
        4. Normalize whitespace

        Markdown is removed first because "_" counts as a word character:
        in "__AC__" the word boundary needed to recognise "AC" only exists
        once the emphasis markers are gone.

        Args:
            text: Raw input text.

        Returns:
            Cleaned text ready for TTS, with whitespace runs collapsed to
            single spaces and no leading or trailing whitespace.
        """
        result = self._clean_markdown(text)

        # Expand abbreviations (case-insensitive)
        for abbr, pattern in self._abbreviation_patterns.items():
            result = pattern.sub(self.ABBREVIATIONS[abbr], result)

        # Convert dice notation
        result = self.DICE_PATTERN.sub(self._convert_dice_notation, result)

        # Normalize whitespace (split/join also trims both ends)
        return " ".join(result.split())

    def add_dramatic_pauses(self, text: str) -> str:
        """
        Insert timing markers for dramatic effect.

        Pauses are written as an ellipsis ("...") in the text itself.

        Args:
            text: Text to add pauses to.

        Returns:
            Text with an ellipsis inserted before each dramatic word that
            follows whitespace, and with whitespace normalized.
        """
        # Normalize every ellipsis to exactly "... ", absorbing any whitespace
        # that followed it, so the look-behind in the dramatic-word pattern
        # sees the dots directly before the separating space.
        result = re.sub(r"\.{3,}\s*", "... ", text)

        # Add a slight pause before dramatic words that follow whitespace,
        # unless an ellipsis is already there.
        if self._dramatic_pattern is not None:
            result = self._dramatic_pattern.sub(r"\1... \2", result)

        # Clean up any double spaces and trim the ends
        return " ".join(result.split())

    def handle_dialogue(
        self,
        text: str,
        default_npc_voice: VoiceType = VoiceType.NPC_MALE_GRUFF,
    ) -> list[TextSegment]:
        """
        Split text into segments with speaker detection.

        Text inside double quotes is treated as dialogue; everything else is
        narration spoken with the DM voice.

        Args:
            text: Text containing potential dialogue.
            default_npc_voice: Voice type assigned to every dialogue segment.

        Returns:
            List of TextSegments in reading order. Segments whose text is
            empty after processing are dropped; if nothing remains, a single
            DM segment holding the processed input is returned so the list is
            never empty.
        """
        # Strip markdown before looking for quotes so that emphasis wrapped
        # around a quotation (e.g. *"Run!"*) does not leave stray markers
        # behind as narration segments of their own.
        cleaned = self._clean_markdown(text)

        segments: list[TextSegment] = []
        cursor = 0
        for match in self.DIALOGUE_DOUBLE_QUOTE.finditer(cleaned):
            # Narration between the previous quote and this one
            self._append_segment(
                segments,
                cleaned[cursor : match.start()],
                VoiceType.DM,
                is_dialogue=False,
            )
            # The quoted dialogue itself (quote marks excluded)
            self._append_segment(
                segments,
                match.group(1),
                default_npc_voice,
                is_dialogue=True,
            )
            cursor = match.end()

        # Narration after the last quote (or the whole text if none matched)
        self._append_segment(
            segments,
            cleaned[cursor:],
            VoiceType.DM,
            is_dialogue=False,
        )

        # Guarantee at least one segment for callers
        if not segments:
            segments.append(
                TextSegment(
                    text=self.process_for_tts(text),
                    voice_type=VoiceType.DM,
                    is_dialogue=False,
                )
            )
        return segments

    def _append_segment(
        self,
        segments: list[TextSegment],
        raw_text: str,
        voice_type: VoiceType,
        *,
        is_dialogue: bool,
    ) -> None:
        """Process raw_text and append it as a segment unless it is empty."""
        spoken = self.process_for_tts(raw_text)
        if not spoken:
            return

        if is_dialogue:
            segments.append(
                TextSegment(
                    text=spoken,
                    voice_type=voice_type,
                    is_dialogue=True,
                    pause_before_ms=100,  # Brief pause before dialogue
                    pause_after_ms=100,  # Brief pause after dialogue
                )
            )
        else:
            segments.append(
                TextSegment(
                    text=spoken,
                    voice_type=voice_type,
                    is_dialogue=False,
                )
            )

    def _npc_voice_from_context(
        self,
        context: Mapping[str, object] | None,
        default: VoiceType | None = None,
    ) -> VoiceType | None:
        """Return the voice for context["current_npc"], or default if unset."""
        current_npc = (context or {}).get("current_npc")
        if current_npc and isinstance(current_npc, dict):
            from .voice_profiles import select_voice_from_npc_data

            voice = select_voice_from_npc_data(current_npc)
            if voice is not None:
                return voice
        return default

    def detect_voice_type(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> VoiceType:
        """
        Analyze text and context to determine appropriate voice type.

        Args:
            text: Text to analyze.
            context: Game context dictionary with keys like:
                - in_combat: bool
                - current_npc: dict | None

        Returns:
            The voice selected from the current NPC's data when the context
            carries a non-empty "current_npc" dict; otherwise the voice
            chosen from whether the text contains quoted dialogue and
            whether combat is active.
        """
        context = context or {}

        # A current NPC takes precedence over text analysis
        npc_voice = self._npc_voice_from_context(context)
        if npc_voice is not None:
            return npc_voice

        # Check if text contains dialogue (either quote style)
        has_dialogue = bool(
            self.DIALOGUE_DOUBLE_QUOTE.search(text)
            or self.DIALOGUE_SINGLE_QUOTE.search(text)
        )

        # Get combat state
        in_combat = bool(context.get("in_combat", False))

        return select_voice_for_context(
            is_dialogue=has_dialogue,
            is_combat=in_combat,
        )

    def split_by_speaker(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> list[tuple[str, VoiceType]]:
        """
        Split response into segments by speaker.

        Convenience wrapper around handle_dialogue that returns a simpler
        tuple format.

        Args:
            text: Text to split.
            context: Game context for voice selection.

        Returns:
            List of (text, voice_type) tuples.
        """
        npc_voice = self._npc_voice_from_context(
            context, default=VoiceType.NPC_MALE_GRUFF
        )
        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)
        return [(seg.text, seg.voice_type) for seg in segments]

    def process(
        self,
        text: str,
        context: dict[str, object] | None = None,
        add_pauses: bool = True,
    ) -> ProcessedNarration:
        """
        Full processing pipeline for narration.

        Args:
            text: Raw text to process.
            context: Game context for voice selection.
            add_pauses: Whether to add dramatic pauses.

        Returns:
            ProcessedNarration with all segments and metadata.
        """
        # Get segments with voice assignments
        npc_voice = self._npc_voice_from_context(
            context, default=VoiceType.NPC_MALE_GRUFF
        )
        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)

        # Add dramatic pauses if requested
        if add_pauses:
            for segment in segments:
                segment.text = self.add_dramatic_pauses(segment.text)

        # Build complete text
        total_text = " ".join(seg.text for seg in segments)

        # Primary voice: that of the first narration segment, DM otherwise
        primary_voice = next(
            (seg.voice_type for seg in segments if not seg.is_dialogue),
            VoiceType.DM,
        )

        return ProcessedNarration(
            segments=segments,
            total_text=total_text,
            primary_voice=primary_voice,
            has_dialogue=any(seg.is_dialogue for seg in segments),
            estimated_duration_ms=self._duration_ms(len(total_text)),
        )

    def _duration_ms(self, char_count: int) -> int:
        """Convert a character count to milliseconds via CHARS_PER_SECOND."""
        return int((char_count / self.CHARS_PER_SECOND) * 1000)

    def _spoken_number(self, digits: str) -> str:
        """Return the word for a digit string, or the digits if unknown."""
        return self.NUMBER_WORDS.get(int(digits), digits)

    def _convert_dice_notation(self, match: re.Match[str]) -> str:
        """
        Convert dice notation to speakable text.

        Numbers listed in NUMBER_WORDS are spelled out; any other number is
        left as digits.

        Examples:
        - "1d20" → "one dee twenty"
        - "2d6+3" → "two dee six plus three"
        - "d8" → "one dee eight"

        Args:
            match: A DICE_PATTERN match (groups: count, die size, modifier).

        Returns:
            The spoken form of the matched dice expression.
        """
        # A missing count ("d8") means a single die
        count_word = self._spoken_number(match.group(1) or "1")
        sides_word = self._spoken_number(match.group(2))
        spoken = f"{count_word} dee {sides_word}"

        modifier = match.group(3)
        if modifier:
            # The pattern guarantees the sign is "+" or "-"
            sign_word = "plus" if modifier[0] == "+" else "minus"
            spoken += f" {sign_word} {self._spoken_number(modifier[1:])}"
        return spoken

    def _clean_markdown(self, text: str) -> str:
        """
        Remove markdown formatting from text.

        Constructs are removed from the outside in: fenced code blocks, then
        line-level markers, then inline markup. Fences must go before inline
        code, otherwise the inline rule eats part of the fence and the block
        is never recognised. List markers must go before emphasis, otherwise
        a "*" bullet can pair up with a later "*".

        Args:
            text: Text with potential markdown.

        Returns:
            Plain text without markdown. Line breaks are preserved.
        """
        result = text

        # Remove fenced code blocks, including their content
        result = re.sub(r"```[\s\S]*?```", "", result)

        # Remove headers
        result = re.sub(r"^#+\s*", "", result, flags=re.MULTILINE)
        # Remove horizontal rules
        result = re.sub(r"^[-*_]{3,}\s*$", "", result, flags=re.MULTILINE)
        # Remove bullet points (keep text)
        result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)
        # Remove numbered lists (keep text)
        result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)

        # Remove links but keep text: [text](url) → text
        result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)

        # Remove bold/italic markers (double markers before single ones)
        result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)  # **bold**
        result = re.sub(r"\*([^*]+)\*", r"\1", result)  # *italic*
        result = re.sub(r"__([^_]+)__", r"\1", result)  # __bold__
        result = re.sub(r"_([^_]+)_", r"\1", result)  # _italic_

        # Remove inline code markers (keep text)
        result = re.sub(r"`([^`]+)`", r"\1", result)
        return result

    def estimate_duration_ms(self, text: str) -> int:
        """
        Estimate audio duration for text.

        Args:
            text: Text to estimate duration for.

        Returns:
            Estimated duration in milliseconds.
        """
        # Process text first so the count reflects what will be spoken
        return self._duration_ms(len(self.process_for_tts(text)))