"""
DungeonMaster AI - Text Processor for Voice Synthesis

Preprocesses text for optimal TTS synthesis, including abbreviation expansion,
dice notation conversion, dialogue detection, and dramatic pause insertion.
"""

from __future__ import annotations

import logging
import re
from typing import TYPE_CHECKING, Mapping

from .models import ProcessedNarration, TextSegment, VoiceType
from .voice_profiles import select_voice_for_context

if TYPE_CHECKING:
    pass

logger = logging.getLogger(__name__)


class NarrationProcessor:
    """
    Preprocesses text for optimal TTS synthesis.

    Handles:
    - D&D abbreviation expansion (HP → hit points)
    - Dice notation conversion (2d6 → two dee six)
    - Markdown cleanup
    - Dramatic pause insertion
    - Dialogue detection and speaker splitting
    """

    ABBREVIATIONS: Mapping[str, str] = {
        "HP": "hit points",
        "AC": "armor class",
        "DC": "difficulty class",
        "DM": "Dungeon Master",
        "NPC": "N P C",
        "PC": "player character",
        "XP": "experience points",
        "GP": "gold pieces",
        "SP": "silver pieces",
        "CP": "copper pieces",
        "EP": "electrum pieces",
        "PP": "platinum pieces",
        "STR": "Strength",
        "DEX": "Dexterity",
        "CON": "Constitution",
        "INT": "Intelligence",
        "WIS": "Wisdom",
        "CHA": "Charisma",
        "ATK": "attack",
        "DMG": "damage",
        "INIT": "initiative",
        "LVL": "level",
        "HD": "hit dice",
        "CR": "challenge rating",
        "AOE": "area of effect",
        "AOO": "attack of opportunity",
    }

    DICE_PATTERN = re.compile(
        r"(\d*)d(\d+)([+-]\d+)?",
        re.IGNORECASE,
    )

    DIALOGUE_DOUBLE_QUOTE = re.compile(r'"([^"]+)"')
    DIALOGUE_SINGLE_QUOTE = re.compile(r"'([^']+)'")

    NUMBER_WORDS: Mapping[int, str] = {
        1: "one",
        2: "two",
        3: "three",
        4: "four",
        5: "five",
        6: "six",
        7: "seven",
        8: "eight",
        9: "nine",
        10: "ten",
        11: "eleven",
        12: "twelve",
        20: "twenty",
        100: "one hundred",
    }

    DRAMATIC_WORDS = frozenset([
        "suddenly",
        "however",
        "but",
        "then",
        "finally",
        "unfortunately",
        "meanwhile",
        "beware",
        "alas",
        "behold",
    ])

    CHARS_PER_SECOND = 15
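    # Rough sizing example for the heuristic above: a 300-character narration
    # is estimated at 300 / 15 = 20 seconds, i.e. about 20,000 ms.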

    def __init__(self) -> None:
        """Initialize the narration processor."""
        # Precompile a case-insensitive, word-boundary pattern per abbreviation.
        self._abbreviation_patterns: dict[str, re.Pattern[str]] = {}
        for abbr in self.ABBREVIATIONS:
            self._abbreviation_patterns[abbr] = re.compile(
                rf"\b{abbr}\b",
                re.IGNORECASE,
            )

    def process_for_tts(self, text: str) -> str:
        """
        Clean and prepare text for TTS synthesis.

        Operations:
        1. Expand D&D abbreviations
        2. Convert dice notation to spoken form
        3. Clean markdown formatting
        4. Normalize whitespace

        Args:
            text: Raw input text.

        Returns:
            Cleaned text ready for TTS.
        """
        result = text

        # 1. Expand D&D abbreviations (e.g. "HP" -> "hit points").
        for abbr, expansion in self.ABBREVIATIONS.items():
            result = self._abbreviation_patterns[abbr].sub(expansion, result)

        # 2. Convert dice notation to spoken form.
        result = self.DICE_PATTERN.sub(self._convert_dice_notation, result)

        # 3. Strip markdown formatting.
        result = self._clean_markdown(result)

        # 4. Normalize whitespace.
        result = " ".join(result.split())

        return result.strip()
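
    # Illustrative example (hypothetical input):
    #   process_for_tts("The goblin has 7 HP and AC 13. Roll 1d20 to hit.")
    #   -> "The goblin has 7 hit points and armor class 13. Roll one dee twenty to hit."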

    def add_dramatic_pauses(self, text: str) -> str:
        """
        Insert timing markers for dramatic effect.

        Uses natural punctuation since ElevenLabs respects pauses.

        Args:
            text: Text to add pauses to.

        Returns:
            Text with dramatic pauses inserted.
        """
        result = text

        # Normalize long ellipsis runs to a single "... ".
        result = re.sub(r"\.{3,}", "... ", result)

        # Insert a pause before dramatic transition words.
        for word in self.DRAMATIC_WORDS:
            pattern = rf"(\s)({word})\b"
            result = re.sub(
                pattern,
                r"\1... \2",
                result,
                flags=re.IGNORECASE,
            )

        # Collapse any doubled whitespace introduced above.
        result = " ".join(result.split())

        return result
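
    # Illustrative example (hypothetical input):
    #   add_dramatic_pauses("You open the door. Suddenly, a troll appears!")
    #   -> "You open the door. ... Suddenly, a troll appears!"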

    def handle_dialogue(
        self,
        text: str,
        default_npc_voice: VoiceType = VoiceType.NPC_MALE_GRUFF,
    ) -> list[TextSegment]:
        """
        Split text into segments with speaker detection.

        Detects quoted text as dialogue and assigns appropriate voice types.

        Args:
            text: Text containing potential dialogue.
            default_npc_voice: Voice type to use for dialogue if not specified.

        Returns:
            List of TextSegments with appropriate voice assignments.
        """
        segments: list[TextSegment] = []
        last_end = 0

        for match in self.DIALOGUE_DOUBLE_QUOTE.finditer(text):
            # Narration preceding this quote is spoken by the DM.
            if match.start() > last_end:
                narration = text[last_end : match.start()].strip()
                if narration:
                    segments.append(
                        TextSegment(
                            text=self.process_for_tts(narration),
                            voice_type=VoiceType.DM,
                            is_dialogue=False,
                        )
                    )

            # The quoted text itself becomes an NPC dialogue segment.
            dialogue = match.group(1)
            segments.append(
                TextSegment(
                    text=self.process_for_tts(dialogue),
                    voice_type=default_npc_voice,
                    is_dialogue=True,
                    pause_before_ms=100,
                    pause_after_ms=100,
                )
            )

            last_end = match.end()

        # Any narration after the final quote.
        if last_end < len(text):
            remaining = text[last_end:].strip()
            if remaining:
                segments.append(
                    TextSegment(
                        text=self.process_for_tts(remaining),
                        voice_type=VoiceType.DM,
                        is_dialogue=False,
                    )
                )

        # No quotes found: treat the whole text as DM narration.
        if not segments:
            segments.append(
                TextSegment(
                    text=self.process_for_tts(text),
                    voice_type=VoiceType.DM,
                    is_dialogue=False,
                )
            )

        return segments
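
    # Illustrative example (hypothetical input):
    #   handle_dialogue('The guard says, "Halt! Who goes there?"') yields a DM
    #   narration segment ("The guard says,") followed by a dialogue segment
    #   ("Halt! Who goes there?") in the default NPC voice.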

    def detect_voice_type(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> VoiceType:
        """
        Analyze text and context to determine appropriate voice type.

        Args:
            text: Text to analyze.
            context: Game context dictionary with keys like:
                - in_combat: bool
                - current_npc: dict | None

        Returns:
            VoiceType to use for this text.
        """
        context = context or {}

        # An active NPC in the context takes precedence.
        current_npc = context.get("current_npc")
        if current_npc and isinstance(current_npc, dict):
            from .voice_profiles import select_voice_from_npc_data

            return select_voice_from_npc_data(current_npc)

        # Otherwise fall back to dialogue/combat heuristics.
        has_dialogue = bool(
            self.DIALOGUE_DOUBLE_QUOTE.search(text)
            or self.DIALOGUE_SINGLE_QUOTE.search(text)
        )

        in_combat = bool(context.get("in_combat", False))

        return select_voice_for_context(
            is_dialogue=has_dialogue,
            is_combat=in_combat,
        )
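
    # Note: with no current_npc in the context, a line such as
    # '"Stand back!" she cries' is routed through select_voice_for_context
    # with is_dialogue=True (and is_combat taken from the context).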

    def split_by_speaker(
        self,
        text: str,
        context: dict[str, object] | None = None,
    ) -> list[tuple[str, VoiceType]]:
        """
        Split response into segments by speaker.

        Convenience method that returns simpler tuple format.

        Args:
            text: Text to split.
            context: Game context for voice selection.

        Returns:
            List of (text, voice_type) tuples.
        """
        context = context or {}

        # Use the active NPC's voice for dialogue when one is present.
        npc_voice = VoiceType.NPC_MALE_GRUFF
        current_npc = context.get("current_npc")
        if current_npc and isinstance(current_npc, dict):
            from .voice_profiles import select_voice_from_npc_data

            npc_voice = select_voice_from_npc_data(current_npc)

        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)
        return [(seg.text, seg.voice_type) for seg in segments]
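
    # Illustrative example (hypothetical input):
    #   split_by_speaker('The guard says, "Halt!"')
    #   -> [("The guard says,", VoiceType.DM), ("Halt!", VoiceType.NPC_MALE_GRUFF)]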

    def process(
        self,
        text: str,
        context: dict[str, object] | None = None,
        add_pauses: bool = True,
    ) -> ProcessedNarration:
        """
        Full processing pipeline for narration.

        Args:
            text: Raw text to process.
            context: Game context for voice selection.
            add_pauses: Whether to add dramatic pauses.

        Returns:
            ProcessedNarration with all segments and metadata.
        """
        context = context or {}

        # Pick the dialogue voice from the active NPC, if any.
        npc_voice = VoiceType.NPC_MALE_GRUFF
        current_npc = context.get("current_npc")
        if current_npc and isinstance(current_npc, dict):
            from .voice_profiles import select_voice_from_npc_data

            npc_voice = select_voice_from_npc_data(current_npc)

        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)

        # Optionally add dramatic pauses to every segment.
        if add_pauses:
            for segment in segments:
                segment.text = self.add_dramatic_pauses(segment.text)

        total_text = " ".join(seg.text for seg in segments)

        # The primary voice is taken from the first narration (non-dialogue) segment.
        primary_voice = VoiceType.DM
        for seg in segments:
            if not seg.is_dialogue:
                primary_voice = seg.voice_type
                break

        has_dialogue = any(seg.is_dialogue for seg in segments)

        estimated_duration = int((len(total_text) / self.CHARS_PER_SECOND) * 1000)

        return ProcessedNarration(
            segments=segments,
            total_text=total_text,
            primary_voice=primary_voice,
            has_dialogue=has_dialogue,
            estimated_duration_ms=estimated_duration,
        )
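
    # Illustrative example (hypothetical input):
    #   process('Suddenly a goblin lunges. "Die!" it shrieks.') returns a
    #   ProcessedNarration whose segments alternate DM narration and NPC
    #   dialogue, with has_dialogue=True and a duration estimate derived from
    #   CHARS_PER_SECOND.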

    def _convert_dice_notation(self, match: re.Match[str]) -> str:
        """
        Convert dice notation to speakable text.

        Examples:
        - "1d20" → "one dee twenty"
        - "2d6+3" → "two dee six plus three"
        - "d8" → "one dee eight"
        """
        num_dice = match.group(1) or "1"
        die_size = match.group(2)
        modifier = match.group(3)

        num_dice_int = int(num_dice)
        die_size_int = int(die_size)

        num_word = self.NUMBER_WORDS.get(num_dice_int, num_dice)
        die_word = self.NUMBER_WORDS.get(die_size_int, die_size)

        result = f"{num_word} dee {die_word}"

        if modifier:
            mod_sign = modifier[0]
            mod_value = modifier[1:]
            # Spell out the modifier where possible ("+3" -> "plus three"),
            # falling back to the digits for values without a word mapping.
            mod_word = self.NUMBER_WORDS.get(int(mod_value), mod_value)

            if mod_sign == "+":
                result += f" plus {mod_word}"
            elif mod_sign == "-":
                result += f" minus {mod_word}"

        return result

    def _clean_markdown(self, text: str) -> str:
        """
        Remove markdown formatting from text.

        Args:
            text: Text with potential markdown.

        Returns:
            Plain text without markdown.
        """
        result = text

        # Drop fenced code blocks first so their backtick fences are not
        # partially consumed by the inline-code pattern below.
        result = re.sub(r"```[\s\S]*?```", "", result)

        # Bold and italic markers.
        result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)
        result = re.sub(r"\*([^*]+)\*", r"\1", result)
        result = re.sub(r"__([^_]+)__", r"\1", result)
        result = re.sub(r"_([^_]+)_", r"\1", result)

        # Headers.
        result = re.sub(r"^#+\s*", "", result, flags=re.MULTILINE)

        # Horizontal rules.
        result = re.sub(r"^[-*_]{3,}\s*$", "", result, flags=re.MULTILINE)

        # Bullet list markers.
        result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)

        # Numbered list markers.
        result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)

        # Links: keep the link text, drop the URL.
        result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)

        # Inline code: keep the contents, drop the backticks.
        result = re.sub(r"`([^`]+)`", r"\1", result)

        return result
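
    # Illustrative example (hypothetical input):
    #   _clean_markdown("**Beware** the [ancient seal](https://example.com)!")
    #   -> "Beware the ancient seal!"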

    def estimate_duration_ms(self, text: str) -> int:
        """
        Estimate audio duration for text.

        Args:
            text: Text to estimate duration for.

        Returns:
            Estimated duration in milliseconds.
        """
        processed = self.process_for_tts(text)
        return int((len(processed) / self.CHARS_PER_SECOND) * 1000)
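

# Illustrative usage sketch (not part of the module's public surface). Assumes
# the package is importable so the relative imports above resolve; run with
# `python -m <package>.<this_module>` (exact names depend on the project layout).
if __name__ == "__main__":
    processor = NarrationProcessor()
    sample = 'You take 2d6+3 damage. The guard snarls, "Halt! Beware the dark."'
    narration = processor.process(sample, context={"in_combat": True})
    for segment in narration.segments:
        role = "dialogue" if segment.is_dialogue else "narration"
        print(f"[{segment.voice_type}] ({role}) {segment.text}")
    print(f"Estimated duration: {narration.estimated_duration_ms} ms")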