Spaces:

MCP-1st-Birthday
/

DungeonMaster-AI

Running

App Files Files Community

DungeonMaster-AI / src /voice /text_processor.py

bhupesh-sf

first commit

f8ba6bf verified 14 days ago

raw

history blame contribute delete

14.1 kB

	"""
	DungeonMaster AI - Text Processor for Voice Synthesis

	Preprocesses text for optimal TTS synthesis, including abbreviation expansion,
	dice notation conversion, dialogue detection, and dramatic pause insertion.
	"""

	from __future__ import annotations

	import logging
	import re
	from typing import TYPE_CHECKING, Mapping

	from .models import ProcessedNarration, TextSegment, VoiceType
	from .voice_profiles import select_voice_for_context

	# Placeholder guard: no imports are currently needed purely for type checking.
	if TYPE_CHECKING:
	pass

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)


	class NarrationProcessor:
	    """
	    Preprocesses text for optimal TTS synthesis.

	    Handles:
	    - D&D abbreviation expansion (HP → hit points)
	    - Dice notation conversion (2d6 → two dee six)
	    - Markdown cleanup
	    - Dramatic pause insertion
	    - Dialogue detection and speaker splitting
	    """

	    # D&D abbreviation expansions (uppercase keys; matched case-insensitively)
	    ABBREVIATIONS: Mapping[str, str] = {
	        "HP": "hit points",
	        "AC": "armor class",
	        "DC": "difficulty class",
	        "DM": "Dungeon Master",
	        "NPC": "N P C",
	        "PC": "player character",
	        "XP": "experience points",
	        "GP": "gold pieces",
	        "SP": "silver pieces",
	        "CP": "copper pieces",
	        "EP": "electrum pieces",
	        "PP": "platinum pieces",
	        "STR": "Strength",
	        "DEX": "Dexterity",
	        "CON": "Constitution",
	        "INT": "Intelligence",
	        "WIS": "Wisdom",
	        "CHA": "Charisma",
	        "ATK": "attack",
	        "DMG": "damage",
	        "INIT": "initiative",
	        "LVL": "level",
	        "HD": "hit dice",
	        "CR": "challenge rating",
	        "AOE": "area of effect",
	        "AOO": "attack of opportunity",
	    }

	    # Dice notation pattern: matches "1d20", "2d6+3", "d8-1", etc.
	    # The word boundaries stop it from firing inside longer tokens ("id3").
	    DICE_PATTERN = re.compile(
	        r"\b(\d*)d(\d+)([+-]\d+)?\b",
	        re.IGNORECASE,
	    )

	    # Dialogue detection patterns
	    DIALOGUE_DOUBLE_QUOTE = re.compile(r'"([^"]+)"')
	    # Quotes must sit at word edges so contractions and possessives
	    # ("don't", "dragon's") are not mistaken for dialogue; an apostrophe
	    # between two word characters is allowed inside the quoted span.
	    DIALOGUE_SINGLE_QUOTE = re.compile(
	        r"(?<!\w)'((?:[^']|(?<=\w)'(?=\w))+)'(?!\w)"
	    )

	    # Number words for TTS
	    NUMBER_WORDS: Mapping[int, str] = {
	        1: "one",
	        2: "two",
	        3: "three",
	        4: "four",
	        5: "five",
	        6: "six",
	        7: "seven",
	        8: "eight",
	        9: "nine",
	        10: "ten",
	        11: "eleven",
	        12: "twelve",
	        20: "twenty",
	        100: "one hundred",
	    }

	    # Dramatic words that benefit from a pause before them
	    DRAMATIC_WORDS = frozenset([
	        "suddenly",
	        "however",
	        "but",
	        "then",
	        "finally",
	        "unfortunately",
	        "meanwhile",
	        "beware",
	        "alas",
	        "behold",
	    ])

	    # Average characters per second for duration estimation
	    CHARS_PER_SECOND = 15

	    def __init__(self) -> None:
	        """Initialize the narration processor."""
	        # Pre-compile one whole-word, case-insensitive pattern per
	        # abbreviation. re.escape keeps the key literal should one ever
	        # contain a regex metacharacter.
	        self._abbreviation_patterns: dict[str, re.Pattern[str]] = {
	            abbr: re.compile(rf"\b{re.escape(abbr)}\b", re.IGNORECASE)
	            for abbr in self.ABBREVIATIONS
	        }

	    def process_for_tts(self, text: str) -> str:
	        """
	        Clean and prepare text for TTS synthesis.

	        Operations:
	        1. Expand D&D abbreviations
	        2. Convert dice notation to spoken form
	        3. Clean markdown formatting
	        4. Normalize whitespace

	        Args:
	            text: Raw input text.

	        Returns:
	            Cleaned text ready for TTS.
	        """
	        result = text

	        # Expand abbreviations (case-insensitive)
	        for abbr, expansion in self.ABBREVIATIONS.items():
	            result = self._abbreviation_patterns[abbr].sub(expansion, result)

	        # Convert dice notation
	        result = self.DICE_PATTERN.sub(self._convert_dice_notation, result)

	        # Clean markdown formatting
	        result = self._clean_markdown(result)

	        # Collapse every whitespace run to one space; this also trims the ends.
	        return " ".join(result.split())

	    def add_dramatic_pauses(self, text: str) -> str:
	        """
	        Insert timing markers for dramatic effect.

	        Uses natural punctuation since ElevenLabs respects pauses. An
	        ellipsis is placed before each dramatic word that follows
	        whitespace, unless an ellipsis is already there, so calling this
	        method on its own output does not stack pauses.

	        Args:
	            text: Text to add pauses to.

	        Returns:
	            Text with dramatic pauses inserted and whitespace normalized.
	        """
	        # Normalize ellipsis (exactly three dots, followed by a space)
	        result = re.sub(r"\.{3,}", "... ", text)

	        # Collapse whitespace first so an existing "... word" is seen as
	        # exactly "..." + one space by the lookbehind below.
	        result = " ".join(result.split())

	        # Sorted for a deterministic pattern; the set itself is unordered.
	        words = "|".join(re.escape(word) for word in sorted(self.DRAMATIC_WORDS))
	        if words:
	            # A word is only matched after whitespace, so a dramatic word
	            # that opens the text is left alone.
	            result = re.sub(
	                rf"(?<!\.\.\.)(\s)({words})\b",
	                r"\1... \2",
	                result,
	                flags=re.IGNORECASE,
	            )

	        # Clean up any double spaces created
	        return " ".join(result.split())

	    def handle_dialogue(
	        self,
	        text: str,
	        default_npc_voice: VoiceType = VoiceType.NPC_MALE_GRUFF,
	    ) -> list[TextSegment]:
	        """
	        Split text into segments with speaker detection.

	        Double-quoted spans become dialogue segments spoken with
	        ``default_npc_voice``; everything between them becomes DM narration.
	        Fragments that contain nothing pronounceable after cleanup (for
	        example a stray ``**`` left around bolded dialogue) are skipped.

	        Args:
	            text: Text containing potential dialogue.
	            default_npc_voice: Voice type to use for dialogue if not specified.

	        Returns:
	            List of TextSegments with appropriate voice assignments. The
	            list always contains at least one segment.
	        """
	        segments: list[TextSegment] = []

	        def speakable(raw: str) -> str:
	            """Return the TTS-ready form of ``raw``, or "" if nothing to say."""
	            spoken = self.process_for_tts(raw)
	            return spoken if any(ch.isalnum() for ch in spoken) else ""

	        def add_narration(raw: str) -> None:
	            """Append ``raw`` as a DM narration segment when it is speakable."""
	            spoken = speakable(raw)
	            if spoken:
	                segments.append(
	                    TextSegment(
	                        text=spoken,
	                        voice_type=VoiceType.DM,
	                        is_dialogue=False,
	                    )
	                )

	        cursor = 0
	        for match in self.DIALOGUE_DOUBLE_QUOTE.finditer(text):
	            # Narration between the previous quote and this one
	            add_narration(text[cursor : match.start()])

	            # The quoted dialogue itself (quotes excluded)
	            spoken = speakable(match.group(1))
	            if spoken:
	                segments.append(
	                    TextSegment(
	                        text=spoken,
	                        voice_type=default_npc_voice,
	                        is_dialogue=True,
	                        pause_before_ms=100,  # Brief pause before dialogue
	                        pause_after_ms=100,  # Brief pause after dialogue
	                    )
	                )

	            cursor = match.end()

	        # Narration after the last quote (or the whole text if no quotes)
	        add_narration(text[cursor:])

	        # Nothing usable found: fall back to a single DM segment so callers
	        # never receive an empty list.
	        if not segments:
	            segments.append(
	                TextSegment(
	                    text=self.process_for_tts(text),
	                    voice_type=VoiceType.DM,
	                    is_dialogue=False,
	                )
	            )

	        return segments

	    @staticmethod
	    def _npc_voice_from_context(
	        context: Mapping[str, object],
	    ) -> VoiceType | None:
	        """
	        Look up the voice for the NPC named in the game context.

	        Args:
	            context: Game context; only the ``current_npc`` key is read.

	        Returns:
	            The voice chosen by ``select_voice_from_npc_data`` when
	            ``current_npc`` is a non-empty dict, otherwise None.
	        """
	        current_npc = context.get("current_npc")
	        if current_npc and isinstance(current_npc, dict):
	            from .voice_profiles import select_voice_from_npc_data

	            return select_voice_from_npc_data(current_npc)
	        return None

	    def detect_voice_type(
	        self,
	        text: str,
	        context: dict[str, object] | None = None,
	    ) -> VoiceType:
	        """
	        Analyze text and context to determine appropriate voice type.

	        Args:
	            text: Text to analyze.
	            context: Game context dictionary with keys like:
	                - in_combat: bool
	                - current_npc: dict | None

	        Returns:
	            VoiceType to use for this text.
	        """
	        context = context or {}

	        # An active NPC takes precedence over any text-based heuristic.
	        npc_voice = self._npc_voice_from_context(context)
	        if npc_voice is not None:
	            return npc_voice

	        # Check if text contains dialogue
	        has_dialogue = bool(
	            self.DIALOGUE_DOUBLE_QUOTE.search(text)
	            or self.DIALOGUE_SINGLE_QUOTE.search(text)
	        )

	        # Get combat state
	        in_combat = bool(context.get("in_combat", False))

	        return select_voice_for_context(
	            is_dialogue=has_dialogue,
	            is_combat=in_combat,
	        )

	    def split_by_speaker(
	        self,
	        text: str,
	        context: dict[str, object] | None = None,
	    ) -> list[tuple[str, VoiceType]]:
	        """
	        Split response into segments by speaker.

	        Convenience method that returns simpler tuple format.

	        Args:
	            text: Text to split.
	            context: Game context for voice selection.

	        Returns:
	            List of (text, voice_type) tuples.
	        """
	        npc_voice = self._npc_voice_from_context(context or {})
	        if npc_voice is None:
	            npc_voice = VoiceType.NPC_MALE_GRUFF

	        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)
	        return [(seg.text, seg.voice_type) for seg in segments]

	    def process(
	        self,
	        text: str,
	        context: dict[str, object] | None = None,
	        add_pauses: bool = True,
	    ) -> ProcessedNarration:
	        """
	        Full processing pipeline for narration.

	        Args:
	            text: Raw text to process.
	            context: Game context for voice selection.
	            add_pauses: Whether to add dramatic pauses.

	        Returns:
	            ProcessedNarration with all segments and metadata.
	        """
	        # Get segments with voice assignments
	        npc_voice = self._npc_voice_from_context(context or {})
	        if npc_voice is None:
	            npc_voice = VoiceType.NPC_MALE_GRUFF

	        segments = self.handle_dialogue(text, default_npc_voice=npc_voice)

	        # Add dramatic pauses if requested
	        if add_pauses:
	            for segment in segments:
	                segment.text = self.add_dramatic_pauses(segment.text)

	        # Build complete text
	        total_text = " ".join(seg.text for seg in segments)

	        # Primary voice is that of the first narration segment (DM if none)
	        primary_voice = next(
	            (seg.voice_type for seg in segments if not seg.is_dialogue),
	            VoiceType.DM,
	        )

	        # Computed on the final text, which is already TTS-ready, rather
	        # than through estimate_duration_ms (that would re-process it).
	        estimated_duration = int((len(total_text) / self.CHARS_PER_SECOND) * 1000)

	        return ProcessedNarration(
	            segments=segments,
	            total_text=total_text,
	            primary_voice=primary_voice,
	            has_dialogue=any(seg.is_dialogue for seg in segments),
	            estimated_duration_ms=estimated_duration,
	        )

	    def _spoken_number(self, digits: str) -> str:
	        """Return the word for ``digits`` if NUMBER_WORDS has one, else ``digits``."""
	        return self.NUMBER_WORDS.get(int(digits), digits)

	    def _convert_dice_notation(self, match: re.Match[str]) -> str:
	        """
	        Convert dice notation to speakable text.

	        Examples:
	        - "1d20" → "one dee twenty"
	        - "2d6+3" → "two dee six plus three"
	        - "d8" → "one dee eight"

	        Args:
	            match: A DICE_PATTERN match (groups: count, sides, modifier).

	        Returns:
	            Spoken form; numbers without an entry in NUMBER_WORDS stay digits.
	        """
	        # A missing count ("d8") means a single die.
	        num_dice = match.group(1) or "1"
	        die_size = match.group(2)
	        modifier = match.group(3)

	        result = f"{self._spoken_number(num_dice)} dee {self._spoken_number(die_size)}"

	        if modifier:
	            mod_sign = modifier[0]
	            mod_word = self._spoken_number(modifier[1:])

	            if mod_sign == "+":
	                result += f" plus {mod_word}"
	            elif mod_sign == "-":
	                result += f" minus {mod_word}"

	        return result

	    def _clean_markdown(self, text: str) -> str:
	        """
	        Remove markdown formatting from text.

	        Block-level constructs are stripped before inline ones so that a
	        fenced code block is removed whole (not reduced to stray backticks)
	        and a leading ``*`` bullet is not mistaken for emphasis.

	        Args:
	            text: Text with potential markdown.

	        Returns:
	            Plain text without markdown.
	        """
	        result = text

	        # Remove code blocks (must precede inline-code handling)
	        result = re.sub(r"```[\s\S]*?```", "", result)

	        # Remove headers
	        result = re.sub(r"^#+\s*", "", result, flags=re.MULTILINE)

	        # Remove horizontal rules
	        result = re.sub(r"^[-*_]{3,}\s*$", "", result, flags=re.MULTILINE)

	        # Remove bullet points (keep text)
	        result = re.sub(r"^\s*[-*+]\s+", "", result, flags=re.MULTILINE)

	        # Remove numbered lists (keep text)
	        result = re.sub(r"^\s*\d+\.\s+", "", result, flags=re.MULTILINE)

	        # Remove links but keep text: [text](url) → text
	        result = re.sub(r"\[([^\]]+)\]\([^)]+\)", r"\1", result)

	        # Remove bold/italic markers
	        result = re.sub(r"\*\*([^*]+)\*\*", r"\1", result)  # **bold**
	        result = re.sub(r"\*([^*]+)\*", r"\1", result)  # *italic*
	        # Underscore emphasis only counts at word edges, so identifiers
	        # such as snake_case_name keep their underscores.
	        result = re.sub(r"(?<!\w)__([^_]+)__(?!\w)", r"\1", result)  # __bold__
	        result = re.sub(r"(?<!\w)_([^_]+)_(?!\w)", r"\1", result)  # _italic_

	        # Remove inline code
	        result = re.sub(r"`([^`]+)`", r"\1", result)

	        return result

	    def estimate_duration_ms(self, text: str) -> int:
	        """
	        Estimate audio duration for text.

	        Args:
	            text: Text to estimate duration for.

	        Returns:
	            Estimated duration in milliseconds.
	        """
	        # Process text first to get accurate character count
	        processed = self.process_for_tts(text)
	        return int((len(processed) / self.CHARS_PER_SECOND) * 1000)