Add TTS Tokenizer, Technical Report, and Basic Tests

89a8916 4 months ago

15.3 kB

	"""
	REST API Server for Multi-lingual TTS
	FastAPI-based server with OpenAPI documentation

	Hackathon API Specification:
	- GET/POST /Get_Inference with text and lang query parameters plus a speaker_wav file upload
	"""

	import os
	import io
	import time
	import logging
	import tempfile
	from typing import Optional, List
	from pathlib import Path
	import numpy as np

	from fastapi import (
	FastAPI,
	HTTPException,
	Query,
	Response,
	BackgroundTasks,
	UploadFile,
	File,
	)
	from fastapi.middleware.cors import CORSMiddleware
	from fastapi.responses import StreamingResponse, FileResponse, JSONResponse
	from pydantic import BaseModel, Field
	import soundfile as sf

	from .engine import TTSEngine, TTSOutput
	from .config import (
	LANGUAGE_CONFIGS,
	get_available_languages,
	get_available_voices,
	STYLE_PRESETS,
	)

	# Language name to voice key mapping (for hackathon API)
	# Keys are the lowercase language names that /Get_Inference accepts in its
	# `lang` parameter (the endpoint lowercases and strips the value before the
	# lookup); values are the voice keys handed to the TTS engine.
	# Every language maps to a "*_female" voice except gujarati, which uses "gu_mms".
	LANG_TO_VOICE = {
	"hindi": "hi_female",
	"bengali": "bn_female",
	"marathi": "mr_female",
	"telugu": "te_female",
	"kannada": "kn_female",
	"bhojpuri": "bho_female",
	"chhattisgarhi": "hne_female",
	"maithili": "mai_female",
	"magahi": "mag_female",
	"english": "en_female",
	"gujarati": "gu_mms",
	}

	# Setup logging
	# basicConfig runs at import time and sets the INFO level; the module-level
	# logger below is the one used by the endpoints in this file.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Initialize FastAPI app
	# title, description, version, contact and license_info are metadata handed to
	# the FastAPI constructor; the description is Markdown text.
	# NOTE(review): the description lists ten languages and omits Gujarati, while
	# LANG_TO_VOICE also maps "gujarati" — confirm the text is still accurate.
	# NOTE(review): the description mentions MP3 output, but the endpoints visible
	# in this module only produce WAV — confirm.
	app = FastAPI(
	title="Voice Tech for All - Multi-lingual TTS API",
	description="""
	A multi-lingual Text-to-Speech API supporting 10+ Indian languages.

	## Features
	- 10 Indian languages with male/female voices
	- Real-time speech synthesis
	- Text normalization for Indian languages
	- Speed control
	- Multiple audio formats (WAV, MP3)

	## Supported Languages
	Hindi, Bengali, Marathi, Telugu, Kannada, Bhojpuri,
	Chhattisgarhi, Maithili, Magahi, English

	## Use Case
	Built for an LLM-based healthcare assistant for pregnant mothers
	in low-income communities.
	""",
	version="1.0.0",
	contact={
	"name": "Voice Tech for All Hackathon",
	"url": "https://huggingface.co/SYSPIN",
	},
	license_info={
	"name": "CC BY 4.0",
	"url": "https://creativecommons.org/licenses/by/4.0/",
	},
	)

	# CORS middleware
	# Registers CORSMiddleware with every origin, method and header allowed.
	# NOTE(review): wildcard origins are combined with allow_credentials=True. The
	# CORS model does not permit "*" for credentialed requests, so confirm whether
	# credentials are needed at all and, if they are, list explicit origins.
	app.add_middleware(
	CORSMiddleware,
	allow_origins=["*"],
	allow_credentials=True,
	allow_methods=["*"],
	allow_headers=["*"],
	)

	# Module-wide TTS engine slot; stays None until the first get_engine() call
	_engine: Optional[TTSEngine] = None


	def get_engine() -> TTSEngine:
	    """Return the shared TTS engine, building it on the first call."""
	    global _engine
	    if _engine is not None:
	        return _engine
	    # First use: construct the engine with device="auto" and remember it
	    _engine = TTSEngine(device="auto")
	    return _engine


	# Request/Response Models
	class SynthesizeRequest(BaseModel):
	    """Parameters accepted by the text synthesis endpoints"""

	    # Required input text, limited to 1-5000 characters
	    text: str = Field(
	        ...,
	        min_length=1,
	        max_length=5000,
	        description="Text to synthesize",
	    )
	    # Voice key; the endpoints check it against LANGUAGE_CONFIGS
	    voice: str = Field(
	        "hi_male",
	        description="Voice key (e.g., hi_male, bn_female, gu_mms)",
	    )
	    # Prosody multipliers, each constrained to the 0.5-2.0 range
	    speed: float = Field(1.0, ge=0.5, le=2.0, description="Speech speed (0.5-2.0)")
	    pitch: float = Field(1.0, ge=0.5, le=2.0, description="Pitch multiplier (0.5-2.0)")
	    energy: float = Field(1.0, ge=0.5, le=2.0, description="Energy/volume (0.5-2.0)")
	    # Optional preset name, forwarded to the engine as-is
	    style: Optional[str] = Field(
	        None,
	        description="Style preset (happy, sad, calm, excited, etc.)",
	    )
	    # Forwarded to the engine as normalize_text
	    normalize: bool = Field(True, description="Apply text normalization")

	    class Config:
	        # Sample payload attached to the model's schema
	        schema_extra = {
	            "example": {
	                "text": "નમસ્તે, હું તમારી કેવી રીતે મદદ કરી શકું?",
	                "voice": "gu_mms",
	                "speed": 1.0,
	                "pitch": 1.0,
	                "energy": 1.0,
	                "style": "calm",
	                "normalize": True,
	            }
	        }


	class SynthesizeResponse(BaseModel):
	    """Metadata describing the outcome of one synthesis call"""

	    # Whether the synthesis succeeded
	    success: bool
	    # Length of the generated audio (presumably seconds; confirm against the engine)
	    duration: float
	    # Sample rate of the generated audio
	    sample_rate: int
	    # Voice key that was used
	    voice: str
	    # Text that was synthesized
	    text: str
	    # Time spent synthesizing (unit not shown here; confirm against the producer)
	    inference_time: float


	class VoiceInfo(BaseModel):
	    """One entry in the voice listing returned by /voices"""

	    # Voice key (the dictionary key in the engine's voice listing)
	    key: str
	    # Taken from the "name" entry of the engine's voice info
	    name: str
	    # Taken from the "code" entry of the engine's voice info
	    language_code: str
	    # Taken from the "gender" entry of the engine's voice info
	    gender: str
	    # Taken from the "loaded" entry of the engine's voice info
	    loaded: bool
	    # Taken from the "downloaded" entry of the engine's voice info
	    downloaded: bool
	    # Model family; "vits" when the engine reports no "type"
	    model_type: str = "vits"


	class HealthResponse(BaseModel):
	    """Payload returned by the /health endpoint"""

	    # Service status string ("healthy" in the /health handler)
	    status: str
	    # String form of the engine's device attribute
	    device: str
	    # Voices the engine reports as loaded
	    loaded_voices: List[str]
	    # Number of entries in LANGUAGE_CONFIGS
	    available_voices: int
	    # Names of the configured style presets
	    style_presets: List[str]


	# API Endpoints
	@app.get("/", response_class=JSONResponse)
	async def root():
	    """Welcome message pointing at the main API routes"""
	    welcome = dict(
	        message="Voice Tech for All - Multi-lingual TTS API",
	        docs="/docs",
	        health="/health",
	        synthesize="/synthesize",
	    )
	    return welcome


	@app.get("/health", response_model=HealthResponse)
	async def health_check():
	    """Report service status, engine device and the voice/style inventory"""
	    tts = get_engine()

	    # Collect the individual facts first, then assemble the response model
	    device_name = str(tts.device)
	    loaded = tts.get_loaded_voices()
	    voice_count = len(LANGUAGE_CONFIGS)
	    preset_names = list(STYLE_PRESETS.keys())

	    return HealthResponse(
	        status="healthy",
	        device=device_name,
	        loaded_voices=loaded,
	        available_voices=voice_count,
	        style_presets=preset_names,
	    )


	@app.get("/voices", response_model=List[VoiceInfo])
	async def list_voices():
	    """Return one VoiceInfo entry per voice known to the engine"""
	    catalog = get_engine().get_available_voices()

	    listing = []
	    for voice_key, details in catalog.items():
	        entry = VoiceInfo(
	            key=voice_key,
	            name=details["name"],
	            language_code=details["code"],
	            gender=details["gender"],
	            loaded=details["loaded"],
	            downloaded=details["downloaded"],
	            # Entries without an explicit type are reported as "vits"
	            model_type=details.get("type", "vits"),
	        )
	        listing.append(entry)
	    return listing


	@app.get("/styles")
	async def list_styles():
	    """Return the style presets together with a legend for the prosody controls"""
	    legend = {
	        "speed": "Speech rate multiplier (0.5-2.0)",
	        "pitch": "Pitch multiplier (0.5-2.0), >1 = higher",
	        "energy": "Volume/energy multiplier (0.5-2.0)",
	    }
	    return {"presets": STYLE_PRESETS, "description": legend}


	@app.get("/languages")
	async def list_languages():
	    """Return the supported languages as reported by the config module"""
	    languages = get_available_languages()
	    return languages


	@app.post("/synthesize", response_class=Response)
	async def synthesize_audio(request: SynthesizeRequest):
	    """
	    Synthesize speech from text

	    Returns WAV audio file directly, with X-Duration, X-Sample-Rate, X-Voice,
	    X-Style and X-Inference-Time response headers.

	    Responds with 400 for an unknown voice key and 500 when synthesis fails.
	    """
	    engine = get_engine()

	    # Validate voice
	    if request.voice not in LANGUAGE_CONFIGS:
	        raise HTTPException(
	            status_code=400,
	            detail=f"Unknown voice: {request.voice}. Use /voices to see available options.",
	        )

	    try:
	        # perf_counter is monotonic, so the measured interval cannot be skewed
	        # by wall-clock adjustments the way time.time() can
	        start_time = time.perf_counter()

	        # Synthesize
	        output = engine.synthesize(
	            text=request.text,
	            voice=request.voice,
	            speed=request.speed,
	            pitch=request.pitch,
	            energy=request.energy,
	            style=request.style,
	            normalize_text=request.normalize,
	        )

	        inference_time = time.perf_counter() - start_time

	        # Convert to WAV bytes
	        buffer = io.BytesIO()
	        sf.write(buffer, output.audio, output.sample_rate, format="WAV")

	        # Return audio with metadata headers; getvalue() yields the whole
	        # buffer regardless of the current stream position
	        return Response(
	            content=buffer.getvalue(),
	            media_type="audio/wav",
	            headers={
	                "X-Duration": str(output.duration),
	                "X-Sample-Rate": str(output.sample_rate),
	                "X-Voice": output.voice,
	                "X-Style": output.style or "default",
	                "X-Inference-Time": str(inference_time),
	            },
	        )

	    except Exception as e:
	        # logger.exception keeps the traceback; chaining preserves the cause
	        logger.exception("Synthesis error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e


	@app.post("/synthesize/stream")
	async def synthesize_stream(request: SynthesizeRequest):
	    """
	    Synthesize speech and stream the audio

	    Returns streaming WAV audio as an attachment named speech.wav.

	    Responds with 400 for an unknown voice key and 500 when synthesis fails.
	    """
	    engine = get_engine()

	    if request.voice not in LANGUAGE_CONFIGS:
	        raise HTTPException(status_code=400, detail=f"Unknown voice: {request.voice}")

	    try:
	        output = engine.synthesize(
	            text=request.text,
	            voice=request.voice,
	            speed=request.speed,
	            pitch=request.pitch,
	            energy=request.energy,
	            style=request.style,
	            normalize_text=request.normalize,
	        )

	        # The complete WAV is written to memory first; the response then
	        # streams from that buffer, so it must be rewound to the start
	        buffer = io.BytesIO()
	        sf.write(buffer, output.audio, output.sample_rate, format="WAV")
	        buffer.seek(0)

	        return StreamingResponse(
	            buffer,
	            media_type="audio/wav",
	            headers={"Content-Disposition": "attachment; filename=speech.wav"},
	        )

	    except Exception as e:
	        # logger.exception keeps the traceback; chaining preserves the cause
	        logger.exception("Streaming error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e


	@app.get("/synthesize/get")
	async def synthesize_get(
	    text: str = Query(
	        ..., description="Text to synthesize", min_length=1, max_length=1000
	    ),
	    voice: str = Query("hi_male", description="Voice key"),
	    speed: float = Query(1.0, description="Speech speed", ge=0.5, le=2.0),
	    pitch: float = Query(1.0, description="Pitch", ge=0.5, le=2.0),
	    energy: float = Query(1.0, description="Energy", ge=0.5, le=2.0),
	    style: Optional[str] = Query(None, description="Style preset"),
	):
	    """
	    Query-string variant of /synthesize

	    Handy for quick tests and simple integrations
	    """
	    # Repackage the query parameters as the model the POST handler expects,
	    # then reuse that handler for the actual work
	    params = {
	        "text": text,
	        "voice": voice,
	        "speed": speed,
	        "pitch": pitch,
	        "energy": energy,
	        "style": style,
	    }
	    payload = SynthesizeRequest(**params)
	    return await synthesize_audio(payload)


	@app.api_route("/Get_Inference", methods=["GET", "POST"])
	async def get_inference(
	    text: str = Query(
	        ...,
	        description="The input text to be converted into speech. For English, text must be lowercase.",
	    ),
	    lang: str = Query(
	        ...,
	        description="Language of input text. Supported: bhojpuri, bengali, english, gujarati, hindi, chhattisgarhi, kannada, magahi, maithili, marathi, telugu",
	    ),
	    speaker_wav: UploadFile = File(
	        ...,
	        description="A reference WAV file representing the speaker's voice (mandatory per hackathon spec).",
	    ),
	):
	    """
	    Hackathon API - Generate speech audio from text

	    This endpoint follows the Voice Tech for All hackathon specification.

	    Supports both GET and POST methods with multipart form data.

	    Parameters:
	    - text: Input text to synthesize (query param)
	    - lang: Language (query param) - bhojpuri, bengali, english, gujarati, hindi, chhattisgarhi, kannada, magahi, maithili, marathi, telugu
	    - speaker_wav: Reference WAV file (multipart file upload, mandatory)

	    Returns:
	    - 200 OK: WAV audio file as streaming response
	    - 400: Unsupported language, or unreadable / too-small speaker_wav
	    - 500: Synthesis failure
	    """
	    engine = get_engine()

	    # Normalize language name
	    lang_lower = lang.lower().strip()

	    # Enforce lowercase for English text (per spec)
	    if lang_lower == "english":
	        text = text.lower()

	    # Map language to voice
	    if lang_lower not in LANG_TO_VOICE:
	        supported = list(LANG_TO_VOICE.keys())
	        raise HTTPException(
	            status_code=400,
	            detail=f"Unsupported language: {lang}. Supported languages: {', '.join(supported)}",
	        )

	    voice = LANG_TO_VOICE[lang_lower]

	    # Read speaker_wav (mandatory per spec)
	    # Note: Current VITS models don't support voice cloning, but we accept the file
	    # for API compatibility and validation. In future, this could be used for voice adaptation.
	    # Only the read itself is guarded, so the size check below can raise its
	    # own HTTPException without being intercepted.
	    try:
	        speaker_audio_bytes = await speaker_wav.read()
	    except Exception as e:
	        logger.exception("Could not read speaker_wav: %s", e)
	        raise HTTPException(
	            status_code=400, detail=f"Failed to read speaker_wav file: {str(e)}"
	        ) from e

	    logger.info(
	        f"Received speaker reference WAV: {len(speaker_audio_bytes)} bytes, filename: {speaker_wav.filename}"
	    )
	    # Validate it's a valid audio file (basic check)
	    if len(speaker_audio_bytes) < 44:  # Minimum WAV header size
	        raise HTTPException(
	            status_code=400,
	            detail="Invalid speaker_wav: file too small to be a valid WAV",
	        )

	    try:
	        # Synthesize audio
	        output = engine.synthesize(
	            text=text,
	            voice=voice,
	            speed=1.0,
	            normalize_text=True,
	        )

	        # Convert to WAV bytes
	        buffer = io.BytesIO()
	        sf.write(buffer, output.audio, output.sample_rate, format="WAV")
	        buffer.seek(0)

	        # Return as streaming response (per spec)
	        return StreamingResponse(
	            buffer,
	            media_type="audio/wav",
	            headers={
	                "Content-Disposition": "attachment; filename=output.wav",
	                "X-Duration": str(output.duration),
	                "X-Sample-Rate": str(output.sample_rate),
	                # Echo the normalized name: the raw query value may carry
	                # surrounding whitespace, which does not belong in a header
	                "X-Language": lang_lower,
	                "X-Voice": voice,
	            },
	        )

	    except Exception as e:
	        # logger.exception keeps the traceback; chaining preserves the cause
	        logger.exception("Synthesis error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e


	@app.post("/preload")
	async def preload_voice(voice: str):
	    """
	    Preload a voice model into memory

	    Responds with 400 for an unknown voice key and 500 when loading fails.
	    """
	    engine = get_engine()

	    if voice not in LANGUAGE_CONFIGS:
	        raise HTTPException(status_code=400, detail=f"Unknown voice: {voice}")

	    try:
	        engine.load_voice(voice)
	    except Exception as e:
	        # Log the failure like the synthesis endpoints do, and keep the cause
	        logger.exception("Failed to preload voice %s: %s", voice, e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    return {"message": f"Voice {voice} loaded successfully"}


	@app.post("/unload")
	async def unload_voice(voice: str):
	    """
	    Unload a voice model from memory

	    Responds with 400 for an unknown voice key.
	    """
	    engine = get_engine()

	    # Reject unknown keys up front, matching the check /preload performs,
	    # so arbitrary strings are never handed to the engine
	    if voice not in LANGUAGE_CONFIGS:
	        raise HTTPException(status_code=400, detail=f"Unknown voice: {voice}")

	    engine.unload_voice(voice)
	    return {"message": f"Voice {voice} unloaded"}


	@app.post("/batch")
	async def batch_synthesize(
	    texts: List[str], voice: str = "hi_male", speed: float = 1.0
	):
	    """
	    Synthesize multiple texts

	    Returns a list of objects, one per input text, each holding the text,
	    its base64-encoded WAV audio and the audio duration.

	    Responds with 400 for an unknown voice key and 500 when synthesis of
	    any text fails.
	    """
	    import base64

	    engine = get_engine()

	    if voice not in LANGUAGE_CONFIGS:
	        raise HTTPException(status_code=400, detail=f"Unknown voice: {voice}")

	    results = []
	    try:
	        for text in texts:
	            # Keyword arguments, as at every other engine.synthesize call site
	            output = engine.synthesize(text=text, voice=voice, speed=speed)

	            buffer = io.BytesIO()
	            sf.write(buffer, output.audio, output.sample_rate, format="WAV")

	            results.append(
	                {
	                    "text": text,
	                    "audio_base64": base64.b64encode(buffer.getvalue()).decode(),
	                    "duration": output.duration,
	                }
	            )
	    except Exception as e:
	        # Report failures the same way the single-text endpoints do
	        logger.exception("Batch synthesis error: %s", e)
	        raise HTTPException(status_code=500, detail=str(e)) from e

	    return results


	# Startup/Shutdown events
	@app.on_event("startup")
	async def startup_event():
	    """Log that the server is starting"""
	    startup_message = "Starting TTS API server..."
	    logger.info(startup_message)
	    # Optionally preload default voice
	    # get_engine().load_voice("hi_male")


	@app.on_event("shutdown")
	async def shutdown_event():
	    """Log that the server is shutting down"""
	    shutdown_message = "Shutting down TTS API server..."
	    logger.info(shutdown_message)


	def start_server(host: str = "0.0.0.0", port: int = 8000, reload: bool = False):
	    """Run the API under uvicorn using the "src.api:app" import string"""
	    import uvicorn

	    # Gather the uvicorn options in one place before launching
	    server_options = {
	        "host": host,
	        "port": port,
	        "reload": reload,
	        "log_level": "info",
	    }
	    uvicorn.run("src.api:app", **server_options)


	# Script entry point: start the server with its default settings
	if __name__ == "__main__":
	    start_server()