import gradio as gr
import assemblyai as aai
from together import Together
import base64
from io import BytesIO
from PIL import Image
import os
import yaml


# Function to load API credentials
def load_credentials():
    assemblyai_key = os.getenv("ASSEMBLYAI_API_KEY")
    together_key = os.getenv("TOGETHER_API_KEY")

    if not assemblyai_key or not together_key:
        try:
            with open('API.yml', 'r') as file:
                api_creds = yaml.safe_load(file)
                assemblyai_key = assemblyai_key or api_creds['assemblyai']
                together_key = together_key or api_creds['Together_api']
        except Exception as e:
            print(f"Failed to load API credentials: {str(e)}")
            return None, None

    return assemblyai_key, together_key


# Initialize API clients
ASSEMBLYAI_API_KEY, TOGETHER_API_KEY = load_credentials()

if ASSEMBLYAI_API_KEY and TOGETHER_API_KEY:
    aai.settings.api_key = ASSEMBLYAI_API_KEY
    together_client = Together(api_key=TOGETHER_API_KEY)
else:
    raise ValueError("API credentials not found. Please check your configuration.")


def transcribe_audio(audio_path):
    """Transcribe audio using AssemblyAI."""
    try:
        transcriber = aai.Transcriber()
        transcript = transcriber.transcribe(audio_path)
        # Surface server-side failures instead of silently returning None
        if transcript.status == aai.TranscriptStatus.error:
            return f"Error in transcription: {transcript.error}"
        return transcript.text
    except Exception as e:
        return f"Error in transcription: {str(e)}"


def generate_image(prompt):
    """Generate image using Together AI."""
    try:
        response = together_client.images.generate(
            prompt=prompt,
            model="black-forest-labs/FLUX.1-schnell-Free",
            width=1024,
            height=768,
            steps=4,
            n=1,
            response_format="b64_json"
        )
        # Convert base64 to PIL Image
        img_data = base64.b64decode(response.data[0].b64_json)
        img = Image.open(BytesIO(img_data))
        return img
    except Exception as e:
        return f"Error in image generation: {str(e)}"


def process_audio(audio, progress=gr.Progress()):
    """Process audio file and generate image."""
    if audio is None:
        return None, "Please provide an audio input."

    progress(0.3, desc="Transcribing audio...")
    transcribed_text = transcribe_audio(audio)

    if isinstance(transcribed_text, str) and not transcribed_text.startswith("Error"):
        progress(0.6, desc="Generating image...")
        generated_image = generate_image(transcribed_text)

        if isinstance(generated_image, Image.Image):
            progress(1.0, desc="Complete!")
            return generated_image, transcribed_text
        else:
            return None, f"Image generation failed: {generated_image}"
    else:
        return None, f"Transcription failed: {transcribed_text}"


# Custom CSS for better styling
custom_css = """
#app-title {
    text-align: center;
    margin-bottom: 10px;
}
#app-subtitle {
    text-align: center;
    margin-bottom: 30px;
}
#main-container {
    max-width: 1200px;
    margin: auto;
}
"""


# Create Gradio interface
def create_interface():
    with gr.Blocks(css=custom_css, theme=gr.themes.Soft(
        primary_hue="indigo",
        secondary_hue="blue",
        neutral_hue="slate"
    )) as app:
        gr.HTML(
            """
            <div id="main-container">
                <h1 id="app-title">Voice Generated Visions</h1>
                <h3 id="app-subtitle">✨ Transform Your Words into Stunning Visual Art ✨</h3>
            </div>
""" ) with gr.Row(): with gr.Column(): audio_input = gr.Audio( label="Record or Upload Audio", sources=["microphone", "upload"], type="filepath" ) submit_btn = gr.Button("🚀 Generate Vision", variant="primary") with gr.Column(): output_image = gr.Image(label="Generated Image đŸ–ŧī¸") output_text = gr.Textbox( label="Transcribed Text 📝", placeholder="Your speech will appear here...", lines=3 ) # Add usage instructions with gr.Accordion("â„šī¸ How to Use"): gr.Markdown(""" 1. **Record or Upload** 🎤 - Click the microphone icon to record your voice - Or upload an audio file from your device 2. **Generate** 🎨 - Click 'Generate Vision' to process your audio - Wait for the magic to happen! 3. **Results** ✨ - View your transcribed text - See your words transformed into art """) submit_btn.click( fn=process_audio, inputs=[audio_input], outputs=[output_image, output_text] ) return app # Launch the app if __name__ == "__main__": demo = create_interface() demo.launch(share=True)