# Context Cruncher — deployment for Hugging Face Spaces
"""
Context Cruncher - Gradio Application
Extract structured context data from voice recordings using Gemini AI.
"""
import gradio as gr
import os
from pathlib import Path
import tempfile
from dotenv import load_dotenv
from gemini_processor import (
process_audio_with_gemini,
create_markdown_file,
create_json_file
)
# Load environment variables
load_dotenv()
def process_audio(
    audio_input,
    uploaded_file,
    api_key: str,
    user_identification: str,
    user_name: str = ""
) -> tuple:
    """
    Process audio from either a microphone recording or an uploaded file.

    Validates the inputs, sends the audio to Gemini for context extraction,
    and writes the results as downloadable Markdown and JSON files in a
    fresh temporary directory.

    Args:
        audio_input: Filepath of the microphone recording, or None.
        uploaded_file: Uploaded audio file. Either a filepath string
            (``gr.File`` with ``type="filepath"``) or a file-like wrapper
            exposing a ``.name`` attribute, depending on Gradio version.
        api_key: Gemini API key.
        user_identification: "name" or "user" — how the speaker should be
            referred to in the extracted context data.
        user_name: The user's name; required when user_identification
            is "name".

    Returns:
        Tuple of (markdown_content, markdown_file_path, json_file_path,
        status_message). On failure the first element is "" and both file
        paths are None; status_message describes the error.
    """
    try:
        # Validate API key before doing any work.
        if not api_key or not api_key.strip():
            return (
                "",
                None,
                None,
                "Error: Please provide a Gemini API key"
            )

        # Prefer the microphone recording; fall back to the upload.
        audio_path = None
        if audio_input is not None:
            audio_path = audio_input
        elif uploaded_file is not None:
            # gr.File(type="filepath") yields a plain string path, while
            # other configurations yield a tempfile wrapper with .name —
            # support both so neither crashes with AttributeError.
            if isinstance(uploaded_file, str):
                audio_path = uploaded_file
            else:
                audio_path = uploaded_file.name
        if audio_path is None:
            return (
                "",
                None,
                None,
                "Error: Please record audio or upload an audio file"
            )

        # Determine how the user should be referenced in the output.
        user_ref = None
        if user_identification == "name":
            if not user_name or not user_name.strip():
                return (
                    "",
                    None,
                    None,
                    "Error: Please provide your name when using name identification"
                )
            user_ref = user_name.strip()

        # Extract structured context from the audio with Gemini.
        context_markdown, human_readable_name, snake_case_filename = process_audio_with_gemini(
            audio_path,
            api_key,
            user_ref
        )

        # Build the two downloadable artifacts from the extracted context.
        md_filename, md_content = create_markdown_file(
            context_markdown,
            human_readable_name,
            snake_case_filename
        )
        json_filename, json_content = create_json_file(
            context_markdown,
            human_readable_name,
            snake_case_filename
        )

        # Write files to a fresh temp directory so Gradio can serve them
        # for download. UTF-8 is forced so non-ASCII context survives on
        # platforms whose default encoding differs.
        temp_dir = tempfile.mkdtemp()
        md_path = Path(temp_dir) / md_filename
        json_path = Path(temp_dir) / json_filename
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json_content)

        return (
            md_content,
            str(md_path),
            str(json_path),
            f"Success! Context extracted: {human_readable_name}"
        )
    except Exception as e:
        # Top-level UI boundary: surface any failure as a status message
        # rather than crashing the Gradio worker.
        return (
            "",
            None,
            None,
            f"Error: {str(e)}"
        )
# Custom CSS for better styling
custom_css = """
.gradio-container {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
}
.main-header {
text-align: center;
margin-bottom: 1.5rem;
padding-bottom: 1rem;
border-bottom: 2px solid #e5e7eb;
}
.main-header h1 {
font-size: 2rem;
font-weight: 600;
color: #1f2937;
margin-bottom: 0.5rem;
}
.main-header p {
color: #6b7280;
font-size: 1rem;
}
.section-header {
font-weight: 600;
color: #374151;
margin-bottom: 1rem;
}
"""
# Build the Gradio interface: an "Extract" tab with the working controls
# and an "About" tab with static documentation.
with gr.Blocks(css=custom_css, title="Context Cruncher") as demo:
    # Page header (styled by .main-header in custom_css).
    gr.Markdown(
        """
        # Context Cruncher
        Extract structured context data from voice recordings using AI
        """,
        elem_classes="main-header"
    )
    with gr.Tabs():
        with gr.Tab("Extract"):
            with gr.Row():
                # Left column: configuration and audio input.
                with gr.Column(scale=1):
                    with gr.Accordion("Configuration", open=True):
                        api_key_input = gr.Textbox(
                            label="Gemini API Key",
                            placeholder="Enter your Gemini API key",
                            type="password",
                            # Pre-filled from the GEMINI_API env var when
                            # present (loaded via load_dotenv at import time).
                            value=os.getenv("GEMINI_API", ""),
                            info="Get your API key from https://ai.google.dev/"
                        )
                        user_identification = gr.Radio(
                            choices=["user", "name"],
                            value="user",
                            label="User Identification",
                            info="How should you be referred to in the context data?"
                        )
                        # Hidden until "name" is selected; toggled by
                        # toggle_name_input below.
                        user_name_input = gr.Textbox(
                            label="Your Name",
                            placeholder="Enter your name",
                            visible=False,
                            info="Used when 'name' is selected above"
                        )
                    gr.Markdown("### Audio Input", elem_classes="section-header")
                    audio_recording = gr.Audio(
                        sources=["microphone"],
                        type="filepath",
                        label="Record Audio"
                    )
                    gr.Markdown("**OR**")
                    # NOTE(review): with type="filepath" this component passes
                    # a plain string path to process_audio, not an object with
                    # a .name attribute — confirm against the installed Gradio
                    # version that process_audio handles the delivered type.
                    audio_upload = gr.File(
                        label="Upload Audio File",
                        file_types=["audio"],
                        type="filepath"
                    )
                    process_btn = gr.Button("Extract Context", variant="primary", size="lg")
                # Right column: status, extracted context, and downloads.
                with gr.Column(scale=1):
                    gr.Markdown("### Results", elem_classes="section-header")
                    status_output = gr.Textbox(
                        label="Status",
                        interactive=False,
                        show_label=True
                    )
                    context_display = gr.Textbox(
                        label="Context Data (Markdown)",
                        lines=18,
                        interactive=False,
                        show_copy_button=True
                    )
                    with gr.Row():
                        markdown_download = gr.File(label="Download Markdown")
                        json_download = gr.File(label="Download JSON")
        # Static documentation tab — no interactive components.
        with gr.Tab("About"):
            gr.Markdown(
                """
                ## What is Context Cruncher?
                Context Cruncher transforms casual voice recordings into clean, structured context data
                that AI systems can use for personalization.
                **Context data** refers to specific information about users that grounds AI inference
                for more personalized results.
                ## How It Works
                1. **Configure** - Enter your Gemini API key and choose how you want to be identified
                2. **Input Audio** - Either record directly in your browser or upload an audio file (MP3, WAV, OPUS)
                3. **Extract** - Click the button and let AI clean up your recording into structured context data
                4. **Download** - Get your context data as Markdown or JSON, or copy directly from the text area
                ## What Gets Extracted
                This tool processes your audio by:
                - Removing irrelevant information and tangents
                - Eliminating duplicates and redundancy
                - Reformatting from first person to third person
                - Organizing information hierarchically
                - Outputting both Markdown and JSON formats
                ## Example Transformation
                **Raw Audio:**
                > "Okay so... let's document my health problems... I've had asthma since I was a kid.
                > I take a daily inhaler called Relvar for that. Oh hey Jay! What's up!
                > Okay, where was I... I also take Vyvanse for ADHD."
                **Structured Output:**
                ```markdown
                ## Medical Conditions
                - the user has had asthma since childhood
                - the user has adult ADHD
                ## Medication List
                - the user takes Relvar, daily, for asthma
                - the user takes Vyvanse for ADHD
                ```
                ## Privacy Notice
                Your audio is processed using the Gemini API. Review Google's privacy policies
                before using this tool with sensitive information.
                ## Technical Details
                - **AI Model**: Gemini 2.0 Flash (multimodal audio understanding)
                - **Processing**: Direct audio file upload to Gemini API
                - **Output Formats**: Markdown and JSON
                ## Use Cases
                - AI assistant personalization
                - Knowledge management
                - Preference mapping
                - Medical history documentation (note privacy considerations)
                - Project context capture
                """
            )

    # Show/hide the name textbox based on the identification method.
    def toggle_name_input(identification_choice):
        # Visible only when the user chose to be identified by name.
        return gr.update(visible=identification_choice == "name")

    user_identification.change(
        fn=toggle_name_input,
        inputs=[user_identification],
        outputs=[user_name_input]
    )

    # Wire the extract button to the processing pipeline; outputs map
    # positionally onto process_audio's 4-tuple return value.
    process_btn.click(
        fn=process_audio,
        inputs=[
            audio_recording,
            audio_upload,
            api_key_input,
            user_identification,
            user_name_input
        ],
        outputs=[
            context_display,
            markdown_download,
            json_download,
            status_output
        ]
    )
if __name__ == "__main__":
    # Launch configuration for Hugging Face Spaces: bind to all interfaces
    # on the Spaces-standard port 7860, and keep share links disabled
    # (Spaces serves the app itself).
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)