File size: 6,163 Bytes
59c7f4b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
"""
Gemini API integration for processing audio and extracting context data.
"""
import json
import logging
import re
from datetime import datetime
from typing import Dict, Optional, Tuple

import google.generativeai as genai


def get_system_prompt(user_name: Optional[str] = None) -> str:
    """
    Generate the system prompt for context extraction.

    The prompt instructs the model to rewrite first-person audio content as
    third-person statements about a named subject and to organise the result
    as structured markdown.

    Args:
        user_name: Optional name to use instead of "the user". Surrounding
            whitespace is ignored; a missing, empty or whitespace-only name
            falls back to "the user".

    Returns:
        System prompt string
    """
    # A blank name (e.g. an untouched text field containing spaces) is truthy
    # but would make the prompt refer to an empty quoted name, so normalise
    # before choosing between the supplied name and the generic reference.
    cleaned_name = str(user_name).strip() if user_name else ""
    user_reference = cleaned_name or "the user"

    # NOTE(review): the example OUTPUT below mentions "70mg" and "adult" ADHD,
    # neither of which appears in the example INPUT. Consider aligning the
    # example so it does not demonstrate adding unstated details.
    return f"""You are a context extraction assistant. Your task is to analyze audio recordings where users provide personal context information and extract it in a clean, structured format.

## Your Task

Extract context data from the user's audio recording. Context data refers to specific information about the user that can be used to ground AI inference for more personalized results.

## Transformation Guidelines

1. Remove irrelevant information (e.g., tangential conversations, notes to self)
2. Remove duplicates and redundancy
3. Reformat from first person to third person, referring to "{user_reference}"
4. Organize information hierarchically with clear sections
5. Present information in a clean, structured markdown format

## Example Transformation

INPUT (raw audio transcript):
"Okay so ... let's document my health problems and the meds I take for this AI project ... ehm.. where do i start ... well, I've had asthma since I was a kid. I take a daily inhaler called Relvar for that. I also take Vyvanse for ADHD which is a stimulant medication. Oh .. hey Jay! What's up, man! Yeah see you at the gym. Okay, where was I. Note to self, pick up the laundry later. Oh yeah .. I've been on Vyvanse for three years and think it's great. I get bloods every 3 months."

OUTPUT (cleaned context data):

## Medical Conditions

- {user_reference} has had asthma since childhood
- {user_reference} has adult ADHD

## Medication List

- {user_reference} takes Relvar, daily, for asthma
- {user_reference} takes Vyvanse 70mg, daily, for ADHD

## Important Notes

Follow a careful hierarchical structure that allows additional context to be easily integrated later. Use clear section headers and bullet points for organization.

Now process the provided audio recording and extract the context data following these guidelines."""


def get_naming_prompt() -> str:
    """Return the follow-up prompt asking the model to name the extracted context.

    The prompt requests a JSON object carrying a display title
    (``human_readable_name``) and a filename stem (``snake_case_filename``).
    """
    prompt_lines = (
        'Based on the context data you just extracted, provide a JSON object with:',
        '1. human_readable_name: A clear, descriptive title for this context (e.g., "Medical History and Medications", "Movie Preferences")',
        '2. snake_case_filename: A snake_case version suitable for a filename (e.g., "medical_history_medications", "movie_preferences")',
        '',
        'Respond ONLY with a valid JSON object in this exact format:',
        '{',
        '  "human_readable_name": "Your Title Here",',
        '  "snake_case_filename": "your_filename_here"',
        '}',
    )
    return "\n".join(prompt_lines)


def _parse_naming_response(naming_text: str) -> Tuple[str, str]:
    """Turn the model's naming reply into (human_readable_name, snake_case_filename).

    The reply is model-generated text, so nothing about its shape is trusted:
    the JSON object is located anywhere in the text (code fences and
    surrounding prose are ignored), each field is validated independently,
    and any field that is missing or unusable falls back to a generic value.
    """
    default_title = "Context Data"
    default_stem = "context_data"

    text = (naming_text or "").strip()

    # Slice from the first "{" to the last "}" instead of dropping the first
    # and last lines: this copes with ```json fences, a missing closing
    # fence, a single-line fenced reply, and prose around the object.
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end <= start:
        return default_title, default_stem

    try:
        naming_data = json.loads(text[start:end + 1])
    except json.JSONDecodeError:
        return default_title, default_stem

    raw_title = naming_data.get("human_readable_name")
    raw_stem = naming_data.get("snake_case_filename")

    # Collapse whitespace so the title stays on a single line.
    title = " ".join(raw_title.split()) if isinstance(raw_title, str) else ""

    # The stem ends up in a filename, so keep only word characters. This
    # removes path separators, dots and spaces, which rules out directory
    # traversal and hidden/odd filenames chosen by the model.
    stem = ""
    if isinstance(raw_stem, str):
        stem = re.sub(r"[\W_]+", "_", raw_stem.lower()).strip("_")

    return title or default_title, stem or default_stem


def process_audio_with_gemini(
    audio_file_path: str,
    api_key: str,
    user_name: Optional[str] = None
) -> Tuple[str, str, str]:
    """
    Process audio file with Gemini API to extract context data.

    Two model calls are made: the first extracts the context markdown from
    the uploaded audio, the second asks the model to name that context. A
    failure in the naming step never discards the extracted context; generic
    names are returned instead.

    Args:
        audio_file_path: Path to the audio file
        api_key: Gemini API key
        user_name: Optional user name for personalization

    Returns:
        Tuple of (context_markdown, human_readable_name, snake_case_filename).
        snake_case_filename contains only word characters and underscores.

    Raises:
        Exception: If the upload or the context-extraction API call fails
    """
    genai.configure(api_key=api_key)

    # Audio-capable Gemini model (2.0 Flash, experimental release).
    model = genai.GenerativeModel('gemini-2.0-flash-exp')

    # Upload the audio file
    audio_file = genai.upload_file(audio_file_path)

    try:
        # Generate context data
        system_prompt = get_system_prompt(user_name)
        response = model.generate_content([system_prompt, audio_file])
        context_markdown = response.text
    finally:
        # The recording holds personal information and is not needed after
        # extraction, so remove it from remote storage. Cleanup is
        # best-effort: a failed delete must not mask the real outcome.
        try:
            genai.delete_file(audio_file.name)
        except Exception:
            logging.getLogger(__name__).warning(
                "Could not delete uploaded audio file", exc_info=True
            )

    # Generate naming information
    naming_response = model.generate_content([
        context_markdown,
        get_naming_prompt()
    ])

    try:
        naming_text = naming_response.text
    except ValueError:
        # `.text` raises when the reply has no usable text part (for example
        # a blocked response); fall through to the generic names.
        naming_text = ""

    human_readable_name, snake_case_filename = _parse_naming_response(naming_text)

    return context_markdown, human_readable_name, snake_case_filename


def create_markdown_file(
    context_markdown: str,
    human_readable_name: str,
    snake_case_filename: str
) -> Tuple[str, str]:
    """
    Assemble the markdown document for a piece of extracted context.

    The document consists of a heading with the title, the context body, a
    horizontal rule, and a footer recording the local capture time.

    Args:
        context_markdown: The extracted context data
        human_readable_name: Title placed in the document heading
        snake_case_filename: Filename stem; ".md" is appended

    Returns:
        Tuple of (filename, content)
    """
    captured_at = datetime.now()

    # Each section is separated from the next by one blank line; the footer
    # carries the document's trailing newline.
    sections = (
        f"## {human_readable_name}",
        f"{context_markdown}",
        "---",
        f"Captured on: {captured_at:%Y-%m-%d %H:%M:%S}\n",
    )

    return f"{snake_case_filename}.md", "\n\n".join(sections)


def create_json_file(
    context_markdown: str,
    human_readable_name: str,
    snake_case_filename: str
) -> Tuple[str, str]:
    """
    Serialize a piece of extracted context, with its names, as JSON.

    The JSON object holds the title, the filename stem, the context body and
    an ISO-8601 capture timestamp taken from the local clock, in that order.

    Args:
        context_markdown: The extracted context data
        human_readable_name: Human readable title
        snake_case_filename: Filename stem; ".json" is appended

    Returns:
        Tuple of (filename, json_content)
    """
    # Keyword order fixes the key order of the serialized object.
    record = dict(
        human_readable_name=human_readable_name,
        snake_case_filename=snake_case_filename,
        context_data=context_markdown,
        captured_on=datetime.now().isoformat(),
    )

    return f"{snake_case_filename}.json", json.dumps(record, indent=2)