Spaces:

kingabzpro
/

Gemini-2-Pro-Chat

Running

App Files Files Community

Abid Ali Awan committed on Feb 9, 2025

Commit

22b0228

1 Parent(s): f15d60c

deploying the app

Browse files

Files changed (3) hide show

README.md +7 -3
app.py +526 -0
requirements.txt +1 -0

README.md CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 title: Gemini 2 Pro Chat
-emoji: 🐠
-colorFrom: pink
 colorTo: pink
 sdk: gradio
 sdk_version: 5.15.0
@@ -11,4 +11,8 @@ license: mit
 short_description: 'Image, Audio, and Document understanding + Code Execution. '
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Gemini 2 Pro Chat
+emoji: ♊💬
+colorFrom: Green
 colorTo: pink
 sdk: gradio
 sdk_version: 5.15.0
 short_description: 'Image, Audio, and Document understanding + Code Execution. '
 ---
+## Gemini 2.0 Pro Multi-modal Chatbot
+This module sets up a Gradio interface for a multi-modal chatbot powered by the Gemini 2.0 Pro model.
+It supports text, image, audio, and document inputs and uses the google.genai library to generate responses.
+All response-generation operations now use the streaming endpoint (generate_content_stream) so that the UI
+receives incremental updates.

app.py ADDED Viewed

	@@ -0,0 +1,526 @@

+import base64
+import html
+import io
+import os
+import time
+from typing import Dict, List, Optional, Union
+
+import gradio as gr
+from google import genai
+from google.genai import types  # New types module from google-genai
+from PIL import Image
+# Retrieve API key for Google GenAI from the environment variables.
+# NOTE: os.environ.get returns None when GOOGLE_API_KEY is unset; that value is
+# handed to the client below without validation.
+GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
+# Initialize the client so that it can be reused across functions.
+# This runs at import time, so importing the module constructs the client.
+CLIENT = genai.Client(api_key=GOOGLE_API_KEY)
+# General constants for the UI
+# TITLE is the HTML heading rendered at the top of the page via gr.HTML.
+TITLE = """<h1 align="center">Gemini 2.0 Pro Multi-modal Chatbot</h1>"""
+# AVATAR_IMAGES is passed to gr.Chatbot(avatar_images=...).
+# NOTE(review): presumably ordered (user, assistant), i.e. no user avatar — confirm.
+AVATAR_IMAGES = (None, "https://media.roboflow.com/spaces/gemini-icon.png")
+# IMAGE_WIDTH is the fixed pixel width preprocess_image resizes uploads to.
+IMAGE_WIDTH = 512
+def preprocess_stop_sequences(stop_sequences: str) -> Optional[List[str]]:
+    """
+    Convert a comma-separated string of stop sequences into a list.
+    Parameters:
+        stop_sequences (str): A string containing stop sequences separated by commas.
+    Returns:
+        Optional[List[str]]: A list of trimmed stop sequences if provided; otherwise, None.
+    """
+    if not stop_sequences:
+        return None
+    return [sequence.strip() for sequence in stop_sequences.split(",")]
+def preprocess_image(image: Image.Image) -> Image.Image:
+    """
+    Resize an image to a fixed width while maintaining the aspect ratio.
+    Parameters:
+        image (Image.Image): The original image.
+    Returns:
+        Image.Image: The resized image with width fixed at IMAGE_WIDTH.
+    """
+    image_height = int(image.height * IMAGE_WIDTH / image.width)
+    return image.resize((IMAGE_WIDTH, image_height))
+def image_to_base64_html_from_pil(image: Image.Image, max_width: int = 150) -> str:
+    """
+    Convert a PIL Image to an HTML <img> tag with base64-encoded image data.
+    Parameters:
+        image (Image.Image): The image to encode.
+        max_width (int): Maximum width (in pixels) for the displayed image.
+    Returns:
+        str: An HTML string with the embedded image.
+    """
+    buffered = io.BytesIO()
+    image.save(buffered, format="JPEG")
+    b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
+    return (
+        f'<img src="data:image/jpeg;base64,{b64_data}" alt="Uploaded Image" '
+        f'style="max-width:{max_width}px;">'
+    )
+def preprocess_chat_history_messages(
+    chat_history: List[Union[dict, gr.ChatMessage]],
+) -> List[Dict[str, Union[str, List[str]]]]:
+    """
+    Normalize chat history messages into a consistent list of dictionaries.
+    Each message (whether as a dict or gr.ChatMessage) is converted into a dictionary
+    containing a role and a list of parts (message content).
+    Parameters:
+        chat_history (List[Union[dict, gr.ChatMessage]]): The conversation history.
+    Returns:
+        List[Dict[str, Union[str, List[str]]]]: A normalized list of messages.
+    """
+    messages = []
+    for msg in chat_history:
+        if isinstance(msg, dict):
+            content = msg.get("content")
+            role = msg.get("role")
+        else:
+            content = msg.content
+            role = msg.role
+        if content is not None:
+            # Convert "assistant" role to "model" if needed.
+            role = "model" if role == "assistant" else role
+            messages.append({"role": role, "parts": [content]})
+    return messages
+def chat_history_to_prompt(chat_history: List[Union[dict, gr.ChatMessage]]) -> str:
+    """
+    Convert the entire chat conversation into a single text prompt.
+    Each message is prefixed by “User:” or “Assistant:” to form a full conversation.
+    Parameters:
+        chat_history (List[Union[dict, gr.ChatMessage]]): The conversation history.
+    Returns:
+        str: A string that concatenates the conversation history.
+    """
+    conversation = ""
+    for msg in chat_history:
+        content = get_message_content(msg)
+        role = msg.get("role") if isinstance(msg, dict) else msg.role
+        if role in ["assistant", "model"]:
+            conversation += f"Assistant: {content}\n"
+        else:
+            conversation += f"User: {content}\n"
+    return conversation
+def upload(files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]):
+    """
+    Process uploaded image files: resize them, convert to an HTML <img> tag (with base64 data),
+    and append it as a new user message to the chatbot history.
+    Parameters:
+        files (Optional[List[str]]): List of image file paths.
+        chatbot (List[Union[dict, gr.ChatMessage]]): The current conversation history.
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: Updated conversation history.
+    """
+    for file in files:
+        image = Image.open(file).convert("RGB")
+        image = preprocess_image(image)
+        image_html = image_to_base64_html_from_pil(image)
+        chatbot.append(gr.ChatMessage(role="user", content=image_html))
+    return chatbot
+def upload_audio(
+    files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]
+):
+    """
+    Process uploaded audio files: read and base64-encode them, wrap the data in an HTML audio player,
+    and append it as a new user message.
+    Parameters:
+        files (Optional[List[str]]): List of audio file paths.
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: The updated chatbot history.
+    """
+    for file in files:
+        with open(file, "rb") as f:
+            audio_bytes = f.read()
+        b64_data = base64.b64encode(audio_bytes).decode("utf-8")
+        audio_html = f"""<audio controls style="max-width:150px;">
+  <source src="data:audio/mp3;base64,{b64_data}" type="audio/mp3">
+  Your browser does not support the audio element.
+</audio>"""
+        chatbot.append(gr.ChatMessage(role="user", content=audio_html))
+    return chatbot
+def upload_document(
+    files: Optional[List[str]], chatbot: List[Union[dict, gr.ChatMessage]]
+):
+    """
+    Process uploaded document files (assumed to be PDFs) and add a notification message
+    (with an HTML snippet) indicating that the document has been uploaded.
+    Parameters:
+        files (Optional[List[str]]): List of document file paths.
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+    Returns:
+        List[Union[dict, gr.ChatMessage]]: The updated chatbot history.
+    """
+    for file in files:
+        filename = os.path.basename(file)
+        doc_html = f"<p>📄 Document uploaded: {filename}</p>"
+        chatbot.append(gr.ChatMessage(role="user", content=doc_html))
+    return chatbot
+def user(text_prompt: str, chatbot: List[gr.ChatMessage]):
+    """
+    Append a new user text message to the chat history.
+    Parameters:
+        text_prompt (str): The input text provided by the user.
+        chatbot (List[gr.ChatMessage]): The existing conversation history.
+    Returns:
+        Tuple[str, List[gr.ChatMessage]]: A tuple of an empty string (clearing the prompt)
+            and the updated conversation history.
+    """
+    if text_prompt:
+        chatbot.append(gr.ChatMessage(role="user", content=text_prompt))
+    return "", chatbot
+def get_message_content(msg):
+    """
+    Retrieve the content of a message that can be either a dictionary or a gr.ChatMessage.
+    Parameters:
+        msg (Union[dict, gr.ChatMessage]): The message object.
+    Returns:
+        str: The textual content of the message.
+    """
+    if isinstance(msg, dict):
+        return msg.get("content", "")
+    return msg.content
+def bot(
+    image_files: Optional[List[str]],
+    audio_files: Optional[List[str]],
+    doc_files: Optional[List[str]],
+    chatbot: List[Union[dict, gr.ChatMessage]],
+):
+    """
+    Generate a chatbot response from Gemini 2.0 based on provided inputs.
+    This function supports three branches:
+      1. Document branch: when doc_files are provided.
+      2. Multi-modal branch: when image and/or audio files are provided.
+      3. Text-only conversation branch.
+    All branches now use generate_content_stream to yield incremental responses.
+    Parameters:
+        image_files (Optional[List[str]]): List of image file paths.
+        audio_files (Optional[List[str]]): List of audio file paths.
+        doc_files (Optional[List[str]]): List of document file paths.
+        chatbot (List[Union[dict, gr.ChatMessage]]): The conversation history.
+    Yields:
+        List[Union[dict, gr.ChatMessage]]: The updated conversation history with streamed responses.
+    """
+    if len(chatbot) == 0:
+        return chatbot
+    # Append a placeholder for the assistant's response.
+    chatbot.append(gr.ChatMessage(role="assistant", content=""))
+    generation_config = types.GenerateContentConfig(
+        temperature=0.4,
+        max_output_tokens=4096,
+        top_k=32,
+        top_p=1,
+    )
+    # Branch 1: Document uploads.
+    if doc_files and len(doc_files) > 0:
+        prev_msg_content = get_message_content(chatbot[-2]) if len(chatbot) >= 2 else ""
+        prompt = [prev_msg_content] if prev_msg_content else []
+        doc_parts = []
+        for file in doc_files:
+            with open(file, "rb") as f:
+                doc_bytes = f.read()
+            doc_parts.append(
+                types.Part.from_bytes(
+                    data=doc_bytes,
+                    mime_type="application/pdf",
+                )
+            )
+        # Combine document parts and previous text.
+        contents = doc_parts + prompt
+        # Use the streaming endpoint.
+        response = CLIENT.models.generate_content_stream(
+            model="gemini-2.0-pro-exp-02-05",
+            contents=contents,
+            config=generation_config,
+        )
+        for chunk in response:
+            for i in range(0, len(chunk.text), 10):
+                section = chunk.text[i : i + 10]
+                if isinstance(chatbot[-1], dict):
+                    chatbot[-1]["content"] += section
+                else:
+                    chatbot[-1].content += section
+                time.sleep(0.01)
+                yield chatbot
+        return
+    # Branch 2: Image or audio uploads.
+    elif (image_files and len(image_files) > 0) or (
+        audio_files and len(audio_files) > 0
+    ):
+        prev_msg_content = get_message_content(chatbot[-2]) if len(chatbot) >= 2 else ""
+        text_prompt = [prev_msg_content] if prev_msg_content else []
+        image_prompt = (
+            [Image.open(file).convert("RGB") for file in image_files]
+            if image_files
+            else []
+        )
+        audio_prompt = []
+        if audio_files:
+            for file in audio_files:
+                with open(file, "rb") as f:
+                    audio_bytes = f.read()
+                audio_prompt.append(
+                    types.Part.from_bytes(
+                        data=audio_bytes,
+                        mime_type="audio/mp3",
+                    )
+                )
+        # Combine all inputs into a multi-modal prompt.
+        contents = text_prompt + image_prompt + audio_prompt
+        response = CLIENT.models.generate_content_stream(
+            model="gemini-2.0-pro-exp-02-05",
+            contents=contents,
+            config=generation_config,
+        )
+        for chunk in response:
+            for i in range(0, len(chunk.text), 10):
+                section = chunk.text[i : i + 10]
+                if isinstance(chatbot[-1], dict):
+                    chatbot[-1]["content"] += section
+                else:
+                    chatbot[-1].content += section
+                time.sleep(0.01)
+                yield chatbot
+        return
+    # Branch 3: Text-only conversation.
+    else:
+        conversation_text = chat_history_to_prompt(chatbot)
+        response = CLIENT.models.generate_content_stream(
+            model="gemini-2.0-pro-exp-02-05",
+            contents=[conversation_text],
+            config=generation_config,
+        )
+        for chunk in response:
+            for i in range(0, len(chunk.text), 10):
+                section = chunk.text[i : i + 10]
+                if isinstance(chatbot[-1], dict):
+                    chatbot[-1]["content"] += section
+                else:
+                    chatbot[-1].content += section
+                time.sleep(0.01)
+                yield chatbot
+        return
+def run_code_execution(code_prompt: str, chatbot: List[Union[dict, gr.ChatMessage]]):
+    """
+    Append the user's code execution query to the chat history, then call Gemini
+    with code execution enabled using the user's input. The results (including any
+    generated code and execution output) are appended as a new assistant message.
+    """
+    # Only add a user message if there is content.
+    if code_prompt.strip():
+        chatbot.append(gr.ChatMessage(role="user", content=code_prompt))
+    # Append an empty assistant message to update with the code execution response.
+    chatbot.append(gr.ChatMessage(role="assistant", content=""))
+    generation_config = types.GenerateContentConfig(
+        tools=[types.Tool(code_execution=types.ToolCodeExecution)]
+    )
+    response = CLIENT.models.generate_content(
+        model="gemini-2.0-pro-exp-02-05",
+        contents=code_prompt,
+        config=generation_config,
+    )
+    output_text = ""
+    for part in response.candidates[0].content.parts:
+        if part.text is not None:
+            output_text += f"{part.text}\n"
+        if part.executable_code is not None:
+            # Display the executable code in a code block (using markdown formatting)
+            output_text += (
+                f"\n**Generated Code:**\n```python\n{part.executable_code.code}\n```\n"
+            )
+        if part.code_execution_result is not None:
+            output_text += (
+                f"\n**Output:**\n```\n{part.code_execution_result.output}\n```\n"
+            )
+        if part.inline_data is not None:
+            image_data = base64.b64decode(part.inline_data.data)
+            image = Image.open(io.BytesIO(image_data))
+            buffered = io.BytesIO()
+            image.save(buffered, format="PNG")
+            b64_data = base64.b64encode(buffered.getvalue()).decode("utf-8")
+            output_text += f'\n<img src="data:image/png;base64,{b64_data}" alt="Inline Image" style="max-width:300px;"/>\n'
+        output_text += "\n---\n"
+    # Update the last assistant message with the code execution result.
+    if isinstance(chatbot[-1], dict):
+        chatbot[-1]["content"] = output_text
+    else:
+        chatbot[-1].content = output_text
+    # Clear the text prompt after processing.
+    return "", chatbot
+# Define the Gradio UI components.
+# They are created here, outside any Blocks context, and placed into the layout
+# later through their .render() calls.
+# Chat display; type="messages" means history entries are role/content messages.
+# NOTE(review): bubble_full_width may be deprecated in recent Gradio releases —
+# confirm against the pinned sdk_version.
+chatbot_component = gr.Chatbot(
+    label="Gemini 2.0 Pro",
+    type="messages",  # Using message objects.
+    bubble_full_width=False,
+    avatar_images=AVATAR_IMAGES,
+    scale=2,
+    height=400,
+)
+# Single text box shared by the normal chat flow and the code-execution flow.
+text_prompt_component = gr.Textbox(
+    placeholder="Enter your message or code query here...",
+    show_label=False,
+    autofocus=True,
+    scale=19,
+)
+# Upload buttons; each accepts multiple files of one category.
+upload_button_component = gr.UploadButton(
+    label="Upload Images",
+    file_count="multiple",
+    file_types=["image"],
+    scale=1,
+)
+upload_audio_button_component = gr.UploadButton(
+    label="Upload Audio",
+    file_count="multiple",
+    file_types=["audio"],
+    scale=1,
+)
+# Documents are restricted to .pdf, matching the application/pdf MIME type bot() sends.
+upload_doc_button_component = gr.UploadButton(
+    label="Upload Documents",
+    file_count="multiple",
+    file_types=[".pdf"],
+    scale=1,
+)
+# "Run" sends the prompt through user() and then bot().
+run_button_component = gr.Button(value="Run", variant="primary", scale=1, min_width=60)
+# "Run Code Execution" sends the prompt through run_code_execution().
+run_code_execution_button = gr.Button(
+    value="Run Code Execution", variant="secondary", scale=1
+)
+# Define input lists for button chaining.
+# user_inputs matches user(text_prompt, chatbot).
+user_inputs = [text_prompt_component, chatbot_component]
+# bot_inputs matches bot(image_files, audio_files, doc_files, chatbot); the upload
+# buttons' current values are passed as the file lists.
+bot_inputs = [
+    upload_button_component,
+    upload_audio_button_component,
+    upload_doc_button_component,
+    chatbot_component,
+]
+# Build the page layout and wire events to the handlers defined above.
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.HTML(TITLE)
+    with gr.Column():
+        chatbot_component.render()
+        with gr.Row(equal_height=True):
+            text_prompt_component.render()
+            run_button_component.render()
+        with gr.Row():
+            # Render file-upload buttons and the code execution button in a single row.
+            upload_button_component.render()
+            upload_audio_button_component.render()
+            upload_doc_button_component.render()
+            run_code_execution_button.render()
+    # When the Run button is clicked, first process the user text then stream a response.
+    # user() runs with queue=False; bot() is chained with .then() and uses the queue.
+    run_button_component.click(
+        fn=user,
+        inputs=user_inputs,
+        outputs=[text_prompt_component, chatbot_component],
+        queue=False,
+    ).then(
+        fn=bot,
+        inputs=bot_inputs,
+        outputs=[chatbot_component],
+    )
+    # Allow submission using the Enter key.
+    # Same user() -> bot() chain as the Run button.
+    text_prompt_component.submit(
+        fn=user,
+        inputs=user_inputs,
+        outputs=[text_prompt_component, chatbot_component],
+        queue=False,
+    ).then(
+        fn=bot,
+        inputs=bot_inputs,
+        outputs=[chatbot_component],
+    )
+    # Handle image uploads.
+    # The upload handlers only add a preview/notice message to the chat; the files
+    # themselves reach the model later through bot_inputs.
+    upload_button_component.upload(
+        fn=upload,
+        inputs=[upload_button_component, chatbot_component],
+        outputs=[chatbot_component],
+        queue=False,
+    )
+    # Handle audio uploads.
+    upload_audio_button_component.upload(
+        fn=upload_audio,
+        inputs=[upload_audio_button_component, chatbot_component],
+        outputs=[chatbot_component],
+        queue=False,
+    )
+    # Handle document uploads.
+    upload_doc_button_component.upload(
+        fn=upload_document,
+        inputs=[upload_doc_button_component, chatbot_component],
+        outputs=[chatbot_component],
+        queue=False,
+    )
+    # When the Code Execution button is clicked, process the code prompt and show the
+    # complete result at once: run_code_execution returns a single value and calls the
+    # non-streaming generate_content endpoint, so nothing is streamed here.
+    run_code_execution_button.click(
+        fn=run_code_execution,
+        inputs=[text_prompt_component, chatbot_component],
+        outputs=[text_prompt_component, chatbot_component],
+        queue=False,
+    )
+# Launch the demo interface with queuing enabled.
+# This executes at import time; there is no __main__ guard.
+demo.queue(max_size=99, api_open=False).launch(debug=False, pwa=True, show_error=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ google-genai==1.0.0