import gradio as gr from datasets import load_dataset import json import pandas as pd # Load the dataset dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train") # Extract taxonomy data and reconstruct nested structure taxonomy_data = {} for record in dataset: # Get modality info output_modality = record['output_modality'] operation_type = record['operation_type'] # Map output_modality to the keys used in MODALITY_INFO modality_key_map = { "video": "video_generation", "audio": "audio_generation", "image": "image_generation", "text": "text_generation", "3d": "3d_generation", "3d-model": "3d_generation" } modality_key = modality_key_map.get(output_modality, f"{output_modality}_generation") # Initialize nested structure if modality_key not in taxonomy_data: taxonomy_data[modality_key] = {} if operation_type not in taxonomy_data[modality_key]: taxonomy_data[modality_key][operation_type] = { "description": f"{output_modality.title()} {operation_type} modalities", "outputModality": output_modality, "operationType": operation_type, "modalities": [] } # Reconstruct the nested modality object modality_obj = { "id": record['id'], "name": record['name'], "input": { "primary": record['input_primary'], "secondary": record['input_secondary'] }, "output": { "primary": record['output_primary'], "audio": record['output_audio'] }, "characteristics": json.loads(record['characteristics']) if record['characteristics'] else {}, "metadata": { "maturityLevel": record['metadata_maturity_level'], "commonUseCases": record['metadata_common_use_cases'], "platforms": record['metadata_platforms'], "exampleModels": record['metadata_example_models'] }, "relationships": json.loads(record['relationships']) if record['relationships'] else {} } # Add audio type if present if record['output_audio'] and record.get('output_audio_type'): modality_obj["output"]["audioType"] = record['output_audio_type'] # Add to taxonomy data taxonomy_data[modality_key][operation_type]["modalities"].append(modality_obj) # Define modality display names MODALITY_INFO = { "video_generation": {"name": "Video Generation", "color": "#FF6B6B"}, "audio_generation": {"name": "Audio Generation", "color": "#4ECDC4"}, "image_generation": {"name": "Image Generation", "color": "#95E1D3"}, "text_generation": {"name": "Text Generation", "color": "#F38181"}, "3d_generation": {"name": "3D Generation", "color": "#AA96DA"}, } # CSS for styling custom_css = """ .modality-card { border: 2px solid #e0e0e0; border-radius: 10px; padding: 20px; margin: 10px 0; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1); } .modality-header { font-size: 1.5em; font-weight: bold; margin-bottom: 10px; color: #333; } .modality-meta { background: #f5f5f5; padding: 10px; border-radius: 5px; margin: 10px 0; } .badge { display: inline-block; padding: 4px 12px; border-radius: 12px; margin: 2px; font-size: 0.85em; font-weight: 500; } .badge-mature { background: #4CAF50; color: white; } .badge-emerging { background: #FF9800; color: white; } .badge-experimental { background: #9C27B0; color: white; } .index-card { border: 2px solid #ddd; border-radius: 15px; padding: 30px; margin: 15px; text-align: center; cursor: pointer; transition: all 0.3s; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; } .index-card:hover { transform: translateY(-5px); box-shadow: 0 10px 20px rgba(0,0,0,0.2); } .stat-box { background: #f8f9fa; border-radius: 10px; padding: 15px; margin: 10px; text-align: center; } """ def create_modality_card(modality_obj): """Create an HTML card for a single modality""" # Maturity badge maturity = modality_obj['metadata']['maturityLevel'] badge_class = f"badge badge-{maturity}" # Input/Output info input_primary = modality_obj['input']['primary'] input_secondary = modality_obj['input'].get('secondary', []) output_primary = modality_obj['output']['primary'] # Build input string input_str = f"**Primary:** {input_primary}" if input_secondary: input_str += f"
**Secondary:** {', '.join(input_secondary)}" # Audio info for output audio_info = "" if modality_obj['output'].get('audio'): audio_type = modality_obj['output'].get('audioType', 'N/A') audio_info = f"
**Audio:** {audio_type}" # Characteristics chars = modality_obj.get('characteristics', {}) char_items = [f"**{k}:** {v}" for k, v in chars.items()] char_str = "
".join(char_items) if char_items else "N/A" # Use cases use_cases = modality_obj['metadata'].get('commonUseCases', []) use_case_str = "
• " + "
• ".join(use_cases) if use_cases else "N/A" # Platforms platforms = modality_obj['metadata'].get('platforms', []) platform_str = ", ".join(platforms) if platforms else "N/A" # Example models models = modality_obj['metadata'].get('exampleModels', []) model_str = ", ".join(models) if models else "N/A" html = f"""
{modality_obj['name']} {maturity}

Input
{input_str}

Output
**Primary:** {output_primary}{audio_info}

Characteristics
{char_str}
Common Use Cases
{use_case_str}
Platforms & Models

Platforms: {platform_str}

Example Models: {model_str}

""" return html def create_overview_page(): """Create the main overview/index page""" stats_html = "
" total_modalities = 0 for modality_key, operations in taxonomy_data.items(): info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"}) creation_count = len(operations.get('creation', {}).get('modalities', [])) editing_count = len(operations.get('editing', {}).get('modalities', [])) total_count = creation_count + editing_count total_modalities += total_count stats_html += f"""
{info['name']}
Creation: {creation_count} | Editing: {editing_count}
{total_count} modalities
""" stats_html += "
" overview_html = f"""

Multimodal AI Taxonomy

An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.

Dataset repository: danielrosehill/multimodal-ai-taxonomy

{total_modalities}
Total Modalities Defined
{stats_html}

How to Use This Space

Navigate through the tabs above to explore different output modalities (Video, Audio, Image, Text, 3D).

Each modality is organized into Creation (generating new content) and Editing (modifying existing content) operations.

Click on the details sections to expand and see characteristics, use cases, platforms, and example models.

""" return overview_html def create_modality_page(modality_key, operation_type): """Create a page for a specific modality and operation type""" if modality_key not in taxonomy_data: return f"

No data found for {modality_key}

" if operation_type not in taxonomy_data[modality_key]: return f"

No {operation_type} data found for {modality_key}

" data = taxonomy_data[modality_key][operation_type] modalities = data.get('modalities', []) info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"}) html = f"""

{info['name']} - {operation_type.title()}

{data.get('description', '')}

{len(modalities)} modalities defined
""" for modality in modalities: html += create_modality_card(modality) return html def create_comparison_table(modality_key): """Create a comparison table for creation vs editing""" if modality_key not in taxonomy_data: return pd.DataFrame() rows = [] for operation_type in ['creation', 'editing']: if operation_type in taxonomy_data[modality_key]: modalities = taxonomy_data[modality_key][operation_type].get('modalities', []) for mod in modalities: rows.append({ 'Operation': operation_type.title(), 'Name': mod['name'], 'Primary Input': mod['input']['primary'], 'Primary Output': mod['output']['primary'], 'Maturity': mod['metadata']['maturityLevel'], 'Platforms': len(mod['metadata'].get('platforms', [])), }) return pd.DataFrame(rows) # Create the Gradio interface with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo: gr.Markdown("# Multimodal AI Taxonomy Explorer") with gr.Tabs(): # Overview tab with gr.Tab("Overview"): gr.HTML(create_overview_page()) # Video Generation with gr.Tab("Video"): with gr.Tabs(): with gr.Tab("Creation"): gr.HTML(create_modality_page("video_generation", "creation")) with gr.Tab("Editing"): gr.HTML(create_modality_page("video_generation", "editing")) with gr.Tab("Comparison"): gr.Dataframe(create_comparison_table("video_generation"), wrap=True) # Audio Generation with gr.Tab("Audio"): with gr.Tabs(): with gr.Tab("Creation"): gr.HTML(create_modality_page("audio_generation", "creation")) with gr.Tab("Editing"): gr.HTML(create_modality_page("audio_generation", "editing")) with gr.Tab("Comparison"): gr.Dataframe(create_comparison_table("audio_generation"), wrap=True) # Image Generation with gr.Tab("Image"): with gr.Tabs(): with gr.Tab("Creation"): gr.HTML(create_modality_page("image_generation", "creation")) with gr.Tab("Editing"): gr.HTML(create_modality_page("image_generation", "editing")) with gr.Tab("Comparison"): gr.Dataframe(create_comparison_table("image_generation"), wrap=True) # Text Generation with gr.Tab("Text"): with gr.Tabs(): with gr.Tab("Creation"): gr.HTML(create_modality_page("text_generation", "creation")) with gr.Tab("Editing"): gr.HTML(create_modality_page("text_generation", "editing")) with gr.Tab("Comparison"): gr.Dataframe(create_comparison_table("text_generation"), wrap=True) # 3D Generation with gr.Tab("3D"): with gr.Tabs(): with gr.Tab("Creation"): gr.HTML(create_modality_page("3d_generation", "creation")) with gr.Tab("Editing"): gr.HTML(create_modality_page("3d_generation", "editing")) with gr.Tab("Comparison"): gr.Dataframe(create_comparison_table("3d_generation"), wrap=True) # About tab with gr.Tab("About"): gr.Markdown(""" ## About This Taxonomy This is an attempt to define a structured taxonomy for multimodal AI capabilities, organized by: - **Output Modality**: The primary type of content being generated (video, audio, image, text, 3D) - **Operation Type**: Whether the task involves creation (from scratch) or editing (modifying existing content) ### Key Features - **Structured Metadata**: Each modality includes input/output specs, characteristics, maturity level, use cases, platforms, and example models - **Fine-grained Classification**: Goes beyond simple input/output categorization to capture nuanced differences ### Data Schema Each modality entry includes: - Unique identifier and human-readable name - Input specifications (primary and secondary modalities) - Output specifications (with audio metadata for video outputs) - Characteristics (process type, audio handling, motion type, etc.) - Metadata (maturity level, use cases, platforms, example models) ### Dataset This visualization is powered by the [multimodal-ai-taxonomy](https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy) dataset on Hugging Face. ### Maturity Levels - **Mature**: Well-established, widely available, production-ready - **Emerging**: Growing adoption, increasingly stable - **Experimental**: Cutting-edge, limited availability, proof-of-concept """) if __name__ == "__main__": demo.launch()