danielrosehill committed
Commit 597e3a5 · 1 Parent(s): 80cfd1e
Files changed (2)
  1. README.md +15 -2
  2. app.py +31 -30
README.md CHANGED
@@ -7,7 +7,20 @@ sdk: gradio
  sdk_version: 5.49.1
  app_file: app.py
  pinned: false
- short_description: Taxonomy of AI modalities
+ short_description: An attempt to define a taxonomy for multimodal AI capabilities
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Multimodal AI Taxonomy
+
+ An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
+
+ Dataset repository: https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy
+
+ This Space provides an interactive explorer for browsing and comparing different multimodal AI capabilities across:
+ - Video Generation
+ - Audio Generation
+ - Image Generation
+ - Text Generation
+ - 3D Generation
+
+ Each modality is categorized into Creation (generating new content) and Editing (modifying existing content) operations.
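
For anyone who wants to work with the underlying data rather than the Space UI, the dataset repository referenced in the README can presumably be loaded with the Hugging Face `datasets` library. The sketch below is illustrative only: the split name and record fields are assumptions inferred from how app.py consumes the data, not a documented schema.

```python
from datasets import load_dataset

# Load the taxonomy dataset referenced in the README above.
# The split name "train" is an assumption; adjust to whatever the repo actually exposes.
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train")

# app.py groups records by output modality and operation type, so fields along those
# lines are expected in each record (exact field names are not shown in this diff).
print(dataset[0])
```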
app.py CHANGED
@@ -67,13 +67,13 @@ for record in dataset:
  # Add to taxonomy data
  taxonomy_data[modality_key][operation_type]["modalities"].append(modality_obj)

- # Define modality display names and emojis
+ # Define modality display names
  MODALITY_INFO = {
- "video_generation": {"name": "Video Generation", "emoji": "🎬", "color": "#FF6B6B"},
- "audio_generation": {"name": "Audio Generation", "emoji": "🎵", "color": "#4ECDC4"},
- "image_generation": {"name": "Image Generation", "emoji": "🖼️", "color": "#95E1D3"},
- "text_generation": {"name": "Text Generation", "emoji": "📝", "color": "#F38181"},
- "3d_generation": {"name": "3D Generation", "emoji": "🎨", "color": "#AA96DA"},
+ "video_generation": {"name": "Video Generation", "color": "#FF6B6B"},
+ "audio_generation": {"name": "Audio Generation", "color": "#4ECDC4"},
+ "image_generation": {"name": "Image Generation", "color": "#95E1D3"},
+ "text_generation": {"name": "Text Generation", "color": "#F38181"},
+ "3d_generation": {"name": "3D Generation", "color": "#AA96DA"},
  }

  # CSS for styling
@@ -181,26 +181,26 @@ def create_modality_card(modality_obj):
  </div>

  <div class="modality-meta">
- <p><strong>🔹 Input</strong><br>{input_str}</p>
- <p><strong>🔸 Output</strong><br>**Primary:** {output_primary}{audio_info}</p>
+ <p><strong>Input</strong><br>{input_str}</p>
+ <p><strong>Output</strong><br>**Primary:** {output_primary}{audio_info}</p>
  </div>

  <details>
- <summary><strong>📊 Characteristics</strong></summary>
+ <summary><strong>Characteristics</strong></summary>
  <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
  {char_str}
  </div>
  </details>

  <details>
- <summary><strong>💡 Common Use Cases</strong></summary>
+ <summary><strong>Common Use Cases</strong></summary>
  <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
  {use_case_str}
  </div>
  </details>

  <details>
- <summary><strong>🛠️ Platforms & Models</strong></summary>
+ <summary><strong>Platforms & Models</strong></summary>
  <div style="margin: 10px; padding: 10px; background: #fafafa; border-radius: 5px;">
  <p><strong>Platforms:</strong> {platform_str}</p>
  <p><strong>Example Models:</strong> {model_str}</p>
@@ -217,7 +217,7 @@ def create_overview_page():

  total_modalities = 0
  for modality_key, operations in taxonomy_data.items():
- info = MODALITY_INFO.get(modality_key, {"name": modality_key, "emoji": "📦", "color": "#666"})
+ info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"})

  creation_count = len(operations.get('creation', {}).get('modalities', []))
  editing_count = len(operations.get('editing', {}).get('modalities', []))
@@ -226,7 +226,6 @@ def create_overview_page():

  stats_html += f"""
  <div class="stat-box" style="border-left: 4px solid {info['color']};">
- <div style="font-size: 2em;">{info['emoji']}</div>
  <div style="font-size: 1.2em; font-weight: bold; margin: 10px 0;">{info['name']}</div>
  <div style="font-size: 0.9em; color: #666;">
  Creation: {creation_count} | Editing: {editing_count}
@@ -241,20 +240,23 @@ def create_overview_page():

  overview_html = f"""
  <div style="text-align: center; padding: 30px;">
- <h1>🎯 Multimodal AI Taxonomy</h1>
+ <h1>Multimodal AI Taxonomy</h1>
  <p style="font-size: 1.2em; color: #666; max-width: 800px; margin: 20px auto;">
- A comprehensive taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
+ An attempt to define a structured taxonomy for multimodal generative AI capabilities, organized by output modality and operation type.
+ </p>
+ <p style="font-size: 1em; color: #666; max-width: 800px; margin: 20px auto;">
+ Dataset repository: <a href="https://huggingface.co/datasets/danielrosehill/multimodal-ai-taxonomy" target="_blank">danielrosehill/multimodal-ai-taxonomy</a>
  </p>
  <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 20px; border-radius: 15px; margin: 20px auto; max-width: 300px;">
  <div style="font-size: 3em; font-weight: bold;">{total_modalities}</div>
- <div style="font-size: 1.2em;">Total Modalities</div>
+ <div style="font-size: 1.2em;">Total Modalities Defined</div>
  </div>
  </div>

  {stats_html}

  <div style="margin: 30px; padding: 20px; background: #f0f7ff; border-radius: 10px; border-left: 4px solid #2196F3;">
- <h3>📖 How to Use This Space</h3>
+ <h3>How to Use This Space</h3>
  <p>Navigate through the tabs above to explore different output modalities (Video, Audio, Image, Text, 3D).</p>
  <p>Each modality is organized into <strong>Creation</strong> (generating new content) and <strong>Editing</strong> (modifying existing content) operations.</p>
  <p>Click on the details sections to expand and see characteristics, use cases, platforms, and example models.</p>
@@ -275,14 +277,14 @@ def create_modality_page(modality_key, operation_type):
  data = taxonomy_data[modality_key][operation_type]
  modalities = data.get('modalities', [])

- info = MODALITY_INFO.get(modality_key, {"name": modality_key, "emoji": "📦", "color": "#666"})
+ info = MODALITY_INFO.get(modality_key, {"name": modality_key, "color": "#666"})

  html = f"""
  <div style="text-align: center; padding: 20px; background: linear-gradient(135deg, {info['color']}22 0%, {info['color']}44 100%); border-radius: 15px; margin-bottom: 20px;">
- <h2>{info['emoji']} {info['name']} - {operation_type.title()}</h2>
+ <h2>{info['name']} - {operation_type.title()}</h2>
  <p style="color: #666;">{data.get('description', '')}</p>
  <div style="font-size: 1.5em; font-weight: bold; color: {info['color']}; margin-top: 10px;">
- {len(modalities)} modalities
+ {len(modalities)} modalities defined
  </div>
  </div>
  """
@@ -317,15 +319,15 @@ def create_comparison_table(modality_key):
  # Create the Gradio interface
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:

- gr.Markdown("# 🎯 Multimodal AI Taxonomy Explorer")
+ gr.Markdown("# Multimodal AI Taxonomy Explorer")

  with gr.Tabs():
  # Overview tab
- with gr.Tab("🏠 Overview"):
+ with gr.Tab("Overview"):
  gr.HTML(create_overview_page())

  # Video Generation
- with gr.Tab("🎬 Video"):
+ with gr.Tab("Video"):
  with gr.Tabs():
  with gr.Tab("Creation"):
  gr.HTML(create_modality_page("video_generation", "creation"))
@@ -335,7 +337,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
  gr.Dataframe(create_comparison_table("video_generation"), wrap=True)

  # Audio Generation
- with gr.Tab("🎵 Audio"):
+ with gr.Tab("Audio"):
  with gr.Tabs():
  with gr.Tab("Creation"):
  gr.HTML(create_modality_page("audio_generation", "creation"))
@@ -345,7 +347,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
  gr.Dataframe(create_comparison_table("audio_generation"), wrap=True)

  # Image Generation
- with gr.Tab("🖼️ Image"):
+ with gr.Tab("Image"):
  with gr.Tabs():
  with gr.Tab("Creation"):
  gr.HTML(create_modality_page("image_generation", "creation"))
@@ -355,7 +357,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
  gr.Dataframe(create_comparison_table("image_generation"), wrap=True)

  # Text Generation
- with gr.Tab("📝 Text"):
+ with gr.Tab("Text"):
  with gr.Tabs():
  with gr.Tab("Creation"):
  gr.HTML(create_modality_page("text_generation", "creation"))
@@ -365,7 +367,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
  gr.Dataframe(create_comparison_table("text_generation"), wrap=True)

  # 3D Generation
- with gr.Tab("🎨 3D"):
+ with gr.Tab("3D"):
  with gr.Tabs():
  with gr.Tab("Creation"):
  gr.HTML(create_modality_page("3d_generation", "creation"))
@@ -375,18 +377,17 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
  gr.Dataframe(create_comparison_table("3d_generation"), wrap=True)

  # About tab
- with gr.Tab("ℹ️ About"):
+ with gr.Tab("About"):
  gr.Markdown("""
  ## About This Taxonomy

- This taxonomy provides a structured classification of multimodal AI capabilities, organized by:
+ This is an attempt to define a structured taxonomy for multimodal AI capabilities, organized by:

  - **Output Modality**: The primary type of content being generated (video, audio, image, text, 3D)
  - **Operation Type**: Whether the task involves creation (from scratch) or editing (modifying existing content)

  ### Key Features

- - **Comprehensive Coverage**: Covers all major multimodal AI capabilities
  - **Structured Metadata**: Each modality includes input/output specs, characteristics, maturity level, use cases, platforms, and example models
  - **Fine-grained Classification**: Goes beyond simple input/output categorization to capture nuanced differences
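
For orientation, the nested structure that app.py appears to build from the dataset can be sketched as follows. This is inferred from the access patterns visible in the diff (`taxonomy_data[modality_key][operation_type]["modalities"]`, `data.get('description', '')`, and the creation/editing counts in `create_overview_page()`); all concrete values are hypothetical placeholders.

```python
# Hypothetical sketch of taxonomy_data as assembled by app.py. Only the keys that
# appear in the diff are known; the values below are placeholders.
taxonomy_data = {
    "video_generation": {
        "creation": {
            "description": "Generating new video content",      # placeholder
            "modalities": [{"name": "Text-to-Video"}],           # placeholder record
        },
        "editing": {
            "description": "Modifying existing video content",  # placeholder
            "modalities": [],
        },
    },
}

# Mirrors the counting in create_overview_page(): creation + editing entries per modality.
total_modalities = sum(
    len(ops.get(op, {}).get("modalities", []))
    for ops in taxonomy_data.values()
    for op in ("creation", "editing")
)
print(total_modalities)  # 1 for this placeholder data
```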