Commit
·
80cfd1e
1
Parent(s):
62cd7ca
Update Space to work with modernized dataset format
Browse files

- Load dataset using new JSONL format (split="train")
- Reconstruct nested data structure from flattened records
- Add modality key mapping for consistent naming
- Parse JSON strings for characteristics and relationships
This update makes the Space compatible with the migrated dataset
that no longer uses a Python loading script.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
app.py
CHANGED
|
@@ -4,26 +4,68 @@ import json
|
|
| 4 |
import pandas as pd
|
| 5 |
|
| 6 |
# Load the dataset
|
| 7 |
-
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy")
|
| 8 |
|
| 9 |
-
# Extract taxonomy data
|
| 10 |
taxonomy_data = {}
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
# Define modality display names and emojis
|
| 29 |
MODALITY_INFO = {
|
|
|
|
import pandas as pd

# Load the dataset (modernized JSONL format publishes a single "train" split).
dataset = load_dataset("danielrosehill/multimodal-ai-taxonomy", split="train")

# Map flat `output_modality` values onto the keys used in MODALITY_INFO.
# Hoisted to module level: the mapping is loop-invariant, so there is no
# reason to rebuild the dict literal for every record.
_MODALITY_KEY_MAP = {
    "video": "video_generation",
    "audio": "audio_generation",
    "image": "image_generation",
    "text": "text_generation",
    "3d": "3d_generation",
    "3d-model": "3d_generation",
}


def _reconstruct_taxonomy(records):
    """Rebuild the nested taxonomy structure from flattened dataset records.

    Each record is a flat dict produced by the dataset migration. Records are
    regrouped as::

        {modality_key: {operation_type: {"description": ..., "modalities": [...]}}}

    Args:
        records: iterable of flat record dicts (one per modality). Expected
            keys include 'output_modality', 'operation_type', 'id', 'name',
            the 'input_*'/'output_*'/'metadata_*' columns, and the JSON-string
            columns 'characteristics' and 'relationships'.

    Returns:
        dict: the nested taxonomy, keyed by modality key then operation type.

    Raises:
        KeyError: if a record is missing one of the required flat columns.
        json.JSONDecodeError: if 'characteristics' or 'relationships' holds
            a non-empty string that is not valid JSON.
    """
    taxonomy = {}
    for record in records:
        output_modality = record['output_modality']
        operation_type = record['operation_type']

        # Unknown modalities fall back to "<modality>_generation" so new
        # dataset values do not crash the reconstruction.
        modality_key = _MODALITY_KEY_MAP.get(
            output_modality, f"{output_modality}_generation"
        )

        # Initialize the nested structure on first sight of each key pair.
        group = taxonomy.setdefault(modality_key, {})
        if operation_type not in group:
            group[operation_type] = {
                "description": f"{output_modality.title()} {operation_type} modalities",
                "outputModality": output_modality,
                "operationType": operation_type,
                "modalities": [],
            }

        # Reconstruct the nested modality object from the flat columns.
        # 'characteristics' and 'relationships' are stored as JSON strings;
        # empty or null values become empty dicts.
        modality_obj = {
            "id": record['id'],
            "name": record['name'],
            "input": {
                "primary": record['input_primary'],
                "secondary": record['input_secondary'],
            },
            "output": {
                "primary": record['output_primary'],
                "audio": record['output_audio'],
            },
            "characteristics": json.loads(record['characteristics']) if record['characteristics'] else {},
            "metadata": {
                "maturityLevel": record['metadata_maturity_level'],
                "commonUseCases": record['metadata_common_use_cases'],
                "platforms": record['metadata_platforms'],
                "exampleModels": record['metadata_example_models'],
            },
            "relationships": json.loads(record['relationships']) if record['relationships'] else {},
        }

        # Only emit "audioType" when the record actually carries audio and
        # declares a type ('output_audio_type' may be absent entirely).
        if record['output_audio'] and record.get('output_audio_type'):
            modality_obj["output"]["audioType"] = record['output_audio_type']

        group[operation_type]["modalities"].append(modality_obj)

    return taxonomy


# Extract taxonomy data and reconstruct the nested structure.
taxonomy_data = _reconstruct_taxonomy(dataset)
| 69 |
|
| 70 |
# Define modality display names and emojis
|
| 71 |
MODALITY_INFO = {
|