frascuchon (HF Staff) committed
Commit cafce31 · 1 Parent(s): fdf97e3

Add more music tools

mcp_server.py CHANGED
@@ -11,6 +11,18 @@ from tools.stems_separation import (
11
  )
12
  from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
13
  from tools.youtube_extract import extract_audio_from_youtube
14
+ from tools.audio_cutting import (
15
+ cut_audio,
16
+ mute_time_windows,
17
+ extract_segments,
18
+ trim_audio,
19
+ )
20
+ from tools.music_understanding import (
21
+ understand_music,
22
+ analyze_music_structure,
23
+ suggest_cutting_points,
24
+ analyze_genre_and_style,
25
+ )
26
 
27
 
28
  def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
@@ -406,6 +418,247 @@ def create_interface():
418
  flagging_mode="never",
419
  )
420
 
421
+ # Tab 12: Audio Cutting
422
+ cut_interface = gr.Interface(
423
+ fn=cut_audio,
424
+ inputs=[
425
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
426
+ gr.Number(value=0.0, label="Start Time (seconds)"),
427
+ gr.Number(value=10.0, label="End Time (seconds)"),
428
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
429
+ ],
430
+ outputs=gr.Audio(label="Cut Audio", type="filepath"),
431
+ title="Cut Audio Segment",
432
+ description="Extract a segment from an audio file between specified start and end times.",
433
+ examples=None,
434
+ cache_examples=False,
435
+ flagging_mode="never",
436
+ )
437
+
438
+ # Tab 13: Mute Time Windows
439
+ def mute_time_windows_wrapper(audio_path, windows_str, format_val):
440
+ try:
441
+ windows = eval(windows_str) if windows_str else []
442
+ return mute_time_windows(
443
+ audio_path=audio_path, mute_windows=windows, output_format=format_val
444
+ )
445
+ except Exception:
446
+ return None
447
+
448
+ mute_interface = gr.Interface(
449
+ fn=mute_time_windows_wrapper,
450
+ inputs=[
451
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
452
+ gr.Textbox(
453
+ value="[[1.0, 2.0], [3.0, 4.0]]",
454
+ label="Mute Windows (JSON format)",
455
+ placeholder="[[start1, end1], [start2, end2]]",
456
+ ),
457
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
458
+ ],
459
+ outputs=gr.Audio(label="Muted Audio", type="filepath"),
460
+ title="Mute Time Windows",
461
+ description="Mute specific time windows in an audio file with smooth fade transitions.",
462
+ examples=None,
463
+ cache_examples=False,
464
+ flagging_mode="never",
465
+ )
466
+
467
+ # Tab 14: Extract Segments
468
+ def extract_segments_wrapper(audio_path, segments_str, format_val, join):
469
+ try:
470
+ segments = eval(segments_str) if segments_str else []
471
+ result = extract_segments(
472
+ audio_path=audio_path,
473
+ segments=segments,
474
+ output_format=format_val,
475
+ join_segments=join,
476
+ )
477
+ # If result is a list, return the first item for Gradio
478
+ if isinstance(result, list):
479
+ return result[0] if result else None
480
+ return result
481
+ except Exception:
482
+ return None
483
+
484
+ extract_interface = gr.Interface(
485
+ fn=extract_segments_wrapper,
486
+ inputs=[
487
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
488
+ gr.Textbox(
489
+ value="[[0.0, 1.0], [2.0, 3.0]]",
490
+ label="Segments (JSON format)",
491
+ placeholder="[[start1, end1], [start2, end2]]",
492
+ ),
493
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
494
+ gr.Checkbox(value=False, label="Join Segments"),
495
+ ],
496
+ outputs=gr.Audio(label="Extracted Segments", type="filepath"),
497
+ title="Extract Segments",
498
+ description="Extract multiple segments from an audio file.",
499
+ examples=None,
500
+ cache_examples=False,
501
+ flagging_mode="never",
502
+ )
503
+
504
+ # Tab 15: Trim Audio
505
+ trim_interface = gr.Interface(
506
+ fn=trim_audio,
507
+ inputs=[
508
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
509
+ gr.Number(value=None, label="Trim Start (seconds, leave empty to skip)"),
510
+ gr.Number(value=None, label="Trim End (seconds, leave empty to skip)"),
511
+ gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
512
+ ],
513
+ outputs=gr.Audio(label="Trimmed Audio", type="filepath"),
514
+ title="Trim Audio",
515
+ description="Trim audio from the beginning and/or end.",
516
+ examples=None,
517
+ cache_examples=False,
518
+ flagging_mode="never",
519
+ )
520
+
521
+ # Tab 16: Music Understanding
522
+ def understand_music_wrapper(audio_path, prompt):
523
+ try:
524
+ result = understand_music(audio_path=audio_path, prompt_text=prompt)
525
+ if result["status"] == "success":
526
+ return result["analysis"]
527
+ else:
528
+ return f"Error: {result.get('error', 'Unknown error')}"
529
+ except Exception as e:
530
+ return f"Error: {str(e)}"
531
+
532
+ understand_interface = gr.Interface(
533
+ fn=understand_music_wrapper,
534
+ inputs=[
535
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
536
+ gr.Textbox(
537
+ value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
538
+ label="Analysis Prompt",
539
+ lines=3,
540
+ ),
541
+ ],
542
+ outputs=gr.Textbox(label="Music Analysis", lines=10),
543
+ title="Music Understanding (AI)",
544
+ description="Analyze music using NVIDIA's Music-Flamingo Audio Language Model.",
545
+ examples=None,
546
+ cache_examples=False,
547
+ flagging_mode="never",
548
+ )
549
+
550
+ # Tab 17: Song Structure Analysis
551
+ def analyze_music_structure_wrapper(audio_path):
552
+ try:
553
+ result = analyze_music_structure(audio_path=audio_path)
554
+ if result["status"] == "success":
555
+ return result["analysis"]
556
+ else:
557
+ return f"Error: {result.get('error', 'Unknown error')}"
558
+ except Exception as e:
559
+ return f"Error: {str(e)}"
560
+
561
+ structure_interface = gr.Interface(
562
+ fn=analyze_music_structure_wrapper,
563
+ inputs=[
564
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
565
+ ],
566
+ outputs=gr.Textbox(label="Structure Analysis", lines=10),
567
+ title="Song Structure Analysis",
568
+ description="Analyze song structure and identify sections (verse, chorus, bridge, etc.).",
569
+ examples=None,
570
+ cache_examples=False,
571
+ flagging_mode="never",
572
+ )
573
+
574
+ # Tab 18: Cutting Points Suggestions
575
+ def suggest_cutting_points_wrapper(audio_path, purpose):
576
+ try:
577
+ result = suggest_cutting_points(audio_path=audio_path, purpose=purpose)
578
+ if result["status"] == "success":
579
+ return result["analysis"]
580
+ else:
581
+ return f"Error: {result.get('error', 'Unknown error')}"
582
+ except Exception as e:
583
+ return f"Error: {str(e)}"
584
+
585
+ cutting_points_interface = gr.Interface(
586
+ fn=suggest_cutting_points_wrapper,
587
+ inputs=[
588
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
589
+ gr.Dropdown(
590
+ choices=["general", "dj_mix", "social_media", "ringtone"],
591
+ value="general",
592
+ label="Purpose",
593
+ ),
594
+ ],
595
+ outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10),
596
+ title="AI Cutting Point Suggestions",
597
+ description="Get AI-suggested optimal cutting points for different purposes.",
598
+ examples=None,
599
+ cache_examples=False,
600
+ flagging_mode="never",
601
+ )
602
+
603
+ # Tab 19: Genre and Style Analysis
604
+ def analyze_genre_and_style_wrapper(audio_path):
605
+ try:
606
+ result = analyze_genre_and_style(audio_path=audio_path)
607
+ if result["status"] == "success":
608
+ return result["analysis"]
609
+ else:
610
+ return f"Error: {result.get('error', 'Unknown error')}"
611
+ except Exception as e:
612
+ return f"Error: {str(e)}"
613
+
614
+ genre_interface = gr.Interface(
615
+ fn=analyze_genre_and_style_wrapper,
616
+ inputs=[
617
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
618
+ ],
619
+ outputs=gr.Textbox(label="Genre & Style Analysis", lines=10),
620
+ title="Genre & Style Analysis",
621
+ description="Detailed analysis of genre, production style, and instrumentation.",
622
+ examples=None,
623
+ cache_examples=False,
624
+ flagging_mode="never",
625
+ )
626
+
627
+ # Tab 18: Cutting Points Suggestions
628
+ cutting_points_interface = gr.Interface(
629
+ fn=lambda audio, purpose: suggest_cutting_points(
630
+ audio_path=audio, purpose=purpose
631
+ ),
632
+ inputs=[
633
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
634
+ gr.Dropdown(
635
+ choices=["general", "dj_mix", "social_media", "ringtone"],
636
+ value="general",
637
+ label="Purpose",
638
+ ),
639
+ ],
640
+ outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10),
641
+ title="AI Cutting Point Suggestions",
642
+ description="Get AI-suggested optimal cutting points for different purposes.",
643
+ examples=None,
644
+ cache_examples=False,
645
+ flagging_mode="never",
646
+ )
647
+
648
+ # Tab 19: Genre and Style Analysis
649
+ genre_interface = gr.Interface(
650
+ fn=analyze_genre_and_style,
651
+ inputs=[
652
+ gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
653
+ ],
654
+ outputs=gr.Textbox(label="Genre & Style Analysis", lines=10),
655
+ title="Genre & Style Analysis",
656
+ description="Detailed analysis of genre, production style, and instrumentation.",
657
+ examples=None,
658
+ cache_examples=False,
659
+ flagging_mode="never",
660
+ )
661
+
662
  return gr.TabbedInterface(
663
  [
664
  stem_interface,
@@ -419,6 +672,14 @@ def create_interface():
672
  medley_interface,
673
  audio_info_interface,
674
  youtube_interface,
675
+ cut_interface,
676
+ mute_interface,
677
+ extract_interface,
678
+ trim_interface,
679
+ understand_interface,
680
+ structure_interface,
681
+ cutting_points_interface,
682
+ genre_interface,
683
  ],
684
  [
685
  "Stem Separation",
@@ -432,6 +693,14 @@ def create_interface():
693
  "Medley Creation",
694
  "Audio Information",
695
  "YouTube Extraction",
696
+ "Audio Cutting",
697
+ "Mute Windows",
698
+ "Extract Segments",
699
+ "Trim Audio",
700
+ "Music Understanding",
701
+ "Song Structure",
702
+ "Cutting Points",
703
+ "Genre Analysis",
704
  ],
705
  )
706
 
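A note on the new wrappers above: mute_time_windows_wrapper and extract_segments_wrapper parse the user-supplied window list with eval(). A safer equivalent is sketched below purely for illustration; parse_time_windows is a hypothetical helper, not part of this commit. It goes through json.loads, which accepts the same "[[start, end], ...]" strings shown in the textbox defaults while rejecting arbitrary expressions.

import json
from typing import List, Tuple

def parse_time_windows(windows_str: str) -> List[Tuple[float, float]]:
    # Hypothetical replacement for the eval() calls in the wrappers above.
    # json.loads raises a ValueError subclass on malformed input instead of executing it.
    if not windows_str:
        return []
    return [(float(start), float(end)) for start, end in json.loads(windows_str)]

# Example: parse_time_windows("[[1.0, 2.0], [3.0, 4.0]]") -> [(1.0, 2.0), (3.0, 4.0)]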
requirements.txt CHANGED
@@ -13,4 +13,5 @@ ruff>=0.1.0
 mypy>=1.0.0
 smolagents[mcp]
 gradio[mcp]>=5.36.0
+gradio_client>=1.0.0
 yt_dlp>=2025.11.12
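gradio_client is the only new dependency; it backs the music-understanding tools added below, which call the hosted nvidia/music-flamingo Space. A minimal sketch of that call, assuming a local placeholder file named song.mp3 and network access to the Space:

from gradio_client import Client, handle_file

# Mirrors the request made inside tools/music_understanding.py (see below);
# "song.mp3" is a placeholder path, not a file shipped with this repo.
client = Client("nvidia/music-flamingo")
analysis = client.predict(
    audio_path=handle_file("song.mp3"),
    youtube_url="",
    prompt_text="Describe this track in full detail.",
    api_name="/infer",
)
print(analysis)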
tools/audio_cutting.py ADDED
@@ -0,0 +1,616 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import List, Optional, Tuple, Union
4
+
5
+ import librosa
6
+ import numpy as np
7
+ import soundfile as sf
8
+
9
+ from .audio_info import validate_audio_path
10
+
11
+
12
+ def cut_audio(
13
+ audio_path: str,
14
+ start_time: float,
15
+ end_time: float,
16
+ output_path: Optional[str] = None,
17
+ output_format: str = "wav",
18
+ ) -> str:
19
+ """
20
+ Cut a segment from an audio file between specified start and end times.
21
+
22
+ Args:
23
+ audio_path: Path to input audio file
24
+ start_time: Start time in seconds
25
+ end_time: End time in seconds
26
+ output_path: Optional output directory (default: None, uses current directory)
27
+ output_format: Output format ('wav' or 'mp3', default: 'wav')
28
+
29
+ Returns:
30
+ Path to the cut audio file
31
+
32
+ Raises:
33
+ ValueError: If start_time >= end_time or times are out of range
34
+ FileNotFoundError: If audio file doesn't exist
35
+ """
36
+ try:
37
+ # Validate audio path
38
+ validated_path = validate_audio_path(audio_path)
39
+
40
+ # Load audio
41
+ y, sr = librosa.load(validated_path, sr=None, mono=False)
42
+
43
+ # Get audio duration
44
+ duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
45
+
46
+ # Validate time range
47
+ if start_time >= end_time:
48
+ raise ValueError(
49
+ f"Start time ({start_time}s) must be less than end time ({end_time}s)"
50
+ )
51
+
52
+ if start_time < 0:
53
+ raise ValueError(f"Start time ({start_time}s) cannot be negative")
54
+
55
+ if end_time > duration:
56
+ raise ValueError(
57
+ f"End time ({end_time}s) exceeds audio duration ({duration:.2f}s)"
58
+ )
59
+
60
+ # Convert time to sample indices
61
+ start_sample = int(start_time * sr)
62
+ end_sample = int(end_time * sr)
63
+
64
+ # Cut the audio segment
65
+ if y.ndim == 1:
66
+ # Mono audio
67
+ y_cut = y[start_sample:end_sample]
68
+ else:
69
+ # Multi-channel audio
70
+ y_cut = y[:, start_sample:end_sample]
71
+
72
+ # Generate output filename
73
+ if not output_path:
74
+ output_path = "."
75
+ os.makedirs(output_path, exist_ok=True)
76
+
77
+ original_filename = Path(validated_path).stem
78
+ output_filename = f"{original_filename}_cut_{start_time:.1f}s_to_{end_time:.1f}s.{output_format.lower()}"
79
+ output_file_path = os.path.join(output_path, output_filename)
80
+
81
+ # Save the cut audio
82
+ if y_cut.ndim == 2:
83
+ y_cut = y_cut.T # Transpose for soundfile
84
+
85
+ if output_format.lower() == "mp3":
86
+ # For MP3, use ffmpeg through subprocess
87
+ import tempfile
88
+ import subprocess
89
+
90
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
91
+ sf.write(temp_wav.name, y_cut, sr)
92
+
93
+ cmd = [
94
+ "ffmpeg",
95
+ "-y",
96
+ "-i",
97
+ temp_wav.name,
98
+ "-c:a",
99
+ "libmp3lame",
100
+ "-b:a",
101
+ "192k",
102
+ output_file_path,
103
+ ]
104
+ subprocess.run(cmd, capture_output=True, check=True)
105
+ os.unlink(temp_wav.name)
106
+ else:
107
+ sf.write(output_file_path, y_cut, sr)
108
+
109
+ return output_file_path
110
+
111
+ except Exception as e:
112
+ raise RuntimeError(f"Error cutting audio: {str(e)}")
113
+
114
+
115
+ def mute_time_windows(
116
+ audio_path: str,
117
+ mute_windows: List[Tuple[float, float]],
118
+ output_path: Optional[str] = None,
119
+ output_format: str = "wav",
120
+ fade_duration: float = 0.1,
121
+ ) -> str:
122
+ """
123
+ Mute specific time windows in an audio file.
124
+
125
+ Args:
126
+ audio_path: Path to input audio file
127
+ mute_windows: List of (start_time, end_time) tuples in seconds
128
+ output_path: Optional output directory (default: None, uses current directory)
129
+ output_format: Output format ('wav' or 'mp3', default: 'wav')
130
+ fade_duration: Fade in/out duration in seconds for smooth transitions (default: 0.1s)
131
+
132
+ Returns:
133
+ Path to the processed audio file with muted sections
134
+
135
+ Raises:
136
+ ValueError: If mute windows are invalid or overlapping
137
+ """
138
+ try:
139
+ # Validate audio path
140
+ validated_path = validate_audio_path(audio_path)
141
+
142
+ # Load audio
143
+ y, sr = librosa.load(validated_path, sr=None, mono=False)
144
+
145
+ # Get audio duration
146
+ duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
147
+
148
+ # Validate and sort mute windows
149
+ sorted_windows = sorted(mute_windows, key=lambda x: x[0])
150
+
151
+ for i, (start, end) in enumerate(sorted_windows):
152
+ if start >= end:
153
+ raise ValueError(
154
+ f"Window {i}: start time ({start}s) must be less than end time ({end}s)"
155
+ )
156
+ if start < 0 or end > duration:
157
+ raise ValueError(
158
+ f"Window {i}: time range ({start}s-{end}s) outside audio duration (0-{duration:.2f}s)"
159
+ )
160
+
161
+ # Check for overlaps
162
+ if i > 0:
163
+ prev_start, prev_end = sorted_windows[i - 1]
164
+ if start < prev_end:
165
+ raise ValueError(f"Window {i} overlaps with previous window")
166
+
167
+ # Create a copy of the audio for processing
168
+ y_processed = y.copy()
169
+
170
+ # Apply muting with fade in/out
171
+ for start_time, end_time in sorted_windows:
172
+ start_sample = int(start_time * sr)
173
+ end_sample = int(end_time * sr)
174
+ fade_samples = int(fade_duration * sr)
175
+
176
+ if y_processed.ndim == 1:
177
+ # Mono audio
178
+ # Apply fade out before mute
179
+ fade_start = max(0, start_sample - fade_samples)
180
+ if fade_start < start_sample:
181
+ fade_out = np.linspace(1, 0, start_sample - fade_start)
182
+ y_processed[fade_start:start_sample] *= fade_out
183
+
184
+ # Apply mute
185
+ y_processed[start_sample:end_sample] = 0
186
+
187
+ # Apply fade in after mute
188
+ fade_end = min(len(y_processed), end_sample + fade_samples)
189
+ if end_sample < fade_end:
190
+ fade_in = np.linspace(0, 1, fade_end - end_sample)
191
+ y_processed[end_sample:fade_end] *= fade_in
192
+ else:
193
+ # Multi-channel audio
194
+ # Apply fade out before mute
195
+ fade_start = max(0, start_sample - fade_samples)
196
+ if fade_start < start_sample:
197
+ fade_out = np.linspace(1, 0, start_sample - fade_start)
198
+ y_processed[:, fade_start:start_sample] *= fade_out[np.newaxis, :]
199
+
200
+ # Apply mute
201
+ y_processed[:, start_sample:end_sample] = 0
202
+
203
+ # Apply fade in after mute
204
+ fade_end = min(y_processed.shape[1], end_sample + fade_samples)
205
+ if end_sample < fade_end:
206
+ fade_in = np.linspace(0, 1, fade_end - end_sample)
207
+ y_processed[:, end_sample:fade_end] *= fade_in[np.newaxis, :]
208
+
209
+ # Generate output filename
210
+ if not output_path:
211
+ output_path = "."
212
+ os.makedirs(output_path, exist_ok=True)
213
+
214
+ original_filename = Path(validated_path).stem
215
+ windows_str = "_".join([f"{s:.1f}-{e:.1f}" for s, e in sorted_windows[:3]])
216
+ if len(sorted_windows) > 3:
217
+ windows_str += f"_and_{len(sorted_windows) - 3}_more"
218
+
219
+ output_filename = (
220
+ f"{original_filename}_muted_{windows_str}.{output_format.lower()}"
221
+ )
222
+ output_file_path = os.path.join(output_path, output_filename)
223
+
224
+ # Save the processed audio
225
+ if y_processed.ndim == 2:
226
+ y_processed = y_processed.T # Transpose for soundfile
227
+
228
+ if output_format.lower() == "mp3":
229
+ # For MP3, use ffmpeg through subprocess
230
+ import tempfile
231
+ import subprocess
232
+
233
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
234
+ sf.write(temp_wav.name, y_processed, sr)
235
+
236
+ cmd = [
237
+ "ffmpeg",
238
+ "-y",
239
+ "-i",
240
+ temp_wav.name,
241
+ "-c:a",
242
+ "libmp3lame",
243
+ "-b:a",
244
+ "192k",
245
+ output_file_path,
246
+ ]
247
+ subprocess.run(cmd, capture_output=True, check=True)
248
+ os.unlink(temp_wav.name)
249
+ else:
250
+ sf.write(output_file_path, y_processed, sr)
251
+
252
+ return output_file_path
253
+
254
+ except Exception as e:
255
+ raise RuntimeError(f"Error muting audio windows: {str(e)}")
256
+
257
+
258
+ def extract_segments(
259
+ audio_path: str,
260
+ segments: List[Tuple[float, float]],
261
+ output_path: Optional[str] = None,
262
+ output_format: str = "wav",
263
+ join_segments: bool = False,
264
+ ) -> Union[str, List[str]]:
265
+ """
266
+ Extract multiple segments from an audio file.
267
+
268
+ Args:
269
+ audio_path: Path to input audio file
270
+ segments: List of (start_time, end_time) tuples in seconds
271
+ output_path: Optional output directory (default: None, uses current directory)
272
+ output_format: Output format ('wav' or 'mp3', default: 'wav')
273
+ join_segments: If True, join all segments into one file; if False, save separately
274
+
275
+ Returns:
276
+ If join_segments=True: Path to joined audio file
277
+ If join_segments=False: List of paths to individual segment files
278
+
279
+ Raises:
280
+ ValueError: If segments are invalid
281
+ """
282
+ try:
283
+ # Validate audio path
284
+ validated_path = validate_audio_path(audio_path)
285
+
286
+ # Load audio
287
+ y, sr = librosa.load(validated_path, sr=None, mono=False)
288
+
289
+ # Get audio duration
290
+ duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
291
+
292
+ # Validate segments
293
+ for i, (start, end) in enumerate(segments):
294
+ if start >= end:
295
+ raise ValueError(
296
+ f"Segment {i}: start time ({start}s) must be less than end time ({end}s)"
297
+ )
298
+ if start < 0 or end > duration:
299
+ raise ValueError(
300
+ f"Segment {i}: time range ({start}s-{end}s) outside audio duration"
301
+ )
302
+
303
+ if not output_path:
304
+ output_path = "."
305
+ os.makedirs(output_path, exist_ok=True)
306
+
307
+ original_filename = Path(validated_path).stem
308
+
309
+ if join_segments:
310
+ # Join all segments into one file
311
+ segments_audio = []
312
+
313
+ for start_time, end_time in segments:
314
+ start_sample = int(start_time * sr)
315
+ end_sample = int(end_time * sr)
316
+
317
+ if y.ndim == 1:
318
+ segment = y[start_sample:end_sample]
319
+ else:
320
+ segment = y[:, start_sample:end_sample]
321
+
322
+ segments_audio.append(segment)
323
+
324
+ # Concatenate all segments
325
+ if y.ndim == 1:
326
+ y_joined = np.concatenate(segments_audio)
327
+ else:
328
+ y_joined = np.concatenate(segments_audio, axis=1)
329
+
330
+ # Save joined audio
331
+ output_filename = (
332
+ f"{original_filename}_segments_joined.{output_format.lower()}"
333
+ )
334
+ output_file_path = os.path.join(output_path, output_filename)
335
+
336
+ if y_joined.ndim == 2:
337
+ y_joined = y_joined.T
338
+
339
+ if output_format.lower() == "mp3":
340
+ import tempfile
341
+ import subprocess
342
+
343
+ with tempfile.NamedTemporaryFile(
344
+ suffix=".wav", delete=False
345
+ ) as temp_wav:
346
+ sf.write(temp_wav.name, y_joined, sr)
347
+
348
+ cmd = [
349
+ "ffmpeg",
350
+ "-y",
351
+ "-i",
352
+ temp_wav.name,
353
+ "-c:a",
354
+ "libmp3lame",
355
+ "-b:a",
356
+ "192k",
357
+ output_file_path,
358
+ ]
359
+ subprocess.run(cmd, capture_output=True, check=True)
360
+ os.unlink(temp_wav.name)
361
+ else:
362
+ sf.write(output_file_path, y_joined, sr)
363
+
364
+ return output_file_path
365
+ else:
366
+ # Save segments separately
367
+ segment_files = []
368
+
369
+ for i, (start_time, end_time) in enumerate(segments):
370
+ start_sample = int(start_time * sr)
371
+ end_sample = int(end_time * sr)
372
+
373
+ if y.ndim == 1:
374
+ segment = y[start_sample:end_sample]
375
+ else:
376
+ segment = y[:, start_sample:end_sample]
377
+
378
+ output_filename = f"{original_filename}_segment_{i + 1}_{start_time:.1f}s_to_{end_time:.1f}s.{output_format.lower()}"
379
+ output_file_path = os.path.join(output_path, output_filename)
380
+
381
+ if segment.ndim == 2:
382
+ segment = segment.T
383
+
384
+ if output_format.lower() == "mp3":
385
+ import tempfile
386
+ import subprocess
387
+
388
+ with tempfile.NamedTemporaryFile(
389
+ suffix=".wav", delete=False
390
+ ) as temp_wav:
391
+ sf.write(temp_wav.name, segment, sr)
392
+
393
+ cmd = [
394
+ "ffmpeg",
395
+ "-y",
396
+ "-i",
397
+ temp_wav.name,
398
+ "-c:a",
399
+ "libmp3lame",
400
+ "-b:a",
401
+ "192k",
402
+ output_file_path,
403
+ ]
404
+ subprocess.run(cmd, capture_output=True, check=True)
405
+ os.unlink(temp_wav.name)
406
+ else:
407
+ sf.write(output_file_path, segment, sr)
408
+
409
+ segment_files.append(output_file_path)
410
+
411
+ return segment_files
412
+
413
+ except Exception as e:
414
+ raise RuntimeError(f"Error extracting segments: {str(e)}")
415
+
416
+
417
+ def trim_audio(
418
+ audio_path: str,
419
+ trim_start: Optional[float] = None,
420
+ trim_end: Optional[float] = None,
421
+ output_path: Optional[str] = None,
422
+ output_format: str = "wav",
423
+ ) -> str:
424
+ """
425
+ Trim audio from the beginning and/or end.
426
+
427
+ Args:
428
+ audio_path: Path to input audio file
429
+ trim_start: Amount to trim from start in seconds (None = no trim from start)
430
+ trim_end: Amount to trim from end in seconds (None = no trim from end)
431
+ output_path: Optional output directory (default: None, uses current directory)
432
+ output_format: Output format ('wav' or 'mp3', default: 'wav')
433
+
434
+ Returns:
435
+ Path to the trimmed audio file
436
+
437
+ Raises:
438
+ ValueError: If trim amounts are invalid or exceed audio duration
439
+ """
440
+ try:
441
+ # Validate audio path
442
+ validated_path = validate_audio_path(audio_path)
443
+
444
+ # Load audio
445
+ y, sr = librosa.load(validated_path, sr=None, mono=False)
446
+
447
+ # Get audio duration
448
+ duration = len(y) / sr if y.ndim == 1 else len(y[0]) / sr
449
+
450
+ # Validate trim amounts
451
+ if trim_start is not None and trim_start < 0:
452
+ raise ValueError("Trim start amount cannot be negative")
453
+
454
+ if trim_end is not None and trim_end < 0:
455
+ raise ValueError("Trim end amount cannot be negative")
456
+
457
+ if trim_start is None:
458
+ trim_start = 0.0
459
+ if trim_end is None:
460
+ trim_end = 0.0
461
+
462
+ total_trim = trim_start + trim_end
463
+ if total_trim >= duration:
464
+ raise ValueError(
465
+ f"Total trim ({total_trim}s) exceeds or equals audio duration ({duration:.2f}s)"
466
+ )
467
+
468
+ # Calculate trim boundaries
469
+ start_sample = int(trim_start * sr)
470
+ if trim_end > 0:
471
+ end_sample = int((duration - trim_end) * sr)
472
+ else:
473
+ end_sample = len(y) if y.ndim == 1 else y.shape[1]
474
+
475
+ # Trim the audio
476
+ if y.ndim == 1:
477
+ y_trimmed = y[start_sample:end_sample]
478
+ else:
479
+ y_trimmed = y[:, start_sample:end_sample]
480
+
481
+ # Generate output filename
482
+ if not output_path:
483
+ output_path = "."
484
+ os.makedirs(output_path, exist_ok=True)
485
+
486
+ original_filename = Path(validated_path).stem
487
+ trim_parts = []
488
+ if trim_start > 0:
489
+ trim_parts.append(f"start_{trim_start:.1f}s")
490
+ if trim_end > 0:
491
+ trim_parts.append(f"end_{trim_end:.1f}s")
492
+
493
+ trim_str = "_".join(trim_parts) if trim_parts else "trimmed"
494
+ output_filename = f"{original_filename}_{trim_str}.{output_format.lower()}"
495
+ output_file_path = os.path.join(output_path, output_filename)
496
+
497
+ # Save the trimmed audio
498
+ if y_trimmed.ndim == 2:
499
+ y_trimmed = y_trimmed.T # Transpose for soundfile
500
+
501
+ if output_format.lower() == "mp3":
502
+ # For MP3, use ffmpeg through subprocess
503
+ import tempfile
504
+ import subprocess
505
+
506
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
507
+ sf.write(temp_wav.name, y_trimmed, sr)
508
+
509
+ cmd = [
510
+ "ffmpeg",
511
+ "-y",
512
+ "-i",
513
+ temp_wav.name,
514
+ "-c:a",
515
+ "libmp3lame",
516
+ "-b:a",
517
+ "192k",
518
+ output_file_path,
519
+ ]
520
+ subprocess.run(cmd, capture_output=True, check=True)
521
+ os.unlink(temp_wav.name)
522
+ else:
523
+ sf.write(output_file_path, y_trimmed, sr)
524
+
525
+ return output_file_path
526
+
527
+ except Exception as e:
528
+ raise RuntimeError(f"Error trimming audio: {str(e)}")
529
+
530
+
531
+ if __name__ == "__main__":
532
+ import argparse
533
+ import json
534
+
535
+ parser = argparse.ArgumentParser(description="Audio cutting and editing tools")
536
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
537
+
538
+ # Cut audio
539
+ cut_parser = subparsers.add_parser("cut", help="Cut audio segment")
540
+ cut_parser.add_argument("audio", help="Path to audio file")
541
+ cut_parser.add_argument("start", type=float, help="Start time in seconds")
542
+ cut_parser.add_argument("end", type=float, help="End time in seconds")
543
+ cut_parser.add_argument(
544
+ "--format", default="wav", choices=["wav", "mp3"], help="Output format"
545
+ )
546
+
547
+ # Mute windows
548
+ mute_parser = subparsers.add_parser("mute", help="Mute time windows")
549
+ mute_parser.add_argument("audio", help="Path to audio file")
550
+ mute_parser.add_argument("windows", help="JSON array of [start, end] pairs")
551
+ mute_parser.add_argument(
552
+ "--format", default="wav", choices=["wav", "mp3"], help="Output format"
553
+ )
554
+
555
+ # Extract segments
556
+ extract_parser = subparsers.add_parser("extract", help="Extract segments")
557
+ extract_parser.add_argument("audio", help="Path to audio file")
558
+ extract_parser.add_argument("segments", help="JSON array of [start, end] pairs")
559
+ extract_parser.add_argument(
560
+ "--join", action="store_true", help="Join segments into one file"
561
+ )
562
+ extract_parser.add_argument(
563
+ "--format", default="wav", choices=["wav", "mp3"], help="Output format"
564
+ )
565
+
566
+ # Trim audio
567
+ trim_parser = subparsers.add_parser("trim", help="Trim audio from start/end")
568
+ trim_parser.add_argument("audio", help="Path to audio file")
569
+ trim_parser.add_argument(
570
+ "--start", type=float, help="Trim amount from start in seconds"
571
+ )
572
+ trim_parser.add_argument(
573
+ "--end", type=float, help="Trim amount from end in seconds"
574
+ )
575
+ trim_parser.add_argument(
576
+ "--format", default="wav", choices=["wav", "mp3"], help="Output format"
577
+ )
578
+
579
+ args = parser.parse_args()
580
+
581
+ try:
582
+ if args.command == "cut":
583
+ output = cut_audio(
584
+ args.audio, args.start, args.end, output_format=args.format
585
+ )
586
+ print(f"Cut audio saved to: {output}")
587
+
588
+ elif args.command == "mute":
589
+ windows = json.loads(args.windows)
590
+ output = mute_time_windows(args.audio, windows, output_format=args.format)
591
+ print(f"Muted audio saved to: {output}")
592
+
593
+ elif args.command == "extract":
594
+ segments = json.loads(args.segments)
595
+ result = extract_segments(
596
+ args.audio, segments, join_segments=args.join, output_format=args.format
597
+ )
598
+ if args.join:
599
+ print(f"Joined segments saved to: {result}")
600
+ else:
601
+ print("Extracted segments:")
602
+ for i, segment_file in enumerate(result):
603
+ print(f" {i + 1}. {segment_file}")
604
+
605
+ elif args.command == "trim":
606
+ output = trim_audio(
607
+ args.audio, args.start, args.end, output_format=args.format
608
+ )
609
+ print(f"Trimmed audio saved to: {output}")
610
+
611
+ else:
612
+ parser.print_help()
613
+
614
+ except Exception as e:
615
+ print(f"Error: {e}")
616
+ exit(1)
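For reference, a short usage sketch of the four helpers this file adds. Paths such as song.wav and the out directory are placeholders; each call writes a new file and returns its path, and extract_segments returns a list of paths unless join_segments=True.

from tools.audio_cutting import cut_audio, mute_time_windows, extract_segments, trim_audio

# Placeholder input file; any WAV/MP3 readable by librosa should work.
clip = cut_audio("song.wav", start_time=10.0, end_time=30.0, output_format="mp3")
muted = mute_time_windows("song.wav", mute_windows=[(12.0, 14.5)], fade_duration=0.2)
parts = extract_segments("song.wav", segments=[(0.0, 5.0), (30.0, 35.0)], join_segments=False)
trimmed = trim_audio("song.wav", trim_start=2.0, trim_end=3.0, output_path="out")
print(clip, muted, parts, trimmed)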
tools/music_understanding.py ADDED
@@ -0,0 +1,355 @@
1
+ import os
2
+ import tempfile
3
+ from typing import Any, Dict, Optional
4
+
5
+ from gradio_client import Client, handle_file
6
+
7
+ from .audio_info import validate_audio_path
8
+
9
+
10
+ def understand_music(
11
+ audio_path: Optional[str] = None,
12
+ audio_file: Optional[bytes] = None,
13
+ filename: str = "audio",
14
+ prompt_text: str = "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
15
+ youtube_url: Optional[str] = None,
16
+ ) -> Dict[str, Any]:
17
+ """
18
+ Analyze music using NVIDIA's Music-Flamingo Audio Language Model.
19
+
20
+ This function uses the flamingo-3 model to provide detailed analysis of audio content,
21
+ including genre, tempo, key, instrumentation, production style, and mood.
22
+
23
+ Args:
24
+ audio_path: Path to local audio file (supports WAV, MP3, FLAC, M4A)
25
+ audio_file: Raw audio bytes (alternative to audio_path)
26
+ filename: Original filename for reference (used with audio_file)
27
+ prompt_text: Custom prompt for analysis (default: comprehensive music description)
28
+ youtube_url: YouTube URL as alternative audio source
29
+
30
+ Returns:
31
+ Dictionary with analysis results:
32
+ {
33
+ "analysis": "Detailed music analysis text",
34
+ "audio_source": "path" or "bytes" or "youtube",
35
+ "filename": "Original filename",
36
+ "prompt": "Used prompt text",
37
+ "status": "success" or "error",
38
+ "error": "Error message if status is error"
39
+ }
40
+
41
+ Raises:
42
+ ValueError: If neither audio_path, audio_file, nor youtube_url is provided
43
+ FileNotFoundError: If audio_path doesn't exist
44
+ RuntimeError: If API call fails or network issues occur
45
+
46
+ Examples:
47
+ # Basic analysis with local file
48
+ result = understand_music(audio_path="song.mp3")
49
+ print(result["analysis"])
50
+
51
+ # Custom prompt for finding cut points
52
+ result = understand_music(
53
+ audio_path="song.mp3",
54
+ prompt_text="Identify the best cutting points for editing - suggest specific time stamps where verses, choruses, and bridges begin and end."
55
+ )
56
+
57
+ # Analysis with YouTube URL
58
+ result = understand_music(
59
+ youtube_url="https://youtube.com/watch?v=example",
60
+ prompt_text="Analyze the structure and suggest optimal edit points."
61
+ )
62
+ """
63
+ try:
64
+ # Validate input parameters
65
+ if not any([audio_path, audio_file, youtube_url]):
66
+ raise ValueError(
67
+ "Either audio_path, audio_file, or youtube_url must be provided"
68
+ )
69
+
70
+ # Handle different audio sources
71
+ audio_source = None
72
+ temp_file_path = None
73
+ source_type = "unknown"
74
+ source_filename = "unknown"
75
+
76
+ try:
77
+ if audio_path:
78
+ # Validate and use local audio file
79
+ validated_path = validate_audio_path(audio_path)
80
+ audio_source = handle_file(validated_path)
81
+ source_type = "path"
82
+ source_filename = os.path.basename(validated_path)
83
+
84
+ elif audio_file:
85
+ # Save bytes to temporary file
86
+ if not filename:
87
+ raise ValueError("Filename must be provided when using audio_file")
88
+
89
+ # Create temporary file with appropriate extension
90
+ temp_dir = tempfile.mkdtemp()
91
+ if filename.lower().endswith((".wav", ".mp3", ".flac", ".m4a")):
92
+ temp_filename = filename
93
+ else:
94
+ temp_filename = f"{filename}.wav"
95
+
96
+ temp_file_path = os.path.join(temp_dir, temp_filename)
97
+
98
+ with open(temp_file_path, "wb") as f:
99
+ f.write(audio_file)
100
+
101
+ audio_source = handle_file(temp_file_path)
102
+ source_type = "bytes"
103
+ source_filename = filename
104
+
105
+ elif youtube_url:
106
+ # Use YouTube URL directly
107
+ audio_source = youtube_url
108
+ source_type = "youtube"
109
+ source_filename = youtube_url
110
+
111
+ # Initialize client and make prediction
112
+ client = Client("nvidia/music-flamingo")
113
+
114
+ result = client.predict(
115
+ audio_path=audio_source,
116
+ youtube_url=youtube_url if youtube_url else "",
117
+ prompt_text=prompt_text,
118
+ api_name="/infer",
119
+ )
120
+
121
+ return {
122
+ "analysis": result,
123
+ "audio_source": source_type,
124
+ "filename": source_filename,
125
+ "prompt": prompt_text,
126
+ "status": "success",
127
+ }
128
+
129
+ finally:
130
+ # Clean up temporary file if created
131
+ if temp_file_path and os.path.exists(temp_file_path):
132
+ os.unlink(temp_file_path)
133
+ # Remove temp directory if empty
134
+ temp_dir = os.path.dirname(temp_file_path)
135
+ try:
136
+ os.rmdir(temp_dir)
137
+ except OSError:
138
+ pass # Directory not empty, leave it
139
+
140
+ except Exception as e:
141
+ return {
142
+ "analysis": None,
143
+ "audio_source": audio_path or "bytes" or youtube_url or "unknown",
144
+ "filename": filename
145
+ if audio_file
146
+ else (os.path.basename(audio_path) if audio_path else youtube_url),
147
+ "prompt": prompt_text,
148
+ "status": "error",
149
+ "error": str(e),
150
+ }
151
+
152
+
153
+ def analyze_music_structure(
154
+ audio_path: Optional[str] = None,
155
+ audio_file: Optional[bytes] = None,
156
+ filename: str = "audio",
157
+ youtube_url: Optional[str] = None,
158
+ ) -> Dict[str, Any]:
159
+ """
160
+ Analyze music structure and identify sections (verse, chorus, bridge, etc.).
161
+
162
+ This function provides a focused analysis on song structure, making it ideal
163
+ for understanding where to make cuts and edits.
164
+
165
+ Args:
166
+ audio_path: Path to local audio file
167
+ audio_file: Raw audio bytes
168
+ filename: Original filename for reference
169
+ youtube_url: YouTube URL as alternative audio source
170
+
171
+ Returns:
172
+ Dictionary with structure analysis results
173
+ """
174
+ structure_prompt = (
175
+ "Analyze the structure of this music track. Identify and timestamp the different sections: "
176
+ "intro, verses, choruses, pre-chorus, bridge, instrumental breaks, solo sections, and outro/outro. "
177
+ "Provide specific time stamps (in MM:SS format) for where each section begins and ends. "
178
+ "Also note any transitions, buildups, or breakdowns that would be important for editing."
179
+ )
180
+
181
+ return understand_music(
182
+ audio_path=audio_path,
183
+ audio_file=audio_file,
184
+ filename=filename,
185
+ prompt_text=structure_prompt,
186
+ youtube_url=youtube_url,
187
+ )
188
+
189
+
190
+ def suggest_cutting_points(
191
+ audio_path: Optional[str] = None,
192
+ audio_file: Optional[bytes] = None,
193
+ filename: str = "audio",
194
+ youtube_url: Optional[str] = None,
195
+ purpose: str = "general",
196
+ ) -> Dict[str, Any]:
197
+ """
198
+ Suggest optimal cutting points for audio editing.
199
+
200
+ Args:
201
+ audio_path: Path to local audio file
202
+ audio_file: Raw audio bytes
203
+ filename: Original filename for reference
204
+ youtube_url: YouTube URL as alternative audio source
205
+ purpose: Purpose of cutting ('general', 'dj_mix', 'social_media', 'ringtone')
206
+
207
+ Returns:
208
+ Dictionary with cutting point suggestions
209
+ """
210
+ purpose_prompts = {
211
+ "general": (
212
+ "Suggest the best cutting points for this track. Identify natural edit points where "
213
+ "the music flows well for cuts. Provide timestamps in MM:SS format and explain why "
214
+ "each point is good for editing (e.g., clean transitions, beat drops, phrase endings)."
215
+ ),
216
+ "dj_mix": (
217
+ "Analyze this track for DJ mixing purposes. Identify the best intro and outro sections "
218
+ "for beatmatching, suggest cue points for mixing, and provide timestamps for clean "
219
+ "transitions. Focus on drum patterns, BPM consistency, and mixable sections."
220
+ ),
221
+ "social_media": (
222
+ "Suggest cutting points for social media content (15-60 seconds). Identify the most "
223
+ "engaging parts of the track, catchy hooks, or impactful moments. Provide timestamps "
224
+ "for creating short, attention-grabbing clips."
225
+ ),
226
+ "ringtone": (
227
+ "Identify the best 15-30 second sections for ringtones. Look for memorable melodies, "
228
+ "catchy choruses, or distinctive instrumental parts. Provide timestamps and explain "
229
+ "why each section would work well as a ringtone."
230
+ ),
231
+ }
232
+
233
+ prompt = purpose_prompts.get(purpose, purpose_prompts["general"])
234
+
235
+ return understand_music(
236
+ audio_path=audio_path,
237
+ audio_file=audio_file,
238
+ filename=filename,
239
+ prompt_text=prompt,
240
+ youtube_url=youtube_url,
241
+ )
242
+
243
+
244
+ def analyze_genre_and_style(
245
+ audio_path: Optional[str] = None,
246
+ audio_file: Optional[bytes] = None,
247
+ filename: str = "audio",
248
+ youtube_url: Optional[str] = None,
249
+ ) -> Dict[str, Any]:
250
+ """
251
+ Provide detailed genre and production style analysis.
252
+
253
+ Args:
254
+ audio_path: Path to local audio file
255
+ audio_file: Raw audio bytes
256
+ filename: Original filename for reference
257
+ youtube_url: YouTube URL as alternative audio source
258
+
259
+ Returns:
260
+ Dictionary with genre and style analysis
261
+ """
262
+ genre_prompt = (
263
+ "Provide a detailed analysis of this track's genre and production style. Identify the "
264
+ "primary genre and any subgenres or fusion elements. Describe the production techniques, "
265
+ "mixing style, sound design choices, and arrangement. Analyze the instrumentation, "
266
+ "including both traditional and electronic elements. Discuss the era or period the music "
267
+ "seems to draw inspiration from, and compare it to similar artists or tracks if applicable."
268
+ )
269
+
270
+ return understand_music(
271
+ audio_path=audio_path,
272
+ audio_file=audio_file,
273
+ filename=filename,
274
+ prompt_text=genre_prompt,
275
+ youtube_url=youtube_url,
276
+ )
277
+
278
+
279
+ if __name__ == "__main__":
280
+ import argparse
281
+
282
+ parser = argparse.ArgumentParser(
283
+ description="Music understanding and analysis tools"
284
+ )
285
+ subparsers = parser.add_subparsers(dest="command", help="Available commands")
286
+
287
+ # General understanding
288
+ understand_parser = subparsers.add_parser(
289
+ "understand", help="General music analysis"
290
+ )
291
+ understand_parser.add_argument("--audio", help="Path to audio file")
292
+ understand_parser.add_argument("--prompt", help="Custom prompt text")
293
+ understand_parser.add_argument("--youtube", help="YouTube URL")
294
+
295
+ # Structure analysis
296
+ structure_parser = subparsers.add_parser("structure", help="Analyze song structure")
297
+ structure_parser.add_argument("--audio", help="Path to audio file")
298
+ structure_parser.add_argument("--youtube", help="YouTube URL")
299
+
300
+ # Cutting points
301
+ cutting_parser = subparsers.add_parser("cutting", help="Suggest cutting points")
302
+ cutting_parser.add_argument("--audio", help="Path to audio file")
303
+ cutting_parser.add_argument(
304
+ "--purpose",
305
+ choices=["general", "dj_mix", "social_media", "ringtone"],
306
+ default="general",
307
+ help="Purpose of cutting",
308
+ )
309
+ cutting_parser.add_argument("--youtube", help="YouTube URL")
310
+
311
+ # Genre analysis
312
+ genre_parser = subparsers.add_parser("genre", help="Analyze genre and style")
313
+ genre_parser.add_argument("--audio", help="Path to audio file")
314
+ genre_parser.add_argument("--youtube", help="YouTube URL")
315
+
316
+ args = parser.parse_args()
317
+
318
+ try:
319
+ if args.command == "understand":
320
+ result = understand_music(
321
+ audio_path=args.audio,
322
+ youtube_url=args.youtube,
323
+ prompt_text=args.prompt
324
+ if args.prompt
325
+ else "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
326
+ )
327
+
328
+ elif args.command == "cutting":
329
+ result = suggest_cutting_points(
330
+ audio_path=args.audio, youtube_url=args.youtube, purpose=args.purpose
331
+ )
332
+
333
+ elif args.command == "genre":
334
+ result = analyze_genre_and_style(
335
+ audio_path=args.audio, youtube_url=args.youtube
336
+ )
337
+
338
+ else:
339
+ parser.print_help()
340
+ exit(1)
341
+
342
+ # Output results
343
+ if result["status"] == "success":
344
+ print(f"Analysis for: {result['filename']}")
345
+ print(f"Source: {result['audio_source']}")
346
+ print(f"Prompt: {result['prompt']}")
347
+ print("\n" + "=" * 50)
348
+ print(result["analysis"])
349
+ else:
350
+ print(f"Error: {result['error']}")
351
+ exit(1)
352
+
353
+ except Exception as e:
354
+ print(f"Error: {e}")
355
+ exit(1)
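And a matching sketch for the analysis helpers added in this file; each returns the dictionary described in the understand_music docstring. song.mp3 is again a placeholder for a local file, and network access to the nvidia/music-flamingo Space is required.

from tools.music_understanding import (
    understand_music,
    analyze_music_structure,
    suggest_cutting_points,
)

result = understand_music(audio_path="song.mp3")  # uses the default full-detail prompt
if result["status"] == "success":
    print(result["analysis"])

structure = analyze_music_structure(audio_path="song.mp3")
cuts = suggest_cutting_points(audio_path="song.mp3", purpose="dj_mix")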