Spaces:

frascuchon
/

music-mcp

Running on CPU Upgrade

App Files Files Community

frascuchon HF Staff committed on 27 days ago

Commit

cafce31

1 Parent(s): fdf97e3

Add more music tools

Browse files

Files changed (4) hide show

mcp_server.py +269 -0
requirements.txt +1 -0
tools/audio_cutting.py +616 -0
tools/music_understanding.py +355 -0

mcp_server.py CHANGED Viewed

@@ -11,6 +11,18 @@ from tools.stems_separation import (
 )
 from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
 from tools.youtube_extract import extract_audio_from_youtube
 def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
@@ -406,6 +418,247 @@ def create_interface():
         flagging_mode="never",
     )
     return gr.TabbedInterface(
         [
             stem_interface,
@@ -419,6 +672,14 @@ def create_interface():
             medley_interface,
             audio_info_interface,
             youtube_interface,
         ],
         [
             "Stem Separation",
@@ -432,6 +693,14 @@ def create_interface():
             "Medley Creation",
             "Audio Information",
             "YouTube Extraction",
         ],
     )

 )
 from tools.time_strech import align_songs_by_bpm, stretch_to_bpm
 from tools.youtube_extract import extract_audio_from_youtube
+from tools.audio_cutting import (
+    cut_audio,
+    mute_time_windows,
+    extract_segments,
+    trim_audio,
+)
+from tools.music_understanding import (
+    understand_music,
+    analyze_music_structure,
+    suggest_cutting_points,
+    analyze_genre_and_style,
+)
 def pitch_shift_with_semitones(audio_path: str, semitones: int) -> str:
         flagging_mode="never",
     )
+    # Tab 12: Audio Cutting
+    def cut_audio_wrapper(audio_path, start_time, end_time, format_val):
+        """
+        Cut the segment between start_time and end_time out of an audio file.
+        Args:
+            audio_path: Path to the input audio file
+            start_time: Start of the segment in seconds
+            end_time: End of the segment in seconds
+            format_val: Output format ('wav' or 'mp3')
+        Returns:
+            Path to the cut audio file
+        """
+        # gr.Interface hands the input values to fn positionally. The fourth
+        # positional parameter of cut_audio is output_path, so passing cut_audio
+        # directly would use the selected format as an output directory and
+        # always write WAV. Bind the format by keyword instead.
+        return cut_audio(
+            audio_path=audio_path,
+            start_time=start_time,
+            end_time=end_time,
+            output_format=format_val,
+        )
+    cut_interface = gr.Interface(
+        fn=cut_audio_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Number(value=0.0, label="Start Time (seconds)"),
+            gr.Number(value=10.0, label="End Time (seconds)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Cut Audio", type="filepath"),
+        title="Cut Audio Segment",
+        description="Extract a segment from an audio file between specified start and end times.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+    # Tab 13: Mute Time Windows
+    def mute_time_windows_wrapper(audio_path, windows_str, format_val):
+        """
+        Mute the given time windows in an audio file.
+        Args:
+            audio_path: Path to the input audio file
+            windows_str: JSON array of [start, end] pairs in seconds,
+                e.g. "[[1.0, 2.0], [3.0, 4.0]]"; empty text means no windows
+            format_val: Output format ('wav' or 'mp3')
+        Returns:
+            Path to the processed audio file
+        """
+        import json
+        # This text comes straight from the UI / MCP client, so it is parsed as
+        # JSON data and never evaluated as Python code.
+        text = (windows_str or "").strip()
+        try:
+            windows = json.loads(text) if text else []
+        except json.JSONDecodeError as e:
+            raise gr.Error(f"Mute windows must be valid JSON: {e}") from e
+        try:
+            return mute_time_windows(
+                audio_path=audio_path, mute_windows=windows, output_format=format_val
+            )
+        except Exception as e:
+            # Report the failure to the user instead of silently returning
+            # an empty audio output.
+            raise gr.Error(str(e)) from e
+    mute_interface = gr.Interface(
+        fn=mute_time_windows_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Textbox(
+                value="[[1.0, 2.0], [3.0, 4.0]]",
+                label="Mute Windows (JSON format)",
+                placeholder="[[start1, end1], [start2, end2]]",
+            ),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Muted Audio", type="filepath"),
+        title="Mute Time Windows",
+        description="Mute specific time windows in an audio file with smooth fade transitions.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+    # Tab 14: Extract Segments
+    def extract_segments_wrapper(audio_path, segments_str, format_val, join):
+        """
+        Extract one or more segments from an audio file.
+        Args:
+            audio_path: Path to the input audio file
+            segments_str: JSON array of [start, end] pairs in seconds,
+                e.g. "[[0.0, 1.0], [2.0, 3.0]]"; empty text means no segments
+            format_val: Output format ('wav' or 'mp3')
+            join: If True, join all segments into a single file
+        Returns:
+            Path to the joined file, or to the first extracted segment when the
+            segments are saved separately (None if nothing was extracted)
+        """
+        import json
+        # This text comes straight from the UI / MCP client, so it is parsed as
+        # JSON data and never evaluated as Python code.
+        text = (segments_str or "").strip()
+        try:
+            segments = json.loads(text) if text else []
+        except json.JSONDecodeError as e:
+            raise gr.Error(f"Segments must be valid JSON: {e}") from e
+        try:
+            result = extract_segments(
+                audio_path=audio_path,
+                segments=segments,
+                output_format=format_val,
+                join_segments=join,
+            )
+        except Exception as e:
+            # Report the failure to the user instead of silently returning
+            # an empty audio output.
+            raise gr.Error(str(e)) from e
+        # The single gr.Audio output can only show one file, so a list of
+        # separately saved segments is reduced to its first entry.
+        if isinstance(result, list):
+            return result[0] if result else None
+        return result
+    extract_interface = gr.Interface(
+        fn=extract_segments_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Textbox(
+                value="[[0.0, 1.0], [2.0, 3.0]]",
+                label="Segments (JSON format)",
+                placeholder="[[start1, end1], [start2, end2]]",
+            ),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+            gr.Checkbox(value=False, label="Join Segments"),
+        ],
+        outputs=gr.Audio(label="Extracted Segments", type="filepath"),
+        title="Extract Segments",
+        description="Extract multiple segments from an audio file.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+    # Tab 15: Trim Audio
+    def trim_audio_wrapper(audio_path, trim_start, trim_end, format_val):
+        """
+        Trim audio from the beginning and/or end.
+        Args:
+            audio_path: Path to the input audio file
+            trim_start: Seconds to remove from the start (None to skip)
+            trim_end: Seconds to remove from the end (None to skip)
+            format_val: Output format ('wav' or 'mp3')
+        Returns:
+            Path to the trimmed audio file
+        """
+        # gr.Interface hands the input values to fn positionally. The fourth
+        # positional parameter of trim_audio is output_path, so passing
+        # trim_audio directly would use the selected format as an output
+        # directory and always write WAV. Bind the format by keyword instead.
+        return trim_audio(
+            audio_path=audio_path,
+            trim_start=trim_start,
+            trim_end=trim_end,
+            output_format=format_val,
+        )
+    trim_interface = gr.Interface(
+        fn=trim_audio_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Number(value=None, label="Trim Start (seconds, leave empty to skip)"),
+            gr.Number(value=None, label="Trim End (seconds, leave empty to skip)"),
+            gr.Dropdown(choices=["wav", "mp3"], value="wav", label="Output Format"),
+        ],
+        outputs=gr.Audio(label="Trimmed Audio", type="filepath"),
+        title="Trim Audio",
+        description="Trim audio from the beginning and/or end.",
+        examples=None,
+        cache_examples=False,
+        flagging_mode="never",
+    )
+    # Tab 16: Music Understanding
+    # The wrapper turns the result dict of understand_music into plain text for
+    # the Textbox: the "analysis" value on success, an "Error: ..." line otherwise
+    # (including when the call itself raises).
+    def understand_music_wrapper(audio_path, prompt):
+        try:
+            outcome = understand_music(audio_path=audio_path, prompt_text=prompt)
+            if outcome["status"] != "success":
+                return f"Error: {outcome.get('error', 'Unknown error')}"
+            return outcome["analysis"]
+        except Exception as exc:
+            return f"Error: {str(exc)}"
+    understand_interface = gr.Interface(
+        title="Music Understanding (AI)",
+        description="Analyze music using NVIDIA's Music-Flamingo Audio Language Model.",
+        fn=understand_music_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Textbox(
+                label="Analysis Prompt",
+                lines=3,
+                value="Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
+            ),
+        ],
+        outputs=gr.Textbox(label="Music Analysis", lines=10),
+        flagging_mode="never",
+        examples=None,
+        cache_examples=False,
+    )
+    # Tab 17: Song Structure Analysis
+    # The wrapper turns the result dict of analyze_music_structure into plain
+    # text: the "analysis" value on success, an "Error: ..." line otherwise
+    # (including when the call itself raises).
+    def analyze_music_structure_wrapper(audio_path):
+        try:
+            outcome = analyze_music_structure(audio_path=audio_path)
+            if outcome["status"] != "success":
+                return f"Error: {outcome.get('error', 'Unknown error')}"
+            return outcome["analysis"]
+        except Exception as exc:
+            return f"Error: {str(exc)}"
+    structure_interface = gr.Interface(
+        title="Song Structure Analysis",
+        description="Analyze song structure and identify sections (verse, chorus, bridge, etc.).",
+        fn=analyze_music_structure_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+        ],
+        outputs=gr.Textbox(label="Structure Analysis", lines=10),
+        flagging_mode="never",
+        examples=None,
+        cache_examples=False,
+    )
+    # Tab 18: Cutting Points Suggestions
+    # The wrapper turns the result dict of suggest_cutting_points into plain
+    # text: the "analysis" value on success, an "Error: ..." line otherwise
+    # (including when the call itself raises).
+    def suggest_cutting_points_wrapper(audio_path, purpose):
+        try:
+            outcome = suggest_cutting_points(audio_path=audio_path, purpose=purpose)
+            if outcome["status"] != "success":
+                return f"Error: {outcome.get('error', 'Unknown error')}"
+            return outcome["analysis"]
+        except Exception as exc:
+            return f"Error: {str(exc)}"
+    cutting_points_interface = gr.Interface(
+        title="AI Cutting Point Suggestions",
+        description="Get AI-suggested optimal cutting points for different purposes.",
+        fn=suggest_cutting_points_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+            gr.Dropdown(
+                label="Purpose",
+                value="general",
+                choices=["general", "dj_mix", "social_media", "ringtone"],
+            ),
+        ],
+        outputs=gr.Textbox(label="Cutting Point Suggestions", lines=10),
+        flagging_mode="never",
+        examples=None,
+        cache_examples=False,
+    )
+    # Tab 19: Genre and Style Analysis
+    # The wrapper turns the result dict of analyze_genre_and_style into plain
+    # text: the "analysis" value on success, an "Error: ..." line otherwise
+    # (including when the call itself raises).
+    def analyze_genre_and_style_wrapper(audio_path):
+        try:
+            outcome = analyze_genre_and_style(audio_path=audio_path)
+            if outcome["status"] != "success":
+                return f"Error: {outcome.get('error', 'Unknown error')}"
+            return outcome["analysis"]
+        except Exception as exc:
+            return f"Error: {str(exc)}"
+    genre_interface = gr.Interface(
+        title="Genre & Style Analysis",
+        description="Detailed analysis of genre, production style, and instrumentation.",
+        fn=analyze_genre_and_style_wrapper,
+        inputs=[
+            gr.Audio(type="filepath", label="Upload Audio File", sources=["upload"]),
+        ],
+        outputs=gr.Textbox(label="Genre & Style Analysis", lines=10),
+        flagging_mode="never",
+        examples=None,
+        cache_examples=False,
+    )
+    # NOTE: cutting_points_interface and genre_interface are defined once above,
+    # on top of suggest_cutting_points_wrapper / analyze_genre_and_style_wrapper.
+    # A second pair of definitions used to follow here; it rebound both names to
+    # interfaces that called the tools directly, which discarded the wrappers'
+    # error handling and sent the raw result dict to the Textbox output.
     return gr.TabbedInterface(
         [
             stem_interface,
             medley_interface,
             audio_info_interface,
             youtube_interface,
+            cut_interface,
+            mute_interface,
+            extract_interface,
+            trim_interface,
+            understand_interface,
+            structure_interface,
+            cutting_points_interface,
+            genre_interface,
         ],
         [
             "Stem Separation",
             "Medley Creation",
             "Audio Information",
             "YouTube Extraction",
+            "Audio Cutting",
+            "Mute Windows",
+            "Extract Segments",
+            "Trim Audio",
+            "Music Understanding",
+            "Song Structure",
+            "Cutting Points",
+            "Genre Analysis",
         ],
     )

requirements.txt CHANGED Viewed

@@ -13,4 +13,5 @@ ruff>=0.1.0
 mypy>=1.0.0
 smolagents[mcp]
 gradio[mcp]>=5.36.0
 yt_dlp>=2025.11.12

 mypy>=1.0.0
 smolagents[mcp]
 gradio[mcp]>=5.36.0
+gradio_client>=1.0.0
 yt_dlp>=2025.11.12

tools/audio_cutting.py ADDED Viewed

	@@ -0,0 +1,616 @@

+import os
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+import librosa
+import numpy as np
+import soundfile as sf
+from .audio_info import validate_audio_path
+def cut_audio(
+    audio_path: str,
+    start_time: float,
+    end_time: float,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Cut a segment from an audio file between specified start and end times.
+    Args:
+        audio_path: Path to input audio file
+        start_time: Start time in seconds
+        end_time: End time in seconds
+        output_path: Optional output directory (default: None, uses current directory)
+        output_format: Output format ('wav' or 'mp3', default: 'wav')
+    Returns:
+        Path to the cut audio file
+    Raises:
+        RuntimeError: On any failure (invalid time range, unreadable input,
+            ffmpeg error). The underlying exception is chained as the cause.
+    """
+    try:
+        # Validate audio path
+        validated_path = validate_audio_path(audio_path)
+        # Load audio without resampling or down-mixing
+        y, sr = librosa.load(validated_path, sr=None, mono=False)
+        # Samples are on the last axis for both mono and multi-channel arrays
+        duration = y.shape[-1] / sr
+        # Validate time range
+        if start_time >= end_time:
+            raise ValueError(
+                f"Start time ({start_time}s) must be less than end time ({end_time}s)"
+            )
+        if start_time < 0:
+            raise ValueError(f"Start time ({start_time}s) cannot be negative")
+        if end_time > duration:
+            raise ValueError(
+                f"End time ({end_time}s) exceeds audio duration ({duration:.2f}s)"
+            )
+        # Convert time to sample indices
+        start_sample = int(start_time * sr)
+        end_sample = int(end_time * sr)
+        # Slice along the sample axis; `...` covers the mono and channel cases
+        y_cut = y[..., start_sample:end_sample]
+        # Generate output filename
+        if not output_path:
+            output_path = "."
+        os.makedirs(output_path, exist_ok=True)
+        original_filename = Path(validated_path).stem
+        output_filename = f"{original_filename}_cut_{start_time:.1f}s_to_{end_time:.1f}s.{output_format.lower()}"
+        output_file_path = os.path.join(output_path, output_filename)
+        # Save the cut audio
+        if y_cut.ndim == 2:
+            y_cut = y_cut.T  # Transpose for soundfile
+        if output_format.lower() == "mp3":
+            # For MP3, write a temporary WAV and encode it with ffmpeg
+            import subprocess
+            import tempfile
+            # Only reserve the name here: the handle is closed before the file
+            # is written by name, and the file is removed even if ffmpeg fails.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+                temp_wav_path = temp_wav.name
+            try:
+                sf.write(temp_wav_path, y_cut, sr)
+                cmd = [
+                    "ffmpeg",
+                    "-y",
+                    "-i",
+                    temp_wav_path,
+                    "-c:a",
+                    "libmp3lame",
+                    "-b:a",
+                    "192k",
+                    output_file_path,
+                ]
+                subprocess.run(cmd, capture_output=True, check=True)
+            finally:
+                os.unlink(temp_wav_path)
+        else:
+            sf.write(output_file_path, y_cut, sr)
+        return output_file_path
+    except Exception as e:
+        raise RuntimeError(f"Error cutting audio: {str(e)}") from e
+def mute_time_windows(
+    audio_path: str,
+    mute_windows: List[Tuple[float, float]],
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+    fade_duration: float = 0.1,
+) -> str:
+    """
+    Mute specific time windows in an audio file.
+    Args:
+        audio_path: Path to input audio file
+        mute_windows: List of (start_time, end_time) pairs in seconds
+        output_path: Optional output directory (default: None, uses current directory)
+        output_format: Output format ('wav' or 'mp3', default: 'wav')
+        fade_duration: Fade in/out duration in seconds for smooth transitions (default: 0.1s)
+    Returns:
+        Path to the processed audio file with muted sections
+    Raises:
+        RuntimeError: On any failure (invalid or overlapping windows, unreadable
+            input, ffmpeg error). The underlying exception is chained as the cause.
+    """
+    try:
+        # Validate audio path
+        validated_path = validate_audio_path(audio_path)
+        # Load audio without resampling or down-mixing
+        y, sr = librosa.load(validated_path, sr=None, mono=False)
+        # Samples are on the last axis for both mono and multi-channel arrays
+        total_samples = y.shape[-1]
+        duration = total_samples / sr
+        # Validate and sort mute windows
+        sorted_windows = sorted(mute_windows, key=lambda x: x[0])
+        for i, (start, end) in enumerate(sorted_windows):
+            if start >= end:
+                raise ValueError(
+                    f"Window {i}: start time ({start}s) must be less than end time ({end}s)"
+                )
+            if start < 0 or end > duration:
+                raise ValueError(
+                    f"Window {i}: time range ({start}s-{end}s) outside audio duration (0-{duration:.2f}s)"
+                )
+            # Windows are sorted by start, so checking the predecessor is enough
+            if i > 0:
+                prev_start, prev_end = sorted_windows[i - 1]
+                if start < prev_end:
+                    raise ValueError(f"Window {i} overlaps with previous window")
+        # Work on a copy so the loaded array is left untouched
+        y_processed = y.copy()
+        fade_samples = int(fade_duration * sr)
+        # Apply muting with fade in/out. Indexing with `...` addresses the
+        # sample axis for mono and multi-channel audio alike, and the 1-D fade
+        # ramp broadcasts across channels.
+        for start_time, end_time in sorted_windows:
+            start_sample = int(start_time * sr)
+            end_sample = int(end_time * sr)
+            # Fade out before the mute (clamped at the start of the audio)
+            fade_start = max(0, start_sample - fade_samples)
+            if fade_start < start_sample:
+                fade_out = np.linspace(1, 0, start_sample - fade_start)
+                y_processed[..., fade_start:start_sample] *= fade_out
+            # Apply mute
+            y_processed[..., start_sample:end_sample] = 0
+            # Fade in after the mute (clamped at the end of the audio)
+            fade_end = min(total_samples, end_sample + fade_samples)
+            if end_sample < fade_end:
+                fade_in = np.linspace(0, 1, fade_end - end_sample)
+                y_processed[..., end_sample:fade_end] *= fade_in
+        # Generate output filename; only the first three windows are spelled out
+        if not output_path:
+            output_path = "."
+        os.makedirs(output_path, exist_ok=True)
+        original_filename = Path(validated_path).stem
+        windows_str = "_".join([f"{s:.1f}-{e:.1f}" for s, e in sorted_windows[:3]])
+        if len(sorted_windows) > 3:
+            windows_str += f"_and_{len(sorted_windows) - 3}_more"
+        output_filename = (
+            f"{original_filename}_muted_{windows_str}.{output_format.lower()}"
+        )
+        output_file_path = os.path.join(output_path, output_filename)
+        # Save the processed audio
+        if y_processed.ndim == 2:
+            y_processed = y_processed.T  # Transpose for soundfile
+        if output_format.lower() == "mp3":
+            # For MP3, write a temporary WAV and encode it with ffmpeg
+            import subprocess
+            import tempfile
+            # Only reserve the name here: the handle is closed before the file
+            # is written by name, and the file is removed even if ffmpeg fails.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+                temp_wav_path = temp_wav.name
+            try:
+                sf.write(temp_wav_path, y_processed, sr)
+                cmd = [
+                    "ffmpeg",
+                    "-y",
+                    "-i",
+                    temp_wav_path,
+                    "-c:a",
+                    "libmp3lame",
+                    "-b:a",
+                    "192k",
+                    output_file_path,
+                ]
+                subprocess.run(cmd, capture_output=True, check=True)
+            finally:
+                os.unlink(temp_wav_path)
+        else:
+            sf.write(output_file_path, y_processed, sr)
+        return output_file_path
+    except Exception as e:
+        raise RuntimeError(f"Error muting audio windows: {str(e)}") from e
+def _save_segment_audio(samples, sr, output_file_path, output_format):
+    """Write samples to output_file_path; MP3 is encoded with ffmpeg, anything else by soundfile."""
+    if output_format.lower() != "mp3":
+        sf.write(output_file_path, samples, sr)
+        return
+    import subprocess
+    import tempfile
+    # Only reserve the name here: the handle is closed before the file is
+    # written by name, and the file is removed even if ffmpeg fails.
+    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+        temp_wav_path = temp_wav.name
+    try:
+        sf.write(temp_wav_path, samples, sr)
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-i",
+            temp_wav_path,
+            "-c:a",
+            "libmp3lame",
+            "-b:a",
+            "192k",
+            output_file_path,
+        ]
+        subprocess.run(cmd, capture_output=True, check=True)
+    finally:
+        os.unlink(temp_wav_path)
+def extract_segments(
+    audio_path: str,
+    segments: List[Tuple[float, float]],
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+    join_segments: bool = False,
+) -> Union[str, List[str]]:
+    """
+    Extract multiple segments from an audio file.
+    Args:
+        audio_path: Path to input audio file
+        segments: List of (start_time, end_time) pairs in seconds
+        output_path: Optional output directory (default: None, uses current directory)
+        output_format: Output format ('wav' or 'mp3', default: 'wav')
+        join_segments: If True, join all segments into one file; if False, save separately
+    Returns:
+        If join_segments=True: Path to joined audio file
+        If join_segments=False: List of paths to individual segment files
+    Raises:
+        RuntimeError: On any failure (invalid segments, nothing to join,
+            unreadable input, ffmpeg error). The underlying exception is chained
+            as the cause.
+    """
+    try:
+        # Validate audio path
+        validated_path = validate_audio_path(audio_path)
+        # Load audio without resampling or down-mixing
+        y, sr = librosa.load(validated_path, sr=None, mono=False)
+        # Samples are on the last axis for both mono and multi-channel arrays
+        duration = y.shape[-1] / sr
+        # Validate segments
+        for i, (start, end) in enumerate(segments):
+            if start >= end:
+                raise ValueError(
+                    f"Segment {i}: start time ({start}s) must be less than end time ({end}s)"
+                )
+            if start < 0 or end > duration:
+                raise ValueError(
+                    f"Segment {i}: time range ({start}s-{end}s) outside audio duration"
+                )
+        if not output_path:
+            output_path = "."
+        os.makedirs(output_path, exist_ok=True)
+        original_filename = Path(validated_path).stem
+        # Slice every segment along the sample axis (`...` covers mono and channels)
+        pieces = [
+            y[..., int(start_time * sr):int(end_time * sr)]
+            for start_time, end_time in segments
+        ]
+        if join_segments:
+            # np.concatenate rejects an empty list with an unhelpful message
+            if not pieces:
+                raise ValueError("At least one segment is required to join segments")
+            # Concatenate along the sample axis and save as a single file
+            y_joined = np.concatenate(pieces, axis=-1)
+            output_filename = (
+                f"{original_filename}_segments_joined.{output_format.lower()}"
+            )
+            output_file_path = os.path.join(output_path, output_filename)
+            if y_joined.ndim == 2:
+                y_joined = y_joined.T  # Transpose for soundfile
+            _save_segment_audio(y_joined, sr, output_file_path, output_format)
+            return output_file_path
+        # Save segments separately, numbered from 1 in the order given
+        segment_files = []
+        for i, ((start_time, end_time), segment) in enumerate(zip(segments, pieces)):
+            output_filename = f"{original_filename}_segment_{i + 1}_{start_time:.1f}s_to_{end_time:.1f}s.{output_format.lower()}"
+            output_file_path = os.path.join(output_path, output_filename)
+            if segment.ndim == 2:
+                segment = segment.T  # Transpose for soundfile
+            _save_segment_audio(segment, sr, output_file_path, output_format)
+            segment_files.append(output_file_path)
+        return segment_files
+    except Exception as e:
+        raise RuntimeError(f"Error extracting segments: {str(e)}") from e
+def trim_audio(
+    audio_path: str,
+    trim_start: Optional[float] = None,
+    trim_end: Optional[float] = None,
+    output_path: Optional[str] = None,
+    output_format: str = "wav",
+) -> str:
+    """
+    Trim audio from the beginning and/or end.
+    Args:
+        audio_path: Path to input audio file
+        trim_start: Amount to trim from start in seconds (None = no trim from start)
+        trim_end: Amount to trim from end in seconds (None = no trim from end)
+        output_path: Optional output directory (default: None, uses current directory)
+        output_format: Output format ('wav' or 'mp3', default: 'wav')
+    Returns:
+        Path to the trimmed audio file
+    Raises:
+        RuntimeError: On any failure (negative or excessive trim amounts,
+            unreadable input, ffmpeg error). The underlying exception is chained
+            as the cause.
+    """
+    try:
+        # Validate audio path
+        validated_path = validate_audio_path(audio_path)
+        # Load audio without resampling or down-mixing
+        y, sr = librosa.load(validated_path, sr=None, mono=False)
+        # Samples are on the last axis for both mono and multi-channel arrays
+        total_samples = y.shape[-1]
+        duration = total_samples / sr
+        # Validate trim amounts
+        if trim_start is not None and trim_start < 0:
+            raise ValueError("Trim start amount cannot be negative")
+        if trim_end is not None and trim_end < 0:
+            raise ValueError("Trim end amount cannot be negative")
+        # None means "do not trim this side"
+        if trim_start is None:
+            trim_start = 0.0
+        if trim_end is None:
+            trim_end = 0.0
+        total_trim = trim_start + trim_end
+        if total_trim >= duration:
+            raise ValueError(
+                f"Total trim ({total_trim}s) exceeds or equals audio duration ({duration:.2f}s)"
+            )
+        # Calculate trim boundaries
+        start_sample = int(trim_start * sr)
+        if trim_end > 0:
+            end_sample = int((duration - trim_end) * sr)
+        else:
+            end_sample = total_samples
+        # Slice along the sample axis; `...` covers the mono and channel cases
+        y_trimmed = y[..., start_sample:end_sample]
+        # Generate output filename
+        if not output_path:
+            output_path = "."
+        os.makedirs(output_path, exist_ok=True)
+        original_filename = Path(validated_path).stem
+        trim_parts = []
+        if trim_start > 0:
+            trim_parts.append(f"start_{trim_start:.1f}s")
+        if trim_end > 0:
+            trim_parts.append(f"end_{trim_end:.1f}s")
+        trim_str = "_".join(trim_parts) if trim_parts else "trimmed"
+        output_filename = f"{original_filename}_{trim_str}.{output_format.lower()}"
+        output_file_path = os.path.join(output_path, output_filename)
+        # Save the trimmed audio
+        if y_trimmed.ndim == 2:
+            y_trimmed = y_trimmed.T  # Transpose for soundfile
+        if output_format.lower() == "mp3":
+            # For MP3, write a temporary WAV and encode it with ffmpeg
+            import subprocess
+            import tempfile
+            # Only reserve the name here: the handle is closed before the file
+            # is written by name, and the file is removed even if ffmpeg fails.
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_wav:
+                temp_wav_path = temp_wav.name
+            try:
+                sf.write(temp_wav_path, y_trimmed, sr)
+                cmd = [
+                    "ffmpeg",
+                    "-y",
+                    "-i",
+                    temp_wav_path,
+                    "-c:a",
+                    "libmp3lame",
+                    "-b:a",
+                    "192k",
+                    output_file_path,
+                ]
+                subprocess.run(cmd, capture_output=True, check=True)
+            finally:
+                os.unlink(temp_wav_path)
+        else:
+            sf.write(output_file_path, y_trimmed, sr)
+        return output_file_path
+    except Exception as e:
+        raise RuntimeError(f"Error trimming audio: {str(e)}") from e
+if __name__ == "__main__":
+    import argparse
+    import json
+    parser = argparse.ArgumentParser(description="Audio cutting and editing tools")
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    # Cut audio
+    cut_parser = subparsers.add_parser("cut", help="Cut audio segment")
+    cut_parser.add_argument("audio", help="Path to audio file")
+    cut_parser.add_argument("start", type=float, help="Start time in seconds")
+    cut_parser.add_argument("end", type=float, help="End time in seconds")
+    cut_parser.add_argument(
+        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
+    )
+    # Mute windows
+    mute_parser = subparsers.add_parser("mute", help="Mute time windows")
+    mute_parser.add_argument("audio", help="Path to audio file")
+    mute_parser.add_argument("windows", help="JSON array of [start, end] pairs")
+    mute_parser.add_argument(
+        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
+    )
+    # Extract segments
+    extract_parser = subparsers.add_parser("extract", help="Extract segments")
+    extract_parser.add_argument("audio", help="Path to audio file")
+    extract_parser.add_argument("segments", help="JSON array of [start, end] pairs")
+    extract_parser.add_argument(
+        "--join", action="store_true", help="Join segments into one file"
+    )
+    extract_parser.add_argument(
+        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
+    )
+    # Trim audio
+    trim_parser = subparsers.add_parser("trim", help="Trim audio from start/end")
+    trim_parser.add_argument("audio", help="Path to audio file")
+    trim_parser.add_argument(
+        "--start", type=float, help="Trim amount from start in seconds"
+    )
+    trim_parser.add_argument(
+        "--end", type=float, help="Trim amount from end in seconds"
+    )
+    trim_parser.add_argument(
+        "--format", default="wav", choices=["wav", "mp3"], help="Output format"
+    )
+    args = parser.parse_args()
+    try:
+        if args.command == "cut":
+            output = cut_audio(
+                args.audio, args.start, args.end, output_format=args.format
+            )
+            print(f"Cut audio saved to: {output}")
+        elif args.command == "mute":
+            windows = json.loads(args.windows)
+            output = mute_time_windows(args.audio, windows, output_format=args.format)
+            print(f"Muted audio saved to: {output}")
+        elif args.command == "extract":
+            segments = json.loads(args.segments)
+            result = extract_segments(
+                args.audio, segments, join_segments=args.join, output_format=args.format
+            )
+            if args.join:
+                print(f"Joined segments saved to: {result}")
+            else:
+                print("Extracted segments:")
+                for i, segment_file in enumerate(result):
+                    print(f"  {i + 1}. {segment_file}")
+        elif args.command == "trim":
+            output = trim_audio(
+                args.audio, args.start, args.end, output_format=args.format
+            )
+            print(f"Trimmed audio saved to: {output}")
+        else:
+            parser.print_help()
+    except Exception as e:
+        print(f"Error: {e}")
+        exit(1)

tools/music_understanding.py ADDED Viewed

	@@ -0,0 +1,355 @@

+import os
+import tempfile
+from typing import Any, Dict, Optional
+from gradio_client import Client, handle_file
+from .audio_info import validate_audio_path
+def understand_music(
+    audio_path: Optional[str] = None,
+    audio_file: Optional[bytes] = None,
+    filename: str = "audio",
+    prompt_text: str = "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
+    youtube_url: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Analyze music using NVIDIA's Music-Flamingo Audio Language Model.
+    This function uses the flamingo-3 model to provide detailed analysis of audio content,
+    including genre, tempo, key, instrumentation, production style, and mood.
+    Args:
+        audio_path: Path to local audio file (supports WAV, MP3, FLAC, M4A)
+        audio_file: Raw audio bytes (alternative to audio_path)
+        filename: Original filename for reference (used with audio_file)
+        prompt_text: Custom prompt for analysis (default: comprehensive music description)
+        youtube_url: YouTube URL as alternative audio source
+    Returns:
+        Dictionary with analysis results:
+        {
+            "analysis": "Detailed music analysis text",
+            "audio_source": "path" or "bytes" or "youtube",
+            "filename": "Original filename",
+            "prompt": "Used prompt text",
+            "status": "success" or "error",
+            "error": "Error message if status is error"
+        }
+    Raises:
+        ValueError: If neither audio_path, audio_file, nor youtube_url is provided
+        FileNotFoundError: If audio_path doesn't exist
+        RuntimeError: If API call fails or network issues occur
+    Examples:
+        # Basic analysis with local file
+        result = understand_music(audio_path="song.mp3")
+        print(result["analysis"])
+        # Custom prompt for finding cut points
+        result = understand_music(
+            audio_path="song.mp3",
+            prompt_text="Identify the best cutting points for editing - suggest specific time stamps where verses, choruses, and bridges begin and end."
+        )
+        # Analysis with YouTube URL
+        result = understand_music(
+            youtube_url="https://youtube.com/watch?v=example",
+            prompt_text="Analyze the structure and suggest optimal edit points."
+        )
+    """
+    try:
+        # Validate input parameters
+        if not any([audio_path, audio_file, youtube_url]):
+            raise ValueError(
+                "Either audio_path, audio_file, or youtube_url must be provided"
+            )
+        # Handle different audio sources
+        audio_source = None
+        temp_file_path = None
+        source_type = "unknown"
+        source_filename = "unknown"
+        try:
+            if audio_path:
+                # Validate and use local audio file
+                validated_path = validate_audio_path(audio_path)
+                audio_source = handle_file(validated_path)
+                source_type = "path"
+                source_filename = os.path.basename(validated_path)
+            elif audio_file:
+                # Save bytes to temporary file
+                if not filename:
+                    raise ValueError("Filename must be provided when using audio_file")
+                # Create temporary file with appropriate extension
+                temp_dir = tempfile.mkdtemp()
+                if filename.lower().endswith((".wav", ".mp3", ".flac", ".m4a")):
+                    temp_filename = filename
+                else:
+                    temp_filename = f"{filename}.wav"
+                temp_file_path = os.path.join(temp_dir, temp_filename)
+                with open(temp_file_path, "wb") as f:
+                    f.write(audio_file)
+                audio_source = handle_file(temp_file_path)
+                source_type = "bytes"
+                source_filename = filename
+            elif youtube_url:
+                # Use YouTube URL directly
+                audio_source = youtube_url
+                source_type = "youtube"
+                source_filename = youtube_url
+            # Initialize client and make prediction
+            client = Client("nvidia/music-flamingo")
+            result = client.predict(
+                audio_path=audio_source,
+                youtube_url=youtube_url if youtube_url else "",
+                prompt_text=prompt_text,
+                api_name="/infer",
+            )
+            return {
+                "analysis": result,
+                "audio_source": source_type,
+                "filename": source_filename,
+                "prompt": prompt_text,
+                "status": "success",
+            }
+        finally:
+            # Clean up temporary file if created
+            if temp_file_path and os.path.exists(temp_file_path):
+                os.unlink(temp_file_path)
+                # Remove temp directory if empty
+                temp_dir = os.path.dirname(temp_file_path)
+                try:
+                    os.rmdir(temp_dir)
+                except OSError:
+                    pass  # Directory not empty, leave it
+    except Exception as e:
+        return {
+            "analysis": None,
+            "audio_source": audio_path or "bytes" or youtube_url or "unknown",
+            "filename": filename
+            if audio_file
+            else (os.path.basename(audio_path) if audio_path else youtube_url),
+            "prompt": prompt_text,
+            "status": "error",
+            "error": str(e),
+        }
+def analyze_music_structure(
+    audio_path: Optional[str] = None,
+    audio_file: Optional[bytes] = None,
+    filename: str = "audio",
+    youtube_url: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Analyze music structure and identify sections (verse, chorus, bridge, etc.).
+    This function provides a focused analysis on song structure, making it ideal
+    for understanding where to make cuts and edits.
+    Args:
+        audio_path: Path to local audio file
+        audio_file: Raw audio bytes
+        filename: Original filename for reference
+        youtube_url: YouTube URL as alternative audio source
+    Returns:
+        Dictionary with structure analysis results
+    """
+    structure_prompt = (
+        "Analyze the structure of this music track. Identify and timestamp the different sections: "
+        "intro, verses, choruses, pre-chorus, bridge, instrumental breaks, solo sections, and outro/outro. "
+        "Provide specific time stamps (in MM:SS format) for where each section begins and ends. "
+        "Also note any transitions, buildups, or breakdowns that would be important for editing."
+    )
+    return understand_music(
+        audio_path=audio_path,
+        audio_file=audio_file,
+        filename=filename,
+        prompt_text=structure_prompt,
+        youtube_url=youtube_url,
+    )
+def suggest_cutting_points(
+    audio_path: Optional[str] = None,
+    audio_file: Optional[bytes] = None,
+    filename: str = "audio",
+    youtube_url: Optional[str] = None,
+    purpose: str = "general",
+) -> Dict[str, Any]:
+    """
+    Suggest optimal cutting points for audio editing.
+    Args:
+        audio_path: Path to local audio file
+        audio_file: Raw audio bytes
+        filename: Original filename for reference
+        youtube_url: YouTube URL as alternative audio source
+        purpose: Purpose of cutting ('general', 'dj_mix', 'social_media', 'ringtone')
+    Returns:
+        Dictionary with cutting point suggestions
+    """
+    purpose_prompts = {
+        "general": (
+            "Suggest the best cutting points for this track. Identify natural edit points where "
+            "the music flows well for cuts. Provide timestamps in MM:SS format and explain why "
+            "each point is good for editing (e.g., clean transitions, beat drops, phrase endings)."
+        ),
+        "dj_mix": (
+            "Analyze this track for DJ mixing purposes. Identify the best intro and outro sections "
+            "for beatmatching, suggest cue points for mixing, and provide timestamps for clean "
+            "transitions. Focus on drum patterns, BPM consistency, and mixable sections."
+        ),
+        "social_media": (
+            "Suggest cutting points for social media content (15-60 seconds). Identify the most "
+            "engaging parts of the track, catchy hooks, or impactful moments. Provide timestamps "
+            "for creating short, attention-grabbing clips."
+        ),
+        "ringtone": (
+            "Identify the best 15-30 second sections for ringtones. Look for memorable melodies, "
+            "catchy choruses, or distinctive instrumental parts. Provide timestamps and explain "
+            "why each section would work well as a ringtone."
+        ),
+    }
+    prompt = purpose_prompts.get(purpose, purpose_prompts["general"])
+    return understand_music(
+        audio_path=audio_path,
+        audio_file=audio_file,
+        filename=filename,
+        prompt_text=prompt,
+        youtube_url=youtube_url,
+    )
+def analyze_genre_and_style(
+    audio_path: Optional[str] = None,
+    audio_file: Optional[bytes] = None,
+    filename: str = "audio",
+    youtube_url: Optional[str] = None,
+) -> Dict[str, Any]:
+    """
+    Provide detailed genre and production style analysis.
+    Args:
+        audio_path: Path to local audio file
+        audio_file: Raw audio bytes
+        filename: Original filename for reference
+        youtube_url: YouTube URL as alternative audio source
+    Returns:
+        Dictionary with genre and style analysis
+    """
+    genre_prompt = (
+        "Provide a detailed analysis of this track's genre and production style. Identify the "
+        "primary genre and any subgenres or fusion elements. Describe the production techniques, "
+        "mixing style, sound design choices, and arrangement. Analyze the instrumentation, "
+        "including both traditional and electronic elements. Discuss the era or period the music "
+        "seems to draw inspiration from, and compare it to similar artists or tracks if applicable."
+    )
+    return understand_music(
+        audio_path=audio_path,
+        audio_file=audio_file,
+        filename=filename,
+        prompt_text=genre_prompt,
+        youtube_url=youtube_url,
+    )
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser(
+        description="Music understanding and analysis tools"
+    )
+    subparsers = parser.add_subparsers(dest="command", help="Available commands")
+    # General understanding
+    understand_parser = subparsers.add_parser(
+        "understand", help="General music analysis"
+    )
+    understand_parser.add_argument("--audio", help="Path to audio file")
+    understand_parser.add_argument("--prompt", help="Custom prompt text")
+    understand_parser.add_argument("--youtube", help="YouTube URL")
+    # Structure analysis
+    structure_parser = subparsers.add_parser("structure", help="Analyze song structure")
+    structure_parser.add_argument("--audio", help="Path to audio file")
+    structure_parser.add_argument("--youtube", help="YouTube URL")
+    # Cutting points
+    cutting_parser = subparsers.add_parser("cutting", help="Suggest cutting points")
+    cutting_parser.add_argument("--audio", help="Path to audio file")
+    cutting_parser.add_argument(
+        "--purpose",
+        choices=["general", "dj_mix", "social_media", "ringtone"],
+        default="general",
+        help="Purpose of cutting",
+    )
+    cutting_parser.add_argument("--youtube", help="YouTube URL")
+    # Genre analysis
+    genre_parser = subparsers.add_parser("genre", help="Analyze genre and style")
+    genre_parser.add_argument("--audio", help="Path to audio file")
+    genre_parser.add_argument("--youtube", help="YouTube URL")
+    args = parser.parse_args()
+    try:
+        if args.command == "understand":
+            result = understand_music(
+                audio_path=args.audio,
+                youtube_url=args.youtube,
+                prompt_text=args.prompt
+                if args.prompt
+                else "Describe this track in full detail - tell me the genre, tempo, and key, then dive into the instruments, production style, and overall mood it creates.",
+            )
+        elif args.command == "cutting":
+            result = suggest_cutting_points(
+                audio_path=args.audio, youtube_url=args.youtube, purpose=args.purpose
+            )
+        elif args.command == "genre":
+            result = analyze_genre_and_style(
+                audio_path=args.audio, youtube_url=args.youtube
+            )
+        else:
+            parser.print_help()
+            exit(1)
+        # Output results
+        if result["status"] == "success":
+            print(f"Analysis for: {result['filename']}")
+            print(f"Source: {result['audio_source']}")
+            print(f"Prompt: {result['prompt']}")
+            print("\n" + "=" * 50)
+            print(result["analysis"])
+        else:
+            print(f"Error: {result['error']}")
+            exit(1)
+    except Exception as e:
+        print(f"Error: {e}")
+        exit(1)